Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -802,7 +802,100 @@ def create_gaia_app():
|
|
802 |
)
|
803 |
|
804 |
# ===============================
|
805 |
-
# TAB 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
806 |
# ===============================
|
807 |
with gr.Tab("ℹ️ Information"):
|
808 |
gr.Markdown("""
|
@@ -815,6 +908,15 @@ def create_gaia_app():
|
|
815 |
- **Web browsing**: Finding and using external information
|
816 |
- **Tool use**: Calculator, code execution, etc.
|
817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
818 |
## 🎯 How to Use This Space
|
819 |
|
820 |
### 1. Model Setup
|
@@ -832,31 +934,89 @@ def create_gaia_app():
|
|
832 |
- Then try "GAIA Test Set" for real benchmark evaluation
|
833 |
- Download results in JSONL format for submission
|
834 |
|
|
|
|
|
|
|
|
|
|
|
835 |
## 📊 Model Recommendations
|
836 |
|
837 |
-
| Model | Best For | Memory | Speed | Quality |
|
838 |
-
|
839 |
-
| Fast & Light | Quick testing | Low | Fast | Good |
|
840 |
-
| Balanced | General use | Medium | Medium | Better |
|
841 |
-
| High Quality | Best results | High | Slow | Best |
|
842 |
-
| Instruction Following | Complex reasoning | High | Medium | Excellent |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
843 |
|
844 |
## 🔗 Resources
|
845 |
-
- [GAIA Paper](https://arxiv.org/abs/2311.12983)
|
846 |
-
- [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
847 |
-
- [
|
|
|
848 |
|
849 |
-
##
|
850 |
-
Results are saved in GAIA leaderboard format:
|
851 |
```json
|
852 |
-
{"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[REASONING]"}
|
|
|
853 |
```
|
854 |
|
855 |
-
## ⚡ Tips for Best Results
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
860 |
""")
|
861 |
|
862 |
return app
|
|
|
802 |
)
|
803 |
|
804 |
# ===============================
|
805 |
+
# TAB 4: FULL BENCHMARK (NEW)
|
806 |
+
# ===============================
|
807 |
+
with gr.Tab("🏆 Full Benchmark"):
|
808 |
+
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
809 |
+
|
810 |
+
with gr.Row():
|
811 |
+
with gr.Column():
|
812 |
+
gr.Markdown(get_leaderboard_info())
|
813 |
+
|
814 |
+
with gr.Column():
|
815 |
+
# Test questions preview
|
816 |
+
test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
|
817 |
+
test_preview_output = gr.Markdown(
|
818 |
+
value="Click above to preview official test questions"
|
819 |
+
)
|
820 |
+
|
821 |
+
# Full benchmark
|
822 |
+
gr.Markdown("### 🚀 Run Complete Benchmark")
|
823 |
+
gr.Markdown("""
|
824 |
+
**Warning**: This will evaluate your model on all ~300 official GAIA test questions.
|
825 |
+
This process may take 1-3 hours depending on your model and hardware.
|
826 |
+
""")
|
827 |
+
|
828 |
+
full_benchmark_btn = gr.Button(
|
829 |
+
"🏆 Start Full Benchmark (300 Questions)",
|
830 |
+
variant="primary",
|
831 |
+
size="lg"
|
832 |
+
)
|
833 |
+
|
834 |
+
# Benchmark results
|
835 |
+
benchmark_status = gr.Textbox(
|
836 |
+
label="📊 Benchmark Status",
|
837 |
+
value="Ready to run benchmark",
|
838 |
+
interactive=False
|
839 |
+
)
|
840 |
+
|
841 |
+
with gr.Row():
|
842 |
+
with gr.Column():
|
843 |
+
benchmark_report = gr.Markdown(
|
844 |
+
label="📈 Benchmark Report",
|
845 |
+
value="Run benchmark to see detailed results"
|
846 |
+
)
|
847 |
+
|
848 |
+
with gr.Column():
|
849 |
+
# Download files
|
850 |
+
submission_file = gr.File(
|
851 |
+
label="💾 Download Submission File (JSONL)",
|
852 |
+
visible=False
|
853 |
+
)
|
854 |
+
|
855 |
+
metadata_file = gr.File(
|
856 |
+
label="📋 Download Metadata File",
|
857 |
+
visible=False
|
858 |
+
)
|
859 |
+
|
860 |
+
gr.Markdown("""
|
861 |
+
### 📤 Leaderboard Submission
|
862 |
+
1. Download the JSONL file above
|
863 |
+
2. Visit [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
864 |
+
3. Upload your submission file
|
865 |
+
4. View your model's ranking!
|
866 |
+
""")
|
867 |
+
|
868 |
+
# Event handlers
|
869 |
+
test_preview_btn.click(
|
870 |
+
fn=load_test_questions_interface,
|
871 |
+
outputs=[test_preview_output]
|
872 |
+
)
|
873 |
+
|
874 |
+
def full_benchmark_with_files(*args):
|
875 |
+
status, report, sub_file, meta_file = run_leaderboard_benchmark_interface(*args)
|
876 |
+
return (
|
877 |
+
status,
|
878 |
+
report,
|
879 |
+
sub_file,
|
880 |
+
meta_file,
|
881 |
+
gr.update(visible=True), # Show submission file
|
882 |
+
gr.update(visible=True) # Show metadata file
|
883 |
+
)
|
884 |
+
|
885 |
+
full_benchmark_btn.click(
|
886 |
+
fn=full_benchmark_with_files,
|
887 |
+
outputs=[
|
888 |
+
benchmark_status,
|
889 |
+
benchmark_report,
|
890 |
+
submission_file,
|
891 |
+
metadata_file,
|
892 |
+
submission_file, # Update visibility
|
893 |
+
metadata_file # Update visibility
|
894 |
+
]
|
895 |
+
)
|
896 |
+
|
897 |
+
# ===============================
|
898 |
+
# TAB 5: INFORMATION (UPDATED)
|
899 |
# ===============================
|
900 |
with gr.Tab("ℹ️ Information"):
|
901 |
gr.Markdown("""
|
|
|
908 |
- **Web browsing**: Finding and using external information
|
909 |
- **Tool use**: Calculator, code execution, etc.
|
910 |
|
911 |
+
## 🏆 GAIA Public Leaderboard
|
912 |
+
GAIA provides a **public leaderboard hosted on Hugging Face** where you can:
|
913 |
+
- Test your models against **300 official testing questions**
|
914 |
+
- Compare performance with state-of-the-art systems
|
915 |
+
- Track progress in AI reasoning capabilities
|
916 |
+
- Contribute to research community benchmarks
|
917 |
+
|
918 |
+
**Leaderboard URL**: [https://huggingface.co/spaces/gaia-benchmark/leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
919 |
+
|
920 |
## 🎯 How to Use This Space
|
921 |
|
922 |
### 1. Model Setup
|
|
|
934 |
- Then try "GAIA Test Set" for real benchmark evaluation
|
935 |
- Download results in JSONL format for submission
|
936 |
|
937 |
+
### 4. Full Benchmark (NEW!)
|
938 |
+
- Run complete evaluation on all 300 official test questions
|
939 |
+
- Get leaderboard-ready submission files
|
940 |
+
- Upload directly to GAIA leaderboard for ranking
|
941 |
+
|
942 |
## 📊 Model Recommendations
|
943 |
|
944 |
+
| Model | Best For | Memory | Speed | Quality | Leaderboard Ready |
|
945 |
+
|-------|----------|---------|-------|---------|------------------|
|
946 |
+
| Fast & Light | Quick testing | Low | Fast | Good | ✅ |
|
947 |
+
| Balanced | General use | Medium | Medium | Better | ✅ |
|
948 |
+
| High Quality | Best results | High | Slow | Best | ✅ |
|
949 |
+
| Instruction Following | Complex reasoning | High | Medium | Excellent | ✅ |
|
950 |
+
|
951 |
+
## 🏅 Benchmark Performance Expectations
|
952 |
+
|
953 |
+
Based on current leaderboard standings, expect these performance ranges:
|
954 |
+
|
955 |
+
| Difficulty Level | Top Models | Good Models | Baseline Models |
|
956 |
+
|------------------|------------|-------------|-----------------|
|
957 |
+
| **Level 1** (Basic) | 85-95% | 70-85% | 50-70% |
|
958 |
+
| **Level 2** (Intermediate) | 65-80% | 45-65% | 25-45% |
|
959 |
+
| **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
|
960 |
+
| **Overall Average** | 65-75% | 45-65% | 30-45% |
|
961 |
+
|
962 |
+
## 🚀 Continuous Benchmarking Workflow
|
963 |
+
|
964 |
+
1. **Development**: Test with sample questions
|
965 |
+
2. **Validation**: Run batch evaluation (10-50 questions)
|
966 |
+
3. **Benchmarking**: Full evaluation (300 questions)
|
967 |
+
4. **Submission**: Upload to leaderboard
|
968 |
+
5. **Analysis**: Compare with other models
|
969 |
+
6. **Iteration**: Improve and re-benchmark
|
970 |
|
971 |
## 🔗 Resources
|
972 |
+
- [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
|
973 |
+
- [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
|
974 |
+
- [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Validation and test data
|
975 |
+
- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
|
976 |
|
977 |
+
## 📋 Submission Format
|
978 |
+
Results are saved in official GAIA leaderboard format:
|
979 |
```json
|
980 |
+
{"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
|
981 |
+
{"task_id": "gaia_002", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
|
982 |
```
|
983 |
|
984 |
+
## ⚡ Pro Tips for Best Results
|
985 |
+
|
986 |
+
### Performance Optimization
|
987 |
+
1. **Start Small**: Always test with sample questions first
|
988 |
+
2. **Choose Wisely**: Balance speed vs quality based on your goals
|
989 |
+
3. **Monitor Resources**: Use GPU acceleration for larger models
|
990 |
+
4. **Validate Format**: Ensure JSONL files are properly formatted
|
991 |
+
|
992 |
+
### Leaderboard Strategy
|
993 |
+
1. **Baseline First**: Get initial results with a fast model
|
994 |
+
2. **Iterate Quickly**: Test improvements on small batches
|
995 |
+
3. **Full Benchmark**: Run complete evaluation when ready
|
996 |
+
4. **Compare Results**: Analyze performance across difficulty levels
|
997 |
+
5. **Document Approach**: Include model details and methodology
|
998 |
+
|
999 |
+
### Common Pitfalls to Avoid
|
1000 |
+
- Don't run full benchmark on untested models
|
1001 |
+
- Ensure a stable internet connection for long evaluations
|
1002 |
+
- Verify submission file format before uploading
|
1003 |
+
- Check GPU memory usage for large models
|
1004 |
+
- Save intermediate results during long runs
|
1005 |
+
|
1006 |
+
## 🎯 Getting Started Checklist
|
1007 |
+
|
1008 |
+
- [ ] Load and test a model in "Model Setup"
|
1009 |
+
- [ ] Try example questions in "Single Question"
|
1010 |
+
- [ ] Run small batch in "Batch Evaluation"
|
1011 |
+
- [ ] Review test questions in "Full Benchmark"
|
1012 |
+
- [ ] Run complete benchmark when ready
|
1013 |
+
- [ ] Download submission files
|
1014 |
+
- [ ] Upload to GAIA leaderboard
|
1015 |
+
- [ ] Compare your results with others!
|
1016 |
+
|
1017 |
+
---
|
1018 |
+
|
1019 |
+
**Ready to start benchmarking?** Begin with the Model Setup tab and work your way through each stage. Good luck! 🚀
|
1020 |
""")
|
1021 |
|
1022 |
return app
|