Spaces:

MaziyarPanahi
/

FACTS-Leaderboard

Running

App Files Community

MaziyarPanahi committed on May 28

Commit

6c3704b

1 Parent(s): 3d1714b

update

Browse files

Files changed (4) hide show

FACTS.tsv +32 -0
README.md +1 -1
app.py +16 -26
results.csv +0 -33

FACTS.tsv ADDED Viewed

	@@ -0,0 +1,32 @@

+model	size	Separate Grounding Score	Separate Quality Score	Combined Score
+deepseek-ai/DeepSeek-R1-Distill-Qwen-14B	14	0.817797	0.542373	0.457627
+VIDraft/Gemma-3-R1984-27B	27	0.93617	0.459574	0.434043
+meta-llama/Llama-3.3-70B-Instruct	70	0.842553	0.510638	0.425532
+Qwen/Qwen3-30B-A3B	30	0.812766	0.540426	0.425532
+Qwen/Qwen3-4B	4	0.770213	0.540426	0.425532
+Qwen/Qwen3-32B	32	0.740426	0.553191	0.417021
+deepseek-ai/DeepSeek-R1-Distill-Llama-8B	8	0.766949	0.516949	0.40678
+Qwen/Qwen3-8B	8	0.748936	0.523404	0.4
+Qwen/Qwen3-14B	14	0.778723	0.502128	0.382979
+google/gemma-3-27b-it	27	0.936	0.391	0.378
+Qwen/Qwen2.5-VL-32B-Instruct	32	0.621277	0.570213	0.357447
+meta-llama/Llama-3.1-70B-Instruct	70	0.855932	0.389831	0.334746
+google/gemma-3-12b-it	12	0.944	0.343	0.313
+google/gemma-3-4b-it	4	0.9	0.33	0.3
+Qwen/Qwen3-1.7B	1.7	0.702128	0.451064	0.297872
+deepseek-ai/DeepSeek-R1-Distill-Qwen-7B	7	0.59322	0.449153	0.275424
+Qwen/Qwen3-0.6B	0.6	0.682203	0.330508	0.266949
+Qwen/Qwen2.5-7B-Instruct	7	0.731915	0.310638	0.255319
+Qwen/Qwen2.5-14B-Instruct-1M	14	0.70339	0.300847	0.254237
+nvidia/Llama-Nemotron-Nano-8B	8	0.576271	0.402542	0.241525
+OpenScholar/Llama-3.1-OpenScholar-8B	8	0.690678	0.283898	0.241525
+Qwen/Qwen2.5-7B-Instruct-1M	7	0.737288	0.271186	0.207627
+nvidia/Llama-Nemotron-Nano-4B-v1.1	4	0.548936	0.340426	0.2
+google/gemma-3-1b-it	1	0.65	0.28	0.19
+mistralai/Ministral-8B-Instruct-2410	8	0.94	0.184	0.175
+meta-llama/Llama-3.1-8B-Instruct	8	0.665254	0.194915	0.169492
+mistralai/Mistral-Small-3.1-24B-Instruct-2503	24	0.953191	0.165957	0.157447
+mistralai/Mistral-Small-24B-Instruct-2501	24	0.95339	0.135593	0.131356
+open-thoughts/OpenThinker-7B	7	0.478814	0.152542	0.110169
+PleIAs/Pleias-RAG-350M	0.35	0.236264	0.021978	0.010989
+PleIAs/Pleias-RAG-1B	1	0.190476	0.037037	0

README.md CHANGED Viewed

@@ -8,7 +8,7 @@ sdk_version: 5.31.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: This is Google's FACT Leaderboard, but for Open LLMs!
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: This is Google's FACTS Grounding Leaderboard, but for open LLMs in the medical domain!
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -2,25 +2,17 @@ import gradio as gr
 import pandas as pd
 import numpy as np
-# Read data from CSV file
-try:
-    df = pd.read_csv('results.csv', skipinitialspace=True)
-    print(f"Successfully loaded {len(df)} rows from CSV")
-    print(f"Columns: {list(df.columns)}")
-except Exception as e:
-    print(f"Error reading CSV: {e}")
-    print("Attempting to read with error handling...")
-    # Try reading with error handling for bad lines
-    df = pd.read_csv('results.csv', skipinitialspace=True, on_bad_lines='skip')
-    print(f"Loaded {len(df)} rows after skipping bad lines")
-# Map CSV columns to expected column names
 df = df.rename(columns={
-    'model_name': 'Model Name',
-    'size': 'Size',
-    'grounding_score': 'Separate Grounding Score',
-    'quality_score': 'Separate Quality Score',
-    'combined_score': 'Combined Score'
 })
 # Create size display format
@@ -85,14 +77,14 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
     # Round numerical values for better display
     for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
         display_df = display_df.copy()  # Create a copy to avoid SettingWithCopyWarning
-        display_df[col] = display_df[col]
     return display_df
 # Create the Gradio interface
-with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app:
-    gr.Markdown("# 🏆 FACTS Grounding Benchmark")
     gr.Markdown(
         "### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
     )
@@ -144,9 +136,9 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
                 headers=[
                     "Rank",
                     "Model Name",
                     "Separate Grounding Score",
                     "Separate Quality Score",
-                    "Size",
                     "Combined Score",
                 ],
                 datatype=["number", "str", "str", "number", "number", "number"],
@@ -157,13 +149,11 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
             # Metric explanations at the bottom
             with gr.Accordion("Metric Explanations", open=False):
-                gr.Markdown(
-                    """
                 - **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
                 - **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
                 - **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
-                """
-                )
         with gr.TabItem("About"):
             gr.Markdown(

 import pandas as pd
 import numpy as np
+# Load data from TSV file
+df = pd.read_csv('FACTS.tsv', sep='\t')
+# Clean up the data
+df = df.dropna()  # Remove any rows with missing values
+df.columns = df.columns.str.strip()  # Remove any whitespace from column names
+# Rename columns to match our expected format
 df = df.rename(columns={
+    'model': 'Model Name',
+    'size': 'Size'
 })
 # Create size display format
     # Round numerical values for better display
     for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
         display_df = display_df.copy()  # Create a copy to avoid SettingWithCopyWarning
+        display_df[col] = display_df[col].round(6)
     return display_df
 # Create the Gradio interface
+with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as app:
+    gr.Markdown("# 🏆 FACTS Grounding Leaderboard")
     gr.Markdown(
         "### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
     )
                 headers=[
                     "Rank",
                     "Model Name",
+                    "Size",
                     "Separate Grounding Score",
                     "Separate Quality Score",
                     "Combined Score",
                 ],
                 datatype=["number", "str", "str", "number", "number", "number"],
             # Metric explanations at the bottom
             with gr.Accordion("Metric Explanations", open=False):
+                gr.Markdown("""
                 - **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
                 - **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
                 - **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
+                """)
         with gr.TabItem("About"):
             gr.Markdown(

results.csv DELETED Viewed

@@ -1,33 +0,0 @@
-model_name,size,grounding_score,quality_score,combined_score
-Qwen/Qwen2.5-7B-Instruct,7,0.800000,0.800000,0.800000
-deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,14,0.817797,0.542373,0.457627
-VIDraft/Gemma-3-R1984-27B,27,0.93617,0.459574,0.434043
-meta-llama/Llama-3.3-70B-Instruct,70,0.842553,0.510638,0.425532
-Qwen/Qwen3-30B-A3B,30,0.812766,0.540426,0.425532,
-Qwen/Qwen3-4B,4,0.770213,0.540426,0.425532
-Qwen/Qwen3-32B,32,0.740426,0.553191,0.417021
-deepseek-ai/DeepSeek-R1-Distill-Llama-8B,8,0.766949,0.516949,0.40678
-Qwen/Qwen3-8B,8,0.748936,0.523404,0.4
-Qwen/Qwen3-14B,14,0.778723,0.502128,0.382979
-google/gemma-3-27b-it,27,0.936,0.391,0.378
-Qwen/Qwen2.5-VL-32B-Instruct,32,0.621277,0.570213,0.357447
-meta-llama/Llama-3.1-70B-Instruct,70,0.855932,0.389831,0.334746
-google/gemma-3-12b-it,12,0.944,0.343,0.313
-google/gemma-3-4b-it,4,0.9,0.33,0.3
-Qwen/Qwen3-1.7B,1.7,0.702128,0.451064,0.297872
-deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,7,0.59322,0.449153,0.275424
-Qwen/Qwen3-0.6B,0.6,0.682203,0.330508,0.266949
-Qwen/Qwen2.5-7B-Instruct,7,0.731915,0.310638,0.255319
-Qwen/Qwen2.5-14B-Instruct-1M,14,0.70339,0.300847,0.254237
-nvidia/Llama-Nemotron-Nano-8B,8,0.576271,0.402542,0.241525
-OpenScholar/Llama-3.1-OpenScholar-8B,8,0.690678,0.283898,0.241525
-Qwen/Qwen2.5-7B-Instruct-1M,7,0.737288,0.271186,0.207627
-nvidia/Llama-Nemotron-Nano-4B-v1.1,4,0.548936,0.340426,0.2
-google/gemma-3-1b-it,1,0.65,0.28,0.19
-mistralai/Ministral-8B-Instruct-2410,8,0.94,0.184,0.175
-meta-llama/Llama-3.1-8B-Instruct,8,0.665254,0.194915,0.169492
-mistralai/Mistral-Small-3.1-24B-Instruct-2503,24,0.953191,0.165957,0.157447
-mistralai/Mistral-Small-24B-Instruct-2501,24,0.95339,0.135593,0.131356
-open-thoughts/OpenThinker-7B,7,0.478814,0.152542,0.110169
-PleIAs/Pleias-RAG-350M,0.35,0.236264,0.021978,0.010989
-PleIAs/Pleias-RAG-1B,1,0.190476,0.037037,0