MaziyarPanahi commited on
Commit
6c3704b
·
1 Parent(s): 3d1714b
Files changed (4) hide show
  1. FACTS.tsv +32 -0
  2. README.md +1 -1
  3. app.py +16 -26
  4. results.csv +0 -33
FACTS.tsv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model size Separate Grounding Score Separate Quality Score Combined Score
2
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-14B 14 0.817797 0.542373 0.457627
3
+ VIDraft/Gemma-3-R1984-27B 27 0.93617 0.459574 0.434043
4
+ meta-llama/Llama-3.3-70B-Instruct 70 0.842553 0.510638 0.425532
5
+ Qwen/Qwen3-30B-A3B 30 0.812766 0.540426 0.425532
6
+ Qwen/Qwen3-4B 4 0.770213 0.540426 0.425532
7
+ Qwen/Qwen3-32B 32 0.740426 0.553191 0.417021
8
+ deepseek-ai/DeepSeek-R1-Distill-Llama-8B 8 0.766949 0.516949 0.40678
9
+ Qwen/Qwen3-8B 8 0.748936 0.523404 0.4
10
+ Qwen/Qwen3-14B 14 0.778723 0.502128 0.382979
11
+ google/gemma-3-27b-it 27 0.936 0.391 0.378
12
+ Qwen/Qwen2.5-VL-32B-Instruct 32 0.621277 0.570213 0.357447
13
+ meta-llama/Llama-3.1-70B-Instruct 70 0.855932 0.389831 0.334746
14
+ google/gemma-3-12b-it 12 0.944 0.343 0.313
15
+ google/gemma-3-4b-it 4 0.9 0.33 0.3
16
+ Qwen/Qwen3-1.7B 1.7 0.702128 0.451064 0.297872
17
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-7B 7 0.59322 0.449153 0.275424
18
+ Qwen/Qwen3-0.6B 0.6 0.682203 0.330508 0.266949
19
+ Qwen/Qwen2.5-7B-Instruct 7 0.731915 0.310638 0.255319
20
+ Qwen/Qwen2.5-14B-Instruct-1M 14 0.70339 0.300847 0.254237
21
+ nvidia/Llama-Nemotron-Nano-8B 8 0.576271 0.402542 0.241525
22
+ OpenScholar/Llama-3.1-OpenScholar-8B 8 0.690678 0.283898 0.241525
23
+ Qwen/Qwen2.5-7B-Instruct-1M 7 0.737288 0.271186 0.207627
24
+ nvidia/Llama-Nemotron-Nano-4B-v1.1 4 0.548936 0.340426 0.2
25
+ google/gemma-3-1b-it 1 0.65 0.28 0.19
26
+ mistralai/Ministral-8B-Instruct-2410 8 0.94 0.184 0.175
27
+ meta-llama/Llama-3.1-8B-Instruct 8 0.665254 0.194915 0.169492
28
+ mistralai/Mistral-Small-3.1-24B-Instruct-2503 24 0.953191 0.165957 0.157447
29
+ mistralai/Mistral-Small-24B-Instruct-2501 24 0.95339 0.135593 0.131356
30
+ open-thoughts/OpenThinker-7B 7 0.478814 0.152542 0.110169
31
+ PleIAs/Pleias-RAG-350M 0.35 0.236264 0.021978 0.010989
32
+ PleIAs/Pleias-RAG-1B 1 0.190476 0.037037 0
README.md CHANGED
@@ -8,7 +8,7 @@ sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: This is Google's FACT Leaderboard, but for Open LLMs!
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: This is Google's FACTS Grounding Leaderboard, but for Open LLMs and medical domain!
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,25 +2,17 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
 
5
- # Read data from CSV file
6
- try:
7
- df = pd.read_csv('results.csv', skipinitialspace=True)
8
- print(f"Successfully loaded {len(df)} rows from CSV")
9
- print(f"Columns: {list(df.columns)}")
10
- except Exception as e:
11
- print(f"Error reading CSV: {e}")
12
- print("Attempting to read with error handling...")
13
- # Try reading with error handling for bad lines
14
- df = pd.read_csv('results.csv', skipinitialspace=True, on_bad_lines='skip')
15
- print(f"Loaded {len(df)} rows after skipping bad lines")
16
-
17
- # Map CSV columns to expected column names
18
  df = df.rename(columns={
19
- 'model_name': 'Model Name',
20
- 'size': 'Size',
21
- 'grounding_score': 'Separate Grounding Score',
22
- 'quality_score': 'Separate Quality Score',
23
- 'combined_score': 'Combined Score'
24
  })
25
 
26
  # Create size display format
@@ -85,14 +77,14 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
85
  # Round numerical values for better display
86
  for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
87
  display_df = display_df.copy() # Create a copy to avoid SettingWithCopyWarning
88
- display_df[col] = display_df[col]
89
 
90
  return display_df
91
 
92
 
93
  # Create the Gradio interface
94
- with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app:
95
- gr.Markdown("# πŸ† FACTS Grounding Benchmark")
96
  gr.Markdown(
97
  "### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
98
  )
@@ -144,9 +136,9 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
144
  headers=[
145
  "Rank",
146
  "Model Name",
 
147
  "Separate Grounding Score",
148
  "Separate Quality Score",
149
- "Size",
150
  "Combined Score",
151
  ],
152
  datatype=["number", "str", "str", "number", "number", "number"],
@@ -157,13 +149,11 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
157
 
158
  # Metric explanations at the bottom
159
  with gr.Accordion("Metric Explanations", open=False):
160
- gr.Markdown(
161
- """
162
  - **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
163
  - **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
164
  - **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
165
- """
166
- )
167
 
168
  with gr.TabItem("About"):
169
  gr.Markdown(
 
2
  import pandas as pd
3
  import numpy as np
4
 
5
+ # Load data from TSV file
6
+ df = pd.read_csv('FACTS.tsv', sep='\t')
7
+
8
+ # Clean up the data
9
+ df = df.dropna() # Remove any rows with missing values
10
+ df.columns = df.columns.str.strip() # Remove any whitespace from column names
11
+
12
+ # Rename columns to match our expected format
 
 
 
 
 
13
  df = df.rename(columns={
14
+ 'model': 'Model Name',
15
+ 'size': 'Size'
 
 
 
16
  })
17
 
18
  # Create size display format
 
77
  # Round numerical values for better display
78
  for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
79
  display_df = display_df.copy() # Create a copy to avoid SettingWithCopyWarning
80
+ display_df[col] = display_df[col].round(6)
81
 
82
  return display_df
83
 
84
 
85
  # Create the Gradio interface
86
+ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as app:
87
+ gr.Markdown("# πŸ† FACTS Grounding Leaderboard")
88
  gr.Markdown(
89
  "### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
90
  )
 
136
  headers=[
137
  "Rank",
138
  "Model Name",
139
+ "Size",
140
  "Separate Grounding Score",
141
  "Separate Quality Score",
 
142
  "Combined Score",
143
  ],
144
  datatype=["number", "str", "str", "number", "number", "number"],
 
149
 
150
  # Metric explanations at the bottom
151
  with gr.Accordion("Metric Explanations", open=False):
152
+ gr.Markdown("""
 
153
  - **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
154
  - **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
155
  - **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
156
+ """)
 
157
 
158
  with gr.TabItem("About"):
159
  gr.Markdown(
results.csv DELETED
@@ -1,33 +0,0 @@
1
- model_name,size,grounding_score,quality_score,combined_score
2
- Qwen/Qwen2.5-7B-Instruct,7,0.800000,0.800000,0.800000
3
- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,14,0.817797,0.542373,0.457627
4
- VIDraft/Gemma-3-R1984-27B,27,0.93617,0.459574,0.434043
5
- meta-llama/Llama-3.3-70B-Instruct,70,0.842553,0.510638,0.425532
6
- Qwen/Qwen3-30B-A3B,30,0.812766,0.540426,0.425532,
7
- Qwen/Qwen3-4B,4,0.770213,0.540426,0.425532
8
- Qwen/Qwen3-32B,32,0.740426,0.553191,0.417021
9
- deepseek-ai/DeepSeek-R1-Distill-Llama-8B,8,0.766949,0.516949,0.40678
10
- Qwen/Qwen3-8B,8,0.748936,0.523404,0.4
11
- Qwen/Qwen3-14B,14,0.778723,0.502128,0.382979
12
- google/gemma-3-27b-it,27,0.936,0.391,0.378
13
- Qwen/Qwen2.5-VL-32B-Instruct,32,0.621277,0.570213,0.357447
14
- meta-llama/Llama-3.1-70B-Instruct,70,0.855932,0.389831,0.334746
15
- google/gemma-3-12b-it,12,0.944,0.343,0.313
16
- google/gemma-3-4b-it,4,0.9,0.33,0.3
17
- Qwen/Qwen3-1.7B,1.7,0.702128,0.451064,0.297872
18
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,7,0.59322,0.449153,0.275424
19
- Qwen/Qwen3-0.6B,0.6,0.682203,0.330508,0.266949
20
- Qwen/Qwen2.5-7B-Instruct,7,0.731915,0.310638,0.255319
21
- Qwen/Qwen2.5-14B-Instruct-1M,14,0.70339,0.300847,0.254237
22
- nvidia/Llama-Nemotron-Nano-8B,8,0.576271,0.402542,0.241525
23
- OpenScholar/Llama-3.1-OpenScholar-8B,8,0.690678,0.283898,0.241525
24
- Qwen/Qwen2.5-7B-Instruct-1M,7,0.737288,0.271186,0.207627
25
- nvidia/Llama-Nemotron-Nano-4B-v1.1,4,0.548936,0.340426,0.2
26
- google/gemma-3-1b-it,1,0.65,0.28,0.19
27
- mistralai/Ministral-8B-Instruct-2410,8,0.94,0.184,0.175
28
- meta-llama/Llama-3.1-8B-Instruct,8,0.665254,0.194915,0.169492
29
- mistralai/Mistral-Small-3.1-24B-Instruct-2503,24,0.953191,0.165957,0.157447
30
- mistralai/Mistral-Small-24B-Instruct-2501,24,0.95339,0.135593,0.131356
31
- open-thoughts/OpenThinker-7B,7,0.478814,0.152542,0.110169
32
- PleIAs/Pleias-RAG-350M,0.35,0.236264,0.021978,0.010989
33
- PleIAs/Pleias-RAG-1B,1,0.190476,0.037037,0