Commit
·
6c3704b
1
Parent(s):
3d1714b
update
Browse files
FACTS.tsv
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model size Separate Grounding Score Separate Quality Score Combined Score
|
2 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B 14 0.817797 0.542373 0.457627
|
3 |
+
VIDraft/Gemma-3-R1984-27B 27 0.93617 0.459574 0.434043
|
4 |
+
meta-llama/Llama-3.3-70B-Instruct 70 0.842553 0.510638 0.425532
|
5 |
+
Qwen/Qwen3-30B-A3B 30 0.812766 0.540426 0.425532
|
6 |
+
Qwen/Qwen3-4B 4 0.770213 0.540426 0.425532
|
7 |
+
Qwen/Qwen3-32B 32 0.740426 0.553191 0.417021
|
8 |
+
deepseek-ai/DeepSeek-R1-Distill-Llama-8B 8 0.766949 0.516949 0.40678
|
9 |
+
Qwen/Qwen3-8B 8 0.748936 0.523404 0.4
|
10 |
+
Qwen/Qwen3-14B 14 0.778723 0.502128 0.382979
|
11 |
+
google/gemma-3-27b-it 27 0.936 0.391 0.378
|
12 |
+
Qwen/Qwen2.5-VL-32B-Instruct 32 0.621277 0.570213 0.357447
|
13 |
+
meta-llama/Llama-3.1-70B-Instruct 70 0.855932 0.389831 0.334746
|
14 |
+
google/gemma-3-12b-it 12 0.944 0.343 0.313
|
15 |
+
google/gemma-3-4b-it 4 0.9 0.33 0.3
|
16 |
+
Qwen/Qwen3-1.7B 1.7 0.702128 0.451064 0.297872
|
17 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B 7 0.59322 0.449153 0.275424
|
18 |
+
Qwen/Qwen3-0.6B 0.6 0.682203 0.330508 0.266949
|
19 |
+
Qwen/Qwen2.5-7B-Instruct 7 0.731915 0.310638 0.255319
|
20 |
+
Qwen/Qwen2.5-14B-Instruct-1M 14 0.70339 0.300847 0.254237
|
21 |
+
nvidia/Llama-Nemotron-Nano-8B 8 0.576271 0.402542 0.241525
|
22 |
+
OpenScholar/Llama-3.1-OpenScholar-8B 8 0.690678 0.283898 0.241525
|
23 |
+
Qwen/Qwen2.5-7B-Instruct-1M 7 0.737288 0.271186 0.207627
|
24 |
+
nvidia/Llama-Nemotron-Nano-4B-v1.1 4 0.548936 0.340426 0.2
|
25 |
+
google/gemma-3-1b-it 1 0.65 0.28 0.19
|
26 |
+
mistralai/Ministral-8B-Instruct-2410 8 0.94 0.184 0.175
|
27 |
+
meta-llama/Llama-3.1-8B-Instruct 8 0.665254 0.194915 0.169492
|
28 |
+
mistralai/Mistral-Small-3.1-24B-Instruct-2503 24 0.953191 0.165957 0.157447
|
29 |
+
mistralai/Mistral-Small-24B-Instruct-2501 24 0.95339 0.135593 0.131356
|
30 |
+
open-thoughts/OpenThinker-7B 7 0.478814 0.152542 0.110169
|
31 |
+
PleIAs/Pleias-RAG-350M 0.35 0.236264 0.021978 0.010989
|
32 |
+
PleIAs/Pleias-RAG-1B 1 0.190476 0.037037 0
|
README.md
CHANGED
@@ -8,7 +8,7 @@ sdk_version: 5.31.0
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
-
short_description: This is Google's
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
short_description: This is Google's FACTS Grounding Leaderboard, but for open LLMs in the medical domain!
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -2,25 +2,17 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
-
#
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
# Try reading with error handling for bad lines
|
14 |
-
df = pd.read_csv('results.csv', skipinitialspace=True, on_bad_lines='skip')
|
15 |
-
print(f"Loaded {len(df)} rows after skipping bad lines")
|
16 |
-
|
17 |
-
# Map CSV columns to expected column names
|
18 |
df = df.rename(columns={
|
19 |
-
'
|
20 |
-
'size': 'Size'
|
21 |
-
'grounding_score': 'Separate Grounding Score',
|
22 |
-
'quality_score': 'Separate Quality Score',
|
23 |
-
'combined_score': 'Combined Score'
|
24 |
})
|
25 |
|
26 |
# Create size display format
|
@@ -85,14 +77,14 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
|
|
85 |
# Round numerical values for better display
|
86 |
for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
|
87 |
display_df = display_df.copy() # Create a copy to avoid SettingWithCopyWarning
|
88 |
-
display_df[col] = display_df[col]
|
89 |
|
90 |
return display_df
|
91 |
|
92 |
|
93 |
# Create the Gradio interface
|
94 |
-
with gr.Blocks(title="FACTS Grounding
|
95 |
-
gr.Markdown("# π FACTS Grounding
|
96 |
gr.Markdown(
|
97 |
"### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
|
98 |
)
|
@@ -144,9 +136,9 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
|
|
144 |
headers=[
|
145 |
"Rank",
|
146 |
"Model Name",
|
|
|
147 |
"Separate Grounding Score",
|
148 |
"Separate Quality Score",
|
149 |
-
"Size",
|
150 |
"Combined Score",
|
151 |
],
|
152 |
datatype=["number", "str", "str", "number", "number", "number"],
|
@@ -157,13 +149,11 @@ with gr.Blocks(title="FACTS Grounding Benchmark", theme=gr.themes.Base()) as app
|
|
157 |
|
158 |
# Metric explanations at the bottom
|
159 |
with gr.Accordion("Metric Explanations", open=False):
|
160 |
-
gr.Markdown(
|
161 |
-
"""
|
162 |
- **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
|
163 |
- **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
|
164 |
- **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
|
165 |
-
"""
|
166 |
-
)
|
167 |
|
168 |
with gr.TabItem("About"):
|
169 |
gr.Markdown(
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
+
# Load data from TSV file
|
6 |
+
df = pd.read_csv('FACTS.tsv', sep='\t')
|
7 |
+
|
8 |
+
# Clean up the data
|
9 |
+
df = df.dropna() # Remove any rows with missing values
|
10 |
+
df.columns = df.columns.str.strip() # Remove any whitespace from column names
|
11 |
+
|
12 |
+
# Rename columns to match our expected format
|
|
|
|
|
|
|
|
|
|
|
13 |
df = df.rename(columns={
|
14 |
+
'model': 'Model Name',
|
15 |
+
'size': 'Size'
|
|
|
|
|
|
|
16 |
})
|
17 |
|
18 |
# Create size display format
|
|
|
77 |
# Round numerical values for better display
|
78 |
for col in ["Separate Grounding Score", "Separate Quality Score", "Combined Score"]:
|
79 |
display_df = display_df.copy() # Create a copy to avoid SettingWithCopyWarning
|
80 |
+
display_df[col] = display_df[col].round(6)
|
81 |
|
82 |
return display_df
|
83 |
|
84 |
|
85 |
# Create the Gradio interface
|
86 |
+
with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as app:
|
87 |
+
gr.Markdown("# π FACTS Grounding Leaderboard")
|
88 |
gr.Markdown(
|
89 |
"### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
|
90 |
)
|
|
|
136 |
headers=[
|
137 |
"Rank",
|
138 |
"Model Name",
|
139 |
+
"Size",
|
140 |
"Separate Grounding Score",
|
141 |
"Separate Quality Score",
|
|
|
142 |
"Combined Score",
|
143 |
],
|
144 |
datatype=["number", "str", "str", "number", "number", "number"],
|
|
|
149 |
|
150 |
# Metric explanations at the bottom
|
151 |
with gr.Accordion("Metric Explanations", open=False):
|
152 |
+
gr.Markdown("""
|
|
|
153 |
- **Grounding Score**: Measures the model's ability to provide factually accurate responses based on given context
|
154 |
- **Quality Score**: Evaluates the overall quality of the model's responses including coherence and relevance
|
155 |
- **Combined Score**: A weighted combination of grounding and quality scores representing overall performance
|
156 |
+
""")
|
|
|
157 |
|
158 |
with gr.TabItem("About"):
|
159 |
gr.Markdown(
|
results.csv
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
model_name,size,grounding_score,quality_score,combined_score
|
2 |
-
Qwen/Qwen2.5-7B-Instruct,7,0.800000,0.800000,0.800000
|
3 |
-
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,14,0.817797,0.542373,0.457627
|
4 |
-
VIDraft/Gemma-3-R1984-27B,27,0.93617,0.459574,0.434043
|
5 |
-
meta-llama/Llama-3.3-70B-Instruct,70,0.842553,0.510638,0.425532
|
6 |
-
Qwen/Qwen3-30B-A3B,30,0.812766,0.540426,0.425532,
|
7 |
-
Qwen/Qwen3-4B,4,0.770213,0.540426,0.425532
|
8 |
-
Qwen/Qwen3-32B,32,0.740426,0.553191,0.417021
|
9 |
-
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,8,0.766949,0.516949,0.40678
|
10 |
-
Qwen/Qwen3-8B,8,0.748936,0.523404,0.4
|
11 |
-
Qwen/Qwen3-14B,14,0.778723,0.502128,0.382979
|
12 |
-
google/gemma-3-27b-it,27,0.936,0.391,0.378
|
13 |
-
Qwen/Qwen2.5-VL-32B-Instruct,32,0.621277,0.570213,0.357447
|
14 |
-
meta-llama/Llama-3.1-70B-Instruct,70,0.855932,0.389831,0.334746
|
15 |
-
google/gemma-3-12b-it,12,0.944,0.343,0.313
|
16 |
-
google/gemma-3-4b-it,4,0.9,0.33,0.3
|
17 |
-
Qwen/Qwen3-1.7B,1.7,0.702128,0.451064,0.297872
|
18 |
-
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,7,0.59322,0.449153,0.275424
|
19 |
-
Qwen/Qwen3-0.6B,0.6,0.682203,0.330508,0.266949
|
20 |
-
Qwen/Qwen2.5-7B-Instruct,7,0.731915,0.310638,0.255319
|
21 |
-
Qwen/Qwen2.5-14B-Instruct-1M,14,0.70339,0.300847,0.254237
|
22 |
-
nvidia/Llama-Nemotron-Nano-8B,8,0.576271,0.402542,0.241525
|
23 |
-
OpenScholar/Llama-3.1-OpenScholar-8B,8,0.690678,0.283898,0.241525
|
24 |
-
Qwen/Qwen2.5-7B-Instruct-1M,7,0.737288,0.271186,0.207627
|
25 |
-
nvidia/Llama-Nemotron-Nano-4B-v1.1,4,0.548936,0.340426,0.2
|
26 |
-
google/gemma-3-1b-it,1,0.65,0.28,0.19
|
27 |
-
mistralai/Ministral-8B-Instruct-2410,8,0.94,0.184,0.175
|
28 |
-
meta-llama/Llama-3.1-8B-Instruct,8,0.665254,0.194915,0.169492
|
29 |
-
mistralai/Mistral-Small-3.1-24B-Instruct-2503,24,0.953191,0.165957,0.157447
|
30 |
-
mistralai/Mistral-Small-24B-Instruct-2501,24,0.95339,0.135593,0.131356
|
31 |
-
open-thoughts/OpenThinker-7B,7,0.478814,0.152542,0.110169
|
32 |
-
PleIAs/Pleias-RAG-350M,0.35,0.236264,0.021978,0.010989
|
33 |
-
PleIAs/Pleias-RAG-1B,1,0.190476,0.037037,0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|