simplify and unpin
Browse files- app.py +268 -405
- requirements.txt +5 -5
app.py
CHANGED
@@ -1,44 +1,87 @@
|
|
1 |
import datetime
|
2 |
import os
|
3 |
import re
|
4 |
-
|
5 |
-
import gradio as gr
|
6 |
-
import pandas as pd
|
7 |
import yaml
|
|
|
|
|
|
|
8 |
|
9 |
# Constants
|
10 |
-
|
11 |
-
TEMPLATE_PATH = "template.yaml"
|
12 |
-
|
13 |
-
|
14 |
-
# Ensure the eval cards directory exists
|
15 |
-
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
|
16 |
-
|
17 |
-
# Copy the template to the appropriate location
|
18 |
-
with open("template.yaml", "w") as f:
|
19 |
-
with open("yaml_template.yaml", "r") as template_file:
|
20 |
-
f.write(template_file.read())
|
21 |
|
22 |
|
23 |
-
def
|
24 |
-
"""
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
|
29 |
-
def
|
30 |
-
"""
|
31 |
try:
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
|
37 |
def compute_coverage_score(eval_data):
|
38 |
-
"""
|
39 |
-
Compute a coverage score for the eval card
|
40 |
-
Returns a score from 0-100 and a breakdown of coverage by section
|
41 |
-
"""
|
42 |
sections = {
|
43 |
"metadata": 5,
|
44 |
"evaluation_design": 10,
|
@@ -51,32 +94,23 @@ def compute_coverage_score(eval_data):
|
|
51 |
"citation_and_usage": 5,
|
52 |
}
|
53 |
|
54 |
-
|
55 |
-
total_score = 0
|
56 |
-
|
57 |
-
def count_filled_fields(data, prefix=""):
|
58 |
if isinstance(data, dict):
|
59 |
-
filled = 0
|
60 |
-
|
61 |
-
for key, value in data.items():
|
62 |
if isinstance(value, (dict, list)):
|
63 |
-
sub_filled, sub_total = count_filled_fields(
|
64 |
-
value, f"{prefix}.{key}" if prefix else key
|
65 |
-
)
|
66 |
filled += sub_filled
|
67 |
total += sub_total
|
68 |
else:
|
69 |
total += 1
|
70 |
-
if value and not
|
71 |
-
isinstance(value, str) and value.strip() in ["", "[]", "{}"]
|
72 |
-
):
|
73 |
filled += 1
|
74 |
return filled, total
|
75 |
elif isinstance(data, list):
|
76 |
if not data:
|
77 |
return 0, 1
|
78 |
-
filled = 0
|
79 |
-
total = 0
|
80 |
for item in data:
|
81 |
sub_filled, sub_total = count_filled_fields(item)
|
82 |
filled += sub_filled
|
@@ -85,426 +119,255 @@ def compute_coverage_score(eval_data):
|
|
85 |
else:
|
86 |
return 1 if data else 0, 1
|
87 |
|
88 |
-
|
89 |
for section, weight in sections.items():
|
90 |
if section in eval_data:
|
91 |
filled, total = count_filled_fields(eval_data[section])
|
92 |
completion_rate = filled / total if total > 0 else 0
|
93 |
-
|
94 |
-
"score": round(completion_rate * weight, 2),
|
95 |
-
"max_score": weight,
|
96 |
-
"completion_rate": round(completion_rate * 100, 2),
|
97 |
-
"fields_filled": filled,
|
98 |
-
"fields_total": total,
|
99 |
-
}
|
100 |
-
total_score += scores[section]["score"]
|
101 |
-
else:
|
102 |
-
scores[section] = {
|
103 |
-
"score": 0,
|
104 |
-
"max_score": weight,
|
105 |
-
"completion_rate": 0,
|
106 |
-
"fields_filled": 0,
|
107 |
-
"fields_total": 0,
|
108 |
-
}
|
109 |
|
110 |
-
return min(round(total_score, 2), 100)
|
111 |
|
112 |
|
113 |
-
def
|
114 |
-
"""
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
import os
|
119 |
|
120 |
-
|
121 |
-
from dotenv import load_dotenv
|
122 |
|
123 |
-
|
124 |
-
|
|
|
|
|
125 |
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
128 |
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
-
|
133 |
-
headers = {
|
134 |
-
"Content-Type": "application/json",
|
135 |
-
"Authorization": f"Bearer {api_token}",
|
136 |
-
}
|
137 |
-
|
138 |
-
prompt = f"""
|
139 |
-
I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
|
140 |
-
consistency, and clarity. Provide specific recommendations for improvement.
|
141 |
-
|
142 |
-
Focus on:
|
143 |
-
1. Sections that need more detail
|
144 |
-
2. Inconsistencies or contradictions
|
145 |
-
3. Clarity of language and explanations
|
146 |
-
4. Alignment with best practices for ML evaluation
|
147 |
-
|
148 |
-
Here's the YAML content:
|
149 |
-
|
150 |
-
```yaml
|
151 |
-
{yaml_content}
|
152 |
-
```
|
153 |
-
|
154 |
-
Provide your feedback in a structured format with specific, actionable recommendations.
|
155 |
-
"""
|
156 |
-
|
157 |
-
payload = {
|
158 |
-
"model": "llama-3.3-70b-versatile", # or another groq supported model
|
159 |
-
"messages": [{"role": "user", "content": prompt}],
|
160 |
-
}
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
166 |
)
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
|
173 |
except Exception as e:
|
174 |
-
return f"Error
|
175 |
|
176 |
|
177 |
-
def
|
178 |
-
"""
|
179 |
try:
|
180 |
-
|
|
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
eval_data["metadata"]["paper_link"] = paper_url
|
185 |
-
if repo_url:
|
186 |
-
eval_data["metadata"]["repository_link"] = repo_url
|
187 |
|
188 |
-
|
189 |
-
yaml_content = yaml.dump(eval_data)
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
)
|
195 |
-
file_path = os.path.join(EVAL_CARDS_DIR, filename)
|
196 |
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
-
return f"Evaluation card saved successfully as {filename}", file_path
|
201 |
except Exception as e:
|
202 |
-
return f"Error
|
203 |
-
|
204 |
-
|
205 |
-
def load_all_eval_cards():
|
206 |
-
"""Load all eval cards from the repository"""
|
207 |
-
eval_cards = []
|
208 |
-
|
209 |
-
for filename in os.listdir(EVAL_CARDS_DIR):
|
210 |
-
if filename.endswith(".yaml"):
|
211 |
-
file_path = os.path.join(EVAL_CARDS_DIR, filename)
|
212 |
-
try:
|
213 |
-
with open(file_path, "r") as file:
|
214 |
-
yaml_content = file.read()
|
215 |
-
eval_data = yaml.safe_load(yaml_content)
|
216 |
-
|
217 |
-
# Compute coverage score
|
218 |
-
score, score_details = compute_coverage_score(eval_data)
|
219 |
-
score = min(score, 100)
|
220 |
-
|
221 |
-
# Extract key metadata
|
222 |
-
eval_cards.append(
|
223 |
-
{
|
224 |
-
"filename": filename,
|
225 |
-
"title": eval_data.get("title", "Unnamed Evaluation"),
|
226 |
-
"summary": eval_data.get("summary", ""),
|
227 |
-
"authors": ", ".join(
|
228 |
-
eval_data.get("metadata", {}).get("authors", [])
|
229 |
-
),
|
230 |
-
"creation_date": eval_data.get("metadata", {}).get(
|
231 |
-
"creation_date", ""
|
232 |
-
),
|
233 |
-
"coverage_score": score,
|
234 |
-
"score_details": score_details,
|
235 |
-
"yaml_content": yaml_content,
|
236 |
-
"data": eval_data,
|
237 |
-
}
|
238 |
-
)
|
239 |
-
except Exception as e:
|
240 |
-
print(f"Error loading {filename}: {str(e)}")
|
241 |
-
|
242 |
-
return eval_cards
|
243 |
-
|
244 |
-
|
245 |
-
def format_eval_card_as_html(eval_card):
|
246 |
-
"""Format an eval card as HTML for display"""
|
247 |
-
html = f"""
|
248 |
-
<div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
|
249 |
-
<h3>{eval_card["title"]}</h3>
|
250 |
-
<p>{eval_card["summary"]}</p>
|
251 |
-
<p><strong>Authors:</strong> {eval_card["authors"]}</p>
|
252 |
-
<p><strong>Created:</strong> {eval_card["creation_date"]}</p>
|
253 |
-
|
254 |
-
<!-- Add repository and paper links if available -->
|
255 |
-
{f'<p><strong>Repository:</strong> <a href="{eval_card["data"]["metadata"].get("repository_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("repository_link", "")}</a></p>' if eval_card["data"]["metadata"].get("repository_link") else ""}
|
256 |
-
{f'<p><strong>Paper:</strong> <a href="{eval_card["data"]["metadata"].get("paper_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("paper_link", "")}</a></p>' if eval_card["data"]["metadata"].get("paper_link") else ""}
|
257 |
-
|
258 |
-
<p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>
|
259 |
-
|
260 |
-
<h4>Coverage by Section:</h4>
|
261 |
-
<table style="width: 100%; border-collapse: collapse;">
|
262 |
-
<tr>
|
263 |
-
<th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
|
264 |
-
<th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
|
265 |
-
<th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
|
266 |
-
</tr>
|
267 |
-
"""
|
268 |
-
|
269 |
-
for section, details in eval_card["score_details"].items():
|
270 |
-
html += f"""
|
271 |
-
<tr>
|
272 |
-
<td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
|
273 |
-
<td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
|
274 |
-
<td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
|
275 |
-
</tr>
|
276 |
-
"""
|
277 |
-
|
278 |
-
html += """
|
279 |
-
</table>
|
280 |
-
<div style="margin-top: 15px;">
|
281 |
-
<!-- Additional actions can go here -->
|
282 |
-
</div>
|
283 |
-
</div>
|
284 |
-
"""
|
285 |
-
|
286 |
-
return html
|
287 |
-
|
288 |
-
|
289 |
-
def create_eval_cards_table(eval_cards):
|
290 |
-
"""Create an HTML table of eval cards"""
|
291 |
-
if not eval_cards:
|
292 |
-
return "<p>No evaluation cards found.</p>"
|
293 |
-
|
294 |
-
# Sort by coverage score (highest first)
|
295 |
-
eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)
|
296 |
-
|
297 |
-
html = ""
|
298 |
-
for eval_card in eval_cards:
|
299 |
-
html += format_eval_card_as_html(eval_card)
|
300 |
-
|
301 |
-
return html
|
302 |
-
|
303 |
-
|
304 |
-
def upload_file(file):
|
305 |
-
"""Process an uploaded YAML file"""
|
306 |
-
if file is None:
|
307 |
-
return "No file uploaded", None
|
308 |
|
309 |
-
try:
|
310 |
-
yaml_content = file.decode("utf-8")
|
311 |
-
# Validate YAML
|
312 |
-
eval_data = yaml.safe_load(yaml_content)
|
313 |
-
return yaml_content, eval_data
|
314 |
-
except Exception as e:
|
315 |
-
return f"Error processing file: {str(e)}", None
|
316 |
|
|
|
|
|
|
|
|
|
|
|
317 |
|
318 |
-
|
319 |
-
|
320 |
-
if not yaml_content:
|
321 |
-
return "Please upload or paste a YAML file first."
|
322 |
|
323 |
-
|
324 |
-
|
325 |
|
326 |
-
|
327 |
-
|
328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
)
|
330 |
|
331 |
-
|
332 |
-
|
|
|
|
|
|
|
|
|
|
|
333 |
|
334 |
|
335 |
-
|
336 |
-
|
337 |
-
if not
|
338 |
-
return "Please
|
339 |
|
340 |
try:
|
341 |
-
# Validate YAML
|
342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
|
344 |
-
# Compute coverage score
|
345 |
-
score, score_details = compute_coverage_score(eval_data)
|
346 |
|
347 |
-
|
348 |
-
|
349 |
|
350 |
-
if file_path:
|
351 |
-
return (
|
352 |
-
f"Evaluation card saved successfully! Coverage score: {score}%",
|
353 |
-
score,
|
354 |
-
score_details,
|
355 |
-
)
|
356 |
-
else:
|
357 |
-
return f"Error saving evaluation card: {result}", None, None
|
358 |
|
359 |
-
|
360 |
-
|
361 |
|
362 |
|
363 |
def refresh_gallery():
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
else:
|
380 |
-
author_counts[author] = 1
|
381 |
-
|
382 |
-
# Get top authors
|
383 |
-
top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]
|
384 |
-
top_authors = [author for author, count in top_authors]
|
385 |
-
|
386 |
-
# Create table data with one entry per card
|
387 |
-
for card in eval_cards:
|
388 |
-
authors = card["authors"].split(", ")
|
389 |
-
filtered_authors = [author for author in authors if author in top_authors]
|
390 |
-
table_data.append(
|
391 |
-
{
|
392 |
-
"Title": card["title"],
|
393 |
-
"Authors": ", ".join(filtered_authors),
|
394 |
-
"Creation Date": card["creation_date"],
|
395 |
-
"Coverage Score": f"{card['coverage_score']}%",
|
396 |
-
}
|
397 |
-
)
|
398 |
|
399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
|
401 |
-
|
|
|
|
|
402 |
|
|
|
|
|
|
|
|
|
|
|
403 |
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
else:
|
410 |
-
return yaml_text
|
411 |
|
|
|
412 |
|
413 |
-
|
414 |
-
|
415 |
-
with gr.Row():
|
416 |
-
with gr.Column(scale=2):
|
417 |
-
gr.Markdown(
|
418 |
-
"# Evaluation Cards for Machine Learning in Materials Science. "
|
419 |
)
|
420 |
-
gr.Markdown("""
|
421 |
-
Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery.
|
422 |
-
checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information.
|
423 |
-
""")
|
424 |
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
)
|
432 |
-
|
433 |
-
with gr.Accordion("Or paste YAML content", open=False):
|
434 |
-
yaml_input = gr.TextArea(
|
435 |
-
label="YAML Content",
|
436 |
-
placeholder="Paste your YAML content here...",
|
437 |
-
lines=10,
|
438 |
-
)
|
439 |
-
paper_url_input = gr.Textbox(
|
440 |
-
label="Paper URL (Optional)",
|
441 |
-
placeholder="https://arxiv.org/abs/...",
|
442 |
-
)
|
443 |
-
|
444 |
-
repo_url_input = gr.Textbox(
|
445 |
-
label="Repository URL (Optional)",
|
446 |
-
placeholder="https://github.com/...",
|
447 |
-
)
|
448 |
-
|
449 |
-
load_template_btn = gr.Button("Load Template")
|
450 |
-
|
451 |
-
# api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
|
452 |
-
|
453 |
-
with gr.Row():
|
454 |
-
get_feedback_btn = gr.Button("Get LLM Feedback")
|
455 |
-
submit_btn = gr.Button(
|
456 |
-
"Submit Evaluation Card", variant="primary"
|
457 |
-
)
|
458 |
-
|
459 |
-
with gr.Column():
|
460 |
-
yaml_display = gr.TextArea(label="Current YAML", lines=20)
|
461 |
-
|
462 |
-
with gr.Accordion("LLM Feedback", open=True):
|
463 |
-
feedback_display = gr.Markdown()
|
464 |
-
|
465 |
-
with gr.Accordion("Submission Result", open=True):
|
466 |
-
result_display = gr.Markdown()
|
467 |
-
coverage_score = gr.Number(
|
468 |
-
label="Coverage Score", visible=False
|
469 |
-
)
|
470 |
-
coverage_details = gr.JSON(
|
471 |
-
label="Coverage Details", visible=False
|
472 |
-
)
|
473 |
-
|
474 |
-
with gr.TabItem("Gallery"):
|
475 |
-
refresh_btn = gr.Button("Refresh Gallery")
|
476 |
-
|
477 |
-
with gr.Tabs():
|
478 |
-
with gr.TabItem("Card View"):
|
479 |
-
gallery_html = gr.HTML()
|
480 |
-
|
481 |
-
with gr.TabItem("Table View"):
|
482 |
-
gallery_table = gr.DataFrame()
|
483 |
-
|
484 |
-
# Set up event handlers
|
485 |
-
load_template_btn.click(fn=load_template, outputs=[yaml_display])
|
486 |
-
|
487 |
-
file_upload.change(
|
488 |
-
fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display]
|
489 |
-
)
|
490 |
|
491 |
-
|
492 |
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
|
|
|
|
|
|
|
497 |
submit_btn.click(
|
498 |
-
|
499 |
-
inputs=[
|
500 |
-
outputs=[
|
501 |
)
|
|
|
502 |
|
503 |
-
refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
|
504 |
-
|
505 |
-
# Initialize the gallery on app start
|
506 |
-
app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
|
507 |
-
|
508 |
-
# Launch the app
|
509 |
if __name__ == "__main__":
|
510 |
-
|
|
|
1 |
import datetime
|
2 |
import os
|
3 |
import re
|
|
|
|
|
|
|
4 |
import yaml
|
5 |
+
from datasets import Dataset, load_dataset
|
6 |
+
from huggingface_hub import create_repo, login
|
7 |
+
import gradio as gr
|
8 |
|
9 |
# Constants
|
10 |
+
DATASET_NAME = "jablonkagroup/eval-cards-dataset"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
|
13 |
+
def setup_hf_auth():
    """Log in to the Hugging Face Hub using the ``HF_TOKEN`` environment variable.

    Returns:
        bool: ``True`` when a non-blank token was found and ``login`` accepted
        it; ``False`` when the variable is unset/blank or ``login`` raised.
    """
    import logging

    hf_token = (os.environ.get("HF_TOKEN") or "").strip()
    if not hf_token:
        # Nothing to authenticate with; do not bother the Hub with a blank token.
        return False

    try:
        login(token=hf_token)
    except Exception:
        # Callers only need a yes/no answer, but keep the traceback in the logs
        # so a bad or expired token can be diagnosed.
        logging.getLogger(__name__).warning(
            "Hugging Face login failed", exc_info=True
        )
        return False
    return True
|
23 |
|
24 |
|
25 |
+
def ensure_dataset_exists():
    """Make sure the Hub dataset ``DATASET_NAME`` can be loaded, creating it if absent.

    An empty dataset is pushed only when the repository is missing or holds no
    data files yet. If the repository already contains data but loading failed,
    nothing is written, so stored cards are never replaced by an empty table.

    Returns:
        bool: ``True`` if the dataset loaded or was freshly initialised,
        ``False`` otherwise (no token, existing data that could not be loaded,
        or any error while creating/pushing).
    """
    try:
        load_dataset(DATASET_NAME, split="train")
        return True
    except Exception:
        # Either the repo is missing/empty, or the load failed for another
        # reason (network, auth). The checks below tell the cases apart.
        pass

    try:
        # Local import: the file-level import only brings in create_repo/login.
        from huggingface_hub import list_repo_files, repo_exists

        if not setup_hf_auth():
            return False

        if repo_exists(DATASET_NAME, repo_type="dataset"):
            data_suffixes = (".parquet", ".arrow", ".csv", ".jsonl")
            stored_files = list_repo_files(DATASET_NAME, repo_type="dataset")
            if any(path.endswith(data_suffixes) for path in stored_files):
                # Data is present but unreadable right now: refuse to overwrite.
                return False

        create_repo(
            repo_id=DATASET_NAME, repo_type="dataset", private=False, exist_ok=True
        )
        column_names = (
            "filename",
            "title",
            "summary",
            "authors",
            "creation_date",
            "coverage_score",
            "yaml_content",
            "paper_link",
            "repository_link",
            "timestamp",
        )
        empty_dataset = Dataset.from_dict({name: [] for name in column_names})
        empty_dataset.push_to_hub(DATASET_NAME)
        return True
    except Exception:
        return False
|
54 |
+
|
55 |
+
|
56 |
+
def get_template():
    """Return the starter YAML text for a new evaluation card.

    The text holds ``title`` and ``summary`` followed by one mapping per
    section, each with placeholder fields, and ends with a newline.
    """
    # NOTE(review): nested keys are indented two spaces here; the extract this
    # was reviewed from had lost its indentation, so confirm against the file.
    skeleton_lines = (
        'title: "Your Evaluation Title"',
        'summary: "Brief description of your evaluation"',
        "metadata:",
        "  authors: []",
        '  creation_date: ""',
        "evaluation_design:",
        '  purpose: ""',
        '  scope: ""',
        "estimand:",
        '  definition: ""',
        "estimator:",
        '  method: ""',
        "estimate:",
        '  results: ""',
        "results_communication:",
        '  format: ""',
        "known_issues_and_limitations:",
        "  issues: []",
        "version_and_maintenance:",
        '  version: "1.0"',
        "citation_and_usage:",
        '  citation: ""',
    )
    return "\n".join(skeleton_lines) + "\n"
|
81 |
|
82 |
|
83 |
def compute_coverage_score(eval_data):
|
84 |
+
"""Compute a coverage score for the eval card"""
|
|
|
|
|
|
|
85 |
sections = {
|
86 |
"metadata": 5,
|
87 |
"evaluation_design": 10,
|
|
|
94 |
"citation_and_usage": 5,
|
95 |
}
|
96 |
|
97 |
+
def count_filled_fields(data):
|
|
|
|
|
|
|
98 |
if isinstance(data, dict):
|
99 |
+
filled = total = 0
|
100 |
+
for value in data.values():
|
|
|
101 |
if isinstance(value, (dict, list)):
|
102 |
+
sub_filled, sub_total = count_filled_fields(value)
|
|
|
|
|
103 |
filled += sub_filled
|
104 |
total += sub_total
|
105 |
else:
|
106 |
total += 1
|
107 |
+
if value and str(value).strip() not in ["", "[]", "{}"]:
|
|
|
|
|
108 |
filled += 1
|
109 |
return filled, total
|
110 |
elif isinstance(data, list):
|
111 |
if not data:
|
112 |
return 0, 1
|
113 |
+
filled = total = 0
|
|
|
114 |
for item in data:
|
115 |
sub_filled, sub_total = count_filled_fields(item)
|
116 |
filled += sub_filled
|
|
|
119 |
else:
|
120 |
return 1 if data else 0, 1
|
121 |
|
122 |
+
total_score = 0
|
123 |
for section, weight in sections.items():
|
124 |
if section in eval_data:
|
125 |
filled, total = count_filled_fields(eval_data[section])
|
126 |
completion_rate = filled / total if total > 0 else 0
|
127 |
+
total_score += completion_rate * weight
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
+
return min(round(total_score, 2), 100)
|
130 |
|
131 |
|
132 |
+
def _hub_dataset_has_data():
    """Return True when ``DATASET_NAME`` exists on the Hub and holds data files.

    Errors raised by the Hub calls propagate to the caller.
    """
    # Local import: the file-level import only brings in create_repo/login.
    from huggingface_hub import list_repo_files, repo_exists

    if not repo_exists(DATASET_NAME, repo_type="dataset"):
        return False
    data_suffixes = (".parquet", ".arrow", ".csv", ".jsonl")
    return any(
        path.endswith(data_suffixes)
        for path in list_repo_files(DATASET_NAME, repo_type="dataset")
    )


def save_eval_card(yaml_content, paper_url="", repo_url=""):
    """Append one evaluation card to the Hub dataset and push the result.

    Args:
        yaml_content: The card as YAML text; it must parse to a mapping.
        paper_url: Optional link stored as ``metadata.paper_link``.
        repo_url: Optional link stored as ``metadata.repository_link``.

    Returns:
        str: A success message carrying the coverage score and the generated
        filename, or an error message. Failures are reported through the
        returned text rather than raised.
    """
    # NOTE(review): the "β" glyphs in the messages below look like mis-decoded
    # emoji in the reviewed extract; confirm the intended characters.
    column_names = (
        "filename",
        "title",
        "summary",
        "authors",
        "creation_date",
        "coverage_score",
        "yaml_content",
        "paper_link",
        "repository_link",
        "timestamp",
    )
    try:
        if not setup_hf_auth():
            return "Error: HF_TOKEN not found. Please set your Hugging Face token in Space settings."

        eval_data = yaml.safe_load(yaml_content)
        if not isinstance(eval_data, dict):
            # Empty documents, bare strings and lists cannot carry card sections.
            return "β Error: YAML must be a mapping of sections at the top level."

        # `metadata:` with no value parses to None; treat that as empty.
        metadata = eval_data.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}
        if paper_url:
            metadata["paper_link"] = paper_url
        if repo_url:
            metadata["repository_link"] = repo_url
        if paper_url or repo_url:
            eval_data["metadata"] = metadata

        yaml_content = yaml.dump(eval_data)

        # One clock reading so the filename and the timestamp column agree.
        now = datetime.datetime.now()
        title = str(eval_data.get("title") or "Unnamed")
        filename = re.sub(r"[^\w\-_]", "_", title)
        filename = f"{filename}_{now.strftime('%Y%m%d_%H%M%S')}.yaml"

        authors = metadata.get("authors") or []
        if not isinstance(authors, (list, tuple)):
            # A single author written as a scalar; joining a str would split it
            # into characters.
            authors = [authors]

        score = compute_coverage_score(eval_data)

        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            existing_data = dataset.to_dict()
        except Exception:
            if _hub_dataset_has_data():
                # Pushing now would replace every stored card with this one.
                return (
                    "β Error: could not load the existing dataset; "
                    "nothing was saved. Please try again."
                )
            existing_data = {name: [] for name in column_names}

        record = {
            "filename": filename,
            "title": title,
            "summary": str(eval_data.get("summary") or ""),
            "authors": ", ".join(str(author) for author in authors),
            # YAML may parse dates into date objects; store text consistently.
            "creation_date": str(metadata.get("creation_date") or ""),
            "coverage_score": float(score),
            "yaml_content": yaml_content,
            "paper_link": paper_url or "",
            "repository_link": repo_url or "",
            "timestamp": now.isoformat(),
        }
        for name in column_names:
            existing_data[name].append(record[name])

        updated_dataset = Dataset.from_dict(existing_data)
        updated_dataset.push_to_hub(DATASET_NAME)

        return f"β Successfully saved! Coverage score: {score}%\nFilename: {filename}"

    except Exception as e:
        return f"β Error: {str(e)}"
|
192 |
|
193 |
|
194 |
+
def _render_gallery_card(row):
    """Build the HTML snippet for one dataset row, escaping every stored value."""
    import html

    def text(key):
        value = row.get(key)
        return html.escape("" if value is None else str(value))

    def link_line(label, key):
        url = str(row.get(key) or "").strip()
        # Only plain web links are rendered; this keeps javascript:/data: URLs
        # from user submissions out of the page.
        if not url.lower().startswith(("http://", "https://")):
            return ""
        safe_url = html.escape(url, quote=True)
        return (
            f"<p><strong>{label}:</strong> "
            f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">View</a></p>'
        )

    # NOTE(review): the "π…" glyphs look like mis-decoded emoji in the reviewed
    # extract; confirm the intended characters.
    return f"""
<div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 8px; background-color: #f9f9f9;">
<h4>π― {text("title")}</h4>
<p><strong>Summary:</strong> {text("summary")}</p>
<p><strong>Authors:</strong> {text("authors")}</p>
<p><strong>Coverage Score:</strong> <span style="color: #2e7d32; font-weight: bold;">{text("coverage_score")}%</span></p>
<p><strong>Created:</strong> {text("creation_date")}</p>
{link_line("π Paper", "paper_link")}
{link_line("π» Repository", "repository_link")}
</div>
"""


def load_gallery():
    """Render the ten highest-scoring evaluation cards from the Hub dataset as HTML.

    Returns:
        str: The gallery HTML, or a plain message when the token is missing,
        the dataset is not accessible, the dataset is empty, or loading raised.
    """
    try:
        if not setup_hf_auth():
            return "Please set HF_TOKEN in Space settings to view gallery."

        if not ensure_dataset_exists():
            return "Dataset not accessible. Please check HF_TOKEN."

        dataset = load_dataset(DATASET_NAME, split="train")

        if len(dataset) == 0:
            return "No evaluation cards found. Submit your first card!"

        # Read the score column once instead of decoding a full row per index.
        scores = list(dataset["coverage_score"])
        ranked = sorted(
            range(len(scores)),
            key=lambda index: scores[index] or 0,  # a missing score ranks last
            reverse=True,
        )

        pieces = ["<h3>π Evaluation Cards Gallery</h3>"]
        pieces.extend(_render_gallery_card(dataset[index]) for index in ranked[:10])
        return "".join(pieces)

    except Exception as e:
        return f"Error loading gallery: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
|
237 |
+
def get_llm_feedback(yaml_content):
    """Ask the Groq chat-completions API for improvement suggestions on a card.

    Args:
        yaml_content: The evaluation card as YAML text.

    Returns:
        str: The model's reply on HTTP 200; otherwise a message describing what
        is missing (API key, content) or what went wrong (API status, error).
    """
    api_token = os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "Please set GROQ_API_KEY in Space settings to get LLM feedback."

    # `or ""` guards against a None value reaching `.strip()`.
    if not (yaml_content or "").strip():
        return "Please provide YAML content first."

    try:
        import requests

        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_token}",
            },
            json={
                "model": "llama-3.3-70b-versatile",
                "messages": [
                    {
                        "role": "user",
                        "content": f"Analyze this evaluation card YAML and provide specific improvement suggestions:\n\n```yaml\n{yaml_content}\n```\n\nFocus on completeness, clarity, and best practices.",
                    }
                ],
            },
            # Without a timeout a stalled connection would block this request
            # handler indefinitely.
            timeout=60,
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        return f"API Error {response.status_code}: {response.text}"

    except Exception as e:
        return f"Error getting feedback: {str(e)}"
|
273 |
|
274 |
|
275 |
+
# Simple functions for the interface
|
276 |
+
def submit_card(yaml_text, paper_url, repo_url):
    """Validate the editor content, save it, and reload the gallery.

    Args:
        yaml_text: YAML text from the editor.
        paper_url: Optional paper link.
        repo_url: Optional repository link.

    Returns:
        tuple: ``(result message, gallery value)``. On a validation failure or
        an unexpected error the second item is ``gr.update()`` so the gallery
        keeps its current content instead of being blanked.
    """
    if not (yaml_text or "").strip():
        return "Please provide YAML content", gr.update()

    try:
        parsed = yaml.safe_load(yaml_text)  # Validate YAML
    except yaml.YAMLError as e:
        return f"Invalid YAML: {str(e)}", gr.update()

    if not isinstance(parsed, dict):
        # A bare string or list is valid YAML but has no card sections.
        return "Invalid YAML: expected a mapping of sections at the top level", gr.update()

    try:
        result = save_eval_card(yaml_text, paper_url, repo_url)
        gallery = load_gallery()
        return result, gallery
    except Exception as e:
        return f"Error: {str(e)}", gr.update()
|
289 |
|
|
|
|
|
290 |
|
291 |
+
def load_template_text():
    """Click handler: supply the starter YAML produced by ``get_template``."""
    template_yaml = get_template()
    return template_yaml
|
293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
+
def get_feedback(yaml_text):
    """Click handler: forward the editor text to ``get_llm_feedback``."""
    suggestions = get_llm_feedback(yaml_text)
    return suggestions
|
297 |
|
298 |
|
299 |
def refresh_gallery():
    """Click handler: return whatever ``load_gallery`` produces right now."""
    gallery_markup = load_gallery()
    return gallery_markup
|
301 |
+
|
302 |
+
|
303 |
+
# Create the interface using older, more stable Gradio approach
|
304 |
+
# Page layout and event wiring for the Gradio app.
# NOTE(review): component nesting (which widgets sit in which column) was
# reconstructed from an extract that had lost its indentation, and the "β…"/"π…"
# glyphs in labels look like mis-decoded emoji; confirm both against the file.
with gr.Blocks(
    title="Evaluation Cards Gallery",
    theme=gr.themes.Soft(),
    css="footer {visibility: hidden}",
) as demo:
    gr.Markdown("""
    # π Evaluation Cards for Machine Learning

    Upload your evaluation card in YAML format and submit it to the persistent gallery.
    Data is stored in HF dataset: [jablonkagroup/eval-cards-dataset](https://huggingface.co/datasets/jablonkagroup/eval-cards-dataset)
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### βοΈ Create/Edit Evaluation Card")

            yaml_editor = gr.Textbox(
                label="YAML Content",
                lines=15,
                placeholder="Paste your YAML content or click 'Load Template'...",
                value="",
            )

            with gr.Row():
                template_btn = gr.Button("π Load Template", size="sm")
                feedback_btn = gr.Button("π€ Get AI Feedback", size="sm")

            paper_url = gr.Textbox(
                label="π Paper URL (Optional)",
                placeholder="https://arxiv.org/abs/...",
                value="",
            )

            repo_url = gr.Textbox(
                label="π» Repository URL (Optional)",
                placeholder="https://github.com/...",
                value="",
            )

            submit_btn = gr.Button("π Submit to Gallery", variant="primary", size="lg")

            result_box = gr.Textbox(
                label="π€ Submission Result", lines=3, interactive=False
            )

        with gr.Column(scale=1):
            gr.Markdown("### π― AI Feedback")

            feedback_box = gr.Textbox(
                label="π‘ Improvement Suggestions", lines=8, interactive=False
            )

            gr.Markdown("### ποΈ Gallery")

            refresh_btn = gr.Button("π Refresh Gallery", size="sm")

            # Pass the function itself, not its result: the gallery is then
            # fetched on each page load rather than once while this module is
            # imported, so startup does no Hub I/O and visitors see current data.
            gallery_display = gr.HTML(value=load_gallery)

    # Event handlers
    template_btn.click(load_template_text, outputs=[yaml_editor])
    feedback_btn.click(get_feedback, inputs=[yaml_editor], outputs=[feedback_box])
    submit_btn.click(
        submit_card,
        inputs=[yaml_editor, paper_url, repo_url],
        outputs=[result_box, gallery_display],
    )
    refresh_btn.click(refresh_gallery, outputs=[gallery_display])

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
gradio
|
2 |
-
pyyaml
|
3 |
-
|
4 |
-
|
5 |
-
|
|
|
1 |
+
gradio
|
2 |
+
pyyaml
|
3 |
+
datasets
|
4 |
+
huggingface_hub
|
5 |
+
requests
|