gmancino-ball committed
Commit 56b1556 · 1 Parent(s): c7e15a9

gmb/initial-upload (#1)

Files changed (12)
  1. .gitignore +6 -0
  2. .streamlit/config.toml +2 -0
  3. README.md +8 -5
  4. app.py +447 -0
  5. index.html +0 -19
  6. metric.py +207 -0
  7. pyproject.toml +15 -0
  8. requirements.txt +4 -0
  9. style.css +0 -28
  10. test.sh +1 -0
  11. updated.txt +1 -0
  12. utils.py +303 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ temp*
2
+ __pycache__
3
+ .ipynb_checkpoints/
4
+ competition_cache/
5
+ .env
6
+ .vscode/launch.json
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
1
+ [browser]
2
+ gatherUsageStats = false
README.md CHANGED
@@ -1,10 +1,13 @@
1
  ---
2
  title: Video Challenge Leaderboard
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: static
 
 
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Video Challenge Leaderboard
3
+ emoji: 🏢
4
+ colorFrom: yellow
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.43.2
8
+ app_file: app.py
9
  pinned: false
10
+ short_description: Leaderboard
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,447 @@
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import altair as alt
5
+ import subprocess
6
+ import os
7
+
8
+ ## Path where cached results are stored
9
+ results_path = Path("competition_cache/cached_results")
10
+ TASKS = ["video-challenge-pilot-config", "video-challenge-task-1-config"]
11
+ valid_splits = ["public", "private"]
12
+
13
+ ## Check for files initially
14
+ if not os.path.exists(results_path):
15
+ process = subprocess.Popen(
16
+ ["python3", "utils.py"],
17
+ stdout=subprocess.PIPE,
18
+ stderr=subprocess.PIPE,
19
+ text=True, # Decode stdout/stderr as text
20
+ )
21
+ process.wait()
22
+ process.kill()
23
+
24
+
25
+ #####################################################################
26
+ ## Data loading ##
27
+ #####################################################################
28
+ ## Data loading
29
+ @st.cache_data
30
+ def load_results(task, best_only):
31
+ if best_only:
32
+ return {
33
+ f"{s}_score": pd.read_csv(f"{results_path}/{task}_{s}_score.csv")
34
+ .sort_values(["team", "balanced_accuracy"], ascending=False)
35
+ .drop_duplicates(subset=["team"])
36
+ .sort_values("balanced_accuracy", ascending=False)
37
+ .set_index("team")
38
+ for s in valid_splits
39
+ }
40
+
41
+ else:
42
+ return {
43
+ f"{s}_score": pd.read_csv(f"{results_path}/{task}_{s}_score.csv").set_index("team") for s in valid_splits
44
+ }
45
+
46
+
47
+ @st.cache_data
48
+ def load_submission():
49
+ out = []
50
+ for task in TASKS:
51
+ data = pd.read_csv(f"{results_path}/{task}_submissions.csv")
52
+ data["task"] = task
53
+ out.append(data)
54
+
55
+ return pd.concat(out, ignore_index=True)
56
+
57
+
58
+ @st.cache_data
59
+ def get_updated_time(file="updated.txt"):
60
+ return open(file).read()
61
+
62
+
63
+ @st.cache_data
64
+ def get_volume():
65
+ subs = pd.concat(
66
+ [pd.read_csv(f"{results_path}/{task}_submissions.csv") for task in TASKS],
67
+ ignore_index=True,
68
+ )
69
+ subs["datetime"] = pd.DatetimeIndex(subs["datetime"])
70
+ subs["date"] = subs["datetime"].dt.date
71
+ subs = subs.groupby(["date", "status_reason"]).size().unstack().fillna(0).reset_index()
72
+
73
+ return subs
74
+
75
+
76
+ @st.cache_data
77
+ def make_heatmap(results, label="generated", symbol="🤖"):
78
+
79
+ # Wide-format results: one row per team, one accuracy column per source
80
+ df_long = results.set_index("team")
81
+
82
+ team_order = results["team"].tolist()
83
+ df_long = df_long.loc[:, [c for c in df_long.columns if c.startswith(label) and "accuracy" not in c]]
84
+
85
+ df_long.columns = [c.replace(f"{label}_", "") for c in df_long.columns]
86
+
87
+ if "none" in df_long.columns:
88
+ df_long = df_long.drop(columns=["none"])
89
+
90
+ df_long = df_long.reset_index().melt(id_vars="team", var_name="source", value_name="acc")
91
+ # df_long.rename(columns={'index': 'source'}, inplace=True)
92
+ # df_long
93
+ # return
94
+
95
+ # Base chart for rectangles
96
+ base = alt.Chart(df_long).encode(
97
+ x=alt.X("source:O", title="Source", axis=alt.Axis(orient="top", labelAngle=-60)),
98
+ y=alt.Y("team:O", title="Team", sort=team_order),
99
+ )
100
+
101
+ # Heatmap rectangles
102
+ heatmap = base.mark_rect().encode(
103
+ color=alt.Color("acc:Q", scale=alt.Scale(scheme="greens"), title=f"{label} Accuracy")
104
+ )
105
+
106
+ # Text labels
107
+ text = base.mark_text(baseline="middle", fontSize=16).encode(
108
+ text=alt.Text("acc:Q", format=".2f"),
109
+ color=alt.condition(
110
+ alt.datum.acc < 0.5, # you can tune this for readability
111
+ alt.value("black"),
112
+ alt.value("white"),
113
+ ),
114
+ )
115
+
116
+ # Combine heatmap and text
117
+ chart = (heatmap + text).properties(width=600, height=500, title=f"Accuracy on {symbol} {label} sources heatmap")
118
+
119
+ return chart
120
+
121
+
122
+ @st.cache_data
123
+ def make_roc_curves(task, best_only=False):
124
+
125
+ rocs = pd.read_csv(f"{results_path}/{task}_rocs.csv")
126
+
127
+ # if best_only:
128
+ # rocs = rocs.sort_values(by=["auc"],ascending=False).drop_duplicates("team")
129
+
130
+ roc_chart = (
131
+ alt.Chart(rocs)
132
+ .mark_line()
133
+ .encode(
134
+ x="fpr",
135
+ y="tpr",
136
+ color="team:N",
137
+ detail="submission_id:N"
138
+ )
139
+ )
140
+
141
+ return roc_chart
142
+
143
+
144
+ #####################################################################
145
+ ## Page definition ##
146
+ #####################################################################
147
+
148
+ ## Set title
149
+ st.set_page_config(
150
+ page_title="Leaderboard",
151
+ initial_sidebar_state="collapsed",
152
+ layout="wide", # This makes the app use the full width of the screen
153
+ )
154
+
155
+ ## Pull new results or toggle the private/public view if you are an owner
156
+ with st.sidebar:
157
+
158
+ hf_token = os.getenv("HF_TOKEN")
159
+ password = st.text_input("Admin login:", type="password")
160
+
161
+ if password == hf_token:
162
+ if st.button("Pull New Results"):
163
+ with st.spinner("Pulling new results", show_time=True):
164
+ try:
165
+ process = subprocess.Popen(
166
+ ["python3", "utils.py"],
167
+ stdout=subprocess.PIPE,
168
+ stderr=subprocess.PIPE,
169
+ text=True, # Decode stdout/stderr as text
170
+ )
171
+ st.success(f"Background task started with PID: {process.pid}")
172
+ process.wait()
173
+ process.kill()
174
+ st.success(f"PID {process.pid} finished!")
175
+ # If a user has the right perms, then this clears the cache
176
+ load_results.clear()
177
+ get_volume.clear()
178
+ load_submission.clear()
179
+ st.rerun()
180
+ except Exception as e:
181
+ st.error(f"Error starting background task: {e}")
182
+
183
+ ## Initialize the toggle state in session_state if it doesn't exist
184
+ if "private_view" not in st.session_state:
185
+ st.session_state.private_view = False
186
+
187
+ # Create the toggle widget
188
+ # The 'value' parameter sets the initial state, here linked to session_state
189
+ # The 'key' parameter is crucial for identifying the widget across reruns and linking to session_state
190
+ toggle_value = st.toggle("Private Scores", value=st.session_state.private_view, key="private_view")
191
+
192
+ # The 'toggle_value' variable will hold the current state of the toggle (True or False)
193
+ if toggle_value:
194
+ st.write("Showing **PRIVATE** scores.")
195
+ else:
196
+ st.write("Showing **PUBLIC** scores.")
197
+
198
+ split = "public" if not toggle_value else "private"
199
+ else:
200
+ split = "public"
201
+
202
+
203
+ def show_leaderboard(results, task):
204
+ cols = [
205
+ "generated_accuracy",
206
+ "real_accuracy",
207
+ # "pristine_accuracy",
208
+ "balanced_accuracy",
209
+ "auc",
210
+ "fail_rate",
211
+ "total_time",
212
+ ]
213
+
214
+ # st.dataframe(results[f"{split}_score"])
215
+
216
+ column_config = {
217
+ "balanced_accuracy": st.column_config.NumberColumn(
218
+ "⚖️ Balanced Accuracy",
219
+ format="compact",
220
+ min_value=0,
221
+ pinned=True,
222
+ max_value=1.0,
223
+ # width="small",
224
+ ),
225
+ "generated_accuracy": st.column_config.NumberColumn(
226
+ "🤖 True Positive Rate",
227
+ format="compact",
228
+ min_value=0,
229
+ pinned=True,
230
+ max_value=1.0,
231
+ # width="small",
232
+ ),
233
+ "real_accuracy": st.column_config.NumberColumn(
234
+ "🧑‍🎤 True Negative Rate",
235
+ format="compact",
236
+ min_value=0,
237
+ pinned=True,
238
+ max_value=1.0,
239
+ # width="small",
240
+ ),
241
+ "auc": st.column_config.NumberColumn(
242
+ "📐 AUC",
243
+ format="compact",
244
+ min_value=0,
245
+ pinned=True,
246
+ max_value=1.0,
247
+ # width="small",
248
+ ),
249
+ "fail_rate": st.column_config.NumberColumn(
250
+ "❌ Fail Rate",
251
+ format="compact",
252
+ # width="small",
253
+ ),
254
+ "total_time": st.column_config.NumberColumn(
255
+ "🕒 Inference Time",
256
+ format="compact",
257
+ # width="small",
258
+ ),
259
+ }
260
+
261
+ labels = {"real": "🧑‍🎤", "generated": "🤖"}
262
+
263
+ for c in results[f"{split}_score"].columns:
264
+ if "accuracy" in c:
265
+ continue
266
+ if any(p in c for p in ["generated", "real"]):
267
+ s = c.split("_")
268
+ pred = s[0]
269
+ source = " ".join(s[1:])
270
+ column_config[c] = st.column_config.NumberColumn(
271
+ labels[pred] + " " + source,
272
+ help=c,
273
+ format="compact",
274
+ min_value=0,
275
+ max_value=1.0,
276
+ )
277
+
278
+ "#### Summary"
279
+ st.dataframe(results[f"{split}_score"].loc[:, cols], column_config=column_config)
280
+
281
+ cond_bacc = st.toggle("Conditional Balanced Accuracy", value=False, key=f"cond_bacc_{task}")
282
+
283
+
284
+
285
+
286
+
287
+ cols = [c for c in results[f"{split}_score"].columns if "generated_" in c and "accuracy" not in c]
288
+ temp = results[f"{split}_score"].loc[:, cols].copy()
289
+
290
+
291
+ if cond_bacc:
292
+ tnr = results[f"{split}_score"].loc[:, ["real_accuracy"]]
293
+ temp[:] = (temp.values + tnr.values)/2.
294
+ "#### 🤖 Balanced Accuracy | Generated Source"
295
+ else:
296
+ "#### 🤖 True Positive Rate | Generated Source"
297
+
298
+ st.dataframe(temp, column_config=column_config)
299
+
300
+
301
+
302
+
303
+ cols = [c for c in results[f"{split}_score"].columns if "real_" in c and "accuracy" not in c]
304
+ temp = results[f"{split}_score"].loc[:, cols].copy()
305
+
306
+ if cond_bacc:
307
+ tpr = results[f"{split}_score"].loc[:, ["generated_accuracy"]]
308
+ temp[:] = (temp.values + tpr.values)/2.
309
+ "#### 🧑‍🎤 Balanced Accuracy | Real Source"
310
+ else:
311
+ "#### 🧑‍🎤 True Negative Rate | Real Source"
312
+
313
+
314
+ st.dataframe(temp, column_config=column_config)
315
+
316
+
317
+ def make_roc(results):
318
+ results["FA"] = 1.0 - results["real_accuracy"]
319
+
320
+ chart = (
321
+ alt.Chart(results)
322
+ .mark_circle()
323
+ .encode(
324
+ x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
325
+ y=alt.Y("generated_accuracy:Q", title="🤖 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
326
+ color="team:N", # Color by categorical field
327
+ size=alt.Size(
328
+ "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
329
+ ), # Size by quantitative field
330
+ )
331
+ .properties(width=400, height=400, title="Detection vs False Alarm vs Inference Time")
332
+ )
333
+
334
+ diag_line = (
335
+ alt.Chart(pd.DataFrame(dict(tpr=[0, 1], fpr=[0, 1])))
336
+ .mark_line(color="lightgray", strokeDash=[8, 4])
337
+ .encode(x="fpr", y="tpr")
338
+ )
339
+
340
+ return chart + diag_line
341
+
342
+
343
+ def make_acc(results):
344
+ # results["FA"] = 1. - results["pristine_accuracy"]
345
+ # results = results[results["total_time"] >= 0]
346
+ # results["total_time"] = results["total_time"]
347
+
348
+ results = results.loc[results["total_time"] >= 0]
349
+
350
+ chart = (
351
+ alt.Chart(results)
352
+ .mark_circle(size=200)
353
+ .encode(
354
+ x=alt.X("total_time:Q", title="🕒 Inference Time", scale=alt.Scale(domain=[0.0, 10000])),
355
+ y=alt.Y(
356
+ "balanced_accuracy:Q",
357
+ title="Balanced Accuracy",
358
+ scale=alt.Scale(domain=[0.4, 1]),
359
+ ),
360
+ color="team:N",  # Color by categorical field
361
+ )
362
+ .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
363
+ )
364
+ diag_line = (
365
+ alt.Chart(pd.DataFrame(dict(t=[0, results["total_time"].max()], y=[0.5, 0.5])))
366
+ .mark_line(color="lightgray", strokeDash=[8, 4])
367
+ .encode(x="t", y="y")
368
+ )
369
+ return chart + diag_line
370
+
371
+
372
+ def get_heatmaps(temp):
373
+ h1 = make_heatmap(temp, "generated", symbol="🤖")
374
+ h2 = make_heatmap(temp, "real", symbol="🧑‍🎤")
375
+
376
+ st.altair_chart(h1, use_container_width=True)
377
+ st.altair_chart(h2, use_container_width=True)
378
+
379
+ if temp.columns.str.contains("aug", case=False).any():
380
+ h3 = make_heatmap(temp, "aug", symbol="🛠️")
381
+ st.altair_chart(h3, use_container_width=True)
382
+
383
+
384
+ def make_plots_for_task(task, split, best_only):
385
+ results = load_results(task, best_only=best_only)
386
+ # results1[f"{split}_score"]
387
+ temp = results[f"{split}_score"].reset_index()
388
+
389
+ # st.write(temp)
390
+
391
+ t1, t2 = st.tabs(["Tables", "Charts"])
392
+ with t1:
393
+ show_leaderboard(results, task)
394
+
395
+ with t2:
396
+
397
+ roc_scatter = make_roc(temp)
398
+ acc_vs_time = make_acc(temp)
399
+
400
+ if split == "private" and hf_token is not None:
401
+ # with t2:
402
+ full_curves = st.toggle("full curves", value=True, key=f"all curves {task}")
403
+
404
+ if full_curves:
405
+ roc_scatter = make_roc_curves(task, best_only) + roc_scatter
406
+
407
+ st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
408
+ else:
409
+ # with t2:
410
+ st.altair_chart(roc_scatter | acc_vs_time, use_container_width=False)
411
+
412
+ # with t3:
413
+ # get_heatmaps(temp)
414
+
415
+
416
+ updated = get_updated_time()
417
+ st.markdown(updated)
418
+ # st.markdown("#### Detailed Public Leaderboard")
419
+ # st.markdown("[SAFE: Synthetic Audio Forensics Evaluation Challenge](https://stresearch.github.io/SAFE/)")
420
+ best_only = True # st.toggle("Only Best per Team", value=True)
421
+ # show_chart = st.toggle("Show Table", value=True)
422
+
423
+
424
+ tp, t1, volume_tab, all_submission_tab = st.tabs(["**Pilot Task**","**Task 1**", "**Submission Volume**", "**All Submissions**"])
425
+ with tp:
426
+ "*Detection of Synthetic Video Content. Video files are unmodified from the original output of the models or the real sources.*"
427
+ make_plots_for_task(TASKS[0], split, best_only)
428
+ with t1:
429
+ "*Detection of Synthetic Video Content. Video files are unmodified from the original output of the models or the real sources.*"
430
+ make_plots_for_task(TASKS[1], split, best_only)
431
+
432
+ with volume_tab:
433
+ subs = get_volume()
434
+ status_lookup = "QUEUED,PROCESSING,SUCCESS,FAILED".split(",")
435
+ found_columns = subs.columns.values.tolist()
436
+ status_lookup = list(set(status_lookup) & set(found_columns))
437
+ st.bar_chart(subs, x="date", y=status_lookup, stack=True)
438
+
439
+ total_submissions = int(subs.loc[:, status_lookup].fillna(0).values.sum())
440
+ st.metric("Total Submissions", value=total_submissions)
441
+
442
+ st.metric("Duration", f'{(subs["date"].max() - subs["date"].min()).days} days')
443
+
444
+ if split == "private":
445
+ with all_submission_tab:
446
+ data = load_submission()
447
+ st.dataframe(data)
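For context on the `best_only` branch of `load_results` above: it keeps one row per team by sorting on `balanced_accuracy` and dropping duplicates. A minimal sketch of that dedup on a toy frame (team names and scores below are made up; only the column names match the cached CSVs):

```python
import pandas as pd

# Toy stand-in for one of the cached "<task>_<split>_score.csv" files.
scores = pd.DataFrame(
    {
        "team": ["alpha", "alpha", "beta"],
        "balanced_accuracy": [0.71, 0.83, 0.64],
    }
)

best = (
    scores.sort_values(["team", "balanced_accuracy"], ascending=False)
    .drop_duplicates(subset=["team"])        # first row per team is its best score
    .sort_values("balanced_accuracy", ascending=False)
    .set_index("team")
)
print(best)  # alpha -> 0.83, beta -> 0.64
```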
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
metric.py ADDED
@@ -0,0 +1,207 @@
1
+ import json
2
+ import re
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download
5
+ from sklearn.metrics import roc_auc_score, roc_curve
6
+ import numpy as np
7
+
8
+
9
+ def compute_roc(solution_df):
10
+
11
+ ## fix weird submissions
12
+ if isinstance(solution_df.iloc[0]["score"], str):
13
+ solution_df.loc[:, "score"] = solution_df.loc[:, "score"].apply(
14
+ lambda a: float(
15
+ # np.array(json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))).squeeze()
16
+ np.array(json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))).squeeze()
17
+ if isinstance(a, str)
18
+ else float("nan")
19
+ )
20
+ )
21
+
22
+ isna = solution_df["score"].isna()
23
+
24
+ if isna.all():
25
+ ## if all nans
26
+ return -1
27
+
28
+ solution_df = solution_df.loc[~isna]
29
+ auc = roc_auc_score(solution_df["pred"] == "generated", solution_df["score"])
30
+ return auc
31
+
32
+
33
+ def compute_roc_curve(solution_df, keep_every: int = 10):
34
+
35
+ ## fix weird submissions
36
+ if isinstance(solution_df.iloc[0]["score"], str):
37
+ solution_df.loc[:, "score"] = solution_df.loc[:, "score"].apply(
38
+ lambda a: float(
39
+ # np.array(json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))).squeeze()
40
+ np.array(json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))).squeeze()
41
+ if isinstance(a, str)
42
+ else float("nan")
43
+ )
44
+ )
45
+
46
+ isna = solution_df["score"].isna()
47
+
48
+ if isna.all():
49
+ ## if all nans
50
+ return {"fpr": [], "tpr": [], "threshold": []}
51
+
52
+ solution_df = solution_df.loc[~isna]
53
+ fpr, tpr, threshold = roc_curve(solution_df["pred"] == "generated", solution_df["score"])
54
+ if len(fpr) < keep_every:
55
+ return {"fpr": fpr.tolist(), "tpr": tpr.tolist(), "threshold": threshold.tolist()}
56
+
57
+ # Sample every keep_every
58
+ return {
59
+ "fpr": fpr.tolist()[::keep_every],
60
+ "tpr": tpr.tolist()[::keep_every],
61
+ "threshold": threshold.tolist()[::keep_every],
62
+ }
63
+
64
+
65
+ def _metric(solution_df, submission_df, mode="top_level", full: bool = False):
66
+ """
67
+ Score a submission against the solution: per-source accuracy, balanced accuracy, and (when scores are provided) AUC.
68
+
69
+ Parameters
70
+ ----------
71
+ solution_df : pandas.DataFrame
72
+ The dataframe containing the solution data.
73
+ submission_df : pandas.DataFrame
74
+ The dataframe containing the submission data.
75
+ mode : str, optional
76
+ The mode of evaluation. "top_level" keeps only the summary metrics; any other value (e.g. "detailed") also keeps the per-source scores. The default is "top_level".
77
+ full: bool, optional
78
+ Full evaluation mode breaks up scores by source (both anonymized and original)
79
+
80
+ Returns
81
+ -------
82
+ None.
83
+ """
84
+
85
+ ## Allocate space
86
+ evaluation = {}
87
+
88
+ ## Ensure alignment of keys and group relevant columns
89
+ solution_df["submission_pred"] = solution_df.join(submission_df, lsuffix="_solution", rsuffix="_submission")[
90
+ "pred_submission"
91
+ ].values
92
+ cols = ["split", "pred", "source", "source_og"]
93
+ solution_df["correct"] = solution_df["pred"] == solution_df["submission_pred"]
94
+ accuracy = solution_df.groupby(cols)["correct"].mean().to_frame("accuracy").reset_index()
95
+ accuracy["score_name"] = accuracy["pred"] + "_" + accuracy["source"]
96
+
97
+ ## Create public dataframe and private dataframe
98
+ public_df = accuracy.query(f"split=='public'").copy()
99
+ private_df = accuracy.copy()
100
+ private_df["score_name"] = private_df["pred"] + "_" + private_df["source_og"]
101
+
102
+
103
+ ## Perform a loop over categories for reported metrics
104
+ for split, temp in zip(["public", "private"], [public_df, private_df]):
105
+ scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
106
+ scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
107
+ scores_by_source["real_accuracy"] = temp.query("pred=='real'")["accuracy"].mean()
108
+ scores_by_source["balanced_accuracy"] = (
109
+ scores_by_source["generated_accuracy"] + scores_by_source["real_accuracy"]
110
+ ) / 2.0
111
+ if mode == "top_level":
112
+ scores_to_save = ["generated_accuracy", "real_accuracy", "balanced_accuracy"]
113
+ evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
114
+ else:
115
+ evaluation[f"{split}_score"] = scores_by_source.to_dict()
116
+
117
+ ## Compute by source - anonymized and original
118
+ # if full:
119
+ # evaluation[f"{split}_score"]["anon_source"] = temp.groupby("source")["accuracy"].mean().to_dict()
120
+ # evaluation[f"{split}_score"]["original_source"] = temp.groupby("source_og")["accuracy"].mean().to_dict()
121
+
122
+ ## Save data split
123
+ evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
124
+ evaluation["private_score"]["proportion"] = 1.0
125
+
126
+ ## Compute AUC
127
+ if "score" in submission_df.columns:
128
+ solution_df["score"] = submission_df["score"]
129
+
130
+ ## Public
131
+ split = "public"
132
+ temp = solution_df.query(f"split=='{split}'").copy()
133
+ try:
134
+ auc = compute_roc(temp)
135
+ except Exception as e:
136
+ print("failed auc")
137
+ print(e)
138
+ auc = "nan"
139
+ evaluation[f"{split}_score"]["auc"] = float(auc)
140
+ evaluation[f"{split}_score"]["fail_rate"] = float(temp["score"].isna().mean())
141
+
142
+ ## Private
143
+ split = "private"
144
+ temp = solution_df.copy()
145
+ try:
146
+ auc = compute_roc(temp)
147
+ except Exception as e:
148
+ print("failed auc")
149
+ print(e)
150
+ auc = "nan"
151
+ evaluation[f"{split}_score"]["auc"] = float(auc)
152
+ evaluation[f"{split}_score"]["fail_rate"] = float(temp["score"].isna().mean())
153
+
154
+ ## Full data computations
155
+ if not full:
156
+ return evaluation
157
+
158
+ ## Roc
159
+ if "score" in submission_df.columns:
160
+ solution_df["score"] = submission_df["score"]
161
+
162
+ ## Public
163
+ split = "public"
164
+ temp = solution_df.query(f"split=='{split}'").copy()
165
+ try:
166
+ roc_data = compute_roc_curve(temp)
167
+ except Exception as e:
168
+ print("failed roc")
169
+ print(e)
170
+ roc_data = {"fpr": [], "tpr": [], "threshold": []}
171
+ evaluation[f"{split}_score"]["roc"] = roc_data
172
+
173
+ ## Private
174
+ split = "private"
175
+ temp = solution_df.copy()
176
+ try:
177
+ roc_data = compute_roc_curve(temp)
178
+ except Exception as e:
179
+ print("failed roc")
180
+ print(e)
181
+ roc_data = {"fpr": [], "tpr": [], "threshold": []}
182
+ evaluation[f"{split}_score"]["roc"] = roc_data
183
+
184
+ return evaluation
185
+
186
+
187
+ def compute(params):
188
+ solution_file = hf_hub_download(
189
+ repo_id=params.competition_id,
190
+ filename="solution.csv",
191
+ token=params.token,
192
+ repo_type="dataset",
193
+ )
194
+
195
+ solution_df = pd.read_csv(solution_file).set_index(params.submission_id_col)
196
+
197
+ submission_filename = f"submissions/{params.team_id}-{params.submission_id}.csv"
198
+ submission_file = hf_hub_download(
199
+ repo_id=params.competition_id,
200
+ filename=submission_filename,
201
+ token=params.token,
202
+ repo_type="dataset",
203
+ )
204
+
205
+ submission_df = pd.read_csv(submission_file).set_index(params.submission_id_col)
206
+
207
+ return _metric(solution_df, submission_df)
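A hedged usage sketch of `_metric` on a tiny synthetic solution/submission pair. The column names (`split`, `pred`, `source`, `source_og`, `score`, and the `id` index) are inferred from the code above; every value below is invented for illustration, and real competition files are far larger:

```python
import pandas as pd
from metric import _metric

# Four labeled items: two public, two private (values made up).
solution = pd.DataFrame(
    {
        "id": [0, 1, 2, 3],
        "split": ["public", "public", "private", "private"],
        "pred": ["generated", "real", "generated", "real"],
        "source": ["model_a", "camera_a", "model_b", "camera_b"],        # anonymized source labels
        "source_og": ["gen_model_1", "web_video", "gen_model_2", "web_video"],
    }
).set_index("id")

# A submission that misses the private generated item (id=2).
submission = pd.DataFrame(
    {
        "id": [0, 1, 2, 3],
        "pred": ["generated", "real", "real", "real"],
        "score": [0.9, 0.2, 0.4, 0.1],
    }
).set_index("id")

evaluation = _metric(solution, submission)
print(evaluation["public_score"]["balanced_accuracy"])   # 1.0  (both public items correct)
print(evaluation["private_score"]["balanced_accuracy"])  # 0.75 (generated accuracy 0.5, real accuracy 1.0)
```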
pyproject.toml ADDED
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "leaderboard"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "streamlit",
9
+ "pandas",
10
+ "altair",
11
+ "scikit-learn",
12
+ "huggingface_hub",
13
+ "vl-convert-python",
14
+ "hf_transfer"
15
+ ]
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ scikit-learn
2
+ numpy
3
+ streamlit
4
+ huggingface_hub
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }
test.sh ADDED
@@ -0,0 +1 @@
1
+ HF_TOKEN=test streamlit run app.py
updated.txt ADDED
@@ -0,0 +1 @@
1
+ Updated on 2025-08-06 11:28:08 EST
utils.py ADDED
@@ -0,0 +1,303 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from huggingface_hub import snapshot_download
5
+ import tqdm.auto as tqdm
6
+ from typing import Any, Dict, List, Tuple
7
+ from collections import defaultdict
8
+ from metric import _metric
9
+ import os
10
+ import pandas as pd
11
+
12
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
13
+ os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "20"
14
+ COMP_CACHE = os.environ.get("COMP_CACHE", "./competition_cache")
15
+
16
+
17
+ def download_competition_data(competition_names: List[str]) -> None:
18
+ """Download a local snapshot of each competition dataset repo into COMP_CACHE."""
19
+ for repo_id in tqdm.tqdm(competition_names):
20
+ snapshot_download(
21
+ repo_id=repo_id,
22
+ local_dir=os.path.join(COMP_CACHE, repo_id),
23
+ repo_type="dataset",
24
+ token=os.environ.get("HF_TOKEN"),
25
+ )
26
+
27
+
28
+ STATUS_MAP = {0: "PENDING", 1: "QUEUED", 2: "PROCESSING", 3: "SUCCESS", 4: "FAILED"}
29
+
30
+ ## Make a directory to store computed results
31
+ os.makedirs(Path("competition_cache") / "cached_results", exist_ok=True)
32
+
33
+
34
+ def load_teams(competition_space_path: Path) -> pd.DataFrame:
35
+ team_file_name = "teams.json"
36
+ return pd.read_json(Path(competition_space_path) / team_file_name).T
37
+
38
+
39
+ def json_to_dataframe(data, extra_column_name=None, extra_column_value=None):
40
+ flat_data = []
41
+ for entry in data:
42
+ original_flat_entry = {**entry}
43
+ flat_entry = {k: v for k, v in original_flat_entry.items() if "score" not in k}
44
+ times = {
45
+ k.replace("score", "time"): v.get("total_time", -1) for k, v in original_flat_entry.items() if "score" in k
46
+ }
47
+ flat_entry.update(times)
48
+ if extra_column_name:
49
+ flat_entry[extra_column_name] = extra_column_value
50
+ flat_data.append(flat_entry)
51
+ df = pd.DataFrame(flat_data)
52
+ return df
53
+
54
+
55
+ def load_submission_map(competition_space_path: Path) -> Tuple[Dict[str, str], pd.DataFrame]:
56
+ submission_info_dir = "submission_info"
57
+ submission_info_files = list((Path(competition_space_path) / submission_info_dir).glob("*.json"))
58
+
59
+ # Loop and collect submission IDs by team
60
+ team_submissions: Dict[str, str] = {}
61
+ submission_summaries: List[pd.DataFrame] = []
62
+ for file in submission_info_files:
63
+ with open(file, "r") as fn:
64
+ json_data = json.load(fn)
65
+ submission_summaries.append(
66
+ json_to_dataframe(
67
+ data=json_data["submissions"], extra_column_name="team_id", extra_column_value=json_data["id"]
68
+ )
69
+ )
70
+ submission_list = pd.read_json(file).submissions.values.tolist()
71
+ for submission in submission_list:
72
+ team_submissions[submission["submission_id"]] = submission["submitted_by"]
73
+ submission_summary = pd.concat(submission_summaries, axis=0)
74
+ submission_summary["status_reason"] = submission_summary["status"].apply(lambda x: STATUS_MAP[x])
75
+ return team_submissions, submission_summary
76
+
77
+
78
+ def get_member_to_team_map(teams: pd.DataFrame, team_submissions: Dict[str, str]) -> Dict[str, str]:
79
+ member_map: Dict[str, str] = {}
80
+ for member_id in team_submissions.values():
81
+ member_map[member_id] = teams[teams.members.apply(lambda x: member_id in x)].id.values[0]
82
+ return member_map
83
+
84
+
85
+ def load_submissions(competition_space_path: Path) -> Dict[str, Dict[str, pd.DataFrame]]:
86
+ submission_dir = "submissions"
87
+ submissions: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)
88
+ for file in list((Path(competition_space_path) / submission_dir).glob("*.csv")):
89
+ file_name = str(file).split("/")[-1].split(".")[0]
90
+ team_id = "-".join(file_name.split("/")[-1].split("-")[:5])
91
+ sub_id = "-".join(file_name.split("/")[-1].split("-")[5:])
92
+ submissions[team_id][sub_id] = pd.read_csv(file).set_index("id")
93
+ return submissions
94
+
95
+
96
+ def compute_metric_per_team(solution_df: pd.DataFrame, team_submissions: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
97
+ results: Dict[str, Any] = {}
98
+ for submission_id, submission in team_submissions.items():
99
+ results[submission_id] = _metric(solution_df=solution_df, submission_df=submission, mode="detailed", full=True)
100
+ return results
101
+
102
+
103
+ def prep_public(public_results: Dict[str, Any]) -> Dict[str, Any]:
104
+ new: Dict[str, Any] = {}
105
+ for key, value in public_results.items():
106
+ # if key == "anon_source":
107
+ # for sub_key, sub_value in value.items():
108
+ # sub_key = ("generated" if sub_key[0] == "g" else "real") + "_" + sub_key.split("_")[-1]
109
+ # new[sub_key] = sub_value
110
+ # continue
111
+ if key in ["proportion", "roc", "original_source"]:
112
+ continue
113
+ new[key] = value
114
+ return new
115
+
116
+
117
+ def prep_private(private_results: Dict[str, Any]) -> Dict[str, Any]:
118
+ new: Dict[str, Any] = {}
119
+ for key, value in private_results.items():
120
+ # if key == "original_source":
121
+ # for sub_key, sub_value in value.items():
122
+ # sub_key = ("real" if sub_key in REAL_MAP else "generated") + "_" + sub_key
123
+ # new[sub_key] = sub_value
124
+ # continue
125
+ if key in ["proportion", "roc", "anon_source"]:
126
+ continue
127
+ new[key] = value
128
+ return new
129
+
130
+
131
+ def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
132
+ new: Dict[str, Any] = {}
133
+ for key, value in results.items():
134
+ if key in ["roc"]:
135
+ for sub_key, sub_value in value.items():
136
+ new[sub_key] = sub_value
137
+ continue
138
+ if key in ["auc"]:
139
+ new[key] = value
140
+ return new
141
+
142
+
143
+ if __name__ == "__main__":
144
+
145
+ ## Download data
146
+ spaces: List[str] = ["safe-challenge/video-challenge-pilot-config", "safe-challenge/video-challenge-task-1-config"]
147
+ download_competition_data(competition_names=spaces)
148
+
149
+ ## Loop
150
+ for space in spaces:
151
+ local_dir = Path("competition_cache") / space
152
+
153
+ ## Load relevant data
154
+ teams = load_teams(competition_space_path=local_dir)
155
+ team_submissions, submission_summaries = load_submission_map(competition_space_path=local_dir)
156
+ member_map = get_member_to_team_map(teams=teams, team_submissions=team_submissions)
157
+ submissions = load_submissions(competition_space_path=local_dir)
158
+
159
+ ## Load solutions
160
+ solutions_df = pd.read_csv(local_dir / "solution.csv").set_index("id")
161
+
162
+ ## Loop and save by team
163
+ public, private, rocs = [], [], []
164
+ for team_id, submission_set in submissions.items():
165
+ results = compute_metric_per_team(solution_df=solutions_df, team_submissions=submission_set)
166
+ public_results = {
167
+ key: prep_public(value["public_score"]) for key, value in results.items() if key in team_submissions
168
+ }
169
+ private_results = {
170
+ key: prep_private(value["private_score"]) for key, value in results.items() if key in team_submissions
171
+ }
172
+
173
+
174
+ ## Add timing
175
+ public_times = {
176
+ x["submission_id"]: x["public_time"]
177
+ for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
178
+ ["submission_id", "public_time"]
179
+ ].to_dict(orient="records")
180
+ }
181
+ private_times = {
182
+ x["submission_id"]: x["private_time"]
183
+ for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
184
+ ["submission_id", "private_time"]
185
+ ].to_dict(orient="records")
186
+ }
187
+ for key in public_results.keys():
188
+ public_results[key]["total_time"] = public_times[key]
189
+ for key in private_results.keys():
190
+ private_results[key]["total_time"] = private_times[key]
191
+
192
+ ## Roc computations
193
+ roc_results = {
194
+ key: extract_roc(value["private_score"]) for key, value in results.items() if key in team_submissions
195
+ }
196
+ roc_df = pd.json_normalize(roc_results.values())
197
+ if len(roc_df) == 0:
198
+ continue
199
+ roc_df.insert(loc=0, column="submission_id", value=roc_results.keys())
200
+ roc_df.insert(
201
+ loc=0,
202
+ column="team",
203
+ value=[
204
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
205
+ for submission_id in roc_results.keys()
206
+ ],
207
+ )
208
+ roc_df.insert(
209
+ loc=0,
210
+ column="submission_repo",
211
+ value=[
212
+ submission_summaries[
213
+ submission_summaries.team_id == member_map[team_submissions[submission_id]]
214
+ ].submission_repo.values[0]
215
+ for submission_id in roc_results.keys()
216
+ ],
217
+ )
218
+ roc_df["label"] = roc_df.apply(
219
+ lambda x: f"AUC: {round(x['auc'], 2)} - {x['team']} - {x['submission_repo']}", axis=1
220
+ )
221
+ rocs.append(roc_df)
222
+
223
+ ## Append results to save in cache
224
+ public_df = pd.json_normalize(public_results.values())
225
+ public_df.insert(
226
+ loc=0,
227
+ column="submission",
228
+ value=[
229
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
230
+ for submission_id in public_results.keys()
231
+ ],
232
+ )
233
+ public_df.insert(
234
+ loc=0,
235
+ column="team",
236
+ value=[
237
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
238
+ for submission_id in public_results.keys()
239
+ ],
240
+ )
241
+ public_df.insert(
242
+ loc=0,
243
+ column="team_id",
244
+ value=[
245
+ teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
246
+ for submission_id in public_results.keys()
247
+ ],
248
+ )
249
+ public.append(public_df)
250
+
251
+ ## Private results
252
+ private_df = pd.json_normalize(private_results.values())
253
+ private_df.insert(
254
+ loc=0,
255
+ column="submission",
256
+ value=[
257
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
258
+ for submission_id in private_results.keys()
259
+ ],
260
+ )
261
+ private_df.insert(
262
+ loc=0,
263
+ column="team",
264
+ value=[
265
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
266
+ for submission_id in private_results.keys()
267
+ ],
268
+ )
269
+ private_df.insert(
270
+ loc=0,
271
+ column="team_id",
272
+ value=[
273
+ teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
274
+ for submission_id in private_results.keys()
275
+ ],
276
+ )
277
+ private.append(private_df)
278
+
279
+ ## Save as csvs
280
+ public = pd.concat(public, axis=0).sort_values(by="balanced_accuracy", ascending=False)
281
+ private = pd.concat(private, axis=0).sort_values(by="balanced_accuracy", ascending=False)
282
+ rocs = pd.concat(rocs, axis=0).explode(["tpr", "fpr", "threshold"], ignore_index=True)
283
+ public.to_csv(
284
+ Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_public_score.csv",
285
+ index=False,
286
+ )
287
+ private.to_csv(
288
+ Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_private_score.csv",
289
+ index=False,
290
+ )
291
+ rocs.to_csv(
292
+ Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_rocs.csv", index=False
293
+ )
294
+ submission_summaries.to_csv(
295
+ Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_submissions.csv",
296
+ index=False,
297
+ )
298
+
299
+ ## Update time
300
+ now = datetime.now()
301
+ formatted = now.strftime("Updated on %Y-%m-%d %H:%M:%S EST")
302
+ with open("updated.txt", "w") as file:
303
+ file.write(formatted)
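One convention worth calling out: `load_submissions` in utils.py recovers the team and submission ids from each CSV filename, mirroring the `submissions/{team_id}-{submission_id}.csv` pattern used in metric.py's `compute`. A minimal sketch, assuming both ids are standard five-group UUIDs (the concrete values below are invented):

```python
# "<team_id>-<submission_id>.csv" with the ".csv" extension already stripped.
file_name = "1b4e28ba-2fa1-11d2-883f-0016d3cca427-9f8c7d6e-aaaa-bbbb-cccc-444455556666"

parts = file_name.split("-")
team_id = "-".join(parts[:5])   # first five dash-separated groups -> team UUID
sub_id = "-".join(parts[5:])    # remaining groups -> submission UUID

print(team_id)  # 1b4e28ba-2fa1-11d2-883f-0016d3cca427
print(sub_id)   # 9f8c7d6e-aaaa-bbbb-cccc-444455556666
```

The fixed five-group split only holds if both ids are UUIDs with no extra dashes elsewhere in the filename, which the cached competition repos appear to guarantee.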