kt-test-account committed on
Commit 4f25de8 · 1 Parent(s): 1bf4289
Files changed (8)
  1. app.py +112 -0
  2. metric.py +125 -0
  3. process_data.py +140 -0
  4. requirements.txt +2 -0
  5. run.sh +1 -0
  6. task1.csv +10 -0
  7. task2.csv +5 -0
  8. update_data.sh +3 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ from pathlib import Path
+ import pandas as pd
+ import json
+ import metric
+ from sklearn.metrics import roc_auc_score, roc_curve
+ import numpy as np
+ import altair as alt
+
+
+ st.set_page_config(
+     page_title="Public Leaderboard",
+     initial_sidebar_state="collapsed",
+     layout="wide",  # This makes the app use the full width of the screen
+ )
+
+
+ @st.cache_data
+ def load_results(task):
+     return pd.read_csv(task).set_index("team")
+
+
+ split = "public"
+
+
+ def show_leaderboard(results):
+     cols = [
+         "generated_accuracy",
+         "pristine_accuracy",
+         "balanced_accuracy",
+         "fail_rate",
+         "total_time",
+     ]
+
+     # st.dataframe(results[f"{split}_score"])
+
+     column_config = {
+         "balanced_accuracy": st.column_config.ProgressColumn(
+             "Balanced Acc", format="compact", min_value=0, max_value=1.0, pinned=True, width="large"
+         ),
+         "generated_accuracy": st.column_config.ProgressColumn(
+             "🤖 Acc", format="compact", min_value=0, max_value=1.0, pinned=True, width="large"
+         ),
+         "pristine_accuracy": st.column_config.ProgressColumn(
+             "🧑‍🎤 Acc", format="compact", min_value=0, max_value=1.0, pinned=True, width="large"
+         ),
+         "fail_rate": st.column_config.NumberColumn(
+             "❌ Fail Rate",
+             format="compact",
+             width="small",
+         ),
+         "total_time": st.column_config.NumberColumn(
+             "🕒 Inference Time",
+             format="compact",
+             width="small",
+         ),
+     }
+
+     labels = {"pristine": "🧑‍🎤", "generated": "🤖"}
+
+     for c in results[f"{split}_score"].columns:
+         if "accuracy" in c:
+             continue
+         if any(p in c for p in ["generated", "pristine"]):
+             s = c.split("_")
+             pred = s[0]
+             source = " ".join(s[1:])
+             column_config[c] = st.column_config.ProgressColumn(
+                 labels[pred] + " " + source,
+                 help=c,
+                 format="compact",
+                 min_value=0,
+                 max_value=1.0,
+             )
+
+     "#### Summary"
+     st.dataframe(results[f"{split}_score"].loc[:, cols], column_config=column_config)
+
+     "#### Accuracy on 🤖 Generated by Source"
+
+     cols = [
+         c
+         for c in results[f"{split}_score"].columns
+         if "generated" in c and "accuracy" not in c
+     ]
+     st.dataframe(results[f"{split}_score"].loc[:, cols], column_config=column_config)
+
+     "#### Accuracy on 🧑‍🎤 Pristine by Source"
+
+     cols = [
+         c
+         for c in results[f"{split}_score"].columns
+         if "pristine" in c and "accuracy" not in c
+     ]
+     st.dataframe(results[f"{split}_score"].loc[:, cols], column_config=column_config)
+
+
+ split = "public"
+
+
+ st.markdown("#### Detailed Public Leaderboard")
+ st.markdown("[SAFE: Synthetic Audio Forensics Evaluation Challenge](https://stresearch.github.io/SAFE/)")
+
+ t1, t2 = st.tabs(["**Task 1**", "**Task 2**"])
+ with t1:
+     results1 = {f"{split}_score": load_results("task1.csv")}
+     show_leaderboard(results1)
+
+ with t2:
+     results2 = {f"{split}_score": load_results("task2.csv")}
+     show_leaderboard(results2)
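For quick inspection outside Streamlit, here is a minimal sketch (not part of the commit) that previews the same summary columns the app renders; it assumes only the task1.csv layout shown later in this diff:

import pandas as pd

# Load the leaderboard CSV that app.py reads and print the summary columns.
results = pd.read_csv("task1.csv").set_index("team")
summary_cols = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy", "fail_rate", "total_time"]
print(results[summary_cols].sort_values("balanced_accuracy", ascending=False))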
metric.py ADDED
@@ -0,0 +1,125 @@
+ import pandas as pd
+ from huggingface_hub import hf_hub_download
+ import json
+
+
+ def _metric(solution_df, submission_df, mode="top_level", admin=False):
+     """
+     Calculate per-source and aggregate accuracy for a set of submitted predictions.
+
+     Parameters
+     ----------
+     solution_df : pandas.DataFrame
+         The dataframe containing the solution data.
+     submission_df : pandas.DataFrame
+         The dataframe containing the submission data.
+     mode : str, optional
+         The mode of evaluation. Can be "top_level" or "detailed". The default is "top_level".
+     admin : bool, optional
+         If True, group per-source scores by the "source_og" column instead of "source".
+
+     Returns
+     -------
+     dict
+         Evaluation results keyed by "public_score" and "private_score".
+     """
+
+     solution_df["submission_pred"] = submission_df["pred"]
+
+     if admin:
+         source_col = "source_og"
+     else:
+         source_col = "source"
+
+     cols = ["split", "pred", source_col]
+
+     solution_df["correct"] = solution_df["pred"] == solution_df["submission_pred"]
+     accuracy = solution_df.groupby(cols)["correct"].mean().to_frame("accuracy").reset_index()
+     accuracy["score_name"] = accuracy["pred"] + "_" + accuracy[source_col]
+
+     evaluation = {}
+
+     split = "public"
+
+     temp = accuracy.query(f"split=='{split}'")
+     scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
+     scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
+     scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
+     scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"]) / 2.0
+
+     if mode == "top_level":
+         scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
+         evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
+     else:
+         evaluation[f"{split}_score"] = scores_by_source.to_dict()
+
+     split = "private"
+     # private has everything
+
+     temp = accuracy
+     scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
+     scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
+     scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
+     scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"]) / 2.0
+
+     if mode == "top_level":
+         scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
+         evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
+     else:
+         evaluation[f"{split}_score"] = scores_by_source.to_dict()
+
+     if "time" in submission_df.columns:
+         solution_df["submission_time"] = submission_df["time"]
+
+         split = "public"
+         evaluation[f"{split}_score"]["total_time"] = float(solution_df.query(f"split=='{split}'")["submission_time"].sum())
+
+         split = "private"
+         evaluation[f"{split}_score"]["total_time"] = float(solution_df["submission_time"].sum())
+     else:
+         for split in ["public", "private"]:
+             evaluation[f"{split}_score"]["total_time"] = -1
+
+     if "score" in submission_df.columns:
+         solution_df["submission_score"] = submission_df["score"]
+
+         split = "public"
+         evaluation[f"{split}_score"]["fail_rate"] = float(solution_df.query(f"split=='{split}'")["submission_score"].isna().mean())
+
+         split = "private"
+         evaluation[f"{split}_score"]["fail_rate"] = float(solution_df["submission_score"].isna().mean())
+     else:
+         for split in ["public", "private"]:
+             evaluation[f"{split}_score"]["fail_rate"] = -1
+
+     return evaluation
+
+
+ def compute(params):
+     solution_file = hf_hub_download(
+         repo_id=params.competition_id,
+         filename="solution.csv",
+         token=params.token,
+         repo_type="dataset",
+     )
+
+     solution_df = pd.read_csv(solution_file).set_index(params.submission_id_col)
+
+     submission_filename = f"submissions/{params.team_id}-{params.submission_id}.csv"
+     submission_file = hf_hub_download(
+         repo_id=params.competition_id,
+         filename=submission_filename,
+         token=params.token,
+         repo_type="dataset",
+     )
+
+     submission_df = pd.read_csv(submission_file).set_index(params.submission_id_col)
+
+     return _metric(solution_df, submission_df)
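As a quick sanity check on the scoring logic, a minimal sketch (not part of the commit) that calls `_metric` on toy dataframes; the column names (pred, split, source, time, score) come from the code above, the values are made up:

import pandas as pd
import metric

# Toy ground truth: two public and two private clips, one generated and one pristine each.
solution_df = pd.DataFrame(
    {
        "pred": ["generated", "pristine", "generated", "pristine"],
        "split": ["public", "public", "private", "private"],
        "source": ["g_01", "p_01", "g_01", "p_01"],
    },
    index=pd.Index([0, 1, 2, 3], name="id"),
)

# Toy submission: one mistake on the private pristine clip, plus per-clip time and score.
submission_df = pd.DataFrame(
    {
        "pred": ["generated", "pristine", "generated", "generated"],
        "time": [0.1, 0.2, 0.1, 0.2],
        "score": [0.9, 0.1, 0.8, 0.7],
    },
    index=pd.Index([0, 1, 2, 3], name="id"),
)

evaluation = metric._metric(solution_df, submission_df, mode="top_level")
print(evaluation["public_score"])   # generated/pristine/balanced accuracy, total_time, fail_rate
print(evaluation["private_score"])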
process_data.py ADDED
@@ -0,0 +1,140 @@
+ import pandas as pd
+ from pathlib import Path
+ import json
+ import metric
+ from sklearn.metrics import roc_auc_score, roc_curve
+ import numpy as np
+ import altair as alt
+
+ _metric = metric._metric
+
+
+ def get_submission(f):
+     submission_info = json.load(open(f))
+     submissions = pd.DataFrame(submission_info["submissions"])
+     submissions["team_id"] = submission_info["id"]
+
+     return submissions
+
+
+ def get_submissions_file(f):
+     submission_df = pd.read_csv(f).set_index("id")
+     if isinstance(submission_df.iloc[0]["score"], str):
+         submission_df.loc[:, "score"] = submission_df.loc[:, "score"].apply(lambda a: json.loads(a)[0])
+     return submission_df
+
+
+ def load_results(local_dir):
+     team_file_name = "teams.json"
+     team_info = pd.read_json(Path(local_dir) / team_file_name).T
+     team_info.loc["baselines", "name"] = "baselines"
+     submission_info_dir = "submission_info"
+     submission_info_files = list((Path(local_dir) / submission_info_dir).glob("*.json"))
+     # submission_info_files += ["baselines/baselines.json"]
+     submissions = pd.concat(
+         [get_submission(f) for f in submission_info_files], ignore_index=True
+     )
+     submissions.loc[:, "team"] = team_info.loc[
+         submissions["team_id"].values, "name"
+     ].values
+
+     submissions["submission_files"] = submissions.apply(
+         lambda a: (
+             str(
+                 Path(local_dir)
+                 / "submissions"
+                 / (a["team_id"] + "-" + a["submission_id"] + ".csv")
+             )
+             if a["team_id"] != "baselines"
+             else str(
+                 Path("baselines") / (a["team_id"] + "-" + a["submission_id"] + ".csv")
+             )
+         ),
+         axis=1,
+     )
+     submissions = submissions.drop(columns=["public_score", "private_score"])
+     submissions["submission"] = (
+         submissions["team"] + " - " + submissions["submission_repo"]
+     )
+     return submissions
+
+
+ def compute_metrics(submissions, local_dir, admin=True):
+
+     submissions = submissions.query("status==3.0")
+
+     if not admin:
+         selected_by_team = submissions.groupby("team")["selected"].sum()
+         teams_no_selected = selected_by_team.index[selected_by_team == 0]
+         submissions.loc[submissions.team.isin(teams_no_selected), "selected"] = True
+         submissions = submissions.query("selected")
+
+     solution_df = pd.read_csv(Path(local_dir) / "solution.csv").set_index("id")
+
+     results = {"private_score": [], "public_score": []}
+
+     fields = ["team_id", "team", "submission_id", "submission_repo"]
+     for i, row in submissions.T.items():
+         # r = pd.read_csv(row["submission_files"]).set_index("id")
+         r = get_submissions_file(row["submission_files"])
+         eval = _metric(solution_df, r, mode="detailed", admin=admin)
+         for m in ["private_score", "public_score"]:
+             for f in fields:
+                 eval[m][f] = row[f]
+             eval[m]["submission"] = f"{row.team} - {row.submission_repo}"
+
+             eval[m] = pd.Series(eval[m]).to_frame().T
+             results[m].append(eval[m])
+
+     for m in ["private_score", "public_score"]:
+         temp = pd.concat(results[m], ignore_index=True).T
+         temp.index.name = "metric"
+         temp = temp.reset_index()
+
+         # def parse(s):
+         #     if any(p in s for p in ["generated","pristine"]):
+         #         s = s.split("_")
+         #         return pd.Series(dict(pred=s[0], source="_".join(s[1:])))
+         #     else:
+         #         return pd.Series(dict(pred=s, source=None))
+
+         # temp = pd.concat([temp, temp["metric"].apply(parse)], axis=1)
+         # results[m] = temp.set_index(["pred","source"])
+         # results[m] = results[m].drop(columns=["metric"]).T
+         results[m] = (
+             temp.set_index("metric")
+             .T.sort_values("balanced_accuracy", ascending=False)
+             .drop_duplicates(subset=["team", "submission_repo"])
+         )
+
+         if not admin:
+             # only show top selected
+             results[m] = (
+                 results[m]
+                 .sort_values(["team", "balanced_accuracy"], ascending=False)
+                 .drop_duplicates(subset=["team"])
+                 .sort_values("balanced_accuracy", ascending=False)
+             )
+
+         results[m] = results[m].set_index("submission" if admin else "team")
+
+     fields_to_merge = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy", "total_time", "fail_rate"]
+
+     submissions = pd.concat(
+         [
+             submissions.set_index("submission_id"),
+             results["private_score"].reset_index().set_index("submission_id").loc[:, fields_to_merge],
+         ],
+         axis=1,
+     ).reset_index()
+
+     return results, submissions
+
+
+ def process_data(path, save_path):
+     submissions = load_results(path)
+     results, submissions = compute_metrics(submissions, path, admin=False)
+     cols_to_drop = ["team_id", "submission_id", "submission_repo", "submission"]
+     results["public_score"].drop(columns=cols_to_drop).to_csv(save_path)
+
+
+ if __name__ == "__main__":
+     process_data("comp_data_task1", "task1.csv")
+     process_data("comp_data_task2", "task2.csv")
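For context, a hedged sketch (not part of the commit) of how process_data.py is meant to be driven; the directory layout in the comments is inferred from load_results/compute_metrics above and may differ from the actual competition repos:

# Inferred layout of a downloaded competition directory (assumption, not
# documented in this commit):
#
#   comp_data_task1/
#       teams.json                      # maps team_id -> {"name": ...}
#       solution.csv                    # columns: id, pred, split, source[, source_og]
#       submission_info/*.json          # per-team {"id": ..., "submissions": [...]}
#       submissions/<team_id>-<submission_id>.csv
#
from process_data import process_data

# Rebuild the public leaderboard CSV that app.py displays.
process_data("comp_data_task1", "task1.csv")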
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ scikit-learn
+ numpy
run.sh ADDED
@@ -0,0 +1 @@
+ streamlit run app.py
task1.csv ADDED
@@ -0,0 +1,10 @@
+ team,generated_g_02,generated_g_04,generated_g_05,generated_g_06,generated_g_09,generated_g_10,generated_g_11,pristine_p_00,pristine_p_01,pristine_p_02,pristine_p_05,pristine_p_09,pristine_p_10,pristine_p_11,pristine_p_16,pristine_p_18,pristine_p_20,generated_accuracy,pristine_accuracy,balanced_accuracy,total_time,fail_rate
+ baseline-2,0.82,0.985,0.715,1.0,0.875,1.0,0.68,0.9,0.66,0.895,0.915,0.935,0.68,0.815,0.85,0.88,0.945,0.8678571428571428,0.8474999999999999,0.8576785714285713,1095.5729746818542,0.0
+ ISPL,0.965,0.985,0.445,1.0,1.0,1.0,0.29,1.0,0.08,0.985,0.98,0.97,0.835,0.925,0.965,0.86,0.995,0.812142857142857,0.8595,0.8358214285714285,86.06722044944746,0.0
+ baseline-1,0.825,0.995,0.465,0.97,1.0,1.0,0.925,0.96,0.37,0.955,0.815,1.0,0.69,0.285,0.635,0.885,0.955,0.8828571428571428,0.755,0.8189285714285715,2053.3383333683014,0.002058823529411765
+ DMF,0.925,0.74,0.76,0.965,0.835,1.0,1.0,0.37,0.0,0.11,0.105,0.81,0.175,0.99,0.53,0.43,0.45,0.8892857142857142,0.397,0.6431428571428571,73.15069174766523,0.0
+ Anon_Peking,0.985,0.92,0.91,0.98,0.89,0.97,0.95,0.28,0.105,0.125,0.23,0.2,0.355,0.09,0.15,0.165,0.125,0.9435714285714285,0.1825,0.5630357142857143,366.5311744213103,0.0
+ gylin,0.115,0.36,0.03,0.035,0.535,0.0,0.005,0.98,0.985,0.96,0.9,0.99,0.8,1.0,0.82,0.93,0.945,0.1542857142857143,0.931,0.5426428571428572,21.466523647308183,0.0
+ UCASRikki,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,116.53438615798935,0.0
+ safe-test,0.435,0.5,0.475,0.55,0.495,0.53,0.52,0.51,0.485,0.48,0.58,0.435,0.53,0.43,0.545,0.48,0.445,0.5007142857142858,0.492,0.4963571428571429,142.9702701568602,0.0
+ JAIST-HIS,0.895,0.94,0.985,0.525,0.89,0.98,0.455,0.01,0.035,0.025,0.03,0.13,0.065,0.0,0.04,0.035,0.07,0.8099999999999999,0.044,0.427,61.43132781982406,0.0
task2.csv ADDED
@@ -0,0 +1,5 @@
+ team,generated_g_02,generated_g_04,generated_g_05,generated_g_06,generated_g_09,generated_g_10,generated_g_11,pristine_p_00,pristine_p_01,pristine_p_02,pristine_p_05,pristine_p_09,pristine_p_10,pristine_p_11,pristine_p_16,pristine_p_18,pristine_p_20,generated_accuracy,pristine_accuracy,balanced_accuracy,total_time,fail_rate
+ baseline-2,0.8789473684210526,0.9394736842105263,0.6868421052631579,0.9394736842105263,0.9078947368421053,0.9263157894736842,0.8236842105263158,0.9,0.66,0.895,0.915,0.935,0.68,0.815,0.85,0.88,0.945,0.8718045112781956,0.8474999999999999,0.8596522556390978,1558.1854865550995,0.0
+ baseline-1,0.9210526315789473,0.9868421052631579,0.7684210526315789,0.9289473684210526,0.9763157894736842,0.9815789473684211,0.9157894736842105,0.96,0.36,0.955,0.815,1.0,0.69,0.29,0.635,0.885,0.955,0.925563909774436,0.7545,0.8400319548872179,2615.167044878006,0.008583690987124463
+ ISPL,0.9105263157894737,0.9131578947368421,0.7,0.9710526315789474,0.8710526315789474,0.95,0.5921052631578947,0.985,0.015,0.925,0.8,0.74,0.83,0.805,0.935,0.8,0.975,0.8439849624060151,0.7809999999999999,0.8124924812030074,4660.0,0.0
+ safe-test,0.49736842105263157,0.4710526315789474,0.4789473684210526,0.4842105263157895,0.5236842105263158,0.5026315789473684,0.49473684210526314,0.5,0.455,0.495,0.495,0.505,0.515,0.505,0.535,0.51,0.51,0.4932330827067669,0.5025,0.49786654135338343,200.45346903800942,0.0
update_data.sh ADDED
@@ -0,0 +1,3 @@
+ huggingface-cli download safe-challenge/SAFEChallengeTask1 --local-dir ./comp_data_task1 --repo-type dataset
+ huggingface-cli download safe-challenge/SAFEChallengeTask2 --local-dir ./comp_data_task2 --repo-type dataset
+ python process_data.py