Commit 2dafeb1
Parent(s): 7ae8833

Minor text edits and reformatting

Files changed:
- about.py +16 -10
- app.py +40 -27
- constants.py +22 -4
- data/example-predictions-heldout.csv +1 -1
- submit.py +4 -2
- utils.py +15 -9
about.py CHANGED

@@ -1,4 +1,10 @@
-from constants import …
+from constants import (
+    ABOUT_TAB_NAME,
+    ASSAY_LIST,
+    SUBMIT_TAB_NAME,
+    TERMS_URL,
+    FAQ_TAB_NAME,
+)
 
 ABOUT_INTRO = f"""
 ## About this challenge
@@ -7,15 +13,15 @@ ABOUT_INTRO = f"""
 
 #### What is antibody developability and why is it important?
 
-Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
+Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
 Properties such as these can often hinder the progression of an antibody to the clinic, and are collectively referred to as 'developability'.
 Here we invite the community to submit and develop better predictors, which will be tested out on a heldout private set to assess model generalization.
 
 #### 🏆 Prizes
 
-For each of the 5 properties in the competition, there is a prize for the model with the highest performance for that property on the private test set.
+For each of the 5 properties in the competition, there is a prize for the model with the highest performance for that property on the private test set.
 There is also an 'open-source' prize for the best model trained on the GDPa1 dataset (reporting cross-validation results) and assessed on the private test set where authors provide all training code and data.
-For each of these 6 prizes, participants have the choice between **$10k in data generation credits** with [Ginkgo Datapoints](https://datapoints.ginkgo.bio/) or a **cash prize** with a value of $2000.
+For each of these 6 prizes, participants have the choice between **$10k in data generation credits** with [Ginkgo Datapoints](https://datapoints.ginkgo.bio/) or a **cash prize** with a value of $2000.
 
 See the "{FAQ_TAB_NAME}" tab above (you are currently on the "{ABOUT_TAB_NAME}" tab) or the [competition terms]({TERMS_URL}) for more details.
 """
@@ -85,7 +91,7 @@ FAQS = {
     ),
     "How are winners determined?": (
         'There will be 6 prizes (one for each of the assay properties plus an "open-source" prize). '
-        …
+        "For the property-specific prizes, winners will be determined by the submission with the highest Spearman rank correlation coefficient on the private holdout set. "
         'For the "open-source" prize, this will be determined by the highest average Spearman across all properties. '
        "We reserve the right to award the open-source prize to a predictor with competitive results for a subset of properties (e.g. a top polyreactivity model)."
     ),
@@ -94,8 +100,8 @@ FAQS = {
     ),
     "What do I need to submit?": (
         'There is a tab on the Hugging Face competition page to upload predictions for datasets - for each dataset participants need to submit a CSV containing a column for each property they would like to predict (e.g. called "HIC"), '
-        …
-        …
+        "and a row with the sequence matching the sequence in the input file. These predictions are then evaluated in the backend using the Spearman rank correlation between predictions and experimental values, and these metrics are then added to the leaderboard. "
+        "Predictions remain private and are not seen by other contestants."
     ),
     "Can I submit predictions for only one property?": (
         "Yes. You do not need to predict all 5 properties to participate. Each property has its own leaderboard and prize, so you may submit models for a subset of the assays if you wish."
@@ -118,7 +124,7 @@ FAQS = {
 SUBMIT_INTRUCTIONS = f"""
 # Antibody Developability Submission
 Upload a CSV to get a score!
-List of valid property names: `{', '.join(ASSAY_LIST)}`.
+List of valid property names: `{', '.join(ASSAY_LIST)}`.
 
 You do **not** need to predict all 5 properties — each property has its own leaderboard and prize.
 
@@ -126,11 +132,11 @@ You do **not** need to predict all 5 properties — each property has its own le
 1. **Submit your predictions** as a CSV with `antibody_name` + one column per property you are predicting (e.g. `"antibody_name,Titer,PR_CHO"` if your model predicts Titer and Polyreactivity).
 2. **Final test submission**: Download test sequences from the example files below and upload predictions.
 
-The validation set results should appear on the leaderboard within a minute. The **private test set results will not appear on the leaderboards**, and will be used to determine the winners at the close of the competition.
+The validation set results should appear on the leaderboard within a minute. The **private test set results will not appear on the leaderboards**, and will be used to determine the winners at the close of the competition.
 We may release private test set results at intermediate points during the competition.
 
 ## Cross-validation
 
-For the cross-validation metrics (if training only on the GDPa1 dataset), use the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column to split the dataset into folds and make predictions for each of the folds.
+For the cross-validation metrics (if training only on the GDPa1 dataset), use the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column to split the dataset into folds and make predictions for each of the folds.
 Submit a CSV file in the same format but also containing the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column.
 """
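The FAQ text added above describes the scoring pipeline: predictions are joined to experimental values and scored per property by Spearman rank correlation. Below is a minimal sketch of that evaluation, assuming hypothetical file names and a ground-truth CSV keyed on `antibody_name`; the actual backend code is not part of this commit.

```python
import pandas as pd
from scipy.stats import spearmanr

# Hypothetical files; only antibody_name plus the properties you predict are required.
preds = pd.read_csv("my-predictions.csv")        # e.g. antibody_name,Titer,PR_CHO
truth = pd.read_csv("private-ground-truth.csv")  # held privately by the organizers

# Join predictions to experimental values on the shared antibody_name key,
# then compute one Spearman rank correlation per submitted property.
merged = preds.merge(truth, on="antibody_name", suffixes=("_pred", "_true"))
for prop in ["Titer", "PR_CHO"]:
    rho, _ = spearmanr(merged[f"{prop}_pred"], merged[f"{prop}_true"])
    print(f"{prop}: Spearman = {rho:.3f}")
```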
app.py CHANGED

@@ -1,12 +1,12 @@
 import pandas as pd
 
 import gradio as gr
-from gradio.themes.utils import …
+from gradio.themes.utils import sizes
 from gradio_leaderboard import Leaderboard
 
 from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INTRUCTIONS
 from constants import (
-    ASSAY_RENAME,  # …
+    ASSAY_RENAME,  # noqa: F401
     EXAMPLE_FILE_DICT,
     LEADERBOARD_DISPLAY_COLUMNS,
     ABOUT_TAB_NAME,
@@ -19,6 +19,7 @@ from constants import (
 from submit import make_submission
 from utils import fetch_hf_results, show_output_box
 
+
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
     if assay is not None:
@@ -29,8 +30,10 @@ def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None)
     # Note: We can also just say the following as a text box at the bottom of the leaderboard: "Note: Results for the Heldout Test Set are only evaluated at competition close"
     # Convert spearman column to string to avoid dtype incompatibility when assigning text
     df["spearman"] = df["spearman"].astype(str)
-    df.loc[
-        …
+    df.loc[
+        (df["dataset"] == "Heldout Test Set") & (df["spearman"] == "nan"), "spearman"
+    ] = "N/A, evaluated at competition close"
+
     # Finally, rename columns for readability
     df = df.rename(columns=LEADERBOARD_COLUMNS_RENAME)
     return df
@@ -46,8 +49,10 @@ def get_leaderboard_object(assay: str | None = None):
     lb = Leaderboard(
         value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
         datatype=["str", "str", "str", "number"],
-        select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
-            …
+        select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
+            ["model", "property", "spearman", "dataset"]
+        ),
+        search_columns=["Model Name"],
         filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
         every=15,
         render=True,
@@ -62,32 +67,30 @@ current_dataframe = fetch_hf_results()
 with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
     timer = gr.Timer(3)  # Run every 3 seconds when page is focused
     data_version = gr.State(value=0)  # Track data changes
-
+
     def update_current_dataframe():
         global current_dataframe
         new_dataframe = fetch_hf_results()
-
+
         # Check if data has actually changed
         if not current_dataframe.equals(new_dataframe):
             current_dataframe = new_dataframe
             return data_version.value + 1  # Increment version to trigger updates
         return data_version.value
-
+
     timer.tick(fn=update_current_dataframe, outputs=data_version)
-
+
     ## Header
-
+
     with gr.Row():
         with gr.Column(scale=6):  # bigger text area
             gr.Markdown(
                 f"""
 ## Welcome to the Ginkgo Antibody Developability Benchmark!
 
-**Beta version, not publicly launched yet**
-
 Participants can submit their model to the leaderboards by simply uploading a CSV file (see the "✉️ Submit" tab).
-
-You can **predict any or all of the 5 properties**, and
+
+You can **predict any or all of the 5 properties**, and you can filter the main leaderboard by property.
 See more details in the "{ABOUT_TAB_NAME}" tab.
 """
             )
@@ -96,13 +99,18 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
                 value="./assets/competition_logo.jpg",
                 show_label=False,
                 show_download_button=False,
-                width="25vw",
+                width="25vw",  # Take up the width of the column (2/8 = 1/4)
             )
-
+
     with gr.Tabs(elem_classes="tab-buttons"):
         with gr.TabItem(ABOUT_TAB_NAME, elem_id="abdev-benchmark-tab-table"):
             gr.Markdown(ABOUT_INTRO)
-            gr.Image(…
+            gr.Image(
+                value="./assets/prediction_explainer.png",
+                show_label=False,
+                show_download_button=False,
+                width="50vw",
+            )
             gr.Markdown(ABOUT_TEXT)
 
         # Procedurally make these 5 tabs
@@ -113,26 +121,31 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
         #     ) as tab_item:
        #         gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
         #         lb = get_leaderboard_object(assay=assay)
-
+
         #         def refresh_leaderboard(assay=assay):
         #             return format_leaderboard_table(df_results=current_dataframe, assay=assay)
-
+
         #         # Refresh when data version changes
         #         data_version.change(fn=refresh_leaderboard, outputs=lb)
 
         # Note(Lood): Trying out just one leaderboard. We could also have a dropdown here that shows different leaderboards for each property, but that's just the same as the filters
-        with gr.TabItem(…
+        with gr.TabItem(
+            "🏆 Leaderboard", elem_id="abdev-benchmark-tab-table"
+        ) as leaderboard_tab:
             gr.Markdown(
-                "…
+                """
+                # Overall Leaderboard (filter below by property)
+                Each property has its own prize, and participants can submit models for any combination of properties.
+                """
             )
             lb = get_leaderboard_object()
-
+
             def refresh_overall_leaderboard():
                 return format_leaderboard_table(df_results=current_dataframe)
-
+
             # Refresh when data version changes
             data_version.change(fn=refresh_overall_leaderboard, outputs=lb)
-
+
             # At the bottom of the leaderboard, we can keep as NaN and explain missing test set results
             # gr.Markdown(
             #     "_ℹ️ Results for the private test set will not be shown here and will be used for final judging at the close of the competition._"
@@ -245,7 +258,7 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
             question = f"{i+1}. {question}"
             with gr.Accordion(question, open=False):
                 gr.Markdown(f"*{answer}*")  # Italics for answers
-
+
     # Footnote
     gr.Markdown(
         f"""
@@ -258,4 +271,4 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(ssr_mode=False)
+    demo.launch(ssr_mode=False, share=True)
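The `gr.Timer` / `gr.State` wiring above implements a poll-and-refresh loop: the timer ticks every 3 seconds, the state version is bumped only when freshly fetched results differ, and components wired to `data_version.change` re-render. Below is a self-contained sketch of the same pattern with illustrative names and a stand-in for `fetch_hf_results`; it assumes a Gradio version that provides `gr.Timer`.

```python
import random

import gradio as gr

def fetch_data() -> str:
    return random.choice(["A", "B"])  # stand-in for fetch_hf_results()

latest = {"data": fetch_data()}

def poll(version: int) -> int:
    new = fetch_data()
    if new != latest["data"]:
        latest["data"] = new
        return version + 1  # bump the version so .change() listeners fire
    return version          # unchanged: no downstream refresh

with gr.Blocks() as demo:
    version = gr.State(value=0)
    timer = gr.Timer(3)  # seconds between ticks while the page is focused
    box = gr.Textbox(label="Current data")
    timer.tick(fn=poll, inputs=version, outputs=version)
    version.change(fn=lambda: latest["data"], outputs=box)

if __name__ == "__main__":
    demo.launch()
```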
constants.py CHANGED

@@ -55,7 +55,9 @@ ANTIBODY_NAMES_DICT = {
     "GDPa1_cross_validation": pd.read_csv(EXAMPLE_FILE_DICT["GDPa1_cross_validation"])[
         "antibody_name"
     ].tolist(),
-    "Heldout Test Set": pd.read_csv(EXAMPLE_FILE_DICT["Heldout Test Set"])[…
+    "Heldout Test Set": pd.read_csv(EXAMPLE_FILE_DICT["Heldout Test Set"])[
+        "antibody_name"
+    ].tolist(),
 }
 
 # Huggingface API
@@ -69,8 +71,22 @@ SUBMISSIONS_REPO = f"{ORGANIZATION}/abdev-bench-submissions"
 RESULTS_REPO = f"{ORGANIZATION}/abdev-bench-results"
 
 # Leaderboard dataframes
-LEADERBOARD_RESULTS_COLUMNS = […
-…
+LEADERBOARD_RESULTS_COLUMNS = [
+    "model",
+    "assay",
+    "spearman",
+    "dataset",
+    "user",
+    "submission_time",
+]  # The columns expected from the results dataset
+LEADERBOARD_DISPLAY_COLUMNS = [
+    "model",
+    "property",
+    "spearman",
+    "dataset",
+    "user",
+    "submission_time",
+]  # After changing assay to property (pretty formatting)
 LEADERBOARD_COLUMNS_RENAME = {
     "spearman": "Spearman Correlation",
     "dataset": "Dataset",
@@ -79,5 +95,7 @@ LEADERBOARD_COLUMNS_RENAME = {
     "model": "Model Name",
     "property": "Property",
 }
+
+
 def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
-    return list(map(lambda x: LEADERBOARD_COLUMNS_RENAME.get(x,x), columns))
+    return list(map(lambda x: LEADERBOARD_COLUMNS_RENAME.get(x, x), columns))
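The helper at the end maps internal column names to their display names, passing unrecognized names through unchanged via `dict.get(x, x)`. A small self-contained sketch of its behavior (the rename dict is abridged here; the real one may carry more entries):

```python
# Abridged copy of the mapping from constants.py.
LEADERBOARD_COLUMNS_RENAME = {
    "spearman": "Spearman Correlation",
    "dataset": "Dataset",
    "model": "Model Name",
    "property": "Property",
}

def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
    # Unknown names fall through unchanged thanks to dict.get(x, x)
    return list(map(lambda x: LEADERBOARD_COLUMNS_RENAME.get(x, x), columns))

print(LEADERBOARD_COLUMNS_RENAME_LIST(["model", "spearman", "not_a_column"]))
# ['Model Name', 'Spearman Correlation', 'not_a_column']
```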
data/example-predictions-heldout.csv CHANGED

@@ -78,4 +78,4 @@ P907-A14-unary-estuary-9ae8d,EVQLVESGGGLVQPGGSLRLSCAASGFTFSRYWMSWVRQAPGKGLEWVANI
 P907-A14-undirected-hull-8daff,QMQLVQSGAEVRKPGASVKVSCKASGYTFTGHYIHWVRQAPGRGPEWMGWINPNSGGTNSSQSFQGRVTMTRDTSISTAYMELSRLTSDDTAVYSCARARYGDYYYFDSWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQDISSYLAWYQQKPEKAPKSLIYAASSLQGGVPSRFSGSGSGTHFTLTISSLQPEDFATYYCQQYYSYPVTFGPGTKVDIK,QMQLVQS-GAEVRKPGASVKVSCKASG-YTFTG-----HYIHWVRQAPGRGPEWMGWINPN---SGGTNSSQSFQGRVTMTRDTSISTAYMELSRLTSDDTAVYSCARARYGD-------------------YYYFDSWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------SYLAWYQQKPEKAPKSLIYA--------ASSLQGGVPSRFSGSGSG--THFTLTISSLQPEDFATYYCQQYYS-----------------------YPVTFGPGTKVDIK-,IgG1,Kappa
 P907-A14-vain-bucket-0f231,QVQLQQWGAGLLKPSETLSLTCAVYNGSSSAHYWSWVRQPPGKGLEWIGEISHGGSTTYNPSLKGRVSISVDTPKNQFSLNLSSVTAADTAVYYCATRAIHFRNRNFYSFYVEVWGKGTTVTVSS,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSKLVWYQQRPGQAPRPLIYGASSRATGIPDRFSGSGSETDFTLTISWLEPEDFAVYYCHQYGSSPRTFGQGTKVEIK,QVQLQQW-GAGLLKPSETLSLTCAVYN-GSSSA-----HYWSWVRQPPGKGLEWIGEISH----GGSTTYNPSLKGRVSISVDTPKNQFSLNLSSVTAADTAVYYCATRAIHFRNR-------------NFYSFYVEVWGKGTTVTVSS,EIVLTQSPGTLSLSPGERATLSCRAS--QSVSS-----SKLVWYQQRPGQAPRPLIYG--------ASSRATGIPDRFSGSGSE--TDFTLTISWLEPEDFAVYYCHQYGS-----------------------SPRTFGQGTKVEIK-,IgG1,Kappa
 P907-A14-wintry-couple-24188,QVQLQQWGAGLLKPSETLSVTCAVYGGSFIGSSWIWIRQPPEKGLEWIGEINHGGSTTYNPSLKSRVTISLDMSKNQFSLNLTSVTAADTAVYYCATDRGSLAAVDWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQAISSYLAWYQQKPGKVPKLLIYAASTLQSGVASRFTGSGSGTDFTLTISSLQPEDVATYYCQKYNSAPRTFGQGTRVEIK,QVQLQQW-GAGLLKPSETLSVTCAVYG-GSFIG-----SSWIWIRQPPEKGLEWIGEINH----GGSTTYNPSLKSRVTISLDMSKNQFSLNLTSVTAADTAVYYCATDRGS---------------------LAAVDWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRAS--QAIS------SYLAWYQQKPGKVPKLLIYA--------ASTLQSGVASRFTGSGSG--TDFTLTISSLQPEDVATYYCQKYNS-----------------------APRTFGQGTRVEIK-,IgG1,Kappa
-P907-A14-witty-fugue-86932,EVQLVESGGGLVQPGRSLRLSCTASGFTFGDYAMNWVRQAPGKGLEWLGFIESKGYGGTTEYAASVKGRFIISRDDSKSIAYLQMNSLKTEDTAVYYCTPGDYWGQGTLVTVSS,SYELTQPPSVSVSPGQTARITCSGDALPKKYAYWYQQKSGQAPVQVIYEDSGRPSGIPERFSGSSSGTMATLTISGAQVEDEADYYCYSIDSSGNHRVFGGGTKLTVL,EVQLVES-GGGLVQPGRSLRLSCTASG-FTFGD-----YAMNWVRQAPGKGLEWLGFIESKG-YGGTTEYAASVKGRFIISRDDSKSIAYLQMNSLKTEDTAVYYCTPG---------------------------DYWGQGTLVTVSS,SYELTQP-PSVSVSPGQTARITCSGD---ALPK-----KYAYWYQQKSGQAPVQVIYE--------DSGRPSGIPERFSGSSSG--TMATLTISGAQVEDEADYYCYSIDSS---------------------GNHRVFGGGTKLTVL-,IgG1,Lambda
+P907-A14-witty-fugue-86932,EVQLVESGGGLVQPGRSLRLSCTASGFTFGDYAMNWVRQAPGKGLEWLGFIESKGYGGTTEYAASVKGRFIISRDDSKSIAYLQMNSLKTEDTAVYYCTPGDYWGQGTLVTVSS,SYELTQPPSVSVSPGQTARITCSGDALPKKYAYWYQQKSGQAPVQVIYEDSGRPSGIPERFSGSSSGTMATLTISGAQVEDEADYYCYSIDSSGNHRVFGGGTKLTVL,EVQLVES-GGGLVQPGRSLRLSCTASG-FTFGD-----YAMNWVRQAPGKGLEWLGFIESKG-YGGTTEYAASVKGRFIISRDDSKSIAYLQMNSLKTEDTAVYYCTPG---------------------------DYWGQGTLVTVSS,SYELTQP-PSVSVSPGQTARITCSGD---ALPK-----KYAYWYQQKSGQAPVQVIYE--------DSGRPSGIPERFSGSSSG--TMATLTISGAQVEDEADYYCYSIDSS---------------------GNHRVFGGGTKLTVL-,IgG1,Lambda
submit.py CHANGED

@@ -3,7 +3,6 @@ import tempfile
 from typing import BinaryIO
 import json
 
-from click import pass_obj
 import gradio as gr
 from datetime import datetime, timezone
 import uuid
@@ -58,6 +57,7 @@ def upload_submission(
     )
     Path(tmp_name).unlink()
 
+
 def make_submission(
     submitted_file: BinaryIO,
     user_state,
@@ -79,7 +79,9 @@ def make_submission(
         model_description = ""
         # raise gr.Error("Please provide a model description.")  # Not mandatory anymore
     if str(registration_code).strip().upper() != REGISTRATION_CODE:
-        raise gr.Error(…
+        raise gr.Error(
+            "Invalid registration code. Please register on the <a href='https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition'>Competition Registration page</a> or email <a href='mailto:[email protected]'>[email protected]</a>."
+        )
     if submitted_file is None:
         raise gr.Error("Please upload a CSV file before submitting.")
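The added error message follows the usual Gradio guard pattern: raising `gr.Error` inside a callback surfaces as a modal in the UI rather than crashing the app. A minimal sketch with a placeholder code value (the real `REGISTRATION_CODE` is defined elsewhere in the repo):

```python
import gradio as gr

REGISTRATION_CODE = "EXAMPLE123"  # placeholder; not the competition's real code

def check_code(registration_code: str) -> str:
    # Mirrors the normalization in make_submission: trim whitespace, uppercase.
    if str(registration_code).strip().upper() != REGISTRATION_CODE:
        raise gr.Error("Invalid registration code.")  # shown as a modal in the UI
    return "Code accepted."

with gr.Blocks() as demo:
    code = gr.Textbox(label="Registration code")
    status = gr.Textbox(label="Status")
    code.submit(check_code, inputs=code, outputs=status)

if __name__ == "__main__":
    demo.launch()
```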
utils.py CHANGED

@@ -5,12 +5,13 @@ import hashlib
 from typing import Iterable, Union
 from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
 
-pd.set_option(…
+pd.set_option("display.max_columns", None)
 
 
 def show_output_box(message):
     return gr.update(value=message, visible=True)
 
+
 def anonymize_user(username: str) -> str:
     # Anonymize using a hash of the username
     return hashlib.sha256(username.encode()).hexdigest()[:8]
@@ -20,16 +21,21 @@ def fetch_hf_results():
     # For debugging
     # # Print current time in EST
     # EST = timezone(timedelta(hours=-4))
-    # print(f"tmp: Fetching results from HF at {datetime.now(EST)}")
+    # print(f"tmp: Fetching results from HF at {datetime.now(EST)}")
     # Should cache by default if not using force_redownload
     df = load_dataset(
-        RESULTS_REPO,
+        RESULTS_REPO,
+        data_files="auto_submissions/metrics_all.csv",
     )["train"].to_pandas()
-    assert all(…
+    assert all(
+        col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
+    ), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
     # Show latest submission only
-    df = df.sort_values("submission_time", ascending=False).drop_duplicates(…
+    df = df.sort_values("submission_time", ascending=False).drop_duplicates(
+        subset=["model", "assay", "user"], keep="first"
+    )
     df["property"] = df["assay"].map(ASSAY_RENAME)
-
+
     # Anonymize the user column at this point
     df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
 
@@ -66,14 +72,14 @@ def readable_hash(
     data: Union[str, bytes, Iterable[int]],
     *,
     salt: Union[str, bytes, None] = None,
-    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS+NOUNS),
+    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS + NOUNS),
     sep: str = "-",
     checksum_len: int = 2,  # 0 to disable; 2–3 is plenty
-    case: str = "lower"
+    case: str = "lower",  # "lower" | "title" | "upper"
 ) -> str:
     """
     Deterministically map input data to 'adjective-animal[-checksum]'. Generated using ChatGPT.
-
+
     Examples
     --------
     >>> readable_hash("hello world")
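The "latest submission only" block added to `fetch_hf_results` keeps one row per `(model, assay, user)` by sorting newest-first and dropping duplicates. A toy illustration of that pandas idiom:

```python
import pandas as pd

# Two submissions from the same model/assay/user; only the newest should survive.
df = pd.DataFrame({
    "model": ["m1", "m1"],
    "assay": ["HIC", "HIC"],
    "user": ["alice", "alice"],
    "spearman": [0.41, 0.47],
    "submission_time": ["2025-01-01T00:00:00", "2025-02-01T00:00:00"],
})

# Sort newest-first, then keep the first (i.e. most recent) row per key.
latest = df.sort_values("submission_time", ascending=False).drop_duplicates(
    subset=["model", "assay", "user"], keep="first"
)
print(latest)  # keeps only the 2025-02-01 row (spearman 0.47)
```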