Add files
- .pre-commit-config.yaml +33 -0
- .python-version +1 -0
- .vscode/extensions.json +8 -0
- .vscode/settings.json +17 -0
- README.md +4 -4
- app.py +186 -0
- app_mcp.py +133 -0
- pyproject.toml +58 -0
- requirements.txt +335 -0
- search.py +30 -0
- style.css +19 -0
- table.py +140 -0
- uv.lock +0 -0
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,33 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-toml
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.13
+    hooks:
+      - id: ruff-check
+        args: ["--fix"]
+      - id: ruff-format
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.16.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+        additional_dependencies:
+          [
+            "types-python-slugify",
+            "types-pytz",
+            "types-PyYAML",
+            "types-requests",
+          ]
.python-version
ADDED
@@ -0,0 +1 @@
+3.10
.vscode/extensions.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "recommendations": [
+    "ms-python.python",
+    "charliermarsh.ruff",
+    "streetsidesoftware.code-spell-checker",
+    "tamasfe.even-better-toml"
+  ]
+}
.vscode/settings.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "editor.formatOnSave": true,
+  "files.insertFinalNewline": false,
+  "[python]": {
+    "editor.defaultFormatter": "charliermarsh.ruff",
+    "editor.formatOnType": true,
+    "editor.codeActionsOnSave": {
+      "source.fixAll.ruff": "explicit",
+      "source.organizeImports": "explicit"
+    }
+  },
+  "[jupyter]": {
+    "files.insertFinalNewline": false
+  },
+  "notebook.output.scrolling": true,
+  "notebook.formatOnSave.enabled": true
+}
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: CVPR2025
-emoji:
-colorFrom:
-colorTo:
+emoji: ⚡
+colorFrom: red
+colorTo: purple
 sdk: gradio
-sdk_version: 5.33.
+sdk_version: 5.33.1
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+
+import gradio as gr
+import polars as pl
+
+from app_mcp import demo as demo_mcp
+from search import search
+from table import df_orig
+
+DESCRIPTION = "# CVPR 2025"
+
+# TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed  # noqa: TD002, FIX002
+NOTE = """\
+Note: Sorting by upvotes or comments may not work correctly due to a known bug in Gradio.
+"""
+
+df_main = df_orig.select(
+    "title",
+    "authors_str",
+    "cvf_md",
+    "paper_page_md",
+    "upvotes",
+    "num_comments",
+    "project_page_md",
+    "github_md",
+    "Spaces",
+    "Models",
+    "Datasets",
+    "claimed",
+    "abstract",
+    "paper_id",
+)
+# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
+# format numbers as strings
+df_main = df_main.with_columns(
+    [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+)
+
+df_main = df_main.rename(
+    {
+        "title": "Title",
+        "authors_str": "Authors",
+        "cvf_md": "CVF",
+        "paper_page_md": "Paper page",
+        "upvotes": "👍",
+        "num_comments": "💬",
+        "project_page_md": "Project page",
+        "github_md": "GitHub",
+    }
+)
+
+COLUMN_INFO = {
+    "Title": ("str", "40%"),
+    "Authors": ("str", "20%"),
+    "Paper page": ("markdown", "135px"),
+    "👍": ("number", "50px"),
+    "💬": ("number", "50px"),
+    "CVF": ("markdown", None),
+    "Project page": ("markdown", None),
+    "GitHub": ("markdown", None),
+    "Spaces": ("markdown", None),
+    "Models": ("markdown", None),
+    "Datasets": ("markdown", None),
+    "claimed": ("markdown", None),
+}
+
+
+DEFAULT_COLUMNS = [
+    "Title",
+    "Paper page",
+    "👍",
+    "💬",
+    "CVF",
+    "Project page",
+    "GitHub",
+    "Spaces",
+    "Models",
+    "Datasets",
+]
+
+
+def update_num_papers(df: pl.DataFrame) -> str:
+    if "claimed" in df.columns:
+        return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+    return f"{len(df)} / {len(df_main)}"
+
+
+def update_df(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
+    column_names: list[str],
+) -> gr.Dataframe:
+    if num_results > candidate_pool_size:
+        raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+    df = df_main.clone()
+    column_names = ["Title", *column_names]
+
+    if search_query:
+        results = search(search_query, candidate_pool_size, num_results)
+        if not results:
+            df = df.head(0)
+        else:
+            df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+            df = df.sort("ce_score", descending=True).drop("ce_score")
+
+    sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+    df = df.select(sorted_column_names)
+    return gr.Dataframe(
+        value=df,
+        datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+        column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+    )
+
+
+with gr.Blocks(css_paths="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+    with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+        with gr.Row():
+            candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+            num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+
+    column_names = gr.CheckboxGroup(
+        label="Columns",
+        choices=[col for col in COLUMN_INFO if col != "Title"],
+        value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+    )
+
+    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+    gr.Markdown(NOTE)
+    df = gr.Dataframe(
+        value=df_main,
+        datatype=[COLUMN_INFO[col][0] for col in COLUMN_INFO],
+        type="polars",
+        row_count=(0, "dynamic"),
+        show_row_numbers=True,
+        interactive=False,
+        max_height=1000,
+        elem_id="table",
+        column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+    )
+
+    inputs = [
+        search_query,
+        candidate_pool_size,
+        num_results,
+        column_names,
+    ]
+    gr.on(
+        triggers=[
+            search_query.submit,
+            column_names.input,
+        ],
+        fn=update_df,
+        inputs=inputs,
+        outputs=df,
+        api_name=False,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+        api_name=False,
+    )
+    demo.load(
+        fn=update_df,
+        inputs=inputs,
+        outputs=df,
+        api_name=False,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+        api_name=False,
+    )
+
+    with gr.Row(visible=False):
+        demo_mcp.render()
+
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
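A note on how `COLUMN_INFO` is consumed above: `update_df` re-sorts the user's checkbox selection into the dictionary's declaration order, then derives both `datatype` and `column_widths` from the same tuples. A minimal sketch of that ordering behavior, using a trimmed-down, hypothetical `COLUMN_INFO`:

```python
# Sketch of update_df's column ordering, with a trimmed, hypothetical COLUMN_INFO.
COLUMN_INFO = {
    "Title": ("str", "40%"),
    "👍": ("number", "50px"),
    "GitHub": ("markdown", None),
}

selected = ["GitHub", "👍", "Title"]  # order as it comes back from the CheckboxGroup
# update_df iterates COLUMN_INFO, so the declaration order wins:
sorted_column_names = [col for col in COLUMN_INFO if col in selected]
print(sorted_column_names)  # ['Title', '👍', 'GitHub']
print([COLUMN_INFO[col][0] for col in sorted_column_names])  # ['str', 'number', 'markdown']
print([COLUMN_INFO[col][1] for col in sorted_column_names])  # ['40%', '50px', None]
```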
app_mcp.py
ADDED
@@ -0,0 +1,133 @@
+import gradio as gr
+import polars as pl
+
+from search import search
+from table import df_orig
+
+COLUMNS_MCP = [
+    "title",
+    "authors",
+    "abstract",
+    "cvf_page_url",
+    "pdf_url",
+    "supp_url",
+    "arxiv_id",
+    "paper_page",
+    "bibtex",
+    "space_ids",
+    "model_ids",
+    "dataset_ids",
+    "upvotes",
+    "num_comments",
+    "project_page",
+    "github",
+    "row_index",
+]
+DEFAULT_COLUMNS_MCP = [
+    "title",
+    "authors",
+    "abstract",
+    "cvf_page_url",
+    "pdf_url",
+    "arxiv_id",
+    "project_page",
+    "github",
+    "row_index",
+]
+
+df_mcp = df_orig.rename({"cvf": "cvf_page_url", "paper_id": "row_index"}).select(COLUMNS_MCP)
+
+
+def search_papers(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
+    columns: list[str],
+) -> list[dict]:
+    """Searches CVPR 2025 papers relevant to a user query in English.
+
+    This function performs a semantic search over CVPR 2025 papers.
+    It uses a dual-stage retrieval process:
+    - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+    - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+    - The search results are returned as a list of dictionaries.
+
+    Note:
+        The search query must be written in English. Queries in other languages are not supported.
+
+    Args:
+        search_query (str): The natural language query input by the user. Must be in English.
+        candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+        num_results (int): Final number of top-ranked papers to return after re-ranking.
+        columns (list[str]): The columns to select from the DataFrame.
+
+    Returns:
+        list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+    """
+    if not search_query:
+        raise ValueError("Search query cannot be empty")
+    if num_results > candidate_pool_size:
+        raise ValueError("Number of results must be less than or equal to candidate pool size")
+
+    df = df_mcp.clone()
+    results = search(search_query, candidate_pool_size, num_results)
+    df = pl.DataFrame(results).rename({"paper_id": "row_index"}).join(df, on="row_index", how="inner")
+    df = df.sort("ce_score", descending=True)
+    return df.select(columns).to_dicts()
+
+
+def get_metadata(row_index: int) -> dict:
+    """Returns a dictionary of metadata for a CVPR 2025 paper at the given table row index.
+
+    Args:
+        row_index (int): The index of the paper in the internal paper list table.
+
+    Returns:
+        dict: A dictionary containing metadata for the corresponding paper.
+    """
+    return df_mcp.filter(pl.col("row_index") == row_index).to_dicts()[0]
+
+
+def get_table(columns: list[str]) -> list[dict]:
+    """Returns a list of dictionaries of all CVPR 2025 papers.
+
+    Args:
+        columns (list[str]): The columns to select from the DataFrame.
+
+    Returns:
+        list[dict]: A list of dictionaries of all CVPR 2025 papers.
+    """
+    return df_mcp.select(columns).to_dicts()
+
+
+with gr.Blocks() as demo:
+    search_query = gr.Textbox(label="Search", submit_btn=True)
+    candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+    num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+    column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+    row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+
+    out = gr.JSON()
+
+    search_papers_btn = gr.Button("Search Papers")
+    get_metadata_btn = gr.Button("Get Metadata")
+    get_table_btn = gr.Button("Get Table")
+
+    search_papers_btn.click(
+        fn=search_papers,
+        inputs=[search_query, candidate_pool_size, num_results, column_names],
+        outputs=out,
+    )
+    get_metadata_btn.click(
+        fn=get_metadata,
+        inputs=row_index,
+        outputs=out,
+    )
+    get_table_btn.click(
+        fn=get_table,
+        inputs=column_names,
+        outputs=out,
+    )
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
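Because `demo.launch(mcp_server=True)` exposes these functions as MCP tools, their docstrings and type hints double as the tool schema. They are also plain Python functions, so a quick in-process smoke test is possible; a sketch, assuming the Space's datasets and models are reachable (importing `app_mcp` loads them) and using an illustrative query:

```python
# In-process smoke test of the MCP tool functions; the query string is
# illustrative, and importing app_mcp downloads the underlying data.
from app_mcp import get_metadata, search_papers

hits = search_papers(
    search_query="gaussian splatting for dynamic scenes",
    candidate_pool_size=100,
    num_results=5,
    columns=["title", "arxiv_id", "row_index"],
)
for hit in hits:
    print(hit["row_index"], hit["title"])

meta = get_metadata(row_index=hits[0]["row_index"])  # full record for the top hit
print(meta["cvf_page_url"])
```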
pyproject.toml
ADDED
@@ -0,0 +1,58 @@
+[project]
+name = "cvpr2025"
+version = "0.1.0"
+description = ""
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "datasets>=3.6.0",
+    "faiss-cpu>=1.11.0",
+    "gradio[mcp]>=5.33.1",
+    "hf-transfer>=0.1.9",
+    "loguru>=0.7.3",
+    "polars>=1.30.0",
+    "sentence-transformers>=4.1.0",
+    "spaces>=0.37.0",
+    "torch==2.4.0",
+]
+
+[tool.ruff]
+line-length = 119
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    "COM812", # missing-trailing-comma
+    "D203", # one-blank-line-before-class
+    "D213", # multi-line-summary-second-line
+    "E501", # line-too-long
+    "SIM117", # multiple-with-statements
+    #
+    "D100", # undocumented-public-module
+    "D101", # undocumented-public-class
+    "D102", # undocumented-public-method
+    "D103", # undocumented-public-function
+    "D104", # undocumented-public-package
+    "D105", # undocumented-magic-method
+    "D107", # undocumented-public-init
+    "EM101", # raw-string-in-exception
+    "FBT001", # boolean-type-hint-positional-argument
+    "FBT002", # boolean-default-value-positional-argument
+    "PD901", # pandas-df-variable-name
+    "PGH003", # blanket-type-ignore
+    "PLR0913", # too-many-arguments
+    "PLR0915", # too-many-statements
+    "TRY003", # raise-vanilla-args
+]
+unfixable = [
+    "F401", # unused-import
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"*.ipynb" = ["T201", "T203"]
+
+[tool.ruff.format]
+docstring-code-format = true
requirements.txt
ADDED
@@ -0,0 +1,335 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.12.11
+    # via fsspec
+aiosignal==1.3.2
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.9.0
+    # via
+    #   gradio
+    #   httpx
+    #   mcp
+    #   sse-starlette
+    #   starlette
+async-timeout==5.0.1
+    # via aiohttp
+attrs==25.3.0
+    # via aiohttp
+certifi==2025.4.26
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.2
+    # via requests
+click==8.2.1
+    # via
+    #   typer
+    #   uvicorn
+datasets==3.6.0
+    # via cvpr2025 (pyproject.toml)
+dill==0.3.8
+    # via
+    #   datasets
+    #   multiprocess
+exceptiongroup==1.3.0
+    # via anyio
+faiss-cpu==1.11.0
+    # via cvpr2025 (pyproject.toml)
+fastapi==0.115.12
+    # via gradio
+ffmpy==0.6.0
+    # via gradio
+filelock==3.18.0
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+frozenlist==1.6.2
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.3.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==5.33.1
+    # via
+    #   cvpr2025 (pyproject.toml)
+    #   spaces
+gradio-client==1.10.3
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-transfer==0.1.9
+    # via cvpr2025 (pyproject.toml)
+hf-xet==1.1.3
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   mcp
+    #   safehttpx
+    #   spaces
+httpx-sse==0.4.0
+    # via mcp
+huggingface-hub==0.32.4
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+joblib==1.5.1
+    # via scikit-learn
+loguru==0.7.3
+    # via cvpr2025 (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==3.0.2
+    # via
+    #   gradio
+    #   jinja2
+mcp==1.9.3
+    # via gradio
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+multidict==6.4.4
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via datasets
+networkx==3.4.2
+    # via torch
+numpy==2.2.6
+    # via
+    #   datasets
+    #   faiss-cpu
+    #   gradio
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.9.86
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
+orjson==3.10.18
+    # via gradio
+packaging==25.0
+    # via
+    #   datasets
+    #   faiss-cpu
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+pandas==2.3.0
+    # via
+    #   datasets
+    #   gradio
+pillow==11.2.1
+    # via
+    #   gradio
+    #   sentence-transformers
+polars==1.30.0
+    # via cvpr2025 (pyproject.toml)
+propcache==0.3.1
+    # via
+    #   aiohttp
+    #   yarl
+psutil==5.9.8
+    # via spaces
+pyarrow==20.0.0
+    # via datasets
+pydantic==2.11.5
+    # via
+    #   fastapi
+    #   gradio
+    #   mcp
+    #   pydantic-settings
+    #   spaces
+pydantic-core==2.33.2
+    # via pydantic
+pydantic-settings==2.9.1
+    # via mcp
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-dotenv==1.1.0
+    # via pydantic-settings
+python-multipart==0.0.20
+    # via
+    #   gradio
+    #   mcp
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+rich==14.0.0
+    # via typer
+ruff==0.11.13
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.5.3
+    # via transformers
+scikit-learn==1.7.0
+    # via sentence-transformers
+scipy==1.15.3
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+semantic-version==2.10.0
+    # via gradio
+sentence-transformers==4.1.0
+    # via cvpr2025 (pyproject.toml)
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+spaces==0.37.0
+    # via cvpr2025 (pyproject.toml)
+sse-starlette==2.3.6
+    # via mcp
+starlette==0.46.2
+    # via
+    #   fastapi
+    #   gradio
+    #   mcp
+sympy==1.14.0
+    # via torch
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.21.1
+    # via transformers
+tomlkit==0.13.3
+    # via gradio
+torch==2.4.0
+    # via
+    #   cvpr2025 (pyproject.toml)
+    #   sentence-transformers
+tqdm==4.67.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   sentence-transformers
+    #   transformers
+transformers==4.52.4
+    # via sentence-transformers
+triton==3.0.0
+    # via torch
+typer==0.16.0
+    # via gradio
+typing-extensions==4.14.0
+    # via
+    #   anyio
+    #   exceptiongroup
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   multidict
+    #   pydantic
+    #   pydantic-core
+    #   rich
+    #   sentence-transformers
+    #   spaces
+    #   torch
+    #   typer
+    #   typing-inspection
+    #   uvicorn
+typing-inspection==0.4.1
+    # via
+    #   pydantic
+    #   pydantic-settings
+tzdata==2025.2
+    # via pandas
+urllib3==2.4.0
+    # via requests
+uvicorn==0.34.3
+    # via
+    #   gradio
+    #   mcp
+websockets==15.0.1
+    # via gradio-client
+xxhash==3.5.0
+    # via datasets
+yarl==1.20.0
+    # via aiohttp
search.py
ADDED
@@ -0,0 +1,30 @@
+import datasets
+import numpy as np
+import spaces
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+from table import BASE_REPO_ID
+
+ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+ds.add_faiss_index(column="embedding")
+
+bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+@spaces.GPU(duration=10)
+def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+    prefix = "Represent this sentence for searching relevant passages: "
+    q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+    _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+    ce_inputs = [
+        (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+    ]
+    ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+    sorted_idx = np.argsort(ce_scores)[::-1]
+    return [
+        {"paper_id": retrieved_ds["paper_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+    ]
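`search` is a classic retrieve-then-rerank pipeline: a bi-encoder (with the BGE query prefix) plus a FAISS index narrow the corpus to `candidate_pool_size` candidates, and a cross-encoder then rescores each (query, title + abstract) pair. The rerank stage can be exercised on its own; a standalone sketch with made-up candidate texts:

```python
# Standalone sketch of the rerank stage: the cross-encoder scores
# (query, passage) pairs directly. Candidate texts here are made up.
import numpy as np
from sentence_transformers import CrossEncoder

ce_model = CrossEncoder("BAAI/bge-reranker-base")
query = "open-vocabulary semantic segmentation"
candidates = [
    "Open-vocabulary segmentation with vision-language models.",
    "A survey of graph neural networks for recommendation.",
]
ce_scores = ce_model.predict([(query, text) for text in candidates])
best_first = np.argsort(ce_scores)[::-1]  # highest relevance score first
print([candidates[i] for i in best_first])
```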
style.css
ADDED
@@ -0,0 +1,19 @@
+h1 {
+  text-align: center;
+  display: block;
+}
+
+#abstract-modal .modal-block {
+  position: fixed !important;
+  top: 50% !important;
+  left: 50% !important;
+  transform: translate(-50%, -50%) !important;
+  width: 80vw !important;
+  max-width: 900px !important;
+  margin: 0 !important;
+}
+
+#abstract-modal .modal-block,
+#abstract-modal .modal-block * {
+  font-size: 1.0rem !important;
+}
table.py
ADDED
@@ -0,0 +1,140 @@
+import datasets
+import polars as pl
+from loguru import logger
+from polars import datatypes as pdt
+
+BASE_REPO_ID = "ai-conferences/CVPR2025"
+PATCH_REPO_ID = "ai-conferences/CVPR2025-patches"
+PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+def get_patch_latest_values(
+    df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
+) -> pl.DataFrame:
+    df = df.sort(timestamp_col)
+
+    list_cols = [
+        col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
+    ]
+    df = df.with_columns(
+        [
+            pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
+            for c in list_cols
+        ]
+    )
+
+    update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
+    melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
+
+    latest_rows = (
+        melted.sort(timestamp_col)
+        .group_by([id_col, "variable"])
+        .agg(pl.col("value").last())
+        .pivot("variable", index=id_col, values="value")
+    )
+
+    latest_rows = latest_rows.with_columns(
+        [
+            pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
+            for c in list_cols
+        ]
+    )
+
+    missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
+    if missing_cols:
+        latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
+
+    return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
+
+
+def format_author_claim_ratio(row: dict) -> str:
+    n_linked_authors = row["n_linked_authors"]
+    n_authors = row["n_authors"]
+
+    if n_linked_authors is None or n_authors is None:
+        return ""
+
+    author_linked = "✅" if n_linked_authors > 0 else ""
+    return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+df_orig = (
+    datasets.load_dataset(BASE_REPO_ID, split="train")
+    .to_polars()
+    .rename({"cvf_url": "cvf"})
+    .with_columns(
+        pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
+    )
+)
+
+df_paper_page = (
+    datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+    .to_polars()
+    .drop(["summary", "author_names", "ai_keywords"])
+)
+df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")
+
+try:
+    df_patches = (
+        datasets.load_dataset(PATCH_REPO_ID, split="train")
+        .to_polars()
+        .drop("diff")
+        .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
+    )
+    df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
+    df_orig = (
+        df_orig.join(df_patches, on="paper_id", how="left")
+        .with_columns(
+            [
+                pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col)
+                for col in df_orig.columns
+                if col != "paper_id"
+            ]
+        )
+        .select(df_orig.columns)
+    )
+except Exception as e:  # noqa: BLE001
+    logger.warning(e)
+
+# format authors
+df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+# format links
+df_orig = df_orig.with_columns(
+    [
+        pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
+        for col in ["cvf", "project_page", "github"]
+    ]
+)
+# format paper page link
+df_orig = df_orig.with_columns(
+    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+# count authors
+df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+df_orig = df_orig.with_columns(
+    pl.col("author_usernames")
+    .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+    .alias("n_linked_authors")
+)
+df_orig = df_orig.with_columns(
+    pl.struct(["n_linked_authors", "n_authors"])
+    .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+    .alias("claimed")
+)
+
+# format spaces, models, datasets
+for repo_id_col, markdown_col, base_url in [
+    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+    ("model_ids", "Models", "https://huggingface.co/"),
+    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+]:
+    df_orig = df_orig.with_columns(
+        pl.col(repo_id_col)
+        .map_elements(
+            lambda lst: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+            return_dtype=pl.Utf8,
+        )
+        .fill_null("")
+        .alias(markdown_col)
+    )
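The trickiest piece above is `get_patch_latest_values`: it unpivots the patch table into (timestamp, paper_id, column, value) rows, keeps the most recent non-null value per (paper_id, column), and pivots back, so later patches override earlier ones column by column while leaving untouched cells null. A toy, self-contained illustration of that core idea (made-up rows, pure polars, no dataset downloads):

```python
# Toy illustration (made-up rows) of the patch-overlay idea: unpivot, keep the
# latest non-null value per (paper_id, column), pivot back.
from datetime import datetime

import polars as pl

patches = pl.DataFrame(
    {
        "paper_id": ["p1", "p1"],
        "timestamp": [datetime(2025, 6, 1), datetime(2025, 6, 2)],
        "github": ["https://github.com/org/old", "https://github.com/org/new"],
        "project_page": [None, "https://example.com/p1"],
    }
)
melted = patches.unpivot(on=["github", "project_page"], index=["timestamp", "paper_id"]).drop_nulls()
latest = (
    melted.sort("timestamp")
    .group_by(["paper_id", "variable"])
    .agg(pl.col("value").last())
    .pivot("variable", index="paper_id", values="value")
)
print(latest)  # one row for p1: github -> ".../new", project_page -> "https://example.com/p1"
```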
uv.lock
ADDED
The diff for this file is too large to render. See raw diff.