hysts (HF Staff) committed
Commit 8a9d0f0 · 1 parent: ac6cc26
Files changed (13)
  1. .pre-commit-config.yaml +33 -0
  2. .python-version +1 -0
  3. .vscode/extensions.json +8 -0
  4. .vscode/settings.json +17 -0
  5. README.md +4 -4
  6. app.py +186 -0
  7. app_mcp.py +133 -0
  8. pyproject.toml +58 -0
  9. requirements.txt +335 -0
  10. search.py +30 -0
  11. style.css +19 -0
  12. table.py +140 -0
  13. uv.lock +0 -0
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-toml
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.13
+    hooks:
+      - id: ruff-check
+        args: ["--fix"]
+      - id: ruff-format
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.16.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+        additional_dependencies:
+          [
+            "types-python-slugify",
+            "types-pytz",
+            "types-PyYAML",
+            "types-requests",
+          ]
.python-version ADDED
@@ -0,0 +1 @@
+3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+{
+  "recommendations": [
+    "ms-python.python",
+    "charliermarsh.ruff",
+    "streetsidesoftware.code-spell-checker",
+    "tamasfe.even-better-toml"
+  ]
+}
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+{
+  "editor.formatOnSave": true,
+  "files.insertFinalNewline": false,
+  "[python]": {
+    "editor.defaultFormatter": "charliermarsh.ruff",
+    "editor.formatOnType": true,
+    "editor.codeActionsOnSave": {
+      "source.fixAll.ruff": "explicit",
+      "source.organizeImports": "explicit"
+    }
+  },
+  "[jupyter]": {
+    "files.insertFinalNewline": false
+  },
+  "notebook.output.scrolling": true,
+  "notebook.formatOnSave.enabled": true
+}
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: CVPR2025
-emoji: 🏃
-colorFrom: yellow
-colorTo: indigo
+emoji:
+colorFrom: red
+colorTo: purple
 sdk: gradio
-sdk_version: 5.33.0
+sdk_version: 5.33.1
 app_file: app.py
 pinned: false
 ---
app.py ADDED
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+
+import gradio as gr
+import polars as pl
+
+from app_mcp import demo as demo_mcp
+from search import search
+from table import df_orig
+
+DESCRIPTION = "# CVPR 2025"
+
+# TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed  # noqa: TD002, FIX002
+NOTE = """\
+Note: Sorting by upvotes or comments may not work correctly due to a known bug in Gradio.
+"""
+
+df_main = df_orig.select(
+    "title",
+    "authors_str",
+    "cvf_md",
+    "paper_page_md",
+    "upvotes",
+    "num_comments",
+    "project_page_md",
+    "github_md",
+    "Spaces",
+    "Models",
+    "Datasets",
+    "claimed",
+    "abstract",
+    "paper_id",
+)
+# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
+# format numbers as strings
+df_main = df_main.with_columns(
+    [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+)
+
+df_main = df_main.rename(
+    {
+        "title": "Title",
+        "authors_str": "Authors",
+        "cvf_md": "CVF",
+        "paper_page_md": "Paper page",
+        "upvotes": "👍",
+        "num_comments": "💬",
+        "project_page_md": "Project page",
+        "github_md": "GitHub",
+    }
+)
+
+COLUMN_INFO = {
+    "Title": ("str", "40%"),
+    "Authors": ("str", "20%"),
+    "Paper page": ("markdown", "135px"),
+    "👍": ("number", "50px"),
+    "💬": ("number", "50px"),
+    "CVF": ("markdown", None),
+    "Project page": ("markdown", None),
+    "GitHub": ("markdown", None),
+    "Spaces": ("markdown", None),
+    "Models": ("markdown", None),
+    "Datasets": ("markdown", None),
+    "claimed": ("markdown", None),
+}
+
+
+DEFAULT_COLUMNS = [
+    "Title",
+    "Paper page",
+    "👍",
+    "💬",
+    "CVF",
+    "Project page",
+    "GitHub",
+    "Spaces",
+    "Models",
+    "Datasets",
+]
+
+
+def update_num_papers(df: pl.DataFrame) -> str:
+    if "claimed" in df.columns:
+        return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+    return f"{len(df)} / {len(df_main)}"
+
+
+def update_df(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
+    column_names: list[str],
+) -> gr.Dataframe:
+    if num_results > candidate_pool_size:
+        raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+    df = df_main.clone()
+    column_names = ["Title", *column_names]
+
+    if search_query:
+        results = search(search_query, candidate_pool_size, num_results)
+        if not results:
+            df = df.head(0)
+        else:
+            df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+            df = df.sort("ce_score", descending=True).drop("ce_score")
+
+    sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+    df = df.select(sorted_column_names)
+    return gr.Dataframe(
+        value=df,
+        datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+        column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+    )
+
+
+with gr.Blocks(css_paths="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+    with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+        with gr.Row():
+            candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+            num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+
+    column_names = gr.CheckboxGroup(
+        label="Columns",
+        choices=[col for col in COLUMN_INFO if col != "Title"],
+        value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+    )
+
+    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+    gr.Markdown(NOTE)
+    df = gr.Dataframe(
+        value=df_main,
+        datatype=[COLUMN_INFO[col][0] for col in COLUMN_INFO],
+        type="polars",
+        row_count=(0, "dynamic"),
+        show_row_numbers=True,
+        interactive=False,
+        max_height=1000,
+        elem_id="table",
+        column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+    )
+
+    inputs = [
+        search_query,
+        candidate_pool_size,
+        num_results,
+        column_names,
+    ]
+    gr.on(
+        triggers=[
+            search_query.submit,
+            column_names.input,
+        ],
+        fn=update_df,
+        inputs=inputs,
+        outputs=df,
+        api_name=False,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+        api_name=False,
+    )
+    demo.load(
+        fn=update_df,
+        inputs=inputs,
+        outputs=df,
+        api_name=False,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+        api_name=False,
+    )
+
+    with gr.Row(visible=False):
+        demo_mcp.render()
+
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
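A side note on `update_df`: the displayed column order always follows the key order of `COLUMN_INFO`, not the order in which the checkboxes were ticked. A minimal, self-contained sketch of that behaviour follows; the trimmed `COLUMN_INFO` copy and the `checked` list are made-up examples, not values from this commit.

```python
# Illustration only: how update_df derives the final column order.
# This trimmed COLUMN_INFO and the `checked` list are hypothetical examples.
COLUMN_INFO = {
    "Title": ("str", "40%"),
    "Paper page": ("markdown", "135px"),
    "👍": ("number", "50px"),
    "GitHub": ("markdown", None),
}

checked = ["GitHub", "👍", "Paper page"]  # order the user happened to click
column_names = ["Title", *checked]
sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
print(sorted_column_names)  # -> ['Title', 'Paper page', '👍', 'GitHub']
```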
app_mcp.py ADDED
@@ -0,0 +1,133 @@
+import gradio as gr
+import polars as pl
+
+from search import search
+from table import df_orig
+
+COLUMNS_MCP = [
+    "title",
+    "authors",
+    "abstract",
+    "cvf_page_url",
+    "pdf_url",
+    "supp_url",
+    "arxiv_id",
+    "paper_page",
+    "bibtex",
+    "space_ids",
+    "model_ids",
+    "dataset_ids",
+    "upvotes",
+    "num_comments",
+    "project_page",
+    "github",
+    "row_index",
+]
+DEFAULT_COLUMNS_MCP = [
+    "title",
+    "authors",
+    "abstract",
+    "cvf_page_url",
+    "pdf_url",
+    "arxiv_id",
+    "project_page",
+    "github",
+    "row_index",
+]
+
+df_mcp = df_orig.rename({"cvf": "cvf_page_url", "paper_id": "row_index"}).select(COLUMNS_MCP)
+
+
+def search_papers(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
+    columns: list[str],
+) -> list[dict]:
+    """Searches CVPR 2025 papers relevant to a user query in English.
+
+    This function performs a semantic search over CVPR 2025 papers.
+    It uses a dual-stage retrieval process:
+    - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+    - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+    - The search results are returned as a list of dictionaries.
+
+    Note:
+        The search query must be written in English. Queries in other languages are not supported.
+
+    Args:
+        search_query (str): The natural language query input by the user. Must be in English.
+        candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+        num_results (int): Final number of top-ranked papers to return after re-ranking.
+        columns (list[str]): The columns to select from the DataFrame.
+
+    Returns:
+        list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+    """
+    if not search_query:
+        raise ValueError("Search query cannot be empty")
+    if num_results > candidate_pool_size:
+        raise ValueError("Number of results must be less than or equal to candidate pool size")
+
+    df = df_mcp.clone()
+    results = search(search_query, candidate_pool_size, num_results)
+    df = pl.DataFrame(results).rename({"paper_id": "row_index"}).join(df, on="row_index", how="inner")
+    df = df.sort("ce_score", descending=True)
+    return df.select(columns).to_dicts()
+
+
+def get_metadata(row_index: int) -> dict:
+    """Returns a dictionary of metadata for a CVPR 2025 paper at the given table row index.
+
+    Args:
+        row_index (int): The index of the paper in the internal paper list table.
+
+    Returns:
+        dict: A dictionary containing metadata for the corresponding paper.
+    """
+    return df_mcp.filter(pl.col("row_index") == row_index).to_dicts()[0]
+
+
+def get_table(columns: list[str]) -> list[dict]:
+    """Returns a list of dictionaries of all CVPR 2025 papers.
+
+    Args:
+        columns (list[str]): The columns to select from the DataFrame.
+
+    Returns:
+        list[dict]: A list of dictionaries of all CVPR 2025 papers.
+    """
+    return df_mcp.select(columns).to_dicts()
+
+
+with gr.Blocks() as demo:
+    search_query = gr.Textbox(label="Search", submit_btn=True)
+    candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+    num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+    column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+    row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+
+    out = gr.JSON()
+
+    search_papers_btn = gr.Button("Search Papers")
+    get_metadata_btn = gr.Button("Get Metadata")
+    get_table_btn = gr.Button("Get Table")
+
+    search_papers_btn.click(
+        fn=search_papers,
+        inputs=[search_query, candidate_pool_size, num_results, column_names],
+        outputs=out,
+    )
+    get_metadata_btn.click(
+        fn=get_metadata,
+        inputs=row_index,
+        outputs=out,
+    )
+    get_table_btn.click(
+        fn=get_table,
+        inputs=column_names,
+        outputs=out,
+    )
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
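The three functions above double as MCP tools, and a quick way to sanity-check them is to call them directly from Python. The query string and numbers below are arbitrary examples, not values from this commit, and importing `app_mcp` loads the datasets, so the first call is slow.

```python
# Hypothetical local smoke test for the MCP tool functions (not part of this commit).
from app_mcp import DEFAULT_COLUMNS_MCP, get_metadata, search_papers

# Top 5 papers for an example query, restricted to the default columns.
papers = search_papers(
    "video diffusion models",
    candidate_pool_size=100,
    num_results=5,
    columns=DEFAULT_COLUMNS_MCP,
)
print([p["title"] for p in papers])

# Full metadata for the paper stored at table row 0.
print(get_metadata(0))
```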
pyproject.toml ADDED
@@ -0,0 +1,58 @@
+[project]
+name = "cvpr2025"
+version = "0.1.0"
+description = ""
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "datasets>=3.6.0",
+    "faiss-cpu>=1.11.0",
+    "gradio[mcp]>=5.33.1",
+    "hf-transfer>=0.1.9",
+    "loguru>=0.7.3",
+    "polars>=1.30.0",
+    "sentence-transformers>=4.1.0",
+    "spaces>=0.37.0",
+    "torch==2.4.0",
+]
+
+[tool.ruff]
+line-length = 119
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    "COM812", # missing-trailing-comma
+    "D203", # one-blank-line-before-class
+    "D213", # multi-line-summary-second-line
+    "E501", # line-too-long
+    "SIM117", # multiple-with-statements
+    #
+    "D100", # undocumented-public-module
+    "D101", # undocumented-public-class
+    "D102", # undocumented-public-method
+    "D103", # undocumented-public-function
+    "D104", # undocumented-public-package
+    "D105", # undocumented-magic-method
+    "D107", # undocumented-public-init
+    "EM101", # raw-string-in-exception
+    "FBT001", # boolean-type-hint-positional-argument
+    "FBT002", # boolean-default-value-positional-argument
+    "PD901", # pandas-df-variable-name
+    "PGH003", # blanket-type-ignore
+    "PLR0913", # too-many-arguments
+    "PLR0915", # too-many-statements
+    "TRY003", # raise-vanilla-args
+]
+unfixable = [
+    "F401", # unused-import
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"*.ipynb" = ["T201", "T203"]
+
+[tool.ruff.format]
+docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,335 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.12.11
+    # via fsspec
+aiosignal==1.3.2
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.9.0
+    # via
+    #   gradio
+    #   httpx
+    #   mcp
+    #   sse-starlette
+    #   starlette
+async-timeout==5.0.1
+    # via aiohttp
+attrs==25.3.0
+    # via aiohttp
+certifi==2025.4.26
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.2
+    # via requests
+click==8.2.1
+    # via
+    #   typer
+    #   uvicorn
+datasets==3.6.0
+    # via cvpr2025 (pyproject.toml)
+dill==0.3.8
+    # via
+    #   datasets
+    #   multiprocess
+exceptiongroup==1.3.0
+    # via anyio
+faiss-cpu==1.11.0
+    # via cvpr2025 (pyproject.toml)
+fastapi==0.115.12
+    # via gradio
+ffmpy==0.6.0
+    # via gradio
+filelock==3.18.0
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+frozenlist==1.6.2
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.3.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==5.33.1
+    # via
+    #   cvpr2025 (pyproject.toml)
+    #   spaces
+gradio-client==1.10.3
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-transfer==0.1.9
+    # via cvpr2025 (pyproject.toml)
+hf-xet==1.1.3
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   mcp
+    #   safehttpx
+    #   spaces
+httpx-sse==0.4.0
+    # via mcp
+huggingface-hub==0.32.4
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+joblib==1.5.1
+    # via scikit-learn
+loguru==0.7.3
+    # via cvpr2025 (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==3.0.2
+    # via
+    #   gradio
+    #   jinja2
+mcp==1.9.3
+    # via gradio
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+multidict==6.4.4
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via datasets
+networkx==3.4.2
+    # via torch
+numpy==2.2.6
+    # via
+    #   datasets
+    #   faiss-cpu
+    #   gradio
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.9.86
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
+orjson==3.10.18
+    # via gradio
+packaging==25.0
+    # via
+    #   datasets
+    #   faiss-cpu
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+pandas==2.3.0
+    # via
+    #   datasets
+    #   gradio
+pillow==11.2.1
+    # via
+    #   gradio
+    #   sentence-transformers
+polars==1.30.0
+    # via cvpr2025 (pyproject.toml)
+propcache==0.3.1
+    # via
+    #   aiohttp
+    #   yarl
+psutil==5.9.8
+    # via spaces
+pyarrow==20.0.0
+    # via datasets
+pydantic==2.11.5
+    # via
+    #   fastapi
+    #   gradio
+    #   mcp
+    #   pydantic-settings
+    #   spaces
+pydantic-core==2.33.2
+    # via pydantic
+pydantic-settings==2.9.1
+    # via mcp
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-dotenv==1.1.0
+    # via pydantic-settings
+python-multipart==0.0.20
+    # via
+    #   gradio
+    #   mcp
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+rich==14.0.0
+    # via typer
+ruff==0.11.13
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.5.3
+    # via transformers
+scikit-learn==1.7.0
+    # via sentence-transformers
+scipy==1.15.3
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+semantic-version==2.10.0
+    # via gradio
+sentence-transformers==4.1.0
+    # via cvpr2025 (pyproject.toml)
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+spaces==0.37.0
+    # via cvpr2025 (pyproject.toml)
+sse-starlette==2.3.6
+    # via mcp
+starlette==0.46.2
+    # via
+    #   fastapi
+    #   gradio
+    #   mcp
+sympy==1.14.0
+    # via torch
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.21.1
+    # via transformers
+tomlkit==0.13.3
+    # via gradio
+torch==2.4.0
+    # via
+    #   cvpr2025 (pyproject.toml)
+    #   sentence-transformers
+tqdm==4.67.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   sentence-transformers
+    #   transformers
+transformers==4.52.4
+    # via sentence-transformers
+triton==3.0.0
+    # via torch
+typer==0.16.0
+    # via gradio
+typing-extensions==4.14.0
+    # via
+    #   anyio
+    #   exceptiongroup
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   multidict
+    #   pydantic
+    #   pydantic-core
+    #   rich
+    #   sentence-transformers
+    #   spaces
+    #   torch
+    #   typer
+    #   typing-inspection
+    #   uvicorn
+typing-inspection==0.4.1
+    # via
+    #   pydantic
+    #   pydantic-settings
+tzdata==2025.2
+    # via pandas
+urllib3==2.4.0
+    # via requests
+uvicorn==0.34.3
+    # via
+    #   gradio
+    #   mcp
+websockets==15.0.1
+    # via gradio-client
+xxhash==3.5.0
+    # via datasets
+yarl==1.20.0
+    # via aiohttp
search.py ADDED
@@ -0,0 +1,30 @@
+import datasets
+import numpy as np
+import spaces
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+from table import BASE_REPO_ID
+
+ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+ds.add_faiss_index(column="embedding")
+
+bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+@spaces.GPU(duration=10)
+def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+    prefix = "Represent this sentence for searching relevant passages: "
+    q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+    _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+    ce_inputs = [
+        (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+    ]
+    ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+    sorted_idx = np.argsort(ce_scores)[::-1]
+    return [
+        {"paper_id": retrieved_ds["paper_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+    ]
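`search()` is the two-stage retrieval used by both apps: the bge-base-en-v1.5 bi-encoder narrows the corpus through the FAISS index, then the bge-reranker-base cross-encoder re-scores the candidates. A hypothetical call would look like the sketch below; the query string and sizes are made-up examples, not values from this commit.

```python
# Hypothetical usage of search() from search.py (not part of this commit).
from search import search

# Pull 200 candidates by dense similarity, keep the 10 best after cross-encoder re-ranking.
hits = search("3D reconstruction from a single image", candidate_pool_size=200, retrieval_k=10)
for hit in hits:
    print(hit["paper_id"], round(hit["ce_score"], 3))
```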
style.css ADDED
@@ -0,0 +1,19 @@
+h1 {
+  text-align: center;
+  display: block;
+}
+
+#abstract-modal .modal-block {
+  position: fixed !important;
+  top: 50% !important;
+  left: 50% !important;
+  transform: translate(-50%, -50%) !important;
+  width: 80vw !important;
+  max-width: 900px !important;
+  margin: 0 !important;
+}
+
+#abstract-modal .modal-block,
+#abstract-modal .modal-block * {
+  font-size: 1.0rem !important;
+}
table.py ADDED
@@ -0,0 +1,140 @@
+import datasets
+import polars as pl
+from loguru import logger
+from polars import datatypes as pdt
+
+BASE_REPO_ID = "ai-conferences/CVPR2025"
+PATCH_REPO_ID = "ai-conferences/CVPR2025-patches"
+PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+def get_patch_latest_values(
+    df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
+) -> pl.DataFrame:
+    df = df.sort(timestamp_col)
+
+    list_cols = [
+        col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
+    ]
+    df = df.with_columns(
+        [
+            pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
+            for c in list_cols
+        ]
+    )
+
+    update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
+    melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
+
+    latest_rows = (
+        melted.sort(timestamp_col)
+        .group_by([id_col, "variable"])
+        .agg(pl.col("value").last())
+        .pivot("variable", index=id_col, values="value")
+    )
+
+    latest_rows = latest_rows.with_columns(
+        [
+            pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
+            for c in list_cols
+        ]
+    )
+
+    missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
+    if missing_cols:
+        latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
+
+    return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
+
+
+def format_author_claim_ratio(row: dict) -> str:
+    n_linked_authors = row["n_linked_authors"]
+    n_authors = row["n_authors"]
+
+    if n_linked_authors is None or n_authors is None:
+        return ""
+
+    author_linked = "✅" if n_linked_authors > 0 else ""
+    return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+df_orig = (
+    datasets.load_dataset(BASE_REPO_ID, split="train")
+    .to_polars()
+    .rename({"cvf_url": "cvf"})
+    .with_columns(
+        pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
+    )
+)
+
+df_paper_page = (
+    datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+    .to_polars()
+    .drop(["summary", "author_names", "ai_keywords"])
+)
+df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")
+
+try:
+    df_patches = (
+        datasets.load_dataset(PATCH_REPO_ID, split="train")
+        .to_polars()
+        .drop("diff")
+        .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
+    )
+    df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
+    df_orig = (
+        df_orig.join(df_patches, on="paper_id", how="left")
+        .with_columns(
+            [
+                pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col)
+                for col in df_orig.columns
+                if col != "paper_id"
+            ]
+        )
+        .select(df_orig.columns)
+    )
+except Exception as e:  # noqa: BLE001
+    logger.warning(e)
+
+# format authors
+df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+# format links
+df_orig = df_orig.with_columns(
+    [
+        pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
+        for col in ["cvf", "project_page", "github"]
+    ]
+)
+# format paper page link
+df_orig = df_orig.with_columns(
+    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+# count authors
+df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+df_orig = df_orig.with_columns(
+    pl.col("author_usernames")
+    .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+    .alias("n_linked_authors")
+)
+df_orig = df_orig.with_columns(
+    pl.struct(["n_linked_authors", "n_authors"])
+    .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+    .alias("claimed")
+)
+
+# format spaces, models, datasets
+for repo_id_col, markdown_col, base_url in [
+    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+    ("model_ids", "Models", "https://huggingface.co/"),
+    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+]:
+    df_orig = df_orig.with_columns(
+        pl.col(repo_id_col)
+        .map_elements(
+            lambda lst: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+            return_dtype=pl.Utf8,
+        )
+        .fill_null("")
+        .alias(markdown_col)
+    )
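`get_patch_latest_values` merges the patch dataset with a "latest non-null value wins" rule per `paper_id` and column. A small self-contained sketch with made-up patch rows shows the behaviour; the data below is hypothetical, not from the commit, and importing `table` also triggers the dataset downloads.

```python
# Hypothetical illustration of the patch-merging rule (not part of this commit).
from datetime import datetime

import polars as pl

from table import get_patch_latest_values

patches = pl.DataFrame(
    {
        "paper_id": [1, 1, 2],
        "timestamp": [datetime(2025, 6, 1), datetime(2025, 6, 2), datetime(2025, 6, 1)],
        "github": ["https://github.com/a/old", "https://github.com/a/new", None],
        "project_page": [None, None, "https://example.com/p2"],
    }
)
# Paper 1 keeps the newer (June 2) github link; paper 2 only gains a project_page.
print(get_patch_latest_values(patches, ["paper_id", "github", "project_page"], id_col="paper_id"))
```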
uv.lock ADDED
The diff for this file is too large to render. See raw diff