rishitdagli committed
Commit f9bf356 · 1 Parent(s): 49c967c

leaderboard

.gitignore ADDED
@@ -0,0 +1,13 @@
+ auto_evals/
+ venv/
+ __pycache__/
+ .env
+ .ipynb_checkpoints
+ *ipynb
+ .vscode/
+
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ default_language_version:
+   python: python3
+
+ ci:
+   autofix_prs: true
+   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+   autoupdate_schedule: quarterly
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: check-yaml
+       - id: check-case-conflict
+       - id: detect-private-key
+       - id: check-added-large-files
+         args: ['--maxkb=1000']
+       - id: requirements-txt-fixer
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+
+   - repo: https://github.com/PyCQA/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         name: Format imports
+
+   - repo: https://github.com/psf/black
+     rev: 22.12.0
+     hooks:
+       - id: black
+         name: Format code
+         additional_dependencies: ['click==8.0.2']
+
+   - repo: https://github.com/charliermarsh/ruff-pre-commit
+     # Ruff version.
+     rev: 'v0.0.267'
+     hooks:
+       - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
+ .PHONY: style quality
+
+
+ style:
+ 	python -m black --line-length 119 .
+ 	python -m isort .
+ 	ruff check --fix .
+
+
+ quality:
+ 	python -m black --check --line-length 119 .
+ 	python -m isort --check-only .
+ 	ruff check .
app.py ADDED
@@ -0,0 +1,187 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from huggingface_hub import HfApi, create_repo
+ from datasets import Dataset, load_dataset
+ import os
+ import html
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not found")
+ api = HfApi(token=HF_TOKEN)
+ DATASET_NAME = "airletters-leaderboard-results"
+
+
+ INITIAL_DATA = {
+     "name": [
+         "ViT-B/16", "MaxViT-T", "ResNet-200", "ResNeXt-101", "SE-ResNeXt-26",
+         "ResNet-50", "VideoMAE (16)", "ResNet-101 + LSTM", "ResNet-50 + LSTM",
+         "ResNext-152 3D", "Strided Inflated EfficientNet 3D", "ResNext-50 3D",
+         "ResNext-101 3D", "ResNext-200 3D", "Video-LLaVA (w/o contrast class)",
+         "VideoLLaMA2 (w/o contrast class)", "Video-LLaVA", "VideoLLaMA2",
+         "Human Performance (10 videos/class)"
+     ],
+     "url": ["https://arxiv.org/abs/2410.02921"] * 19,
+     "top1_accuracy": [
+         7.49, 7.56, 11.44, 13.09, 13.29, 13.87, 57.96, 58.45, 63.24,
+         65.77, 65.97, 66.54, 69.74, 71.20, 2.53, 2.47, 7.29, 7.58, 96.67
+     ],
+     "organization": ["AirLetters Authors"] * 19,
+     "model_type": [
+         "Image", "Image", "Image", "Image", "Image",
+         "Image", "Video", "Video", "Video",
+         "Video", "Video", "Video", "Video", "Video",
+         "Vision Language", "Vision Language", "Vision Language", "Vision Language",
+         "Human Evaluation"
+     ]
+ }
+
+ def initialize_dataset():
+     try:
+         dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN, download_mode="force_redownload")
+         df = dataset.to_pandas()
+         if 'model_url' in df.columns:
+             # Migrate the old schema: fold the separate model_url column into a clickable name.
+             df["name"] = df.apply(lambda row: model_hyperlink(row["model_url"], row["name"]), axis=1)
+             df = df.drop('model_url', axis=1)
+             dataset = Dataset.from_pandas(df)
+             dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+     except Exception as e:
+         print(f"Creating new dataset due to: {str(e)}")
+         df = pd.DataFrame(INITIAL_DATA)
+         dataset = Dataset.from_pandas(df)
+         try:
+             create_repo(DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
+         except Exception as e:
+             print(f"Repo might already exist: {str(e)}")
+         dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+     return dataset
+
+ def calculate_accuracy(test_file, submitted_file):
+     test = pd.read_csv(test_file)
+     test2 = pd.read_csv(submitted_file)
+     test.columns = test.columns.str.strip()
+     test2.columns = test2.columns.str.strip()
+     merged = pd.merge(test, test2, on="filename", how="left", suffixes=("_true", "_pred"))
+     merged["label_pred"] = merged["label_pred"].fillna("")
+     correct_predictions = (merged["label_true"] == merged["label_pred"]).sum()
+     total_entries = len(merged)
+     return (correct_predictions / total_entries) * 100
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+ def update_leaderboard(name, organization, model_type, model_url, csv_file):
+     top1_acc = calculate_accuracy("test.csv", csv_file)
+
+     dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN)
+     df = dataset.to_pandas()
+
+     new_row = {
+         'name': name,
+         'url': model_url,
+         'organization': organization,
+         'model_type': model_type,
+         'top1_accuracy': top1_acc,
+     }
+
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+     df = df.sort_values('top1_accuracy', ascending=False)
+     df = df.reset_index(drop=True)
+
+     new_dataset = Dataset.from_pandas(df)
+     new_dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+
+     return "Successfully updated leaderboard"
+
+ def init_leaderboard(dataframe):
+     return Leaderboard(
+         value=dataframe,
+         datatype=["markdown", "str", "str", "number"],
+         select_columns=SelectColumns(
+             default_selection=["name", "organization", "model_type", "top1_accuracy"],
+             cant_deselect=["name", "top1_accuracy"],
+             label="Select Columns to Display",
+         ),
+         search_columns=["name", "organization"],
+         filter_columns=[
+             ColumnFilter("model_type", type="checkboxgroup", label="Model Type"),
+         ],
+     )
+
+ def refresh():
+     dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN, download_mode="force_redownload")
+     dataset = dataset.map(lambda row: {"name": model_hyperlink(row["url"], row["name"])})
+     df = dataset.to_pandas()
+     return df
+
+ def create_interface():
+     demo = gr.Blocks(css=custom_css)
+
+     with demo:
+         gr.HTML(TITLE)
+         gr.Video("30fps.mp4", autoplay=True, width=900, loop=True, include_audio=False)
+
+         with gr.Tabs(elem_classes="tab-buttons") as tabs:
+             with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab"):
+                 dataset = initialize_dataset()
+                 dataset = dataset.map(lambda row: {"name": model_hyperlink(row["url"], row["name"])})
+                 df = dataset.to_pandas()
+                 leaderboard = init_leaderboard(df)
+
+                 refresh_button = gr.Button("Refresh")
+                 refresh_button.click(
+                     refresh,
+                     inputs=[],
+                     outputs=[
+                         leaderboard,
+                     ],
+                 )
+
+             with gr.TabItem("📝 About", elem_id="about-tab"):
+                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+             with gr.TabItem("🚀 Submit", elem_id="submit-tab"):
+                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     name = gr.Textbox(label="Model Name")
+                     model_url = gr.Textbox(label="Model URL", placeholder="https://huggingface.co/...")
+                     org = gr.Textbox(label="Organization")
+                     model_type = gr.Dropdown(
+                         choices=["Image", "Video", "Vision Language", "Tracking", "Other"],
+                         label="Model Type"
+                     )
+                     csv_file = gr.File(label="Results CSV", type="filepath")
+                     submit_btn = gr.Button("Submit")
+                     result = gr.Textbox(label="Result")
+
+                     submit_btn.click(
+                         update_leaderboard,
+                         inputs=[name, org, model_type, model_url, csv_file],
+                         outputs=[result]
+                     )
+         with gr.Row():
+             with gr.Accordion("📙 Citation", open=False):
+                 citation_button = gr.Textbox(
+                     value=CITATION_BUTTON_TEXT,
+                     label=CITATION_BUTTON_LABEL,
+                     lines=7,
+                     elem_id="citation-button",
+                     show_copy_button=True,
+                 )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.queue().launch()
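
A minimal sketch of how a submission is scored by `calculate_accuracy` in app.py above: both the leaderboard's ground-truth `test.csv` and the uploaded file need `filename` and `label` columns, the two are left-merged on `filename`, and any ground-truth row without a matching prediction counts as incorrect. The filenames and labels below are made up for illustration.

```python
# Toy illustration of the scoring logic in calculate_accuracy (app.py); not real AirLetters data.
import io

import pandas as pd

ground_truth = io.StringIO("filename,label\na.mp4,A\nb.mp4,B\nc.mp4,C\n")
submission = io.StringIO("filename,label\na.mp4,A\nb.mp4,D\n")  # c.mp4 missing from the submission

test = pd.read_csv(ground_truth)
pred = pd.read_csv(submission)
merged = pd.merge(test, pred, on="filename", how="left", suffixes=("_true", "_pred"))
merged["label_pred"] = merged["label_pred"].fillna("")  # unmatched rows are scored as wrong

top1 = (merged["label_true"] == merged["label_pred"]).mean() * 100
print(f"Top-1 accuracy: {top1:.2f}%")  # 33.33% for this toy example
```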
emoji/envelope.png ADDED
emoji/wind.png ADDED
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+ [tool.ruff]
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+ select = ["E", "F"]
+ ignore = ["E501"] # line too long (black is taking care of this)
+ line-length = 119
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+
+ [tool.isort]
+ profile = "black"
+ line_length = 119
+
+ [tool.black]
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ APScheduler
+ black
+ datasets
+ gradio
+ gradio[oauth]
+ gradio_leaderboard==0.0.13
+ gradio_client
+ huggingface-hub>=0.18.0
+ matplotlib
+ numpy
+ pandas
+ python-dateutil
+ tqdm
+ transformers
+ tokenizers>=0.15.0
+ sentencepiece
src/about.py ADDED
@@ -0,0 +1,31 @@
+ TITLE = """<h1 align="center" id="space-title">Unofficial AirLetters Leaderboard</h1>"""
+
+ INTRODUCTION_TEXT = """
+ We introduce a new real-world dataset of human-generated articulated motions: labelled videos of people drawing Latin characters in the air. Unlike existing video datasets, accurate video understanding on our dataset requires a detailed understanding of motion and the integration of long-range information across the entire video. We show that existing image and video understanding models perform poorly and fall far behind the human baseline.
+
+ Unlike many video datasets, which are overly dependent on a single key frame or on just 2-4 key frames, AirLetters requires strong temporal capabilities. Our study shows that, while the task is trivial for humans, accurate representation of complex articulated motions remains an open problem for end-to-end learning.
+
+ See:
+
+ - [The Paper](https://arxiv.org/abs/2410.02921)
+ - [Dataset Download Link](https://www.qualcomm.com/developer/software/airletters-dataset)
+ """
+
+ LLM_BENCHMARKS_TEXT = INTRODUCTION_TEXT
+
+ EVALUATION_QUEUE_TEXT = """
+ 1. Prepare your results in CSV format with the columns: filename and label
+ 2. Fill in your model details and URL
+ 3. Upload your results file
+ 4. Your model will be evaluated against the test set and added to the leaderboard automatically.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""
+ @inproceedings{dagliairletters,
+   title={AirLetters: An Open Video Dataset of Characters Drawn in the Air},
+   author={Dagli, Rishit and Berger, Guillaume and Materzynska, Joanna and Bax, Ingo and Memisevic, Roland},
+   booktitle={European Conference on Computer Vision Workshops},
+   year={2024}
+ }
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
+ custom_css = """
+
+ .markdown-text {
+     font-size: 16px !important;
+ }
+
+ #models-to-add-text {
+     font-size: 18px !important;
+ }
+
+ #citation-button span {
+     font-size: 16px !important;
+ }
+
+ #citation-button textarea {
+     font-size: 16px !important;
+ }
+
+ #citation-button > label > button {
+     margin: 6px;
+     transform: scale(1.3);
+ }
+
+ #leaderboard-table {
+     margin-top: 15px
+ }
+
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+
+ #search-bar-table-box > div:first-child {
+     background: none;
+     border: none;
+ }
+
+ #search-bar {
+     padding: 0px;
+ }
+
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ #leaderboard-table td:nth-child(2),
+ #leaderboard-table th:nth-child(2) {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+
+ .tab-buttons button {
+     font-size: 20px;
+ }
+
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+
+ #scale-logo .download {
+     display: none;
+ }
+ #filter_type {
+     border: 0;
+     padding-left: 0;
+     padding-top: 0;
+ }
+ #filter_type label {
+     display: flex;
+ }
+ #filter_type label > span {
+     margin-top: var(--spacing-lg);
+     margin-right: 0.5em;
+ }
+ #filter_type label > .wrap {
+     width: 103px;
+ }
+ #filter_type label > .wrap .wrap-inner {
+     padding: 2px;
+ }
+ #filter_type label > .wrap .wrap-inner input {
+     width: 1px
+ }
+ #filter-columns-type {
+     border: 0;
+     padding: 0.5;
+ }
+ #filter-columns-size {
+     border: 0;
+     padding: 0.5;
+ }
+ #box-filter > .form {
+     border: 0
+ }
+ """
+
+ get_window_url_params = """
+ function(url_params) {
+     const params = new URLSearchParams(window.location.search);
+     url_params = Object.fromEntries(params);
+     return url_params;
+ }
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def make_clickable_model(model_name):
+     link = f"https://huggingface.co/{model_name}"
+     return model_hyperlink(link, model_name)
+
+
+ def styled_error(error):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+ def styled_warning(warn):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+ def styled_message(message):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+ def has_no_nan_values(df, columns):
+     return df[columns].notna().all(axis=1)
+
+
+ def has_nan_values(df, columns):
+     return df[columns].isna().any(axis=1)
teaser.py ADDED
@@ -0,0 +1,250 @@
+ import os
+ import cv2
+ import numpy as np
+ from moviepy.editor import VideoFileClip, CompositeVideoClip, ColorClip
+ from tqdm import tqdm
+ import glob
+ import concurrent.futures
+ import time
+ import random
+
+ def create_text_overlay(text, subtitle, width, height, start_time, duration):
+     overlay = np.zeros((height, width, 4), dtype=np.uint8)
+
+     box_width = int(width * 0.75)
+     box_x_start = (width - box_width) // 2
+
+     cv2.rectangle(overlay,
+                   (box_x_start, height//3),
+                   (box_x_start + box_width, 2*height//3),
+                   (0,0,0,180), -1)
+
+     font = cv2.FONT_HERSHEY_DUPLEX
+
+     if "AirLetters" in text:
+         title_scale = 3.0
+         subtitle_scale = 1.5
+
+         envelope_emoji = cv2.imread("emoji/envelope.png", cv2.IMREAD_UNCHANGED)
+         wind_emoji = cv2.imread("emoji/wind.png", cv2.IMREAD_UNCHANGED)
+
+         target_height = int(title_scale * 30)
+         envelope_aspect = envelope_emoji.shape[1] / envelope_emoji.shape[0]
+         wind_aspect = wind_emoji.shape[1] / wind_emoji.shape[0]
+
+         envelope_emoji = cv2.resize(envelope_emoji,
+                                     (int(target_height * envelope_aspect), target_height))
+         wind_emoji = cv2.resize(wind_emoji,
+                                 (int(target_height * wind_aspect), target_height))
+     else:
+         title_scale = 2.0
+         subtitle_scale = 1.0
+
+     title_color = (138, 223, 178, 255)
+     subtitle_color = (255, 255, 255, 255)
+
+     title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
+
+     # Center text within the box
+     title_x = box_x_start + (box_width - title_size[0]) // 2
+     title_y = height // 2
+
+     if "AirLetters" in text:
+         emoji_y = title_y - target_height + 5
+         envelope_x = title_x - envelope_emoji.shape[1] - 20
+         wind_x = title_x + title_size[0] + 20
+
+         def overlay_image_with_alpha(background, foreground, x, y):
+             if x >= background.shape[1] or y >= background.shape[0]:
+                 return
+
+             h, w = foreground.shape[:2]
+             if len(foreground.shape) == 2:
+                 alpha = foreground
+                 foreground = cv2.cvtColor(foreground, cv2.COLOR_GRAY2BGR)
+             else:
+                 alpha = foreground[:, :, 3] / 255.0
+                 foreground = foreground[:, :, :3]
+
+             y1, y2 = max(0, y), min(background.shape[0], y + h)
+             x1, x2 = max(0, x), min(background.shape[1], x + w)
+
+             alpha_slice = alpha[y1-y:y2-y, x1-x:x2-x]
+             alpha_expanded = np.expand_dims(alpha_slice, axis=-1)
+
+             background_slice = background[y1:y2, x1:x2, :3]
+             foreground_slice = foreground[y1-y:y2-y, x1-x:x2-x]
+
+             background[y1:y2, x1:x2, :3] = background_slice * (1 - alpha_expanded) + foreground_slice * alpha_expanded
+
+             background[y1:y2, x1:x2, 3] = background[y1:y2, x1:x2, 3] * (1 - alpha_slice) + 255 * alpha_slice
+
+         overlay_image_with_alpha(overlay, envelope_emoji, envelope_x, emoji_y)
+         overlay_image_with_alpha(overlay, wind_emoji, wind_x, emoji_y)
+     else:
+         if len(subtitle) > 50:
+             words = subtitle.split()
+             mid = len(words) // 2
+             subtitle = " ".join(words[:mid]) + "\n" + " ".join(words[mid:])
+
+     title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
+
+     title_x = box_x_start + (box_width - title_size[0]) // 2
+     title_y = height // 2
+
+     cv2.putText(overlay, text, (title_x, title_y), font, title_scale, title_color, 2)
+
+     if "\n" in subtitle:
+         subtitle_lines = subtitle.split("\n")
+         subtitle_y = title_y + 50
+         for line in subtitle_lines:
+             subtitle_size = cv2.getTextSize(line, font, subtitle_scale, 2)[0]
+             subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
+             cv2.putText(overlay, line, (subtitle_x, subtitle_y), font, subtitle_scale, subtitle_color, 2)
+             subtitle_y += 50
+     else:
+         subtitle_size = cv2.getTextSize(subtitle, font, subtitle_scale, 2)[0]
+         subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
+         cv2.putText(overlay, subtitle, (subtitle_x, title_y + 60), font, subtitle_scale, subtitle_color, 2)
+
+     overlay_clip = ColorClip(size=(width, height), color=[0,0,0,0])
+     overlay_clip.mask = ColorClip(size=(width, height), color=[1,1,1,1])
+     overlay_clip.mask.get_frame = lambda t: overlay[:,:,3:4] / 255.0
+     overlay_clip.get_frame = lambda t: overlay[:,:,:3]
+
+     overlay_clip = overlay_clip.set_start(start_time)
+     overlay_clip = overlay_clip.set_duration(duration)
+     overlay_clip = overlay_clip.fadein(0.5).fadeout(0.5)
+
+     return overlay_clip
+
+ def load_video(args):
+     video_path, target_size, padding, idx, grid_width = args
+     try:
+         clip = VideoFileClip(video_path, audio=False)
+
+         clip = clip.resize(height=target_size)
+         clip = clip.crop(x1=(clip.w - target_size)//2, x2=(clip.w + target_size)//2) if clip.w > target_size else clip
+         clip = clip.loop()
+
+         bg = ColorClip(size=(target_size + padding*2, target_size + padding*2), color=(255,255,255))
+         clip = clip.set_position((padding, padding))
+         clip = CompositeVideoClip([bg, clip])
+
+         x = (idx % grid_width) * (target_size + padding*2)
+         y = (idx // grid_width) * (target_size + padding*2)
+
+         clip = clip.set_position((x, y))
+         return clip
+     except Exception as e:
+         print(f"\nError processing {video_path}: {str(e)}")
+         return None
+
+ def create_montage(video_dir, output_path, width=1920, height=1080, fps=30):
+     print("Starting video creation...")
+     start_time = time.time()
+
+     TOTAL_DURATION = 15
+     FIRST_PHASE = 5
+     TRANSITION = 5
+     FINAL_PHASE = 5
+
+     video_paths = glob.glob(os.path.join(video_dir, "*.mp4"))
+
+     base_grid_videos = 400
+     aspect_ratio = 16/9
+     grid_width = int(np.sqrt(base_grid_videos * aspect_ratio))
+     grid_height = int(np.sqrt(base_grid_videos / aspect_ratio))
+
+     padding = 1
+     target_size = min(width // grid_width, height // grid_height) - padding*2
+
+     print(f"Creating grid of {grid_width}x{grid_height} videos")
+     print(f"Video size: {target_size}x{target_size} pixels")
+
+     needed_videos = grid_width * grid_height
+     if len(video_paths) > needed_videos:
+         video_paths = random.sample(video_paths, needed_videos)
+
+     args_list = [(path, target_size, padding, idx, grid_width)
+                  for idx, path in enumerate(video_paths)]
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+         futures = list(tqdm(
+             executor.map(load_video, args_list),
+             total=len(args_list),
+             desc="Loading videos"
+         ))
+         clips = [clip for clip in futures if clip is not None]
+
+     if not clips:
+         raise ValueError("No videos were successfully loaded!")
+
+     bg = ColorClip((width, height), color=(0, 0, 0))
+     video_clips = [bg] + clips
+
+     print("Creating video composition...")
+     video_comp = CompositeVideoClip(video_clips, size=(width, height))
+
+     w, h = video_comp.size
+     def get_zoom_crop(t):
+         if t < FIRST_PHASE:
+             return (w, h)
+         elif t < FIRST_PHASE + TRANSITION:
+             progress = (t - FIRST_PHASE) / TRANSITION
+             zoom_factor = 1 + (progress * 2)
+         else:
+             zoom_factor = 3
+         return (int(w/zoom_factor), int(h/zoom_factor))
+
+     def apply_zoom(gf, t):
+         frame = gf(t)
+         cw, ch = get_zoom_crop(t)
+         if cw >= w or ch >= h:
+             return frame
+         x = (w - cw) // 2
+         y = (h - ch) // 2
+         cropped = frame[y:y+ch, x:x+cw]
+         return cv2.resize(cropped, (w, h), interpolation=cv2.INTER_LINEAR)
+
+     video_comp = video_comp.fl(apply_zoom)
+     video_comp = video_comp.set_duration(TOTAL_DURATION)
+
+     text1 = create_text_overlay(
+         "AirLetters",
+         "\nAn Open Video Dataset of Characters Drawn in the Air",
+         width, height, 0, FIRST_PHASE
+     )
+
+     text2 = create_text_overlay(
+         "Novel Video Understanding Benchmark",
+         "for evaluating the ability to understand articulated motions which requires very strong temporal capabilities, a task very challenging for current models",
+         width, height, FIRST_PHASE + TRANSITION, FINAL_PHASE
+     )
+
+     final = CompositeVideoClip([video_comp, text1, text2])
+
+     print("Writing final video...")
+     final.write_videofile(
+         output_path,
+         fps=fps,
+         codec='libx264',
+         audio=False,
+         threads=16,
+         logger='bar'
+     )
+     print("Cleaning up...")
+     final.close()
+     for clip in clips:
+         if clip is not None:
+             clip.close()
+
+     print(f"\nTotal processing time: {time.time() - start_time:.2f} seconds")
+     print(f"Output saved to: {output_path}")
+
+ if __name__ == "__main__":
+     create_montage(
+         video_dir="airletters/videos",
+         output_path="30fps.mp4",
+         fps=30,
+     )
test.csv ADDED
The diff for this file is too large to render.