rishitdagli committed
Commit f9bf356 · 1 Parent(s): 49c967c

leaderboard

.gitignore ADDED
@@ -0,0 +1,13 @@
+ auto_evals/
+ venv/
+ __pycache__/
+ .env
+ .ipynb_checkpoints
+ *ipynb
+ .vscode/
+
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ default_language_version:
+   python: python3
+
+ ci:
+   autofix_prs: true
+   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+   autoupdate_schedule: quarterly
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: check-yaml
+       - id: check-case-conflict
+       - id: detect-private-key
+       - id: check-added-large-files
+         args: ['--maxkb=1000']
+       - id: requirements-txt-fixer
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+
+   - repo: https://github.com/PyCQA/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         name: Format imports
+
+   - repo: https://github.com/psf/black
+     rev: 22.12.0
+     hooks:
+       - id: black
+         name: Format code
+         additional_dependencies: ['click==8.0.2']
+
+   - repo: https://github.com/charliermarsh/ruff-pre-commit
+     # Ruff version.
+     rev: 'v0.0.267'
+     hooks:
+       - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
+ .PHONY: style quality
+
+
+ style:
+ 	python -m black --line-length 119 .
+ 	python -m isort .
+ 	ruff check --fix .
+
+
+ quality:
+ 	python -m black --check --line-length 119 .
+ 	python -m isort --check-only .
+ 	ruff check .
app.py ADDED
@@ -0,0 +1,187 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from huggingface_hub import HfApi, create_repo
+ from datasets import Dataset, load_dataset
+ import os
+ import html
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not found")
+ api = HfApi(token=HF_TOKEN)
+ DATASET_NAME = "airletters-leaderboard-results"
+
+
+ INITIAL_DATA = {
+     "name": [
+         "ViT-B/16", "MaxViT-T", "ResNet-200", "ResNeXt-101", "SE-ResNeXt-26",
+         "ResNet-50", "VideoMAE (16)", "ResNet-101 + LSTM", "ResNet-50 + LSTM",
+         "ResNext-152 3D", "Strided Inflated EfficientNet 3D", "ResNext-50 3D",
+         "ResNext-101 3D", "ResNext-200 3D", "Video-LLaVA (w/o contrast class)",
+         "VideoLLaMA2 (w/o contrast class)", "Video-LLaVA", "VideoLLaMA2",
+         "Human Performance (10 videos/class)"
+     ],
+     "url": ["https://arxiv.org/abs/2410.02921"] * 19,
+     "top1_accuracy": [
+         7.49, 7.56, 11.44, 13.09, 13.29, 13.87, 57.96, 58.45, 63.24,
+         65.77, 65.97, 66.54, 69.74, 71.20, 2.53, 2.47, 7.29, 7.58, 96.67
+     ],
+     "organization": ["AirLetters Authors"] * 19,
+     "model_type": [
+         "Image", "Image", "Image", "Image", "Image",
+         "Image", "Video", "Video", "Video",
+         "Video", "Video", "Video", "Video", "Video",
+         "Vision Language", "Vision Language", "Vision Language", "Vision Language",
+         "Human Evaluation"
+     ]
+ }
+
+ def initialize_dataset():
+     try:
+         dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN, download_mode="force_redownload")
+         df = dataset.to_pandas()
+         if 'model_url' in df.columns:
+             # Migrate the old schema: fold the separate model_url column into a clickable name.
+             df["name"] = df.apply(lambda row: model_hyperlink(row["model_url"], row["name"]), axis=1)
+             df = df.drop('model_url', axis=1)
+             dataset = Dataset.from_pandas(df)
+             dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+     except Exception as e:
+         print(f"Creating new dataset due to: {str(e)}")
+         df = pd.DataFrame(INITIAL_DATA)
+         dataset = Dataset.from_pandas(df)
+         try:
+             create_repo(DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
+         except Exception as e:
+             print(f"Repo might already exist: {str(e)}")
+         dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+     return dataset
+
+ def calculate_accuracy(test_file, submitted_file):
+     test = pd.read_csv(test_file)
+     test2 = pd.read_csv(submitted_file)
+     test.columns = test.columns.str.strip()
+     test2.columns = test2.columns.str.strip()
+     merged = pd.merge(test, test2, on="filename", how="left", suffixes=("_true", "_pred"))
+     merged["label_pred"] = merged["label_pred"].fillna("")
+     correct_predictions = (merged["label_true"] == merged["label_pred"]).sum()
+     total_entries = len(merged)
+     return (correct_predictions / total_entries) * 100
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+ def update_leaderboard(name, organization, model_type, model_url, csv_file):
+     top1_acc = calculate_accuracy("test.csv", csv_file)
+
+     dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN)
+     df = dataset.to_pandas()
+
+     new_row = {
+         'name': name,
+         'url': model_url,
+         'organization': organization,
+         'model_type': model_type,
+         'top1_accuracy': top1_acc,
+     }
+
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+     df = df.sort_values('top1_accuracy', ascending=False)
+     df = df.reset_index(drop=True)
+
+     new_dataset = Dataset.from_pandas(df)
+     new_dataset.push_to_hub(DATASET_NAME, token=HF_TOKEN)
+
+     return "Successfully updated leaderboard"
+
+ def init_leaderboard(dataframe):
+     return Leaderboard(
+         value=dataframe,
+         datatype=["markdown", "str", "str", "number"],
+         select_columns=SelectColumns(
+             default_selection=["name", "organization", "model_type", "top1_accuracy"],
+             cant_deselect=["name", "top1_accuracy"],
+             label="Select Columns to Display",
+         ),
+         search_columns=["name", "organization"],
+         filter_columns=[
+             ColumnFilter("model_type", type="checkboxgroup", label="Model Type"),
+         ],
+     )
+
+ def refresh():
+     dataset = load_dataset(f"rishitdagli/{DATASET_NAME}", split="train", token=HF_TOKEN, download_mode="force_redownload")
+     dataset = dataset.map(lambda row: {"name": model_hyperlink(row["url"], row["name"])})
+     df = dataset.to_pandas()
+     return df
+
+ def create_interface():
+     demo = gr.Blocks(css=custom_css)
+
+     with demo:
+         gr.HTML(TITLE)
+         gr.Video("30fps.mp4", autoplay=True, width=900, loop=True, include_audio=False)
+
+         with gr.Tabs(elem_classes="tab-buttons") as tabs:
+             with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab"):
+                 dataset = initialize_dataset()
+                 dataset = dataset.map(lambda row: {"name": model_hyperlink(row["url"], row["name"])})
+                 df = dataset.to_pandas()
+                 leaderboard = init_leaderboard(df)
+
+                 refresh_button = gr.Button("Refresh")
+                 refresh_button.click(
+                     refresh,
+                     inputs=[],
+                     outputs=[
+                         leaderboard,
+                     ],
+                 )
+
+             with gr.TabItem("📝 About", elem_id="about-tab"):
+                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+             with gr.TabItem("🚀 Submit", elem_id="submit-tab"):
+                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     name = gr.Textbox(label="Model Name")
+                     model_url = gr.Textbox(label="Model URL", placeholder="https://huggingface.co/...")
+                     org = gr.Textbox(label="Organization")
+                     model_type = gr.Dropdown(
+                         choices=["Image", "Video", "Vision Language", "Tracking", "Other"],
+                         label="Model Type"
+                     )
+                     csv_file = gr.File(label="Results CSV", type="filepath")
+                     submit_btn = gr.Button("Submit")
+                     result = gr.Textbox(label="Result")
+
+                     submit_btn.click(
+                         update_leaderboard,
+                         inputs=[name, org, model_type, model_url, csv_file],
+                         outputs=[result]
+                     )
+         with gr.Row():
+             with gr.Accordion("📙 Citation", open=False):
+                 citation_button = gr.Textbox(
+                     value=CITATION_BUTTON_TEXT,
+                     label=CITATION_BUTTON_LABEL,
+                     lines=7,
+                     elem_id="citation-button",
+                     show_copy_button=True,
+                 )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.queue().launch()
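
A minimal sketch of how a submission is scored by `calculate_accuracy` in app.py above: both the leaderboard's ground-truth `test.csv` and the uploaded file need `filename` and `label` columns, the two are left-merged on `filename`, and any ground-truth row without a matching prediction counts as incorrect. The filenames and labels below are made up for illustration.

```python
# Toy illustration of the scoring logic in calculate_accuracy (app.py); not real AirLetters data.
import io

import pandas as pd

ground_truth = io.StringIO("filename,label\na.mp4,A\nb.mp4,B\nc.mp4,C\n")
submission = io.StringIO("filename,label\na.mp4,A\nb.mp4,D\n")  # c.mp4 missing from the submission

test = pd.read_csv(ground_truth)
pred = pd.read_csv(submission)
merged = pd.merge(test, pred, on="filename", how="left", suffixes=("_true", "_pred"))
merged["label_pred"] = merged["label_pred"].fillna("")  # unmatched rows are scored as wrong

top1 = (merged["label_true"] == merged["label_pred"]).mean() * 100
print(f"Top-1 accuracy: {top1:.2f}%")  # 33.33% for this toy example
```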
emoji/envelope.png ADDED
emoji/wind.png ADDED
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+ [tool.ruff]
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+ select = ["E", "F"]
+ ignore = ["E501"] # line too long (black is taking care of this)
+ line-length = 119
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+
+ [tool.isort]
+ profile = "black"
+ line_length = 119
+
+ [tool.black]
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ APScheduler
+ black
+ datasets
+ gradio
+ gradio[oauth]
+ gradio_leaderboard==0.0.13
+ gradio_client
+ huggingface-hub>=0.18.0
+ matplotlib
+ numpy
+ pandas
+ python-dateutil
+ tqdm
+ transformers
+ tokenizers>=0.15.0
+ sentencepiece
src/about.py ADDED
@@ -0,0 +1,31 @@
+ TITLE = """<h1 align="center" id="space-title">Unofficial AirLetters Leaderboard</h1>"""
+
+ INTRODUCTION_TEXT = """
+ We introduce a new real-world dataset of human-generated articulated motions: labelled videos of people drawing Latin characters in the air. Unlike existing video datasets, accurate video understanding on our dataset requires a detailed understanding of motion and the integration of long-range information across the entire video. We show that existing image and video understanding models perform poorly and fall far behind the human baseline.
+
+ Unlike many video datasets, which are overly dependent on a single key frame or on just 2-4 key frames, AirLetters requires strong temporal capabilities. Our study shows that, while the task is trivial for humans, accurate representation of complex articulated motions remains an open problem for end-to-end learning.
+
+ See:
+
+ - [The Paper](https://arxiv.org/abs/2410.02921)
+ - [Dataset Download Link](https://www.qualcomm.com/developer/software/airletters-dataset)
+ """
+
+ LLM_BENCHMARKS_TEXT = INTRODUCTION_TEXT
+
+ EVALUATION_QUEUE_TEXT = """
+ 1. Prepare your results in CSV format with the columns: filename and label
+ 2. Fill in your model details and URL
+ 3. Upload your results file
+ 4. Your model will be evaluated against the test set and added to the leaderboard automatically.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""
+ @inproceedings{dagliairletters,
+   title={AirLetters: An Open Video Dataset of Characters Drawn in the Air},
+   author={Dagli, Rishit and Berger, Guillaume and Materzynska, Joanna and Bax, Ingo and Memisevic, Roland},
+   booktitle={European Conference on Computer Vision Workshops},
+   year={2024}
+ }
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
+ custom_css = """
+
+ .markdown-text {
+     font-size: 16px !important;
+ }
+
+ #models-to-add-text {
+     font-size: 18px !important;
+ }
+
+ #citation-button span {
+     font-size: 16px !important;
+ }
+
+ #citation-button textarea {
+     font-size: 16px !important;
+ }
+
+ #citation-button > label > button {
+     margin: 6px;
+     transform: scale(1.3);
+ }
+
+ #leaderboard-table {
+     margin-top: 15px
+ }
+
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+
+ #search-bar-table-box > div:first-child {
+     background: none;
+     border: none;
+ }
+
+ #search-bar {
+     padding: 0px;
+ }
+
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ #leaderboard-table td:nth-child(2),
+ #leaderboard-table th:nth-child(2) {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+
+ .tab-buttons button {
+     font-size: 20px;
+ }
+
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+
+ #scale-logo .download {
+     display: none;
+ }
+ #filter_type {
+     border: 0;
+     padding-left: 0;
+     padding-top: 0;
+ }
+ #filter_type label {
+     display: flex;
+ }
+ #filter_type label > span {
+     margin-top: var(--spacing-lg);
+     margin-right: 0.5em;
+ }
+ #filter_type label > .wrap {
+     width: 103px;
+ }
+ #filter_type label > .wrap .wrap-inner {
+     padding: 2px;
+ }
+ #filter_type label > .wrap .wrap-inner input {
+     width: 1px
+ }
+ #filter-columns-type {
+     border: 0;
+     padding: 0.5;
+ }
+ #filter-columns-size {
+     border: 0;
+     padding: 0.5;
+ }
+ #box-filter > .form {
+     border: 0
+ }
+ """
+
+ get_window_url_params = """
+ function(url_params) {
+     const params = new URLSearchParams(window.location.search);
+     url_params = Object.fromEntries(params);
+     return url_params;
+ }
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def make_clickable_model(model_name):
+     link = f"https://huggingface.co/{model_name}"
+     return model_hyperlink(link, model_name)
+
+
+ def styled_error(error):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+ def styled_warning(warn):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+ def styled_message(message):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+ def has_no_nan_values(df, columns):
+     return df[columns].notna().all(axis=1)
+
+
+ def has_nan_values(df, columns):
+     return df[columns].isna().any(axis=1)
teaser.py ADDED
@@ -0,0 +1,250 @@
+ import os
+ import cv2
+ import numpy as np
+ from moviepy.editor import VideoFileClip, CompositeVideoClip, ColorClip
+ from tqdm import tqdm
+ import glob
+ import concurrent.futures
+ import time
+ import random
+
+ def create_text_overlay(text, subtitle, width, height, start_time, duration):
+     overlay = np.zeros((height, width, 4), dtype=np.uint8)
+
+     box_width = int(width * 0.75)
+     box_x_start = (width - box_width) // 2
+
+     cv2.rectangle(overlay,
+                   (box_x_start, height//3),
+                   (box_x_start + box_width, 2*height//3),
+                   (0,0,0,180), -1)
+
+     font = cv2.FONT_HERSHEY_DUPLEX
+
+     if "AirLetters" in text:
+         title_scale = 3.0
+         subtitle_scale = 1.5
+
+         envelope_emoji = cv2.imread("emoji/envelope.png", cv2.IMREAD_UNCHANGED)
+         wind_emoji = cv2.imread("emoji/wind.png", cv2.IMREAD_UNCHANGED)
+
+         target_height = int(title_scale * 30)
+         envelope_aspect = envelope_emoji.shape[1] / envelope_emoji.shape[0]
+         wind_aspect = wind_emoji.shape[1] / wind_emoji.shape[0]
+
+         envelope_emoji = cv2.resize(envelope_emoji,
+                                     (int(target_height * envelope_aspect), target_height))
+         wind_emoji = cv2.resize(wind_emoji,
+                                 (int(target_height * wind_aspect), target_height))
+     else:
+         title_scale = 2.0
+         subtitle_scale = 1.0
+
+     title_color = (138, 223, 178, 255)
+     subtitle_color = (255, 255, 255, 255)
+
+     title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
+
+     # Center text within the box
+     title_x = box_x_start + (box_width - title_size[0]) // 2
+     title_y = height // 2
+
+     if "AirLetters" in text:
+         emoji_y = title_y - target_height + 5
+         envelope_x = title_x - envelope_emoji.shape[1] - 20
+         wind_x = title_x + title_size[0] + 20
+
+         def overlay_image_with_alpha(background, foreground, x, y):
+             if x >= background.shape[1] or y >= background.shape[0]:
+                 return
+
+             h, w = foreground.shape[:2]
+             if len(foreground.shape) == 2:
+                 alpha = foreground
+                 foreground = cv2.cvtColor(foreground, cv2.COLOR_GRAY2BGR)
+             else:
+                 alpha = foreground[:, :, 3] / 255.0
+                 foreground = foreground[:, :, :3]
+
+             y1, y2 = max(0, y), min(background.shape[0], y + h)
+             x1, x2 = max(0, x), min(background.shape[1], x + w)
+
+             alpha_slice = alpha[y1-y:y2-y, x1-x:x2-x]
+             alpha_expanded = np.expand_dims(alpha_slice, axis=-1)
+
+             background_slice = background[y1:y2, x1:x2, :3]
+             foreground_slice = foreground[y1-y:y2-y, x1-x:x2-x]
+
+             background[y1:y2, x1:x2, :3] = background_slice * (1 - alpha_expanded) + foreground_slice * alpha_expanded
+
+             background[y1:y2, x1:x2, 3] = background[y1:y2, x1:x2, 3] * (1 - alpha_slice) + 255 * alpha_slice
+
+         overlay_image_with_alpha(overlay, envelope_emoji, envelope_x, emoji_y)
+         overlay_image_with_alpha(overlay, wind_emoji, wind_x, emoji_y)
+     else:
+         if len(subtitle) > 50:
+             words = subtitle.split()
+             mid = len(words) // 2
+             subtitle = " ".join(words[:mid]) + "\n" + " ".join(words[mid:])
+
+     title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
+
+     title_x = box_x_start + (box_width - title_size[0]) // 2
+     title_y = height // 2
+
+     cv2.putText(overlay, text, (title_x, title_y), font, title_scale, title_color, 2)
+
+     if "\n" in subtitle:
+         subtitle_lines = subtitle.split("\n")
+         subtitle_y = title_y + 50
+         for line in subtitle_lines:
+             subtitle_size = cv2.getTextSize(line, font, subtitle_scale, 2)[0]
+             subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
+             cv2.putText(overlay, line, (subtitle_x, subtitle_y), font, subtitle_scale, subtitle_color, 2)
+             subtitle_y += 50
+     else:
+         subtitle_size = cv2.getTextSize(subtitle, font, subtitle_scale, 2)[0]
+         subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
+         cv2.putText(overlay, subtitle, (subtitle_x, title_y + 60), font, subtitle_scale, subtitle_color, 2)
+
+     overlay_clip = ColorClip(size=(width, height), color=[0,0,0,0])
+     overlay_clip.mask = ColorClip(size=(width, height), color=[1,1,1,1])
+     overlay_clip.mask.get_frame = lambda t: overlay[:,:,3:4] / 255.0
+     overlay_clip.get_frame = lambda t: overlay[:,:,:3]
+
+     overlay_clip = overlay_clip.set_start(start_time)
+     overlay_clip = overlay_clip.set_duration(duration)
+     overlay_clip = overlay_clip.fadein(0.5).fadeout(0.5)
+
+     return overlay_clip
+
+ def load_video(args):
+     video_path, target_size, padding, idx, grid_width = args
+     try:
+         clip = VideoFileClip(video_path, audio=False)
+
+         clip = clip.resize(height=target_size)
+         clip = clip.crop(x1=(clip.w - target_size)//2, x2=(clip.w + target_size)//2) if clip.w > target_size else clip
+         clip = clip.loop()
+
+         bg = ColorClip(size=(target_size + padding*2, target_size + padding*2), color=(255,255,255))
+         clip = clip.set_position((padding, padding))
+         clip = CompositeVideoClip([bg, clip])
+
+         x = (idx % grid_width) * (target_size + padding*2)
+         y = (idx // grid_width) * (target_size + padding*2)
+
+         clip = clip.set_position((x, y))
+         return clip
+     except Exception as e:
+         print(f"\nError processing {video_path}: {str(e)}")
+         return None
+
+ def create_montage(video_dir, output_path, width=1920, height=1080, fps=30):
+     print("Starting video creation...")
+     start_time = time.time()
+
+     TOTAL_DURATION = 15
+     FIRST_PHASE = 5
+     TRANSITION = 5
+     FINAL_PHASE = 5
+
+     video_paths = glob.glob(os.path.join(video_dir, "*.mp4"))
+
+     base_grid_videos = 400
+     aspect_ratio = 16/9
+     grid_width = int(np.sqrt(base_grid_videos * aspect_ratio))
+     grid_height = int(np.sqrt(base_grid_videos / aspect_ratio))
+
+     padding = 1
+     target_size = min(width // grid_width, height // grid_height) - padding*2
+
+     print(f"Creating grid of {grid_width}x{grid_height} videos")
+     print(f"Video size: {target_size}x{target_size} pixels")
+
+     needed_videos = grid_width * grid_height
+     if len(video_paths) > needed_videos:
+         video_paths = random.sample(video_paths, needed_videos)
+
+     args_list = [(path, target_size, padding, idx, grid_width)
+                  for idx, path in enumerate(video_paths)]
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+         futures = list(tqdm(
+             executor.map(load_video, args_list),
+             total=len(args_list),
+             desc="Loading videos"
+         ))
+         clips = [clip for clip in futures if clip is not None]
+
+     if not clips:
+         raise ValueError("No videos were successfully loaded!")
+
+     bg = ColorClip((width, height), color=(0, 0, 0))
+     video_clips = [bg] + clips
+
+     print("Creating video composition...")
+     video_comp = CompositeVideoClip(video_clips, size=(width, height))
+
+     w, h = video_comp.size
+     def get_zoom_crop(t):
+         if t < FIRST_PHASE:
+             return (w, h)
+         elif t < FIRST_PHASE + TRANSITION:
+             progress = (t - FIRST_PHASE) / TRANSITION
+             zoom_factor = 1 + (progress * 2)
+         else:
+             zoom_factor = 3
+         return (int(w/zoom_factor), int(h/zoom_factor))
+
+     def apply_zoom(gf, t):
+         frame = gf(t)
+         cw, ch = get_zoom_crop(t)
+         if cw >= w or ch >= h:
+             return frame
+         x = (w - cw) // 2
+         y = (h - ch) // 2
+         cropped = frame[y:y+ch, x:x+cw]
+         return cv2.resize(cropped, (w, h), interpolation=cv2.INTER_LINEAR)
+
+     video_comp = video_comp.fl(apply_zoom)
+     video_comp = video_comp.set_duration(TOTAL_DURATION)
+
+     text1 = create_text_overlay(
+         "AirLetters",
+         "\nAn Open Video Dataset of Characters Drawn in the Air",
+         width, height, 0, FIRST_PHASE
+     )
+
+     text2 = create_text_overlay(
+         "Novel Video Understanding Benchmark",
+         "for evaluating the ability to understand articulated motions which requires very strong temporal capabilities, a task very challenging for current models",
+         width, height, FIRST_PHASE + TRANSITION, FINAL_PHASE
+     )
+
+     final = CompositeVideoClip([video_comp, text1, text2])
+
+     print("Writing final video...")
+     final.write_videofile(
+         output_path,
+         fps=fps,
+         codec='libx264',
+         audio=False,
+         threads=16,
+         logger='bar'
+     )
+     print("Cleaning up...")
+     final.close()
+     for clip in clips:
+         if clip is not None:
+             clip.close()
+
+     print(f"\nTotal processing time: {time.time() - start_time:.2f} seconds")
+     print(f"Output saved to: {output_path}")
+
+ if __name__ == "__main__":
+     create_montage(
+         video_dir="airletters/videos",
+         output_path="30fps.mp4",
+         fps=30,
+     )
test.csv ADDED
The diff for this file is too large to render.