import json
from datetime import datetime, date

import gradio as gr
import plotly.graph_objects as go


def create_big_five_capex_plot() -> go.Figure:
    """Build a stacked bar chart of quarterly capital expenditures.

    Reads ``big_five_capex.jsonl`` from the current working directory. Every
    non-blank line must be a JSON object holding a ``"Quarter"`` label plus one
    value for each of the companies plotted below.

    Returns:
        A Plotly figure with one stacked bar trace per company. If both
        ``"2023 Q1"`` and ``"2023 Q2"`` occur in the data, a dotted vertical
        line labelled "AI arms race begins" is drawn midway between them;
        otherwise the marker is left out.

    Raises:
        OSError: If the data file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"Quarter"`` or one of the company keys.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open("big_five_capex.jsonl", "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]

    quarters: list[str] = [entry["Quarter"] for entry in data]

    # Insertion order of this mapping is the stacking order of the bars.
    company_colors = {
        'Microsoft': '#80bb00',
        'Google': '#ee161f',
        'Meta': '#0065e3',
        'Amazon': '#ff6200',
    }

    # Bars are placed on integer positions and relabelled with the quarter
    # names, so a marker can later be placed *between* two quarters.
    x_positions = list(range(len(quarters)))

    traces = [
        go.Bar(
            name=company,
            x=x_positions,
            y=[entry[company] for entry in data],
            marker_color=color
        )
        for company, color in company_colors.items()
    ]

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode="stack",
        title="Capital Expenditures of Amazon, Meta, Google and Microsoft in Millions of USD per Quarter",
        xaxis_title="Quarter",
        yaxis_title="Capital Expenditures (Millions USD)",
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=quarters
        ),
        height=800
    )

    # The marker belongs between "2023 Q1" and "2023 Q2".
    try:
        idx_q1 = quarters.index("2023 Q1")
        idx_q2 = quarters.index("2023 Q2")
    except ValueError:
        # Without both quarters there is no meaningful position for the
        # marker; drawing it elsewhere would mislabel an unrelated quarter.
        return fig

    vline_x = (idx_q1 + idx_q2) / 2  # midway between the two quarters

    # Vertical dotted line spanning the full height of the plotting area
    # (y is expressed in paper coordinates, 0 = bottom, 1 = top).
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=vline_x,
        y0=0,
        x1=vline_x,
        y1=1,
        line=dict(
            color="black",
            dash="dot",
            width=2
        )
    )

    # Label placed just above the top of the plotting area.
    fig.add_annotation(
        x=vline_x,
        y=1.05,
        xref="x",
        yref="paper",
        text="AI arms race begins",
        showarrow=False,
        font=dict(
            color="black",
            size=12
        ),
        align="center"
    )

    return fig


def _read_jsonl_records(path: str) -> list:
    """Parse a UTF-8 JSON Lines file into a list, ignoring blank lines."""
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]


def create_simple_plot(data_path: str,
                       name: str,
                       subtitle: str,
                       start_date: date, end_date: date,
                       min_value: int = 0, max_value: int = 100,
                       labeled_horizontal_lines: dict[str, float] = None) -> go.Figure:
    """Plot the best benchmark score achieved so far against model release date.

    Leaderboard records (``"model"`` and ``"score"`` keys) are read from
    ``data_path`` and joined by model name with ``models.jsonl`` in the
    current working directory (``"Name"`` and ``"Release Date"`` keys, the
    date formatted as ``YYYY-MM-DD``). Leaderboard entries whose model is not
    listed in ``models.jsonl`` are skipped after printing a warning.

    Args:
        data_path: Path of the JSON Lines leaderboard file.
        name: Metric name; used as the y-axis title and in the figure title.
        subtitle: Text rendered in smaller type below the title.
        start_date: Lower bound of the x-axis range.
        end_date: Upper bound of the x-axis range.
        min_value: Lower bound of the y-axis range.
        max_value: Upper bound of the y-axis range.
        labeled_horizontal_lines: Optional mapping of label to y value; each
            item is drawn as a dotted horizontal reference line. ``None`` or
            an empty mapping draws none.

    Returns:
        A Plotly figure with a step line of the running best score and one
        labelled marker for every model that set a new best.

    Raises:
        OSError: If either data file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys named above.
        ValueError: If a release date does not match ``YYYY-MM-DD``.
    """
    from itertools import accumulate  # local import: only needed here

    leaderboard = _read_jsonl_records(data_path)

    # Index the model catalogue once instead of scanning it per entry.
    # setdefault keeps the first record for a duplicated name, which is what
    # a front-to-back linear search would return.
    models_by_name = {}
    for model_record in _read_jsonl_records("models.jsonl"):
        models_by_name.setdefault(model_record['Name'], model_record)

    data = []
    for entry in leaderboard:
        model_name = entry['model']
        score = entry['score']
        model_info = models_by_name.get(model_name)
        if model_info is None:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")
            continue
        release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
        data.append({'model': model_name, 'score': score, 'release_date': release_date})

    # Chronological order is required for the running maximum below.
    data.sort(key=lambda x: x['release_date'])

    x_dates = [d['release_date'] for d in data]
    # Running maximum seeded from the first score rather than from 0, so that
    # series containing only negative scores are tracked correctly.
    y_scores = list(accumulate((d['score'] for d in data), max))

    fig = go.Figure()

    # 'hv' draws a step function: the best score stays flat until it is beaten.
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Best Score to Date'
    ))

    # Mark only the entries that raised the running best (the first entry
    # always does).
    for i, entry in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[entry['release_date']],
                y=[entry['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[entry['model']],
                textposition="top center",
                name=entry['model']
            ))

    fig.update_layout(
        title=f'{name} Over Time<br><sup>{subtitle}</sup>',
        xaxis_title='Publication or Release Date',
        yaxis_title=name,
        hovermode='x unified',
        xaxis=dict(
            range=[start_date, end_date],
            type='date'
        ),
        yaxis=dict(
            range=[min_value, max_value]
        ),
        height=800
    )

    if labeled_horizontal_lines:
        for label, y_value in labeled_horizontal_lines.items():
            fig.add_hline(
                y=y_value,
                line_dash="dot",
                line_color="black",
                annotation_text=label,
                annotation_position="right",
                annotation=dict(
                    font_size=12,
                    font_color="black",
                    xanchor="left",
                    yanchor="middle",
                    xshift=10
                )
            )

    return fig


with gr.Blocks() as demo:
    with gr.Tab("System Performance Over Time"):
        with gr.Tab("Legend"):
            legend_markdown: gr.Markdown = gr.Markdown(
                value="""
## Benchmarks and Top Scores

| Benchmark | Top Score |
|-----------|-----------|
| Humanity's Last Exam | 🔴 7% |
| BigCodeBench | 🟠 36% |
| Simple Bench | 🟠 42% |
| EMMA-Mini | 🟠 48% |
| PlanBench | 🟠 53% |
| NYT Connections | 🟡 60% |
| GAIA | 🟡 65% |
| LiveBench Language | 🟡 65% |
| LiveBench Data Analysis | 🟡 71% |
| LiveCodeBench | 🟡 73% |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
| LiveBench | 🟡 76% |
| GPQA | 🟡 76% |
| LiveBench Mathematics | 🟡 81% |
| ZebraLogic | 🟡 81% |
| LiveBench Coding | 🟡 83% |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
| LiveBench IF | 🟡 86% |
| ZeroEval | 🟡 86% |
| MATH-L5 | 🟡 89% |
| LiveBench Reasoning | 🟢 92% |
| MMLU-Redux | 🟢 93% |
| CRUX | 🟢 96% |

## Colors

| Color | Score Range |
|-------|------------|
| 🔴 Red | Below 30% |
| 🟠 Orange | 30% to 60% |
| 🟡 Yellow | 60% to 90% |
| 🟢 Green | Above 90% |"""
            )
        with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab:
            humanitys_last_exam_plot: gr.Plot = gr.Plot()
            humanitys_last_exam_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)"""
            )
        with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
            bigcodebench_plot: gr.Plot = gr.Plot()
            bigcodebench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)"""
            )
        with gr.Tab("🟠 Simple Bench") as simple_bench_tab:
            simple_bench_plot: gr.Plot = gr.Plot()
            simple_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
            )
        with gr.Tab("🟠 EMMA-Mini") as emma_tab:
            emma_plot: gr.Plot = gr.Plot()
            emma_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
            )
        with gr.Tab("🟠 PlanBench") as planbench_tab:
            planbench_plot: gr.Plot = gr.Plot()
            planbench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
            )
        with gr.Tab("🟡 NYT Connections") as nyt_connections_tab:
            nyt_connections_plot: gr.Plot = gr.Plot()
            nyt_connections_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
            )
        with gr.Tab("🟡 GAIA") as gaia_tab:
            gaia_plot: gr.Plot = gr.Plot()
            gaia_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
            )
        with gr.Tab("🟡 LiveBench Language") as livebench_language_tab:
            livebench_language_plot: gr.Plot = gr.Plot()
            livebench_language_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab:
            livebench_data_analysis_plot: gr.Plot = gr.Plot()
            livebench_data_analysis_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab:
            livecodebench_plot: gr.Plot = gr.Plot()
            livecodebench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)"""
            )
        with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
            with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
                arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
            with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab:
                arc_agi_public_eval_plot: gr.Plot = gr.Plot()
            arc_agi_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
            )
        with gr.Tab("🟡 LiveBench") as livebench_tab:
            livebench_plot: gr.Plot = gr.Plot()
            livebench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 GPQA") as gpqa_tab:
            gpqa_plot: gr.Plot = gr.Plot()
            gpqa_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
            )
        with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab:
            livebench_mathematics_plot: gr.Plot = gr.Plot()
            livebench_mathematics_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
            zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
            zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab:
            livebench_coding_plot: gr.Plot = gr.Plot()
            livebench_coding_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 LiveBench IF") as livebench_if_tab:
            livebench_if_plot: gr.Plot = gr.Plot()
            livebench_if_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench IF](https://livebench.ai/)"""
            )
        with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
            zeroeval_average_plot: gr.Plot = gr.Plot()
            zeroeval_average_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab:
            zeroeval_math_l5_plot: gr.Plot = gr.Plot()
            zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab:
            livebench_reasoning_plot: gr.Plot = gr.Plot()
            livebench_reasoning_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
            )
        with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
            zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
            zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 CRUX") as zeroeval_crux_tab:
            zeroeval_crux_plot: gr.Plot = gr.Plot()
            zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("OpenCompass", visible=False):
            opencompass_plot: gr.Plot = gr.Plot()
            opencompass_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)"""
            )
        with gr.Tab("SWE-bench", visible=False):
            swe_bench_plot: gr.Plot = gr.Plot()
            swe_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
            )
        with gr.Tab("SWE-bench Multimodal", visible=False):
            swe_bench_multimodal_plot: gr.Plot = gr.Plot()
            swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)"""
            )
        with gr.Tab("WebArena", visible=False):
            webarena_plot: gr.Plot = gr.Plot()
            webarena_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
            )
        with gr.Tab("OSWorld", visible=False):
            osworld_plot: gr.Plot = gr.Plot()
            osworld_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
            )
        with gr.Tab("MathVista", visible=False):
            mathvista_plot: gr.Plot = gr.Plot()
            mathvista_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)"""
            )
        with gr.Tab("DABStep", visible=False):
            dabstep_plot: gr.Plot = gr.Plot()
            dabstep_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)"""
            )
        with gr.Tab("lineage-bench", visible=False):
            lineage_bench_plot: gr.Plot = gr.Plot()
            lineage_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)"""
            )
        with gr.Tab("Step-Game", visible=False):
            step_game_plot: gr.Plot = gr.Plot()
            step_game_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)"""
            )
        with gr.Tab("HHEM", visible=False):
            hhem_plot: gr.Plot = gr.Plot()
            hhem_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
            )
        with gr.Tab("USACO", visible=False):
            usaco_plot: gr.Plot = gr.Plot()
            usaco_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)"""
            )
        with gr.Tab("AppWorld", visible=False):
            appworld_plot: gr.Plot = gr.Plot()
            appworld_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)"""
            )
        with gr.Tab("CORE-Bench", visible=False):
            core_bench_plot: gr.Plot = gr.Plot()
            core_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)"""
            )
        with gr.Tab("Cybench", visible=False):
            cybench_plot: gr.Plot = gr.Plot()
            cybench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)"""
            )
        with gr.Tab("MultiChallenge", visible=False):
            multichallenge_plot: gr.Plot = gr.Plot()
            multichallenge_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)"""
            )
        with gr.Tab("VISTA", visible=False):
            vista_plot: gr.Plot = gr.Plot()
            vista_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)"""
            )
        with gr.Tab("ToolComp", visible=False):
            with gr.Tab("Enterprise"):
                toolcomp_enterprise_plot: gr.Plot = gr.Plot()
                toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown(
                    value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)"""
                )
            with gr.Tab("Chat"):
                toolcomp_chat_plot: gr.Plot = gr.Plot()
                toolcomp_chat_markdown: gr.Markdown = gr.Markdown(
                    value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)"""
                )
        with gr.Tab("BFCL", visible=False):
            bfcl_plot: gr.Plot = gr.Plot()
            bfcl_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)"""
            )
        with gr.Tab("EvalPlus", visible=False):
            evalplus_plot: gr.Plot = gr.Plot()
            evalplus_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)"""
            )
        with gr.Tab("Aider Polyglot", visible=False):
            aider_plot: gr.Plot = gr.Plot()
            aider_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)"""
            )
        with gr.Tab("QuALITY", visible=False):
            quality_plot: gr.Plot = gr.Plot()
            quality_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
            )
        with gr.Tab("MMVU", visible=False):
            mmvu_plot: gr.Plot = gr.Plot()
            mmvu_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)"""
            )
        with gr.Tab("PhysBench", visible=False):
            physbench_plot: gr.Plot = gr.Plot()
            physbench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)"""
            )
    with gr.Tab("Finance") as finance_tab:
        with gr.Tab("Big Tech Capex") as big_five_capex_tab:
            big_five_capex_plot: gr.Plot = gr.Plot()
        with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue:
            nvidia_revenue_plot: gr.Plot = gr.Plot()
    big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    arc_agi_public_eval_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("arc_agi_leaderboard.jsonl"),
                                           gr.State(
                                               "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                           gr.State(
                                               "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                           gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                                           gr.State(0), gr.State(100),
                                           gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                   outputs=arc_agi_public_eval_plot)
    arc_agi_tab.select(fn=create_simple_plot,
                       inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                               gr.State(
                                   "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                               gr.State(
                                   "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                               gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                               gr.State(0), gr.State(100),
                               gr.State({"MTurkers": 77})],
                       outputs=arc_agi_semi_private_eval_plot)
    arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
                                         inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                                                 gr.State(
                                                     "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                                 gr.State(
                                                     "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                                 gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                                 gr.State(0), gr.State(100),
                                                 gr.State({"MTurkers": 77})],
                                         outputs=arc_agi_semi_private_eval_plot)
    finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    simple_bench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("simple_bench_leaderboard.jsonl"),
                                    gr.State("Simple Bench Score"),
                                    gr.State(
                                        "\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
                                    gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
                                    gr.State(0), gr.State(100),
                                    gr.State({"Humans": 83.7})],
                            outputs=simple_bench_plot)
    planbench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("planbench_leaderboard.jsonl"),
                                 gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
                                 gr.State(
                                     "\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
                                 gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
                         outputs=planbench_plot)
    bigcodebench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
                                    gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
                                    gr.State(
                                        "\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
                                    gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
                            outputs=bigcodebench_plot)
    gaia_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gaia_leaderboard.jsonl"),
                            gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
                            gr.State(
                                "\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
                            gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(0), gr.State(100),
                            gr.State({"Humans": 92})],
                    outputs=gaia_plot)
    gpqa_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gpqa_leaderboard.jsonl"),
                            gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
                            gr.State(
                                "\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
                            gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(25), gr.State(100),
                            gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
                    outputs=gpqa_plot)
    zeroeval_average_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
                                        gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
                                        gr.State(
                                            "\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_average_plot)
    zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
                                           gr.State(
                                               "ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
                                           gr.State(
                                               "\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_mmlu_redux_plot)
    zeroeval_zebralogic_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
                                           gr.State("ZeroEval ZebraLogic Score"),
                                           gr.State(
                                               "\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_zebralogic_plot)
    zeroeval_crux_tab.select(fn=create_simple_plot,
                             inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
                                     gr.State(
                                         "ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
                                     gr.State(
                                         "\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
                                     gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                             outputs=zeroeval_crux_plot)
    zeroeval_math_l5_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
                                        gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
                                        gr.State(
                                            "\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_math_l5_plot)
    livebench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("livebench.jsonl"),
                                 gr.State("LiveBench-2024-11-25: Global Average Score"),
                                 gr.State(
                                     "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                 gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                         outputs=livebench_plot)
    # Wire up the per-category LiveBench tabs. Every category uses the same
    # handler, the same citation text and the same pair of dates; only the data
    # file, the category name in the title and the target plot differ. Keeping
    # the shared values in one place means a change to the citation or the
    # dates is made exactly once.
    _livebench_citation = "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"
    # (tab to listen on, plot to update, JSON Lines data file, category name)
    _livebench_categories = (
        (livebench_reasoning_tab, livebench_reasoning_plot,
         "livebench_reasoning.jsonl", "Reasoning"),
        (livebench_coding_tab, livebench_coding_plot,
         "livebench_coding.jsonl", "Coding"),
        (livebench_mathematics_tab, livebench_mathematics_plot,
         "livebench_mathematics.jsonl", "Mathematics"),
        (livebench_data_analysis_tab, livebench_data_analysis_plot,
         "livebench_data_analysis.jsonl", "Data Analysis"),
        (livebench_language_tab, livebench_language_plot,
         "livebench_language.jsonl", "Language"),
        (livebench_if_tab, livebench_if_plot,
         "livebench_if.jsonl", "IF"),
    )
    for _lb_tab, _lb_plot, _lb_data_file, _lb_category in _livebench_categories:
        # A fresh gr.State is created for every input of every listener, in the
        # same order as the hand-written calls this loop replaces.
        _lb_tab.select(fn=create_simple_plot,
                       inputs=[gr.State(_lb_data_file),
                               gr.State(f"LiveBench-2024-11-25: {_lb_category} Average Score"),
                               gr.State(_livebench_citation),
                               # NOTE(review): presumably the plotted date range - confirm
                               # against create_simple_plot's signature.
                               gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                       outputs=_lb_plot)
    # Humanity's Last Exam tab: on selection, call create_simple_plot with the
    # constant inputs below (each wrapped in its own gr.State, in this order)
    # and send the result to humanitys_last_exam_plot.
    humanitys_last_exam_tab.select(
        fn=create_simple_plot,
        inputs=[gr.State(constant) for constant in (
            "humanitys_last_exam.jsonl",
            "Humanity's Last Exam (Multi-Modal Models Only) Score",
            "\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)",
            date(2024, 5, 13),
            date(2025, 2, 11),
        )],
        outputs=humanitys_last_exam_plot,
    )
    # LiveCodeBench tab: on selection, call create_simple_plot with the
    # constant inputs below (each wrapped in its own gr.State, in this order)
    # and send the result to livecodebench_plot.
    livecodebench_tab.select(
        fn=create_simple_plot,
        inputs=[gr.State(constant) for constant in (
            "livecodebench.jsonl",
            "LiveCodeBench (7/1/2024 to 2/1/2025) Score",
            "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)",
            date(2024, 4, 9),
            date(2025, 2, 1),
        )],
        outputs=livecodebench_plot,
    )
    # EMMA-Mini tab: on selection, call create_simple_plot with the constant
    # inputs below (each wrapped in its own gr.State, in this order) and send
    # the result to emma_plot.
    # NOTE(review): the three trailing inputs are presumably the y-axis bounds
    # and a labelled reference value - confirm against create_simple_plot.
    emma_tab.select(
        fn=create_simple_plot,
        inputs=[gr.State(constant) for constant in (
            "emma_mini.jsonl",
            "EMMA-Mini (Enhanced MultiModal ReAsoning) Score",
            "\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)",
            date(2024, 9, 17),
            date(2025, 2, 1),
            22.75,
            100,
            {"Human experts": 77.75},
        )],
        outputs=emma_plot,
    )
    # NYT Connections tab: on selection, call create_simple_plot with the
    # constant inputs below (each wrapped in its own gr.State, in this order)
    # and send the result to nyt_connections_plot.
    nyt_connections_tab.select(
        fn=create_simple_plot,
        inputs=[gr.State(constant) for constant in (
            "nyt_connections.jsonl",
            "NYT Connections (Extended Version, Newest 100 Puzzles) Score",
            "\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)",
            date(2024, 7, 23),
            date(2025, 2, 1),
        )],
        outputs=nyt_connections_plot,
    )

# Start the `demo` app only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    demo.launch()