Ali Sartaz Khan committed on
Commit 3c8c320 · 1 Parent(s): c2b5b47

Add application file
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
app.py ADDED
@@ -0,0 +1,8 @@
+ # app.py
+ import sys
+
+ # Make the local talk-arena checkout importable before importing from it.
+ sys.path.append("talk-arena")
+ from talk_arena.audio_collection import demo
+
+ demo.queue(default_concurrency_limit=40, api_open=False).launch(share=True, ssr_mode=False)
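A minimal local smoke test for this entrypoint could look like the sketch below; the share=False launch and the smaller queue limit are assumptions for private testing, not part of the commit:

# smoke_test.py — hypothetical local check; assumes a talk-arena checkout
# sits next to this file, exactly as app.py expects.
import sys

sys.path.append("talk-arena")
from talk_arena.audio_collection import demo

# Launch on localhost only; app.py uses share=True, which exposes a public link.
demo.queue(default_concurrency_limit=4, api_open=False).launch(share=False)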
audio_out_votes.json ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ transformers==4.45.2
+ transformers-stream-generator==0.0.5
+ accelerate>=0.26.0
+ peft
+ gradio==5.8.0
+ tinydb==4.8.0
+ xxhash==3.4.1
+ google-ai-generativelanguage==0.6.10
+ google-generativeai
+ datasets==2.18.0
+ librosa==0.10.1
+ soundfile==0.12.1
+ openai==1.52.0
+ python-dotenv==1.0.1
+ httpx==0.27.2
talk_arena/.env ADDED
@@ -0,0 +1,2 @@
+ OPENAI_API_KEY="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"
+ GEMINI_API_KEY="AIzaSyAM6XTT9S9nzE09jj5o-UNDZ4f8INPyWBM"
talk_arena/__init__.py ADDED
File without changes
talk_arena/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes)

talk_arena/__pycache__/db_utils.cpython-312.pyc ADDED
Binary file (2.62 kB)

talk_arena/__pycache__/streaming_helpers.cpython-312.pyc ADDED
Binary file (18.8 kB)
talk_arena/audio_collection.py ADDED
@@ -0,0 +1,448 @@
+ import argparse
+ import asyncio
+ import os
+ import random
+ import textwrap
+ import time
+
+ import gradio as gr
+ import numpy as np
+ import soundfile as sf
+ import xxhash
+ from datasets import Audio
+ from dotenv import load_dotenv
+ from openai import OpenAI
+
+ import talk_arena.streaming_helpers as sh
+ from talk_arena.db_utils import TinyThreadSafeDB
+
+ load_dotenv()
+ resampler = Audio(sampling_rate=16_000)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Talk Arena Demo")
+     parser.add_argument("--free_only", action="store_true", help="Only use free models")
+     return parser.parse_args()
+
+
+ args = parse_args()
+
+ if gr.NO_RELOAD:  # Prevents re-init during hot reloading
+     # Transcription disabled for the public interface
+     # asr_pipe = pipeline(
+     #     task="automatic-speech-recognition",
+     #     model="openai/whisper-large-v3-turbo",
+     #     chunk_length_s=30,
+     #     device="cuda:1",
+     # )
+
+     anonymous = True
+
+     gpt4o_audio, gpt4o_model = sh.gpt4o_streaming("models/gpt4o")
+     gemini2_audio, gemini2_model = sh.gemini_streaming("models/gemini-2.0-flash-exp")
+     competitor_info = [
+         (sh.gradio_gen_factory(gpt4o_audio, "GPT4o", anonymous), "gpt4o", "GPT-4o"),
+         (sh.gradio_gen_factory(gemini2_audio, "Gemini 2 Flash", anonymous), "gemini_2f", "Gemini 2 Flash"),
+     ]
+
+     resp_generators = [generator for generator, _, _ in competitor_info]
+     model_shorthand = [shorthand for _, shorthand, _ in competitor_info]
+     model_name = [full_name for _, _, full_name in competitor_info]
+     all_models = list(range(len(model_shorthand)))
+
+
+ async def pairwise_response_async(audio_input, state, model_order):
+     if audio_input is None:
+         raise StopAsyncIteration(
+             "",
+             "",
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             state,
+             audio_input,
+             None,
+             None,
+             None,
+         )
+     spinner_id = 0
+     spinners = ["◐ ", "◓ ", "◑", "◒"]
+     spinner = spinners[0]
+     gen_pair = [resp_generators[model_order[0]], resp_generators[model_order[1]]]
+     latencies = [{}, {}]  # Store timing info for each model
+     resps = [gr.Textbox(value="", info="", visible=False), gr.Textbox(value="", info="", visible=False)]
+     tts_resps = [gr.Audio(), gr.Audio()]
+     error_in_model = False
+     for order, generator in enumerate(gen_pair):
+         start_time = time.time()
+         first_token = True
+         total_length = 0
+         try:
+             async for local_resp in generator(audio_input, order):
+                 total_length += 1
+                 if first_token:
+                     latencies[order]["time_to_first_token"] = time.time() - start_time
+                     first_token = False
+                 resps[order] = local_resp
+                 spinner = spinners[spinner_id]
+                 spinner_id = (spinner_id + 1) % 4
+                 yield (
+                     gr.Button(
+                         value=spinner + " Generating Responses " + spinner,
+                         interactive=False,
+                         variant="primary",
+                     ),
+                     resps[0],
+                     resps[1],
+                     tts_resps[0],
+                     tts_resps[1],
+                     gr.Button(visible=False),
+                     gr.Button(visible=False),
+                     gr.Button(visible=False),
+                     state,
+                     audio_input,
+                     None,
+                     None,
+                     latencies,
+                 )
+             latencies[order]["total_time"] = time.time() - start_time
+             latencies[order]["response_length"] = total_length
+         except Exception:
+             error_in_model = True
+             resps[order] = gr.Textbox(
+                 info=f"<strong>Error thrown by Model {order+1} API</strong>",
+                 value="" if first_token else resps[order]._constructor_args[0]["value"],
+                 visible=True,
+                 label=f"Model {order+1}",
+             )
+             yield (
+                 gr.Button(
+                     value=spinner + " Generating Responses " + spinner,
+                     interactive=False,
+                     variant="primary",
+                 ),
+                 resps[0],
+                 resps[1],
+                 tts_resps[0],
+                 tts_resps[1],
+                 gr.Button(visible=False),
+                 gr.Button(visible=False),
+                 gr.Button(visible=False),
+                 state,
+                 audio_input,
+                 None,
+                 None,
+                 latencies,
+             )
+
+         # Resample the prompt audio, cache it to disk, and synthesize speech
+         # for this model's text response via the OpenAI TTS endpoint.
+         sr, y = audio_input
+         x = xxhash.xxh32(bytes(y)).hexdigest()
+         y = y.astype(np.float32)
+         y /= np.max(np.abs(y))
+         a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
+         sf.write(f"{x}_resp{order}.wav", a["array"], a["sampling_rate"], format="wav")
+         tts_options = {
+             "model": "gpt-4o-mini-tts",
+             "voice": "alloy",
+             "input": resps[order].__dict__["_constructor_args"][0]["value"],
+             "response_format": "wav",
+         }
+         abytes = OpenAI(api_key=os.environ["OPENAI_API_KEY"]).audio.speech.create(**tts_options).content
+         tts_resps[order] = gr.Audio(
+             value=abytes,
+             visible=True,
+         )
+         latencies[order]["total_time"] = time.time() - start_time
+         latencies[order]["response_length"] = total_length
+     print(latencies)
+     yield (
+         gr.Button(value="Vote for which model is better!", interactive=False, variant="primary", visible=False),
+         resps[0],
+         resps[1],
+         tts_resps[0],
+         tts_resps[1],
+         gr.Button(visible=not error_in_model),
+         gr.Button(visible=not error_in_model),
+         gr.Button(visible=not error_in_model),
+         responses_complete(state),
+         audio_input,
+         gr.Textbox(visible=False),
+         gr.Audio(visible=False),
+         latencies,
+     )
+
+
+ def on_page_load(state, model_order):
+     if state == 0:
+         # gr.Info(
+         #     "Record something you'd say to an AI Assistant! Think about what you usually use Siri, Google Assistant,"
+         #     " or ChatGPT for."
+         # )
+         state = 1
+         model_order = random.sample(all_models, 2) if anonymous else model_order
+     return state, model_order
+
+
+ def recording_complete(state):
+     if state == 1:
+         # gr.Info(
+         #     "Once you submit your recording, you'll receive responses from different models. This might take a second."
+         # )
+         state = 2
+     return (
+         gr.Button(value="Starting Generation", interactive=False, variant="primary"),
+         state,
+     )
+
+
+ def responses_complete(state):
+     if state == 2:
+         gr.Info(
+             "Give us your feedback! Mark which model gave you the best response so we can understand the quality of"
+             " these different voice assistant models."
+         )
+         state = 3
+     return state
+
+
+ def clear_factory(button_id):
+     async def clear(audio_input, model_order, pref_counter, reasoning, latency):
+         textbox1 = gr.Textbox(visible=False)
+         textbox2 = gr.Textbox(visible=False)
+         if button_id is not None:
+             sr, y = audio_input
+             x = xxhash.xxh32(bytes(y)).hexdigest()
+             await db.insert(
+                 {
+                     "audio_hash": x,
+                     "outcome": button_id,
+                     "model_a": model_shorthand[model_order[0]],
+                     "model_b": model_shorthand[model_order[1]],
+                     "why": reasoning,
+                     "model_a_latency": latency[0],
+                     "model_b_latency": latency[1],
+                 }
+             )
+             pref_counter += 1
+             model_a = model_name[model_order[0]]
+             model_b = model_name[model_order[1]]
+
+         counter_text = f"# {pref_counter}/10 Preferences Submitted"
+         if pref_counter >= 10:
+             code = "C1ARB3D6"
+             counter_text = f"# Completed! Completion Code: {code}"
+         if anonymous:
+             model_order = random.sample(all_models, 2)
+         return (
+             model_order,
+             gr.Button(
+                 value="Record Audio to Submit Again!",
+                 interactive=False,
+                 visible=True,
+             ),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             None,
+             textbox1,
+             textbox2,
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             pref_counter,
+             counter_text,
+             gr.Textbox(visible=False),
+             gr.Audio(visible=False),
+         )
+
+     return clear
+
+
+ def transcribe(transc, voice_reason):
+     # Relies on asr_pipe, which is disabled above; the call site below is commented out accordingly.
+     if transc is None:
+         transc = ""
+     transc += " " + asr_pipe(voice_reason, generate_kwargs={"task": "transcribe"}, return_timestamps=False)["text"]
+     return transc, gr.Audio(value=None)
+
+
+ theme = gr.themes.Soft(
+     primary_hue=gr.themes.Color(
+         c100="#82000019",
+         c200="#82000033",
+         c300="#8200004c",
+         c400="#82000066",
+         c50="#8200007f",
+         c500="#8200007f",
+         c600="#82000099",
+         c700="#820000b2",
+         c800="#820000cc",
+         c900="#820000e5",
+         c950="#820000f2",
+     ),
+     secondary_hue="rose",
+     neutral_hue="stone",
+ )
+
+ with open("../src/talk_arena/styles.css", "r") as css_file:
+     custom_css = css_file.read()
+
+ db = TinyThreadSafeDB("audio_out_votes.json")
+
+ with gr.Blocks(theme=theme, fill_height=True, css=custom_css) as demo:
+     submitted_preferences = gr.State(0)
+     state = gr.State(0)
+     model_order = gr.State([])
+     latency = gr.State([])
+     with gr.Row():
+         counter_text = gr.Markdown(
+             "# 0/10 Preferences Submitted.\n Follow the pop-up tips to submit your first preference."
+         )
+         category_description_text = gr.Markdown("PLACEHOLDER FOR ALI TO FILL IN LATER")
+     with gr.Row():
+         audio_input = gr.Audio(sources=["microphone"], streaming=False, label="Audio Input")
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=1):
+             out1 = gr.Textbox(visible=False, lines=5, autoscroll=True)
+             audio_out1 = gr.Audio(visible=False)
+         with gr.Column(scale=1):
+             out2 = gr.Textbox(visible=False, lines=5, autoscroll=True)
+             audio_out2 = gr.Audio(visible=False)
+
+     with gr.Row():
+         btn = gr.Button(value="Record Audio to Submit!", interactive=False)
+
+     with gr.Row(equal_height=True):
+         reason = gr.Textbox(label="[Optional] Explain Your Preferences", visible=False, scale=4)
+         reason_record = gr.Audio(
+             sources=["microphone"],
+             interactive=True,
+             streaming=False,
+             label="Speak to transcribe!",
+             visible=False,
+             type="filepath",
+             # waveform_options={"show_recording_waveform": False},
+             scale=1,
+         )
+
+     with gr.Row():
+         best1 = gr.Button(value="Model 1 is better", visible=False)
+         tie = gr.Button(value="Tie", visible=False)
+         best2 = gr.Button(value="Model 2 is better", visible=False)
+
+     with gr.Row():
+         contact = gr.Markdown("")
+
+     # reason_record.stop_recording(transcribe, inputs=[reason, reason_record], outputs=[reason, reason_record])
+     audio_input.stop_recording(
+         recording_complete,
+         [state],
+         [btn, state],
+     ).then(
+         fn=pairwise_response_async,
+         inputs=[audio_input, state, model_order],
+         outputs=[
+             btn,
+             out1,
+             out2,
+             audio_out1,
+             audio_out2,
+             best1,
+             best2,
+             tie,
+             state,
+             audio_input,
+             reason,
+             reason_record,
+             latency,
+         ],
+     )
+     audio_input.start_recording(
+         lambda: gr.Button(value="Uploading Audio to Cloud", interactive=False, variant="primary"),
+         None,
+         btn,
+     )
+     best1.click(
+         fn=clear_factory(0),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             audio_out1,
+             audio_out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     tie.click(
+         fn=clear_factory(0.5),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             audio_out1,
+             audio_out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     best2.click(
+         fn=clear_factory(1),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             audio_out1,
+             audio_out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     audio_input.clear(
+         clear_factory(None),
+         [audio_input, model_order, submitted_preferences, reason, latency],
+         [
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             audio_out1,
+             audio_out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     demo.load(fn=on_page_load, inputs=[state, model_order], outputs=[state, model_order])
+
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=40, api_open=False).launch(share=True, ssr_mode=False)
talk_arena/db_utils.py ADDED
@@ -0,0 +1,37 @@
+ import uuid
+ from asyncio import Lock as ALock
+ from contextlib import asynccontextmanager
+ from threading import Lock as TLock
+
+ from tinydb import TinyDB
+ from tinydb.table import Table as TinyDBTable
+
+
+ class UUIDTable(TinyDBTable):
+     # Use random UUIDs instead of TinyDB's auto-incrementing integer IDs.
+     document_id_class = uuid.UUID
+
+     def _get_next_id(self):
+         return uuid.uuid4()
+
+
+ class UUIDB(TinyDB):
+     table_class = UUIDTable
+
+
+ class TinyThreadSafeDB:
+     def __init__(self, db_path: str):
+         self.db = UUIDB(db_path)
+         self._lock1 = TLock()  # serializes access across threads
+         self._lock2 = ALock()  # serializes access across coroutines
+
+     @asynccontextmanager
+     async def atomic_operation(self):
+         """Context manager for thread-safe database operations"""
+         with self._lock1:
+             async with self._lock2:
+                 yield self.db
+
+     async def insert(self, data: dict):
+         """Thread-safe insertion of preference data"""
+         async with self.atomic_operation() as db:
+             db.insert(data)
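A minimal usage sketch for TinyThreadSafeDB; the file name and payload below are illustrative, but audio_collection.py and demo.py call insert() the same way:

# hypothetical usage — not part of the commit
import asyncio

from talk_arena.db_utils import TinyThreadSafeDB

db = TinyThreadSafeDB("example_votes.json")


async def main():
    # insert() acquires both the threading lock and the asyncio lock, so
    # concurrent coroutines and Gradio worker threads serialize their writes.
    await db.insert({"audio_hash": "deadbeef", "outcome": 0.5})


asyncio.run(main())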
talk_arena/demo.py ADDED
@@ -0,0 +1,432 @@
+ import argparse
+ import asyncio
+ import os
+ import random
+ import textwrap
+ import time
+
+ import gradio as gr
+ import xxhash
+ from dotenv import load_dotenv
+ from transformers import pipeline
+
+ import talk_arena.streaming_helpers as sh
+ from talk_arena.db_utils import TinyThreadSafeDB
+
+
+ load_dotenv()
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Talk Arena Demo")
+     parser.add_argument("--free_only", action="store_true", help="Only use free models")
+     return parser.parse_args()
+
+
+ args = parse_args()
+
+ if gr.NO_RELOAD:  # Prevents re-init during hot reloading
+     # Transcription disabled for the public interface
+     # asr_pipe = pipeline(
+     #     task="automatic-speech-recognition",
+     #     model="openai/whisper-large-v3-turbo",
+     #     chunk_length_s=30,
+     #     device="cuda:1",
+     # )
+
+     anonymous = True
+
+     # Generation setup
+     diva_audio, diva = sh.api_streaming("WillHeld/DiVA-llama-3-v0-8b")
+     qwen2_audio, qwen2 = sh.api_streaming("Qwen/Qwen2-Audio-7B-Instruct")
+     pipelined_system, pipeline_model = sh.api_streaming("pipeline/meta-llama/Meta-Llama-3-8B-Instruct")
+     if not args.free_only:
+         gemini_audio, gemini_model = sh.gemini_streaming("models/gemini-1.5-flash")
+         gpt4o_audio, gpt4o_model = sh.gpt4o_streaming("models/gpt4o")
+         geminip_audio, geminip_model = sh.gemini_streaming("models/gemini-1.5-pro")
+         gemini2_audio, gemini2_model = sh.gemini_streaming("models/gemini-2.0-flash-exp")
+     typhoon_audio, typhoon_model = sh.api_streaming("scb10x/llama-3-typhoon-audio-8b-2411")
+
+     competitor_info = [
+         (sh.gradio_gen_factory(diva_audio, "DiVA Llama 3 8B", anonymous), "diva_3_8b", "DiVA Llama 3 8B"),
+         (sh.gradio_gen_factory(qwen2_audio, "Qwen 2", anonymous), "qwen2", "Qwen 2 Audio"),
+         (
+             sh.gradio_gen_factory(pipelined_system, "Pipelined Llama 3 8B", anonymous),
+             "pipe_l3.0",
+             "Pipelined Llama 3 8B",
+         ),
+         (sh.gradio_gen_factory(typhoon_audio, "Typhoon Audio", anonymous), "typhoon_audio", "Typhoon Audio"),
+     ]
+     # Add paid models if flag is not set
+     if not args.free_only:
+         competitor_info += [
+             (sh.gradio_gen_factory(gemini_audio, "Gemini 1.5 Flash", anonymous), "gemini_1.5f", "Gemini 1.5 Flash"),
+             (sh.gradio_gen_factory(gpt4o_audio, "GPT4o", anonymous), "gpt4o", "GPT-4o"),
+             (sh.gradio_gen_factory(geminip_audio, "Gemini 1.5 Pro", anonymous), "gemini_1.5p", "Gemini 1.5 Pro"),
+             (sh.gradio_gen_factory(gemini2_audio, "Gemini 2 Flash", anonymous), "gemini_2f", "Gemini 2 Flash"),
+         ]
+
+     resp_generators = [generator for generator, _, _ in competitor_info]
+     model_shorthand = [shorthand for _, shorthand, _ in competitor_info]
+     model_name = [full_name for _, _, full_name in competitor_info]
+     all_models = list(range(len(model_shorthand)))
+
+
+ async def pairwise_response_async(audio_input, state, model_order):
+     if audio_input is None:
+         raise StopAsyncIteration(
+             "",
+             "",
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             state,
+             audio_input,
+             None,
+             None,
+             None,
+         )
+     spinner_id = 0
+     spinners = ["◐ ", "◓ ", "◑", "◒"]
+     spinner = spinners[0]
+     gen_pair = [resp_generators[model_order[0]], resp_generators[model_order[1]]]
+     latencies = [{}, {}]  # Store timing info for each model
+     resps = [gr.Textbox(value="", info="", visible=False), gr.Textbox(value="", info="", visible=False)]
+
+     error_in_model = False
+     for order, generator in enumerate(gen_pair):
+         start_time = time.time()
+         first_token = True
+         total_length = 0
+         try:
+             async for local_resp in generator(audio_input, order):
+                 total_length += 1
+                 if first_token:
+                     latencies[order]["time_to_first_token"] = time.time() - start_time
+                     first_token = False
+                 resps[order] = local_resp
+                 spinner = spinners[spinner_id]
+                 spinner_id = (spinner_id + 1) % 4
+                 yield (
+                     gr.Button(
+                         value=spinner + " Generating Responses " + spinner,
+                         interactive=False,
+                         variant="primary",
+                     ),
+                     resps[0],
+                     resps[1],
+                     gr.Button(visible=False),
+                     gr.Button(visible=False),
+                     gr.Button(visible=False),
+                     state,
+                     audio_input,
+                     None,
+                     None,
+                     latencies,
+                 )
+         except Exception:
+             error_in_model = True
+             resps[order] = gr.Textbox(
+                 info=f"<strong>Error thrown by Model {order+1} API</strong>",
+                 value="" if first_token else resps[order]._constructor_args[0]["value"],
+                 visible=True,
+                 label=f"Model {order+1}",
+             )
+             yield (
+                 gr.Button(
+                     value=spinner + " Generating Responses " + spinner,
+                     interactive=False,
+                     variant="primary",
+                 ),
+                 resps[0],
+                 resps[1],
+                 gr.Button(visible=False),
+                 gr.Button(visible=False),
+                 gr.Button(visible=False),
+                 state,
+                 audio_input,
+                 None,
+                 None,
+                 latencies,
+             )
+         latencies[order]["total_time"] = time.time() - start_time
+         latencies[order]["response_length"] = total_length
+     print(latencies)
+     yield (
+         gr.Button(value="Vote for which model is better!", interactive=False, variant="primary", visible=False),
+         resps[0],
+         resps[1],
+         gr.Button(visible=not error_in_model),
+         gr.Button(visible=not error_in_model),
+         gr.Button(visible=not error_in_model),
+         responses_complete(state),
+         audio_input,
+         gr.Textbox(visible=False),
+         gr.Audio(visible=False),
+         latencies,
+     )
+
+
+ def on_page_load(state, model_order):
+     if state == 0:
+         # gr.Info(
+         #     "Record something you'd say to an AI Assistant! Think about what you usually use Siri, Google Assistant,"
+         #     " or ChatGPT for."
+         # )
+         state = 1
+         model_order = random.sample(all_models, 2) if anonymous else model_order
+     return state, model_order
+
+
+ def recording_complete(state):
+     if state == 1:
+         # gr.Info(
+         #     "Once you submit your recording, you'll receive responses from different models. This might take a second."
+         # )
+         state = 2
+     return (
+         gr.Button(value="Starting Generation", interactive=False, variant="primary"),
+         state,
+     )
+
+
+ def responses_complete(state):
+     if state == 2:
+         gr.Info(
+             "Give us your feedback! Mark which model gave you the best response so we can understand the quality of"
+             " these different voice assistant models."
+         )
+         state = 3
+     return state
+
+
+ def clear_factory(button_id):
+     async def clear(audio_input, model_order, pref_counter, reasoning, latency):
+         textbox1 = gr.Textbox(visible=False)
+         textbox2 = gr.Textbox(visible=False)
+         if button_id is not None:
+             sr, y = audio_input
+             x = xxhash.xxh32(bytes(y)).hexdigest()
+             await db.insert(
+                 {
+                     "audio_hash": x,
+                     "outcome": button_id,
+                     "model_a": model_shorthand[model_order[0]],
+                     "model_b": model_shorthand[model_order[1]],
+                     "why": reasoning,
+                     "model_a_latency": latency[0],
+                     "model_b_latency": latency[1],
+                 }
+             )
+             pref_counter += 1
+             model_a = model_name[model_order[0]]
+             model_b = model_name[model_order[1]]
+             textbox1 = gr.Textbox(
+                 visible=True,
+                 info=f"<strong style='color: #53565A'>Response from {model_a}</strong><p>Time-to-First-Character: {latency[0]['time_to_first_token']:.2f} s, Time Per Character: {latency[0]['total_time']/latency[0]['response_length']:.2f} s</p>",
+             )
+             textbox2 = gr.Textbox(
+                 visible=True,
+                 info=f"<strong style='color: #53565A'>Response from {model_b}</strong><p>Time-to-First-Character: {latency[1]['time_to_first_token']:.2f} s, Time Per Character: {latency[1]['total_time']/latency[1]['response_length']:.2f} s</p>",
+             )
+
+         try:
+             sr, y = audio_input
+             x = xxhash.xxh32(bytes(y)).hexdigest()
+             os.remove(f"{x}.wav")
+         except Exception:
+             # File already deleted; this is just a failsafe to ensure data is cleared.
+             pass
+         counter_text = f"# {pref_counter}/10 Preferences Submitted"
+         if pref_counter >= 10 and False:  # Currently disabled; manages Prolific completion
+             code = "PLACEHOLDER"
+             counter_text = f"# Completed! Completion Code: {code}"
+         counter_text = ""
+         if anonymous:
+             model_order = random.sample(all_models, 2)
+         return (
+             model_order,
+             gr.Button(
+                 value="Record Audio to Submit Again!",
+                 interactive=False,
+                 visible=True,
+             ),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             None,
+             textbox1,
+             textbox2,
+             pref_counter,
+             counter_text,
+             gr.Textbox(visible=False),
+             gr.Audio(visible=False),
+         )
+
+     return clear
+
+
+ def transcribe(transc, voice_reason):
+     if transc is None:
+         transc = ""
+     transc += " " + asr_pipe(voice_reason, generate_kwargs={"task": "transcribe"}, return_timestamps=False)["text"]
+     return transc, gr.Audio(value=None)
+
+
+ theme = gr.themes.Soft(
+     primary_hue=gr.themes.Color(
+         c100="#82000019",
+         c200="#82000033",
+         c300="#8200004c",
+         c400="#82000066",
+         c50="#8200007f",
+         c500="#8200007f",
+         c600="#82000099",
+         c700="#820000b2",
+         c800="#820000cc",
+         c900="#820000e5",
+         c950="#820000f2",
+     ),
+     secondary_hue="rose",
+     neutral_hue="stone",
+ )
+
+ with open("src/talk_arena/styles.css", "r") as css_file:
+     custom_css = css_file.read()
+
+ db = TinyThreadSafeDB("live_votes.json")
+
+ with gr.Blocks(theme=theme, fill_height=True, css=custom_css) as demo:
+     submitted_preferences = gr.State(0)
+     state = gr.State(0)
+     model_order = gr.State([])
+     latency = gr.State([])
+     with gr.Row():
+         counter_text = gr.Markdown(
+             ""
+         )  # "# 0/10 Preferences Submitted.\n Follow the pop-up tips to submit your first preference."
+     with gr.Row():
+         audio_input = gr.Audio(sources=["microphone"], streaming=False, label="Audio Input")
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=1):
+             out1 = gr.Textbox(visible=False, lines=5, autoscroll=True)
+         with gr.Column(scale=1):
+             out2 = gr.Textbox(visible=False, lines=5, autoscroll=True)
+
+     with gr.Row():
+         btn = gr.Button(value="Record Audio to Submit!", interactive=False)
+
+     with gr.Row(equal_height=True):
+         reason = gr.Textbox(label="[Optional] Explain Your Preferences", visible=False, scale=4)
+         reason_record = gr.Audio(
+             sources=["microphone"],
+             interactive=True,
+             streaming=False,
+             label="Speak to transcribe!",
+             visible=False,
+             type="filepath",
+             # waveform_options={"show_recording_waveform": False},
+             scale=1,
+         )
+
+     with gr.Row():
+         best1 = gr.Button(value="Model 1 is better", visible=False)
+         tie = gr.Button(value="Tie", visible=False)
+         best2 = gr.Button(value="Model 2 is better", visible=False)
+
+     with gr.Row():
+         contact = gr.Markdown("")
+
+     # reason_record.stop_recording(transcribe, inputs=[reason, reason_record], outputs=[reason, reason_record])
+     audio_input.stop_recording(
+         recording_complete,
+         [state],
+         [btn, state],
+     ).then(
+         fn=pairwise_response_async,
+         inputs=[audio_input, state, model_order],
+         outputs=[btn, out1, out2, best1, best2, tie, state, audio_input, reason, reason_record, latency],
+     )
+     audio_input.start_recording(
+         lambda: gr.Button(value="Uploading Audio to Cloud", interactive=False, variant="primary"),
+         None,
+         btn,
+     )
+     best1.click(
+         fn=clear_factory(0),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     tie.click(
+         fn=clear_factory(0.5),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     best2.click(
+         fn=clear_factory(1),
+         inputs=[audio_input, model_order, submitted_preferences, reason, latency],
+         outputs=[
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     audio_input.clear(
+         clear_factory(None),
+         [audio_input, model_order, submitted_preferences, reason, latency],
+         [
+             model_order,
+             btn,
+             best1,
+             best2,
+             tie,
+             audio_input,
+             out1,
+             out2,
+             submitted_preferences,
+             counter_text,
+             reason,
+             reason_record,
+         ],
+     )
+     demo.load(fn=on_page_load, inputs=[state, model_order], outputs=[state, model_order])
+
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=40, api_open=False).launch(share=True, ssr_mode=False)
talk_arena/leaderboard_viz.py ADDED
@@ -0,0 +1,463 @@
+ import json
+ import random
+ import textwrap
+ from collections import defaultdict
+ from datetime import datetime
+ from typing import Dict, List, Tuple
+ from zoneinfo import ZoneInfo
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ import plotly.io as pio
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from scipy.optimize import minimize
+ from scipy.special import expit
+
+
+ # Constants
+ COLORS = [
+     "#1B7FFF",
+     "#F07D1A",
+     "#BA24C7",
+     "#FE42C7",
+     "#0D4B7C",
+     "#0EAC96",
+     "#AA7CFF",
+     "#B50550",
+     "#009EEB",
+     "#220B55",
+     "#7B3301",
+ ]
+ WR_PLOT = None
+ BT_PLOT = None
+ UPDATE_TIME = None
+ NAME_MAPPING = {
+     "gemini_2f": "Gemini 2.0 Flash (Experimental)",
+     "diva_3_8b": "DiVA Llama 3 8B",
+     "qwen2": "Qwen 2 Audio",
+     "pipe_l3.0": "Pipelined Llama 3 8B",
+     "gemini_1.5f": "Gemini 1.5 Flash",
+     "gpt4o": "GPT-4o",
+     "gemini_1.5p": "Gemini 1.5 Pro",
+     "typhoon_audio": "Typhoon Audio",
+ }
+
+
+ def get_aesthetic_timestamp():
+     """
+     Returns a beautifully formatted timestamp in the format:
+     'Tuesday, December 10th, 2024 at 3:45 PM'
+     """
+     # Get timezone object for PST
+     pst = ZoneInfo("America/Los_Angeles")
+
+     # Get current time in PST
+     now = datetime.now(pst)
+
+     # Add suffix to day number (1st, 2nd, 3rd, etc.)
+     day = now.day
+     if 4 <= day <= 20 or 24 <= day <= 30:
+         suffix = "th"
+     else:
+         suffix = ["st", "nd", "rd"][day % 10 - 1]
+     return now.strftime(f"%A, %B {day}{suffix}, %Y at %-I:%M %p")
+
+
+ def bootstrap_ci(data, n_bootstrap=10000, ci=95):
+     """Calculate bootstrap confidence intervals."""
+     bootstrap_samples = []
+     for _ in range(n_bootstrap):
+         bootstrap_samples.append(np.mean(random.choices(data, k=len(data))))
+     lower_bound = np.percentile(bootstrap_samples, (100 - ci) / 2)
+     upper_bound = np.percentile(bootstrap_samples, 100 - (100 - ci) / 2)
+     return lower_bound, upper_bound
+
+
+ def calculate_win_rates(json_data):
+     """Calculate win rates from JSON data."""
+     data = json.loads(json_data)
+
+     model_wins = defaultdict(int)
+     total_matches = defaultdict(int)
+     total_votes = 0
+
+     for value in data["_default"].values():
+         total_votes += 1
+         if value["outcome"] == 0:
+             model_wins[value["model_a"]] += 1
+         elif value["outcome"] == 1:
+             model_wins[value["model_b"]] += 1
+         elif value["outcome"] == 0.5:
+             model_wins[value["model_a"]] += 0.5
+             model_wins[value["model_b"]] += 0.5
+         total_matches[value["model_a"]] += 1
+         total_matches[value["model_b"]] += 1
+
+     per_model_wins = {}
+     for model, wins in model_wins.items():
+         win_rate = wins / total_matches[model]
+         wins_data = [1] * int(wins) + [0] * int(total_matches[model] - wins)
+         if int(wins) != wins:
+             wins_data += [0.5]
+         lower, upper = bootstrap_ci(wins_data)
+         per_model_wins[model] = {
+             "model": model,
+             "win_rate": win_rate,
+             "95_lower": (win_rate - lower),
+             "95_upper": (upper - win_rate),
+         }
+     df = pd.DataFrame.from_dict(per_model_wins).T
+
+     return df, total_votes
+
+
+ def create_win_rate_plot(wins_df):
+     """Create win rate plot using Plotly."""
+     wins_df["Source"] = wins_df["Source"].astype(str)
+     wins_df = wins_df.sort_values(by=["Source", "win_rate"], ascending=False)
+     wins_df["model"] = wins_df["model"].apply(lambda x: NAME_MAPPING.get(x, x))
+
+     fig = px.bar(
+         wins_df,
+         x="model",
+         y="win_rate",
+         error_y="95_upper",
+         error_y_minus="95_lower",
+         color="model",
+         color_discrete_sequence=COLORS,
+         animation_group="model",
+         animation_frame="Source",
+     )
+
+     fig.update_traces(
+         hovertemplate="<b>%{x}</b><br>" + "Win Rate: %{y}" + "<extra></extra>",
+     )
+
+     fig.update_layout(
+         autosize=True,
+         showlegend=False,
+         plot_bgcolor="white",
+         title={
+             "text": "Talk Arena Live Win Rates<br>with 95% Confidence Intervals",
+             "y": 0.95,
+             "x": 0.5,
+             "xanchor": "center",
+             "yanchor": "top",
+         },
+         xaxis_title="Model",
+         yaxis_title="Win Rate (%)",
+         bargap=0.2,
+         yaxis=dict(
+             tickformat=".0%", tickmode="auto", range=[0, 1.01], gridcolor="#C9CCD1", griddash="dash", gridwidth=2
+         ),
+         legend=dict(
+             orientation="h",  # Make legend horizontal
+             yanchor="bottom",
+             y=-0.5,  # Position below plot
+             xanchor="center",
+             x=0.5,  # Center horizontally
+             bgcolor="rgba(255, 255, 255, 0.8)",
+             bordercolor="#C9CCD1",
+             borderwidth=1,
+         ),
+         margin=dict(l=10, r=10, t=0, b=10),  # Balanced margins
+         hoverlabel=dict(bgcolor="white", font_size=14, bordercolor="gray"),
+     )
+
+     fig.update_xaxes(showgrid=False)
+
+     return fig
+
+
+ # Bradley-Terry Model Functions
+ def load_live_votes(json_str: str) -> pd.DataFrame:
+     """Load and preprocess live votes data from JSON string."""
+     data = json.loads(json_str)
+     df = pd.DataFrame.from_dict(data["_default"], orient="index")
+     df["winner"] = df["outcome"].map({1: "model_b", 0: "model_a", 0.5: "tie"})
+     return df
+
+
+ def preprocess_for_bt(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str], np.ndarray]:
+     """Preprocess data for Bradley-Terry model fitting."""
+     all_models = pd.concat([df["model_a"], df["model_b"]]).unique()
+     model_to_idx = {model: idx for idx, model in enumerate(all_models)}
+
+     matchups = np.array([[model_to_idx[row.model_a], model_to_idx[row.model_b]] for _, row in df.iterrows()])
+
+     outcomes = np.array(
+         [1.0 if row.winner == "model_a" else (0.5 if row.winner == "tie" else 0.0) for _, row in df.iterrows()]
+     )
+
+     unique_matches = np.column_stack([matchups, outcomes])
+     unique_matches, weights = np.unique(unique_matches, return_counts=True, axis=0)
+
+     return (unique_matches[:, :2].astype(np.int32), unique_matches[:, 2], list(all_models), weights.astype(np.float64))
+
+
+ def bt_loss_and_grad(
+     ratings: np.ndarray, matchups: np.ndarray, outcomes: np.ndarray, weights: np.ndarray, alpha: float = 1.0
+ ) -> Tuple[float, np.ndarray]:
+     """Compute Bradley-Terry loss and gradient."""
+     matchup_ratings = ratings[matchups]
+     logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
+     probs = expit(logits)
+
+     loss = -((np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes)) * weights).sum()
+
+     matchups_grads = -alpha * (outcomes - probs) * weights
+     model_grad = np.zeros_like(ratings)
+     np.add.at(model_grad, matchups[:, [0, 1]], matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64))
+
+     return loss, model_grad
+
+
+ def fit_bt(
+     matchups: np.ndarray, outcomes: np.ndarray, weights: np.ndarray, n_models: int, alpha: float, tol: float = 1e-6
+ ) -> np.ndarray:
+     """Fit Bradley-Terry model using L-BFGS-B optimization."""
+     initial_ratings = np.zeros(n_models, dtype=np.float64)
+
+     result = minimize(
+         fun=bt_loss_and_grad,
+         x0=initial_ratings,
+         args=(matchups, outcomes, weights, alpha),
+         jac=True,
+         method="L-BFGS-B",
+         options={"disp": False, "maxiter": 100, "gtol": tol},
+     )
+
+     return result["x"]
+
+
+ def scale_and_offset(
+     ratings: np.ndarray, models: List[str], scale: float = 400, init_rating: float = 1000
+ ) -> np.ndarray:
+     """Scale ratings to familiar Elo-like scale."""
+     scaled_ratings = (ratings * scale) + init_rating
+     return scaled_ratings
+
+
+ def compute_bootstrap_bt(
+     data: str,
+     num_round: int = 100,
+     base: float = 10.0,
+     scale: float = 400.0,
+     init_rating: float = 1000.0,
+     tol: float = 1e-6,
+ ) -> pd.DataFrame:
+     """Compute bootstrap Bradley-Terry ratings from live votes data."""
+     df = load_live_votes(data)
+     matchups, outcomes, models, weights = preprocess_for_bt(df)
+
+     rng = np.random.default_rng(seed=0)
+     total_matches = len(df)
+     idxs = rng.multinomial(n=total_matches, pvals=weights / weights.sum(), size=num_round)
+     boot_weights = idxs.astype(np.float64) / total_matches
+
+     ratings_list = []
+     for sample_weights in boot_weights:
+         ratings = fit_bt(
+             matchups=matchups,
+             outcomes=outcomes,
+             weights=sample_weights,
+             n_models=len(models),
+             alpha=np.log(base),
+             tol=tol,
+         )
+         scaled_ratings = scale_and_offset(ratings=ratings, models=models, scale=scale, init_rating=init_rating)
+         ratings_list.append(scaled_ratings)
+
+     df_ratings = pd.DataFrame(ratings_list, columns=models)
+     return df_ratings[df_ratings.median().sort_values(ascending=False).index]
+
+
+ def create_bt_plot(bootstrap_ratings):
+     """Create Bradley-Terry ratings plot using Plotly."""
+     melted_bootstrap = bootstrap_ratings.melt(id_vars=["Source", "level_1"], var_name="Model", value_name="BT")
+     melted_bootstrap = melted_bootstrap.dropna()
+     melted_bootstrap = melted_bootstrap.sort_values(by=["Source", "Model", "BT"], ascending=False)
+     # Pretty Names
+     melted_bootstrap["Model"] = melted_bootstrap["Model"].apply(lambda x: NAME_MAPPING.get(x, x))
+     # Compression for Client Side
+     melted_bootstrap["BT"] = melted_bootstrap["BT"].apply(lambda x: int(x))
+     min_samp = melted_bootstrap[melted_bootstrap["BT"] > 0]["BT"].min()
+     max_samp = melted_bootstrap["BT"].max()
+     idx_keep = list(range(0, len(melted_bootstrap), 10))
+     melted_bootstrap = melted_bootstrap.iloc[idx_keep]
+     melted_bootstrap = melted_bootstrap.sort_values(by=["Source", "BT"], ascending=False)
+     fig = px.violin(
+         melted_bootstrap,
+         x="Model",
+         y="BT",
+         color="Model",
+         animation_group="Model",
+         animation_frame="Source",
+         color_discrete_sequence=COLORS,
+     )
+
+     fig.update_layout(
+         autosize=True,
+         showlegend=False,
+         plot_bgcolor="white",
+         title={
+             "text": "Talk Arena Live Bradley-Terry Ratings<br>with Bootstrapped Variance",
+             "y": 0.9,
+             "x": 0.5,
+             "xanchor": "center",
+             "yanchor": "top",
+         },
+         xaxis_title="Model",
+         yaxis_title="Rating",
+         yaxis=dict(gridcolor="#C9CCD1", range=[min_samp - 10, max_samp + 10], griddash="dash"),
+         legend=dict(
+             orientation="h",  # Make legend horizontal
+             yanchor="bottom",
+             y=-0.5,  # Position below plot
+             xanchor="center",
+             x=0.5,  # Center horizontally
+             bgcolor="rgba(255, 255, 255, 0.8)",
+             bordercolor="#C9CCD1",
+             borderwidth=1,
+         ),
+         margin=dict(l=10, r=10, t=0, b=10),  # Balanced margins
+     )
+
+     fig.update_xaxes(showgrid=False)
+     fig.update_yaxes(showgrid=True, gridwidth=2)
+
+     return fig
+
+
+ def get_wr_plot():
+     jrep = json.loads(pio.to_json(WR_PLOT))
+     for step in jrep["layout"]["sliders"][0]["steps"]:
+         step["args"][1]["frame"]["duration"] = 500
+         step["args"][1]["transition"]["duration"] = 500
+     jrep["layout"]["updatemenus"] = []
+     jrep["layout"]["sliders"][0]["len"] = 0.8
+     jrep["layout"]["sliders"][0]["pad"] = {}
+     return json.dumps(jrep)
+
+
+ def get_bt_plot():
+     jrep = json.loads(pio.to_json(BT_PLOT))
+     for step in jrep["layout"]["sliders"][0]["steps"]:
+         step["args"][1]["frame"]["duration"] = 500
+         step["args"][1]["transition"]["duration"] = 500
+     jrep["layout"]["updatemenus"] = []
+     jrep["layout"]["sliders"][0]["len"] = 0.8
+     jrep["layout"]["sliders"][0]["pad"] = {}
+     return json.dumps(jrep)
+
+
+ def get_update_time():
+     return UPDATE_TIME
+
+
+ def viz_factory(force=False):
+     def process_and_visualize():
+         """Main function to process JSON data and create visualizations."""
+         global WR_PLOT, BT_PLOT, UPDATE_TIME
+         if WR_PLOT is not None and BT_PLOT is not None and not force:
+             return WR_PLOT, BT_PLOT, UPDATE_TIME
+         try:
+             # Read JSON data
+             pub_json_data = open("/home/wheld3/talk-arena/live_votes.json", "r").read()
+             prolific_json_data = open("/home/wheld3/talk-arena/prolific_votes.json", "r").read()
+             merged_json_data = json.dumps(
+                 {"_default": {**json.loads(pub_json_data)["_default"], **json.loads(prolific_json_data)["_default"]}}
+             )
+             # Calculate win rates and create win rate plot
+             pub_win_rates, pub_votes = calculate_win_rates(pub_json_data)
+             pro_win_rates, pro_votes = calculate_win_rates(prolific_json_data)
+             total_win_rates, total_votes = calculate_win_rates(merged_json_data)
+             all_models = total_win_rates["model"].unique()
+             pro_models = pro_win_rates["model"].unique()
+             for model in all_models:
+                 if model not in pro_models:
+                     new_index = len(pro_win_rates)
+                     pro_win_rates.loc[new_index] = [model, -0.1, -0.1, -0.2]
+
+             win_rates = (
+                 pd.concat([pub_win_rates, pro_win_rates, total_win_rates], keys=["Public", "Prolific", "Total"])
+                 .reset_index()
+                 .rename(columns={"level_0": "Source"})
+             )
+             WR_PLOT = create_win_rate_plot(win_rates)
+
+             # Calculate Bradley-Terry ratings and create BT plot
+             pub_bootstrap_ratings = compute_bootstrap_bt(pub_json_data, num_round=10000)
+             pro_bootstrap_ratings = compute_bootstrap_bt(prolific_json_data, num_round=10000)
+             total_bootstrap_ratings = compute_bootstrap_bt(merged_json_data, num_round=10000)
+             for model in all_models:
+                 if model not in pro_models:
+                     pro_bootstrap_ratings[model] = pro_bootstrap_ratings["diva_3_8b"] * -1
+
+             bootstrap_ratings = (
+                 pd.concat(
+                     [pub_bootstrap_ratings, pro_bootstrap_ratings, total_bootstrap_ratings],
+                     keys=["Public", "Prolific", "Total"],
+                 )
+                 .reset_index()
+                 .rename(columns={"level_0": "Source"})
+             )
+             BT_PLOT = create_bt_plot(bootstrap_ratings)
+             UPDATE_TIME = gr.Markdown(
+                 value=textwrap.dedent(
+                     f"""
+                     <h4 class="nx-font-semibold nx-tracking-tight nx-text-slate-900 dark:nx-text-slate-100 nx-text-xl">Last Refresh: {get_aesthetic_timestamp()} PST</h4>
+                     <h6 class="nx-font-semibold nx-tracking-tight nx-text-slate-900 dark:nx-text-slate-100 nx-text-base">Total Votes: {total_votes}, Public Votes: {pub_votes}, Prolific Votes: {pro_votes}</h6>
+                     """
+                 )
+             )
+             return WR_PLOT, BT_PLOT, UPDATE_TIME
+
+         except Exception as e:
+             raise gr.Error(f"Error processing file: {str(e)}")
+
+     return process_and_visualize
+
+
+ theme = gr.themes.Soft(
+     primary_hue=gr.themes.Color(
+         c100="#82000019",
+         c200="#82000033",
+         c300="#8200004c",
+         c400="#82000066",
+         c50="#8200007f",
+         c500="#8200007f",
+         c600="#82000099",
+         c700="#820000b2",
+         c800="#820000cc",
+         c900="#820000e5",
+         c950="#820000f2",
+     ),
+     secondary_hue="rose",
+     neutral_hue="stone",
+ )
+
+ # Create Gradio interface
+ with gr.Blocks(title="Talk Arena Leaderboard Analysis", theme=theme) as demo:
+     viz_factory(force=True)()
+     last_updated = UPDATE_TIME
+     with gr.Row():
+         bt_plot = gr.Plot(label="Bradley-Terry Ratings", value=BT_PLOT)
+     with gr.Row():
+         win_rate_plot = gr.Plot(label="Win Rates", value=WR_PLOT)
+
+     d1 = gr.Textbox(visible=False)
+     demo.load(
+         fn=viz_factory(force=False), inputs=[], outputs=[win_rate_plot, bt_plot, last_updated], show_progress="minimal"
+     )
+     demo.load(fn=get_wr_plot, inputs=[], outputs=[d1])
+     demo.load(fn=get_bt_plot, inputs=[], outputs=[d1])
+     demo.load(fn=get_update_time, inputs=[], outputs=[d1])
+
+ if __name__ == "__main__":
+     scheduler = BackgroundScheduler()
+     scheduler.add_job(func=viz_factory(force=True), trigger="interval", seconds=300)
+     scheduler.start()
+     demo.queue(default_concurrency_limit=10, api_open=True).launch(share=True, server_port=8004, node_port=8005)
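A toy run of the Bradley-Terry pipeline above, on hand-written votes in the same JSON shape that load_live_votes expects; the model names and vote counts are made up for illustration:

# hypothetical toy input — not part of the commit
import json

toy_votes = {
    "_default": {
        "1": {"model_a": "m1", "model_b": "m2", "outcome": 0},    # m1 wins
        "2": {"model_a": "m1", "model_b": "m2", "outcome": 0},    # m1 wins
        "3": {"model_a": "m2", "model_b": "m1", "outcome": 0.5},  # tie
    }
}
ratings = compute_bootstrap_bt(json.dumps(toy_votes), num_round=100)
print(ratings.median())  # m1 should land above the 1000 baseline, m2 below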
talk_arena/streaming_helpers.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import os
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+
8
+ import google.generativeai as genai
9
+ import gradio as gr
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import xxhash
15
+ from datasets import Audio
16
+ from openai import AsyncOpenAI
17
+ from transformers import AutoModel, AutoProcessor, Qwen2AudioForConditionalGeneration, TextIteratorStreamer
18
+ from transformers.generation import GenerationConfig
19
+
20
+
21
+ def _get_prompt_for_model_name(model_id):
22
+ prompt_dict = defaultdict(lambda: "You are a helpful assistant. Respond conversationally to the speech provided.")
23
+ # Requested Overrides
24
+ prompt_dict["scb10x/llama-3-typhoon-audio-8b-2411"] = (
25
+ "You are a helpful assistant. Respond conversationally to the speech provided in the language it is spoken in."
26
+ )
27
+ return prompt_dict[model_id]
28
+
29
+
30
+ def _get_config_for_model_name(model_id):
31
+ if "API_MODEL_CONFIG" in os.environ:
32
+ return json.loads(os.environ["API_MODEL_CONFIG"])[model_id]
33
+ return {
34
+ "pipeline/meta-llama/Meta-Llama-3-8B-Instruct": {"base_url": "http://localhost:8001/v1", "api_key": "empty"},
35
+ "scb10x/llama-3-typhoon-audio-8b-2411": {
36
+ "base_url": "http://localhost:8002/v1",
37
+ "api_key": "empty",
38
+ },
39
+ "WillHeld/DiVA-llama-3-v0-8b": {
40
+ "base_url": "http://localhost:8003/v1",
41
+ "api_key": "empty",
42
+ },
43
+ "Qwen/Qwen2-Audio-7B-Instruct": {
44
+ "base_url": "http://localhost:8004/v1",
45
+ "api_key": "empty",
46
+ },
47
+ }[model_id]
48
+
49
+
50
+ def gradio_gen_factory(streaming_fn, model_name, anonymous):
51
+ async def gen_from(audio_input, order):
52
+ with torch.no_grad():
53
+ prev_resp = ""
54
+ async for resp in streaming_fn(audio_input):
55
+ for char in range(len(prev_resp), len(resp)):
56
+ my_resp = gr.Textbox(
57
+ value=resp[: char + 1],
58
+ info="",
59
+ visible=True,
60
+ label=model_name if not anonymous else f"Model {order+1}",
61
+ elem_classes="lam-response-box",
62
+ )
63
+ yield my_resp
64
+ await asyncio.sleep(0.001)
65
+ prev_resp = resp
66
+
67
+ return gen_from
68
+
69
+
70
+ def gemini_streaming(model_id):
71
+ genai.configure(api_key=os.environ["GEMINI_API_KEY"])
72
+ resampler = Audio(sampling_rate=16_000)
73
+ model = genai.GenerativeModel(model_id)
74
+
75
+ async def get_chat_response(audio_input):
76
+ if audio_input is None:
77
+ raise StopAsyncIteration("")
78
+ sr, y = audio_input
79
+ x = xxhash.xxh32(bytes(y)).hexdigest()
80
+ y = y.astype(np.float32)
81
+ y /= np.max(np.abs(y))
82
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
83
+ sf.write(f"{x}.wav", a["array"], a["sampling_rate"], format="wav")
84
+ prompt = "You are a helpful assistant. Respond conversationally to the speech provided."
85
+ inputs = [prompt, {"mime_type": "audio/wav", "data": Path(f"{x}.wav").read_bytes()}]
86
+ text_response = []
87
+ responses = model.generate_content(inputs, stream=True)
88
+ for chunk in responses:
89
+ text_response.append(chunk.text)
90
+ yield "".join(text_response)
91
+ os.remove(f"{x}.wav")
92
+
93
+ return get_chat_response, model
94
+
95
+
96
+ def gpt4o_streaming(model_id):
97
+ client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
98
+ resampler = Audio(sampling_rate=16_000)
99
+
100
+ async def get_chat_response(audio_input):
101
+ if audio_input is None:
102
+ raise StopAsyncIteration("")
103
+ sr, y = audio_input
104
+ x = xxhash.xxh32(bytes(y)).hexdigest()
105
+ y = y.astype(np.float32)
106
+ y /= np.max(np.abs(y))
107
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
108
+ sf.write(f"{x}.wav", a["array"], a["sampling_rate"], format="wav")
109
+ with open(f"{x}.wav", "rb") as wav_file:
110
+ wav_data = wav_file.read()
111
+ encoded_string = base64.b64encode(wav_data).decode("utf-8")
112
+ prompt = "You are a helpful assistant. Respond conversationally to the speech provided."
113
+ try:
114
+ completion = await client.chat.completions.create(
115
+ model="gpt-4o-audio-preview",
116
+ modalities=["text", "audio"],
117
+ audio={"voice": "alloy", "format": "wav"},
118
+ messages=[
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {"type": "text", "text": prompt},
123
+ {"type": "input_audio", "input_audio": {"data": encoded_string, "format": "wav"}},
124
+ ],
125
+ },
126
+ ],
127
+ )
128
+ os.remove(f"{x}.wav")
129
+ yield completion.choices[0].message.audio.transcript
130
+ except:
131
+ raise StopAsyncIteration("error")
132
+
133
+ return get_chat_response, client
134
+
135
+
136
+ async def llm_streaming(model_id: str, prompt: str):
137
+ if "gpt" in model_id:
138
+ client = AsyncOpenAI()
139
+ else:
140
+ client = AsyncOpenAI(**_get_config_for_model_name(model_id))
141
+ try:
142
+ completion = await client.chat.completions.create(
143
+ model=model_id,
144
+ messages=[
145
+ {"role": "system", "content": "You are helpful assistant."},
146
+ {
147
+ "role": "user",
148
+ "content": prompt,
149
+ },
150
+ ],
151
+ stream=True,
152
+ )
153
+ text_response = []
154
+ async for chunk in completion:
155
+ if len(chunk.choices) > 0:
156
+ text_response.append(chunk.choices[0].delta.content)
157
+ yield "".join(text_response)
158
+ except:
159
+ raise StopAsyncIteration("error")
160
+
161
+
162
+ def asr_streaming(model_id, asr_pipe):
163
+ resampler = Audio(sampling_rate=16_000)
164
+
165
+ async def pipelined(audio_input):
166
+ if audio_input is None:
167
+ raise StopAsyncIteration("")
168
+ sr, y = audio_input
169
+ x = xxhash.xxh32(bytes(y)).hexdigest()
170
+ y = y.astype(np.float32)
171
+ y /= max(np.max(np.abs(y)), 1e-9)  # guard against division by zero on silent (all-zero) input
172
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
173
+ sf.write(f"{x}.wav", a["array"], a["sampling_rate"], format="wav")
174
+ result = await asyncio.to_thread(
175
+ asr_pipe, f"{x}.wav", generate_kwargs={"task": "transcribe"}, return_timestamps=False
176
+ )
+ text = result["text"]  # pass the callable and its arguments to to_thread, not the already-evaluated result
177
+ os.remove(f"{x}.wav")
178
+ async for response in llm_streaming(model_id, prompt=text):
179
+ yield response
180
+
181
+ return pipelined
182
+
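asr_streaming expects an already-built transformers ASR pipeline. One plausible construction, with the Whisper checkpoint and downstream model id as assumptions:

from transformers import pipeline

asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")  # checkpoint is illustrative
respond = asr_streaming("gpt-4o-mini", asr_pipe)  # model id is illustrative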
183
+
184
+ def api_streaming(model_id):
185
+ client = AsyncOpenAI(**_get_config_for_model_name(model_id))
186
+ resampler = Audio(sampling_rate=16_000)
187
+
188
+ async def get_chat_response(audio_input):
189
+ if audio_input is None:
190
+ return  # end the async generator; raising StopAsyncIteration inside one becomes a RuntimeError
191
+ sr, y = audio_input
192
+ x = xxhash.xxh32(bytes(y)).hexdigest()
193
+ y = y.astype(np.float32)
194
+ y /= max(np.max(np.abs(y)), 1e-9)  # guard against division by zero on silent (all-zero) input
195
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
196
+ sf.write(f"{x}.wav", a["array"], a["sampling_rate"], format="wav")
197
+ with open(f"{x}.wav", "rb") as wav_file:
198
+ wav_data = wav_file.read()
199
+ encoded_string = base64.b64encode(wav_data).decode("utf-8")
200
+ try:
201
+ prompt = _get_prompt_for_model_name(model_id)
202
+ completion = await client.chat.completions.create(
203
+ model=model_id,
204
+ messages=[
205
+ {
206
+ "role": "user",
207
+ "content": [
208
+ {"type": "text", "text": prompt},
209
+ {"type": "audio", "audio_url": "data:audio/wav;base64," + encoded_string},
210
+ ],
211
+ },
212
+ ],
213
+ stream=True,
214
+ )
215
+ text_response = []
216
+ async for chunk in completion:
217
+ if len(chunk.choices) > 0:
218
+ text_response.append(chunk.choices[0].delta.content or "")  # delta.content is None on the final chunk
219
+ yield "".join(text_response)
220
+ os.remove(f"{x}.wav")
221
+ except Exception as e:
222
+ print(f"error for {model_id}: {e}")
223
+ if os.path.exists(f"{x}.wav"): os.remove(f"{x}.wav")  # clean up the temp file on the error path too
+ return  # end the async generator; raising StopAsyncIteration inside one becomes a RuntimeError
224
+
225
+ return get_chat_response, client
226
+
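_get_config_for_model_name and _get_prompt_for_model_name are called above but defined elsewhere. A hypothetical shape consistent with how the config helper is splatted into AsyncOpenAI; every name, URL, and environment variable below is illustrative only:

import os

def _get_config_for_model_name(model_id: str) -> dict:
    # Hypothetical: route non-OpenAI models to an OpenAI-compatible endpoint.
    return {
        "base_url": os.environ.get("API_BASE_URL", "http://localhost:8001/v1"),
        "api_key": os.environ.get("API_KEY", "EMPTY"),
    }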
227
+
228
+ # Local Hosting Utilities
229
+
230
+
231
+ def diva_streaming(diva_model_str):
232
+ diva_model = AutoModel.from_pretrained(diva_model_str, trust_remote_code=True, device_map="balanced_low_0")
233
+ resampler = Audio(sampling_rate=16_000)
234
+
235
+ async def diva_audio(audio_input, do_sample=False, temperature=0.001):
236
+ sr, y = audio_input
237
+ y = y.astype(np.float32)
238
+ y /= max(np.max(np.abs(y)), 1e-9)  # guard against division by zero on silent (all-zero) input
239
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
240
+ stream = diva_model.generate_stream(
241
+ a["array"],
242
+ (
243
+ "You are a helpful assistant The user is talking to you with their voice and you are responding with"
244
+ " text."
245
+ ),
246
+ do_sample=do_sample,
247
+ max_new_tokens=256,
248
+ )
249
+ for text in stream:
250
+ yield text
251
+
252
+ return diva_audio, diva_model
253
+
254
+
255
+ def qwen2_streaming(qwen2_model_str):
256
+ resampler = Audio(sampling_rate=16_000)
257
+ qwen2_processor = AutoProcessor.from_pretrained(qwen2_model_str)
258
+ qwen2_model = Qwen2AudioForConditionalGeneration.from_pretrained(qwen2_model_str, device_map="auto")
259
+ qwen2_model.generation_config = GenerationConfig.from_pretrained(
260
+ qwen2_model_str,
261
+ trust_remote_code=True,
262
+ do_sample=False,
263
+ top_k=50,
264
+ top_p=1.0,
265
+ )
266
+
267
+ async def qwen2_audio(audio_input, do_sample=False, temperature=0.001):
268
+ if audio_input is None:
269
+ return  # end the async generator; raising StopAsyncIteration inside one becomes a RuntimeError
270
+ sr, y = audio_input
271
+ x = xxhash.xxh32(bytes(y)).hexdigest()
272
+ y = y.astype(np.float32)
273
+ y /= max(np.max(np.abs(y)), 1e-9)  # guard against division by zero on silent (all-zero) input
274
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
275
+ sf.write(f"{x}.wav", a["array"], a["sampling_rate"], format="wav")
276
+ conversation = [
277
+ {"role": "system", "content": "You are a helpful assistant."},
278
+ {
279
+ "role": "user",
280
+ "content": [
281
+ {
282
+ "type": "audio",
283
+ "audio_url": f"{x}.wav",
284
+ },
285
+ ],
286
+ },
287
+ ]
288
+ text = qwen2_processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
289
+ audios = [librosa.load(f"{x}.wav", sr=qwen2_processor.feature_extractor.sampling_rate)[0]]
290
+ inputs = qwen2_processor(text=text, audios=audios, return_tensors="pt", padding=True)
291
+ streamer = TextIteratorStreamer(qwen2_processor)
292
+ generation_task = asyncio.create_task(asyncio.to_thread(qwen2_model.generate, **inputs, streamer=streamer, max_length=256))  # generate() is blocking and returns tensors, not a coroutine, so run it in a worker thread
293
+
294
+ generated_text = ""
295
+ # TextIteratorStreamer is a blocking sync iterator, not an async one; pull tokens via the executor so the event loop stays responsive
+ stream_iter = iter(streamer)
+ while (new_text := await asyncio.get_running_loop().run_in_executor(None, next, stream_iter, None)) is not None:
296
+ generated_text += new_text
297
+ yield generated_text.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "")
298
+
299
+ await generation_task
300
+ os.remove(f"{x}.wav")
301
+
302
+ return qwen2_audio, qwen2_model
303
+
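For reference, the synchronous TextIteratorStreamer recipe from the transformers documentation that the executor-based loop above adapts; tokenizer, model, and inputs are placeholders:

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256))
thread.start()
for text in streamer:  # blocks until the generation thread produces the next chunk
    print(text, end="")
thread.join()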
304
+
305
+ def typhoon_streaming(typhoon_model_str, device="cuda:0"):
306
+ resampler = Audio(sampling_rate=16_000)
307
+ typhoon_model = AutoModel.from_pretrained(typhoon_model_str, torch_dtype=torch.float16, trust_remote_code=True)
308
+ tokenizer = typhoon_model.llama_tokenizer
309
+ typhoon_model.to(device)
310
+ typhoon_model.eval()
311
+ prompt_pattern = (
312
+ "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<Speech><SpeechHere></Speech>"
313
+ " {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
314
+ )
315
+ prompt = (
316
+ "You are a helpful assistant. Respond conversationally to the speech provided in the language it is spoken in."
317
+ )
318
+
319
+ async def typhoon_audio(audio_input, do_sample=False, temperature=0.001):
320
+ if audio_input is None:
321
+ return  # end the async generator; raising StopAsyncIteration inside one becomes a RuntimeError
322
+ sr, y = audio_input
323
+ x = xxhash.xxh32(bytes(y)).hexdigest()
324
+ y = y.astype(np.float32)
325
+ y /= max(np.max(np.abs(y)), 1e-9)  # guard against division by zero on silent (all-zero) input
326
+ a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
327
+ streamer = TextIteratorStreamer(tokenizer)
328
+ generation_task = asyncio.create_task(asyncio.to_thread(  # generate() blocks; run it in a worker thread
329
+ typhoon_model.generate,
330
+ audio=a["array"],
331
+ prompt=prompt,
332
+ prompt_pattern=prompt_pattern,
333
+ device=device,
334
+ do_sample=False,
335
+ max_length=1200,
336
+ num_beams=1,
337
+ streamer=streamer, # supports TextIteratorStreamer
338
+ )
339
+ )
340
+ generated_text = ""
341
+ # TextIteratorStreamer is a blocking sync iterator, not an async one; pull tokens via the executor so the event loop stays responsive
+ stream_iter = iter(streamer)
+ while (new_text := await asyncio.get_running_loop().run_in_executor(None, next, stream_iter, None)) is not None:
342
+ generated_text += new_text
343
+ yield generated_text.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1].replace(
344
+ "<|eot_id|>", ""
345
+ )
346
+ await generation_task
347
+
348
+ return typhoon_audio, typhoon_model
talk_arena/styles.css ADDED
@@ -0,0 +1,25 @@
1
+ @media (max-width: 768px) {
2
+ .lam-response-box {
3
+ max-height: 230px;
4
+ }
5
+
6
+ .lam-response-box > label {
7
+ max-height: 100%;
8
+ }
9
+
10
+ .lam-response-box > label > div > textarea {
11
+ max-height: 100%;
12
+ height: 100% !important;
13
+ }
14
+ }
15
+
16
+ @media (min-width: 769px) {
17
+ .lam-response-box {
18
+ max-height: 40vh;
19
+ }
20
+
21
+ .lam-response-box > label > div > textarea {
22
+ max-height: calc(40vh - 50px) !important;
23
+ }
24
+ }
25
+
talk_arena/viz/core.py ADDED
@@ -0,0 +1,324 @@
1
+ import json
2
+ import random
3
+ import textwrap
4
+ from collections import defaultdict
5
+ from datetime import datetime
6
+ from typing import Dict, List, Tuple
7
+ from zoneinfo import ZoneInfo
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import plotly.io as pio
13
+ from scipy.optimize import minimize
14
+ from scipy.special import expit
15
+
16
+ # Constants
17
+ COLORS = [
18
+ "#1B7FFF",
19
+ "#F07D1A",
20
+ "#BA24C7",
21
+ "#FE42C7",
22
+ "#0D4B7C",
23
+ "#0EAC96",
24
+ "#AA7CFF",
25
+ "#B50550",
26
+ "#009EEB",
27
+ "#220B55",
28
+ "#7B3301",
29
+ ]
30
+ NAME_MAPPING = {
31
+ "gemini_2f": "Gemini 2.0 (Exp)",
32
+ "diva_3_8b": "DiVA Llama 3 8B",
33
+ "qwen2": "Qwen 2 Audio",
34
+ "pipe_l3.0": "Pipelined Llama 3 8B",
35
+ "gemini_1.5f": "Gemini 1.5 Flash",
36
+ "gpt4o": "GPT-4o",
37
+ "gemini_1.5p": "Gemini 1.5 Pro",
38
+ "typhoon_audio": "Typhoon Audio",
39
+ }
40
+
41
+ def get_aesthetic_timestamp():
42
+ """
43
+ Returns a beautifully formatted timestamp in the format:
44
+ 'Tuesday, December 10th, 2024 at 3:45 PM'
45
+ """
46
+ # Get timezone object for PST
47
+ pst = ZoneInfo("America/Los_Angeles")
48
+
49
+ # Get current time in PST
50
+ now = datetime.now(pst)
51
+
52
+ # Add suffix to day number (1st, 2nd, 3rd, etc.)
53
+ day = now.day
54
+ if 4 <= day <= 20 or 24 <= day <= 30:
55
+ suffix = "th"
56
+ else:
57
+ suffix = ["st", "nd", "rd"][day % 10 - 1]
58
+ return now.strftime(f"%A, %B {day}{suffix}, %Y at %-I:%M %p")  # note: %-I is a glibc extension; not portable to Windows
59
+
60
+
61
+ def bootstrap_ci(data, n_bootstrap=10000, ci=95):
62
+ """Calculate bootstrap confidence intervals."""
63
+ bootstrap_samples = []
64
+ for _ in range(n_bootstrap):
65
+ bootstrap_samples.append(np.mean(random.choices(data, k=len(data))))
66
+ lower_bound = np.percentile(bootstrap_samples, (100 - ci) / 2)
67
+ upper_bound = np.percentile(bootstrap_samples, 100 - (100 - ci) / 2)
68
+ return lower_bound, upper_bound
69
+
70
+
71
+ def calculate_win_rates(json_data):
72
+ """Calculate win rates from JSON data."""
73
+ data = json.loads(json_data)
74
+
75
+ model_wins = defaultdict(int)
76
+ total_matches = defaultdict(int)
77
+ total_votes = 0
78
+
79
+ for value in data["_default"].values():
80
+ total_votes += 1
81
+ if value["outcome"] == 0:
82
+ model_wins[value["model_a"]] += 1
83
+ elif value["outcome"] == 1:
84
+ model_wins[value["model_b"]] += 1
85
+ elif value["outcome"] == 0.5:
86
+ model_wins[value["model_a"]] += 0.5
87
+ model_wins[value["model_b"]] += 0.5
88
+ total_matches[value["model_a"]] += 1
89
+ total_matches[value["model_b"]] += 1
90
+
91
+ per_model_wins = {}
92
+ for model, wins in model_wins.items():
93
+ win_rate = wins / total_matches[model]
94
+ wins_data = [1] * int(wins) + [0] * int(total_matches[model] - wins)
95
+ if int(wins) != wins:
96
+ wins_data += [0.5]
97
+ lower, upper = bootstrap_ci(wins_data)
98
+ per_model_wins[model] = {
99
+ "model": model,
100
+ "win_rate": win_rate,
101
+ "95_lower": (win_rate - lower),
102
+ "95_upper": (upper - win_rate),
103
+ }
104
+ df = pd.DataFrame.from_dict(per_model_wins).T
105
+
106
+ return df, total_votes
107
+
108
+
109
+ def create_win_rate_plot(wins_df):
110
+ """Create win rate plot using Plotly."""
111
+ wins_df["Source"] = wins_df["Source"].astype(str)
112
+ wins_df = wins_df.sort_values(by=["Source", "win_rate"], ascending=False)
113
+ wins_df["model"] = wins_df["model"].apply(lambda x: NAME_MAPPING.get(x, x))
114
+
115
+ fig = px.bar(
116
+ wins_df,
117
+ x="model",
118
+ y="win_rate",
119
+ error_y="95_upper",
120
+ error_y_minus="95_lower",
121
+ color="model",
122
+ color_discrete_sequence=COLORS,
123
+ animation_group="model",
124
+ animation_frame="Source",
125
+ )
126
+
127
+ fig.update_traces(
128
+ hovertemplate="<b>%{x}</b><br>" + "Win Rate: %{y}" + "<extra></extra>",
129
+ )
130
+
131
+ fig.update_layout(
132
+ autosize=True,
133
+ showlegend=False,
134
+ plot_bgcolor="white",
135
+ title={
136
+ "text": "Talk Arena Live Win Rates<br>with 95% Confidence Intervals",
137
+ "y": 0.95,
138
+ "x": 0.5,
139
+ "xanchor": "center",
140
+ "yanchor": "top",
141
+ },
142
+ xaxis_title="Model",
143
+ yaxis_title="Win Rate (%)",
144
+ bargap=0.2,
145
+ yaxis=dict(
146
+ tickformat=".0%", tickmode="auto", range=[0, 1.01], gridcolor="#C9CCD1", griddash="dash", gridwidth=2
147
+ ),
148
+ legend=dict(
149
+ orientation="h", # Make legend horizontal
150
+ yanchor="bottom",
151
+ y=-0.5, # Position below plot
152
+ xanchor="center",
153
+ x=0.5, # Center horizontally
154
+ bgcolor="rgba(255, 255, 255, 0.8)",
155
+ bordercolor="#C9CCD1",
156
+ borderwidth=1,
157
+ ),
158
+ margin=dict(l=10, r=10, t=0, b=10), # Balanced margins
159
+ hoverlabel=dict(bgcolor="white", font_size=14, bordercolor="gray"),
160
+ )
161
+
162
+ fig.update_xaxes(showgrid=False)
163
+
164
+ return fig
165
+
166
+
167
+ # Bradley-Terry Model Functions
168
+ def load_live_votes(json_str: str) -> pd.DataFrame:
169
+ """Load and preprocess live votes data from JSON string."""
170
+ data = json.loads(json_str)
171
+ df = pd.DataFrame.from_dict(data["_default"], orient="index")
172
+ df["winner"] = df["outcome"].map({1: "model_b", 0: "model_a", 0.5: "tie"})
173
+ return df
174
+
175
+
176
+ def preprocess_for_bt(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str], np.ndarray]:
177
+ """Preprocess data for Bradley-Terry model fitting."""
178
+ all_models = pd.concat([df["model_a"], df["model_b"]]).unique()
179
+ model_to_idx = {model: idx for idx, model in enumerate(all_models)}
180
+
181
+ matchups = np.array([[model_to_idx[row.model_a], model_to_idx[row.model_b]] for _, row in df.iterrows()])
182
+
183
+ outcomes = np.array(
184
+ [1.0 if row.winner == "model_a" else (0.5 if row.winner == "tie" else 0.0) for _, row in df.iterrows()]
185
+ )
186
+
187
+ unique_matches = np.column_stack([matchups, outcomes])
188
+ unique_matches, weights = np.unique(unique_matches, return_counts=True, axis=0)
189
+
190
+ return (unique_matches[:, :2].astype(np.int32), unique_matches[:, 2], list(all_models), weights.astype(np.float64))
191
+
192
+
193
+ def bt_loss_and_grad(
194
+ ratings: np.ndarray, matchups: np.ndarray, outcomes: np.ndarray, weights: np.ndarray, alpha: float = 1.0
195
+ ) -> Tuple[float, np.ndarray]:
196
+ """Compute Bradley-Terry loss and gradient."""
197
+ matchup_ratings = ratings[matchups]
198
+ logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
199
+ probs = expit(logits)
200
+
201
+ loss = -((np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes)) * weights).sum()
202
+
203
+ matchups_grads = -alpha * (outcomes - probs) * weights
204
+ model_grad = np.zeros_like(ratings)
205
+ np.add.at(model_grad, matchups[:, [0, 1]], matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64))
206
+
207
+ return loss, model_grad
208
+
209
+
210
+ def fit_bt(
211
+ matchups: np.ndarray, outcomes: np.ndarray, weights: np.ndarray, n_models: int, alpha: float, tol: float = 1e-6
212
+ ) -> np.ndarray:
213
+ """Fit Bradley-Terry model using L-BFGS-B optimization."""
214
+ initial_ratings = np.zeros(n_models, dtype=np.float64)
215
+
216
+ result = minimize(
217
+ fun=bt_loss_and_grad,
218
+ x0=initial_ratings,
219
+ args=(matchups, outcomes, weights, alpha),
220
+ jac=True,
221
+ method="L-BFGS-B",
222
+ options={"disp": False, "maxiter": 100, "gtol": tol},
223
+ )
224
+
225
+ return result["x"]
226
+
227
+
228
+ def scale_and_offset(
229
+ ratings: np.ndarray, models: List[str], scale: float = 400, init_rating: float = 1000
230
+ ) -> np.ndarray:
231
+ """Scale ratings to familiar Elo-like scale."""
232
+ scaled_ratings = (ratings * scale) + init_rating
233
+ return scaled_ratings
234
+
235
+
236
+ def compute_bootstrap_bt(
237
+ data: str,
238
+ num_round: int = 100,
239
+ base: float = 10.0,
240
+ scale: float = 400.0,
241
+ init_rating: float = 1000.0,
242
+ tol: float = 1e-6,
243
+ ) -> pd.DataFrame:
244
+ """Compute bootstrap Bradley-Terry ratings from live votes data."""
245
+ df = load_live_votes(data)
246
+ matchups, outcomes, models, weights = preprocess_for_bt(df)
247
+
248
+ rng = np.random.default_rng(seed=0)
249
+ total_matches = len(df)
250
+ idxs = rng.multinomial(n=total_matches, pvals=weights / weights.sum(), size=num_round)
251
+ boot_weights = idxs.astype(np.float64) / total_matches
252
+
253
+ ratings_list = []
254
+ for sample_weights in boot_weights:
255
+ ratings = fit_bt(
256
+ matchups=matchups,
257
+ outcomes=outcomes,
258
+ weights=sample_weights,
259
+ n_models=len(models),
260
+ alpha=np.log(base),
261
+ tol=tol,
262
+ )
263
+ scaled_ratings = scale_and_offset(ratings=ratings, models=models, scale=scale, init_rating=init_rating)
264
+ ratings_list.append(scaled_ratings)
265
+
266
+ df_ratings = pd.DataFrame(ratings_list, columns=models)
267
+ return df_ratings[df_ratings.median().sort_values(ascending=False).index]
268
+
269
+
270
+ def create_bt_plot(bootstrap_ratings):
271
+ """Create Bradley-Terry ratings plot using Plotly."""
272
+ melted_bootstrap = bootstrap_ratings.melt(id_vars=["Source", "level_1"], var_name="Model", value_name="BT")
273
+ melted_bootstrap = melted_bootstrap.dropna()
274
+ melted_bootstrap = melted_bootstrap.sort_values(by=["Source", "Model", "BT"], ascending=False)
275
+ # Pretty Names
276
+ melted_bootstrap["Model"] = melted_bootstrap["Model"].apply(lambda x: NAME_MAPPING.get(x, x))
277
+ # Compression for Client Side
278
+ melted_bootstrap["BT"] = melted_bootstrap["BT"].apply(lambda x: int(x))
279
+ min_samp = melted_bootstrap[melted_bootstrap["BT"] > 0]["BT"].min()
280
+ max_samp = melted_bootstrap["BT"].max()
281
+ idx_keep = list(range(0, len(melted_bootstrap), 10))
282
+ melted_bootstrap = melted_bootstrap.iloc[idx_keep]
283
+ melted_bootstrap = melted_bootstrap.sort_values(by=["Source", "BT"], ascending=False)
284
+ fig = px.violin(
285
+ melted_bootstrap,
286
+ x="Model",
287
+ y="BT",
288
+ color="Model",
289
+ animation_group="Model",
290
+ animation_frame="Source",
291
+ color_discrete_sequence=COLORS,
292
+ )
293
+
294
+ fig.update_layout(
295
+ autosize=True,
296
+ showlegend=False,
297
+ plot_bgcolor="white",
298
+ title={
299
+ "text": "Talk Arena Live Bradley-Terry Ratings<br>with Bootstrapped Variance",
300
+ "y": 0.9,
301
+ "x": 0.5,
302
+ "xanchor": "center",
303
+ "yanchor": "top",
304
+ },
305
+ xaxis_title="Model",
306
+ yaxis_title="Rating",
307
+ yaxis=dict(gridcolor="#C9CCD1", range=[min_samp - 10, max_samp + 10], griddash="dash"),
308
+ legend=dict(
309
+ orientation="h", # Make legend horizontal
310
+ yanchor="bottom",
311
+ y=-0.5, # Position below plot
312
+ xanchor="center",
313
+ x=0.5, # Center horizontally
314
+ bgcolor="rgba(255, 255, 255, 0.8)",
315
+ bordercolor="#C9CCD1",
316
+ borderwidth=1,
317
+ ),
318
+ margin=dict(l=10, r=10, t=0, b=10), # Balanced margins
319
+ )
320
+
321
+ fig.update_xaxes(showgrid=False)
322
+ fig.update_yaxes(showgrid=True, gridwidth=2)
323
+
324
+ return fig
talk_arena/viz/server.py ADDED
@@ -0,0 +1,323 @@
1
+ import hashlib
2
+ import json
3
+ import textwrap
4
+ import time
5
+ from datetime import datetime
6
+ from typing import Optional
7
+ from zoneinfo import ZoneInfo
8
+
9
+ import plotly.io as pio
10
+ from apscheduler.schedulers.background import BackgroundScheduler
11
+ from fastapi import FastAPI, HTTPException, Response
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+
14
+ from talk_arena.viz.core import *
15
+
16
+ app = FastAPI(title="Talk Arena API", description="API for Talk Arena leaderboard and statistics", version="0.0.1")
17
+
18
+ # Add CORS middleware
19
+ app.add_middleware(
20
+ CORSMiddleware,
21
+ allow_origins=["*"], # In production, replace with specific origins
22
+ allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+
28
+ # Global variables to store the plots and update time
29
+ class GlobalState:
30
+ WR_PLOT = None
31
+ BT_PLOT = None
32
+ UPDATE_TIME = None
33
+ LAST_PROCESSED = None
34
+ MIN_UPDATE_INTERVAL = 60 # Minimum seconds between updates
35
+
36
+
37
+ state = GlobalState()
38
+
39
+
40
+ def process_and_visualize(force: bool = False):
41
+ """Process data and create visualizations"""
42
+ global state
43
+ current_time = datetime.now(ZoneInfo("America/Los_Angeles"))
44
+
45
+ # Check if enough time has passed since last update
46
+ if not force and state.LAST_PROCESSED:
47
+ time_diff = (current_time - state.LAST_PROCESSED).total_seconds()
48
+ if time_diff < state.MIN_UPDATE_INTERVAL:
49
+ logger.info(f"Skipping update - only {time_diff:.1f} seconds since last update")
50
+ return
51
+
52
+ state.LAST_PROCESSED = current_time
53
+ if state.WR_PLOT is not None and state.BT_PLOT is not None and not force:
54
+ return
55
+
56
+ try:
57
+ # Read JSON data
58
+ pub_json_data = open("/home/wheld3/talk-arena/live_votes.json", "r").read()
59
+ prolific_json_data = open("/home/wheld3/talk-arena/prolific_votes.json", "r").read()
60
+ merged_json_data = json.dumps(
61
+ {"_default": {**json.loads(pub_json_data)["_default"], **json.loads(prolific_json_data)["_default"]}}
62
+ )
63
+
64
+ # Calculate win rates and create plots
65
+ pub_win_rates, pub_votes = calculate_win_rates(pub_json_data)
66
+ pro_win_rates, pro_votes = calculate_win_rates(prolific_json_data)
67
+ total_win_rates, total_votes = calculate_win_rates(merged_json_data)
68
+
69
+ # Process win rates
70
+ all_models = total_win_rates["model"].unique()
71
+ pro_models = pro_win_rates["model"].unique()
72
+ for model in all_models:
73
+ if model not in pro_models:
74
+ new_index = len(pro_win_rates)
75
+ pro_win_rates.loc[new_index] = [model, -0.1, -0.1, -0.2]
76
+
77
+ win_rates = (
78
+ pd.concat([pub_win_rates, pro_win_rates, total_win_rates], keys=["Public", "Prolific", "Total"])
79
+ .reset_index()
80
+ .rename(columns={"level_0": "Source"})
81
+ )
82
+
83
+ state.WR_PLOT = create_win_rate_plot(win_rates)
84
+
85
+ # Calculate Bradley-Terry ratings
86
+ pub_bootstrap_ratings = compute_bootstrap_bt(pub_json_data, num_round=10000)
87
+ pro_bootstrap_ratings = compute_bootstrap_bt(prolific_json_data, num_round=10000)
88
+ total_bootstrap_ratings = compute_bootstrap_bt(merged_json_data, num_round=10000)
89
+
90
+ for model in all_models:
91
+ if model not in pro_models:
92
+ pro_bootstrap_ratings[model] = pro_bootstrap_ratings["diva_3_8b"] * -1
93
+
94
+ bootstrap_ratings = (
95
+ pd.concat(
96
+ [pub_bootstrap_ratings, pro_bootstrap_ratings, total_bootstrap_ratings],
97
+ keys=["Public", "Prolific", "Total"],
98
+ )
99
+ .reset_index()
100
+ .rename(columns={"level_0": "Source"})
101
+ )
102
+
103
+ state.BT_PLOT = create_bt_plot(bootstrap_ratings)
104
+
105
+ # Update timestamp and vote counts
106
+ state.UPDATE_TIME = {
107
+ "timestamp": get_aesthetic_timestamp(),
108
+ "total_votes": total_votes,
109
+ "public_votes": pub_votes,
110
+ "prolific_votes": pro_votes,
111
+ }
112
+
113
+ except Exception as e:
114
+ raise HTTPException(status_code=500, detail=f"Error processing data: {str(e)}")
115
+
116
+
117
+ # Set up logging
118
+ import logging
119
+
120
+
121
+ logging.basicConfig(level=logging.INFO)
122
+ logger = logging.getLogger(__name__)
123
+
124
+ # Global scheduler instance
125
+ scheduler = None
126
+
127
+
128
+ def update_job():
129
+ """Wrapper for the update job with error handling and logging"""
130
+ try:
131
+ logger.info("Starting scheduled update...")
132
+ process_and_visualize(force=True)
133
+ logger.info("Scheduled update completed successfully")
134
+ except Exception as e:
135
+ logger.error(f"Error in scheduled update: {str(e)}", exc_info=True)
136
+
137
+
138
+ @app.on_event("startup")
139
+ async def startup_event():
140
+ """Initialize data and start scheduler"""
141
+ global scheduler
142
+
143
+ try:
144
+ logger.info("Starting initial data processing...")
145
+ process_and_visualize(force=True)
146
+ logger.info("Initial data processing completed")
147
+
148
+ # Clear any existing schedulers
149
+ if scheduler:
150
+ scheduler.shutdown(wait=False)
151
+
152
+ # Initialize and start the scheduler
153
+ scheduler = BackgroundScheduler(
154
+ timezone=ZoneInfo("America/Los_Angeles"), job_defaults={"coalesce": True, "max_instances": 1}
155
+ )
156
+
157
+ # Add the job with proper error handling
158
+ scheduler.add_job(
159
+ func=update_job, # Use the wrapper function
160
+ trigger="interval",
161
+ seconds=300,
162
+ id="update_visualizations",
163
+ name="Update Visualizations",
164
+ misfire_grace_time=60,
165
+ )
166
+
167
+ scheduler.start()
168
+ logger.info("Scheduler started successfully")
169
+
170
+ # Verify the job was added
171
+ jobs = scheduler.get_jobs()
172
+ logger.info(f"Current scheduled jobs: {[job.name for job in jobs]}")
173
+
174
+ except Exception as e:
175
+ logger.error(f"Error during startup: {str(e)}", exc_info=True)
176
+ raise
177
+
178
+
179
+ @app.on_event("shutdown")
180
+ async def shutdown_event():
181
+ """Properly shutdown the scheduler when the app stops"""
182
+ global scheduler
183
+ try:
184
+ if scheduler:
185
+ logger.info("Shutting down scheduler...")
186
+ scheduler.shutdown(wait=False)
187
+ logger.info("Scheduler shutdown complete")
188
+ except Exception as e:
189
+ logger.error(f"Error during scheduler shutdown: {str(e)}", exc_info=True)
190
+
191
+
192
+ # Add an endpoint to manually trigger an update
193
+ @app.post("/api/trigger-update")
194
+ async def trigger_update():
195
+ """Manually trigger a data update"""
196
+ try:
197
+ logger.info("Manual update triggered")
198
+ process_and_visualize(force=True)
199
+ logger.info("Manual update completed")
200
+ return {"status": "success", "message": "Update completed"}
201
+ except Exception as e:
202
+ logger.error(f"Error in manual update: {str(e)}", exc_info=True)
203
+ raise HTTPException(status_code=500, detail=str(e))
204
+
205
+
206
+ def generate_etag(data: dict) -> str:
207
+ """Generate an ETag for the given data"""
208
+ # Convert data to a consistent string representation and hash it
209
+ data_str = json.dumps(data, sort_keys=True)
210
+ return hashlib.md5(data_str.encode()).hexdigest()
211
+
212
+
213
+ @app.get("/api/win-rate-plot")
214
+ async def get_wr_plot(response: Response):
215
+ """Get the win rate plot data"""
216
+ if state.WR_PLOT is None:
217
+ raise HTTPException(status_code=503, detail="Plot data not yet available")
218
+
219
+ plot_json = json.loads(pio.to_json(state.WR_PLOT))
220
+
221
+ # Customize animation settings
222
+ for step in plot_json["layout"]["sliders"][0]["steps"]:
223
+ step["args"][1]["frame"]["duration"] = 500
224
+ step["args"][1]["transition"]["duration"] = 500
225
+
226
+ plot_json["layout"]["updatemenus"] = []
227
+ plot_json["layout"]["sliders"][0]["len"] = 0.8
228
+ plot_json["layout"]["sliders"][0]["pad"] = {}
229
+
230
+ # Generate ETag
231
+ etag = generate_etag(plot_json)
232
+ response.headers["ETag"] = etag
233
+
234
+ # Set cache control headers - cache for 4 minutes since we update every 5
235
+ response.headers["Cache-Control"] = "public, max-age=240"
236
+
237
+ return plot_json
238
+
239
+
240
+ @app.get("/api/bt-plot")
241
+ async def get_bt_plot(response: Response):
242
+ """Get the Bradley-Terry plot data"""
243
+ if state.BT_PLOT is None:
244
+ raise HTTPException(status_code=503, detail="Plot data not yet available")
245
+
246
+ plot_json = json.loads(pio.to_json(state.BT_PLOT))
247
+
248
+ # Customize animation settings
249
+ for step in plot_json["layout"]["sliders"][0]["steps"]:
250
+ step["args"][1]["frame"]["duration"] = 500
251
+ step["args"][1]["transition"]["duration"] = 500
252
+
253
+ plot_json["layout"]["updatemenus"] = []
254
+ plot_json["layout"]["sliders"][0]["len"] = 0.8
255
+ plot_json["layout"]["sliders"][0]["pad"] = {}
256
+
257
+ # Generate ETag
258
+ etag = generate_etag(plot_json)
259
+ response.headers["ETag"] = etag
260
+
261
+ # Set cache control headers - cache for 4 minutes since we update every 5
262
+ response.headers["Cache-Control"] = "public, max-age=240"
263
+
264
+ return plot_json
265
+
266
+
267
+ @app.get("/api/update-time")
268
+ async def get_update_time(response: Response):
269
+ """Get the last update time and vote counts"""
270
+ if state.UPDATE_TIME is None:
271
+ raise HTTPException(status_code=503, detail="Update time not yet available")
272
+
273
+ # Generate ETag
274
+ etag = generate_etag(state.UPDATE_TIME)
275
+ response.headers["ETag"] = etag
276
+
277
+ # Set cache control headers - cache for 4 minutes
278
+ response.headers["Cache-Control"] = "public, max-age=240"
279
+
280
+ return state.UPDATE_TIME
281
+
282
+
283
+ @app.get("/api/health")
284
+ async def health_check(response: Response):
285
+ """Enhanced health check endpoint with scheduler status"""
286
+ global scheduler
287
+
288
+ scheduler_status = "not_running"
289
+ next_run = None
290
+ last_run = state.UPDATE_TIME["timestamp"] if state.UPDATE_TIME else None
291
+
292
+ if scheduler:
293
+ try:
294
+ jobs = scheduler.get_jobs()
295
+ if jobs:
296
+ scheduler_status = "running"
297
+ next_run = jobs[0].next_run_time.strftime("%Y-%m-%d %H:%M:%S %Z")
298
+ except Exception as e:
299
+ logger.error(f"Error checking scheduler status: {str(e)}")
300
+ scheduler_status = f"error: {str(e)}"
301
+
302
+ health_data = {
303
+ "status": "healthy",
304
+ "scheduler_status": scheduler_status,
305
+ "last_update": last_run,
306
+ "next_scheduled_update": next_run,
307
+ "current_time": datetime.now(ZoneInfo("America/Los_Angeles")).strftime("%Y-%m-%d %H:%M:%S %Z"),
308
+ }
309
+
310
+ # Generate ETag
311
+ etag = generate_etag(health_data)
312
+ response.headers["ETag"] = etag
313
+
314
+ # Set cache control headers - short cache time for health check
315
+ response.headers["Cache-Control"] = "public, max-age=30"
316
+
317
+ return health_data
318
+
319
+
320
+ if __name__ == "__main__":
321
+ import uvicorn
322
+
323
+ uvicorn.run(app, host="0.0.0.0", port=8000)
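When run as a script, uvicorn serves the API on port 8000. Assuming the package is importable from the working directory, the equivalent CLI invocation would be uvicorn talk_arena.viz.server:app --host 0.0.0.0 --port 8000.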