Spaces:

open-llm-leaderboard
/

open_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

1147

sheonhan committed on Jun 12, 2023

Commit

0227006

1 Parent(s): 9cea2a5

Add GPT-4 & human eval tab

Browse files

Files changed (6) hide show

.gitignore +4 -0
app.py +255 -99
content.py +26 -1
elo_utils.py +175 -0
utils.py +4 -20
visualizations.py +137 -0

.gitignore CHANGED Viewed

@@ -4,3 +4,7 @@ __pycache__/
 .env
 .ipynb_checkpoints
 *ipynb

 .env
 .ipynb_checkpoints
 *ipynb
+gpt_4_evals/
+human_evals/
+model_counts.html

app.py CHANGED Viewed

@@ -1,20 +1,24 @@
-import os
 import json
 from datetime import datetime, timezone
-import numpy as np
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from content import *
-from huggingface_hub import Repository, HfApi
 from transformers import AutoConfig
 from utils import get_eval_results_dicts, make_clickable_model
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
 api = HfApi()
@@ -56,6 +60,27 @@ if H4_TOKEN:
     requested_models_dir = "./evals/eval_requests"
     requested_models = get_all_requested_models(requested_models_dir)
 # parse the results
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
@@ -100,6 +125,16 @@ BENCHMARK_COLS = [
     "TruthfulQA (0-shot) ⬆️",
 ]
 def has_no_nan_values(df, columns):
     return df[columns].notna().all(axis=1)
@@ -213,6 +248,42 @@ def get_evaluation_queue_df():
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
 (
@@ -220,6 +291,14 @@ leaderboard_df = original_df.copy()
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df()
 def is_model_on_hub(model_name, revision) -> bool:
@@ -359,12 +438,11 @@ custom_css = """
 }
 /* Hides the final column */
-table td:last-child,
-table th:last-child {
     display: none;
 }
 /* Limit the width of the first column so that names don't expand too much */
 table td:first-child,
 table th:first-child {
@@ -373,13 +451,30 @@ table th:first-child {
     white-space: nowrap;
 }
 """
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column():
@@ -393,97 +488,158 @@ with demo:
             with gr.Accordion("✨ CHANGELOG", open=False):
                 changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
-    with gr.Box(elem_id="search-bar-table-box"):
-        search_bar = gr.Textbox(
-            placeholder="🔍 Search your model and press ENTER...",
-            show_label=False,
-            elem_id="search-bar",
-        )
-        leaderboard_table = gr.components.Dataframe(
-            value=leaderboard_df,
-            headers=COLS,
-            datatype=TYPES,
-            max_rows=5,
-            elem_id="leaderboard-table",
-        )
-        # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = gr.components.Dataframe(
-            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
-        )
-        search_bar.submit(
-            search_table,
-            [hidden_leaderboard_table_for_search, search_bar],
-            leaderboard_table,
-        )
-    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-    with gr.Accordion("✅ Finished Evaluations", open=False):
-        finished_eval_table = gr.components.Dataframe(
-            value=finished_eval_queue_df,
-            headers=EVAL_COLS,
-            datatype=EVAL_TYPES,
-            max_rows=5,
-        )
-    with gr.Accordion("🔄 Running Evaluation Queue", open=False):
-        running_eval_table = gr.components.Dataframe(
-            value=running_eval_queue_df,
-            headers=EVAL_COLS,
-            datatype=EVAL_TYPES,
-            max_rows=5,
-        )
-    with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
-        pending_eval_table = gr.components.Dataframe(
-            value=pending_eval_queue_df,
-            headers=EVAL_COLS,
-            datatype=EVAL_TYPES,
-            max_rows=5,
-        )
-    refresh_button = gr.Button("Refresh")
-    refresh_button.click(
-        refresh,
-        inputs=[],
-        outputs=[
-            leaderboard_table,
-            finished_eval_table,
-            running_eval_table,
-            pending_eval_table,
-        ],
-    )
-    with gr.Accordion("Submit a new model for evaluation"):
-        with gr.Row():
-            with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
-                revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
             with gr.Column():
-                is_8bit_toggle = gr.Checkbox(
-                    False, label="8 bit eval", visible=not IS_PUBLIC
-                )
-                private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
-                is_delta_weight = gr.Checkbox(False, label="Delta weights")
-                base_model_name_textbox = gr.Textbox(label="base model (for delta)")
-        submit_button = gr.Button("Submit Eval")
-        submission_result = gr.Markdown()
-        submit_button.click(
-            add_new_eval,
-            [
-                model_name_textbox,
-                base_model_name_textbox,
-                revision_name_textbox,
-                is_8bit_toggle,
-                private,
-                is_delta_weight,
-            ],
-            submission_result,
-        )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)

 import json
+import os
 from datetime import datetime, timezone
 import gradio as gr
+import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi, Repository
 from transformers import AutoConfig
+from content import *
+from elo_utils import get_elo_plots, get_elo_results_dicts
 from utils import get_eval_results_dicts, make_clickable_model
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
+HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
+GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
 api = HfApi()
     requested_models_dir = "./evals/eval_requests"
     requested_models = get_all_requested_models(requested_models_dir)
+human_eval_repo = None
+if H4_TOKEN and not os.path.isdir("./human_evals"):
+    print("Pulling human evaluation repo")
+    human_eval_repo = Repository(
+        local_dir="./human_evals/",
+        clone_from=HUMAN_EVAL_REPO,
+        use_auth_token=H4_TOKEN,
+        repo_type="dataset",
+    )
+    human_eval_repo.git_pull()
+gpt_4_eval_repo = None
+if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
+    print("Pulling GPT-4 evaluation repo")
+    gpt_4_eval_repo = Repository(
+        local_dir="./gpt_4_evals/",
+        clone_from=GPT_4_EVAL_REPO,
+        use_auth_token=H4_TOKEN,
+        repo_type="dataset",
+    )
+    gpt_4_eval_repo.git_pull()
 # parse the results
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
     "TruthfulQA (0-shot) ⬆️",
 ]
+ELO_COLS = [
+    "Model",
+    "GPT-4 (all)",
+    "Human (all)",
+    "Human (instruct)",
+    "Human (code-instruct)",
+]
+ELO_TYPES = ["markdown", "number", "number", "number", "number"]
+ELO_SORT_COL = "GPT-4 (all)"
 def has_no_nan_values(df, columns):
     return df[columns].notna().all(axis=1)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
+def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
+    if human_eval_repo:
+        print("Pulling human_eval_repo changes")
+        human_eval_repo.git_pull()
+    all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
+    dataframe = pd.DataFrame.from_records(all_data)
+    dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
+    dataframe = dataframe[ELO_COLS]
+    return dataframe
+def get_elo_elements():
+    df_instruct = pd.read_json("human_evals/without_code.json")
+    df_code_instruct = pd.read_json("human_evals/with_code.json")
+    elo_leaderboard = get_elo_leaderboard(
+        df_instruct, df_code_instruct, tie_allowed=False
+    )
+    elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
+        df_instruct, df_code_instruct, tie_allowed=True
+    )
+    plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
+        df_instruct, df_code_instruct, tie_allowed=False
+    )
+    return (
+        elo_leaderboard,
+        elo_leaderboard_with_tie_allowed,
+        plot_1,
+        plot_2,
+        plot_3,
+        plot_4,
+    )
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
 (
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df()
+(
+    elo_leaderboard,
+    elo_leaderboard_with_tie_allowed,
+    plot_1,
+    plot_2,
+    plot_3,
+    plot_4,
+) = get_elo_elements()
 def is_model_on_hub(model_name, revision) -> bool:
 }
 /* Hides the final column */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
     display: none;
 }
 /* Limit the width of the first column so that names don't expand too much */
 table td:first-child,
 table th:first-child {
     white-space: nowrap;
 }
+.tab-buttons button {
+    font-size: 16px;
+}
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+#scale-logo .download {
+    display: none;
+}
 """
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    with gr.Row():
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column():
             with gr.Accordion("✨ CHANGELOG", open=False):
                 changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
+    with gr.Tabs(elem_classes="tab-buttons"):
+        with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table"):
             with gr.Column():
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+                with gr.Box(elem_id="search-bar-table-box"):
+                    search_bar = gr.Textbox(
+                        placeholder="🔍 Search your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                    leaderboard_table = gr.components.Dataframe(
+                        value=leaderboard_df,
+                        headers=COLS,
+                        datatype=TYPES,
+                        max_rows=5,
+                        elem_id="leaderboard-table",
+                    )
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                        value=original_df,
+                        headers=COLS,
+                        datatype=TYPES,
+                        max_rows=5,
+                        visible=False,
+                    )
+                    search_bar.submit(
+                        search_table,
+                        [hidden_leaderboard_table_for_search, search_bar],
+                        leaderboard_table,
+                    )
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Accordion("✅ Finished Evaluations", open=False):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+                with gr.Accordion("🔄 Running Evaluation Queue", open=False):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+                with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+                with gr.Row():
+                    refresh_button = gr.Button("Refresh")
+                    refresh_button.click(
+                        refresh,
+                        inputs=[],
+                        outputs=[
+                            leaderboard_table,
+                            finished_eval_table,
+                            running_eval_table,
+                            pending_eval_table,
+                        ],
+                    )
+                with gr.Accordion("Submit a new model for evaluation"):
+                    with gr.Row():
+                        with gr.Column():
+                            model_name_textbox = gr.Textbox(label="Model name")
+                            revision_name_textbox = gr.Textbox(
+                                label="revision", placeholder="main"
+                            )
+                        with gr.Column():
+                            is_8bit_toggle = gr.Checkbox(
+                                False, label="8 bit eval", visible=not IS_PUBLIC
+                            )
+                            private = gr.Checkbox(
+                                False, label="Private", visible=not IS_PUBLIC
+                            )
+                            is_delta_weight = gr.Checkbox(False, label="Delta weights")
+                            base_model_name_textbox = gr.Textbox(
+                                label="base model (for delta)"
+                            )
+                    submit_button = gr.Button("Submit Eval")
+                    submission_result = gr.Markdown()
+                    submit_button.click(
+                        add_new_eval,
+                        [
+                            model_name_textbox,
+                            base_model_name_textbox,
+                            revision_name_textbox,
+                            is_8bit_toggle,
+                            private,
+                            is_delta_weight,
+                        ],
+                        submission_result,
+                    )
+        with gr.TabItem(
+            "🧑‍⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table"
+        ):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
+                with gr.Column(scale=1):
+                    gr.Image(
+                        "scale-hf-logo.png", elem_id="scale-logo", show_label=False
+                    )
+            gr.Markdown("## No tie")
+            elo_leaderboard_table = gr.components.Dataframe(
+                value=elo_leaderboard,
+                headers=ELO_COLS,
+                datatype=ELO_TYPES,
+                max_rows=5,
+            )
+            gr.Markdown("## Tie allowed*")
+            elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
+                value=elo_leaderboard_with_tie_allowed,
+                headers=ELO_COLS,
+                datatype=ELO_TYPES,
+                max_rows=5,
+            )
+            gr.Markdown("\* Results when the scores of 4 and 5 were treated as ties.", elem_classes="markdown-text")
+        # with gr.Box():
+        #     visualization_title = gr.HTML(VISUALIZATION_TITLE)
+        #     with gr.Row():
+        #         with gr.Column():
+        #             gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
+        #             plot_1 = gr.Plot(plot_1, show_label=False)
+        #         with gr.Column():
+        #             gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
+        #             plot_2 = gr.Plot(plot_2, show_label=False)
+        #     with gr.Row():
+        #         with gr.Column():
+        #             gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
+        #             plot_3 = gr.Plot(plot_3, show_label=False)
+        #         with gr.Column():
+        #             gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
+        #             plot_4 = gr.Plot(plot_4, show_label=False)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)

content.py CHANGED Viewed

@@ -1,4 +1,7 @@
 CHANGELOG_TEXT = f"""
 ## [2023-06-05]
 - Increase concurrent thread count to 40
 - Search models on ENTER
@@ -47,7 +50,11 @@ INTRODUCTION_TEXT = f"""
 🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
-📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
@@ -56,6 +63,15 @@ INTRODUCTION_TEXT = f"""
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 """
 EVALUATION_QUEUE_TEXT = f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
 """
@@ -128,3 +144,12 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
       primaryClass={cs.CL}
 }"""

 CHANGELOG_TEXT = f"""
+## [2023-06-12]
+- Add Human & GPT-4 Evaluations
 ## [2023-06-05]
 - Increase concurrent thread count to 40
 - Search models on ENTER
 🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
+📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">  Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
+"""
+LLM_BENCHMARKS_TEXT = f"""
+Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 """
+HUMAN_GPT_EVAL_TEXT = f"""
+Evaluation is performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts. The prompts cover tasks such as brainstorming, creative generation, commonsense reasoning, open question answering, summarization, and code generation. Comparisons are made by humans and a model on a 1-8 Likert scale, where the labeler is required to choose a preference each time. Using these preferences, we create bootstrapped Elo rankings.
+We collaborated with **Scale AI** to generate the completions using a professional data labeling workforce on their platform, [following the labeling instructions found here](https://docs.google.com/document/d/1c5-96Lj-UH4lzKjLvJ_MRQaVMjtoEXTYA4dvoAYVCHc/edit?usp=sharing). To understand the evaluation of popular models, we also had GPT-4 label the completions using this prompt.
+For more information on the calibration and initiation of these measurements, please refer to the [announcement blog post](https://huggingface.co/blog/llm-leaderboard). We would like to express our gratitude to **LMSYS** for providing a [useful notebook](https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5?usp=sharing) for computing Elo estimates and plots.
+"""
 EVALUATION_QUEUE_TEXT = f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
 """
       primaryClass={cs.CL}
 }"""
+VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
+PLOT_1_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
+PLOT_2_TITLE = "Comparison Count of Each Combination of Models (not allowing ties)"
+PLOT_3_TITLE = "Elo Estimates with error bars (ties allowed)"
+PLOT_4_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"

elo_utils.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
+from utils import make_clickable_model
+from visualizations import (get_bootstrap_result, switch_model_a_b,
+                            visualize_battle_count, visualize_bootstrap_scores,
+                            visualize_pairwise_win_fraction,
+                            visualize_rating_count)
+@dataclass
+class EloEvalResult:
+    model: str
+    gpt_4_all: int
+    human_all: int
+    human_instruct: int
+    human_code_instruct: int
+    tie_allowed: bool
+    def to_dict(self):
+        base_model = f"{self.model}"
+        data_dict = {}
+        data_dict["Model"] = make_clickable_model(base_model)
+        data_dict["GPT-4 (all)"] = self.gpt_4_all
+        data_dict["Human (all)"] = self.human_all
+        data_dict["Human (instruct)"] = self.human_instruct
+        data_dict["Human (code-instruct)"] = self.human_code_instruct
+        return data_dict
+def create_eval_df(df, tie_allowed):
+    responses = []
+    for _, row in df.iterrows():
+        if row["status"] == "canceled":
+            continue
+        rating = row["response"]["annotations"]["Preference"]
+        if rating == "NaN":
+            continue
+        scores = row["response"]["responses"]
+        if any(s["Preference"] == "" for s in scores):
+            continue
+        response = {
+            "id": row["task_id"],
+            "prompt": row["params"]["templateVariables"]["prompt"],
+            "model_a": row["params"]["templateVariables"]["modela"],
+            "model_b": row["params"]["templateVariables"]["modelb"],
+            "response_a": row["params"]["templateVariables"]["response1"],
+            "response_b": row["params"]["templateVariables"]["response2"],
+            "rating": int(rating),
+            "ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)],
+        }
+        if tie_allowed:
+            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+        else:
+            response["win"] = "model_a" if response["rating"] < 5 else "model_b"
+        responses.append(response)
+    return pd.DataFrame(responses)
+def create_eval_df_for_gpt(df, tie_allowed):
+    responses = []
+    for _, row in df.iterrows():
+        response = {
+            "id": row["review_id"],
+            "prompt": row["question"],
+            "model_a": row["model1"],
+            "model_b": row["model2"],
+            "response_a": row["answer1"],
+            "response_b": row["answer2"],
+            "rating": row["score"][0],
+        }
+        if tie_allowed:
+            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+        else:
+            response["win"] = "model_a" if response["rating"] < 5 else "model_b"
+        responses.append(response)
+    return pd.DataFrame(responses)
+# Compute the Elo rating for each model
+def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000):
+    rating = defaultdict(lambda: initial_rating)
+    for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples():
+        ra = rating[model_a]
+        rb = rating[model_b]
+        ea = 1 / (1 + base ** ((rb - ra) / scale))
+        eb = 1 / (1 + base ** ((ra - rb) / scale))
+        if win == "model_a":
+            sa = 1
+        elif win == "model_b":
+            sa = 0
+        elif win == "tie" or win == "tie (bothbad)":
+            sa = 0.5
+        else:
+            raise Exception(f"unexpected vote {win}")
+        rating[model_a] += k * (sa - ea)
+        rating[model_b] += k * (1 - sa - eb)
+    return rating
+def convert_rating_from_float_to_int(df):
+    return {model: int(rating) for model, rating in compute_elo(df).items()}
+def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
+    df_all = pd.concat([df_instruct, df_code_instruct])
+    df_gpt_4 = load_dataset(
+        "gpt_4_evals/data/", split="train", revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846"
+    ).to_pandas()
+    dfs = [df_instruct, df_code_instruct, df_all]
+    elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs]
+    gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed))
+    elo_ratings.append(gpt_4_elo_ratings)
+    results = [
+        EloEvalResult(
+            model=model_name,
+            gpt_4_all=elo_ratings[3][model_name],
+            human_all=elo_ratings[2][model_name],
+            human_instruct=elo_ratings[0][model_name],
+            human_code_instruct=elo_ratings[1][model_name],
+            tie_allowed=tie_allowed,
+        )
+        for model_name in elo_ratings[0].keys()
+    ]
+    return results
+def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]:
+    eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed)
+    return [r.to_dict() for r in eval_results]
+def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
+    df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed)
+    df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed)
+    df_all = pd.concat([df_instruct, df_code_instruct])
+    game = df_all[["model_a", "model_b", "win"]]
+    game_switch = switch_model_a_b(game)
+    plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE)
+    plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE)
+    BOOTSTRAP_ROUNDS = 1000
+    if "bootstrap_elo_lu" not in globals():
+        bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS)
+    plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
+    plot_4 = visualize_rating_count(game, PLOT_4_TITLE)
+    return plot_1, plot_2, plot_3, plot_4

utils.py CHANGED Viewed

@@ -1,21 +1,11 @@
-import os
-import shutil
-import numpy as np
-import gradio as gr
-from huggingface_hub import Repository, HfApi
-from transformers import AutoConfig, AutoModel
-import json
-from apscheduler.schedulers.background import BackgroundScheduler
-import pandas as pd
-import datetime
 import glob
 from dataclasses import dataclass
-from typing import List, Tuple, Dict
-# clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
-LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
@@ -71,13 +61,11 @@ class EvalResult:
         data_dict["eval_name"] = self.eval_name
         data_dict["8bit"] = self.is_8bit
         data_dict["Model"] = make_clickable_model(base_model)
-        # dummy column to implement search bar (hidden by custom CSS)
         data_dict["model_name_for_query"] = base_model
         data_dict["Revision"] = self.revision
         data_dict["Average ⬆️"] = round(
             sum([v for k, v in self.results.items()]) / 4.0, 1
         )
-        # data_dict["# params"] = get_n_params(base_model)
         for benchmark in BENCHMARKS:
             if not benchmark in self.results.keys():
@@ -151,7 +139,3 @@ def get_eval_results_dicts(is_public=True) -> List[Dict]:
     eval_results = get_eval_results(is_public)
     return [e.to_dict() for e in eval_results]
-eval_results_dict = get_eval_results_dicts()
-# print(eval_results_dict)

 import glob
+import json
 from dataclasses import dataclass
+from typing import Dict, List, Tuple
+import numpy as np
+# clone / pull the lmeh eval data
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
         data_dict["eval_name"] = self.eval_name
         data_dict["8bit"] = self.is_8bit
         data_dict["Model"] = make_clickable_model(base_model)
         data_dict["model_name_for_query"] = base_model
         data_dict["Revision"] = self.revision
         data_dict["Average ⬆️"] = round(
             sum([v for k, v in self.results.items()]) / 4.0, 1
         )
         for benchmark in BENCHMARKS:
             if not benchmark in self.results.keys():
     eval_results = get_eval_results(is_public)
     return [e.to_dict() for e in eval_results]

visualizations.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import math
+import numpy as np
+import pandas as pd
+import plotly.express as px
+# 1
def _pair_counts(frame, models):
    """Count (model_a, model_b) rows of *frame* as a square models x models table."""
    if frame.empty:
        return pd.DataFrame(0, index=models, columns=models)
    counts = frame.groupby(["model_a", "model_b"]).size().unstack(fill_value=0)
    # Reindex so that every table shares the same labels on both axes;
    # pairs that never occurred count as 0 instead of being absent.
    return counts.reindex(index=models, columns=models, fill_value=0)


def compute_pairwise_win_fraction(battles):
    """Return the matrix of pairwise win fractions between models.

    Args:
        battles: DataFrame with columns ``model_a``, ``model_b`` and ``win``.
            A row counts as a win for the first model when ``win`` is
            ``"model_a"`` and for the second when it is ``"model_b"``; any
            other value still counts as a battle but as a win for nobody.

    Returns:
        A square DataFrame whose entry (row, col) is the fraction of the
        battles between the two models that the row model won, regardless of
        which side it was on. Pairs that never met (including the diagonal)
        are NaN. Rows and columns are ordered by mean win fraction,
        best first.
    """
    # Every model that appears on either side of any battle.
    models = list(pd.concat([battles["model_a"], battles["model_b"]]).dropna().unique())

    # Times each model wins as Model A
    a_win_counts = _pair_counts(battles[battles["win"] == "model_a"], models)
    # Times each model wins as Model B
    b_win_counts = _pair_counts(battles[battles["win"] == "model_b"], models)
    # Number of A-B pairs, whatever the outcome
    num_battles = _pair_counts(battles, models)

    # Proportion of wins for each model as A and as B against all other
    # models. All tables share the same square label set, so the sums below
    # cannot introduce NaN through label misalignment; 0 / 0 (no battles)
    # deliberately stays NaN.
    row_beats_col_freq = (a_win_counts + b_win_counts.T) / (num_battles + num_battles.T)

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    return row_beats_col_freq.loc[model_names, model_names]
def visualize_pairwise_win_fraction(battles, title):
    """Build a heatmap figure of the pairwise win fractions.

    Args:
        battles: battle records, passed unchanged to
            ``compute_pairwise_win_fraction``.
        title: title shown on the figure.

    Returns:
        The figure produced by ``px.imshow`` with axis titles, title
        position and hover text configured.
    """
    win_fractions = compute_pairwise_win_fraction(battles)
    heatmap = px.imshow(
        win_fractions,
        color_continuous_scale="RdBu",
        text_auto=".2f",
        title=title,
    )
    # Column labels go on top; the title is placed near the bottom, centred.
    layout_options = {
        "xaxis_title": "Model B",
        "yaxis_title": "Model A",
        "xaxis_side": "top",
        "title_y": 0.07,
        "title_x": 0.5,
    }
    heatmap.update_layout(**layout_options)
    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)
    return heatmap
+# 2
def switch_model_a_b(df):
    """Return a copy of *df* with the two sides swapped in a random half of rows.

    Each row is swapped independently with probability 0.5 (drawn from
    ``np.random``). Swapping a row exchanges ``model_a`` and ``model_b`` and
    relabels ``win`` accordingly (``"model_a"`` <-> ``"model_b"``); any other
    ``win`` value is left as is.

    Rows are addressed by position, not by index label, so a frame with
    repeated index labels (e.g. the result of ``pd.concat`` without
    ``ignore_index``) is handled correctly.

    Args:
        df: DataFrame with columns ``model_a``, ``model_b`` and ``win``.

    Returns:
        A new DataFrame with the same index and columns; *df* is not modified.
    """
    df_switch = df.copy()
    # switch with probability 0.5, one draw per row
    flip = np.random.rand(len(df)) < 0.5

    model_a = df["model_a"].to_numpy(dtype=object)
    model_b = df["model_b"].to_numpy(dtype=object)
    win = df["win"].to_numpy(dtype=object)

    # What "win" becomes if the row is swapped; non-decisive values pass through.
    swapped_win = np.where(
        win == "model_a",
        "model_b",
        np.where(win == "model_b", "model_a", win),
    )

    df_switch["model_a"] = np.where(flip, model_b, model_a)
    df_switch["model_b"] = np.where(flip, model_a, model_b)
    df_switch["win"] = np.where(flip, swapped_win, win)
    return df_switch
def visualize_battle_count(battles, title):
    """Build a heatmap figure of how many battles each pair of models fought.

    Args:
        battles: DataFrame with columns ``model_a`` and ``model_b``, one row
            per battle.
        title: title shown on the figure.

    Returns:
        The figure produced by ``px.imshow``; entry (row, col) is the number
        of battles between the two models counted over both side
        assignments, with models ordered by total battle count, highest first.
    """
    ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
    # A model seen on only one side is missing from the other axis; make the
    # table square so that adding the transpose cannot produce NaN counts.
    models = list(ptbl.index.union(ptbl.columns))
    ptbl = ptbl.reindex(index=models, columns=models, fill_value=0)
    battle_counts = ptbl + ptbl.T
    ordering = battle_counts.sum().sort_values(ascending=False).index
    fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig
+# 3
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run *func_compute_elo* on bootstrap resamples of *battles*.

    Args:
        battles: DataFrame of battles; each round resamples it with
            replacement to its full length.
        func_compute_elo: callable taking a resampled DataFrame; its return
            value becomes one row of the result.
        num_round: number of bootstrap rounds.

    Returns:
        DataFrame with one row per round, columns sorted by their median
        value in descending order.
    """
    per_round = []
    for _ in range(num_round):
        resampled = battles.sample(frac=1.0, replace=True)
        per_round.append(func_compute_elo(resampled))
    scores = pd.DataFrame(per_round)
    column_order = scores.median().sort_values(ascending=False).index
    return scores[column_order]
def visualize_bootstrap_scores(df, title):
    """Build a scatter figure of median scores with 95% interval error bars.

    Args:
        df: DataFrame with one column per model and one row per bootstrap
            round (the shape returned by ``get_bootstrap_result``).
        title: title shown on the figure.

    Returns:
        The figure produced by ``px.scatter``: one point per model at the
        column median, with error bars reaching the 2.5% and 97.5% quantiles
        and the median rounded to two decimals as the point label.
    """
    quantiles = {
        "lower": df.quantile(0.025),
        "rating": df.quantile(0.5),
        "upper": df.quantile(0.975),
    }
    bars = pd.DataFrame(quantiles).reset_index(names="model")
    bars = bars.sort_values("rating", ascending=False)

    # Error bars are given as distances from the median, not as absolute values.
    bars["error_y"] = bars["upper"] - bars["rating"]
    bars["error_y_minus"] = bars["rating"] - bars["lower"]
    bars["rating_rounded"] = np.round(bars["rating"], 2)

    scatter = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    scatter.update_layout(xaxis_title="Model", yaxis_title="Rating")
    return scatter
+# 4
def visualize_rating_count(df, title):
    """Build a bar figure of how many battles each model took part in.

    Args:
        df: DataFrame with columns ``model_a`` and ``model_b``, one row per
            battle.
        title: title shown on the figure.

    Returns:
        The figure produced by ``px.bar``. The y axis spans from the smallest
        count rounded down to a multiple of 100 up to the largest count
        rounded up to a multiple of 100.

    Side effects:
        Writes the figure to ``model_counts.html`` in the current working
        directory on every call.
    """
    df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
    fig = px.bar(df_all_value_counts, title=title, text_auto=True)
    fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)

    # With no data min()/max() are NaN and cannot be rounded; keep the
    # default axis range in that case.
    if not df_all_value_counts.empty:
        # The axis starts below the smallest bar and ends above the tallest.
        y_begin = math.floor(df_all_value_counts.min() / 100) * 100
        y_end = math.ceil(df_all_value_counts.max() / 100) * 100
        if y_end == y_begin:
            # All counts equal an exact multiple of 100: avoid a zero-height axis.
            y_end = y_begin + 100
        fig.update_yaxes(range=[y_begin, y_end])

    # save the plot for the blog:
    fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
    return fig