BounharAbdelaziz committed
Commit 5f36137 · verified · Parent: e5d0438

first working version

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ battle_leaderboard.gif filter=lfs diff=lfs merge=lfs -text
+ battle_leaderboard.jpeg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,12 @@
- ---
- title: Moroccan Darija LLM Battle Al Atlas
- emoji: 👀
- colorFrom: yellow
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.17.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Arabic Summarization Model Battle Arena
+ emoji: 📉
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.13.1
+ app_file: human_eval.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
battle_leaderboard.gif ADDED

Git LFS Details

  • SHA256: 45e8c896dba02b0b5a3b5c527ad35ca512eea57b635c9741c36906191d0e03e6
  • Pointer size: 132 Bytes
  • Size of remote file: 9.48 MB
battle_leaderboard.jpeg ADDED

Git LFS Details

  • SHA256: b3c3666e6b6184fb51c45e9b5ead95d448998442f413f26d43c7e35172289c35
  • Pointer size: 132 Bytes
  • Size of remote file: 2.36 MB
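Both media files resolve through Git LFS, so a fresh clone initially contains only 132-byte pointer files rather than the media itself. One way to confirm the real objects were fetched is to re-derive the SHA256 digests listed above; a minimal sketch, assuming the files sit in the current working directory:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA256 so large LFS objects need not fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Expected digests, taken from the LFS details above.
assert sha256_of("battle_leaderboard.gif") == "45e8c896dba02b0b5a3b5c527ad35ca512eea57b635c9741c36906191d0e03e6"
assert sha256_of("battle_leaderboard.jpeg") == "b3c3666e6b6184fb51c45e9b5ead95d448998442f413f26d43c7e35172289c35"
```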
human_eval.py ADDED
@@ -0,0 +1,190 @@
+ import gradio as gr
+ import os
+ import base64
+ import random
+ from collections import defaultdict
+
+ import pandas as pd
+ from datasets import Dataset, load_dataset
+
+ def encode_image_to_base64(image_path):
+     """Encode an image or GIF file to base64."""
+     with open(image_path, "rb") as file:
+         encoded_string = base64.b64encode(file.read()).decode()
+     return encoded_string
+
+ def create_html_media(media_path, is_gif=False):
+     """Create HTML that embeds an image or GIF as a base64 data URI."""
+     media_base64 = encode_image_to_base64(media_path)
+     media_type = "gif" if is_gif else "jpeg"
+
+     html_string = f"""
+     <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
+         <div style="max-width: 450px; margin: auto;">
+             <img src="data:image/{media_type};base64,{media_base64}"
+                  style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
+                  alt="Displayed Media">
+         </div>
+     </div>
+     """
+     return html_string
+
+ class LMBattleArena:
+     def __init__(self, dataset_path):
+         """Initialize the battle arena with a CSV dataset of model outputs."""
+         self.df = pd.read_csv(dataset_path)
+         print(self.df.head())
+         self.current_index = 0
+         self.saving_freq = 10  # save to CSV / push to the Hub every 10 evaluations
+         self.evaluation_results = []
+         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+
+     def get_next_battle_pair(self):
+         """Retrieve the next pair of summaries for comparison."""
+         if self.current_index >= len(self.df):
+             return None
+
+         row = self.df.iloc[self.current_index]
+         # Every column except the prompt holds one model's output.
+         model_summary_cols = [col for col in row.index if col.upper() != 'PROMPT']
+         selected_models = random.sample(model_summary_cols, 2)
+         battle_data = {
+             'prompt': row['prompt'],
+             'model_1': row[selected_models[0]],
+             'model_2': row[selected_models[1]],
+             'model1_name': selected_models[0],
+             'model2_name': selected_models[1],
+         }
+         self.current_index += 1
+         return battle_data
+
+     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
+         """Record the user's model preference and update the scores."""
+         self.model_scores[model1_name]['total_comparisons'] += 1
+         self.model_scores[model2_name]['total_comparisons'] += 1
+
+         if preferred_models == "Both Good":
+             self.model_scores[model1_name]['wins'] += 1
+             self.model_scores[model2_name]['wins'] += 1
+         elif preferred_models == "Model A":  # maps to the first model
+             self.model_scores[model1_name]['wins'] += 1
+         elif preferred_models == "Model B":  # maps to the second model
+             self.model_scores[model2_name]['wins'] += 1
+         # "Both Bad" case: no wins recorded
+
+         evaluation = {
+             'input_text': input_text,
+             'output1': output1,
+             'output2': output2,
+             'model1_name': model1_name,
+             'model2_name': model2_name,
+             'preferred_models': preferred_models,
+         }
+         self.evaluation_results.append(evaluation)
+
+         return self.get_model_scores_df()
+
+     def get_model_scores_df(self):
+         """Convert model scores to a leaderboard DataFrame."""
+         scores_data = []
+         for model, stats in self.model_scores.items():
+             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
+             scores_data.append({
+                 'Model': model,
+                 'Wins': stats['wins'],
+                 'Total Comparisons': stats['total_comparisons'],
+                 'Win Rate (%)': round(win_rate, 2),
+             })
+         results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
+
+         # Periodically save the results to a Hugging Face dataset and a local CSV.
+         if self.current_index % self.saving_freq == 0 and self.current_index > 0:
+             results_dataset = Dataset.from_pandas(results_df.reset_index(drop=True))
+             results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
+             results_df.to_csv('human_eval_results.csv', index=False)
+
+         return results_df
+
+
+ def create_battle_arena(dataset_path, is_gif):
+     arena = LMBattleArena(dataset_path)
+
+     def battle_round():
+         battle_data = arena.get_next_battle_pair()
+         if battle_data is None:
+             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
+
+         return (
+             battle_data['prompt'],
+             battle_data['model_1'],
+             battle_data['model_2'],
+             battle_data['model1_name'],
+             battle_data['model2_name'],
+             gr.DataFrame(visible=True),
+         )
+
+     def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
+         scores_df = arena.record_evaluation(
+             preferred_models, input_text, output_1, output_2, model1_name, model2_name
+         )
+         next_battle = battle_round()
+         # Swap the visibility update for the refreshed leaderboard.
+         return (*next_battle[:-1], scores_df)
+
+     with gr.Blocks(css="footer{display:none !important}") as demo:
+         base_path = os.path.dirname(__file__)
+         local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
+         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
+
+         with gr.Tabs():
+             with gr.Tab("Battle Arena"):
+                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+
+                 input_text = gr.Textbox(
+                     label="Input prompt",
+                     interactive=False,
+                 )
+
+                 with gr.Row():
+                     output_1 = gr.Textbox(
+                         label="Model A",
+                         interactive=False,
+                     )
+                     model1_name = gr.State()  # hidden state for model 1's name
+
+                 with gr.Row():
+                     output_2 = gr.Textbox(
+                         label="Model B",
+                         interactive=False,
+                     )
+                     model2_name = gr.State()  # hidden state for model 2's name
+
+                 preferred_models = gr.Radio(
+                     label="Which model is better?",
+                     choices=["Model A", "Model B", "Both Good", "Both Bad"],
+                 )
+                 submit_btn = gr.Button("Vote", variant="primary")
+
+                 scores_table = gr.DataFrame(
+                     headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
+                     label="🏆 Leaderboard",
+                 )
+
+                 submit_btn.click(
+                     submit_preference,
+                     inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
+                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
+                 )
+
+         demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
+
+     return demo
+
+ if __name__ == "__main__":
+     # Export the existing dataset of LM outputs to a local CSV for the arena to read.
+     load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
+
+     dataset_path = 'human_eval_dataset.csv'
+     is_gif = True
+     demo = create_battle_arena(dataset_path, is_gif)
+     demo.launch(debug=True)
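The arena assumes `human_eval_dataset.csv` has a `prompt` column plus one column per model, with column names doubling as model names on the leaderboard. A minimal offline sketch of that contract, using hypothetical model names and outputs, and assuming `human_eval.py` is importable from the working directory:

```python
import pandas as pd
from human_eval import LMBattleArena

# Hypothetical dataset: one prompt column, one output column per model.
pd.DataFrame({
    "prompt": ["Summarize: text 1", "Summarize: text 2"],
    "model_a": ["summary 1a", "summary 2a"],
    "model_b": ["summary 1b", "summary 2b"],
}).to_csv("human_eval_dataset.csv", index=False)

arena = LMBattleArena("human_eval_dataset.csv")
pair = arena.get_next_battle_pair()

# "Both Good" credits a win to both models; "Both Bad" credits neither.
leaderboard = arena.record_evaluation(
    "Both Good", pair["prompt"], pair["model_1"], pair["model_2"],
    pair["model1_name"], pair["model2_name"],
)
print(leaderboard)  # both models: 1 win / 1 comparison = 100.0% win rate
```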
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ pandas
+ transformers
+ torch==2.5.1
+ datasets