fix to loop through all possibilities
human_eval.py (CHANGED, +134 -52)
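The commit makes the arena cycle through every pairing of the competing models for each dataset row, generating the pairs with itertools.combinations instead of picking models ad hoc. As a rough sanity check of what that implies (a stand-alone snippet, not part of the Space; model lists copied from the file below):

import itertools

# Model lists as declared in human_eval.py
MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]
CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]

# C(6, 2) = 15 masked-LM pairs and C(4, 2) = 6 causal-LM pairs;
# each dataset row is meant to be judged once per pair.
print(len(list(itertools.combinations(MASKED_LM_MODELS, 2))))  # 15
print(len(list(itertools.combinations(CAUSAL_LM_MODELS, 2))))  # 6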
Old side of the diff (context and removed lines, marked with -):

@@ -2,17 +2,33 @@ import gradio as gr
 from collections import defaultdict
 import os
 import base64
-import torch
 from datasets import (
     Dataset,
     load_dataset,
 )
-import random
 import pandas as pd
 from collections import defaultdict

 TOKEN = os.environ['TOKEN']

 def encode_image_to_base64(image_path):
     """Encode an image or GIF file to base64."""
     with open(image_path, "rb") as file:
@@ -35,58 +51,101 @@ def create_html_media(media_path, is_gif=False):
     """
     return html_string

-MASKED_LM_MODELS = [
-    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
-    "SI2M-Lab/DarijaBERT",
-    "BounharAbdelaziz/ModernBERT-Morocco",
-    "google-bert/bert-base-multilingual-cased",
-    "FacebookAI/xlm-roberta-large",
-    "aubmindlab/bert-base-arabertv02",
-]
-
-CAUSAL_LM_MODELS = [
-    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
-    "Qwen/Qwen2.5-0.5B",
-    "tiiuae/Falcon3-1B-Base",
-    "MBZUAI-Paris/Atlas-Chat-2B",
-]
-
 class LMBattleArena:
-    def __init__(self, dataset_path):
         """Initialize battle arena with dataset"""
         self.df = pd.read_csv(dataset_path)
-        print(self.df.head())
         self.current_index = 0
-        self.saving_freq =
         self.evaluation_results_masked = []
         self.evaluation_results_causal = []
         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})

     def get_next_battle_pair(self, is_causal):
-        """Retrieve next pair of summaries for comparison"""
-
-

         row = self.df.iloc[self.current_index]
         if is_causal:
-
-
-
-
         else:
-
-
-
-
-
         battle_data = {
             'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
-            'model_1': row[
-            'model_2': row[
-            'model1_name':
-            'model2_name':
         }
-
         return battle_data

     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
@@ -116,8 +175,31 @@ class LMBattleArena:
         else:
             self.evaluation_results_masked.append(evaluation)

         return self.get_model_scores_df(is_causal)

     def get_model_scores_df(self, is_causal):
         """Convert model scores to DataFrame"""
         scores_data = []
@@ -135,16 +217,15 @@ class LMBattleArena:
             'Total Comparisons': stats['total_comparisons'],
             'Win Rate (%)': round(win_rate, 2)
         })
-

-        #
-
-
-        # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
-        results_df.to_csv('human_eval_results.csv')
-
         return results_df
-

 def create_battle_arena(dataset_path, is_gif, is_causal):
     arena = LMBattleArena(dataset_path)
@@ -153,7 +234,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
     battle_data = arena.get_next_battle_pair(is_causal)

     if battle_data is None:
-        return "

     return (
         battle_data['prompt'],
@@ -172,7 +253,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
-
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
@@ -277,13 +358,14 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
    return demo

 if __name__ == "__main__":
-
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # load the existing dataset that contains outputs of the LMs
-    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)
-
-    # load first tab for masked LM
    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)
New side of the diff (context and added lines, marked with +):

@@ -2,17 +2,33 @@ import gradio as gr
 from collections import defaultdict
 import os
 import base64
 from datasets import (
     Dataset,
     load_dataset,
 )
 import pandas as pd
 from collections import defaultdict
+import itertools

 TOKEN = os.environ['TOKEN']

+
+MASKED_LM_MODELS = [
+    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
+    "SI2M-Lab/DarijaBERT",
+    "BounharAbdelaziz/ModernBERT-Morocco",
+    "google-bert/bert-base-multilingual-cased",
+    "FacebookAI/xlm-roberta-large",
+    "aubmindlab/bert-base-arabertv02",
+]
+
+CAUSAL_LM_MODELS = [
+    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
+    "Qwen/Qwen2.5-0.5B",
+    "tiiuae/Falcon3-1B-Base",
+    "MBZUAI-Paris/Atlas-Chat-2B",
+]
+
 def encode_image_to_base64(image_path):
     """Encode an image or GIF file to base64."""
     with open(image_path, "rb") as file:
@@ -35,58 +51,101 @@ def create_html_media(media_path, is_gif=False):
     """
     return html_string

 class LMBattleArena:
+    def __init__(self, dataset_path, saving_freq=25):
         """Initialize battle arena with dataset"""
         self.df = pd.read_csv(dataset_path)
         self.current_index = 0
+        self.saving_freq = saving_freq  # save the results in csv/push to hub every saving_freq evaluations
         self.evaluation_results_masked = []
         self.evaluation_results_causal = []
         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+
+        # Generate all possible model pairs
+        self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2))
+        self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2))
+
+        # Pair indices to track which pair is being evaluated
+        self.masked_pair_idx = 0
+        self.causal_pair_idx = 0
+
+        # To track which rows have been evaluated for which model pairs
+        self.row_model_pairs_evaluated = set()  # Using a simple set

     def get_next_battle_pair(self, is_causal):
+        """Retrieve next pair of summaries for comparison ensuring all pairs are evaluated"""
+
+        if self.current_index >= len(self.df):
+            # Reset index to go through dataset again with remaining model pairs
+            self.current_index = 0
+
+            # If we've gone through all model pairs for all rows, we're done
+            if is_causal and self.causal_pair_idx >= len(self.causal_model_pairs):
+                return None
+            elif not is_causal and self.masked_pair_idx >= len(self.masked_model_pairs):
+                return None

         row = self.df.iloc[self.current_index]
+
+        # Get the current model pair to evaluate
         if is_causal:
+            # Check if we've evaluated all causal model pairs
+            if self.causal_pair_idx >= len(self.causal_model_pairs):
+                # Move to next row and reset pair index
+                self.current_index += 1
+                self.causal_pair_idx = 0
+                # Try again with the next row
+                return self.get_next_battle_pair(is_causal)
+
+            model_pair = self.causal_model_pairs[self.causal_pair_idx]
+            pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}"
+
+            # Check if this row-pair combination has been evaluated
+            if pair_key in self.row_model_pairs_evaluated:
+                # Move to next pair
+                self.causal_pair_idx += 1
+                return self.get_next_battle_pair(is_causal)
+
+            # Mark this row-pair combination as evaluated
+            self.row_model_pairs_evaluated.add(pair_key)
+            # Move to next pair for next evaluation
+            self.causal_pair_idx += 1
+
+            # Check if we've gone through all pairs for this row
+            if self.causal_pair_idx >= len(self.causal_model_pairs):
+                # Reset pair index and move to next row for next evaluation
+                self.causal_pair_idx = 0
+                self.current_index += 1
         else:
+            # Similar logic for masked models
+            if self.masked_pair_idx >= len(self.masked_model_pairs):
+                self.current_index += 1
+                self.masked_pair_idx = 0
+                return self.get_next_battle_pair(is_causal)
+
+            model_pair = self.masked_model_pairs[self.masked_pair_idx]
+            pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}"
+
+            if pair_key in self.row_model_pairs_evaluated:
+                self.masked_pair_idx += 1
+                return self.get_next_battle_pair(is_causal)
+
+            self.row_model_pairs_evaluated.add(pair_key)
+            self.masked_pair_idx += 1
+
+            if self.masked_pair_idx >= len(self.masked_model_pairs):
+                self.masked_pair_idx = 0
+                self.current_index += 1
+
+        # Prepare the battle data with the selected model pair
         battle_data = {
             'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
+            'model_1': row[model_pair[0]],
+            'model_2': row[model_pair[1]],
+            'model1_name': model_pair[0],
+            'model2_name': model_pair[1]
         }
+
         return battle_data

     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
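For reference, a condensed sketch of the traversal that the new get_next_battle_pair() performs with its index bookkeeping and recursion: rows on the outer loop, model pairs on the inner loop, each (row, pair) combination yielded once. The names iter_battles and prompt_column are illustrative, not part of the Space; the column-naming assumption (each model's output stored in a column named after the model) follows the row[model_pair[0]] lookups above.

import itertools
import pandas as pd

def iter_battles(df: pd.DataFrame, models, prompt_column: str):
    """Yield every (row, model pair) combination exactly once: rows outer, pairs inner."""
    pairs = list(itertools.combinations(models, 2))
    for _, row in df.iterrows():
        for model_1, model_2 in pairs:
            yield {
                'prompt': row[prompt_column],
                'model_1': row[model_1],      # output column named after the model
                'model_2': row[model_2],
                'model1_name': model_1,
                'model2_name': model_2,
            }

# e.g. for the masked-LM tab:
# df = pd.read_csv('human_eval_dataset.csv')
# battles = list(iter_battles(df, MASKED_LM_MODELS, 'masked_sentence'))
# len(battles) == len(df) * 15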
@@ -116,8 +175,31 @@ class LMBattleArena:
         else:
             self.evaluation_results_masked.append(evaluation)

+        # Calculate the total number of evaluations
+        total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)
+
+        # Save results periodically
+        if total_evaluations % self.saving_freq == 0:
+            self.save_results()
+
         return self.get_model_scores_df(is_causal)

+    def save_results(self):
+        """Save the evaluation results to Hub and CSV"""
+        results_df = self.get_model_scores_df(is_causal=True)  # Get the latest scores
+        results_dataset = Dataset.from_pandas(results_df)
+        results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
+        results_df.to_csv('human_eval_results.csv')
+
+        # Also save the raw evaluation results
+        masked_df = pd.DataFrame(self.evaluation_results_masked)
+        causal_df = pd.DataFrame(self.evaluation_results_causal)
+
+        if not masked_df.empty:
+            masked_df.to_csv('masked_evaluations.csv')
+        if not causal_df.empty:
+            causal_df.to_csv('causal_evaluations.csv')
+
     def get_model_scores_df(self, is_causal):
         """Convert model scores to DataFrame"""
         scores_data = []
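The newly added save_results() publishes the score table through the datasets API, which the previous version only had commented out. A minimal stand-alone sketch of that call, assuming the same private repo id and a TOKEN with write access; the DataFrame contents here are a placeholder, the real columns come from get_model_scores_df():

import os
import pandas as pd
from datasets import Dataset

TOKEN = os.environ['TOKEN']  # write token, read from the environment as in human_eval.py

# Placeholder score row for illustration only
scores_df = pd.DataFrame([{'Total Comparisons': 4, 'Win Rate (%)': 75.0}])

results_dataset = Dataset.from_pandas(scores_df)
results_dataset.push_to_hub(
    'atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas',  # private results repo used in the file
    private=True,
    token=TOKEN,
)
scores_df.to_csv('human_eval_results.csv')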
@@ -135,16 +217,15 @@ class LMBattleArena:
             'Total Comparisons': stats['total_comparisons'],
             'Win Rate (%)': round(win_rate, 2)
         })
+
+        results_df = pd.DataFrame(scores_data)
+        print("Generated DataFrame:\n", results_df)  # Debugging print

+        # if 'Win Rate (%)' not in results_df.columns:
+        #     raise ValueError("Win Rate (%) column is missing from DataFrame!")
+
         return results_df
+

 def create_battle_arena(dataset_path, is_gif, is_causal):
     arena = LMBattleArena(dataset_path)
@@ -153,7 +234,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
     battle_data = arena.get_next_battle_pair(is_causal)

     if battle_data is None:
+        return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)

     return (
         battle_data['prompt'],
@@ -172,7 +253,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
+        # Rest of the code remains the same
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
@@ -277,13 +358,14 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
    return demo

 if __name__ == "__main__":
+
+    # inference device
+    device = "cpu"
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # load the existing dataset that contains outputs of the LMs
+    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)  # atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas
+
    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)