BounharAbdelaziz committed on
Commit 685fed5 · verified · 1 Parent(s): af3a421

fix to loop through all possibilities

Files changed (1)
  1. human_eval.py +134 -52
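The change replaces the old random.sample pairing with an exhaustive walk over every unordered model pair for every dataset row, which is what "loop through all possibilities" refers to. A minimal, self-contained sketch of that pairing scheme (placeholder model names and row count, not the app's actual lists or dataset):

import itertools

# Stand-ins for CAUSAL_LM_MODELS / MASKED_LM_MODELS and the dataset rows.
models = ["model_a", "model_b", "model_c"]
rows = range(4)

# Every unordered pair of distinct models: C(3, 2) = 3 pairs here.
model_pairs = list(itertools.combinations(models, 2))

# One battle per (row, pair) combination, so each example is judged on every
# pair instead of on two models drawn at random as before.
for row in rows:
    for model_a, model_b in model_pairs:
        print(f"row {row}: {model_a} vs {model_b}")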
human_eval.py CHANGED
@@ -2,17 +2,33 @@ import gradio as gr
from collections import defaultdict
import os
import base64
- import torch
from datasets import (
    Dataset,
    load_dataset,
)
- import random
import pandas as pd
from collections import defaultdict
+ import itertools

TOKEN = os.environ['TOKEN']

+
+ MASKED_LM_MODELS = [
+     "BounharAbdelaziz/XLM-RoBERTa-Morocco",
+     "SI2M-Lab/DarijaBERT",
+     "BounharAbdelaziz/ModernBERT-Morocco",
+     "google-bert/bert-base-multilingual-cased",
+     "FacebookAI/xlm-roberta-large",
+     "aubmindlab/bert-base-arabertv02",
+ ]
+
+ CAUSAL_LM_MODELS = [
+     "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
+     "Qwen/Qwen2.5-0.5B",
+     "tiiuae/Falcon3-1B-Base",
+     "MBZUAI-Paris/Atlas-Chat-2B",
+ ]
+
def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
@@ -35,58 +51,101 @@ def create_html_media(media_path, is_gif=False):
    """
    return html_string

- MASKED_LM_MODELS = [
-     "BounharAbdelaziz/XLM-RoBERTa-Morocco",
-     "SI2M-Lab/DarijaBERT",
-     "BounharAbdelaziz/ModernBERT-Morocco",
-     "google-bert/bert-base-multilingual-cased",
-     "FacebookAI/xlm-roberta-large",
-     "aubmindlab/bert-base-arabertv02",
- ]
-
- CAUSAL_LM_MODELS = [
-     "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
-     "Qwen/Qwen2.5-0.5B",
-     "tiiuae/Falcon3-1B-Base",
-     "MBZUAI-Paris/Atlas-Chat-2B",
- ]
-
class LMBattleArena:
-     def __init__(self, dataset_path):
+     def __init__(self, dataset_path, saving_freq=25):
        """Initialize battle arena with dataset"""
        self.df = pd.read_csv(dataset_path)
-         print(self.df.head())
        self.current_index = 0
-         self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
+         self.saving_freq = saving_freq # save the results in csv/push to hub every saving_freq evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+
+         # Generate all possible model pairs
+         self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2))
+         self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2))
+
+         # Pair indices to track which pair is being evaluated
+         self.masked_pair_idx = 0
+         self.causal_pair_idx = 0
+
+         # To track which rows have been evaluated for which model pairs
+         self.row_model_pairs_evaluated = set()  # Using a simple set

    def get_next_battle_pair(self, is_causal):
-         """Retrieve next pair of summaries for comparison"""
-         # if self.current_index >= len(self.df):
-         #     return None
+         """Retrieve next pair of summaries for comparison ensuring all pairs are evaluated"""
+
+         if self.current_index >= len(self.df):
+             # Reset index to go through dataset again with remaining model pairs
+             self.current_index = 0
+
+         # If we've gone through all model pairs for all rows, we're done
+         if is_causal and self.causal_pair_idx >= len(self.causal_model_pairs):
+             return None
+         elif not is_causal and self.masked_pair_idx >= len(self.masked_model_pairs):
+             return None

        row = self.df.iloc[self.current_index]
+
+         # Get the current model pair to evaluate
        if is_causal:
-             model_summary_cols = [
-                 col
-                 for col in CAUSAL_LM_MODELS
-             ]
+             # Check if we've evaluated all causal model pairs
+             if self.causal_pair_idx >= len(self.causal_model_pairs):
+                 # Move to next row and reset pair index
+                 self.current_index += 1
+                 self.causal_pair_idx = 0
+                 # Try again with the next row
+                 return self.get_next_battle_pair(is_causal)
+
+             model_pair = self.causal_model_pairs[self.causal_pair_idx]
+             pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}"
+
+             # Check if this row-pair combination has been evaluated
+             if pair_key in self.row_model_pairs_evaluated:
+                 # Move to next pair
+                 self.causal_pair_idx += 1
+                 return self.get_next_battle_pair(is_causal)
+
+             # Mark this row-pair combination as evaluated
+             self.row_model_pairs_evaluated.add(pair_key)
+             # Move to next pair for next evaluation
+             self.causal_pair_idx += 1
+
+             # Check if we've gone through all pairs for this row
+             if self.causal_pair_idx >= len(self.causal_model_pairs):
+                 # Reset pair index and move to next row for next evaluation
+                 self.causal_pair_idx = 0
+                 self.current_index += 1
        else:
-             model_summary_cols = [
-                 col
-                 for col in MASKED_LM_MODELS
-             ]
-         selected_models = random.sample(model_summary_cols, 2)
+             # Similar logic for masked models
+             if self.masked_pair_idx >= len(self.masked_model_pairs):
+                 self.current_index += 1
+                 self.masked_pair_idx = 0
+                 return self.get_next_battle_pair(is_causal)
+
+             model_pair = self.masked_model_pairs[self.masked_pair_idx]
+             pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}"
+
+             if pair_key in self.row_model_pairs_evaluated:
+                 self.masked_pair_idx += 1
+                 return self.get_next_battle_pair(is_causal)
+
+             self.row_model_pairs_evaluated.add(pair_key)
+             self.masked_pair_idx += 1
+
+             if self.masked_pair_idx >= len(self.masked_model_pairs):
+                 self.masked_pair_idx = 0
+                 self.current_index += 1
+
+         # Prepare the battle data with the selected model pair
        battle_data = {
            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
-             'model_1': row[selected_models[0]],
-             'model_2': row[selected_models[1]],
-             'model1_name': selected_models[0],
-             'model2_name': selected_models[1]
+             'model_1': row[model_pair[0]],
+             'model_2': row[model_pair[1]],
+             'model1_name': model_pair[0],
+             'model2_name': model_pair[1]
        }
-         self.current_index += 1
+
        return battle_data

    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
@@ -116,8 +175,31 @@ class LMBattleArena:
        else:
            self.evaluation_results_masked.append(evaluation)

+         # Calculate the total number of evaluations
+         total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)
+
+         # Save results periodically
+         if total_evaluations % self.saving_freq == 0:
+             self.save_results()
+
        return self.get_model_scores_df(is_causal)

+     def save_results(self):
+         """Save the evaluation results to Hub and CSV"""
+         results_df = self.get_model_scores_df(is_causal=True)  # Get the latest scores
+         results_dataset = Dataset.from_pandas(results_df)
+         results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
+         results_df.to_csv('human_eval_results.csv')
+
+         # Also save the raw evaluation results
+         masked_df = pd.DataFrame(self.evaluation_results_masked)
+         causal_df = pd.DataFrame(self.evaluation_results_causal)
+
+         if not masked_df.empty:
+             masked_df.to_csv('masked_evaluations.csv')
+         if not causal_df.empty:
+             causal_df.to_csv('causal_evaluations.csv')
+
    def get_model_scores_df(self, is_causal):
        """Convert model scores to DataFrame"""
        scores_data = []
@@ -135,16 +217,15 @@ class LMBattleArena:
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })
-         results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
+
+         results_df = pd.DataFrame(scores_data)
+         print("Generated DataFrame:\n", results_df)  # Debugging print

-         # save the results in a huggingface dataset
-         if self.current_index % self.saving_freq == 0 and self.current_index > 0:
-             # results_dataset = Dataset.from_pandas(results_df)
-             # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
-             results_df.to_csv('human_eval_results.csv')
-
+         # if 'Win Rate (%)' not in results_df.columns:
+         #     raise ValueError("Win Rate (%) column is missing from DataFrame!")
+
        return results_df
-
+

def create_battle_arena(dataset_path, is_gif, is_causal):
    arena = LMBattleArena(dataset_path)
@@ -153,7 +234,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)

        if battle_data is None:
-             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
+             return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)

        return (
            battle_data['prompt'],
@@ -172,7 +253,7 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
-
+         # Rest of the code remains the same
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
@@ -277,13 +358,14 @@ def create_battle_arena(dataset_path, is_gif, is_causal):
    return demo

if __name__ == "__main__":
-
+
+     # inference device
+     device = "cpu"
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # load the existing dataset that contains outputs of the LMs
-     human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)
-
-     # load first tab for masked LM
+     human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)  # atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas
+
    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)
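The commit also adds periodic checkpointing in save_results(). A rough standalone sketch of that save pattern, assuming the same repo id, filenames, and token handling as in the diff (the maybe_save helper name is illustrative, not part of the app):

import pandas as pd
from datasets import Dataset

def maybe_save(results, saving_freq=25, token=None):
    """Illustrative helper: checkpoint every `saving_freq` recorded evaluations."""
    if not results or len(results) % saving_freq != 0:
        return
    df = pd.DataFrame(results)
    df.to_csv('human_eval_results.csv')  # local CSV snapshot
    if token is not None:                # push to a private Hub dataset
        Dataset.from_pandas(df).push_to_hub(
            'atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas',
            private=True,
            token=token,
        )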