BounharAbdelaziz committed
Commit 5f36137 · verified · Parent: e5d0438

first working version

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ battle_leaderboard.gif filter=lfs diff=lfs merge=lfs -text
+ battle_leaderboard.jpeg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,12 @@
- ---
- title: Moroccan Darija LLM Battle Al Atlas
- emoji: 👀
- colorFrom: yellow
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.17.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Arabic Summarization Model Battle Arena
+ emoji: 📉
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.13.1
+ app_file: human_eval.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
battle_leaderboard.gif ADDED

Git LFS Details

  • SHA256: 45e8c896dba02b0b5a3b5c527ad35ca512eea57b635c9741c36906191d0e03e6
  • Pointer size: 132 Bytes
  • Size of remote file: 9.48 MB
battle_leaderboard.jpeg ADDED

Git LFS Details

  • SHA256: b3c3666e6b6184fb51c45e9b5ead95d448998442f413f26d43c7e35172289c35
  • Pointer size: 132 Bytes
  • Size of remote file: 2.36 MB
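Both media files resolve through Git LFS, so a fresh clone initially contains only 132-byte pointer files rather than the media itself. One way to confirm the real objects were fetched is to re-derive the SHA256 digests listed above; a minimal sketch, assuming the files sit in the current working directory:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA256 so large LFS objects need not fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Expected digests, taken from the LFS details above.
assert sha256_of("battle_leaderboard.gif") == "45e8c896dba02b0b5a3b5c527ad35ca512eea57b635c9741c36906191d0e03e6"
assert sha256_of("battle_leaderboard.jpeg") == "b3c3666e6b6184fb51c45e9b5ead95d448998442f413f26d43c7e35172289c35"
```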
human_eval.py ADDED
@@ -0,0 +1,190 @@
+ import gradio as gr
+ import os
+ import base64
+ import random
+ from collections import defaultdict
+
+ import pandas as pd
+ from datasets import Dataset, load_dataset
+
+ def encode_image_to_base64(image_path):
+     """Encode an image or GIF file to base64."""
+     with open(image_path, "rb") as file:
+         encoded_string = base64.b64encode(file.read()).decode()
+     return encoded_string
+
+ def create_html_media(media_path, is_gif=False):
+     """Create HTML that embeds an image or GIF as a base64 data URI."""
+     media_base64 = encode_image_to_base64(media_path)
+     media_type = "gif" if is_gif else "jpeg"
+
+     html_string = f"""
+     <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
+         <div style="max-width: 450px; margin: auto;">
+             <img src="data:image/{media_type};base64,{media_base64}"
+                  style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
+                  alt="Displayed Media">
+         </div>
+     </div>
+     """
+     return html_string
+
+ class LMBattleArena:
+     def __init__(self, dataset_path):
+         """Initialize the battle arena with a CSV dataset of model outputs."""
+         self.df = pd.read_csv(dataset_path)
+         print(self.df.head())
+         self.current_index = 0
+         self.saving_freq = 10  # save to CSV / push to the Hub every 10 evaluations
+         self.evaluation_results = []
+         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+
+     def get_next_battle_pair(self):
+         """Retrieve the next pair of summaries for comparison."""
+         if self.current_index >= len(self.df):
+             return None
+
+         row = self.df.iloc[self.current_index]
+         # Every column except the prompt holds one model's output.
+         model_summary_cols = [col for col in row.index if col.upper() != 'PROMPT']
+         selected_models = random.sample(model_summary_cols, 2)
+         battle_data = {
+             'prompt': row['prompt'],
+             'model_1': row[selected_models[0]],
+             'model_2': row[selected_models[1]],
+             'model1_name': selected_models[0],
+             'model2_name': selected_models[1],
+         }
+         self.current_index += 1
+         return battle_data
+
+     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
+         """Record the user's model preference and update the scores."""
+         self.model_scores[model1_name]['total_comparisons'] += 1
+         self.model_scores[model2_name]['total_comparisons'] += 1
+
+         if preferred_models == "Both Good":
+             self.model_scores[model1_name]['wins'] += 1
+             self.model_scores[model2_name]['wins'] += 1
+         elif preferred_models == "Model A":  # maps to the first model
+             self.model_scores[model1_name]['wins'] += 1
+         elif preferred_models == "Model B":  # maps to the second model
+             self.model_scores[model2_name]['wins'] += 1
+         # "Both Bad" case: no wins recorded
+
+         evaluation = {
+             'input_text': input_text,
+             'output1': output1,
+             'output2': output2,
+             'model1_name': model1_name,
+             'model2_name': model2_name,
+             'preferred_models': preferred_models,
+         }
+         self.evaluation_results.append(evaluation)
+
+         return self.get_model_scores_df()
+
+     def get_model_scores_df(self):
+         """Convert model scores to a leaderboard DataFrame."""
+         scores_data = []
+         for model, stats in self.model_scores.items():
+             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
+             scores_data.append({
+                 'Model': model,
+                 'Wins': stats['wins'],
+                 'Total Comparisons': stats['total_comparisons'],
+                 'Win Rate (%)': round(win_rate, 2),
+             })
+         results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
+
+         # Periodically save the results to a Hugging Face dataset and a local CSV.
+         if self.current_index % self.saving_freq == 0 and self.current_index > 0:
+             results_dataset = Dataset.from_pandas(results_df.reset_index(drop=True))
+             results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
+             results_df.to_csv('human_eval_results.csv', index=False)
+
+         return results_df
+
+
+ def create_battle_arena(dataset_path, is_gif):
+     arena = LMBattleArena(dataset_path)
+
+     def battle_round():
+         battle_data = arena.get_next_battle_pair()
+         if battle_data is None:
+             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
+
+         return (
+             battle_data['prompt'],
+             battle_data['model_1'],
+             battle_data['model_2'],
+             battle_data['model1_name'],
+             battle_data['model2_name'],
+             gr.DataFrame(visible=True),
+         )
+
+     def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
+         scores_df = arena.record_evaluation(
+             preferred_models, input_text, output_1, output_2, model1_name, model2_name
+         )
+         next_battle = battle_round()
+         # Swap the visibility update for the refreshed leaderboard.
+         return (*next_battle[:-1], scores_df)
+
+     with gr.Blocks(css="footer{display:none !important}") as demo:
+         base_path = os.path.dirname(__file__)
+         local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
+         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
+
+         with gr.Tabs():
+             with gr.Tab("Battle Arena"):
+                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+
+                 input_text = gr.Textbox(
+                     label="Input prompt",
+                     interactive=False,
+                 )
+
+                 with gr.Row():
+                     output_1 = gr.Textbox(
+                         label="Model A",
+                         interactive=False,
+                     )
+                     model1_name = gr.State()  # hidden state for model 1's name
+
+                 with gr.Row():
+                     output_2 = gr.Textbox(
+                         label="Model B",
+                         interactive=False,
+                     )
+                     model2_name = gr.State()  # hidden state for model 2's name
+
+                 preferred_models = gr.Radio(
+                     label="Which model is better?",
+                     choices=["Model A", "Model B", "Both Good", "Both Bad"],
+                 )
+                 submit_btn = gr.Button("Vote", variant="primary")
+
+                 scores_table = gr.DataFrame(
+                     headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
+                     label="🏆 Leaderboard",
+                 )
+
+                 submit_btn.click(
+                     submit_preference,
+                     inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
+                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
+                 )
+
+         demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
+
+     return demo
+
+ if __name__ == "__main__":
+     # Export the existing dataset of LM outputs to a local CSV for the arena to read.
+     load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
+
+     dataset_path = 'human_eval_dataset.csv'
+     is_gif = True
+     demo = create_battle_arena(dataset_path, is_gif)
+     demo.launch(debug=True)
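The arena assumes `human_eval_dataset.csv` has a `prompt` column plus one column per model, with column names doubling as model names on the leaderboard. A minimal offline sketch of that contract, using hypothetical model names and outputs, and assuming `human_eval.py` is importable from the working directory:

```python
import pandas as pd
from human_eval import LMBattleArena

# Hypothetical dataset: one prompt column, one output column per model.
pd.DataFrame({
    "prompt": ["Summarize: text 1", "Summarize: text 2"],
    "model_a": ["summary 1a", "summary 2a"],
    "model_b": ["summary 1b", "summary 2b"],
}).to_csv("human_eval_dataset.csv", index=False)

arena = LMBattleArena("human_eval_dataset.csv")
pair = arena.get_next_battle_pair()

# "Both Good" credits a win to both models; "Both Bad" credits neither.
leaderboard = arena.record_evaluation(
    "Both Good", pair["prompt"], pair["model_1"], pair["model_2"],
    pair["model1_name"], pair["model2_name"],
)
print(leaderboard)  # both models: 1 win / 1 comparison = 100.0% win rate
```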
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ pandas
+ transformers
+ torch==2.5.1
+ datasets