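"""Gradio battle arena for pairwise human evaluation of Moroccan Darija language models.

Outputs of two anonymized models are shown side by side; the voter's preference is
recorded, a win-rate leaderboard is maintained, and results are periodically pushed
to the Hugging Face Hub.
"""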
import gradio as gr
from collections import defaultdict
import os
import base64
from datasets import (
    Dataset,
    load_dataset,
)
import pandas as pd
import itertools

TOKEN = os.environ['TOKEN']
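# The token must allow reading the benchmark dataset and pushing the (private)
# results dataset; see save_results() and the __main__ block below.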


MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]
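# The model names above double as columns in the evaluation dataset, each holding
# that model's precomputed output for every prompt (see get_next_battle_pair).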

def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string

def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string
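
# Example usage (this mirrors how the demo embeds the leaderboard GIF below):
#   gr.HTML(create_html_media("battle_leaderboard.gif", is_gif=True))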

class LMBattleArena:
    def __init__(self, dataset_path, saving_freq=25):
        """Initialize the battle arena with the dataset of precomputed model outputs"""
        self.df = pd.read_csv(dataset_path)
        self.current_index = 0
        self.saving_freq = saving_freq  # save to CSV / push to the Hub every `saving_freq` evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
        
        # Generate all possible model pairs
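        # (itertools.combinations yields each unordered pair exactly once,
        # e.g. combinations([A, B, C], 2) -> (A, B), (A, C), (B, C))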
        self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2))
        self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2))
                
        # Pair indices to track which pair is being evaluated
        self.masked_pair_idx = 0
        self.causal_pair_idx = 0
        
        # Tracks which (row, model pair) combinations have already been evaluated,
        # using keys of the form "<row_index>_<mode>_<pair_index>"
        self.row_model_pairs_evaluated = set()
    
    def get_next_battle_pair(self, is_causal):
        """Retrieve the next pair of model outputs for comparison, ensuring every
        (row, model pair) combination is evaluated exactly once."""
        
        # Stop once every (row, model pair) combination for this mode has been
        # evaluated. The evaluated set is checked directly because the pair
        # indices below wrap back to 0 and so cannot signal exhaustion themselves.
        prefix = 'causal' if is_causal else 'masked'
        pairs = self.causal_model_pairs if is_causal else self.masked_model_pairs
        n_evaluated = sum(1 for key in self.row_model_pairs_evaluated if f'_{prefix}_' in key)
        if n_evaluated >= len(self.df) * len(pairs):
            return None
        
        if self.current_index >= len(self.df):
            # Wrap around to go through the dataset again with the remaining model pairs
            self.current_index = 0
        
        row = self.df.iloc[self.current_index]
        
        # Get the current model pair to evaluate
        if is_causal:
            # Check if we've evaluated all causal model pairs
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                # Move to next row and reset pair index
                self.current_index += 1
                self.causal_pair_idx = 0
                # Try again with the next row
                return self.get_next_battle_pair(is_causal)
            
            model_pair = self.causal_model_pairs[self.causal_pair_idx]
            pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}"
            
            # Check if this row-pair combination has been evaluated
            if pair_key in self.row_model_pairs_evaluated:
                # Move to next pair
                self.causal_pair_idx += 1
                return self.get_next_battle_pair(is_causal)
            
            # Mark this row-pair combination as evaluated
            self.row_model_pairs_evaluated.add(pair_key)
            # Move to next pair for next evaluation
            self.causal_pair_idx += 1
            
            # Check if we've gone through all pairs for this row
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                # Reset pair index and move to next row for next evaluation
                self.causal_pair_idx = 0
                self.current_index += 1
        else:
            # Similar logic for masked models
            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.current_index += 1
                self.masked_pair_idx = 0
                return self.get_next_battle_pair(is_causal)
            
            model_pair = self.masked_model_pairs[self.masked_pair_idx]
            pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}"
            
            if pair_key in self.row_model_pairs_evaluated:
                self.masked_pair_idx += 1
                return self.get_next_battle_pair(is_causal)
            
            self.row_model_pairs_evaluated.add(pair_key)
            self.masked_pair_idx += 1
            
            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.masked_pair_idx = 0
                self.current_index += 1
        
        # Prepare the battle data with the selected model pair
        battle_data = {
            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
            'model_1': row[model_pair[0]],
            'model_2': row[model_pair[1]],
            'model1_name': model_pair[0],
            'model2_name': model_pair[1]
        }
        
        return battle_data
    
    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record user's model preference and update scores"""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1
        
        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded
        
        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)
        
        # Calculate the total number of evaluations
        total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)
        
        # Save results periodically
        if total_evaluations % self.saving_freq == 0:
            self.save_results()
            
        return self.get_model_scores_df(is_causal)
    
    def save_results(self):
        """Save the evaluation results to the Hub and to local CSV files"""
        # Combine the latest scores from both arenas so neither leaderboard is dropped
        results_df = pd.concat(
            [self.get_model_scores_df(is_causal=True), self.get_model_scores_df(is_causal=False)],
            ignore_index=True,
        )
        results_dataset = Dataset.from_pandas(results_df)
        results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
        results_df.to_csv('human_eval_results.csv', index=False)
        
        # Also save the raw evaluation results
        masked_df = pd.DataFrame(self.evaluation_results_masked)
        causal_df = pd.DataFrame(self.evaluation_results_causal)
        
        if not masked_df.empty:
            masked_df.to_csv('masked_evaluations.csv', index=False)
        if not causal_df.empty:
            causal_df.to_csv('causal_evaluations.csv', index=False)
    
    def get_model_scores_df(self, is_causal):
        """Convert model scores to DataFrame"""
        scores_data = []
        expected_models = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
        for model, stats in self.model_scores.items():
            # Keep only the models that belong to the requested arena
            if model not in expected_models:
                continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })

        return pd.DataFrame(scores_data)


def create_battle_arena(dataset_path, is_gif, is_causal):
    arena = LMBattleArena(dataset_path)
    
    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)
        
        if battle_data is None:
            return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)
        
        return (
            battle_data['prompt'], 
            battle_data['model_1'], 
            battle_data['model_2'],
            battle_data['model1_name'], 
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )
    
    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
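        # Return the next battle's five text fields plus the refreshed leaderboard,
        # matching the `outputs` list wired to the submit buttons below.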
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
        
        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=False)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=True)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                        
    return demo

if __name__ == "__main__":
    
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True
    
    # Materialize the precomputed LM outputs from the Hub as a local CSV
    # for the arena to iterate over
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)

    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)