import base64
import itertools
import os
from collections import defaultdict

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset

# Hugging Face token, expected as an environment/Space secret
TOKEN = os.environ['TOKEN']

MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]
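
# Note: itertools.combinations below yields every unordered model pair, so the
# 6 masked models give C(6, 2) = 15 pairs and the 4 causal models C(4, 2) = 6;
# a dataset of N rows therefore produces at most 15 * N masked battles and
# 6 * N causal battles.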

def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string


def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string
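
# Minimal usage sketch: the returned HTML embeds the file as a base64 data URI,
# so the media needs no separate static route, e.g.:
#   gr.HTML(create_html_media("battle_leaderboard.gif", is_gif=True))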

class LMBattleArena:
    def __init__(self, dataset_path, saving_freq=25):
        """Initialize the battle arena with the evaluation dataset."""
        self.df = pd.read_csv(dataset_path)
        self.current_index = 0
        self.saving_freq = saving_freq  # save to CSV / push to Hub every `saving_freq` evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
        # Generate all possible model pairs
        self.masked_model_pairs = list(itertools.combinations(MASKED_LM_MODELS, 2))
        self.causal_model_pairs = list(itertools.combinations(CAUSAL_LM_MODELS, 2))
        # Pair indices to track which pair is being evaluated
        self.masked_pair_idx = 0
        self.causal_pair_idx = 0
        # Tracks which rows have been evaluated for which model pairs
        self.row_model_pairs_evaluated = set()
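
    # Keys in `row_model_pairs_evaluated` take the form
    # "<row_index>_<masked|causal>_<pair_index>", e.g. "0_causal_3".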

    def get_next_battle_pair(self, is_causal):
        """Retrieve the next pair of model outputs to compare, ensuring every model pair is evaluated."""
        if self.current_index >= len(self.df):
            # Reset the index to go through the dataset again with the remaining model pairs
            self.current_index = 0
            # If we've gone through all model pairs for all rows, we're done
            if is_causal and self.causal_pair_idx >= len(self.causal_model_pairs):
                return None
            elif not is_causal and self.masked_pair_idx >= len(self.masked_model_pairs):
                return None

        row = self.df.iloc[self.current_index]

        # Get the current model pair to evaluate
        if is_causal:
            # Check if we've evaluated all causal model pairs for this row
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                # Move to the next row and reset the pair index
                self.current_index += 1
                self.causal_pair_idx = 0
                # Try again with the next row
                return self.get_next_battle_pair(is_causal)

            model_pair = self.causal_model_pairs[self.causal_pair_idx]
            pair_key = f"{self.current_index}_causal_{self.causal_pair_idx}"

            # Skip row-pair combinations that have already been evaluated
            if pair_key in self.row_model_pairs_evaluated:
                self.causal_pair_idx += 1
                return self.get_next_battle_pair(is_causal)

            # Mark this row-pair combination as evaluated
            self.row_model_pairs_evaluated.add(pair_key)

            # Advance to the next pair for the next evaluation
            self.causal_pair_idx += 1
            # If all pairs for this row are done, move to the next row
            if self.causal_pair_idx >= len(self.causal_model_pairs):
                self.causal_pair_idx = 0
                self.current_index += 1
        else:
            # Same logic for masked models
            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.current_index += 1
                self.masked_pair_idx = 0
                return self.get_next_battle_pair(is_causal)

            model_pair = self.masked_model_pairs[self.masked_pair_idx]
            pair_key = f"{self.current_index}_masked_{self.masked_pair_idx}"

            if pair_key in self.row_model_pairs_evaluated:
                self.masked_pair_idx += 1
                return self.get_next_battle_pair(is_causal)

            self.row_model_pairs_evaluated.add(pair_key)
            self.masked_pair_idx += 1
            if self.masked_pair_idx >= len(self.masked_model_pairs):
                self.masked_pair_idx = 0
                self.current_index += 1

        # Prepare the battle data with the selected model pair
        battle_data = {
            'prompt': row['causal_sentence'] if is_causal else row['masked_sentence'],
            'model_1': row[model_pair[0]],
            'model_2': row[model_pair[1]],
            'model1_name': model_pair[0],
            'model2_name': model_pair[1],
        }
        return battle_data
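
    # Note: each skipped (row, pair) combination adds one recursive call, so on
    # very large datasets a long skip chain could in principle approach Python's
    # default recursion limit (~1000 frames); an iterative loop would avoid that.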

    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record the user's model preference and update the scores."""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1

        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # maps to the first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # maps to the second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded

        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models,
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)

        # Save results periodically
        total_evaluations = len(self.evaluation_results_causal) + len(self.evaluation_results_masked)
        if total_evaluations % self.saving_freq == 0:
            self.save_results()

        return self.get_model_scores_df(is_causal)
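
    # Scoring semantics: "Both Good" credits a win to both models and "Both Bad"
    # to neither, while every vote increments both models' comparison counts, so
    # each model's win rate (wins / total_comparisons) stays within [0, 100] but
    # the Wins column does not sum to the number of votes cast.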

    def save_results(self):
        """Save the evaluation results to the Hub and to CSV."""
        # Combine the masked and causal leaderboards and push them to the Hub
        results_df = pd.concat(
            [self.get_model_scores_df(is_causal=False), self.get_model_scores_df(is_causal=True)],
            ignore_index=True,
        )
        results_dataset = Dataset.from_pandas(results_df)
        results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True, token=TOKEN)
        results_df.to_csv('human_eval_results.csv', index=False)

        # Also save the raw evaluation records
        masked_df = pd.DataFrame(self.evaluation_results_masked)
        causal_df = pd.DataFrame(self.evaluation_results_causal)
        if not masked_df.empty:
            masked_df.to_csv('masked_evaluations.csv', index=False)
        if not causal_df.empty:
            causal_df.to_csv('causal_evaluations.csv', index=False)
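
    # Note: on a standard Hugging Face Space the local CSVs live on ephemeral
    # storage and disappear on restart; the push_to_hub call above is what
    # actually persists the leaderboard.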

    def get_model_scores_df(self, is_causal):
        """Convert the model scores to a DataFrame."""
        reference_models = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
        scores_data = []
        for model, stats in self.model_scores.items():
            if model not in reference_models:
                continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2),
            })
        return pd.DataFrame(scores_data)


def create_battle_arena(dataset_path, is_gif):
    arena = LMBattleArena(dataset_path)

    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)
        if battle_data is None:
            return "All model pairs have been evaluated for all examples!", "", "", "", "", gr.DataFrame(visible=False)
        return (
            battle_data['prompt'],
            battle_data['model_1'],
            battle_data['model_2'],
            battle_data['model1_name'],
            battle_data['model2_name'],
            gr.DataFrame(visible=True),
        )

    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
        with gr.Tabs():
            # The two arenas are identical apart from the tab name and the
            # is_causal flag, so build them in a loop instead of duplicating the layout
            for tab_name, causal_flag in [
                ("Masked LM Battle Arena", False),
                ("Causal LM Battle Arena", True),
            ]:
                with gr.Tab(tab_name):
                    gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                    # gr.State stores the boolean without displaying it
                    is_causal = gr.State(value=causal_flag)
                    input_text = gr.Textbox(
                        label="Input prompt",
                        interactive=False,
                    )
                    with gr.Row():
                        output_1 = gr.Textbox(
                            label="Model A",
                            interactive=False,
                        )
                        model1_name = gr.State()  # hidden state for model 1's name
                    with gr.Row():
                        output_2 = gr.Textbox(
                            label="Model B",
                            interactive=False,
                        )
                        model2_name = gr.State()  # hidden state for model 2's name
                    preferred_models = gr.Radio(
                        label="Which model is better?",
                        choices=["Model A", "Model B", "Both Good", "Both Bad"],
                    )
                    submit_btn = gr.Button("Vote", variant="primary")
                    scores_table = gr.DataFrame(
                        headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                        label="🏆 Leaderboard",
                    )
                    submit_btn.click(
                        submit_preference,
                        inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                        outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                    )
                    demo.load(
                        battle_round,
                        inputs=[is_causal],
                        outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                    )

    return demo


if __name__ == "__main__":
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # Load the existing dataset that contains the LM outputs and cache it locally as CSV
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)

    demo = create_battle_arena(dataset_path, is_gif)
    demo.launch(debug=True)
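
# Usage sketch (assumes `TOKEN` holds a Hugging Face token with read access to
# atlasia/LM-Moroccan-Darija-Bench and write access to the results dataset;
# `hf_xxx` is a placeholder):
#   TOKEN=hf_xxx python app.py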