| """ | |
| Leaderboard Module | |
| ================== | |
| Evaluate tokenizers on real HuggingFace Arabic datasets | |
| """ | |
| import json | |
| import os | |
| import statistics | |
| from typing import Dict, List, Tuple, Optional | |
| from collections import defaultdict | |
| import gradio as gr | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| from config import LEADERBOARD_DATASETS | |
| from tokenizer_manager import tokenizer_manager | |
| # File path for persistent storage of submitted tokenizers | |
| SUBMISSIONS_FILE = os.path.join(os.path.dirname(__file__), "submissions.json") | |
| # File path for cached leaderboard results | |
| LEADERBOARD_CACHE_FILE = os.path.join(os.path.dirname(__file__), "leaderboard_cache.json") | |
def load_submitted_tokenizers() -> Dict[str, Dict]:
    """Load submitted tokenizers from persistent storage"""
    if os.path.exists(SUBMISSIONS_FILE):
        try:
            with open(SUBMISSIONS_FILE, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            return {}
    return {}

def save_submitted_tokenizer(model_id: str, data: Dict) -> None:
    """Save a submitted tokenizer to persistent storage"""
    submissions = load_submitted_tokenizers()
    submissions[model_id] = data
    try:
        with open(SUBMISSIONS_FILE, 'w', encoding='utf-8') as f:
            json.dump(submissions, f, indent=2, ensure_ascii=False)
    except IOError as e:
        print(f"Warning: Could not save submission: {e}")

def load_leaderboard_cache() -> Optional[Dict]:
    """Load cached leaderboard results"""
    if os.path.exists(LEADERBOARD_CACHE_FILE):
        try:
            with open(LEADERBOARD_CACHE_FILE, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            return None
    return None

def save_leaderboard_cache(leaderboard_html: str, per_dataset_html: str, status: str) -> None:
    """Save leaderboard results to cache"""
    cache_data = {
        "leaderboard_html": leaderboard_html,
        "per_dataset_html": per_dataset_html,
        "status": status
    }
    try:
        with open(LEADERBOARD_CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False)
    except IOError as e:
        print(f"Warning: Could not save leaderboard cache: {e}")

class HFDatasetLoader:
    """Load Arabic datasets from HuggingFace"""

    def __init__(self):
        self.cache = {}

    def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
        """Load texts from a HuggingFace dataset"""
        if dataset_key in self.cache:
            return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"

        config = LEADERBOARD_DATASETS.get(dataset_key)
        if not config:
            return [], f"❌ Unknown dataset: {dataset_key}"

        try:
            # Load dataset from HuggingFace
            if config.get("subset"):
                ds = load_dataset(
                    config["hf_id"],
                    config["subset"],
                    split=config["split"],
                    trust_remote_code=True
                )
            else:
                ds = load_dataset(
                    config["hf_id"],
                    split=config["split"],
                    trust_remote_code=True
                )

            texts = []
            text_col = config["text_column"]

            # Try to find a usable text column if the configured one is missing
            if text_col not in ds.column_names:
                for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
                    if col in ds.column_names:
                        text_col = col
                        break

            # Extract texts, skipping very short samples
            max_samples = config.get("samples", 500)
            for i, item in enumerate(ds):
                if i >= max_samples:
                    break
                text = item.get(text_col, "")
                if text and isinstance(text, str) and len(text.strip()) > 10:
                    texts.append(text.strip())

            self.cache[dataset_key] = texts
            return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
        except Exception as e:
            return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"

def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
    """Evaluate a tokenizer on a list of texts"""
    fertilities = []
    compressions = []
    unk_counts = 0
    total_tokens = 0

    # Resolve the UNK token once; tokenizers without one fall back to '[UNK]'
    unk_token = getattr(tokenizer, 'unk_token', '[UNK]')

    for text in texts:
        try:
            tokens = tokenizer.encode(text, add_special_tokens=False)
            decoded = tokenizer.convert_ids_to_tokens(tokens)
            num_tokens = len(tokens)
            num_words = len(text.split()) or 1
            num_bytes = len(text.encode('utf-8'))

            fertility = num_tokens / num_words
            compression = num_bytes / num_tokens if num_tokens > 0 else 0

            # Count UNKs (by identity or by common UNK spellings)
            unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))

            fertilities.append(fertility)
            compressions.append(compression)
            unk_counts += unks
            total_tokens += num_tokens
        except Exception:
            continue

    if not fertilities:
        return None

    return {
        "avg_fertility": statistics.mean(fertilities),
        "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
        "avg_compression": statistics.mean(compressions),
        "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
        "samples": len(fertilities)
    }

def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
    """Calculate overall score (0-100, higher is better)"""
    # Lower fertility is better (ideal ~1.0 for Arabic)
    fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
    # Higher compression is better
    compression_score = min(1, compression / 6)
    # Lower UNK ratio is better
    unk_score = 1 - min(1, unk_ratio * 20)
    # Weighted combination
    score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
    return round(score, 1)

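
# Worked example of the scoring above (illustrative inputs, not measured):
#   fertility=1.5   -> fertility_score   = min(1, 2.0 / 1.5) = 1.0 (capped)
#   compression=4.0 -> compression_score = min(1, 4.0 / 6)   ~ 0.667
#   unk_ratio=0.001 -> unk_score         = 1 - min(1, 0.02)  = 0.98
#   score = (1.0 * 0.45 + 0.667 * 0.35 + 0.98 * 0.20) * 100  -> 87.9
# i.e. calculate_leaderboard_score(1.5, 4.0, 0.001) == 87.9
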
def get_cached_leaderboard(progress=gr.Progress()) -> Tuple[str, str, str]:
    """
    Get leaderboard results from cache if available.
    If no cache exists, show a message prompting the user to run the evaluation.
    Returns: (leaderboard_html, per_dataset_html, status_message)
    """
    cache = load_leaderboard_cache()
    if cache:
        # Cached HTML is returned as-is; submissions added after the cache was
        # written only appear once 'Re-evaluate All' rebuilds the leaderboard.
        return (
            cache.get("leaderboard_html", ""),
            cache.get("per_dataset_html", ""),
            cache.get("status", "") + "\n\n📦 *Loaded from cache. Click 'Re-evaluate All' to refresh.*"
        )

    # No cache exists - show a message prompting the user to run the evaluation
    no_data_html = """
    <div style="text-align: center; padding: 40px; background: #22272e; border-radius: 12px; border: 1px solid #30363d;">
        <p style="color: #8b949e; font-size: 16px; margin-bottom: 16px;">📊 No evaluation data available yet.</p>
        <p style="color: #e6edf3; font-size: 14px;">Click the <strong>"Re-evaluate All"</strong> button above to run the full evaluation.</p>
        <p style="color: #8b949e; font-size: 12px; margin-top: 12px;">This will evaluate all tokenizers on all 8 Arabic datasets (~5-10 minutes).</p>
    </div>
    """
    return (
        no_data_html,
        no_data_html,
        "⚠️ **No cached results found.** Click 'Re-evaluate All' to run the evaluation."
    )

def run_leaderboard_evaluation(
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """
    Run the full leaderboard evaluation with real HF datasets.
    Evaluates ALL tokenizers on ALL datasets.
    Returns: (leaderboard_html, per_dataset_html, status_message)
    """
    # Use ALL datasets
    selected_datasets = list(LEADERBOARD_DATASETS.keys())
    # Use ALL available tokenizers
    selected_tokenizers = tokenizer_manager.get_tokenizer_choices()

    loader = HFDatasetLoader()
    results = defaultdict(dict)

    # Status tracking
    status_lines = []

    # Load datasets from HuggingFace
    status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
    loaded_datasets = {}
    for i, ds_key in enumerate(selected_datasets):
        progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
        texts, msg = loader.load_dataset_texts(ds_key)
        ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
        status_lines.append(f" • {ds_name}: {msg}")
        if texts:
            loaded_datasets[ds_key] = texts

    if not loaded_datasets:
        return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"

    # Evaluate tokenizers
    status_lines.append("\n🔍 **Evaluating Tokenizers:**\n")
    tokenizer_cache = {}
    total_steps = len(selected_tokenizers) * len(loaded_datasets)
    current_step = 0

    for tok_choice in selected_tokenizers:
        # Get model ID from choice
        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
        tok_name = tok_info.name if tok_info else tok_choice

        # Load tokenizer
        try:
            if tok_id not in tokenizer_cache:
                tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
                    tok_id, trust_remote_code=True
                )
            tokenizer = tokenizer_cache[tok_id]
            status_lines.append(f" • {tok_name}: ✅ Loaded")
        except Exception as e:
            status_lines.append(f" • {tok_name}: ❌ Failed ({str(e)[:30]})")
            continue

        # Evaluate on each dataset
        for ds_key, texts in loaded_datasets.items():
            current_step += 1
            progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
            metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
            if metrics:
                results[tok_choice][ds_key] = metrics

    # Generate leaderboard
    progress(0.95, "Generating leaderboard...")
    leaderboard_data = []
    per_dataset_data = []

    for tok_choice, ds_results in results.items():
        if not ds_results:
            continue
        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)

        # Aggregate across datasets
        all_fertility = [m["avg_fertility"] for m in ds_results.values()]
        all_compression = [m["avg_compression"] for m in ds_results.values()]
        all_unk = [m["unk_ratio"] for m in ds_results.values()]

        avg_fertility = statistics.mean(all_fertility)
        avg_compression = statistics.mean(all_compression)
        avg_unk = statistics.mean(all_unk)
        score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)

        leaderboard_data.append({
            "name": tok_info.name if tok_info else tok_choice,
            "type": tok_info.type.value if tok_info else "Unknown",
            "org": tok_info.organization if tok_info else "Unknown",
            "score": score,
            "fertility": avg_fertility,
            "compression": avg_compression,
            "unk_ratio": avg_unk,
            "num_datasets": len(ds_results)
        })

        # Per-dataset row
        per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
        for ds_key in selected_datasets:
            ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
            if ds_key in ds_results:
                per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
            else:
                per_ds_row[ds_name] = "-"
        per_dataset_data.append(per_ds_row)

    # Add submitted tokenizers to the leaderboard
    submitted = load_submitted_tokenizers()
    for model_id, sub_data in submitted.items():
        # Check if already in leaderboard (avoid duplicates)
        if any(d["name"] == sub_data["name"] for d in leaderboard_data):
            continue
        leaderboard_data.append({
            "name": sub_data["name"],
            "type": sub_data.get("type", "Custom"),
            "org": sub_data.get("org", "Community"),
            "score": sub_data["score"],
            "fertility": sub_data["fertility"],
            "compression": sub_data["compression"],
            "unk_ratio": sub_data["unk_ratio"],
            "num_datasets": len(sub_data.get("per_dataset", {}))
        })

        # Add per-dataset row for the submitted tokenizer
        per_ds_row = {"Tokenizer": sub_data["name"]}
        per_dataset_results = sub_data.get("per_dataset", {})
        for ds_key in selected_datasets:
            ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
            if ds_key in per_dataset_results:
                per_ds_row[ds_name] = round(per_dataset_results[ds_key]["avg_fertility"], 2)
            else:
                per_ds_row[ds_name] = "-"
        per_dataset_data.append(per_ds_row)

    # Sort by score
    leaderboard_data.sort(key=lambda x: x["score"], reverse=True)

    # Create HTML tables
    leaderboard_html = generate_leaderboard_html(leaderboard_data)
    per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)

    status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
    status_message = "\n".join(status_lines)

    # Save results to cache
    save_leaderboard_cache(leaderboard_html, per_dataset_html, status_message)

    return leaderboard_html, per_dataset_html, status_message

def generate_leaderboard_html(data: List[Dict]) -> str:
    """Generate HTML for the main leaderboard - dark theme design"""
    if not data:
        return "<p style='color: #e6edf3;'>No results to display</p>"

    html = """
    <style>
        .leaderboard-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            font-size: 14px;
            margin: 16px 0;
        }
        .leaderboard-table th {
            background: #1a5f2a;
            color: #fff;
            padding: 12px 10px;
            text-align: left;
            font-weight: 500;
            border-bottom: 2px solid #145022;
        }
        .leaderboard-table td {
            padding: 10px;
            border-bottom: 1px solid #30363d;
            color: #e6edf3;
        }
        .leaderboard-table tr:nth-child(even) {
            background-color: #1c2128;
        }
        .leaderboard-table tr:nth-child(odd) {
            background-color: #22272e;
        }
        .leaderboard-table tr:hover {
            background-color: #2d333b;
        }
        .leaderboard-table .rank-1 td { background: rgba(255, 215, 0, 0.15); }
        .leaderboard-table .rank-2 td { background: rgba(192, 192, 192, 0.15); }
        .leaderboard-table .rank-3 td { background: rgba(205, 127, 50, 0.15); }
        .score-badge {
            background: #2d8f4e;
            color: #fff;
            padding: 4px 10px;
            border-radius: 4px;
            font-weight: 600;
            font-size: 13px;
        }
        .type-badge {
            background: #30363d;
            color: #8b949e;
            padding: 3px 8px;
            border-radius: 3px;
            font-size: 12px;
        }
        .metric-good { color: #10b981; font-weight: 500; }
        .metric-bad { color: #f87171; font-weight: 500; }
        .rank-medal { font-size: 16px; margin-right: 4px; }
    </style>
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th>Rank</th>
                <th>Tokenizer</th>
                <th>Type</th>
                <th>Organization</th>
                <th>Score</th>
                <th>Fertility</th>
                <th>Compression</th>
                <th>UNK Rate</th>
                <th>Datasets</th>
            </tr>
        </thead>
        <tbody>
    """

    for i, entry in enumerate(data):
        rank = i + 1
        rank_class = f"rank-{rank}" if rank <= 3 else ""

        # Medal for top 3
        if rank == 1:
            rank_display = '<span class="rank-medal">🥇</span> 1'
        elif rank == 2:
            rank_display = '<span class="rank-medal">🥈</span> 2'
        elif rank == 3:
            rank_display = '<span class="rank-medal">🥉</span> 3'
        else:
            rank_display = f"#{rank}"

        fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
        comp_class = "metric-good" if entry["compression"] > 3.5 else ""
        unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""

        html += f"""
        <tr class="{rank_class}">
            <td><strong>{rank_display}</strong></td>
            <td><strong>{entry["name"]}</strong></td>
            <td><span class="type-badge">{entry["type"]}</span></td>
            <td>{entry["org"]}</td>
            <td><span class="score-badge">{entry["score"]}</span></td>
            <td class="{fert_class}">{entry["fertility"]:.3f}</td>
            <td class="{comp_class}">{entry["compression"]:.2f}</td>
            <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
            <td>{entry["num_datasets"]}</td>
        </tr>
        """

    html += """
        </tbody>
    </table>
    <div style="margin-top: 12px; padding: 12px 16px; background: #22272e; border-left: 3px solid #2d8f4e; font-size: 13px; color: #8b949e; border-radius: 0 8px 8px 0;">
        <strong style="color: #e6edf3;">Metrics:</strong>
        Score (0-100, higher=better) •
        Fertility (tokens/word, lower=better) •
        Compression (bytes/token, higher=better) •
        UNK Rate (lower=better)
    </div>
    """
    return html

def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
    """Generate HTML for the per-dataset fertility table - dark theme design"""
    if not data:
        return "<p style='color: #e6edf3;'>No per-dataset results</p>"

    ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]

    html = """
    <style>
        .dataset-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            font-size: 13px;
            margin: 16px 0;
        }
        .dataset-table th {
            background: #1a5f2a;
            color: #fff;
            padding: 10px 8px;
            text-align: center;
            font-weight: 500;
        }
        .dataset-table th:first-child {
            text-align: left;
        }
        .dataset-table td {
            padding: 8px;
            text-align: center;
            border-bottom: 1px solid #30363d;
            color: #e6edf3;
        }
        .dataset-table td:first-child {
            text-align: left;
            font-weight: 500;
        }
        .dataset-table tr:nth-child(even) {
            background-color: #1c2128;
        }
        .dataset-table tr:nth-child(odd) {
            background-color: #22272e;
        }
        .dataset-table tr:hover {
            background-color: #2d333b;
        }
        .fert-excellent { background: rgba(16, 185, 129, 0.25); color: #34d399; font-weight: 500; }
        .fert-good { background: rgba(245, 158, 11, 0.25); color: #fbbf24; font-weight: 500; }
        .fert-poor { background: rgba(248, 113, 113, 0.25); color: #f87171; font-weight: 500; }
    </style>
    <table class="dataset-table">
        <thead>
            <tr>
                <th>Tokenizer</th>
    """
    for ds_name in ds_names:
        html += f"<th>{ds_name}</th>"
    html += """
            </tr>
        </thead>
        <tbody>
    """

    for row in data:
        html += f"<tr><td>{row['Tokenizer']}</td>"
        for ds_name in ds_names:
            val = row.get(ds_name, "-")
            if val != "-":
                # Color-code fertility: lower is better
                if val < 1.8:
                    cls = "fert-excellent"
                elif val < 2.5:
                    cls = "fert-good"
                else:
                    cls = "fert-poor"
                html += f'<td class="{cls}">{val}</td>'
            else:
                html += '<td>-</td>'
        html += "</tr>"

    html += """
        </tbody>
    </table>
    """
    return html

def evaluate_submitted_tokenizer(
    model_id: str,
    model_name: str,
    organization: str,
    model_type: str,
    progress=gr.Progress()
) -> Tuple[str, str]:
    """
    Evaluate a user-submitted tokenizer on ALL datasets.
    Returns: (result_html, status_message)
    """
    if not model_id or not model_id.strip():
        return "", "❌ Please enter a HuggingFace model ID (e.g., 'google/gemma-2-9b')"

    model_id = model_id.strip()

    # Use ALL datasets
    selected_datasets = list(LEADERBOARD_DATASETS.keys())

    # Try to load the tokenizer
    progress(0.1, f"Loading tokenizer: {model_id}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    except Exception as e:
        return "", f"❌ Failed to load tokenizer '{model_id}': {str(e)[:100]}"

    # Get tokenizer info
    vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer)
    display_name = model_name.strip() if model_name and model_name.strip() else model_id.split('/')[-1]
    org = organization.strip() if organization and organization.strip() else (model_id.split('/')[0] if '/' in model_id else "Unknown")

    progress(0.2, "Loading datasets...")

    # Load datasets
    loader = HFDatasetLoader()
    loaded_datasets = {}
    for ds_key in selected_datasets:
        texts, _ = loader.load_dataset_texts(ds_key)
        if texts:
            loaded_datasets[ds_key] = texts

    if not loaded_datasets:
        return "", "❌ Failed to load any datasets for evaluation"

    # Evaluate
    progress(0.4, "Evaluating tokenizer...")
    all_fertility = []
    all_compression = []
    all_unk = []
    per_dataset_results = {}

    for ds_key, texts in loaded_datasets.items():
        progress(0.4 + (len(per_dataset_results) / len(loaded_datasets)) * 0.4,
                 f"Evaluating on {LEADERBOARD_DATASETS[ds_key]['name']}...")
        metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
        if metrics:
            per_dataset_results[ds_key] = metrics
            all_fertility.append(metrics["avg_fertility"])
            all_compression.append(metrics["avg_compression"])
            all_unk.append(metrics["unk_ratio"])

    if not all_fertility:
        return "", "❌ Evaluation failed - no valid results"

    # Calculate overall metrics
    avg_fertility = statistics.mean(all_fertility)
    avg_compression = statistics.mean(all_compression)
    avg_unk = statistics.mean(all_unk)
    score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)

    progress(0.9, "Saving results...")

    # Save submission to persistent storage
    submission_data = {
        "name": display_name,
        "org": org,
        "type": model_type or "Custom",
        "vocab_size": vocab_size,
        "score": score,
        "fertility": avg_fertility,
        "compression": avg_compression,
        "unk_ratio": avg_unk,
        "per_dataset": per_dataset_results
    }
    save_submitted_tokenizer(model_id, submission_data)

    # Generate result HTML
    result_html = generate_submission_result_html(
        display_name, org, model_type, vocab_size, score,
        avg_fertility, avg_compression, avg_unk,
        per_dataset_results, selected_datasets
    )

    status = f"✅ **{display_name}** has been evaluated on {len(loaded_datasets)} datasets and added to the leaderboard! Refresh the Leaderboard tab to see the updated rankings."
    return result_html, status

def generate_submission_result_html(
    name: str, org: str, model_type: str, vocab_size: int, score: float,
    fertility: float, compression: float, unk_ratio: float,
    per_dataset: Dict, dataset_keys: List[str]
) -> str:
    """Generate HTML for submission results - dark theme design"""
    # Determine score quality
    if score >= 70:
        score_color = "#10b981"
        score_label = "Excellent"
    elif score >= 50:
        score_color = "#4a90d9"
        score_label = "Good"
    elif score >= 30:
        score_color = "#f59e0b"
        score_label = "Fair"
    else:
        score_color = "#f87171"
        score_label = "Needs Improvement"

    html = f"""
    <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;">
        <div style="background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%); color: white; padding: 24px; border-radius: 12px; margin-bottom: 20px;">
            <h2 style="margin: 0 0 8px 0; font-size: 24px;">🎉 Evaluation Results</h2>
            <p style="margin: 0; opacity: 0.9; font-size: 14px;">{name} by {org}</p>
        </div>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 16px; margin-bottom: 24px;">
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center; border-left: 4px solid {score_color};">
                <div style="font-size: 32px; font-weight: 700; color: {score_color};">{score}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Overall Score</div>
                <div style="font-size: 11px; color: {score_color}; font-weight: 500;">{score_label}</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{fertility:.3f}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Fertility</div>
                <div style="font-size: 11px; color: #8b949e;">tokens/word</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{compression:.2f}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Compression</div>
                <div style="font-size: 11px; color: #8b949e;">bytes/token</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{unk_ratio:.2%}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">UNK Rate</div>
                <div style="font-size: 11px; color: #8b949e;">unknown tokens</div>
            </div>
        </div>
        <div style="background: #22272e; padding: 16px; border-radius: 8px; margin-bottom: 20px; border: 1px solid #30363d;">
            <h4 style="margin: 0 0 12px 0; color: #e6edf3; font-size: 14px;">Model Details</h4>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; font-size: 13px;">
                <div><span style="color: #8b949e;">Type:</span> <strong style="color: #e6edf3;">{model_type or 'Custom'}</strong></div>
                <div><span style="color: #8b949e;">Vocab Size:</span> <strong style="color: #e6edf3;">{vocab_size:,}</strong></div>
            </div>
        </div>
        <h4 style="margin: 0 0 12px 0; color: #e6edf3; font-size: 14px;">📊 Per-Dataset Results</h4>
        <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
            <thead>
                <tr style="background: #1a5f2a; color: white;">
                    <th style="padding: 10px; text-align: left;">Dataset</th>
                    <th style="padding: 10px; text-align: center;">Fertility</th>
                    <th style="padding: 10px; text-align: center;">Compression</th>
                    <th style="padding: 10px; text-align: center;">Samples</th>
                </tr>
            </thead>
            <tbody>
    """

    for ds_key in dataset_keys:
        if ds_key in per_dataset:
            m = per_dataset[ds_key]
            ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
            fert_val = m["avg_fertility"]
            # Color-code fertility: lower is better
            if fert_val < 1.8:
                fert_style = "background: rgba(16, 185, 129, 0.25); color: #34d399;"
            elif fert_val < 2.5:
                fert_style = "background: rgba(245, 158, 11, 0.25); color: #fbbf24;"
            else:
                fert_style = "background: rgba(248, 113, 113, 0.25); color: #f87171;"
            html += f"""
                <tr style="border-bottom: 1px solid #30363d; background: #22272e;">
                    <td style="padding: 10px; color: #e6edf3;">{ds_name}</td>
                    <td style="padding: 10px; text-align: center; {fert_style} font-weight: 500;">{fert_val:.3f}</td>
                    <td style="padding: 10px; text-align: center; color: #e6edf3;">{m["avg_compression"]:.2f}</td>
                    <td style="padding: 10px; text-align: center; color: #8b949e;">{m["samples"]}</td>
                </tr>
            """

    html += """
            </tbody>
        </table>
    </div>
    """
    return html
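

if __name__ == "__main__":
    # Minimal smoke test - a sketch, not part of the Gradio app. Assumes
    # network access; 'bert-base-multilingual-cased' is just a stand-in
    # tokenizer and the two Arabic sentences are illustrative samples.
    sample_texts = [
        "اللغة العربية من أكثر اللغات انتشاراً في العالم",
        "تقيس هذه الوحدة كفاءة المجزئات على نصوص عربية حقيقية",
    ]
    tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    m = evaluate_tokenizer_on_texts(tok, sample_texts)
    if m:
        print("Metrics:", m)
        print("Score:", calculate_leaderboard_score(
            m["avg_fertility"], m["avg_compression"], m["unk_ratio"]))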