"""
UI Components
=============
HTML generation functions for the Gradio interface
"""
from typing import List

from config import TokenizerInfo, TokenizationMetrics
from utils import is_arabic_char


def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
    """Generate an HTML visualization of tokens as colored chips."""
    # (background, foreground) pairs, cycled across tokens.
    colors = [
        ('#1a1a2e', '#eaeaea'),
        ('#16213e', '#f0f0f0'),
        ('#0f3460', '#ffffff'),
        ('#533483', '#f5f5f5'),
        ('#e94560', '#ffffff'),
        ('#0f4c75', '#f0f0f0'),
        ('#3282b8', '#ffffff'),
        ('#bbe1fa', '#1a1a2e'),
    ]
    html_parts = []
    for i, (token, tid) in enumerate(zip(tokens, token_ids)):
        bg, fg = colors[i % len(colors)]
        # Escape angle brackets so special tokens like <s> render literally.
        display_token = token.replace('<', '&lt;').replace('>', '&gt;')
        # Render Arabic tokens right-to-left.
        is_arabic = any(is_arabic_char(c) for c in token)
        direction = 'rtl' if is_arabic else 'ltr'
        html_parts.append(f'''
        <span class="token-chip" dir="{direction}" style="background:{bg}; color:{fg};">
            <span class="token-text">{display_token}</span>
            <span class="token-id">{tid}</span>
        </span>''')
    return f'''
    <div class="token-visualization">
        {''.join(html_parts)}
    </div>'''
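
# Example (illustrative ids): generate_token_visualization(['مرحبا', '<s>'], [2841, 1])
# yields a <div> of two chips; '<s>' is shown literally as &lt;s&gt; rather than
# being parsed as markup by the browser.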

def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
    """Generate the metrics visualization card."""
    # Bucket each headline metric into a quality level, used below as a CSS class hook.
    fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
    strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
    compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
    return f'''
    <div class="metrics-grid">
        <div class="metric-card">
            <div class="metric-icon">📊</div>
            <div class="metric-value">{metrics.total_tokens}</div>
            <div class="metric-label">Total Tokens</div>
        </div>
        <div class="metric-card {fertility_quality}">
            <div class="metric-icon">🎯</div>
            <div class="metric-value">{metrics.fertility:.3f}</div>
            <div class="metric-label">Fertility (tokens/word)</div>
            <div class="metric-note">Lower is better (1.0 ideal)</div>
        </div>
        <div class="metric-card {compression_quality}">
            <div class="metric-icon">📦</div>
            <div class="metric-value">{metrics.compression_ratio:.2f}</div>
            <div class="metric-label">Compression (bytes/token)</div>
            <div class="metric-note">Higher is better</div>
        </div>
        <div class="metric-card {strr_quality}">
            <div class="metric-icon">✨</div>
            <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
            <div class="metric-label">STRR (Single Token Retention)</div>
            <div class="metric-note">Higher is better</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">🔤</div>
            <div class="metric-value">{metrics.char_per_token:.2f}</div>
            <div class="metric-label">Characters/Token</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">❓</div>
            <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
            <div class="metric-label">OOV Rate</div>
            <div class="metric-note">Lower is better (0% ideal)</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">🌍</div>
            <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
            <div class="metric-label">Arabic Fertility</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">⚡</div>
            <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
            <div class="metric-label">Processing Time</div>
        </div>
    </div>'''

def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
    """Generate the tokenizer information card."""
    dialect_badges = ''.join(f'<span class="badge dialect">{d}</span>' for d in info.dialect_support)
    feature_badges = ''.join(f'<span class="badge feature">{f}</span>' for f in info.special_features)
    # Map the Arabic-support level to a CSS class hook.
    support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited"
    return f'''
    <div class="info-card">
        <p class="info-description">{info.description}</p>
        <div class="info-row"><span class="info-label">Type:</span>
            <span class="info-value">{info.type.value}</span></div>
        <div class="info-row"><span class="info-label">Algorithm:</span>
            <span class="info-value">{info.algorithm.value}</span></div>
        <div class="info-row"><span class="info-label">Vocab Size:</span>
            <span class="info-value">{info.vocab_size:,}</span></div>
        <div class="info-row"><span class="info-label">Arabic Support:</span>
            <span class="info-value {support_class}">{info.arabic_support}</span></div>
        <div class="info-row"><span class="info-label">Dialects:</span>
            <span class="info-value">{dialect_badges}</span></div>
        <div class="info-row"><span class="info-label">Features:</span>
            <span class="info-value">{feature_badges}</span></div>
    </div>'''

def generate_decoded_section(metrics: TokenizationMetrics) -> str:
    """Generate the decoded-output section."""
    return f'''
    <div class="decoded-section">
        <h4>Decoded Output</h4>
        <p class="decoded-text">{metrics.decoded_text}</p>
        <p class="decoded-note">Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}</p>
    </div>'''

def generate_about_html(tokenizers_by_type: dict, total_count: int) -> str:
    """Generate the About page HTML."""
    # Build one section per tokenizer category, capped at 12 visible entries.
    sections = []
    for category, tokenizers in tokenizers_by_type.items():
        if tokenizers:
            items = ''.join(f'<span class="tokenizer-chip">{t}</span>' for t in tokenizers[:12])
            if len(tokenizers) > 12:
                items += f'<span class="tokenizer-chip more">...and {len(tokenizers) - 12} more</span>'
            sections.append(f'''
            <div class="tokenizer-category">
                <h4>{category}</h4>
                <div class="tokenizer-list">{items}</div>
            </div>''')
    return f'''
    <div class="about-page">
        <div class="about-hero">
            <div class="hero-count">{total_count}</div>
            <div class="hero-label">Available Tokenizers</div>
        </div>
        <h3>📚 Available Tokenizers</h3>
        {''.join(sections)}
        <h3>✨ Features</h3>
        <ul class="feature-list">
            <li>📊 Comprehensive efficiency metrics (fertility, compression, STRR)</li>
            <li>🌍 Arabic-specific analysis (dialect support, diacritic preservation)</li>
            <li>⚖️ Side-by-side tokenizer comparison</li>
            <li>🎨 Beautiful token visualization</li>
            <li>🏆 Leaderboard with real HuggingFace datasets</li>
            <li>📖 Support for MSA, dialectal, and Classical Arabic</li>
        </ul>
        <h3>🎯 Use Cases</h3>
        <ul class="use-case-list">
            <li><strong>🔬 Research</strong>: Compare tokenizers for Arabic NLP experiments</li>
            <li><strong>🚀 Production</strong>: Select the optimal tokenizer for deployment</li>
            <li><strong>📚 Education</strong>: Understand how different algorithms handle Arabic</li>
            <li><strong>💰 Optimization</strong>: Identify cost-efficient tokenizers for API usage</li>
        </ul>
    </div>'''
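

# Minimal smoke test (sketch): run this module directly to print sample markup.
# It exercises only generate_token_visualization, since the metric and info
# cards require TokenizationMetrics/TokenizerInfo instances from config.py.
# The tokens and ids below are illustrative, not output of a real tokenizer.
if __name__ == "__main__":
    sample_tokens = ['مرحبا', '▁بال', 'عالم', '<s>']
    sample_ids = [2841, 519, 1077, 1]
    print(generate_token_visualization(sample_tokens, sample_ids))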