""" UI Components ============= HTML generation functions for the Gradio interface """ from typing import List from config import TokenizerInfo, TokenizationMetrics from utils import is_arabic_char def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str: """Generate beautiful HTML visualization of tokens""" colors = [ ('#1a1a2e', '#eaeaea'), ('#16213e', '#f0f0f0'), ('#0f3460', '#ffffff'), ('#533483', '#f5f5f5'), ('#e94560', '#ffffff'), ('#0f4c75', '#f0f0f0'), ('#3282b8', '#ffffff'), ('#bbe1fa', '#1a1a2e'), ] html_parts = [] for i, (token, tid) in enumerate(zip(tokens, token_ids)): bg, fg = colors[i % len(colors)] display_token = token.replace('<', '<').replace('>', '>') is_arabic = any(is_arabic_char(c) for c in token) direction = 'rtl' if is_arabic else 'ltr' html_parts.append(f''' {display_token} {tid} ''') return f'''
{''.join(html_parts)}
''' def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str: """Generate metrics visualization card""" fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor" strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor" compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor" return f'''
📊
{metrics.total_tokens}
Total Tokens
🎯
{metrics.fertility:.3f}
Fertility (tokens/word)
Lower is better (1.0 ideal)
📦
{metrics.compression_ratio:.2f}
Compression (bytes/token)
Higher is better
{metrics.single_token_retention_rate:.1%}
STRR (Single Token Retention)
Higher is better
🔤
{metrics.char_per_token:.2f}
Characters/Token
{metrics.oov_percentage:.1f}%
OOV Rate
Lower is better (0% ideal)
🌍
{metrics.arabic_fertility:.3f}
Arabic Fertility
{metrics.tokenization_time_ms:.2f}ms
Processing Time
''' def generate_tokenizer_info_card(info: TokenizerInfo) -> str: """Generate tokenizer information card""" dialect_badges = ''.join([f'{d}' for d in info.dialect_support]) feature_badges = ''.join([f'{f}' for f in info.special_features]) support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited" return f'''

{info.name}

{info.organization}

{info.description}

Type: {info.type.value}
Algorithm: {info.algorithm.value}
Vocab Size: {info.vocab_size:,}
Arabic Support: {info.arabic_support}
Dialects: {dialect_badges}
Features: {feature_badges}
''' def generate_decoded_section(metrics: TokenizationMetrics) -> str: """Generate decoded output section""" return f'''

Decoded Output

{metrics.decoded_text}
Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}
''' def generate_about_html(tokenizers_by_type: dict, total_count: int) -> str: """Generate About page HTML""" # Build tokenizer lists sections = [] for category, tokenizers in tokenizers_by_type.items(): if tokenizers: items = ''.join([f'
  • {t}
  • ' for t in tokenizers[:12]]) if len(tokenizers) > 12: items += f'
  • ...and {len(tokenizers) - 12} more
  • ' sections.append(f'''

    {category}

    ''') return f'''

    🏟️ Arabic Tokenizer Arena Pro

    A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions

    {total_count}
    Available Tokenizers
    8
    Evaluation Datasets
    8+
    Metrics

    📚 Available Tokenizers

    {''.join(sections)}

    ✨ Features

    📊 Comprehensive efficiency metrics (fertility, compression, STRR)
    🌍 Arabic-specific analysis (dialect support, diacritic preservation)
    ⚖️ Side-by-side tokenizer comparison
    🎨 Beautiful token visualization
    🏆 Leaderboard with real HuggingFace datasets
    📖 Support for MSA, dialectal, and Classical Arabic

    🎯 Use Cases

    🔬 Research

    Compare tokenizers for Arabic NLP experiments

    🚀 Production

    Select optimal tokenizer for deployment

    📚 Education

    Understand how different algorithms handle Arabic

    💰 Optimization

    Identify cost-efficient tokenizers for API usage

    '''