Spaces:

HeshamHaroon
/

Arabic_Tokenizer

Running

File size: 8,589 Bytes

f32d4c7

"""
Tokenization Analysis
=====================
Core analysis functions for evaluating tokenizers
"""

import time
from typing import Tuple
from config import TokenizerInfo, TokenizationMetrics
from utils import count_arabic_chars, get_arabic_words, has_diacritics, is_arabic_char
from tokenizer_manager import tokenizer_manager


def analyze_tokenization(
    text: str,
    model_id: str,
    tokenizer_info: TokenizerInfo
) -> TokenizationMetrics:
    """Tokenize ``text`` with the tokenizer for ``model_id`` and gather metrics.

    The tokenizer is obtained from ``tokenizer_manager``. ``tokenizer_info``
    is accepted as part of the signature but is not read here.

    Returns a ``TokenizationMetrics`` holding raw counts, efficiency ratios,
    OOV figures, per-word subword statistics, Arabic-specific figures, the
    elapsed tokenize+encode time in milliseconds, and the tokens, token ids
    and decoded text themselves.
    """
    tok = tokenizer_manager.get_tokenizer(model_id)

    # Wall-clock cost of one tokenize() plus one encode() call, in ms.
    started = time.perf_counter()
    tokens = tok.tokenize(text)
    token_ids = tok.encode(text, add_special_tokens=False)
    elapsed_ms = (time.perf_counter() - started) * 1000

    decoded = tok.decode(token_ids, skip_special_tokens=True)

    # Raw sizes of the input and of its tokenization.
    words = text.split()
    n_words = len(words)
    n_tokens = len(tokens)
    n_chars = len(text)
    n_bytes = len(text.encode('utf-8'))

    # Denominators are clamped to 1 so empty input yields 0 rather than
    # raising ZeroDivisionError.
    word_denominator = max(n_words, 1)
    token_denominator = max(n_tokens, 1)

    fertility = n_tokens / word_denominator
    compression_ratio = n_bytes / token_denominator
    char_per_token = n_chars / token_denominator

    # A token counts as OOV when it equals the tokenizer's unknown token or
    # contains the literal '[UNK]' marker.
    if hasattr(tok, 'unk_token'):
        unk = tok.unk_token
    else:
        unk = '[UNK]'
    oov_count = len([t for t in tokens if t == unk or '[UNK]' in str(t)])
    oov_percentage = oov_count / token_denominator * 100

    # Each whitespace-separated word is tokenized on its own to see how many
    # pieces it breaks into.
    pieces_per_word = [len(tok.tokenize(w)) for w in words]
    single_token_words = pieces_per_word.count(1)

    strr = single_token_words / word_denominator
    avg_subwords = sum(pieces_per_word) / max(len(pieces_per_word), 1)
    max_subwords = max(pieces_per_word, default=0)
    continued_ratio = (n_words - single_token_words) / word_denominator

    # Arabic-specific figures.
    arabic_char_count = count_arabic_chars(text)
    arabic_words = get_arabic_words(text)
    arabic_tokens_count = sum(
        1 for t in tokens if any(is_arabic_char(ch) for ch in str(t))
    )

    if arabic_words:
        arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1)
    else:
        arabic_fertility = 0

    # Compares presence of diacritics before and after the round trip, not
    # the individual marks.
    diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)

    return TokenizationMetrics(
        total_tokens=n_tokens,
        total_words=n_words,
        total_characters=n_chars,
        total_bytes=n_bytes,
        fertility=fertility,
        compression_ratio=compression_ratio,
        char_per_token=char_per_token,
        oov_count=oov_count,
        oov_percentage=oov_percentage,
        single_token_words=single_token_words,
        single_token_retention_rate=strr,
        avg_subwords_per_word=avg_subwords,
        max_subwords_per_word=max_subwords,
        continued_words_ratio=continued_ratio,
        arabic_char_count=arabic_char_count,
        arabic_token_count=arabic_tokens_count,
        arabic_fertility=arabic_fertility,
        diacritic_preservation=diacritic_preserved,
        tokenization_time_ms=elapsed_ms,
        tokens=tokens,
        token_ids=token_ids,
        decoded_text=decoded
    )


def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
    """Analyze ``text`` with a single tokenizer and return four HTML fragments.

    Args:
        tokenizer_choice: The tokenizer selection as received from the caller;
            it is resolved to a model id through ``tokenizer_manager``.
        text: The text to analyze.

    Returns:
        A 4-tuple ``(info_html, metrics_html, tokens_html, decoded_html)``.
        When the input is blank, no tokenizer is chosen, the tokenizer is
        unknown, or the analysis raises, the first element carries a
        warning/error fragment and the remaining three are empty strings.
    """
    from html import escape

    # Validate inputs first; these paths need neither the tokenizer manager
    # nor the UI helpers.
    if not text or not text.strip():
        return (
            '<div class="warning">⚠️ Please enter some text to analyze</div>',
            '', '', ''
        )

    if not tokenizer_choice:
        return (
            '<div class="warning">⚠️ Please select a tokenizer</div>',
            '', '', ''
        )

    model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
    tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)

    if not tokenizer_info:
        return (
            '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
            '', '', ''
        )

    # Kept as a function-scope import like the original (presumably to avoid
    # an import cycle with the UI module - confirm), but deferred until the
    # rendering helpers are actually needed.
    from ui_components import (
        generate_tokenizer_info_card,
        generate_metrics_card,
        generate_token_visualization,
        generate_decoded_section
    )

    try:
        metrics = analyze_tokenization(text, model_id, tokenizer_info)

        info_html = generate_tokenizer_info_card(tokenizer_info)
        metrics_html = generate_metrics_card(metrics, tokenizer_info)
        tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
        decoded_html = generate_decoded_section(metrics)

        return info_html, metrics_html, tokens_html, decoded_html

    except Exception as e:
        # Exception text may contain markup-like content (e.g. '<unk>' or
        # echoed user input), so escape it before embedding it in HTML.
        return (
            f'<div class="error-card"><h4>Error</h4><p>{escape(str(e))}</p></div>',
            '', '', ''
        )


def compare_tokenizers(tokenizer_choices: list, text: str) -> str:
    """Compare several tokenizers on ``text`` and return an HTML table.

    Args:
        tokenizer_choices: Tokenizer selections; each is resolved to a model
            id through ``tokenizer_manager``. At least two are required.
            Choices that resolve to no known tokenizer are skipped.
        text: The text to analyze.

    Returns:
        An HTML fragment. For blank text or fewer than two choices this is a
        warning ``<div>``; otherwise a table with one row per tokenizer,
        ordered by ascending fertility, with tokenizers whose analysis raised
        listed last alongside their error message.
    """
    # Import the function rather than the module: the name `html` is a
    # natural local here and would shadow the module.
    from html import escape

    if not text or not text.strip():
        return '<div class="warning">⚠️ Please enter some text to analyze</div>'

    if not tokenizer_choices or len(tokenizer_choices) < 2:
        return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'

    results = []

    for choice in tokenizer_choices:
        model_id = tokenizer_manager.get_model_id_from_choice(choice)
        tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)

        if not tokenizer_info:
            continue

        entry = {
            'name': tokenizer_info.name,
            'org': tokenizer_info.organization,
            'type': tokenizer_info.type.value,
        }
        try:
            entry['metrics'] = analyze_tokenization(text, model_id, tokenizer_info)
        except Exception as e:
            # One failing tokenizer must not abort the whole comparison.
            entry['error'] = str(e)
        results.append(entry)

    # Sort by fertility (lower is better). Failed entries use +inf so they
    # always sort last, even when a real fertility is very large (e.g. a
    # long input containing no whitespace).
    def get_fertility(x):
        if 'error' in x:
            return float('inf')
        return x['metrics'].fertility

    results.sort(key=get_fertility)

    rank_classes = {1: 'rank-1', 2: 'rank-2', 3: 'rank-3'}

    # Collect fragments and join once instead of repeated string +=.
    parts = ['''
    <div class="comparison-container">
        <table class="comparison-table">
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Tokenizer</th>
                    <th>Type</th>
                    <th>Tokens</th>
                    <th>Fertility ↓</th>
                    <th>Compression ↑</th>
                    <th>STRR ↑</th>
                    <th>OOV %</th>
                </tr>
            </thead>
            <tbody>
    ''']

    for rank, result in enumerate(results, start=1):
        rank_class = rank_classes.get(rank, '')

        if 'error' in result:
            # Exception text may contain markup-like content such as
            # '<unk>', so escape it before embedding it in HTML.
            parts.append(f'''
                <tr class="{rank_class}">
                    <td>#{rank}</td>
                    <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                    <td>{result['type']}</td>
                    <td colspan="5" class="error">Error: {escape(result['error'])}</td>
                </tr>
            ''')
        else:
            m = result['metrics']
            if m.fertility < 1.5:
                fertility_class = 'excellent'
            elif m.fertility < 2.5:
                fertility_class = 'good'
            else:
                fertility_class = 'poor'

            parts.append(f'''
                <tr class="{rank_class}">
                    <td><strong>#{rank}</strong></td>
                    <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                    <td>{result['type']}</td>
                    <td>{m.total_tokens}</td>
                    <td class="{fertility_class}">{m.fertility:.3f}</td>
                    <td>{m.compression_ratio:.2f}</td>
                    <td>{m.single_token_retention_rate:.1%}</td>
                    <td>{m.oov_percentage:.1f}%</td>
                </tr>
            ''')

    parts.append('''
            </tbody>
        </table>
    </div>
    ''')

    return ''.join(parts)