| """ | |
| Tokenization Analysis | |
| ===================== | |
| Core analysis functions for evaluating tokenizers | |
| """ | |
| import time | |
| from typing import Tuple | |
| from config import TokenizerInfo, TokenizationMetrics | |
| from utils import count_arabic_chars, get_arabic_words, has_diacritics, is_arabic_char | |
| from tokenizer_manager import tokenizer_manager | |
| def analyze_tokenization( | |
| text: str, | |
| model_id: str, | |
| tokenizer_info: TokenizerInfo | |
| ) -> TokenizationMetrics: | |
| """Perform comprehensive tokenization analysis""" | |
| tokenizer = tokenizer_manager.get_tokenizer(model_id) | |
| # Time the tokenization | |
| start_time = time.perf_counter() | |
| tokens = tokenizer.tokenize(text) | |
| token_ids = tokenizer.encode(text, add_special_tokens=False) | |
| tokenization_time = (time.perf_counter() - start_time) * 1000 | |
| decoded = tokenizer.decode(token_ids, skip_special_tokens=True) | |
| # Basic counts | |
| words = text.split() | |
| total_words = len(words) | |
| total_tokens = len(tokens) | |
| total_characters = len(text) | |
| total_bytes = len(text.encode('utf-8')) | |
| # Efficiency metrics | |
| fertility = total_tokens / max(total_words, 1) | |
| compression_ratio = total_bytes / max(total_tokens, 1) | |
| char_per_token = total_characters / max(total_tokens, 1) | |
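    # Worked example (illustrative numbers, not from any particular tokenizer):
    # a 10-word sentence that encodes to 25 tokens and 80 UTF-8 bytes gives
    # fertility = 25 / 10 = 2.5 tokens per word and compression = 80 / 25 = 3.2
    # bytes per token; lower fertility and higher compression are better.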
    # OOV analysis
    unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]'
    oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t))
    oov_percentage = (oov_count / max(total_tokens, 1)) * 100
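    # Note: byte-level BPE vocabularies can encode any byte sequence, so they
    # rarely (if ever) emit an UNK token; the OOV metrics mainly differentiate
    # tokenizers whose vocabularies lack byte-level fallback.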
    # Single Token Retention Rate (STRR)
    single_token_words = 0
    subwords_per_word = []
    for word in words:
        word_tokens = tokenizer.tokenize(word)
        subwords_per_word.append(len(word_tokens))
        if len(word_tokens) == 1:
            single_token_words += 1
    strr = single_token_words / max(total_words, 1)
    avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1)
    max_subwords = max(subwords_per_word) if subwords_per_word else 0
    continued_ratio = (total_words - single_token_words) / max(total_words, 1)
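    # Worked example (illustrative): if 6 of 10 words survive as single tokens,
    # STRR = 6 / 10 = 0.6 and the continued-word ratio is 1 - 0.6 = 0.4.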
    # Arabic-specific metrics
    arabic_char_count = count_arabic_chars(text)
    arabic_words = get_arabic_words(text)
    arabic_tokens_count = 0
    for token in tokens:
        if any(is_arabic_char(c) for c in str(token)):
            arabic_tokens_count += 1
    arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0
    diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)
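    # Note: this is a presence-only check; it compares whether diacritics appear
    # at all in the original versus the round-tripped text, not whether each
    # diacritic survives in its original position.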
    return TokenizationMetrics(
        total_tokens=total_tokens,
        total_words=total_words,
        total_characters=total_characters,
        total_bytes=total_bytes,
        fertility=fertility,
        compression_ratio=compression_ratio,
        char_per_token=char_per_token,
        oov_count=oov_count,
        oov_percentage=oov_percentage,
        single_token_words=single_token_words,
        single_token_retention_rate=strr,
        avg_subwords_per_word=avg_subwords,
        max_subwords_per_word=max_subwords,
        continued_words_ratio=continued_ratio,
        arabic_char_count=arabic_char_count,
        arabic_token_count=arabic_tokens_count,
        arabic_fertility=arabic_fertility,
        diacritic_preservation=diacritic_preserved,
        tokenization_time_ms=tokenization_time,
        tokens=tokens,
        token_ids=token_ids,
        decoded_text=decoded
    )


def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
    """Analyze a single tokenizer - returns HTML outputs"""
    from ui_components import (
        generate_tokenizer_info_card,
        generate_metrics_card,
        generate_token_visualization,
        generate_decoded_section
    )

    if not text or not text.strip():
        return (
            '<div class="warning">⚠️ Please enter some text to analyze</div>',
            '', '', ''
        )

    if not tokenizer_choice:
        return (
            '<div class="warning">⚠️ Please select a tokenizer</div>',
            '', '', ''
        )

    model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
    tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)

    if not tokenizer_info:
        return (
            '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
            '', '', ''
        )

    try:
        metrics = analyze_tokenization(text, model_id, tokenizer_info)
        info_html = generate_tokenizer_info_card(tokenizer_info)
        metrics_html = generate_metrics_card(metrics, tokenizer_info)
        tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
        decoded_html = generate_decoded_section(metrics)
        return info_html, metrics_html, tokens_html, decoded_html
    except Exception as e:
        return (
            f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
            '', '', ''
        )


def compare_tokenizers(tokenizer_choices: list, text: str) -> str:
    """Compare multiple tokenizers - returns HTML table"""
    if not text or not text.strip():
        return '<div class="warning">⚠️ Please enter some text to analyze</div>'

    if not tokenizer_choices or len(tokenizer_choices) < 2:
        return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'

    results = []
    for choice in tokenizer_choices:
        model_id = tokenizer_manager.get_model_id_from_choice(choice)
        tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
        if tokenizer_info:
            try:
                metrics = analyze_tokenization(text, model_id, tokenizer_info)
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'metrics': metrics
                })
            except Exception as e:
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'error': str(e)
                })

    # Sort by fertility (lower is better); failed tokenizers sort last
    def get_fertility(x):
        if 'error' in x:
            return 999
        return x['metrics'].fertility

    results.sort(key=get_fertility)
    # Generate comparison table
    html = '''
    <div class="comparison-container">
        <table class="comparison-table">
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Tokenizer</th>
                    <th>Type</th>
                    <th>Tokens</th>
                    <th>Fertility ↓</th>
                    <th>Compression ↑</th>
                    <th>STRR ↑</th>
                    <th>OOV %</th>
                </tr>
            </thead>
            <tbody>
    '''

    for i, result in enumerate(results):
        rank = i + 1
        rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''

        if 'error' in result:
            html += f'''
            <tr class="{rank_class}">
                <td>#{rank}</td>
                <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td colspan="5" class="error">Error: {result['error']}</td>
            </tr>
            '''
        else:
            m = result['metrics']
            fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
            html += f'''
            <tr class="{rank_class}">
                <td><strong>#{rank}</strong></td>
                <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td>{m.total_tokens}</td>
                <td class="{fertility_class}">{m.fertility:.3f}</td>
                <td>{m.compression_ratio:.2f}</td>
                <td>{m.single_token_retention_rate:.1%}</td>
                <td>{m.oov_percentage:.1f}%</td>
            </tr>
            '''

    html += '''
            </tbody>
        </table>
    </div>
    '''
    return html
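

# Minimal usage sketch (illustrative, not part of the app's entry point). It
# assumes tokenizer_manager already has at least one tokenizer registered; the
# sample text and the printed fields are our own choices.
if __name__ == "__main__":
    sample_text = "مرحبا بالعالم"
    available = tokenizer_manager.get_available_tokenizers()
    if available:
        model_id, info = next(iter(available.items()))
        metrics = analyze_tokenization(sample_text, model_id, info)
        print(f"{info.name}: {metrics.total_tokens} tokens, "
              f"fertility={metrics.fertility:.2f}, "
              f"STRR={metrics.single_token_retention_rate:.1%}")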