| """ | |
| Tokenization Analysis | |
| ===================== | |
| Core analysis functions for evaluating tokenizers | |
| """ | |
| import time | |
| from typing import Tuple | |
| from config import TokenizerInfo, TokenizationMetrics | |
| from utils import count_arabic_chars, get_arabic_words, has_diacritics, is_arabic_char | |
| from tokenizer_manager import tokenizer_manager | |
| def analyze_tokenization( | |
| text: str, | |
| model_id: str, | |
| tokenizer_info: TokenizerInfo | |
| ) -> TokenizationMetrics: | |
| """Perform comprehensive tokenization analysis""" | |
| tokenizer = tokenizer_manager.get_tokenizer(model_id) | |
| # Time the tokenization | |
| start_time = time.perf_counter() | |
| tokens = tokenizer.tokenize(text) | |
| token_ids = tokenizer.encode(text, add_special_tokens=False) | |
| tokenization_time = (time.perf_counter() - start_time) * 1000 | |
| decoded = tokenizer.decode(token_ids, skip_special_tokens=True) | |
| # Basic counts | |
| words = text.split() | |
| total_words = len(words) | |
| total_tokens = len(tokens) | |
| total_characters = len(text) | |
| total_bytes = len(text.encode('utf-8')) | |
| # Efficiency metrics | |
| fertility = total_tokens / max(total_words, 1) | |
| compression_ratio = total_bytes / max(total_tokens, 1) | |
| char_per_token = total_characters / max(total_tokens, 1) | |
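    # Worked example (illustrative numbers, not from any particular tokenizer):
    # a 10-word sentence that encodes to 25 tokens and 80 UTF-8 bytes gives
    # fertility = 25 / 10 = 2.5 tokens per word and compression = 80 / 25 = 3.2
    # bytes per token; lower fertility and higher compression are better.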
    # OOV analysis
    unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]'
    oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t))
    oov_percentage = (oov_count / max(total_tokens, 1)) * 100
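    # Note: byte-level BPE vocabularies can encode any byte sequence, so they
    # rarely (if ever) emit an UNK token; the OOV metrics mainly differentiate
    # tokenizers whose vocabularies lack byte-level fallback.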
    # Single Token Retention Rate (STRR)
    single_token_words = 0
    subwords_per_word = []
    for word in words:
        word_tokens = tokenizer.tokenize(word)
        subwords_per_word.append(len(word_tokens))
        if len(word_tokens) == 1:
            single_token_words += 1
    strr = single_token_words / max(total_words, 1)
    avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1)
    max_subwords = max(subwords_per_word) if subwords_per_word else 0
    continued_ratio = (total_words - single_token_words) / max(total_words, 1)
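    # Worked example (illustrative): if 6 of 10 words survive as single tokens,
    # STRR = 6 / 10 = 0.6 and the continued-word ratio is 1 - 0.6 = 0.4.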
    # Arabic-specific metrics
    arabic_char_count = count_arabic_chars(text)
    arabic_words = get_arabic_words(text)
    arabic_tokens_count = 0
    for token in tokens:
        if any(is_arabic_char(c) for c in str(token)):
            arabic_tokens_count += 1
    arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0
    diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)
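    # Note: this is a presence-only check; it compares whether diacritics appear
    # at all in the original versus the round-tripped text, not whether each
    # diacritic survives in its original position.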
    return TokenizationMetrics(
        total_tokens=total_tokens,
        total_words=total_words,
        total_characters=total_characters,
        total_bytes=total_bytes,
        fertility=fertility,
        compression_ratio=compression_ratio,
        char_per_token=char_per_token,
        oov_count=oov_count,
        oov_percentage=oov_percentage,
        single_token_words=single_token_words,
        single_token_retention_rate=strr,
        avg_subwords_per_word=avg_subwords,
        max_subwords_per_word=max_subwords,
        continued_words_ratio=continued_ratio,
        arabic_char_count=arabic_char_count,
        arabic_token_count=arabic_tokens_count,
        arabic_fertility=arabic_fertility,
        diacritic_preservation=diacritic_preserved,
        tokenization_time_ms=tokenization_time,
        tokens=tokens,
        token_ids=token_ids,
        decoded_text=decoded
    )


def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
    """Analyze a single tokenizer - returns HTML outputs"""
    from ui_components import (
        generate_tokenizer_info_card,
        generate_metrics_card,
        generate_token_visualization,
        generate_decoded_section
    )

    if not text or not text.strip():
        return (
            '<div class="warning">⚠️ Please enter some text to analyze</div>',
            '', '', ''
        )

    if not tokenizer_choice:
        return (
            '<div class="warning">⚠️ Please select a tokenizer</div>',
            '', '', ''
        )

    model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
    tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)

    if not tokenizer_info:
        return (
            '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
            '', '', ''
        )

    try:
        metrics = analyze_tokenization(text, model_id, tokenizer_info)
        info_html = generate_tokenizer_info_card(tokenizer_info)
        metrics_html = generate_metrics_card(metrics, tokenizer_info)
        tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
        decoded_html = generate_decoded_section(metrics)
        return info_html, metrics_html, tokens_html, decoded_html
    except Exception as e:
        return (
            f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
            '', '', ''
        )


def compare_tokenizers(tokenizer_choices: list, text: str) -> str:
    """Compare multiple tokenizers - returns HTML table"""
    if not text or not text.strip():
        return '<div class="warning">⚠️ Please enter some text to analyze</div>'

    if not tokenizer_choices or len(tokenizer_choices) < 2:
        return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'

    results = []
    for choice in tokenizer_choices:
        model_id = tokenizer_manager.get_model_id_from_choice(choice)
        tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
        if tokenizer_info:
            try:
                metrics = analyze_tokenization(text, model_id, tokenizer_info)
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'metrics': metrics
                })
            except Exception as e:
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'error': str(e)
                })

    # Sort by fertility (lower is better); failed tokenizers sort last
    def get_fertility(x):
        if 'error' in x:
            return 999
        return x['metrics'].fertility

    results.sort(key=get_fertility)
    # Generate comparison table
    html = '''
    <div class="comparison-container">
        <table class="comparison-table">
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Tokenizer</th>
                    <th>Type</th>
                    <th>Tokens</th>
                    <th>Fertility ↓</th>
                    <th>Compression ↑</th>
                    <th>STRR ↑</th>
                    <th>OOV %</th>
                </tr>
            </thead>
            <tbody>
    '''

    for i, result in enumerate(results):
        rank = i + 1
        rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''

        if 'error' in result:
            html += f'''
            <tr class="{rank_class}">
                <td>#{rank}</td>
                <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td colspan="5" class="error">Error: {result['error']}</td>
            </tr>
            '''
        else:
            m = result['metrics']
            fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
            html += f'''
            <tr class="{rank_class}">
                <td><strong>#{rank}</strong></td>
                <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td>{m.total_tokens}</td>
                <td class="{fertility_class}">{m.fertility:.3f}</td>
                <td>{m.compression_ratio:.2f}</td>
                <td>{m.single_token_retention_rate:.1%}</td>
                <td>{m.oov_percentage:.1f}%</td>
            </tr>
            '''

    html += '''
            </tbody>
        </table>
    </div>
    '''
    return html
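

# Minimal usage sketch (illustrative, not part of the app's entry point). It
# assumes tokenizer_manager already has at least one tokenizer registered; the
# sample text and the printed fields are our own choices.
if __name__ == "__main__":
    sample_text = "مرحبا بالعالم"
    available = tokenizer_manager.get_available_tokenizers()
    if available:
        model_id, info = next(iter(available.items()))
        metrics = analyze_tokenization(sample_text, model_id, info)
        print(f"{info.name}: {metrics.total_tokens} tokens, "
              f"fertility={metrics.fertility:.2f}, "
              f"STRR={metrics.single_token_retention_rate:.1%}")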