# Arabic_Tokenizer / utils.py
# HeshamHaroon's picture
# Refactor: modularize codebase into separate modules
# f32d4c7
"""
Arabic Text Utilities
=====================
Helper functions for Arabic text analysis
"""
import re
from typing import List
def is_arabic_char(char: str) -> bool:
    """Tell whether ``char`` is one character from an Arabic Unicode block.

    Args:
        char: The candidate string. Anything whose length is not exactly
            one (including the empty string) is rejected.

    Returns:
        True if the code point of ``char`` lies in one of the Arabic
        ranges listed below, otherwise False.
    """
    if len(char) == 1:
        point = ord(char)
        # Inclusive (first, last) code points of each accepted block.
        arabic_blocks = (
            (0x0600, 0x06FF),  # Arabic
            (0x0750, 0x077F),  # Arabic Supplement
            (0x08A0, 0x08FF),  # Arabic Extended-A
            (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
            (0xFE70, 0xFEFF),  # Arabic Presentation Forms-B
        )
        return any(first <= point <= last for first, last in arabic_blocks)
    return False
def count_arabic_chars(text: str) -> int:
    """Return how many characters of ``text`` are Arabic.

    Each character is classified with ``is_arabic_char``; every character
    it accepts adds one to the total.
    """
    total = 0
    for ch in text:
        if is_arabic_char(ch):
            total += 1
    return total
def has_diacritics(text: str) -> bool:
    """Check whether ``text`` contains any Arabic diacritic (tashkeel).

    Args:
        text: The string to inspect.

    Returns:
        True if at least one character of ``text`` is one of the nine
        marks listed below, otherwise False (including for empty input).
    """
    # Written as code-point escapes so the set cannot be corrupted by a
    # wrong source-file encoding (the previous literal was mojibake and
    # matched unrelated characters instead of the tashkeel marks).
    diacritics = frozenset(
        "\u064b"  # fathatan
        "\u064c"  # dammatan
        "\u064d"  # kasratan
        "\u064e"  # fatha
        "\u064f"  # damma
        "\u0650"  # kasra
        "\u0651"  # shadda
        "\u0652"  # sukun
        "\u0670"  # superscript (dagger) alef
    )
    return any(ch in diacritics for ch in text)
def normalize_arabic(text: str) -> str:
    """Apply basic Arabic orthographic normalization.

    Three folds are performed:

    * alef with madda / hamza above / hamza below -> bare alef
    * alef maksura -> yeh
    * teh marbuta -> heh

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with the characters above replaced; every other
        character is left as is.
    """
    # Code-point escapes keep the mapping immune to source-encoding damage
    # (the previous regex literals were mojibake and never matched Arabic).
    # No replacement target is itself a source, so one translate pass is
    # equivalent to applying the three substitutions in sequence.
    folds = {
        0x0622: 0x0627,  # alef with madda above -> alef
        0x0623: 0x0627,  # alef with hamza above -> alef
        0x0625: 0x0627,  # alef with hamza below -> alef
        0x0649: 0x064A,  # alef maksura -> yeh
        0x0629: 0x0647,  # teh marbuta -> heh
    }
    return text.translate(folds)
def get_arabic_words(text: str) -> List[str]:
    """Return the whitespace-separated tokens of ``text`` that contain Arabic.

    A token is kept when ``is_arabic_char`` accepts at least one of its
    characters; tokens are returned in their original order.
    """
    arabic_tokens = []
    for token in text.split():
        for ch in token:
            if is_arabic_char(ch):
                arabic_tokens.append(token)
                break  # one Arabic character is enough to keep the token
    return arabic_tokens
def remove_diacritics(text: str) -> str:
    """Strip Arabic diacritics (tashkeel) from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every occurrence of the nine marks listed below
        removed; all other characters are kept in order.
    """
    # Written as code-point escapes so the set cannot be corrupted by a
    # wrong source-file encoding (the previous literal was mojibake, so it
    # removed unrelated characters and left the real marks in place).
    diacritics = frozenset(
        "\u064b"  # fathatan
        "\u064c"  # dammatan
        "\u064d"  # kasratan
        "\u064e"  # fatha
        "\u064f"  # damma
        "\u0650"  # kasra
        "\u0651"  # shadda
        "\u0652"  # sukun
        "\u0670"  # superscript (dagger) alef
    )
    return "".join(ch for ch in text if ch not in diacritics)