Spaces:
Running
Running
| """ | |
| Arabic Text Utilities | |
| ===================== | |
| Helper functions for Arabic text analysis | |
| """ | |
| import re | |
| from typing import List | |
def is_arabic_char(char: str) -> bool:
    """Return True if *char* is one character from an Arabic Unicode block.

    The blocks checked are Arabic, Arabic Supplement, Arabic Extended-A and
    Arabic Presentation Forms-A/B.

    Args:
        char: The string to test. Anything whose length is not exactly 1
            (including the empty string) is reported as not Arabic.

    Returns:
        True when the single code point lies in one of the ranges above,
        False otherwise.
    """
    if len(char) != 1:
        return False
    code = ord(char)
    # U+FEFF is the last code point of Presentation Forms-B, but it is the
    # byte-order mark / zero width no-break space rather than an Arabic
    # letter; counting it would make BOM-prefixed text look Arabic.
    if code == 0xFEFF:
        return False
    return (
        0x0600 <= code <= 0x06FF      # Arabic
        or 0x0750 <= code <= 0x077F   # Arabic Supplement
        or 0x08A0 <= code <= 0x08FF   # Arabic Extended-A
        or 0xFB50 <= code <= 0xFDFF   # Arabic Presentation Forms-A
        or 0xFE70 <= code <= 0xFEFE   # Arabic Presentation Forms-B (sans BOM)
    )
def count_arabic_chars(text: str) -> int:
    """Return how many characters of *text* ``is_arabic_char`` accepts.

    Args:
        text: The string to scan, one character at a time.

    Returns:
        The number of characters for which ``is_arabic_char`` is true;
        0 for an empty string.
    """
    total = 0
    for ch in text:
        if is_arabic_char(ch):
            total += 1
    return total
def has_diacritics(text: str) -> bool:
    """Return True if *text* contains at least one Arabic diacritic (tashkeel).

    Args:
        text: The string to inspect.

    Returns:
        True when any character is one of U+064B..U+0652 (fathatan through
        sukun) or U+0670 (superscript alef); False otherwise, including for
        an empty string.
    """
    # Spelled as escapes so the set cannot be corrupted if the source file is
    # ever re-encoded; combining marks are also hard to read as raw literals.
    diacritics = frozenset(
        '\u064B\u064C\u064D'  # fathatan, dammatan, kasratan
        '\u064E\u064F\u0650'  # fatha, damma, kasra
        '\u0651\u0652'        # shadda, sukun
        '\u0670'              # superscript alef
    )
    return any(c in diacritics for c in text)
def normalize_arabic(text: str) -> str:
    """Apply basic Arabic orthographic normalization to *text*.

    Three substitutions are made, each character-for-character:

    * alef with hamza below/above and alef with madda -> bare alef
    * alef maksura -> yeh
    * teh marbuta -> heh

    Args:
        text: The string to normalize.

    Returns:
        A new string with the substitutions applied; every other character
        is left unchanged.
    """
    # Code points are written numerically so the mapping cannot be corrupted
    # by a re-encoding of the source file. No target is itself a source, so
    # a single translate pass equals applying the rules one after another.
    table = {
        0x0625: 0x0627,  # alef with hamza below -> alef
        0x0623: 0x0627,  # alef with hamza above -> alef
        0x0622: 0x0627,  # alef with madda above -> alef
        0x0649: 0x064A,  # alef maksura -> yeh
        0x0629: 0x0647,  # teh marbuta -> heh
    }
    return text.translate(table)
def get_arabic_words(text: str) -> List[str]:
    """Return the whitespace-separated tokens of *text* that contain Arabic.

    Args:
        text: The string to split on whitespace.

    Returns:
        The tokens, in their original order, that have at least one
        character accepted by ``is_arabic_char``. Tokens are returned
        whole, including any non-Arabic characters they carry.
    """
    arabic_words = []
    for token in text.split():
        for ch in token:
            if is_arabic_char(ch):
                # One Arabic character is enough to keep the whole token.
                arabic_words.append(token)
                break
    return arabic_words
def remove_diacritics(text: str) -> str:
    """Return *text* with Arabic diacritics (tashkeel) removed.

    Args:
        text: The string to strip.

    Returns:
        A new string without the characters U+064B..U+0652 (fathatan through
        sukun) and U+0670 (superscript alef); all other characters are kept
        in their original order.
    """
    # Spelled as escapes so the set cannot be corrupted if the source file is
    # ever re-encoded; combining marks are also hard to read as raw literals.
    diacritics = frozenset(
        '\u064B\u064C\u064D'  # fathatan, dammatan, kasratan
        '\u064E\u064F\u0650'  # fatha, damma, kasra
        '\u0651\u0652'        # shadda, sukun
        '\u0670'              # superscript alef
    )
    return ''.join(c for c in text if c not in diacritics)