Spaces:

HeshamHaroon
/

Arabic_Tokenizer

Running

Arabic_Tokenizer / utils.py

Refactor: modularize codebase into separate modules

f32d4c7 18 days ago

1.58 kB

	"""
	Arabic Text Utilities
	=====================
	Helper functions for Arabic text analysis
	"""

	import re
	from typing import List


	def is_arabic_char(char: str) -> bool:
	    """Check whether a single character lies in an Arabic Unicode block.

	    Args:
	        char: The string to test. Only strings of length 1 can match.

	    Returns:
	        True if ``char`` is exactly one code point inside one of the
	        Arabic blocks listed below, otherwise False (including for the
	        empty string and for multi-character strings).
	    """
	    if len(char) != 1:
	        return False
	    code = ord(char)
	    return (
	        (0x0600 <= code <= 0x06FF)     # Arabic
	        or (0x0750 <= code <= 0x077F)  # Arabic Supplement
	        or (0x0870 <= code <= 0x089F)  # Arabic Extended-B
	        or (0x08A0 <= code <= 0x08FF)  # Arabic Extended-A
	        or (0xFB50 <= code <= 0xFDFF)  # Arabic Presentation Forms-A
	        # Arabic Presentation Forms-B. The block nominally ends at U+FEFF,
	        # but U+FEFF is the byte-order mark / zero width no-break space,
	        # not an Arabic character, so the range stops just before it.
	        or (0xFE70 <= code <= 0xFEFE)
	    )


	def count_arabic_chars(text: str) -> int:
	    """Return how many characters of ``text`` satisfy ``is_arabic_char``."""
	    total = 0
	    for ch in text:
	        if is_arabic_char(ch):
	            total += 1
	    return total


	def has_diacritics(text: str) -> bool:
	    """Report whether ``text`` contains any Arabic diacritic (tashkeel).

	    Only the marks listed in the literal below are recognised.
	    """
	    tashkeel_marks = set('ًٌٍَُِّْٰ')
	    # The text has a diacritic exactly when it shares a character with the set.
	    return not tashkeel_marks.isdisjoint(text)


	def normalize_arabic(text: str) -> str:
	    """Apply basic Arabic orthographic normalization to ``text``.

	    Three substitutions are applied in order: alef variants collapse to
	    bare alef, alef maksura becomes yeh, and teh marbuta becomes heh.
	    """
	    substitutions = (
	        ('[إأآا]', 'ا'),  # alef variants -> bare alef
	        ('ى', 'ي'),       # alef maksura -> yeh
	        ('ة', 'ه'),       # teh marbuta -> heh
	    )
	    for pattern, replacement in substitutions:
	        text = re.sub(pattern, replacement, text)
	    return text


	def get_arabic_words(text: str) -> List[str]:
	    """Return the whitespace-separated tokens of ``text`` that contain
	    at least one character accepted by ``is_arabic_char``.

	    Tokens are returned unchanged and in their original order.
	    """
	    arabic_words = []
	    for token in text.split():
	        for ch in token:
	            if is_arabic_char(ch):
	                arabic_words.append(token)
	                break  # one Arabic character is enough to keep the token
	    return arabic_words


	def remove_diacritics(text: str) -> str:
	    """Return ``text`` with Arabic diacritics (tashkeel) stripped out.

	    Only the marks listed in the literal below are removed; every other
	    character is kept in its original order.
	    """
	    tashkeel_marks = 'ًٌٍَُِّْٰ'
	    kept = []
	    for ch in text:
	        if ch in tashkeel_marks:
	            continue
	        kept.append(ch)
	    return ''.join(kept)