Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -38,6 +38,40 @@ shortcut_map = {
|
|
38 |
"sxp": "saaxiib"
|
39 |
}
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
def number_to_words(number):
|
42 |
number = int(number)
|
43 |
if number < 20:
|
@@ -86,13 +120,19 @@ def normalize_text(text):
|
|
86 |
text = re.sub(r'(?i)(?<!\w)zamzam(?!\w)', 'samsam', text)
|
87 |
|
88 |
# ➤ Bedel shortcuts - eray kasta oo qoraalka ku jira beddel
|
89 |
-
# Ka dhig case-insensitive beddelka
|
90 |
def replace_shortcuts(match):
|
91 |
word = match.group(0).lower()
|
92 |
return shortcut_map.get(word, word)
|
93 |
pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in shortcut_map.keys()) + r')\b', re.IGNORECASE)
|
94 |
text = pattern.sub(replace_shortcuts, text)
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
# ➤ Ka saar tirooyin leh koma iyo tobanle
|
97 |
text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
|
98 |
text = re.sub(r'\.\d+', '', text)
|
|
|
38 |
"sxp": "saaxiib"
|
39 |
}
|
40 |
|
41 |
+
# Countries dictionary (English to Somali)
|
42 |
+
country_map = {
|
43 |
+
"somalia": "Soomaaliya",
|
44 |
+
"ethiopia": "Itoobiya",
|
45 |
+
"kenya": "Kenya",
|
46 |
+
"djibouti": "Jabuuti",
|
47 |
+
"sudan": "Suudaan",
|
48 |
+
"south sudan": "Koonfurta Suudaan",
|
49 |
+
"uganda": "Ugaandha",
|
50 |
+
"tanzania": "Tansaaniya",
|
51 |
+
"egypt": "Masar",
|
52 |
+
"libya": "Liibiya",
|
53 |
+
"algeria": "Aljeeriya",
|
54 |
+
"morocco": "Morooko",
|
55 |
+
"tunisia": "Tuniisiya",
|
56 |
+
"eritrea": "Eriteriya",
|
57 |
+
"malawi": "Malaawi",
|
58 |
+
"mozambique": "Mosambiik",
|
59 |
+
"zambia": "Sambiya",
|
60 |
+
"zimbabwe": "Simbabwe",
|
61 |
+
"niger": "Niyjer",
|
62 |
+
"nigeria": "Nayjeeriya",
|
63 |
+
"united states": "Maraykanka",
|
64 |
+
"china": "Shiinaha",
|
65 |
+
"india": "Hindiya",
|
66 |
+
"russia": "Ruushka",
|
67 |
+
"united kingdom": "Boqortooyada Midowday",
|
68 |
+
"germany": "Jarmalka",
|
69 |
+
"france": "Faransiiska",
|
70 |
+
"japan": "Jabaan",
|
71 |
+
"canada": "Kanada",
|
72 |
+
"australia": "Australia"
|
73 |
+
}
|
74 |
+
|
75 |
def number_to_words(number):
|
76 |
number = int(number)
|
77 |
if number < 20:
|
|
|
120 |
text = re.sub(r'(?i)(?<!\w)zamzam(?!\w)', 'samsam', text)
|
121 |
|
122 |
# ➤ Bedel shortcuts - eray kasta oo qoraalka ku jira beddel
|
|
|
123 |
def replace_shortcuts(match):
|
124 |
word = match.group(0).lower()
|
125 |
return shortcut_map.get(word, word)
|
126 |
pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in shortcut_map.keys()) + r')\b', re.IGNORECASE)
|
127 |
text = pattern.sub(replace_shortcuts, text)
|
128 |
|
129 |
+
# ➤ Bedel magacyada waddamada
|
130 |
+
def replace_countries(match):
|
131 |
+
word = match.group(0).lower()
|
132 |
+
return country_map.get(word, word)
|
133 |
+
country_pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in country_map.keys()) + r')\b', re.IGNORECASE)
|
134 |
+
text = country_pattern.sub(replace_countries, text)
|
135 |
+
|
136 |
# ➤ Ka saar tirooyin leh koma iyo tobanle
|
137 |
text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
|
138 |
text = re.sub(r'\.\d+', '', text)
|