Spaces:

omnyahe
/

Aiiapp

Running

omnyahe committed 8 days ago

Commit

669d3eb

verified ·

1 Parent(s): d84a4c9

Update preprocess.py

Files changed (1) hide show

preprocess.py CHANGED Viewed

@@ -1,29 +1,22 @@
 # -*- coding: utf-8 -*-
-"""preprocess.ipynb
 Automatically generated by Colab.
 Original file is located at
-    https://colab.research.google.com/drive/12HQFzJI6QIxkTbfwaetk2m3FtCpQabb_
 """
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from transformers import pipeline
-import torch
-import uvicorn
 import re
 import nltk
-import pandas as pd
 import string
-from camel_tools.utils.normalize import normalize_alef_ar
 import torch
 import string
 import arabic_reshaper
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from ruqiya.ruqiya import remove_diacritics
-from camel_tools.utils.normalize import normalize_teh_marbuta_ar
 def remove_numbers(text):
     return re.sub(r"\d+", "", text)
@@ -33,7 +26,12 @@ def remove_punctuation(text):
     all_punctuations = string.punctuation + arabic_punctuations
     translator = str.maketrans('', '', all_punctuations)
     return text.translate(translator)
 def remove_special_characters(text):
     text = re.sub(r'[^\w\s]', '', text)
     # Remove diacritics using ruqiya

 # -*- coding: utf-8 -*-
+"""Untitled158.ipynb
 Automatically generated by Colab.
 Original file is located at
+    https://colab.research.google.com/drive/1tx_ZtCIK17sRvLG5txaco8tL4o1q-qzF
 """
 import re
 import nltk
 import string
 import torch
 import string
 import arabic_reshaper
+import pandas as pd
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from ruqiya.ruqiya import remove_diacritics
 def remove_numbers(text):
     """Strip all digit sequences out of ``text`` and return the result."""
     stripped = re.compile(r"\d+").sub("", text)
     return stripped
     all_punctuations = string.punctuation + arabic_punctuations
     translator = str.maketrans('', '', all_punctuations)
     return text.translate(translator)
def normalize_alef_ar(text):
    """Normalize Alef variants in ``text`` to bare Alef (ا).

    Local stand-in for ``camel_tools.utils.normalize.normalize_alef_ar``.
    The variants replaced are Alef with Hamza below (إ, U+0625), Alef with
    Hamza above (أ, U+0623), Alef with Madda (آ, U+0622) and Alef Wasla
    (ٱ, U+0671).

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with each Alef variant replaced by U+0627.
    """
    # Bare Alef itself is left out of the class: replacing it with itself
    # is a no-op. Alef Wasla is included for parity with camel_tools.
    return re.sub("[\u0625\u0623\u0622\u0671]", "\u0627", text)
def normalize_teh_marbuta_ar(text):
    """Normalize Teh Marbuta (ة) in ``text`` to Heh (ه).

    Local stand-in for
    ``camel_tools.utils.normalize.normalize_teh_marbuta_ar``.

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with every Teh Marbuta replaced by Heh.
    """
    # A single literal character needs no regex; str.replace does the
    # same substitution directly.
    return text.replace("ة", "ه")
 def remove_special_characters(text):
     text = re.sub(r'[^\w\s]', '', text)
     # Remove diacritics using ruqiya