Update preprocess.py
Browse files- preprocess.py +9 -11
preprocess.py
CHANGED
@@ -1,29 +1,22 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
"""
|
3 |
|
4 |
Automatically generated by Colab.
|
5 |
|
6 |
Original file is located at
|
7 |
-
https://colab.research.google.com/drive/
|
8 |
"""
|
9 |
|
10 |
-
from fastapi import FastAPI, HTTPException
|
11 |
-
from pydantic import BaseModel
|
12 |
-
from transformers import pipeline
|
13 |
-
import torch
|
14 |
-
import uvicorn
|
15 |
import re
|
16 |
import nltk
|
17 |
-
import pandas as pd
|
18 |
import string
|
19 |
-
from camel_tools.utils.normalize import normalize_alef_ar
|
20 |
import torch
|
21 |
import string
|
22 |
import arabic_reshaper
|
|
|
23 |
from nltk.corpus import stopwords
|
24 |
from nltk.tokenize import word_tokenize
|
25 |
from ruqiya.ruqiya import remove_diacritics
|
26 |
-
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
|
27 |
|
28 |
def remove_numbers(text):
    """Return *text* with every run of digit characters deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)
|
@@ -33,7 +26,12 @@ def remove_punctuation(text):
|
|
33 |
all_punctuations = string.punctuation + arabic_punctuations
|
34 |
translator = str.maketrans('', '', all_punctuations)
|
35 |
return text.translate(translator)
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
37 |
def remove_special_characters(text):
|
38 |
text = re.sub(r'[^\w\s]', '', text)
|
39 |
# Remove diacritics using ruqiya
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
"""Untitled158.ipynb
|
3 |
|
4 |
Automatically generated by Colab.
|
5 |
|
6 |
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1tx_ZtCIK17sRvLG5txaco8tL4o1q-qzF
|
8 |
"""
|
9 |
|
|
|
|
|
|
|
|
|
|
|
10 |
import re
|
11 |
import nltk
|
|
|
12 |
import string
|
|
|
13 |
import torch
|
14 |
import string
|
15 |
import arabic_reshaper
|
16 |
+
import pandas as pd
|
17 |
from nltk.corpus import stopwords
|
18 |
from nltk.tokenize import word_tokenize
|
19 |
from ruqiya.ruqiya import remove_diacritics
|
|
|
20 |
|
21 |
def remove_numbers(text):
    """Return *text* with every run of digit characters deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)
|
|
|
26 |
all_punctuations = string.punctuation + arabic_punctuations
|
27 |
translator = str.maketrans('', '', all_punctuations)
|
28 |
return text.translate(translator)
|
29 |
+
def normalize_alef_ar(text):
    """Replace Arabic alef variants in *text* with a bare alef.

    The variants handled are alef with hamza below (إ), alef with hamza
    above (أ), alef with madda (آ) and alef wasla (ٱ). All other characters
    are left unchanged.

    Args:
        text: The string to normalize.

    Returns:
        A new string with each alef variant replaced by "ا".
    """
    # Alef wasla (U+0671) is included so that Quranic-style spellings such as
    # "ٱلله" normalize as well. Bare alef is not listed in the class because
    # replacing it with itself would be a no-op.
    return re.sub(r"[إأآٱ]", "ا", text)
|
32 |
+
def normalize_teh_marbuta_ar(text):
    """Replace every teh marbuta (ة) in *text* with heh (ه).

    Args:
        text: The string to normalize.

    Returns:
        A new string with each "ة" replaced by "ه"; all other characters
        are left unchanged.
    """
    # A single literal character needs no regex; str.replace does the same
    # substitution directly.
    return text.replace("ة", "ه")
|
35 |
def remove_special_characters(text):
|
36 |
text = re.sub(r'[^\w\s]', '', text)
|
37 |
# Remove diacritics using ruqiya
|