omnyahe committed on
Commit
669d3eb
·
verified ·
1 Parent(s): d84a4c9

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +9 -11
preprocess.py CHANGED
@@ -1,29 +1,22 @@
1
  # -*- coding: utf-8 -*-
2
- """preprocess.ipynb
3
 
4
  Automatically generated by Colab.
5
 
6
  Original file is located at
7
- https://colab.research.google.com/drive/12HQFzJI6QIxkTbfwaetk2m3FtCpQabb_
8
  """
9
 
10
- from fastapi import FastAPI, HTTPException
11
- from pydantic import BaseModel
12
- from transformers import pipeline
13
- import torch
14
- import uvicorn
15
  import re
16
  import nltk
17
- import pandas as pd
18
  import string
19
- from camel_tools.utils.normalize import normalize_alef_ar
20
  import torch
21
  import string
22
  import arabic_reshaper
 
23
  from nltk.corpus import stopwords
24
  from nltk.tokenize import word_tokenize
25
  from ruqiya.ruqiya import remove_diacritics
26
- from camel_tools.utils.normalize import normalize_teh_marbuta_ar
27
 
28
  def remove_numbers(text):
29
  return re.sub(r"\d+", "", text)
@@ -33,7 +26,12 @@ def remove_punctuation(text):
33
  all_punctuations = string.punctuation + arabic_punctuations
34
  translator = str.maketrans('', '', all_punctuations)
35
  return text.translate(translator)
36
-
 
 
 
 
 
37
  def remove_special_characters(text):
38
  text = re.sub(r'[^\w\s]', '', text)
39
  # Remove diacritics using ruqiya
 
1
  # -*- coding: utf-8 -*-
2
+ """Untitled158.ipynb
3
 
4
  Automatically generated by Colab.
5
 
6
  Original file is located at
7
+ https://colab.research.google.com/drive/1tx_ZtCIK17sRvLG5txaco8tL4o1q-qzF
8
  """
9
 
 
 
 
 
 
10
  import re
11
  import nltk
 
12
  import string
 
13
  import torch
14
  import string
15
  import arabic_reshaper
16
+ import pandas as pd
17
  from nltk.corpus import stopwords
18
  from nltk.tokenize import word_tokenize
19
  from ruqiya.ruqiya import remove_diacritics
 
20
 
21
  def remove_numbers(text):
22
  return re.sub(r"\d+", "", text)
 
26
  all_punctuations = string.punctuation + arabic_punctuations
27
  translator = str.maketrans('', '', all_punctuations)
28
  return text.translate(translator)
29
def normalize_alef_ar(text):
    """Normalize Arabic alef variants to plain alef (U+0627).

    The variants replaced are alef with hamza below (U+0625), alef with
    hamza above (U+0623), alef with madda above (U+0622) and alef wasla
    (U+0671).

    Args:
        text: String to normalize.

    Returns:
        A new string in which each alef variant is replaced by plain alef;
        all other characters are left untouched.
    """
    # Plain alef is intentionally not in the character class: replacing it
    # with itself is a no-op. Alef wasla (U+0671) is included so that it does
    # not survive normalization as a distinct letter.
    # Code points are spelled as escapes to avoid look-alike/bidi mix-ups.
    return re.sub(r"[\u0625\u0623\u0622\u0671]", "\u0627", text)
32
def normalize_teh_marbuta_ar(text):
    """Return ``text`` with every teh marbuta replaced by heh.

    Args:
        text: String to normalize.

    Returns:
        A new string in which each "ة" has been substituted with "ه".
    """
    teh_marbuta = re.compile(r"ة")
    return teh_marbuta.sub("ه", text)
35
  def remove_special_characters(text):
36
  text = re.sub(r'[^\w\s]', '', text)
37
  # Remove diacritics using ruqiya