MahtaFetrat committed
Commit e44c2ec · 1 Parent(s): 39a01b6

add assets

Files changed (2)
  1. assets/GE2PE.py +105 -0
  2. assets/Parsivar.zip +3 -0
assets/GE2PE.py ADDED
@@ -0,0 +1,105 @@
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
+ from Parsivar.normalizer import Normalizer
+
+
+ class GE2PE:
+
+     def __init__(self, model_path='./content/checkpoint-320', GPU=False, dictionary=None):
+         """
+         model_path: path to where the GE2PE transformer is saved.
+         GPU: whether to run generation on the GPU.
+         dictionary: optional mapping from words to self-defined pronunciations.
+         """
+         self.GPU = GPU
+         self.model = T5ForConditionalGeneration.from_pretrained(model_path)
+         if self.GPU:
+             self.model = self.model.cuda()
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+         self.dictionary = dictionary
+         self.norma = Normalizer(pinglish_conversion_needed=True)
+
+     def is_vowel(self, char):
+         return char in ['a', '/', 'i', 'e', 'u', 'o']
+
+     def rules(self, grapheme, phoneme):
+         # Force the short vowels the user marked explicitly with diacritics
+         # into the predicted pronunciation: fatha -> '/', kasra -> 'e',
+         # damma -> 'o'.
+         # Expand alef madda (آ) to hamza + alef so graphemes align with phonemes.
+         grapheme = grapheme.replace('آ', 'ءا')
+         words = grapheme.split(' ')
+         prons = phoneme.replace('1', '').split(' ')
+         if len(words) != len(prons):
+             # Word/pronunciation alignment failed; return the model output unchanged.
+             return phoneme
+         for i in range(len(words)):
+             if 'ِ' not in words[i] and 'ُ' not in words[i] and 'َ' not in words[i]:
+                 continue
+             for j in range(len(words[i])):
+                 if words[i][j] == 'َ':
+                     if j == len(words[i]) - 1 and prons[i][-1] != '/':
+                         prons[i] = prons[i] + '/'
+                     elif self.is_vowel(prons[i][j]):
+                         prons[i] = prons[i][:j] + '/' + prons[i][j+1:]
+                     else:
+                         prons[i] = prons[i][:j] + '/' + prons[i][j:]
+                 if words[i][j] == 'ِ':
+                     if j == len(words[i]) - 1 and prons[i][-1] != 'e':
+                         prons[i] = prons[i] + 'e'
+                     elif self.is_vowel(prons[i][j]):
+                         prons[i] = prons[i][:j] + 'e' + prons[i][j+1:]
+                     else:
+                         prons[i] = prons[i][:j] + 'e' + prons[i][j:]
+                 if words[i][j] == 'ُ':
+                     if j == len(words[i]) - 1 and prons[i][-1] != 'o':
+                         prons[i] = prons[i] + 'o'
+                     elif self.is_vowel(prons[i][j]):
+                         prons[i] = prons[i][:j] + 'o' + prons[i][j+1:]
+                     else:
+                         prons[i] = prons[i][:j] + 'o' + prons[i][j:]
+         return ' '.join(prons)
+
+     def lexicon(self, grapheme, phoneme):
+         # Replace the pronunciation of any word found in the user dictionary,
+         # re-attaching the trailing '1' (ezafe) marker predicted by the model.
+         words = grapheme.split(' ')
+         prons = phoneme.split(' ')
+         output = list(prons)  # copy, so the ezafe checks below still see the model output
+         for i in range(len(words)):
+             try:
+                 output[i] = self.dictionary[words[i]]
+                 if prons[i][-1] == '1' and output[i][-1] != 'e':
+                     output[i] = output[i] + 'e1'
+                 elif prons[i][-1] == '1' and output[i][-1] == 'e':
+                     output[i] = output[i] + 'ye1'
+             except (KeyError, IndexError, TypeError):
+                 # Word not in the dictionary, no dictionary given, or
+                 # word/pronunciation misalignment: keep the model output.
+                 pass
+         return ' '.join(output)
+
+     def generate(self, input_list, batch_size=10, use_rules=False, use_dict=False):
+         """
+         input_list: list of sentences to be phonemized.
+         batch_size: inference batch size.
+         use_rules: whether to apply the short-vowel diacritic rules.
+         use_dict: whether to apply the self-defined dictionary.
+         Returns the list of phonemized sentences.
+         """
+         output_list = []
+         # Normalize and unify the Arabic kaf with the Persian one.
+         input_list = [self.norma.normalize(text).replace('ك', 'ک') for text in input_list]
+         # Keep the diacritized text for rules(); the model itself is fed
+         # input with the short-vowel diacritics stripped.
+         diacritized = input_list
+         input_list = [text.replace('ِ', '').replace('ُ', '').replace('َ', '') for text in input_list]
+         for i in range(0, len(input_list), batch_size):
+             in_ids = self.tokenizer(input_list[i:i+batch_size], padding=True,
+                                     add_special_tokens=False,
+                                     return_attention_mask=True, return_tensors='pt')
+             if self.GPU:
+                 out_ids = self.model.generate(in_ids["input_ids"].cuda(),
+                                               attention_mask=in_ids["attention_mask"].cuda(),
+                                               num_beams=5, min_length=1,
+                                               max_length=512, early_stopping=True)
+             else:
+                 out_ids = self.model.generate(in_ids["input_ids"],
+                                               attention_mask=in_ids["attention_mask"],
+                                               num_beams=5, min_length=1,
+                                               max_length=512, early_stopping=True)
+             output_list += self.tokenizer.batch_decode(out_ids, skip_special_tokens=True)
+
+         if use_dict:
+             for i in range(len(input_list)):
+                 output_list[i] = self.lexicon(input_list[i], output_list[i])
+
+         if use_rules:
+             for i in range(len(input_list)):
+                 output_list[i] = self.rules(diacritized[i], output_list[i])
+
+         return [text.strip() for text in output_list]
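
For reference, a minimal usage sketch of the class added above (not part of the commit). The dictionary entry and the input sentence are illustrative only; the snippet assumes the checkpoint exists at the default path and that Parsivar (shipped as assets/Parsivar.zip) has been extracted next to GE2PE.py:

from GE2PE import GE2PE  # assuming assets/GE2PE.py is on the import path

# Hypothetical dictionary entry; values must be written in the model's
# own phoneme alphabet.
g2p = GE2PE(model_path='./content/checkpoint-320', GPU=False,
            dictionary={'تست': 'test'})

sentences = ['این یک تست است']
phonemes = g2p.generate(sentences, batch_size=10, use_rules=True, use_dict=True)
print(phonemes)

With use_rules=True, any fatha/kasra/damma diacritics typed in the input override the model's short vowels via rules(); with use_dict=True, words found in the dictionary have their pronunciations replaced via lexicon().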
assets/Parsivar.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe97e364feedd597a968a312876a3cf3eb55bb73f1bcb71658c616da3ed01226
+ size 49123011