Spaces:
Running
Running
Yoad
committed on
Commit
·
c962cbf
1
Parent(s):
900d7d4
Improve hebrew normalizer
Browse files
src/visual_eval/evaluator.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from dataclasses import dataclass
|
|
|
2 |
|
3 |
from hebrew import Hebrew
|
4 |
from jiwer import process_words
|
@@ -23,7 +24,9 @@ class HebrewTextNormalizer(BasicTextNormalizer):
|
|
23 |
{ord(c): None for c in superfluous_chars_to_remove}
|
24 |
)
|
25 |
|
26 |
-
|
|
|
|
|
27 |
|
28 |
def __remove_niqqud(self, text: str) -> str:
|
29 |
return Hebrew(text).no_niqqud().string
|
@@ -32,6 +35,7 @@ class HebrewTextNormalizer(BasicTextNormalizer):
|
|
32 |
return text.translate(self.superfluous_hebrew_unicode_symbols_translator)
|
33 |
|
34 |
def __remove_quotes(self, text: str) -> str:
|
|
|
35 |
return text.translate(self.quotes_translator)
|
36 |
|
37 |
def __call__(self, text):
|
|
|
1 |
from dataclasses import dataclass
|
2 |
+
import re
|
3 |
|
4 |
from hebrew import Hebrew
|
5 |
from jiwer import process_words
|
|
|
24 |
{ord(c): None for c in superfluous_chars_to_remove}
|
25 |
)
|
26 |
|
27 |
+
quotes_str = "\"'״׳"
|
28 |
+
self.quotes_translator = str.maketrans({ord(c): None for c in quotes_str})
|
29 |
+
self.pre_quote_hyphen_removal_pattern = re.compile(f"-[{quotes_str}]")
|
30 |
|
31 |
def __remove_niqqud(self, text: str) -> str:
|
32 |
return Hebrew(text).no_niqqud().string
|
|
|
35 |
return text.translate(self.superfluous_hebrew_unicode_symbols_translator)
|
36 |
|
37 |
def __remove_quotes(self, text: str) -> str:
|
38 |
+
text = self.pre_quote_hyphen_removal_pattern.sub("", text)
|
39 |
return text.translate(self.quotes_translator)
|
40 |
|
41 |
def __call__(self, text):
|