Yoad committed on
Commit
c962cbf
·
1 Parent(s): 900d7d4

Improve Hebrew normalizer

Browse files
Files changed (1) hide show
  1. src/visual_eval/evaluator.py +5 -1
src/visual_eval/evaluator.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
 
3
  from hebrew import Hebrew
4
  from jiwer import process_words
@@ -23,7 +24,9 @@ class HebrewTextNormalizer(BasicTextNormalizer):
23
  {ord(c): None for c in superfluous_chars_to_remove}
24
  )
25
 
26
- self.quotes_translator = str.maketrans({ord(c): None for c in "\"'"})
 
 
27
 
28
  def __remove_niqqud(self, text: str) -> str:
29
  return Hebrew(text).no_niqqud().string
@@ -32,6 +35,7 @@ class HebrewTextNormalizer(BasicTextNormalizer):
32
  return text.translate(self.superfluous_hebrew_unicode_symbols_translator)
33
 
34
  def __remove_quotes(self, text: str) -> str:
 
35
  return text.translate(self.quotes_translator)
36
 
37
  def __call__(self, text):
 
1
  from dataclasses import dataclass
2
+ import re
3
 
4
  from hebrew import Hebrew
5
  from jiwer import process_words
 
24
  {ord(c): None for c in superfluous_chars_to_remove}
25
  )
26
 
27
+ quotes_str = "\"'״׳"
28
+ self.quotes_translator = str.maketrans({ord(c): None for c in quotes_str})
29
+ self.pre_quote_hyphen_removal_pattern = re.compile(f"-[{quotes_str}]")
30
 
31
  def __remove_niqqud(self, text: str) -> str:
32
  return Hebrew(text).no_niqqud().string
 
35
  return text.translate(self.superfluous_hebrew_unicode_symbols_translator)
36
 
37
  def __remove_quotes(self, text: str) -> str:
38
+ text = self.pre_quote_hyphen_removal_pattern.sub("", text)
39
  return text.translate(self.quotes_translator)
40
 
41
  def __call__(self, text):