Spaces:
Sleeping
Sleeping
File size: 1,774 Bytes
7e6964a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import re
from simplemma import lemmatize
def flatten(xss):
    """Concatenate the items of every iterable in *xss* into one list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for group in xss:
        flat.extend(group)
    return flat
# Shortest span from any opening bracket to the nearest closing bracket.
# Opener and closer kinds are not required to match, and "." does not
# cross a newline.
_BRACKETED_SPAN = re.compile(r'[\(\{\[].*?[\)\}\]]')


def remove_all_brackets(text):
    """Return *text* with every bracketed span deleted.

    A span starts at ``(``, ``{`` or ``[`` and ends at the first following
    ``)``, ``}`` or ``]`` on the same line. Surrounding whitespace is kept,
    and an opener without any closer after it is left in place.
    """
    return _BRACKETED_SPAN.sub('', text)
def lemmatizing(x):
    """Return the lemma of *x* using simplemma with language code ``"cs"``.

    An empty string is returned as-is without calling the lemmatizer.
    """
    return "" if x == "" else lemmatize(x, lang="cs")
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    Args:
        dictionary: Mapping whose values are iterables of hashable items.
        apply_lemmatizing: When truthy, the lemma of each value (as returned
            by ``lemmatizing``) is also mapped to the key whenever it differs
            from the value itself.

    Returns:
        A new dict. When the same value (or lemma) is reached more than
        once, the assignment made last wins.
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup
def split_gazetteers_for_single_token_match(gazetteers):
    """Break every gazetteer entry into its space-separated tokens.

    Args:
        gazetteers: Mapping from a label to an iterable of strings.

    Returns:
        A new dict with the same labels, each mapped to the set of tokens
        longer than two characters found in that label's entries. The input
        mapping is not modified.
    """
    return {
        label: {
            token
            for entry in entries
            for token in entry.split(" ")
            if len(token) > 2
        }
        for label, entries in gazetteers.items()
    }
def preprocess_gazetteers(gazetteers, config):
    """Normalise gazetteer entries according to the switches in *config*.

    The steps run in this fixed order, each only when its flag is truthy:

    1. ``split_person``: every space-separated part longer than two
       characters of each ``"PER"`` entry is added to the ``"PER"`` set.
       Skipped when there is no ``"PER"`` label.
    2. ``lemmatize``: entries of two characters or fewer are dropped and
       the lemma of each remaining entry is added next to it.
    3. ``remove_brackets``: bracketed spans and surrounding whitespace are
       stripped; results of two characters or fewer are dropped.
    4. ``remove_numeric``: entries for which ``str.isnumeric()`` is true
       are dropped.

    Finally, unless ``config["techniq_for_matching"]`` equals ``"single"``,
    the entries are split into individual tokens.

    Args:
        gazetteers: Dict mapping a label to a set of strings. It is modified
            in place by steps 1-4.
        config: Mapping providing the keys ``split_person``, ``lemmatize``,
            ``remove_brackets``, ``remove_numeric`` and
            ``techniq_for_matching``.

    Returns:
        The same dict object when the technique is ``"single"``; otherwise
        the new dict built by ``split_gazetteers_for_single_token_match``.

    Raises:
        KeyError: If *config* lacks one of the keys listed above.
    """
    # A gazetteer without person names must not abort preprocessing.
    if config["split_person"] and "PER" in gazetteers:
        persons = gazetteers["PER"]
        # The comprehension is fully built before update() mutates the set.
        persons.update(
            {part for name in persons for part in name.split(" ") if len(part) > 2}
        )

    if config["lemmatize"]:
        # Only values of existing keys are reassigned, which is safe while
        # iterating over items().
        for label, entries in gazetteers.items():
            expanded = set()
            for entry in entries:
                if len(entry) > 2:
                    expanded.add(entry)
                    expanded.add(lemmatizing(entry))
            gazetteers[label] = expanded

    if config["remove_brackets"]:
        for label, entries in gazetteers.items():
            # Clean each entry once, then filter on the cleaned length.
            cleaned = (remove_all_brackets(entry).strip() for entry in entries)
            gazetteers[label] = {text for text in cleaned if len(text) > 2}

    if config["remove_numeric"]:
        for label, entries in gazetteers.items():
            gazetteers[label] = {entry for entry in entries if not entry.isnumeric()}

    # NOTE(review): the token split runs for every technique EXCEPT "single",
    # although the helper is named "..._for_single_token_match" - confirm the
    # condition is not inverted before relying on it.
    if config["techniq_for_matching"] != "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)
    return gazetteers
|