Spaces:
Sleeping
Sleeping
AlzbetaStrompova
committed on
Commit
•
1709ba8
1
Parent(s):
081d311
change layout
Browse files- app.py +30 -7
- website_script.py +6 -3
app.py
CHANGED
@@ -1,9 +1,9 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from website_script import load, run
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
print("Loaded model")
|
7 |
|
8 |
examples = [
|
9 |
"Masarykova univerzita se nachází v Brně .",
|
@@ -12,22 +12,45 @@ examples = [
|
|
12 |
"Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
|
13 |
]
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def ner(text):
    """Run named-entity recognition on ``text``.

    Returns:
        A dict holding the input under ``"text"`` and whatever ``run``
        returns for it under ``"entities"``.
    """
    entities = run(tokenizer, model, gazetteers_for_matching, text)
    return {"text": text, "entities": entities}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
|
19 |
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
20 |
|
21 |
gr.Interface(ner,
|
22 |
-
gr.Textbox(lines=
|
23 |
gr.HighlightedText(show_legend=True, color_map={"PER": "red", "ORG": "green", "LOC": "blue"}),
|
24 |
examples=examples,
|
25 |
title="NerROB-czech",
|
26 |
description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
|
27 |
allow_flagging="never")
|
28 |
|
29 |
-
gr.Interface(
|
30 |
-
gr.File(label="Upload a JSON file"),
|
31 |
None,
|
32 |
allow_flagging="never",
|
33 |
description="Here you can upload your own gazetteers.",
|
|
|
1 |
+
import json
|
2 |
import gradio as gr
|
3 |
+
from website_script import load, run, gaz
|
4 |
|
5 |
+
tokenizer, model = load()
|
6 |
+
gazetteers_for_matching = gaz()
|
|
|
7 |
|
8 |
examples = [
|
9 |
"Masarykova univerzita se nachází v Brně .",
|
|
|
12 |
"Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
|
13 |
]
|
14 |
|
15 |
+
def add_gazetteers(new_gazetteers):
    """Merge extra entries into the module-level ``gazetteers_for_matching``.

    Every key of ``new_gazetteers`` is upper-cased and used as a label. Each
    dictionary in ``gazetteers_for_matching`` that already maps at least one
    entry to that label receives the new entries (mapped to the label).
    Labels that no existing dictionary contains are ignored.

    Args:
        new_gazetteers: Mapping of label -> iterable of entries. A bare
            string value is treated as one entry instead of being iterated
            character by character.

    Returns:
        None. The dictionaries are updated in place.
    """
    for label, entries in new_gazetteers.items():
        label = label.upper()
        if isinstance(entries, str):
            # {"PER": "Karel"} must add "Karel", not "K", "a", "r", ...
            entries = [entries]
        for dictionary in gazetteers_for_matching:
            # Only extend dictionaries that already hold this label.
            if label in dictionary.values():
                dictionary.update(dict.fromkeys(entries, label))
|
23 |
+
|
24 |
def ner(text):
    """Run named-entity recognition on ``text``.

    Prints the size of every gazetteer dictionary before calling ``run``.

    Returns:
        A dict holding the input under ``"text"`` and whatever ``run``
        returns for it under ``"entities"``.
    """
    # Debug output: one line per gazetteer dictionary with its entry count.
    for size in map(len, gazetteers_for_matching):
        print(size)
    entities = run(tokenizer, model, gazetteers_for_matching, text)
    return {"text": text, "entities": entities}
|
29 |
+
|
30 |
+
def load_gazetters(file_names):
    """Read uploaded JSON gazetteer files and merge them via ``add_gazetteers``.

    Args:
        file_names: The uploaded files: an iterable of file paths, or of
            file objects that expose their path as ``.name``. ``None`` or
            an empty list (nothing uploaded) is a no-op.

    Returns:
        None. ``add_gazetteers`` is called once per file with its parsed
        content.
    """
    print(file_names)
    if not file_names:
        return
    for uploaded in file_names:
        # NOTE(review): depending on the Gradio version the File component
        # hands over plain paths or temp-file wrappers; accept both.
        path = getattr(uploaded, "name", uploaded)
        # Decode explicitly as UTF-8 so non-ASCII entries do not depend on
        # the platform's default locale encoding.
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
        # add_gazetteers returns nothing; its result must not be stored
        # under the gazetteers' name.
        add_gazetteers(data)
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
|
42 |
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
43 |
|
44 |
gr.Interface(ner,
|
45 |
+
gr.Textbox(lines=10, placeholder="Enter sentence here..."),
|
46 |
gr.HighlightedText(show_legend=True, color_map={"PER": "red", "ORG": "green", "LOC": "blue"}),
|
47 |
examples=examples,
|
48 |
title="NerROB-czech",
|
49 |
description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
|
50 |
allow_flagging="never")
|
51 |
|
52 |
+
gr.Interface(load_gazetters,
|
53 |
+
gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
|
54 |
None,
|
55 |
allow_flagging="never",
|
56 |
description="Here you can upload your own gazetteers.",
|
website_script.py
CHANGED
@@ -9,18 +9,21 @@ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
|
|
9 |
def load():
|
10 |
model_name = "ufal/robeczech-base"
|
11 |
model_path = "bettystr/NerRoB-czech"
|
12 |
-
gazetteers_path = "gazz2.json"
|
13 |
|
14 |
model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
17 |
gazetteers_for_matching = load_gazetteers(gazetteers_path)
|
18 |
temp = []
|
19 |
for i in gazetteers_for_matching.keys():
|
20 |
temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
|
21 |
gazetteers_for_matching = temp
|
22 |
-
return
|
23 |
-
|
24 |
|
25 |
def run(tokenizer, model, gazetteers_for_matching, text):
|
26 |
|
|
|
9 |
def load():
    """Load the NER model and its tokenizer.

    The model is fetched from ``bettystr/NerRoB-czech``, moved to the CPU
    and put into evaluation mode; the tokenizer comes from
    ``ufal/robeczech-base``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer_checkpoint = "ufal/robeczech-base"
    weights_checkpoint = "bettystr/NerRoB-czech"

    model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(
        weights_checkpoint
    ).to("cpu")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    model.eval()
    return tokenizer, model
|
17 |
+
|
18 |
+
def gaz():
    """Load ``gazz2.json`` and build one reverse dictionary per top-level key.

    Returns:
        A list with the result of ``build_reverse_dictionary`` for each
        single-key slice ``{key: value}`` of the loaded gazetteers, in the
        order of the loaded mapping's keys.
    """
    loaded = load_gazetteers("gazz2.json")
    return [
        build_reverse_dictionary({label: entries})
        for label, entries in loaded.items()
    ]
|
|
|
27 |
|
28 |
def run(tokenizer, model, gazetteers_for_matching, text):
|
29 |
|