Spaces:

bettystr
/

NerRoB-czech

Running

App Files Files Community

AlzbetaStrompova committed on May 10, 2024

Commit

1709ba8

1 Parent(s): 081d311

change layout

Browse files

Files changed (2) hide show

app.py +30 -7
website_script.py +6 -3

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import gradio as gr
-from website_script import load, run
-print("Loading model")
-tokenizer, model, gazetteers_for_matching = load()
-print("Loaded model")
 examples = [
     "Masarykova univerzita se nachází v Brně .",
@@ -12,22 +12,45 @@ examples = [
     "Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
 ]
 def ner(text):
     """Run named-entity recognition on ``text`` for the Gradio interface.

     Passes the module-level tokenizer, model and gazetteers to ``run`` and
     returns a dict holding the input under ``"text"`` and the value returned
     by ``run`` under ``"entities"``.
     """
     result = run(tokenizer, model, gazetteers_for_matching, text)
     return {"text": text, "entities": result}
 with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
 # with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Interface(ner,
-        gr.Textbox(lines=5, placeholder="Enter sentence here..."),
         gr.HighlightedText(show_legend=True, color_map={"PER": "red", "ORG": "green", "LOC": "blue"}),
         examples=examples,
         title="NerROB-czech",
         description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
         allow_flagging="never")
-    gr.Interface(ner,
-        gr.File(label="Upload a JSON file"),
         None,
         allow_flagging="never",
         description="Here you can upload your own gazetteers.",

+import json
 import gradio as gr
+from website_script import load, run, gaz
+tokenizer, model = load()
+gazetteers_for_matching = gaz()
 examples = [
     "Masarykova univerzita se nachází v Brně .",
     "Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
 ]
+def add_gazetteers(new_gazetteers):
+    global gazetteers_for_matching
+    for key, value_lst in new_gazetteers.items():
+        key = key.upper()
+        for dictionary in gazetteers_for_matching:
+            if key in dictionary.values():
+                for value in value_lst:
+                    dictionary[value] = key
 def ner(text):
+    for d in gazetteers_for_matching:
+        print(len(d))
     result = run(tokenizer, model, gazetteers_for_matching, text)
     return {"text": text, "entities": result}
+def load_gazetters(file_names):
+    print(file_names)
+    # Assuming you have a JSON file named 'data.json'
+    for file_name in file_names:
+        with open(file_name, 'r') as file:
+            data = json.load(file)
+            gazetteers_for_matching = add_gazetteers(data)
 with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
 # with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Interface(ner,
+        gr.Textbox(lines=10, placeholder="Enter sentence here..."),
         gr.HighlightedText(show_legend=True, color_map={"PER": "red", "ORG": "green", "LOC": "blue"}),
         examples=examples,
         title="NerROB-czech",
         description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
         allow_flagging="never")
+    gr.Interface(load_gazetters,
+        gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
         None,
         allow_flagging="never",
         description="Here you can upload your own gazetteers.",

website_script.py CHANGED Viewed

@@ -9,18 +9,21 @@ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
 def load():
     """Load the tokenizer, the NER model and the gazetteer lookups.

     Returns the tokenizer, the model and a list holding one
     ``build_reverse_dictionary`` result per key of the loaded gazetteers.
     """
     model_name = "ufal/robeczech-base"
     model_path = "bettystr/NerRoB-czech"
-    gazetteers_path = "gazz2.json"
     model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval()
     gazetteers_for_matching = load_gazetteers(gazetteers_path)
     # Build one reverse dictionary per gazetteer key, in key order.
     temp = []
     for i in gazetteers_for_matching.keys():
         temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
     gazetteers_for_matching = temp
-    return tokenizer, model, gazetteers_for_matching
 def run(tokenizer, model, gazetteers_for_matching, text):

 def load():
     model_name = "ufal/robeczech-base"
     model_path = "bettystr/NerRoB-czech"
     model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval()
+    return tokenizer, model
+def gaz():
+    gazetteers_path = "gazz2.json"
     gazetteers_for_matching = load_gazetteers(gazetteers_path)
     temp = []
     for i in gazetteers_for_matching.keys():
         temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
     gazetteers_for_matching = temp
+    return gazetteers_for_matching
 def run(tokenizer, model, gazetteers_for_matching, text):