Spaces:

harshildarji
/

Juristische-NER

Running

App Files Community

Harshil Darji committed 23 days ago

Commit

4a3eaed

1 Parent(s): 4cf33f6

update app

Browse files

Files changed (1) hide show

app.py +69 -60

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import os
 import re
 import string
@@ -12,7 +11,8 @@ from transformers import (
     pipeline,
 )
-st.set_page_config(page_title="German Legal NER", page_icon="⚖️", layout="wide")
 logging.set_verbosity(logging.ERROR)
 st.markdown(
@@ -24,9 +24,7 @@ st.markdown(
     padding-left: 3rem;
     padding-right: 3rem;
 }
 header, footer {visibility: hidden;}
 .entity {
     position: relative;
     display: inline-block;
@@ -34,7 +32,6 @@ header, footer {visibility: hidden;}
     font-weight: normal;
     cursor: help;
 }
 .entity .tooltip {
     visibility: hidden;
     background-color: #333;
@@ -52,12 +49,10 @@ header, footer {visibility: hidden;}
     transition: opacity 0.05s;
     font-size: 11px;
 }
 .entity:hover .tooltip {
     visibility: visible;
     opacity: 1;
 }
 .entity.marked {
     background-color: rgba(255, 230, 0, 0.4);
     line-height: 1.3;
@@ -69,39 +64,32 @@ header, footer {visibility: hidden;}
     unsafe_allow_html=True,
 )
-# Load model
-tkn = os.getenv("tkn")
-tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER", use_auth_token=tkn)
-model = AutoModelForTokenClassification.from_pretrained(
-    "harshildarji/JuraNER", use_auth_token=tkn
-)
-ner = pipeline("ner", model=model, tokenizer=tokenizer)
-# Entity labels
 entity_labels = {
-    "AN": "Lawyer",
-    "EUN": "European legal norm",
-    "GRT": "Court",
-    "GS": "Law",
     "INN": "Institution",
-    "LD": "Country",
-    "LDS": "Landscape",
-    "LIT": "Legal literature",
-    "MRK": "Brand",
-    "ORG": "Organization",
     "PER": "Person",
-    "RR": "Judge",
-    "RS": "Court decision",
-    "ST": "City",
-    "STR": "Street",
-    "UN": "Company",
-    "VO": "Ordinance",
-    "VS": "Regulation",
-    "VT": "Contract",
 }
-# Fixed colors
 def generate_fixed_colors(keys, alpha=0.25):
     cmap = cm.get_cmap("tab20", len(keys))
     rgba_colors = {}
@@ -112,16 +100,35 @@ def generate_fixed_colors(keys, alpha=0.25):
     return rgba_colors
-ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()), alpha=0.30)
-# UI
-st.markdown("#### German Legal NER")
-uploaded_file = st.file_uploader("Upload a .txt file", type="txt")
-threshold = st.slider("Confidence threshold:", 0.0, 1.0, 0.8, 0.01)
-st.markdown("---")
-# Merge logic
 def merge_entities(entities):
     if not entities:
         return []
@@ -135,10 +142,7 @@ def merge_entities(entities):
         prev = merged[-1]
         if ent["index"] == prev["index"] + 1:
             tok = ent["word"]
-            if tok.startswith("##"):
-                prev["word"] += tok[2:]
-            else:
-                prev["word"] += " " + tok
             prev["end"] = ent["end"]
             prev["index"] = ent["index"]
             prev["score_sum"] += ent["score"]
@@ -172,7 +176,7 @@ def merge_entities(entities):
     return final
-# HTML highlighting
 def highlight_entities(line, merged_entities, threshold):
     html = ""
     last_end = 0
@@ -200,24 +204,29 @@ def highlight_entities(line, merged_entities, threshold):
     return html
 if uploaded_file:
     raw_bytes = uploaded_file.read()
     encoding = detect(raw_bytes)["encoding"]
     if encoding is None:
-        st.error("Could not detect file encoding.")
     else:
         text = raw_bytes.decode(encoding)
-        with st.spinner("Processing..."):
-            for line in text.splitlines():
-                if not line.strip():
-                    st.write("")
-                    continue
-                tokens = ner(line)
-                merged = merge_entities(tokens)
-                html_line = highlight_entities(line, merged, threshold)
-                st.markdown(
-                    f'<div style="margin:0;padding:0;line-height:1.7;">{html_line}</div>',
-                    unsafe_allow_html=True,
-                )

 import re
 import string
     pipeline,
 )
+# Streamlit page setup
+st.set_page_config(page_title="Juristische NER", page_icon="⚖️", layout="wide")
 logging.set_verbosity(logging.ERROR)
 st.markdown(
     padding-left: 3rem;
     padding-right: 3rem;
 }
 header, footer {visibility: hidden;}
 .entity {
     position: relative;
     display: inline-block;
     font-weight: normal;
     cursor: help;
 }
 .entity .tooltip {
     visibility: hidden;
     background-color: #333;
     transition: opacity 0.05s;
     font-size: 11px;
 }
 .entity:hover .tooltip {
     visibility: visible;
     opacity: 1;
 }
 .entity.marked {
     background-color: rgba(255, 230, 0, 0.4);
     line-height: 1.3;
     unsafe_allow_html=True,
 )
+# Entity label mapping
 entity_labels = {
+    "AN": "Rechtsbeistand",
+    "EUN": "EUNorm",
+    "GRT": "Gericht",
+    "GS": "Norm",
     "INN": "Institution",
+    "LD": "Land",
+    "LDS": "Bezirk",
+    "LIT": "Schrifttum",
+    "MRK": "Marke",
+    "ORG": "Organisation",
     "PER": "Person",
+    "RR": "RichterIn",
+    "RS": "Entscheidung",
+    "ST": "Stadt",
+    "STR": "Strasse",
+    "UN": "Unternehmen",
+    "VO": "Verordnung",
+    "VS": "Richtlinie",
+    "VT": "Vertrag",
+    "RED": "Schwärzung",
 }
+# Color generator
 def generate_fixed_colors(keys, alpha=0.25):
     cmap = cm.get_cmap("tab20", len(keys))
     rgba_colors = {}
     return rgba_colors
+ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
+# Caching model
+@st.cache_resource
+def load_ner_pipeline():
+    return pipeline(
+        "ner",
+        model=AutoModelForTokenClassification.from_pretrained("harshildarji/JuraNER"),
+        tokenizer=AutoTokenizer.from_pretrained("harshildarji/JuraNER"),
+    )
+# Caching NER + merge per line
+@st.cache_data(show_spinner=False)
+def get_ner_merged_lines(text):
+    ner = load_ner_pipeline()
+    results = []
+    for line in text.splitlines():
+        if not line.strip():
+            results.append(("", []))
+            continue
+        tokens = ner(line)
+        merged = merge_entities(tokens)
+        results.append((line, merged))
+    return results
+# Entity merging
 def merge_entities(entities):
     if not entities:
         return []
         prev = merged[-1]
         if ent["index"] == prev["index"] + 1:
             tok = ent["word"]
+            prev["word"] += tok[2:] if tok.startswith("##") else " " + tok
             prev["end"] = ent["end"]
             prev["index"] = ent["index"]
             prev["score_sum"] += ent["score"]
     return final
+# Highlighting
 def highlight_entities(line, merged_entities, threshold):
     html = ""
     last_end = 0
     return html
+# UI
+st.markdown("#### Juristische Named Entity Recognition (NER)")
+uploaded_file = st.file_uploader("Bitte laden Sie eine .txt-Datei hoch:", type="txt")
+threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.8, 0.01)
+st.markdown("---")
 if uploaded_file:
     raw_bytes = uploaded_file.read()
     encoding = detect(raw_bytes)["encoding"]
     if encoding is None:
+        st.error("Zeichenkodierung konnte nicht erkannt werden.")
     else:
         text = raw_bytes.decode(encoding)
+        with st.spinner("Modell wird auf jede Zeile angewendet..."):
+            merged_all_lines = get_ner_merged_lines(text)
+        for line, merged in merged_all_lines:
+            if not line.strip():
+                continue
+            html_line = highlight_entities(line, merged, threshold)
+            st.markdown(
+                f'<div style="margin-bottom:0.8rem; line-height:1.7;">{html_line}</div>',
+                unsafe_allow_html=True,
+            )