Spaces:

harshildarji
/

Juristische-Anonymisierung

Running

App Files Files Community

harshildarji committed 28 days ago

Commit

10e7a61

verified ·

1 Parent(s): 4694471

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -75

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ from transformers import (
 )
 def setup_page():
     st.set_page_config(
         page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
@@ -19,57 +20,57 @@ def setup_page():
     logging.set_verbosity(logging.ERROR)
     st.markdown(
         """
-    <style>
-    .block-container {
-        padding-top: 1rem;
-        padding-bottom: 5rem;
-        padding-left: 3rem;
-        padding-right: 3rem;
-    }
-    header, footer {visibility: hidden;}
-    .entity {
-        position: relative;
-        display: inline-block;
-        background-color: transparent;
-        font-weight: normal;
-        cursor: help;
-    }
-    .entity .tooltip {
-        visibility: hidden;
-        background-color: #333;
-        color: #fff;
-        text-align: center;
-        border-radius: 4px;
-        padding: 2px 6px;
-        position: absolute;
-        z-index: 1;
-        bottom: 125%;
-        left: 50%;
-        transform: translateX(-50%);
-        white-space: nowrap;
-        opacity: 0;
-        transition: opacity 0.05s;
-        font-size: 11px;
-    }
-    .entity:hover .tooltip {
-        visibility: visible;
-        opacity: 1;
-    }
-    .entity.marked {
-        background-color: rgba(255, 230, 0, 0.4);
-        line-height: 1.3;
-        padding: 0 1px;
-        border-radius: 0px;
-    }
-    </style>
-    """,
         unsafe_allow_html=True,
     )
 def get_constants():
     entity_importance = {
-        "High": ["PER", "UN", "INN", "MRK"],
         "Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
         "Low": ["LD", "ST", "STR", "LDS", "ORG"],
     }
@@ -93,25 +94,39 @@ def get_constants():
         "VO": "Verordnung",
         "VS": "Richtlinie",
         "VT": "Vertrag",
     }
     return entity_importance, entity_labels
 def generate_fixed_colors(keys, alpha=0.25):
     base_colors = sns.color_palette("tab20", len(keys))
-    rgba_colors = {
         key: f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {alpha})"
         for key, (r, g, b) in zip(keys, base_colors)
     }
-    return rgba_colors
 def load_ner_model():
     tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER")
     model = AutoModelForTokenClassification.from_pretrained("harshildarji/JuraNER")
     return pipeline("ner", model=model, tokenizer=tokenizer)
 def merge_entities(entities):
     if not entities:
         return []
@@ -180,10 +195,8 @@ def highlight_entities(
         start, end = ent["start"], ent["end"]
         label = ent["entity"].split("-")[-1]
         label_desc = entity_labels.get(label, label)
         truncated_score = truncate(ent["score"], 2)
         tooltip = f"{label_desc} ({truncated_score:.2f})"
         color = ENTITY_COLORS.get(label, "#cccccc")
         html += line[last_end:start]
@@ -220,11 +233,14 @@ def highlight_entities(
     return html
 def main():
     setup_page()
     entity_importance, entity_labels = get_constants()
     ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
-    ner = load_ner_model()
     st.markdown("#### Juristische Anonymisierung")
     uploaded_file = st.file_uploader(
@@ -258,38 +274,80 @@ def main():
             return
         text = raw_bytes.decode(encoding)
         label_counters = {}
         anonymized_map = {}
         all_display_keys = []
-        merged_all_lines = []
-        with st.spinner("Modell läuft und verarbeitet die Datei..."):
-            for line in text.splitlines():
-                if not line.strip():
-                    continue
-                tokens = ner(line)
-                merged = merge_entities(tokens)
-                merged_all_lines.append((line, merged))
-                for ent in merged:
-                    label = ent["entity"].split("-")[-1]
-                    if any(
-                        label in entity_importance[lvl] for lvl in importance_levels
-                    ):
-                        key = (ent["word"].lower(), label)
-                        if key not in anonymized_map:
-                            count = label_counters.get(label, 0)
-                            suffix = chr(ord("A") + count)
-                            label_counters[label] = count + 1
-                            anonymized_map[key] = suffix
-                        suffix = anonymized_map[key]
-                        normalized_word = ent["word"].strip().lower()
-                        display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
-                        if display not in all_display_keys:
-                            all_display_keys.append(display)
         all_display_keys.sort(key=lambda tag: tag.lower())
         with st.sidebar:
             st.markdown("### Anonymisierte Entitäten verwalten:")
             selected_keys = []
             for label_code in sorted(

 )
+# Setup & Constants
 def setup_page():
     st.set_page_config(
         page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
     logging.set_verbosity(logging.ERROR)
     st.markdown(
         """
+        <style>
+        .block-container {
+            padding-top: 1rem;
+            padding-bottom: 5rem;
+            padding-left: 3rem;
+            padding-right: 3rem;
+        }
+        header, footer {visibility: hidden;}
+        .entity {
+            position: relative;
+            display: inline-block;
+            background-color: transparent;
+            font-weight: normal;
+            cursor: help;
+        }
+        .entity .tooltip {
+            visibility: hidden;
+            background-color: #333;
+            color: #fff;
+            text-align: center;
+            border-radius: 4px;
+            padding: 2px 6px;
+            position: absolute;
+            z-index: 1;
+            bottom: 125%;
+            left: 50%;
+            transform: translateX(-50%);
+            white-space: nowrap;
+            opacity: 0;
+            transition: opacity 0.05s;
+            font-size: 11px;
+        }
+        .entity:hover .tooltip {
+            visibility: visible;
+            opacity: 1;
+        }
+        .entity.marked {
+            background-color: rgba(255, 230, 0, 0.4);
+            line-height: 1.3;
+            padding: 0 1px;
+            border-radius: 0px;
+        }
+        </style>
+        """,
         unsafe_allow_html=True,
     )
 def get_constants():
     entity_importance = {
+        "High": ["PER", "UN", "INN", "MRK", "RED"],
         "Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
         "Low": ["LD", "ST", "STR", "LDS", "ORG"],
     }
         "VO": "Verordnung",
         "VS": "Richtlinie",
         "VT": "Vertrag",
+        "RED": "Schwärzung",
     }
     return entity_importance, entity_labels
 def generate_fixed_colors(keys, alpha=0.25):
     """Pair every key with an rgba() CSS color string from the "tab20" palette.

     Args:
         keys: Sized iterable of labels; len(keys) colors are requested
             from seaborn and matched to the keys positionally.
         alpha: Value written verbatim as the fourth rgba() component.

     Returns:
         dict mapping each key to "rgba(R, G, B, alpha)", where each
         channel is the palette value multiplied by 255 and truncated
         with int().
     """
     palette = sns.color_palette("tab20", len(keys))
     colors = {}
     for key, (red, green, blue) in zip(keys, palette):
         colors[key] = (
             f"rgba({int(red * 255)}, {int(green * 255)}, {int(blue * 255)}, {alpha})"
         )
     return colors
+@st.cache_resource
 def load_ner_model():
     tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER")
     model = AutoModelForTokenClassification.from_pretrained("harshildarji/JuraNER")
     return pipeline("ner", model=model, tokenizer=tokenizer)
+@st.cache_data(show_spinner=False)
+def ner_merge_lines(text):
+    ner = load_ner_model()
+    merged_lines = []
+    for line in text.splitlines():
+        if not line.strip():
+            continue
+        tokens = ner(line)
+        merged = merge_entities(tokens)
+        merged_lines.append((line, merged))
+    return merged_lines
 def merge_entities(entities):
     if not entities:
         return []
         start, end = ent["start"], ent["end"]
         label = ent["entity"].split("-")[-1]
         label_desc = entity_labels.get(label, label)
         truncated_score = truncate(ent["score"], 2)
         tooltip = f"{label_desc} ({truncated_score:.2f})"
         color = ENTITY_COLORS.get(label, "#cccccc")
         html += line[last_end:start]
     return html
+# Main App
 def main():
     setup_page()
     entity_importance, entity_labels = get_constants()
     ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
+    if "manual_phrases" not in st.session_state:
+        st.session_state.manual_phrases = []
     st.markdown("#### Juristische Anonymisierung")
     uploaded_file = st.file_uploader(
             return
         text = raw_bytes.decode(encoding)
+        with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
+            merged_all_lines = ner_merge_lines(text)
+        manual_phrases = st.session_state.manual_phrases
+        overlap_warnings = set()
+        for idx, (line, merged) in enumerate(merged_all_lines):
+            for phrase in manual_phrases:
+                for match in re.finditer(re.escape(phrase), line.lower()):
+                    start, end = match.start(), match.end()
+                    if any(start < e["end"] and end > e["start"] for e in merged):
+                        overlap_warnings.add(phrase)
+                        continue
+                    merged.append(
+                        {
+                            "start": start,
+                            "end": end,
+                            "word": line[start:end],
+                            "entity": "B-RED",
+                            "score": 1.0,
+                            "index": 9999,
+                        }
+                    )
+            merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))
         label_counters = {}
         anonymized_map = {}
         all_display_keys = []
+        for _, merged in merged_all_lines:
+            for ent in merged:
+                label = ent["entity"].split("-")[-1]
+                if any(label in entity_importance[lvl] for lvl in importance_levels):
+                    key = (ent["word"].lower(), label)
+                    if key not in anonymized_map:
+                        count = label_counters.get(label, 0)
+                        suffix = chr(ord("A") + count)
+                        label_counters[label] = count + 1
+                        anonymized_map[key] = suffix
+                    suffix = anonymized_map[key]
+                    normalized_word = ent["word"].strip().lower()
+                    display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
+                    if display not in all_display_keys:
+                        all_display_keys.append(display)
         all_display_keys.sort(key=lambda tag: tag.lower())
         with st.sidebar:
+            st.markdown("### Neue Phrase schwärzen:")
+            if "manual_phrases" not in st.session_state:
+                st.session_state.manual_phrases = []
+            with st.form("manual_add_form"):
+                new_phrase = st.text_input("Neue Phrase:")
+                submitted = st.form_submit_button("Hinzufügen")
+                with st.sidebar.expander(
+                    "Hinweise zu manuellen Phrasen", expanded=False
+                ):
+                    st.markdown("**Noch in Entwicklung**")
+                    st.markdown(
+                        "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase mit bereits erkannten Entitäten überschneidet oder über mehrere Zeilen erstreckt._"
+                    )
+                if submitted and new_phrase.strip():
+                    cleaned = new_phrase.strip().lower()
+                    if cleaned not in st.session_state.manual_phrases:
+                        st.session_state.manual_phrases.append(cleaned)
+                        st.rerun()
+            st.markdown("---")
             st.markdown("### Anonymisierte Entitäten verwalten:")
             selected_keys = []
             for label_code in sorted(