Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ from transformers import (
|
|
12 |
)
|
13 |
|
14 |
|
|
|
15 |
def setup_page():
|
16 |
st.set_page_config(
|
17 |
page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
|
@@ -19,57 +20,57 @@ def setup_page():
|
|
19 |
logging.set_verbosity(logging.ERROR)
|
20 |
st.markdown(
|
21 |
"""
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
unsafe_allow_html=True,
|
67 |
)
|
68 |
|
69 |
|
70 |
def get_constants():
|
71 |
entity_importance = {
|
72 |
-
"High": ["PER", "UN", "INN", "MRK"],
|
73 |
"Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
|
74 |
"Low": ["LD", "ST", "STR", "LDS", "ORG"],
|
75 |
}
|
@@ -93,25 +94,39 @@ def get_constants():
|
|
93 |
"VO": "Verordnung",
|
94 |
"VS": "Richtlinie",
|
95 |
"VT": "Vertrag",
|
|
|
96 |
}
|
97 |
return entity_importance, entity_labels
|
98 |
|
99 |
|
100 |
def generate_fixed_colors(keys, alpha=0.25):
|
101 |
base_colors = sns.color_palette("tab20", len(keys))
|
102 |
-
|
103 |
key: f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {alpha})"
|
104 |
for key, (r, g, b) in zip(keys, base_colors)
|
105 |
}
|
106 |
-
return rgba_colors
|
107 |
|
108 |
|
|
|
109 |
def load_ner_model():
    """Build the token-classification pipeline for the JuraNER checkpoint.

    The tokenizer is loaded first, then the model, both from the
    ``harshildarji/JuraNER`` checkpoint.

    Returns:
        The object returned by ``pipeline("ner", ...)``, wired to that
        tokenizer and model.
    """
    checkpoint = "harshildarji/JuraNER"
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still fetched before the model.
    return pipeline(
        "ner",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForTokenClassification.from_pretrained(checkpoint),
    )
|
113 |
|
114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
def merge_entities(entities):
|
116 |
if not entities:
|
117 |
return []
|
@@ -180,10 +195,8 @@ def highlight_entities(
|
|
180 |
start, end = ent["start"], ent["end"]
|
181 |
label = ent["entity"].split("-")[-1]
|
182 |
label_desc = entity_labels.get(label, label)
|
183 |
-
|
184 |
truncated_score = truncate(ent["score"], 2)
|
185 |
tooltip = f"{label_desc} ({truncated_score:.2f})"
|
186 |
-
|
187 |
color = ENTITY_COLORS.get(label, "#cccccc")
|
188 |
html += line[last_end:start]
|
189 |
|
@@ -220,11 +233,14 @@ def highlight_entities(
|
|
220 |
return html
|
221 |
|
222 |
|
|
|
223 |
def main():
|
224 |
setup_page()
|
225 |
entity_importance, entity_labels = get_constants()
|
226 |
ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
|
227 |
-
|
|
|
|
|
228 |
|
229 |
st.markdown("#### Juristische Anonymisierung")
|
230 |
uploaded_file = st.file_uploader(
|
@@ -258,38 +274,80 @@ def main():
|
|
258 |
return
|
259 |
text = raw_bytes.decode(encoding)
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
label_counters = {}
|
262 |
anonymized_map = {}
|
263 |
all_display_keys = []
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
if
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
):
|
278 |
-
|
279 |
-
|
280 |
-
count = label_counters.get(label, 0)
|
281 |
-
suffix = chr(ord("A") + count)
|
282 |
-
label_counters[label] = count + 1
|
283 |
-
anonymized_map[key] = suffix
|
284 |
-
suffix = anonymized_map[key]
|
285 |
-
normalized_word = ent["word"].strip().lower()
|
286 |
-
display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
|
287 |
-
if display not in all_display_keys:
|
288 |
-
all_display_keys.append(display)
|
289 |
|
290 |
all_display_keys.sort(key=lambda tag: tag.lower())
|
291 |
|
292 |
with st.sidebar:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
st.markdown("### Anonymisierte Entitäten verwalten:")
|
294 |
selected_keys = []
|
295 |
for label_code in sorted(
|
|
|
12 |
)
|
13 |
|
14 |
|
15 |
+
# Setup & Constants
|
16 |
def setup_page():
|
17 |
st.set_page_config(
|
18 |
page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
|
|
|
20 |
logging.set_verbosity(logging.ERROR)
|
21 |
st.markdown(
|
22 |
"""
|
23 |
+
<style>
|
24 |
+
.block-container {
|
25 |
+
padding-top: 1rem;
|
26 |
+
padding-bottom: 5rem;
|
27 |
+
padding-left: 3rem;
|
28 |
+
padding-right: 3rem;
|
29 |
+
}
|
30 |
+
header, footer {visibility: hidden;}
|
31 |
+
.entity {
|
32 |
+
position: relative;
|
33 |
+
display: inline-block;
|
34 |
+
background-color: transparent;
|
35 |
+
font-weight: normal;
|
36 |
+
cursor: help;
|
37 |
+
}
|
38 |
+
.entity .tooltip {
|
39 |
+
visibility: hidden;
|
40 |
+
background-color: #333;
|
41 |
+
color: #fff;
|
42 |
+
text-align: center;
|
43 |
+
border-radius: 4px;
|
44 |
+
padding: 2px 6px;
|
45 |
+
position: absolute;
|
46 |
+
z-index: 1;
|
47 |
+
bottom: 125%;
|
48 |
+
left: 50%;
|
49 |
+
transform: translateX(-50%);
|
50 |
+
white-space: nowrap;
|
51 |
+
opacity: 0;
|
52 |
+
transition: opacity 0.05s;
|
53 |
+
font-size: 11px;
|
54 |
+
}
|
55 |
+
.entity:hover .tooltip {
|
56 |
+
visibility: visible;
|
57 |
+
opacity: 1;
|
58 |
+
}
|
59 |
+
.entity.marked {
|
60 |
+
background-color: rgba(255, 230, 0, 0.4);
|
61 |
+
line-height: 1.3;
|
62 |
+
padding: 0 1px;
|
63 |
+
border-radius: 0px;
|
64 |
+
}
|
65 |
+
</style>
|
66 |
+
""",
|
67 |
unsafe_allow_html=True,
|
68 |
)
|
69 |
|
70 |
|
71 |
def get_constants():
|
72 |
entity_importance = {
|
73 |
+
"High": ["PER", "UN", "INN", "MRK", "RED"],
|
74 |
"Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
|
75 |
"Low": ["LD", "ST", "STR", "LDS", "ORG"],
|
76 |
}
|
|
|
94 |
"VO": "Verordnung",
|
95 |
"VS": "Richtlinie",
|
96 |
"VT": "Vertrag",
|
97 |
+
"RED": "Schwärzung",
|
98 |
}
|
99 |
return entity_importance, entity_labels
|
100 |
|
101 |
|
102 |
def generate_fixed_colors(keys, alpha=0.25):
    """Map every key to a CSS ``rgba(...)`` string taken from the tab20 palette.

    Args:
        keys: Sequence of labels; one palette entry is requested per key and
            paired with the keys in order.
        alpha: Value written verbatim into the alpha slot of each colour
            string (defaults to 0.25).

    Returns:
        dict mapping each key to ``"rgba(R, G, B, alpha)"``.
    """
    palette = sns.color_palette("tab20", len(keys))
    colors = {}
    for key, rgb in zip(keys, palette):
        # Presumably seaborn yields channels as floats in [0, 1] (confirm
        # against the seaborn docs); scale to 0-255 and truncate with int().
        red, green, blue = (int(channel * 255) for channel in rgb)
        colors[key] = f"rgba({red}, {green}, {blue}, {alpha})"
    return colors
|
|
|
108 |
|
109 |
|
110 |
+
@st.cache_resource
def load_ner_model():
    """Build the token-classification pipeline for the JuraNER checkpoint.

    The tokenizer is loaded first, then the model, both from the
    ``harshildarji/JuraNER`` checkpoint. The function is wrapped in
    ``st.cache_resource``; see Streamlit's documentation for the exact
    caching semantics.

    Returns:
        The object returned by ``pipeline("ner", ...)``, wired to that
        tokenizer and model.
    """
    checkpoint = "harshildarji/JuraNER"
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still fetched before the model.
    return pipeline(
        "ner",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForTokenClassification.from_pretrained(checkpoint),
    )
|
115 |
|
116 |
|
117 |
+
@st.cache_data(show_spinner=False)
def ner_merge_lines(text):
    """Run the NER pipeline over each non-blank line of ``text``.

    Args:
        text: Full document text; it is split with ``str.splitlines()``.

    Returns:
        A list of ``(line, merged)`` tuples, where ``merged`` is whatever
        ``merge_entities`` returns for the pipeline output of that line.
        Lines that are empty or whitespace-only are dropped, so positions in
        the list do not correspond to line numbers in ``text``.
    """
    tagger = load_ner_model()
    return [
        (row, merge_entities(tagger(row)))
        for row in text.splitlines()
        if row.strip()
    ]
|
128 |
+
|
129 |
+
|
130 |
def merge_entities(entities):
|
131 |
if not entities:
|
132 |
return []
|
|
|
195 |
start, end = ent["start"], ent["end"]
|
196 |
label = ent["entity"].split("-")[-1]
|
197 |
label_desc = entity_labels.get(label, label)
|
|
|
198 |
truncated_score = truncate(ent["score"], 2)
|
199 |
tooltip = f"{label_desc} ({truncated_score:.2f})"
|
|
|
200 |
color = ENTITY_COLORS.get(label, "#cccccc")
|
201 |
html += line[last_end:start]
|
202 |
|
|
|
233 |
return html
|
234 |
|
235 |
|
236 |
+
# Main App
|
237 |
def main():
|
238 |
setup_page()
|
239 |
entity_importance, entity_labels = get_constants()
|
240 |
ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
|
241 |
+
|
242 |
+
if "manual_phrases" not in st.session_state:
|
243 |
+
st.session_state.manual_phrases = []
|
244 |
|
245 |
st.markdown("#### Juristische Anonymisierung")
|
246 |
uploaded_file = st.file_uploader(
|
|
|
274 |
return
|
275 |
text = raw_bytes.decode(encoding)
|
276 |
|
277 |
+
with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
|
278 |
+
merged_all_lines = ner_merge_lines(text)
|
279 |
+
|
280 |
+
manual_phrases = st.session_state.manual_phrases
|
281 |
+
overlap_warnings = set()
|
282 |
+
|
283 |
+
for idx, (line, merged) in enumerate(merged_all_lines):
|
284 |
+
for phrase in manual_phrases:
|
285 |
+
for match in re.finditer(re.escape(phrase), line.lower()):
|
286 |
+
start, end = match.start(), match.end()
|
287 |
+
|
288 |
+
if any(start < e["end"] and end > e["start"] for e in merged):
|
289 |
+
overlap_warnings.add(phrase)
|
290 |
+
continue
|
291 |
+
|
292 |
+
merged.append(
|
293 |
+
{
|
294 |
+
"start": start,
|
295 |
+
"end": end,
|
296 |
+
"word": line[start:end],
|
297 |
+
"entity": "B-RED",
|
298 |
+
"score": 1.0,
|
299 |
+
"index": 9999,
|
300 |
+
}
|
301 |
+
)
|
302 |
+
|
303 |
+
merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))
|
304 |
+
|
305 |
label_counters = {}
|
306 |
anonymized_map = {}
|
307 |
all_display_keys = []
|
308 |
+
|
309 |
+
for _, merged in merged_all_lines:
|
310 |
+
for ent in merged:
|
311 |
+
label = ent["entity"].split("-")[-1]
|
312 |
+
if any(label in entity_importance[lvl] for lvl in importance_levels):
|
313 |
+
key = (ent["word"].lower(), label)
|
314 |
+
if key not in anonymized_map:
|
315 |
+
count = label_counters.get(label, 0)
|
316 |
+
suffix = chr(ord("A") + count)
|
317 |
+
label_counters[label] = count + 1
|
318 |
+
anonymized_map[key] = suffix
|
319 |
+
suffix = anonymized_map[key]
|
320 |
+
normalized_word = ent["word"].strip().lower()
|
321 |
+
display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
|
322 |
+
if display not in all_display_keys:
|
323 |
+
all_display_keys.append(display)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
|
325 |
all_display_keys.sort(key=lambda tag: tag.lower())
|
326 |
|
327 |
with st.sidebar:
|
328 |
+
st.markdown("### Neue Phrase schwärzen:")
|
329 |
+
|
330 |
+
if "manual_phrases" not in st.session_state:
|
331 |
+
st.session_state.manual_phrases = []
|
332 |
+
|
333 |
+
with st.form("manual_add_form"):
|
334 |
+
new_phrase = st.text_input("Neue Phrase:")
|
335 |
+
submitted = st.form_submit_button("Hinzufügen")
|
336 |
+
with st.sidebar.expander(
|
337 |
+
"Hinweise zu manuellen Phrasen", expanded=False
|
338 |
+
):
|
339 |
+
st.markdown("**Noch in Entwicklung**")
|
340 |
+
st.markdown(
|
341 |
+
"_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase mit bereits erkannten Entitäten überschneidet oder über mehrere Zeilen erstreckt._"
|
342 |
+
)
|
343 |
+
|
344 |
+
if submitted and new_phrase.strip():
|
345 |
+
cleaned = new_phrase.strip().lower()
|
346 |
+
if cleaned not in st.session_state.manual_phrases:
|
347 |
+
st.session_state.manual_phrases.append(cleaned)
|
348 |
+
st.rerun()
|
349 |
+
|
350 |
+
st.markdown("---")
|
351 |
st.markdown("### Anonymisierte Entitäten verwalten:")
|
352 |
selected_keys = []
|
353 |
for label_code in sorted(
|