harshildarji committed on
Commit
10e7a61
·
verified ·
1 Parent(s): 4694471

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -75
app.py CHANGED
@@ -12,6 +12,7 @@ from transformers import (
12
  )
13
 
14
 
 
15
  def setup_page():
16
  st.set_page_config(
17
  page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
@@ -19,57 +20,57 @@ def setup_page():
19
  logging.set_verbosity(logging.ERROR)
20
  st.markdown(
21
  """
22
- <style>
23
- .block-container {
24
- padding-top: 1rem;
25
- padding-bottom: 5rem;
26
- padding-left: 3rem;
27
- padding-right: 3rem;
28
- }
29
- header, footer {visibility: hidden;}
30
- .entity {
31
- position: relative;
32
- display: inline-block;
33
- background-color: transparent;
34
- font-weight: normal;
35
- cursor: help;
36
- }
37
- .entity .tooltip {
38
- visibility: hidden;
39
- background-color: #333;
40
- color: #fff;
41
- text-align: center;
42
- border-radius: 4px;
43
- padding: 2px 6px;
44
- position: absolute;
45
- z-index: 1;
46
- bottom: 125%;
47
- left: 50%;
48
- transform: translateX(-50%);
49
- white-space: nowrap;
50
- opacity: 0;
51
- transition: opacity 0.05s;
52
- font-size: 11px;
53
- }
54
- .entity:hover .tooltip {
55
- visibility: visible;
56
- opacity: 1;
57
- }
58
- .entity.marked {
59
- background-color: rgba(255, 230, 0, 0.4);
60
- line-height: 1.3;
61
- padding: 0 1px;
62
- border-radius: 0px;
63
- }
64
- </style>
65
- """,
66
  unsafe_allow_html=True,
67
  )
68
 
69
 
70
  def get_constants():
71
  entity_importance = {
72
- "High": ["PER", "UN", "INN", "MRK"],
73
  "Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
74
  "Low": ["LD", "ST", "STR", "LDS", "ORG"],
75
  }
@@ -93,25 +94,39 @@ def get_constants():
93
  "VO": "Verordnung",
94
  "VS": "Richtlinie",
95
  "VT": "Vertrag",
 
96
  }
97
  return entity_importance, entity_labels
98
 
99
 
100
  def generate_fixed_colors(keys, alpha=0.25):
101
  base_colors = sns.color_palette("tab20", len(keys))
102
- rgba_colors = {
103
  key: f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {alpha})"
104
  for key, (r, g, b) in zip(keys, base_colors)
105
  }
106
- return rgba_colors
107
 
108
 
 
109
  def load_ner_model():
110
  tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER")
111
  model = AutoModelForTokenClassification.from_pretrained("harshildarji/JuraNER")
112
  return pipeline("ner", model=model, tokenizer=tokenizer)
113
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def merge_entities(entities):
116
  if not entities:
117
  return []
@@ -180,10 +195,8 @@ def highlight_entities(
180
  start, end = ent["start"], ent["end"]
181
  label = ent["entity"].split("-")[-1]
182
  label_desc = entity_labels.get(label, label)
183
-
184
  truncated_score = truncate(ent["score"], 2)
185
  tooltip = f"{label_desc} ({truncated_score:.2f})"
186
-
187
  color = ENTITY_COLORS.get(label, "#cccccc")
188
  html += line[last_end:start]
189
 
@@ -220,11 +233,14 @@ def highlight_entities(
220
  return html
221
 
222
 
 
223
  def main():
224
  setup_page()
225
  entity_importance, entity_labels = get_constants()
226
  ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
227
- ner = load_ner_model()
 
 
228
 
229
  st.markdown("#### Juristische Anonymisierung")
230
  uploaded_file = st.file_uploader(
@@ -258,38 +274,80 @@ def main():
258
  return
259
  text = raw_bytes.decode(encoding)
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  label_counters = {}
262
  anonymized_map = {}
263
  all_display_keys = []
264
- merged_all_lines = []
265
-
266
- with st.spinner("Modell läuft und verarbeitet die Datei..."):
267
- for line in text.splitlines():
268
- if not line.strip():
269
- continue
270
- tokens = ner(line)
271
- merged = merge_entities(tokens)
272
- merged_all_lines.append((line, merged))
273
- for ent in merged:
274
- label = ent["entity"].split("-")[-1]
275
- if any(
276
- label in entity_importance[lvl] for lvl in importance_levels
277
- ):
278
- key = (ent["word"].lower(), label)
279
- if key not in anonymized_map:
280
- count = label_counters.get(label, 0)
281
- suffix = chr(ord("A") + count)
282
- label_counters[label] = count + 1
283
- anonymized_map[key] = suffix
284
- suffix = anonymized_map[key]
285
- normalized_word = ent["word"].strip().lower()
286
- display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
287
- if display not in all_display_keys:
288
- all_display_keys.append(display)
289
 
290
  all_display_keys.sort(key=lambda tag: tag.lower())
291
 
292
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  st.markdown("### Anonymisierte Entitäten verwalten:")
294
  selected_keys = []
295
  for label_code in sorted(
 
12
  )
13
 
14
 
15
+ # Setup & Constants
16
  def setup_page():
17
  st.set_page_config(
18
  page_title="Juristische Anonymisierung", page_icon="⚖️", layout="wide"
 
20
  logging.set_verbosity(logging.ERROR)
21
  st.markdown(
22
  """
23
+ <style>
24
+ .block-container {
25
+ padding-top: 1rem;
26
+ padding-bottom: 5rem;
27
+ padding-left: 3rem;
28
+ padding-right: 3rem;
29
+ }
30
+ header, footer {visibility: hidden;}
31
+ .entity {
32
+ position: relative;
33
+ display: inline-block;
34
+ background-color: transparent;
35
+ font-weight: normal;
36
+ cursor: help;
37
+ }
38
+ .entity .tooltip {
39
+ visibility: hidden;
40
+ background-color: #333;
41
+ color: #fff;
42
+ text-align: center;
43
+ border-radius: 4px;
44
+ padding: 2px 6px;
45
+ position: absolute;
46
+ z-index: 1;
47
+ bottom: 125%;
48
+ left: 50%;
49
+ transform: translateX(-50%);
50
+ white-space: nowrap;
51
+ opacity: 0;
52
+ transition: opacity 0.05s;
53
+ font-size: 11px;
54
+ }
55
+ .entity:hover .tooltip {
56
+ visibility: visible;
57
+ opacity: 1;
58
+ }
59
+ .entity.marked {
60
+ background-color: rgba(255, 230, 0, 0.4);
61
+ line-height: 1.3;
62
+ padding: 0 1px;
63
+ border-radius: 0px;
64
+ }
65
+ </style>
66
+ """,
67
  unsafe_allow_html=True,
68
  )
69
 
70
 
71
  def get_constants():
72
  entity_importance = {
73
+ "High": ["PER", "UN", "INN", "MRK", "RED"],
74
  "Mid": ["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
75
  "Low": ["LD", "ST", "STR", "LDS", "ORG"],
76
  }
 
94
  "VO": "Verordnung",
95
  "VS": "Richtlinie",
96
  "VT": "Vertrag",
97
+ "RED": "Schwärzung",
98
  }
99
  return entity_importance, entity_labels
100
 
101
 
def generate_fixed_colors(keys, alpha=0.25):
    """Map each key to a translucent CSS colour taken from the "tab20" palette.

    Args:
        keys: Sized iterable of labels; one palette entry is requested per
            element and paired with the keys in iteration order.
        alpha: Opacity value written verbatim into every ``rgba(...)`` string.

    Returns:
        A dict mapping each key to an ``"rgba(r, g, b, alpha)"`` string whose
        channels are the palette's 0-1 floats scaled to 0-255 and truncated.
    """
    palette = sns.color_palette("tab20", len(keys))
    colors = {}
    for key, (r, g, b) in zip(keys, palette):
        # int() truncates rather than rounds; keep that so colours are stable.
        colors[key] = f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {alpha})"
    return colors
 
108
 
109
 
@st.cache_resource
def load_ner_model():
    """Build the Hugging Face "ner" pipeline for the JuraNER checkpoint.

    The function is wrapped in ``st.cache_resource``; presumably this keeps a
    single pipeline instance alive across Streamlit reruns (see Streamlit docs).

    Returns:
        The object returned by ``transformers.pipeline("ner", ...)``.
    """
    checkpoint = "harshildarji/JuraNER"
    # Tokenizer is loaded before the model, as in the original call order.
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    return pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
115
 
116
 
@st.cache_data(show_spinner=False)
def ner_merge_lines(text):
    """Run the NER pipeline over every non-blank line of ``text``.

    Args:
        text: The full document as a single string.

    Returns:
        A list of ``(line, merged_entities)`` tuples in document order, where
        ``merged_entities`` is whatever ``merge_entities`` returns for the raw
        pipeline output of that line. Lines that are empty or contain only
        whitespace are skipped and produce no tuple.
    """
    tagger = load_ner_model()
    # Filter first so the model is never invoked on blank lines.
    non_blank = (row for row in text.splitlines() if row.strip())
    return [(row, merge_entities(tagger(row))) for row in non_blank]
128
+
129
+
130
  def merge_entities(entities):
131
  if not entities:
132
  return []
 
195
  start, end = ent["start"], ent["end"]
196
  label = ent["entity"].split("-")[-1]
197
  label_desc = entity_labels.get(label, label)
 
198
  truncated_score = truncate(ent["score"], 2)
199
  tooltip = f"{label_desc} ({truncated_score:.2f})"
 
200
  color = ENTITY_COLORS.get(label, "#cccccc")
201
  html += line[last_end:start]
202
 
 
233
  return html
234
 
235
 
236
+ # Main App
237
  def main():
238
  setup_page()
239
  entity_importance, entity_labels = get_constants()
240
  ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
241
+
242
+ if "manual_phrases" not in st.session_state:
243
+ st.session_state.manual_phrases = []
244
 
245
  st.markdown("#### Juristische Anonymisierung")
246
  uploaded_file = st.file_uploader(
 
274
  return
275
  text = raw_bytes.decode(encoding)
276
 
277
+ with st.spinner("Modell wird einmalig auf die Datei angewendet..."):
278
+ merged_all_lines = ner_merge_lines(text)
279
+
280
+ manual_phrases = st.session_state.manual_phrases
281
+ overlap_warnings = set()
282
+
283
+ for idx, (line, merged) in enumerate(merged_all_lines):
284
+ for phrase in manual_phrases:
285
+ for match in re.finditer(re.escape(phrase), line.lower()):
286
+ start, end = match.start(), match.end()
287
+
288
+ if any(start < e["end"] and end > e["start"] for e in merged):
289
+ overlap_warnings.add(phrase)
290
+ continue
291
+
292
+ merged.append(
293
+ {
294
+ "start": start,
295
+ "end": end,
296
+ "word": line[start:end],
297
+ "entity": "B-RED",
298
+ "score": 1.0,
299
+ "index": 9999,
300
+ }
301
+ )
302
+
303
+ merged_all_lines[idx] = (line, sorted(merged, key=lambda x: x["start"]))
304
+
305
  label_counters = {}
306
  anonymized_map = {}
307
  all_display_keys = []
308
+
309
+ for _, merged in merged_all_lines:
310
+ for ent in merged:
311
+ label = ent["entity"].split("-")[-1]
312
+ if any(label in entity_importance[lvl] for lvl in importance_levels):
313
+ key = (ent["word"].lower(), label)
314
+ if key not in anonymized_map:
315
+ count = label_counters.get(label, 0)
316
+ suffix = chr(ord("A") + count)
317
+ label_counters[label] = count + 1
318
+ anonymized_map[key] = suffix
319
+ suffix = anonymized_map[key]
320
+ normalized_word = ent["word"].strip().lower()
321
+ display = f"{entity_labels.get(label, label)} {suffix} : {normalized_word}"
322
+ if display not in all_display_keys:
323
+ all_display_keys.append(display)
 
 
 
 
 
 
 
 
 
324
 
325
  all_display_keys.sort(key=lambda tag: tag.lower())
326
 
327
  with st.sidebar:
328
+ st.markdown("### Neue Phrase schwärzen:")
329
+
330
+ if "manual_phrases" not in st.session_state:
331
+ st.session_state.manual_phrases = []
332
+
333
+ with st.form("manual_add_form"):
334
+ new_phrase = st.text_input("Neue Phrase:")
335
+ submitted = st.form_submit_button("Hinzufügen")
336
+ with st.sidebar.expander(
337
+ "Hinweise zu manuellen Phrasen", expanded=False
338
+ ):
339
+ st.markdown("**Noch in Entwicklung**")
340
+ st.markdown(
341
+ "_Manuelle Schwärzungen können fehlschlagen, wenn sich die Phrase mit bereits erkannten Entitäten überschneidet oder über mehrere Zeilen erstreckt._"
342
+ )
343
+
344
+ if submitted and new_phrase.strip():
345
+ cleaned = new_phrase.strip().lower()
346
+ if cleaned not in st.session_state.manual_phrases:
347
+ st.session_state.manual_phrases.append(cleaned)
348
+ st.rerun()
349
+
350
+ st.markdown("---")
351
  st.markdown("### Anonymisierte Entitäten verwalten:")
352
  selected_keys = []
353
  for label_code in sorted(