Minibase committed on
Commit 89fb98c · verified · 1 Parent(s): 4924d32

Upload run_benchmarks.py with huggingface_hub

Files changed (1)
  1. run_benchmarks.py +44 -27
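
The commit message says the file was pushed with the huggingface_hub client. A minimal sketch of such an upload is shown below; the repo id is hypothetical, only the commit message is taken from this page.

from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="run_benchmarks.py",
    path_in_repo="run_benchmarks.py",
    repo_id="Minibase/deid-benchmarks",  # hypothetical repo id
    commit_message="Upload run_benchmarks.py with huggingface_hub",
)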
run_benchmarks.py CHANGED
@@ -114,38 +114,55 @@ class DeIdBenchmarkRunner:
 
         return True
 
-    def calculate_semantic_preservation(self, predicted: str, expected: str) -> float:
-        """Calculate semantic preservation based on placeholder structure"""
-        # Simple similarity: compare placeholder types and counts
-        pred_placeholders = self.extract_placeholders(predicted)
-        expected_placeholders = self.extract_placeholders(expected)
-
-        if not expected_placeholders:
-            return 1.0
-
-        # Count placeholder types
-        def count_types(placeholders):
-            types = {}
-            for ph in placeholders:
-                # Extract type (e.g., "NAME" from "NAME_1")
-                ptype = ph.split('_')[0]
-                types[ptype] = types.get(ptype, 0) + 1
-            return types
-
-        pred_types = count_types(pred_placeholders)
-        expected_types = count_types(expected_placeholders)
-
-        # Calculate similarity based on type distribution
-        all_types = set(pred_types.keys()) | set(expected_types.keys())
-        similarity = 0
-
-        for ptype in all_types:
-            pred_count = pred_types.get(ptype, 0)
-            exp_count = expected_types.get(ptype, 0)
-            if exp_count > 0:
-                similarity += min(pred_count, exp_count) / exp_count
-
-        return similarity / len(all_types) if all_types else 1.0
+    def calculate_semantic_preservation(self, input_text: str, predicted: str, expected: str) -> float:
+        """Calculate semantic preservation - how well the meaning is preserved after de-identification"""
+        # For de-identification, semantic preservation should focus on:
+        # 1. Whether the core message/content is maintained
+        # 2. Whether the text structure remains coherent
+        # 3. Whether placeholder density is reasonable
+
+        # Simple approach: compare text length and placeholder density
+        input_words = len(input_text.split())
+        expected_words = len(expected.split())
+        predicted_words = len(predicted.split())
+
+        # Length preservation (closer to 1.0 is better)
+        if expected_words == 0:
+            length_preservation = 1.0
+        else:
+            length_ratio = predicted_words / expected_words
+            # Penalize if too different in length (ideal ratio around 0.8-1.2)
+            if 0.5 <= length_ratio <= 2.0:
+                length_preservation = 1.0 - abs(1.0 - length_ratio) * 0.5
+            else:
+                length_preservation = 0.1  # Heavily penalize extreme length differences
+
+        # Placeholder density (should be reasonable, not too sparse or dense)
+        pred_placeholders = self.extract_placeholders(predicted)
+        placeholder_ratio = len(pred_placeholders) / max(predicted_words, 1)
+
+        if 0.05 <= placeholder_ratio <= 0.3:  # Reasonable placeholder density
+            density_score = 1.0
+        elif placeholder_ratio < 0.05:  # Too few placeholders
+            density_score = placeholder_ratio / 0.05
+        else:  # Too many placeholders
+            density_score = max(0.1, 1.0 - (placeholder_ratio - 0.3) * 2)
+
+        # Structure preservation (check if basic sentence structure is maintained)
+        # Simple check: count punctuation marks as proxy for structure
+        input_punct = len(re.findall(r'[.!?]', input_text))
+        predicted_punct = len(re.findall(r'[.!?]', predicted))
+
+        if input_punct == 0:
+            structure_score = 1.0
+        else:
+            structure_ratio = min(predicted_punct, input_punct * 1.5) / input_punct
+            structure_score = min(1.0, structure_ratio)
+
+        # Combine scores (weighted average)
+        final_score = (length_preservation * 0.4) + (density_score * 0.4) + (structure_score * 0.2)
+
+        return max(0.0, min(1.0, final_score))  # Clamp to [0,1]
 
     def call_model(self, instruction: str, input_text: str) -> Tuple[str, float]:
         """Call the de-identification model and measure latency"""
@@ -212,7 +229,7 @@ class DeIdBenchmarkRunner:
         # Calculate metrics
         pii_detection = self.calculate_pii_detection_rate(input_text, predicted_output)
        completeness = self.calculate_completeness(predicted_output)
-        semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
+        semantic_preservation = self.calculate_semantic_preservation(input_text, predicted_output, expected_output)
 
         # Update totals
         total_pii_detection += pii_detection
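
For reference, the replacement metric blends three heuristics with fixed weights: length preservation (0.4), placeholder density (0.4), and punctuation-based structure preservation (0.2). The standalone sketch below replays that arithmetic on a toy example outside the class; the placeholder regex is an assumption, since the body of extract_placeholders is not part of this diff.

import re

# Assumed placeholder format (e.g. "NAME_1", "DATE_2"); the real
# extract_placeholders helper lives elsewhere in run_benchmarks.py and may differ.
def extract_placeholders(text: str) -> list:
    return re.findall(r'[A-Z]+_\d+', text)

def semantic_preservation(input_text: str, predicted: str, expected: str) -> float:
    predicted_words = len(predicted.split())
    expected_words = len(expected.split())

    # 1) Length preservation: predicted vs. expected word count.
    if expected_words == 0:
        length_preservation = 1.0
    else:
        ratio = predicted_words / expected_words
        length_preservation = 1.0 - abs(1.0 - ratio) * 0.5 if 0.5 <= ratio <= 2.0 else 0.1

    # 2) Placeholder density: placeholders per predicted word, ideally 5-30%.
    density = len(extract_placeholders(predicted)) / max(predicted_words, 1)
    if 0.05 <= density <= 0.3:
        density_score = 1.0
    elif density < 0.05:
        density_score = density / 0.05
    else:
        density_score = max(0.1, 1.0 - (density - 0.3) * 2)

    # 3) Structure preservation: sentence-ending punctuation retained.
    input_punct = len(re.findall(r'[.!?]', input_text))
    predicted_punct = len(re.findall(r'[.!?]', predicted))
    if input_punct == 0:
        structure_score = 1.0
    else:
        structure_score = min(1.0, min(predicted_punct, input_punct * 1.5) / input_punct)

    # Weighted blend, clamped to [0, 1].
    score = length_preservation * 0.4 + density_score * 0.4 + structure_score * 0.2
    return max(0.0, min(1.0, score))

if __name__ == "__main__":
    original = "John Smith flew to Boston on May 3. Call him at 555-0199."
    redacted = "NAME_1 flew to LOCATION_1 on DATE_1. Call him at PHONE_1."
    # Matching length, moderate placeholder density, and both sentences intact
    # give a score of about 0.92.
    print(round(semantic_preservation(original, redacted, redacted), 3))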