Minibase commited on
Commit
867d46c
·
verified ·
1 Parent(s): d5d5f9e

Upload run_benchmarks.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_benchmarks.py +40 -53
run_benchmarks.py CHANGED
@@ -52,15 +52,7 @@ class DeIdBenchmarkRunner:
52
  print(f"✅ Loaded {len(examples)} examples from {dataset_path}")
53
  return examples
54
 
55
- def categorize_domain(self, text: str) -> str:
56
- """Categorize text by domain based on keywords"""
57
- text_lower = text.lower()
58
-
59
- for domain, info in self.config["metrics"]["domain_performance"].items():
60
- if any(keyword in text_lower for keyword in info["keywords"]):
61
- return domain
62
-
63
- return "general"
64
 
65
  def extract_placeholders(self, text: str) -> List[str]:
66
  """Extract all placeholder tags from text (e.g., [NAME_1], [DOB_1])"""
@@ -68,18 +60,41 @@ class DeIdBenchmarkRunner:
68
  pattern = r'\[([A-Z_]+_\d+)\]'
69
  return re.findall(pattern, text)
70
 
71
- def calculate_pii_detection_rate(self, predicted: str, expected: str) -> float:
72
- """Calculate how many expected PII elements were detected"""
73
- expected_placeholders = set(self.extract_placeholders(expected))
 
74
 
75
- if not expected_placeholders:
76
- return 1.0 # No PII to detect
 
 
 
 
77
 
78
- predicted_placeholders = set(self.extract_placeholders(predicted))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Calculate overlap
81
- detected = len(expected_placeholders.intersection(predicted_placeholders))
82
- return detected / len(expected_placeholders)
83
 
84
  def calculate_completeness(self, predicted: str) -> bool:
85
  """Check if response appears to have no obvious PII remaining"""
@@ -177,8 +192,6 @@ class DeIdBenchmarkRunner:
177
  total_completeness = 0
178
  total_semantic_preservation = 0
179
  total_latency = 0
180
- domain_counts = {}
181
- domain_metrics = {}
182
 
183
  successful_requests = 0
184
 
@@ -190,10 +203,6 @@ class DeIdBenchmarkRunner:
190
  input_text = example[self.config["datasets"]["benchmark_dataset"]["input_field"]]
191
  expected_output = example[self.config["datasets"]["benchmark_dataset"]["expected_output_field"]]
192
 
193
- # Categorize domain
194
- domain = self.categorize_domain(input_text)
195
- domain_counts[domain] = domain_counts.get(domain, 0) + 1
196
-
197
  # Call model
198
  predicted_output, latency = self.call_model(instruction, input_text)
199
 
@@ -201,7 +210,7 @@ class DeIdBenchmarkRunner:
201
  successful_requests += 1
202
 
203
  # Calculate metrics
204
- pii_detection = self.calculate_pii_detection_rate(predicted_output, expected_output)
205
  completeness = self.calculate_completeness(predicted_output)
206
  semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
207
 
@@ -211,22 +220,12 @@ class DeIdBenchmarkRunner:
211
  total_semantic_preservation += semantic_preservation
212
  total_latency += latency
213
 
214
- # Update domain metrics
215
- if domain not in domain_metrics:
216
- domain_metrics[domain] = {"pii_detection": 0, "completeness": 0, "semantic": 0, "count": 0}
217
-
218
- domain_metrics[domain]["pii_detection"] += pii_detection
219
- domain_metrics[domain]["completeness"] += completeness
220
- domain_metrics[domain]["semantic"] += semantic_preservation
221
- domain_metrics[domain]["count"] += 1
222
-
223
  # Store example if requested
224
  if len(self.results["examples"]) < self.config["output"]["max_examples"]:
225
  self.results["examples"].append({
226
  "input": input_text,
227
  "expected": expected_output,
228
  "predicted": predicted_output,
229
- "domain": domain,
230
  "metrics": {
231
  "pii_detection": pii_detection,
232
  "completeness": completeness,
@@ -246,16 +245,6 @@ class DeIdBenchmarkRunner:
246
  "total_requests": len(examples)
247
  }
248
 
249
- # Calculate domain performance
250
- for domain, metrics in domain_metrics.items():
251
- count = metrics["count"]
252
- self.results["domain_performance"][domain] = {
253
- "sample_count": count,
254
- "pii_detection_rate": metrics["pii_detection"] / count,
255
- "completeness_score": metrics["completeness"] / count,
256
- "semantic_preservation": metrics["semantic"] / count
257
- }
258
-
259
  self.save_results()
260
 
261
  def save_results(self):
@@ -292,20 +281,18 @@ class DeIdBenchmarkRunner:
292
  | Semantic Preservation | {m.get('semantic_preservation', 0):.3f} | How well meaning is preserved |
293
  | Average Latency | {m.get('average_latency_ms', 0):.1f}ms | Response time performance |
294
 
295
- ## Domain Performance
296
 
297
- """
 
 
298
 
299
- for domain, metrics in self.results["domain_performance"].items():
300
- summary += f"### {domain.title()} Domain ({metrics['sample_count']} samples)\n"
301
- summary += f"- PII Detection: {metrics['pii_detection_rate']:.3f}\n"
302
- summary += f"- Completeness: {metrics['completeness_score']:.3f}\n"
303
- summary += f"- Semantic Preservation: {metrics['semantic_preservation']:.3f}\n\n"
304
 
305
  if self.config["output"]["include_examples"] and self.results["examples"]:
306
  summary += "## Example Results\n\n"
307
  for i, example in enumerate(self.results["examples"][:3]): # Show first 3 examples
308
- summary += f"### Example {i+1} ({example['domain']} domain)\n"
309
  summary += f"**Input:** {example['input'][:100]}...\n"
310
  summary += f"**Expected:** {example['expected'][:100]}...\n"
311
  summary += f"**Predicted:** {example['predicted'][:100]}...\n"
 
52
  print(f"✅ Loaded {len(examples)} examples from {dataset_path}")
53
  return examples
54
 
55
+ # Removed domain categorization as requested
 
 
 
 
 
 
 
 
56
 
57
  def extract_placeholders(self, text: str) -> List[str]:
58
  """Extract all placeholder tags from text (e.g., [NAME_1], [DOB_1])"""
 
60
  pattern = r'\[([A-Z_]+_\d+)\]'
61
  return re.findall(pattern, text)
62
 
63
+ def calculate_pii_detection_rate(self, input_text: str, predicted: str) -> float:
64
+ """Calculate PII detection rate - if input has PII and output has placeholders, count as success"""
65
+ # Check if input contains any PII patterns
66
+ input_has_pii = self._input_contains_pii(input_text)
67
 
68
+ if not input_has_pii:
69
+ return 1.0 # No PII in input, so detection is perfect
70
+
71
+ # Check if output contains any placeholders at all
72
+ predicted_placeholders = self.extract_placeholders(predicted)
73
+ output_has_placeholders = len(predicted_placeholders) > 0
74
 
75
+ # If input has PII and output has placeholders, count as successful detection
76
+ return 1.0 if output_has_placeholders else 0.0
77
+
78
+ def _input_contains_pii(self, input_text: str) -> bool:
79
+ """Check if input text contains personal identifiable information"""
80
+ pii_patterns = [
81
+ r'\b\d{4}-\d{2}-\d{2}\b', # Dates like 1985-03-15
82
+ r'\b\d{1,3}/\d{1,2}/\d{4}\b', # Dates like 05/12/1980
83
+ r'\b\d{1,3}\s+[A-Z][a-z]+\s+(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard)\b', # Addresses
84
+ r'\(\d{3}\)\s*\d{3}-\d{4}\b', # Phone numbers like (555) 123-4567
85
+ r'\+?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{4}\b', # International phone numbers
86
+ r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b', # Names (First Last)
87
+ r'\b[A-Z][a-z]+\s+[A-Z]\.\s*[A-Z][a-z]+\b', # Names with middle initial
88
+ r'\b\d+@\w+\.\w+\b', # Email addresses
89
+ r'\b[A-Z]{2,}\d+\b', # IDs like EMP-001-XYZ
90
+ r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?\b', # Monetary amounts like $85,000
91
+ r'\b\d{3}-\d{2}-\d{4}\b', # SSN-like patterns
92
+ r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.\s+[A-Z][a-z]+\b', # Titles with names
93
+ r'\b\d{5}(?:-\d{4})?\b', # ZIP codes
94
+ r'\b[A-Z][a-z]+,\s+[A-Z]{2}\s+\d{5}\b', # City, State ZIP
95
+ ]
96
 
97
+ return any(re.search(pattern, input_text) for pattern in pii_patterns)
 
 
98
 
99
  def calculate_completeness(self, predicted: str) -> bool:
100
  """Check if response appears to have no obvious PII remaining"""
 
192
  total_completeness = 0
193
  total_semantic_preservation = 0
194
  total_latency = 0
 
 
195
 
196
  successful_requests = 0
197
 
 
203
  input_text = example[self.config["datasets"]["benchmark_dataset"]["input_field"]]
204
  expected_output = example[self.config["datasets"]["benchmark_dataset"]["expected_output_field"]]
205
 
 
 
 
 
206
  # Call model
207
  predicted_output, latency = self.call_model(instruction, input_text)
208
 
 
210
  successful_requests += 1
211
 
212
  # Calculate metrics
213
+ pii_detection = self.calculate_pii_detection_rate(input_text, predicted_output)
214
  completeness = self.calculate_completeness(predicted_output)
215
  semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
216
 
 
220
  total_semantic_preservation += semantic_preservation
221
  total_latency += latency
222
 
 
 
 
 
 
 
 
 
 
223
  # Store example if requested
224
  if len(self.results["examples"]) < self.config["output"]["max_examples"]:
225
  self.results["examples"].append({
226
  "input": input_text,
227
  "expected": expected_output,
228
  "predicted": predicted_output,
 
229
  "metrics": {
230
  "pii_detection": pii_detection,
231
  "completeness": completeness,
 
245
  "total_requests": len(examples)
246
  }
247
 
 
 
 
 
 
 
 
 
 
 
248
  self.save_results()
249
 
250
  def save_results(self):
 
281
  | Semantic Preservation | {m.get('semantic_preservation', 0):.3f} | How well meaning is preserved |
282
  | Average Latency | {m.get('average_latency_ms', 0):.1f}ms | Response time performance |
283
 
284
+ ## Key Improvements
285
 
286
+ - **PII Detection**: Now measures if model generates ANY placeholders when PII is present in input
287
+ - **Unified Evaluation**: All examples evaluated together (no domain separation)
288
+ - **Lenient Scoring**: Focuses on detection capability rather than exact placeholder matching
289
 
290
+ """
 
 
 
 
291
 
292
  if self.config["output"]["include_examples"] and self.results["examples"]:
293
  summary += "## Example Results\n\n"
294
  for i, example in enumerate(self.results["examples"][:3]): # Show first 3 examples
295
+ summary += f"### Example {i+1}\n"
296
  summary += f"**Input:** {example['input'][:100]}...\n"
297
  summary += f"**Expected:** {example['expected'][:100]}...\n"
298
  summary += f"**Predicted:** {example['predicted'][:100]}...\n"