Upload run_benchmarks.py with huggingface_hub

Browse files

Files changed (1) hide show

run_benchmarks.py +40 -53

run_benchmarks.py CHANGED Viewed

@@ -52,15 +52,7 @@ class DeIdBenchmarkRunner:
         print(f"✅ Loaded {len(examples)} examples from {dataset_path}")
         return examples
-    def categorize_domain(self, text: str) -> str:
-        """Categorize text by domain based on keywords"""
-        text_lower = text.lower()
-        for domain, info in self.config["metrics"]["domain_performance"].items():
-            if any(keyword in text_lower for keyword in info["keywords"]):
-                return domain
-        return "general"
     def extract_placeholders(self, text: str) -> List[str]:
         """Extract all placeholder tags from text (e.g., [NAME_1], [DOB_1])"""
@@ -68,18 +60,41 @@ class DeIdBenchmarkRunner:
         pattern = r'\[([A-Z_]+_\d+)\]'
         return re.findall(pattern, text)
-    def calculate_pii_detection_rate(self, predicted: str, expected: str) -> float:
-        """Calculate how many expected PII elements were detected"""
-        expected_placeholders = set(self.extract_placeholders(expected))
-        if not expected_placeholders:
-            return 1.0  # No PII to detect
-        predicted_placeholders = set(self.extract_placeholders(predicted))
-        # Calculate overlap
-        detected = len(expected_placeholders.intersection(predicted_placeholders))
-        return detected / len(expected_placeholders)
     def calculate_completeness(self, predicted: str) -> bool:
         """Check if response appears to have no obvious PII remaining"""
@@ -177,8 +192,6 @@ class DeIdBenchmarkRunner:
         total_completeness = 0
         total_semantic_preservation = 0
         total_latency = 0
-        domain_counts = {}
-        domain_metrics = {}
         successful_requests = 0
@@ -190,10 +203,6 @@ class DeIdBenchmarkRunner:
             input_text = example[self.config["datasets"]["benchmark_dataset"]["input_field"]]
             expected_output = example[self.config["datasets"]["benchmark_dataset"]["expected_output_field"]]
-            # Categorize domain
-            domain = self.categorize_domain(input_text)
-            domain_counts[domain] = domain_counts.get(domain, 0) + 1
             # Call model
             predicted_output, latency = self.call_model(instruction, input_text)
@@ -201,7 +210,7 @@ class DeIdBenchmarkRunner:
                 successful_requests += 1
                 # Calculate metrics
-                pii_detection = self.calculate_pii_detection_rate(predicted_output, expected_output)
                 completeness = self.calculate_completeness(predicted_output)
                 semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
@@ -211,22 +220,12 @@ class DeIdBenchmarkRunner:
                 total_semantic_preservation += semantic_preservation
                 total_latency += latency
-                # Update domain metrics
-                if domain not in domain_metrics:
-                    domain_metrics[domain] = {"pii_detection": 0, "completeness": 0, "semantic": 0, "count": 0}
-                domain_metrics[domain]["pii_detection"] += pii_detection
-                domain_metrics[domain]["completeness"] += completeness
-                domain_metrics[domain]["semantic"] += semantic_preservation
-                domain_metrics[domain]["count"] += 1
                 # Store example if requested
                 if len(self.results["examples"]) < self.config["output"]["max_examples"]:
                     self.results["examples"].append({
                         "input": input_text,
                         "expected": expected_output,
                         "predicted": predicted_output,
-                        "domain": domain,
                         "metrics": {
                             "pii_detection": pii_detection,
                             "completeness": completeness,
@@ -246,16 +245,6 @@ class DeIdBenchmarkRunner:
                 "total_requests": len(examples)
             }
-            # Calculate domain performance
-            for domain, metrics in domain_metrics.items():
-                count = metrics["count"]
-                self.results["domain_performance"][domain] = {
-                    "sample_count": count,
-                    "pii_detection_rate": metrics["pii_detection"] / count,
-                    "completeness_score": metrics["completeness"] / count,
-                    "semantic_preservation": metrics["semantic"] / count
-                }
         self.save_results()
     def save_results(self):
@@ -292,20 +281,18 @@ class DeIdBenchmarkRunner:
 | Semantic Preservation | {m.get('semantic_preservation', 0):.3f} | How well meaning is preserved |
 | Average Latency | {m.get('average_latency_ms', 0):.1f}ms | Response time performance |
-## Domain Performance
-"""
-        for domain, metrics in self.results["domain_performance"].items():
-            summary += f"### {domain.title()} Domain ({metrics['sample_count']} samples)\n"
-            summary += f"- PII Detection: {metrics['pii_detection_rate']:.3f}\n"
-            summary += f"- Completeness: {metrics['completeness_score']:.3f}\n"
-            summary += f"- Semantic Preservation: {metrics['semantic_preservation']:.3f}\n\n"
         if self.config["output"]["include_examples"] and self.results["examples"]:
             summary += "## Example Results\n\n"
             for i, example in enumerate(self.results["examples"][:3]):  # Show first 3 examples
-                summary += f"### Example {i+1} ({example['domain']} domain)\n"
                 summary += f"**Input:** {example['input'][:100]}...\n"
                 summary += f"**Expected:** {example['expected'][:100]}...\n"
                 summary += f"**Predicted:** {example['predicted'][:100]}...\n"

         print(f"✅ Loaded {len(examples)} examples from {dataset_path}")
         return examples
+    # Removed domain categorization as requested
     def extract_placeholders(self, text: str) -> List[str]:
         """Extract all placeholder tags from text (e.g., [NAME_1], [DOB_1])"""
         pattern = r'\[([A-Z_]+_\d+)\]'
         return re.findall(pattern, text)
+    def calculate_pii_detection_rate(self, input_text: str, predicted: str) -> float:
+        """Calculate PII detection rate - if input has PII and output has placeholders, count as success"""
+        # Check if input contains any PII patterns
+        input_has_pii = self._input_contains_pii(input_text)
+        if not input_has_pii:
+            return 1.0  # No PII in input, so detection is perfect
+        # Check if output contains any placeholders at all
+        predicted_placeholders = self.extract_placeholders(predicted)
+        output_has_placeholders = len(predicted_placeholders) > 0
+        # If input has PII and output has placeholders, count as successful detection
+        return 1.0 if output_has_placeholders else 0.0
+    def _input_contains_pii(self, input_text: str) -> bool:
+        """Check if input text contains personal identifiable information"""
+        pii_patterns = [
+            r'\b\d{4}-\d{2}-\d{2}\b',  # Dates like 1985-03-15
+            r'\b\d{1,3}/\d{1,2}/\d{4}\b',  # Dates like 05/12/1980
+            r'\b\d{1,3}\s+[A-Z][a-z]+\s+(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard)\b',  # Addresses
+            r'\(\d{3}\)\s*\d{3}-\d{4}\b',  # Phone numbers like (555) 123-4567
+            r'\+?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{4}\b',  # International phone numbers
+            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Names (First Last)
+            r'\b[A-Z][a-z]+\s+[A-Z]\.\s*[A-Z][a-z]+\b',  # Names with middle initial
+            r'\b\d+@\w+\.\w+\b',  # Email addresses
+            r'\b[A-Z]{2,}\d+\b',  # IDs like EMP-001-XYZ
+            r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?\b',  # Monetary amounts like $85,000
+            r'\b\d{3}-\d{2}-\d{4}\b',  # SSN-like patterns
+            r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.\s+[A-Z][a-z]+\b',  # Titles with names
+            r'\b\d{5}(?:-\d{4})?\b',  # ZIP codes
+            r'\b[A-Z][a-z]+,\s+[A-Z]{2}\s+\d{5}\b',  # City, State ZIP
+        ]
+        return any(re.search(pattern, input_text) for pattern in pii_patterns)
     def calculate_completeness(self, predicted: str) -> bool:
         """Check if response appears to have no obvious PII remaining"""
         total_completeness = 0
         total_semantic_preservation = 0
         total_latency = 0
         successful_requests = 0
             input_text = example[self.config["datasets"]["benchmark_dataset"]["input_field"]]
             expected_output = example[self.config["datasets"]["benchmark_dataset"]["expected_output_field"]]
             # Call model
             predicted_output, latency = self.call_model(instruction, input_text)
                 successful_requests += 1
                 # Calculate metrics
+                pii_detection = self.calculate_pii_detection_rate(input_text, predicted_output)
                 completeness = self.calculate_completeness(predicted_output)
                 semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
                 total_semantic_preservation += semantic_preservation
                 total_latency += latency
                 # Store example if requested
                 if len(self.results["examples"]) < self.config["output"]["max_examples"]:
                     self.results["examples"].append({
                         "input": input_text,
                         "expected": expected_output,
                         "predicted": predicted_output,
                         "metrics": {
                             "pii_detection": pii_detection,
                             "completeness": completeness,
                 "total_requests": len(examples)
             }
         self.save_results()
     def save_results(self):
 | Semantic Preservation | {m.get('semantic_preservation', 0):.3f} | How well meaning is preserved |
 | Average Latency | {m.get('average_latency_ms', 0):.1f}ms | Response time performance |
+## Key Improvements
+- **PII Detection**: Now measures if model generates ANY placeholders when PII is present in input
+- **Unified Evaluation**: All examples evaluated together (no domain separation)
+- **Lenient Scoring**: Focuses on detection capability rather than exact placeholder matching
+"""
         if self.config["output"]["include_examples"] and self.results["examples"]:
             summary += "## Example Results\n\n"
             for i, example in enumerate(self.results["examples"][:3]):  # Show first 3 examples
+                summary += f"### Example {i+1}\n"
                 summary += f"**Input:** {example['input'][:100]}...\n"
                 summary += f"**Expected:** {example['expected'][:100]}...\n"
                 summary += f"**Predicted:** {example['predicted'][:100]}...\n"