Upload run_benchmarks.py with huggingface_hub
run_benchmarks.py  CHANGED  (+44 -27)

@@ -114,38 +114,55 @@ class DeIdBenchmarkRunner:
 
         return True
 
-    def calculate_semantic_preservation(self, predicted: str, expected: str) -> float:
-        """Calculate semantic preservation
-            types[ptype] = types.get(ptype, 0) + 1
-        return types
-            exp_count = expected_types.get(ptype, 0)
-            if exp_count > 0:
-                similarity += min(pred_count, exp_count) / exp_count
-        return
+    def calculate_semantic_preservation(self, input_text: str, predicted: str, expected: str) -> float:
+        """Calculate semantic preservation - how well the meaning is preserved after de-identification"""
+        # For de-identification, semantic preservation should focus on:
+        # 1. Whether the core message/content is maintained
+        # 2. Whether the text structure remains coherent
+        # 3. Whether placeholder density is reasonable
+
+        # Simple approach: compare text length and placeholder density
+        input_words = len(input_text.split())
+        expected_words = len(expected.split())
+        predicted_words = len(predicted.split())
+
+        # Length preservation (closer to 1.0 is better)
+        if expected_words == 0:
+            length_preservation = 1.0
+        else:
+            length_ratio = predicted_words / expected_words
+            # Penalize if too different in length (ideal ratio around 0.8-1.2)
+            if 0.5 <= length_ratio <= 2.0:
+                length_preservation = 1.0 - abs(1.0 - length_ratio) * 0.5
+            else:
+                length_preservation = 0.1  # Heavily penalize extreme length differences
+
+        # Placeholder density (should be reasonable, not too sparse or dense)
+        pred_placeholders = self.extract_placeholders(predicted)
+        placeholder_ratio = len(pred_placeholders) / max(predicted_words, 1)
+
+        if 0.05 <= placeholder_ratio <= 0.3:  # Reasonable placeholder density
+            density_score = 1.0
+        elif placeholder_ratio < 0.05:  # Too few placeholders
+            density_score = placeholder_ratio / 0.05
+        else:  # Too many placeholders
+            density_score = max(0.1, 1.0 - (placeholder_ratio - 0.3) * 2)
+
+        # Structure preservation (check if basic sentence structure is maintained)
+        # Simple check: count punctuation marks as proxy for structure
+        input_punct = len(re.findall(r'[.!?]', input_text))
+        predicted_punct = len(re.findall(r'[.!?]', predicted))
+
+        if input_punct == 0:
+            structure_score = 1.0
+        else:
+            structure_ratio = min(predicted_punct, input_punct * 1.5) / input_punct
+            structure_score = min(1.0, structure_ratio)
+
+        # Combine scores (weighted average)
+        final_score = (length_preservation * 0.4) + (density_score * 0.4) + (structure_score * 0.2)
+
+        return max(0.0, min(1.0, final_score))  # Clamp to [0,1]
 
     def call_model(self, instruction: str, input_text: str) -> Tuple[str, float]:
         """Call the de-identification model and measure latency"""
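To see roughly what the new metric rewards, here is a minimal standalone sketch of the same three-part heuristic (length preservation, placeholder density, sentence structure). It assumes placeholders follow a [TYPE] convention such as [NAME] or [DATE] and re-implements extract_placeholders as a simple regex, since that helper lives elsewhere in run_benchmarks.py; treat both as assumptions rather than the file's actual behavior.

```python
import re

# Assumption: placeholders look like [NAME], [DATE], [PHONE], etc.
PLACEHOLDER_RE = re.compile(r"\[[A-Z_]+\]")

def semantic_preservation_sketch(input_text: str, predicted: str, expected: str) -> float:
    """Standalone mirror of the three-part heuristic: length, placeholder density, structure."""
    expected_words = len(expected.split())
    predicted_words = len(predicted.split())

    # Length preservation: predicted vs. expected word counts.
    if expected_words == 0:
        length_preservation = 1.0
    else:
        length_ratio = predicted_words / expected_words
        if 0.5 <= length_ratio <= 2.0:
            length_preservation = 1.0 - abs(1.0 - length_ratio) * 0.5
        else:
            length_preservation = 0.1

    # Placeholder density: share of tokens that are placeholders.
    placeholder_ratio = len(PLACEHOLDER_RE.findall(predicted)) / max(predicted_words, 1)
    if 0.05 <= placeholder_ratio <= 0.3:
        density_score = 1.0
    elif placeholder_ratio < 0.05:
        density_score = placeholder_ratio / 0.05
    else:
        density_score = max(0.1, 1.0 - (placeholder_ratio - 0.3) * 2)

    # Structure: sentence-ending punctuation in the prediction vs. the original input.
    input_punct = len(re.findall(r"[.!?]", input_text))
    predicted_punct = len(re.findall(r"[.!?]", predicted))
    if input_punct == 0:
        structure_score = 1.0
    else:
        structure_score = min(1.0, min(predicted_punct, input_punct * 1.5) / input_punct)

    score = length_preservation * 0.4 + density_score * 0.4 + structure_score * 0.2
    return max(0.0, min(1.0, score))

if __name__ == "__main__":
    original = "John Smith visited Dr. Lee on 4 May. Call him at 555-0199."
    redacted = "[NAME] visited Dr. [NAME] on [DATE]. Call him at [PHONE]."
    print(semantic_preservation_sketch(original, redacted, redacted))  # ~0.92
```

For this toy pair the score lands near 0.92: length and punctuation structure are fully preserved, while the 0.4 placeholder density exceeds the 0.05-0.3 band and trims the density term to 0.8.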

@@ -212,7 +229,7 @@ class DeIdBenchmarkRunner:
         # Calculate metrics
         pii_detection = self.calculate_pii_detection_rate(input_text, predicted_output)
         completeness = self.calculate_completeness(predicted_output)
-        semantic_preservation = self.calculate_semantic_preservation(predicted_output, expected_output)
+        semantic_preservation = self.calculate_semantic_preservation(input_text, predicted_output, expected_output)
 
         # Update totals
         total_pii_detection += pii_detection
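The second hunk only threads the original input_text into the new signature at the call site; the surrounding loop, which is not fully visible in the diff, adds each example's scores to running totals. A small illustrative roll-up of such per-example scores into benchmark-level averages, with made-up values and names, might look like:

```python
# Illustrative only: the score values and variable names below are invented for
# the example and are not taken from run_benchmarks.py.
per_example = [
    {"pii_detection": 1.0, "completeness": 1.0, "semantic_preservation": 0.92},
    {"pii_detection": 0.8, "completeness": 1.0, "semantic_preservation": 0.64},
    {"pii_detection": 1.0, "completeness": 0.5, "semantic_preservation": 0.88},
]

n = len(per_example)
summary = {
    metric: sum(example[metric] for example in per_example) / n
    for metric in ("pii_detection", "completeness", "semantic_preservation")
}
print(summary)  # averages across examples, e.g. semantic_preservation ~0.81
```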