File size: 1,770 Bytes
d16cb83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Connection and generation settings for the model being benchmarked.
model:
  # Loopback HTTP address (port 8000).
  base_url: 'http://127.0.0.1:8000'
  # NOTE(review): presumably the per-response generation cap - confirm.
  max_tokens: 256
  temperature: 0.1
  # NOTE(review): unit is not stated in this file (presumably seconds) -
  # confirm against the code that reads it.
  timeout: 30

# Input data for the benchmark run and the record fields to read from it.
datasets:
  benchmark_dataset:
    file_path: 'Personal_De-identifier_Benchmark_SFT.jsonl'
    # Only the first 100 examples are used, to keep benchmarking quick.
    sample_size: 100
    # Names of the fields holding the instruction, the input text and the
    # expected output in each record.
    instruction_field: 'instruction'
    input_field: 'input'
    expected_output_field: 'response'

# Metric definitions. The first four entries share one shape
# (name / description / type); domain_performance is shaped differently.
metrics:
  # Primary metrics for HuggingFace
  pii_detection:
    name: 'PII Detection Rate'
    description: 'Percentage of personal identifiers correctly identified and masked'
    type: 'accuracy'

  completeness:
    name: 'Completeness Score'
    description: 'Percentage of texts where all PII was successfully removed'
    type: 'binary_accuracy'

  semantic_preservation:
    name: 'Semantic Preservation'
    description: 'How well the original meaning is preserved (placeholder-based similarity)'
    type: 'similarity'

  latency:
    name: 'Average Latency'
    description: 'Average response time in milliseconds'
    type: 'latency'

  # Per-domain breakdown: each domain has a display name and a keyword list.
  # NOTE(review): unlike its siblings, this entry has no description/type and
  # nests one level deeper - confirm the consumer does not treat every child
  # of `metrics` uniformly.
  # NOTE(review): presumably the keywords assign examples to a domain; short
  # entries such as "hr" could over-match if compared as substrings - verify.
  domain_performance:
    medical:
      name: 'Medical Records'
      keywords:
        - 'patient'
        - 'doctor'
        - 'hospital'
        - 'medical'
        - 'diagnosis'
    legal:
      name: 'Legal Documents'
      keywords:
        - 'deponent'
        - 'attorney'
        - 'case'
        - 'court'
        - 'legal'
    hr:
      name: 'HR Records'
      keywords:
        - 'employee'
        - 'salary'
        - 'hr'
        - 'personnel'
        - 'recruitment'
    customer_service:
      name: 'Customer Service'
      keywords:
        - 'customer'
        - 'complaint'
        - 'service'
        - 'support'
        - 'inquiry'
    research:
      name: 'Research Data'
      keywords:
        - 'participant'
        - 'study'
        - 'research'
        - 'consent'
        - 'ethics'

# Where benchmark results are written and how many examples accompany them.
output:
  # Plain-text results file.
  results_file: 'benchmarks.txt'
  # JSON file holding the detailed results.
  detailed_results_file: 'benchmark_results.json'
  # Examples are included in the output, limited to max_examples.
  # NOTE(review): which of the two files receives the examples is not
  # visible here - confirm in the consumer.
  include_examples: true
  max_examples: 10