vumichien committed
Commit 16622d0 · 1 Parent(s): 7547e8e

Refactor validation script to improve file comparison functionality, rename class for clarity, and update documentation. Add file_comparison_report.txt to .gitignore to prevent accidental commits.

Files changed (3)
  1. .gitignore +1 -0
  2. routes/predict.py +5 -7
  3. validate_optimization.py +91 -152
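
Based on the new main() in validate_optimization.py below, the refactored workflow reduces to constructing the renamed comparator and pointing it at two CSV files. A minimal usage sketch (the paths are the ones hard-coded in main(); the import assumes the script is importable from the repository root):

from validate_optimization import FileComparator

# Compare the reference output against a second export and write a text report.
comparator = FileComparator("data/outputData_original.csv")
is_match = comparator.compare_two_files("data/outputData_api_v2.csv", "file_comparison_report.txt")
print("Perfect match" if is_match else "Differences found - see file_comparison_report.txt")
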
.gitignore CHANGED
@@ -35,3 +35,4 @@ outputs/*.csv
 *.model
 *.bin
 *.safetensors
+file_comparison_report.txt
routes/predict.py CHANGED
@@ -289,18 +289,16 @@ async def predict(

    # Map output columns to match Excel structure
    # 出力_中科目 mapping - use the standard sub-subject from sub-subject mapper
-    if "出力_基準中科目" in df_output_data.columns:
+    if "出力_中科目" in df_output_data.columns:
+        df_output_data["出力_中科目"] = df_output_data["出力_中科目"]
+    elif "出力_基準中科目" in df_output_data.columns:
        df_output_data["出力_中科目"] = df_output_data["出力_基準中科目"]
    elif "標準中科目" in df_output_data.columns:
        df_output_data["出力_中科目"] = df_output_data["標準中科目"]

    # 出力_項目名 mapping - use the final item name from name and abstract mapper
-    if (
-        "出力_項目名" in df_output_data.columns
-        and not df_output_data["出力_項目名"].isna().all()
-    ):
-        # Keep existing 出力_項目名 if it exists and has values
-        pass
+    if "出力_項目名" in df_output_data.columns:
+        df_output_data["出力_項目名"] = df_output_data["出力_項目名"]
    elif "出力_標準名称" in df_output_data.columns:
        df_output_data["出力_項目名"] = df_output_data["出力_標準名称"]
    elif "出力_基準名称" in df_output_data.columns:
validate_optimization.py CHANGED
@@ -1,26 +1,23 @@
#!/usr/bin/env python3
"""
-Validation script to compare optimized vs original mapper output
+Validation script to compare two CSV files
Compares the following columns: 出力_科目, 出力_中科目, 出力_標準名称, 出力_項目名, 出力_標準単位
"""

import pandas as pd
import numpy as np
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Optional, Any
import os
-import sys
from datetime import datetime

-# Add the meisai-check-ai directory to Python path
-sys.path.append(os.path.join(os.path.dirname(__file__), 'meisai-check-ai'))

-class OptimizationValidator:
+class FileComparator:
    def __init__(self, original_file_path: str):
        """
-        Initialize validator with original output file
-
+        Initialize comparator with original output file
+
        Args:
-            original_file_path: Path to outputData_original.csv
+            original_file_path: Path to original CSV file
        """
        self.original_file_path = original_file_path
        self.comparison_columns = [
@@ -30,7 +27,7 @@ class OptimizationValidator:
            '出力_項目名',
            '出力_標準単位'
        ]
-
+
    def load_original_data(self) -> pd.DataFrame:
        """Load original output data"""
        try:
@@ -40,21 +37,23 @@ class OptimizationValidator:
        except Exception as e:
            print(f"✗ Error loading original data: {e}")
            raise
-
-    def compare_dataframes(self, df_original: pd.DataFrame, df_optimized: pd.DataFrame) -> Dict:
+
+    def compare_dataframes(
+        self, df_original: pd.DataFrame, df_optimized: pd.DataFrame
+    ) -> Dict[str, Any]:
        """
        Compare original vs optimized dataframes

        Returns:
            Dict with comparison results
        """
-        results = {
-            'total_rows': len(df_original),
-            'columns_compared': self.comparison_columns,
-            'differences': {},
-            'summary': {}
+        results: Dict[str, Any] = {
+            "total_rows": len(df_original),
+            "columns_compared": self.comparison_columns,
+            "differences": {},
+            "summary": {},
        }
-
+
        # Check if dataframes have same length
        if len(df_original) != len(df_optimized):
            results['length_mismatch'] = {
@@ -62,53 +61,57 @@ class OptimizationValidator:
                'optimized': len(df_optimized)
            }
            print(f"⚠ Warning: Different number of rows - Original: {len(df_original)}, Optimized: {len(df_optimized)}")
-
+
        # Compare each column
        for col in self.comparison_columns:
            if col not in df_original.columns:
                results['differences'][col] = f"Column not found in original data"
                continue
-
+
            if col not in df_optimized.columns:
                results['differences'][col] = f"Column not found in optimized data"
                continue
-
+
            # Fill NaN values with empty string for comparison
            original_values = df_original[col].fillna('')
            optimized_values = df_optimized[col].fillna('')
-
+
            # Compare values
            differences = original_values != optimized_values
            diff_count = differences.sum()
-
+
            results['differences'][col] = {
                'total_differences': int(diff_count),
                'accuracy_percentage': round((1 - diff_count / len(df_original)) * 100, 2),
                'different_indices': differences[differences].index.tolist()[:10]  # Show first 10 different indices
            }
-
+
            if diff_count > 0:
                print(f"⚠ {col}: {diff_count} differences ({results['differences'][col]['accuracy_percentage']}% accuracy)")
            else:
                print(f"✓ {col}: Perfect match (100% accuracy)")
-
+
        # Overall summary
        total_differences = sum([results['differences'][col]['total_differences']
                                 for col in self.comparison_columns
                                 if isinstance(results['differences'][col], dict)])
-
+
        overall_accuracy = round((1 - total_differences / (len(df_original) * len(self.comparison_columns))) * 100, 2)
-
+
        results['summary'] = {
            'total_differences': total_differences,
            'overall_accuracy': overall_accuracy,
            'perfect_match': total_differences == 0
        }
-
+
        return results
-
-    def generate_difference_report(self, df_original: pd.DataFrame, df_optimized: pd.DataFrame,
-                                   output_file: str = None) -> str:
+
+    def generate_difference_report(
+        self,
+        df_original: pd.DataFrame,
+        df_optimized: pd.DataFrame,
+        output_file: Optional[str] = None,
+    ) -> str:
        """
        Generate detailed difference report

@@ -122,32 +125,32 @@ class OptimizationValidator:
        """
        report_lines = []
        report_lines.append("=" * 80)
-        report_lines.append(f"OPTIMIZATION VALIDATION REPORT")
+        report_lines.append(f"FILE COMPARISON REPORT")
        report_lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report_lines.append("=" * 80)
-
+
        # Basic info
        report_lines.append(f"Original data rows: {len(df_original)}")
-        report_lines.append(f"Optimized data rows: {len(df_optimized)}")
+        report_lines.append(f"Compared data rows: {len(df_optimized)}")
        report_lines.append(f"Columns compared: {', '.join(self.comparison_columns)}")
        report_lines.append("")
-
+
        # Compare each column
        for col in self.comparison_columns:
            if col not in df_original.columns or col not in df_optimized.columns:
                report_lines.append(f"❌ {col}: Column missing")
                continue
-
+
            original_values = df_original[col].fillna('')
            optimized_values = df_optimized[col].fillna('')
-
+
            differences = original_values != optimized_values
            diff_count = differences.sum()
            accuracy = round((1 - diff_count / len(df_original)) * 100, 2)
-
+
            status = "✅" if diff_count == 0 else "⚠️"
            report_lines.append(f"{status} {col}: {diff_count} differences ({accuracy}% accuracy)")
-
+
            if diff_count > 0:
                # Show some examples of differences
                diff_indices = differences[differences].index[:5]
@@ -157,7 +160,7 @@ class OptimizationValidator:
                    opt_val = str(optimized_values.iloc[idx])[:50]
                    report_lines.append(f" Row {idx}: '{orig_val}' → '{opt_val}'")
                report_lines.append("")
-
+
        # Overall summary
        total_comparisons = len(df_original) * len(self.comparison_columns)
        total_differences = sum([
@@ -165,160 +168,96 @@ class OptimizationValidator:
            for col in self.comparison_columns
            if col in df_original.columns and col in df_optimized.columns
        ])
-
+
        overall_accuracy = round((1 - total_differences / total_comparisons) * 100, 2)
-
+
        report_lines.append("=" * 80)
        report_lines.append(f"OVERALL RESULTS:")
        report_lines.append(f"Total differences: {total_differences}")
        report_lines.append(f"Overall accuracy: {overall_accuracy}%")
        report_lines.append(f"Perfect match: {'Yes' if total_differences == 0 else 'No'}")
        report_lines.append("=" * 80)
-
+
        report_text = "\n".join(report_lines)
-
+
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"📄 Report saved to: {output_file}")
-
+
        return report_text
-
-    def validate_optimization(self, optimized_mapper_function, input_data: pd.DataFrame,
-                              report_file: str = None) -> bool:
-        """
-        Run full validation process
-
-        Args:
-            optimized_mapper_function: Function that takes input_data and returns optimized output
-            input_data: Input dataframe to process
-            report_file: Optional report file path
-
-        Returns:
-            True if validation passes (100% accuracy)
-        """
-        print("🔍 Starting optimization validation...")
-
-        # Load original data
-        df_original = self.load_original_data()
-
-        # Run optimized mapper
-        print("🚀 Running optimized mapper...")
-        try:
-            df_optimized = optimized_mapper_function(input_data)
-            print(f"✓ Optimized processing completed: {len(df_optimized)} rows")
-        except Exception as e:
-            print(f"✗ Error in optimized processing: {e}")
-            return False
-
-        # Compare results
-        print("📊 Comparing results...")
-        results = self.compare_dataframes(df_original, df_optimized)
-
-        # Generate report
-        if report_file:
-            self.generate_difference_report(df_original, df_optimized, report_file)
-
-        # Print summary
-        print("\n" + "="*50)
-        print("🎯 VALIDATION SUMMARY")
-        print("="*50)
-        print(f"Overall accuracy: {results['summary']['overall_accuracy']}%")
-        print(f"Perfect match: {'Yes' if results['summary']['perfect_match'] else 'No'}")
-        print(f"Total differences: {results['summary']['total_differences']}")
-
-        return results['summary']['perfect_match']

-    def compare_two_files(self, optimized_file_path: str, report_file: str = None) -> bool:
+    def compare_two_files(
+        self, second_file_path: str, report_file: Optional[str] = None
+    ) -> bool:
        """
        Compare two CSV files directly
-
+
        Args:
-            optimized_file_path: Path to optimized output CSV
+            second_file_path: Path to second CSV file to compare
            report_file: Optional report file path
-
+
        Returns:
-            True if validation passes (100% accuracy)
+            True if files match perfectly (100% accuracy)
        """
-        print("🔍 Starting file comparison validation...")
-
+        print("🔍 Starting file comparison...")
+
        # Load original data
        df_original = self.load_original_data()
-
-        # Load optimized data
+
+        # Load second file
        try:
-            df_optimized = pd.read_csv(optimized_file_path)
-            print(f"✓ Loaded optimized data: {len(df_optimized)} rows")
+            df_second = pd.read_csv(second_file_path)
+            print(f"✓ Loaded second file: {len(df_second)} rows")
        except Exception as e:
-            print(f"✗ Error loading optimized data: {e}")
+            print(f"✗ Error loading second file: {e}")
            return False
-
+
        # Compare results
        print("📊 Comparing results...")
-        results = self.compare_dataframes(df_original, df_optimized)
-
+        results = self.compare_dataframes(df_original, df_second)
+
        # Generate report
        if report_file:
-            self.generate_difference_report(df_original, df_optimized, report_file)
-
+            self.generate_difference_report(df_original, df_second, report_file)
+
        # Print summary
        print("\n" + "="*50)
-        print("🎯 VALIDATION SUMMARY")
+        print("🎯 COMPARISON SUMMARY")
        print("="*50)
        print(f"Overall accuracy: {results['summary']['overall_accuracy']}%")
        print(f"Perfect match: {'Yes' if results['summary']['perfect_match'] else 'No'}")
        print(f"Total differences: {results['summary']['total_differences']}")
-
+
        return results['summary']['perfect_match']

+
def main():
-    """Example usage"""
-    # Example paths - update these according to your setup
+    """Main function to compare two files"""
+    # File paths
    original_file = "data/outputData_original.csv"
-    input_file = "data/outputData_api.csv"
-
+    second_file = "data/outputData_api_v2.csv"
+
    if not os.path.exists(original_file):
        print(f"❌ Original file not found: {original_file}")
-        print("Please ensure outputData_original.csv exists in the current directory")
+        print("Please ensure the original file exists")
        return
-
-    # Initialize validator
-    validator = OptimizationValidator(original_file)
-
-    # Example of how to use with your mapper
-    def example_optimized_mapper(input_data):
-        # This is where you would call your optimized mapper
-        # For now, return a copy of input_data as example
-        df_result = input_data.copy()
-
-        # Add expected output columns with dummy data for demo
-        df_result['出力_科目'] = df_result.get('科目', '')
-        df_result['出力_中科目'] = df_result.get('中科目', '')
-        df_result['出力_標準名称'] = df_result.get('名称', '')
-        df_result['出力_項目名'] = df_result.get('名称', '')
-        df_result['出力_標準単位'] = df_result.get('単位', '')
-
-        return df_result
-
-    # Load input data
-    if os.path.exists(input_file):
-        input_data = pd.read_csv(input_file)
-
-        # Run validation
-        is_valid = validator.validate_optimization(
-            example_optimized_mapper,
-            input_data,
-            "optimization_validation_report.txt"
-        )
-
-        if is_valid:
-            print("🎉 Validation PASSED! Optimization maintains accuracy.")
-        else:
-            print("❌ Validation FAILED! Check the report for details.")
+
+    if not os.path.exists(second_file):
+        print(f"❌ Second file not found: {second_file}")
+        print("Please ensure the second file exists")
+        return
+
+    # Initialize comparator
+    comparator = FileComparator(original_file)
+
+    # Compare files
+    is_match = comparator.compare_two_files(second_file, "file_comparison_report.txt")
+
+    if is_match:
+        print("🎉 Files MATCH perfectly!")
    else:
-        print(f"❌ Input file not found: {input_file}")
-        print("You can also compare two CSV files directly:")
-        print("validator.compare_two_files('optimized_output.csv', 'report.txt')")
+        print("❌ Files have differences. Check the report for details.")

if __name__ == "__main__":
-    main()
+    main()
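
Besides the file-based entry point, compare_dataframes() can also be called directly on dataframes that are already in memory, which is convenient in tests. A minimal sketch (toy dataframes; the constructor path is only read by load_original_data(), which this sketch never calls):

import pandas as pd
from validate_optimization import FileComparator

# Two toy frames sharing two of the compared columns; compared columns that are
# absent are simply recorded as "Column not found" in the results.
df_a = pd.DataFrame({"出力_科目": ["A", "B"], "出力_標準単位": ["m", "m"]})
df_b = pd.DataFrame({"出力_科目": ["A", "C"], "出力_標準単位": ["m", "m"]})

comparator = FileComparator("data/outputData_original.csv")  # path unused by compare_dataframes
results = comparator.compare_dataframes(df_a, df_b)

# summary carries total_differences, overall_accuracy and perfect_match
print(results["summary"])
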