meisaicheck-api / validate_optimization.py
Vu Minh Chien
change predict rule
06d9f7d
#!/usr/bin/env python3
"""
Validation script to compare two CSV files
Compares the following columns: 出力_科目, 出力_中科目, 出力_標準名称, 出力_項目名, 出力_集計用単位
"""
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional, Any
import os
from datetime import datetime
class FileComparator:
    """Compare a fixed set of output columns between two CSV files.

    The "original" file path is fixed at construction time; the file to compare
    against is supplied per call. Cells are compared one by one, with NaN
    treated as an empty string on both sides.
    """

    def __init__(self, original_file_path: str):
        """
        Initialize comparator with original output file

        Args:
            original_file_path: Path to original CSV file
        """
        self.original_file_path = original_file_path
        # NOTE(review): these literals look like mis-decoded UTF-8 in this copy
        # of the file; they presumably stand for 出力_科目, 出力_中科目,
        # 出力_標準名称, 出力_項目名 and 出力_集計用単位. Confirm against the real
        # CSV header; the strings are deliberately left as found.
        self.comparison_columns = [
            'ๅ‡บๅŠ›_็ง‘็›ฎ',
            'ๅ‡บๅŠ›_ไธญ็ง‘็›ฎ',
            'ๅ‡บๅŠ›_ๆจ™ๆบ–ๅ็งฐ',
            'ๅ‡บๅŠ›_้ …็›ฎๅ',
            'ๅ‡บๅŠ›_้›†่จˆ็”จๅ˜ไฝ'
        ]

    def load_original_data(self) -> pd.DataFrame:
        """Load the original CSV file.

        Returns:
            The original file parsed with ``pd.read_csv``.

        Raises:
            Exception: Whatever ``pd.read_csv`` raises; the error is printed
                and then re-raised unchanged.
        """
        try:
            df_original = pd.read_csv(self.original_file_path)
            print(f"โœ“ Loaded original data: {len(df_original)} rows")
            return df_original
        except Exception as e:
            print(f"โœ— Error loading original data: {e}")
            raise

    @staticmethod
    def _match_percentage(diff_count: int, cell_count: int) -> float:
        """Return the share of matching cells as a percentage (2 decimals)."""
        if cell_count == 0:
            # Nothing was compared (empty frames): avoid dividing by zero.
            return 100.0
        return round((1 - diff_count / cell_count) * 100, 2)

    @staticmethod
    def _compare_column(
        df_original: pd.DataFrame, df_optimized: pd.DataFrame, col: str
    ) -> Tuple[pd.Series, pd.Series, pd.Series]:
        """Return (original values, optimized values, mismatch mask) for ``col``.

        Both value Series have NaN replaced by ''. When the two frames do not
        share an identical index (e.g. different row counts), the columns are
        aligned by row position and the shorter one is padded with empty
        values, so unmatched rows show up as differences instead of making the
        element-wise comparison raise ``ValueError``.
        """
        original_values = df_original[col]
        optimized_values = df_optimized[col]
        if not original_values.index.equals(optimized_values.index):
            positions = pd.RangeIndex(max(len(original_values), len(optimized_values)))
            original_values = original_values.reset_index(drop=True).reindex(positions)
            optimized_values = optimized_values.reset_index(drop=True).reindex(positions)
        # Fill NaN values with empty string for comparison
        original_values = original_values.fillna('')
        optimized_values = optimized_values.fillna('')
        return original_values, optimized_values, original_values != optimized_values

    def compare_dataframes(
        self, df_original: pd.DataFrame, df_optimized: pd.DataFrame
    ) -> Dict[str, Any]:
        """
        Compare original vs optimized dataframes

        Args:
            df_original: Original dataframe
            df_optimized: Dataframe to compare against the original

        Returns:
            Dict with comparison results:
            - "total_rows": row count of ``df_original``
            - "columns_compared": the configured column list
            - "length_mismatch": present only when the row counts differ
            - "differences": per column, either a "Column not found ..."
              message or a dict with 'total_differences',
              'accuracy_percentage' and 'different_indices' (first 10)
            - "summary": 'total_differences', 'overall_accuracy',
              'perfect_match' and 'missing_columns'. 'perfect_match' is only
              True when every configured column was compared, the row counts
              are equal and no cell differs.
        """
        results: Dict[str, Any] = {
            "total_rows": len(df_original),
            "columns_compared": self.comparison_columns,
            "differences": {},
            "summary": {},
        }

        # Check if dataframes have same length
        length_mismatch = len(df_original) != len(df_optimized)
        if length_mismatch:
            results['length_mismatch'] = {
                'original': len(df_original),
                'optimized': len(df_optimized)
            }
            print(f"โš  Warning: Different number of rows - Original: {len(df_original)}, Optimized: {len(df_optimized)}")

        # Rows present in only one frame are compared against empty values,
        # so the longer frame defines how many rows are checked.
        row_count = max(len(df_original), len(df_optimized))
        total_differences = 0
        missing_columns: List[str] = []

        # Compare each column
        for col in self.comparison_columns:
            if col not in df_original.columns:
                results['differences'][col] = "Column not found in original data"
                missing_columns.append(col)
                continue
            if col not in df_optimized.columns:
                results['differences'][col] = "Column not found in optimized data"
                missing_columns.append(col)
                continue

            _, _, mismatches = self._compare_column(df_original, df_optimized, col)
            diff_count = int(mismatches.sum())
            total_differences += diff_count
            accuracy = self._match_percentage(diff_count, len(mismatches))
            results['differences'][col] = {
                'total_differences': diff_count,
                'accuracy_percentage': accuracy,
                # Show first 10 different indices
                'different_indices': mismatches[mismatches].index.tolist()[:10],
            }

            if diff_count > 0:
                print(f"โš  {col}: {diff_count} differences ({accuracy}% accuracy)")
            else:
                print(f"โœ“ {col}: Perfect match (100% accuracy)")

        # Overall summary
        overall_accuracy = self._match_percentage(
            total_differences, row_count * len(self.comparison_columns)
        )
        results['summary'] = {
            'total_differences': total_differences,
            'overall_accuracy': overall_accuracy,
            # A missing column or a row-count mismatch can never be a match,
            # even if the cells that could be compared are all equal.
            'perfect_match': (
                total_differences == 0
                and not missing_columns
                and not length_mismatch
            ),
            'missing_columns': missing_columns,
        }
        return results

    def generate_difference_report(
        self,
        df_original: pd.DataFrame,
        df_optimized: pd.DataFrame,
        output_file: Optional[str] = None,
    ) -> str:
        """
        Generate detailed difference report

        Args:
            df_original: Original dataframe
            df_optimized: Optimized dataframe
            output_file: Optional output file path; when given, the report is
                also written there as UTF-8 text

        Returns:
            Report string
        """
        report_lines = [
            "=" * 80,
            "FILE COMPARISON REPORT",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "=" * 80,
            # Basic info
            f"Original data rows: {len(df_original)}",
            f"Compared data rows: {len(df_optimized)}",
            f"Columns compared: {', '.join(self.comparison_columns)}",
            "",
        ]

        row_count = max(len(df_original), len(df_optimized))
        total_differences = 0
        has_missing_column = False

        # Compare each column
        for col in self.comparison_columns:
            if col not in df_original.columns or col not in df_optimized.columns:
                report_lines.append(f"โŒ {col}: Column missing")
                has_missing_column = True
                continue

            original_values, optimized_values, mismatches = self._compare_column(
                df_original, df_optimized, col
            )
            diff_count = int(mismatches.sum())
            total_differences += diff_count
            accuracy = self._match_percentage(diff_count, len(mismatches))
            status = "โœ…" if diff_count == 0 else "โš ๏ธ"
            report_lines.append(f"{status} {col}: {diff_count} differences ({accuracy}% accuracy)")

            if diff_count > 0:
                # Show some examples of differences. Values are fetched by
                # position so a non-default index cannot select the wrong row;
                # the index label is what gets printed.
                report_lines.append(" Sample differences (first 5):")
                for pos in np.flatnonzero(mismatches.to_numpy())[:5]:
                    orig_val = str(original_values.iloc[pos])[:50]
                    opt_val = str(optimized_values.iloc[pos])[:50]
                    report_lines.append(f" Row {mismatches.index[pos]}: '{orig_val}' โ†’ '{opt_val}'")
                report_lines.append("")

        # Overall summary
        overall_accuracy = self._match_percentage(
            total_differences, row_count * len(self.comparison_columns)
        )
        is_perfect = (
            total_differences == 0
            and not has_missing_column
            and len(df_original) == len(df_optimized)
        )
        report_lines.append("=" * 80)
        report_lines.append("OVERALL RESULTS:")
        report_lines.append(f"Total differences: {total_differences}")
        report_lines.append(f"Overall accuracy: {overall_accuracy}%")
        report_lines.append(f"Perfect match: {'Yes' if is_perfect else 'No'}")
        report_lines.append("=" * 80)

        report_text = "\n".join(report_lines)
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"๐Ÿ“„ Report saved to: {output_file}")
        return report_text

    def compare_two_files(
        self, second_file_path: str, report_file: Optional[str] = None
    ) -> bool:
        """
        Compare two CSV files directly

        Args:
            second_file_path: Path to second CSV file to compare
            report_file: Optional report file path

        Returns:
            True if files match perfectly (100% accuracy). False when they
            differ or when the second file cannot be loaded.

        Raises:
            Exception: Propagated from ``load_original_data`` when the
                original file cannot be read.
        """
        print("๐Ÿ” Starting file comparison...")

        # Load original data
        df_original = self.load_original_data()

        # Load second file; a failure here is reported as "no match".
        try:
            df_second = pd.read_csv(second_file_path)
            print(f"โœ“ Loaded second file: {len(df_second)} rows")
        except Exception as e:
            print(f"โœ— Error loading second file: {e}")
            return False

        # Compare results
        print("๐Ÿ“Š Comparing results...")
        results = self.compare_dataframes(df_original, df_second)

        # Generate report
        if report_file:
            self.generate_difference_report(df_original, df_second, report_file)

        # Print summary
        summary = results['summary']
        print("\n" + "="*50)
        print("๐ŸŽฏ COMPARISON SUMMARY")
        print("="*50)
        print(f"Overall accuracy: {summary['overall_accuracy']}%")
        print(f"Perfect match: {'Yes' if summary['perfect_match'] else 'No'}")
        print(f"Total differences: {summary['total_differences']}")
        return summary['perfect_match']
def main():
    """Compare the two hard-coded CSV files and print the outcome.

    Prints a message and returns early when either input file is missing;
    otherwise runs the comparison and writes "file_comparison_report.txt".
    """
    original_file = "data/outputData_original.csv"
    second_file = "data/outputData_api.csv"

    # Both inputs must exist before any comparison is attempted; the original
    # file is checked first and the first missing file ends the run.
    for label, path in (("Original", original_file), ("Second", second_file)):
        if os.path.exists(path):
            continue
        print(f"โŒ {label} file not found: {path}")
        print(f"Please ensure the {label.lower()} file exists")
        return

    # Build the comparator around the original file and compare against the second one.
    is_match = FileComparator(original_file).compare_two_files(
        second_file, "file_comparison_report.txt"
    )

    if is_match:
        outcome = "๐ŸŽ‰ Files MATCH perfectly!"
    else:
        outcome = "โŒ Files have differences. Check the report for details."
    print(outcome)
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()