# app.py - Deploy to a Hugging Face Space (New Space → Gradio SDK → paste this file)
# Deploy-ready: sample data is auto-generated on startup
import gradio as gr
import pandas as pd
import numpy as np
from typing import Tuple, Dict
import base64
import warnings
warnings.filterwarnings('ignore')

def generate_sample_data():
    """Creates a messy CSV for demo purposes - runs on Space startup"""
    np.random.seed(42)
    n = 1000
    
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],  # High cardinality
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),  # Another high-cardinality column
        'price': np.random.uniform(10, 1000, n).astype('float64'),  # Memory waste
        'quantity': np.random.randint(1, 10, n).astype('int64'),  # More memory waste
        'temperature': np.random.normal(20, 5, n),  # For interpolation demo
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),  # Missing values
        'target': np.random.choice([0, 1], n)  # Binary target
    })
    
    # Introduce missingness that correlates with target (leakage)
    df.loc[df['target'] == 1, 'temperature'] = np.nan
    
    # Mess up datetime format for some rows
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
    
    # Add constant column (silent failure)
    df['version'] = 'v1.0'
    
    df.to_csv('sample_messy_data.csv', index=False)
    return 'sample_messy_data.csv'
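
# Each planted issue maps to one of the five fixes in DSPreprocessor below:
# oversized int64/float64 dtypes (step 1), mixed datetime formats (step 2),
# high-cardinality IDs/SKUs (step 3), target-correlated missingness in
# 'temperature' (step 4), and the constant 'version' column (step 5).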

class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time"""
    
    def __init__(self):
        self.report = {"actions": [], "warnings": [], "stats": {}}
    
    def fit_transform(self, df: pd.DataFrame, target_col: str = "") -> Tuple[pd.DataFrame, Dict]:
        df = df.copy()  # avoid mutating the caller's frame
        # 1. Memory Killer: Downcast numeric types (50-90% RAM savings)
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            try:
                if col_type == 'int64':
                    df[col] = pd.to_numeric(df[col], downcast='integer')
                else:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                if df[col].dtype != col_type:
                    self.report["actions"].append(f"βœ“ {col}: {col_type} β†’ {df[col].dtype}")
            except:
                pass
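        # Note: downcast='integer' picks the smallest signed subtype that holds
        # every value (int8/int16/int32); downcast='float' goes to float32, which
        # keeps ~7 significant digits, so it is lossy for high-precision floats.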
        
        # 2. DateTime Hell: Auto-detect and parse
        for col in df.select_dtypes(include=['object']).columns:
            try:
                # format='mixed' (pandas >= 2.0) parses each element on its own,
                # which handles the mixed ' '/'T' separators in the sample data
                # (infer_datetime_format is deprecated and a no-op in pandas 2.x)
                parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
                if parsed.notnull().sum() > len(df) * 0.3:
                    df[col] = parsed
                    self.report["actions"].append(f"✓ {col}: Parsed datetime ({parsed.notnull().sum()} valid)")
            except (ValueError, TypeError):
                pass  # not a datetime-like column
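        # The 30% valid-parse threshold is a heuristic: it tolerates dirty rows
        # without false-flagging free-text columns where only a few values
        # happen to look like dates.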
        
        # 3. Categorical Explosion: label-encode high-cardinality string columns
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > len(df) * 0.5:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_unique:,} unique values → label-encoded to integer codes"
                )
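        # Caveat: .cat.codes numbers the categories observed in *this* frame, so
        # the same string can get a different code on new data; a real pipeline
        # would persist the mapping (or use a true hashing trick) for serving.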
        
        # 4. Missing-value leakage: missingness that predicts the target is a
        #    red flag (a model can exploit "was this field empty?")
        if target_col and target_col in df.columns:
            target = pd.to_numeric(df[target_col], errors='coerce')
            for col in df.columns:
                if col == target_col or not df[col].isnull().any():
                    continue
                r = df[col].isnull().astype(int).corr(target)
                if pd.notnull(r) and abs(r) > 0.3:
                    self.report["warnings"].append(
                        f"⚠️ Possible leakage: missingness in {col} correlates with {target_col} (r={r:.2f})"
                    )
        # Also flag pairs of columns whose missingness moves together
        missing_corr = df.isnull().corr()
        high_corr = missing_corr.where(np.triu(np.ones(missing_corr.shape, dtype=bool), k=1))
        high_corr = high_corr[high_corr.abs() > 0.9].stack()
        for (col_a, col_b), r in high_corr.items():
            self.report["warnings"].append(
                f"⚠️ Missingness correlation: {col_a} ↔ {col_b} (r={r:.2f})"
            )
        
        # 5. Silent Failures: Detect constant columns
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")
        
        # Final stats
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({100*(1-end_mem/start_mem):.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            "Dtypes optimized": len([a for a in self.report["actions"] if "β†’" in a])
        }
        
        return df, self.report
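
# Minimal smoke test (a sketch, not wired into the UI; assumes the sample CSV
# generated by generate_sample_data() exists):
#
#   df = pd.read_csv('sample_messy_data.csv')
#   cleaned, report = DSPreprocessor().fit_transform(df, target_col='target')
#   print(report["warnings"])  # should flag temperature <-> target missingness
#   print(report["stats"])     # memory saved, rows/columns, dtypes optimized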

def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Main function for Gradio"""
    if file_obj is None:
        return None, None, "Upload a CSV first"
    
    # gr.File may hand back a path string or a tempfile-like object with .name,
    # depending on the Gradio version
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    df = pd.read_csv(path)
    preprocessor = DSPreprocessor()
    
    target_col = (target_col or "").strip()
    if target_col and target_col in df.columns:
        df = df[[c for c in df.columns if c != target_col] + [target_col]]  # move target last
    
    cleaned_df, report = preprocessor.fit_transform(df, target_col=target_col)
    
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    
    return cleaned_df, report, href
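
# Design note: the base64 data-URI download keeps the app self-contained with no
# temp files, but it inflates the page payload for large CSVs; swapping the
# gr.HTML output for a gr.File component would be the heavier-duty alternative.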

# Generate sample data on startup
sample_file = generate_sample_data()

# UI
with gr.Blocks(title="DS AutoPrep") as demo:
    gr.Markdown("# 🚀 DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
    
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
    
    go_btn = gr.Button("🔥 Clean My Data", variant="primary")
    
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()
        with gr.TabItem("Download"):
            download_html = gr.HTML()
    
    # Clickable example so visitors can try the app without their own CSV
    gr.Examples(
        examples=[sample_file],
        inputs=[file_input],
        label="Try with sample data"
    )
    
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )

demo.launch()
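
# Deployment sketch (assumes a standard Gradio Space with this file as app.py):
# requirements.txt needs gradio, pandas, and numpy. The Space runs the script on
# startup, so generate_sample_data() executes once and demo.launch() serves the
# UI on the Space's default host/port.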