# app.py - Deploy to a Hugging Face Space (New Space → Gradio SDK → paste this file)
# Deploy-ready: sample data is auto-generated on startup
import gradio as gr
import pandas as pd
import numpy as np
from typing import Tuple, Dict
import base64
import warnings
warnings.filterwarnings('ignore')

def generate_sample_data():
    """Creates a messy CSV for demo purposes - runs on Space startup"""
    np.random.seed(42)
    n = 1000
    
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],  # High cardinality
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),  # Another high-cardinality column
        'price': np.random.uniform(10, 1000, n).astype('float64'),  # Memory waste
        'quantity': np.random.randint(1, 10, n).astype('int64'),  # More memory waste
        'temperature': np.random.normal(20, 5, n),  # For interpolation demo
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),  # Missing values
        'target': np.random.choice([0, 1], n)  # Binary target
    })
    
    # Introduce missingness that correlates with target (leakage)
    df.loc[df['target'] == 1, 'temperature'] = np.nan
    
    # Mess up datetime format for some rows
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
    
    # Add constant column (silent failure)
    df['version'] = 'v1.0'
    
    df.to_csv('sample_messy_data.csv', index=False)
    return 'sample_messy_data.csv'
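
# Each planted issue maps to one of the five fixes in DSPreprocessor below:
# oversized int64/float64 dtypes (step 1), mixed datetime formats (step 2),
# high-cardinality IDs/SKUs (step 3), target-correlated missingness in
# 'temperature' (step 4), and the constant 'version' column (step 5).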

class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time"""
    
    def __init__(self):
        self.report = {"actions": [], "warnings": [], "stats": {}}
    
    def fit_transform(self, df: pd.DataFrame, target_col: str = "") -> Tuple[pd.DataFrame, Dict]:
        df = df.copy()  # avoid mutating the caller's frame
        # 1. Memory Killer: Downcast numeric types (50-90% RAM savings)
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            try:
                if col_type == 'int64':
                    df[col] = pd.to_numeric(df[col], downcast='integer')
                else:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                if df[col].dtype != col_type:
                    self.report["actions"].append(f"βœ“ {col}: {col_type} β†’ {df[col].dtype}")
            except:
                pass
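        # Note: downcast='integer' picks the smallest signed subtype that holds
        # every value (int8/int16/int32); downcast='float' goes to float32, which
        # keeps ~7 significant digits, so it is lossy for high-precision floats.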
        
        # 2. DateTime Hell: Auto-detect and parse
        for col in df.select_dtypes(include=['object']).columns:
            try:
                # format='mixed' (pandas >= 2.0) parses each element on its own,
                # which handles the mixed ' '/'T' separators in the sample data
                # (infer_datetime_format is deprecated and a no-op in pandas 2.x)
                parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
                if parsed.notnull().sum() > len(df) * 0.3:
                    df[col] = parsed
                    self.report["actions"].append(f"✓ {col}: Parsed datetime ({parsed.notnull().sum()} valid)")
            except (ValueError, TypeError):
                pass  # not a datetime-like column
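        # The 30% valid-parse threshold is a heuristic: it tolerates dirty rows
        # without false-flagging free-text columns where only a few values
        # happen to look like dates.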
        
        # 3. Categorical Explosion: label-encode high-cardinality string columns
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > len(df) * 0.5:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_unique:,} unique values → label-encoded to integer codes"
                )
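        # Caveat: .cat.codes numbers the categories observed in *this* frame, so
        # the same string can get a different code on new data; a real pipeline
        # would persist the mapping (or use a true hashing trick) for serving.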
        
        # 4. Missing-value leakage: missingness that predicts the target is a
        #    red flag (a model can exploit "was this field empty?")
        if target_col and target_col in df.columns:
            target = pd.to_numeric(df[target_col], errors='coerce')
            for col in df.columns:
                if col == target_col or not df[col].isnull().any():
                    continue
                r = df[col].isnull().astype(int).corr(target)
                if pd.notnull(r) and abs(r) > 0.3:
                    self.report["warnings"].append(
                        f"⚠️ Possible leakage: missingness in {col} correlates with {target_col} (r={r:.2f})"
                    )
        # Also flag pairs of columns whose missingness moves together
        missing_corr = df.isnull().corr()
        high_corr = missing_corr.where(np.triu(np.ones(missing_corr.shape, dtype=bool), k=1))
        high_corr = high_corr[high_corr.abs() > 0.9].stack()
        for (col_a, col_b), r in high_corr.items():
            self.report["warnings"].append(
                f"⚠️ Missingness correlation: {col_a} ↔ {col_b} (r={r:.2f})"
            )
        
        # 5. Silent Failures: Detect constant columns
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")
        
        # Final stats
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({100*(1-end_mem/start_mem):.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            "Dtypes optimized": len([a for a in self.report["actions"] if "β†’" in a])
        }
        
        return df, self.report
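
# Minimal smoke test (a sketch, not wired into the UI; assumes the sample CSV
# generated by generate_sample_data() exists):
#
#   df = pd.read_csv('sample_messy_data.csv')
#   cleaned, report = DSPreprocessor().fit_transform(df, target_col='target')
#   print(report["warnings"])  # should flag temperature <-> target missingness
#   print(report["stats"])     # memory saved, rows/columns, dtypes optimized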

def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Main function for Gradio"""
    if file_obj is None:
        return None, None, "Upload a CSV first"
    
    # gr.File may hand back a path string or a tempfile-like object with .name,
    # depending on the Gradio version
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    df = pd.read_csv(path)
    preprocessor = DSPreprocessor()
    
    target_col = (target_col or "").strip()
    if target_col and target_col in df.columns:
        df = df[[c for c in df.columns if c != target_col] + [target_col]]  # move target last
    
    cleaned_df, report = preprocessor.fit_transform(df, target_col=target_col)
    
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    
    return cleaned_df, report, href
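
# Design note: the base64 data-URI download keeps the app self-contained with no
# temp files, but it inflates the page payload for large CSVs; swapping the
# gr.HTML output for a gr.File component would be the heavier-duty alternative.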

# Generate sample data on startup
sample_file = generate_sample_data()

# UI
with gr.Blocks(title="DS AutoPrep") as demo:
    gr.Markdown("# 🚀 DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
    
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
    
    go_btn = gr.Button("🔥 Clean My Data", variant="primary")
    
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()
        with gr.TabItem("Download"):
            download_html = gr.HTML()
    
    # Clickable example so visitors can try the app without their own CSV
    gr.Examples(
        examples=[sample_file],
        inputs=[file_input],
        label="Try with sample data"
    )
    
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )

demo.launch()
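
# Deployment sketch (assumes a standard Gradio Space with this file as app.py):
# requirements.txt needs gradio, pandas, and numpy. The Space runs the script on
# startup, so generate_sample_data() executes once and demo.launch() serves the
# UI on the Space's default host/port.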