# app.py - Deploy to a Hugging Face Space (New → Gradio → paste this)
# Deploy-ready, with auto-generated sample data
import gradio as gr
import pandas as pd
import numpy as np
from typing import Tuple, Dict
import base64
import warnings

warnings.filterwarnings('ignore')

def generate_sample_data():
    """Creates a messy CSV for demo purposes - runs on Space startup."""
    np.random.seed(42)
    n = 1000
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],  # High cardinality
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),  # Another high-cardinality column
        'price': np.random.uniform(10, 1000, n).astype('float64'),  # Memory waste
        'quantity': np.random.randint(1, 10, n).astype('int64'),  # More memory waste
        'temperature': np.random.normal(20, 5, n),  # Gets target-correlated NaNs below
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),  # Missing values
        'target': np.random.choice([0, 1], n)  # Binary target
    })
    # Introduce missingness that correlates with the target (leakage)
    df.loc[df['target'] == 1, 'temperature'] = np.nan
    # Mess up the datetime format for some rows
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
    # Add a constant column (silent failure)
    df['version'] = 'v1.0'
    df.to_csv('sample_messy_data.csv', index=False)
    return 'sample_messy_data.csv'
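
# The generated CSV bakes in one instance of each problem the preprocessor
# targets: oversized int64/float64 dtypes, inconsistent datetime strings,
# high-cardinality IDs (customer_id, product_sku), target-correlated
# missingness in temperature, and a constant version column.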

class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time."""

    def __init__(self):
        self.report = {"actions": [], "warnings": [], "stats": {}}

    def fit_transform(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
        # 1. Memory killer: downcast numeric types (50-90% RAM savings)
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            try:
                if col_type == 'int64':
                    df[col] = pd.to_numeric(df[col], downcast='integer')
                else:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                if df[col].dtype != col_type:
                    self.report["actions"].append(f"✓ {col}: {col_type} → {df[col].dtype}")
            except (ValueError, TypeError):
                pass
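        # For example, quantity (values 1-9) fits in int8 (1 byte vs. 8) and
        # price downcasts from float64 to float32. Float downcasting trades
        # precision for memory; skip it if exact float64 math matters later.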
        # 2. Datetime hell: auto-detect and parse string columns
        # (infer_datetime_format was deprecated in pandas 2.0, where format
        # inference became the default, so the flag is omitted here)
        for col in df.select_dtypes(include=['object']).columns:
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
                if parsed.notnull().sum() > len(df) * 0.3:
                    df[col] = parsed
                    self.report["actions"].append(
                        f"✓ {col}: Parsed datetime ({parsed.notnull().sum()} valid)"
                    )
            except Exception:
                pass
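        # In the sample data every 100th purchase_date uses a 'T' separator;
        # depending on the pandas version those rows may coerce to NaT rather
        # than parse, which the 30% validity threshold above tolerates.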
        # 3. Categorical explosion: integer-encode high-cardinality strings
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > len(df) * 0.5:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_unique:,} unique values → label-encoded to integer codes"
                )
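        # Design note: .cat.codes is plain label encoding, not hashing; the
        # mapping is only stable within this one DataFrame. For encodings
        # consistent across train/test splits, something like sklearn's
        # FeatureHasher (the actual "hashing trick") is the usual tool.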
        # 4. Leakage via missingness: flag column pairs whose null patterns
        # are highly correlated
        missing_corr = df.isnull().corr()
        high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index(name='r')
        # Keep each pair once and drop self-correlations
        high_corr = high_corr[high_corr['level_0'] < high_corr['level_1']]
        if not high_corr.empty:
            for _, row in high_corr.iterrows():
                self.report["warnings"].append(
                    f"⚠️ Missingness correlation: {row['level_0']} ↔ {row['level_1']} (r={row['r']:.2f})"
                )
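        # Caveat: this only flags columns whose *null patterns* move together.
        # In the sample data, temperature is missing exactly when target == 1,
        # but target itself has no nulls, so its null-indicator is constant
        # and its correlations come out NaN. To test leakage against the
        # target directly, correlate df[col].isnull() with the target values.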
        # 5. Silent failures: detect constant columns
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")

        # Final stats (only downcast actions contain '→', so that count is
        # the number of dtype optimizations)
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({100 * (1 - end_mem / start_mem):.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            "Dtypes optimized": len([a for a in self.report["actions"] if "→" in a])
        }
        return df, self.report
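
# Minimal usage sketch outside the UI (commented out so it does not run on
# Space startup; the file name matches the sample generated below):
#
#     df = pd.read_csv("sample_messy_data.csv")
#     cleaned, report = DSPreprocessor().fit_transform(df)
#     print(report["stats"])          # memory saved, rows, columns, dtypes optimized
#     for warning in report["warnings"]:
#         print(warning)              # leakage flags, constant columns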

def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Main callback for Gradio."""
    if file_obj is None:
        return None, None, "Upload a CSV first"
    # gr.File passes a tempfile-like object in older Gradio versions and a
    # plain filepath string in Gradio 4.x; handle both
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    df = pd.read_csv(path)
    preprocessor = DSPreprocessor()
    # Move the target column to the end so it is easy to spot in the preview
    if target_col and target_col in df.columns:
        df = df[[c for c in df.columns if c != target_col] + [target_col]]
    cleaned_df, report = preprocessor.fit_transform(df)
    # Inline data-URI download link for the cleaned CSV
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    return cleaned_df, report, href
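
# Design note: the data-URI link embeds the whole CSV in the returned HTML,
# which is fine for demo-sized files; for large outputs, writing to a temp
# file and returning it through a gr.File output component would keep the
# page light (an alternative, not what this app does).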

# Generate sample data on startup
sample_file = generate_sample_data()

# UI
with gr.Blocks(title="DS AutoPrep") as demo:
    gr.Markdown("# 🚀 DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
    go_btn = gr.Button("🔥 Clean My Data", variant="primary")
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()
        with gr.TabItem("Download"):
            download_html = gr.HTML()
    # Offer the generated messy CSV as a one-click example
    gr.Examples(
        examples=[[sample_file]],
        inputs=[file_input],
        label="Try with sample data"
    )
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )

demo.launch()