|
import pandas as pd
|
|
from ctgan import CTGAN
|
|
from sklearn.preprocessing import LabelEncoder
|
|
import os
|
|
import json
|
|
import requests
|
|
|
|
def train_and_generate_synthetic(real_data, schema, output_path, *, epochs=300):
    """Train a CTGAN model on ``real_data`` and write synthetic rows to CSV.

    Columns whose schema type is ``'string'`` are label-encoded before
    training, passed to CTGAN as discrete columns, and decoded back to their
    original labels in the generated output.

    Args:
        real_data: Training data (expected to be a pandas DataFrame). It is
            copied before encoding, so the caller's object is not modified.
        schema: Mapping with the keys ``'columns'`` and ``'types'`` (parallel
            sequences) and ``'size'`` (number of synthetic rows to sample).
        output_path: Destination CSV path. Its parent directory is created
            if it does not exist yet.
        epochs: Number of CTGAN training epochs.

    Returns:
        The generated synthetic data, as returned by ``CTGAN.sample`` with
        the categorical columns decoded.
    """
    categorical_cols = [
        col
        for col, dtype in zip(schema['columns'], schema['types'])
        if dtype == 'string'
    ]

    # Encode on a copy: assigning into ``real_data`` directly would replace
    # the caller's string columns with integer codes as a hidden side effect.
    encoded = real_data.copy()
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        label_encoders[col] = le

    gan = CTGAN(epochs=epochs)
    gan.fit(encoded, categorical_cols)

    synthetic_data = gan.sample(schema['size'])

    # Map the integer codes back to the original category labels.
    for col, le in label_encoders.items():
        synthetic_data[col] = le.inverse_transform(synthetic_data[col])

    # Create the directory the file is actually written to, rather than a
    # fixed 'outputs' folder that may not match ``output_path``.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    synthetic_data.to_csv(output_path, index=False)
    print(f"✅ Synthetic data saved to {output_path}")
    return synthetic_data
|
|
|
|
def generate_schema(prompt, *, timeout=30):
    """Fetch a dataset schema for ``prompt`` from the schema API.

    Args:
        prompt: Text sent to the API as the ``"prompt"`` field of the JSON
            request body.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded schema dict (containing at least ``'columns'``,
        ``'types'`` and ``'size'``), or ``None`` if the request failed or the
        response body was not valid JSON.

    Raises:
        ValueError: If the response is valid JSON but is not an object with
            the keys ``'columns'``, ``'types'`` and ``'size'``.
    """
    API_URL = "https://api.example.com/schema"
    # NOTE: placeholder credential; substitute a real token before use.
    headers = {"Authorization": "Bearer YOUR_HUGGINGFACE_TOKEN"}

    try:
        response = requests.post(
            API_URL,
            json={"prompt": prompt},
            headers=headers,
            timeout=timeout,
        )
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return None

    print("🔍 Raw API Response:", response.text)

    try:
        schema = response.json()
    except ValueError:
        # Every JSON decode error that ``response.json()`` can raise
        # (stdlib, simplejson, or requests' own) derives from ValueError.
        print("❌ Failed to parse JSON response. API might be down or returning non-JSON data.")
        return None

    required_keys = ('columns', 'types', 'size')
    # The isinstance check keeps payloads such as ``null`` or a bare number
    # from surfacing as a TypeError out of the membership test.
    if not isinstance(schema, dict) or any(key not in schema for key in required_keys):
        raise ValueError("❌ Invalid schema format! Expected keys: 'columns', 'types', 'size'")

    print("✅ Valid Schema Received:", schema)
    return schema
|
|
|
|
def fetch_data(domain):
    """Load the real dataset for ``domain`` from ``datasets/<domain>.csv``.

    Args:
        domain: Dataset name, used verbatim as the CSV file stem inside the
            ``datasets`` directory (relative to the working directory).

    Returns:
        pandas.DataFrame: The parsed CSV; never empty.

    Raises:
        FileNotFoundError: If ``datasets/<domain>.csv`` does not exist.
        ValueError: If the file holds no data (zero bytes or header only).
    """
    data_path = f"datasets/{domain}.csv"
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"❌ Dataset for {domain} not found.")

    try:
        df = pd.read_csv(data_path)
    except pd.errors.EmptyDataError as exc:
        # A zero-byte file makes read_csv raise before a frame exists;
        # report it the same way as a header-only file.
        raise ValueError("❌ Loaded data is invalid!") from exc

    if df.empty:
        raise ValueError("❌ Loaded data is invalid!")
    return df
|
|
|