"""Clean the medical_meadow_wikidoc CSV and augment it with synthetic rows.

Run as a script, this module:

1. loads ``medical_meadow_wikidoc.csv`` from ``raw_path``;
2. normalizes the ``instruction``, ``input`` and ``output`` text columns;
3. builds marker-tagged synthetic copies of randomly sampled rows and writes
   them to ``synthetic_path``;
4. writes the original rows followed by the synthetic rows to
   ``processed_path``.

Importing the module only defines the helpers; no file is read or written.
"""
import random
import re
from pathlib import Path
from typing import Optional

import pandas as pd

try:
    # `pipeline` is not referenced anywhere in this script. The import is kept
    # so the name stays available, but it is optional so that cleaning and
    # augmentation do not hard-fail when transformers is not installed.
    from transformers import pipeline
except ImportError:
    pipeline = None

# Basic configuration
raw_path = Path("../data/raw")
processed_path = Path("../data/processed")
synthetic_path = Path("../data/synthetic")

# Text columns that are cleaned and augmented.
TEXT_COLUMNS = ("instruction", "input", "output")

# Mapping from the synthetic frame's column names back to the original ones.
SYNTHETIC_TO_ORIGINAL = {
    "instruction_sintetica": "instruction",
    "input_sintetico": "input",
    "output_sintetico": "output",
}

# Markers appended to every synthetic field (runtime data, kept verbatim).
SYNTHETIC_MARKERS = (
    "(modificado sintéticamente)",
    "[datos aumentados]",
    "(versión sintética)",
    "[augmented]",
)

_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text) -> str:
    """Return *text* lower-cased, without punctuation, single-spaced.

    Missing values (``None``/NaN) become the empty string. Any character that
    is neither a word character nor whitespace is removed, every whitespace
    run is collapsed to one space, and the result is trimmed.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # Strip punctuation *before* collapsing whitespace: removing a symbol that
    # sits between two spaces ("a , b") would otherwise leave a double space,
    # and a trailing symbol ("end .") would leave a trailing space.
    without_punctuation = _PUNCTUATION_RE.sub('', lowered)
    return _WHITESPACE_RE.sub(' ', without_punctuation).strip()


def generate_synthetic_wikidoc(
    df: pd.DataFrame, n: int = 100, seed: Optional[int] = None
) -> pd.DataFrame:
    """Build *n* synthetic rows by tagging randomly sampled rows of *df*.

    Rows are sampled with replacement. For each sampled row one marker is
    picked from ``SYNTHETIC_MARKERS`` and appended to the instruction, input
    and output text. A missing or empty input is replaced by the marker alone
    (no leading space).

    Args:
        df: Frame with ``instruction``, ``input`` and ``output`` columns.
        n: Number of synthetic rows to generate.
        seed: Optional integer seed. When given, both the row sampling and the
            marker choice are reproducible. When ``None`` (default), pandas'
            default sampling and the global ``random`` state are used.

    Returns:
        A new frame with columns ``instruction_sintetica``,
        ``input_sintetico`` and ``output_sintetico``.
    """
    sampled = df.sample(n, replace=True, random_state=seed)
    # Fall back to the module-level generator so callers that seed `random`
    # globally keep getting the same marker sequence as before.
    rng = random.Random(seed) if seed is not None else random

    instructions, inputs, outputs = [], [], []
    for instruction, input_value, output in zip(
        sampled["instruction"], sampled["input"], sampled["output"]
    ):
        marker = rng.choice(SYNTHETIC_MARKERS)
        instructions.append(f"{instruction} {marker}")
        # Cleaned data stores missing inputs as "", so a null check alone
        # never fires; treat empty strings like missing values too.
        if pd.isna(input_value) or str(input_value) == "":
            inputs.append(marker)
        else:
            inputs.append(f"{input_value} {marker}")
        outputs.append(f"{output} {marker}")

    return pd.DataFrame({
        "instruction_sintetica": instructions,
        "input_sintetico": inputs,
        "output_sintetico": outputs,
    })


def combine_with_synthetic(
    original: pd.DataFrame, synthetic: pd.DataFrame
) -> pd.DataFrame:
    """Stack *synthetic* rows under *original* rows in the same columns.

    The synthetic frame uses suffixed column names; concatenating it as-is
    would add three extra columns and leave every row half-filled with NaN.
    The synthetic columns are therefore renamed to the original names first.
    """
    aligned = synthetic.rename(columns=SYNTHETIC_TO_ORIGINAL)
    return pd.concat([original, aligned], ignore_index=True)


if __name__ == "__main__":
    # Load data
    wikidoc_file = raw_path / "medical_meadow_wikidoc.csv"
    wikidoc = pd.read_csv(wikidoc_file, encoding='utf-8')

    # Clean data
    for column in TEXT_COLUMNS:
        wikidoc[column] = wikidoc[column].apply(clean_text)

    # Make sure the output directories exist before writing into them.
    synthetic_path.mkdir(parents=True, exist_ok=True)
    processed_path.mkdir(parents=True, exist_ok=True)

    # Generate and save synthetic data
    wikidoc_synthetic = generate_synthetic_wikidoc(wikidoc, n=100)
    wikidoc_synthetic.to_csv(synthetic_path / "wikidoc_synthetic.csv", index=False)

    # Combine original and synthetic data
    wikidoc_combined = combine_with_synthetic(wikidoc, wikidoc_synthetic)
    wikidoc_combined.to_csv(processed_path / "wikidoc_combined.csv", index=False)

    print("Datos procesados y sintéticos generados exitosamente.")