Spaces:
Runtime error
Runtime error
import pandas as pd | |
import random | |
from pathlib import Path | |
import re | |
from transformers import pipeline | |
# Basic configuration: data directories, relative to the working directory.
raw_path = Path("../data/raw")
processed_path = Path("../data/processed")
synthetic_path = Path("../data/synthetic")

# Load the WikiDoc CSV from the raw-data directory.
wikidoc_file = raw_path.joinpath("medical_meadow_wikidoc.csv")
wikidoc = pd.read_csv(wikidoc_file, encoding="utf-8")
# Data cleaning
def clean_text(text):
    """Normalize a cell value to lowercase words separated by single spaces.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    NaN) become the empty string. Any other value is converted to ``str``,
    lowercased, stripped of every character that is neither a word character
    nor whitespace, and finally has each whitespace run collapsed to a single
    space with no leading or trailing space.

    Args:
        text: A scalar cell value (string, number, ``None`` or NaN).

    Returns:
        The cleaned string; ``""`` for missing values.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # Remove punctuation BEFORE normalizing whitespace: deleting a
    # free-standing symbol ("a - b", "hello !") would otherwise leave a
    # double or trailing space in the result.
    # NOTE: this also deletes symbols inside tokens, e.g. "3.5" -> "35".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()
# Normalize each text column with clean_text, one column at a time.
for _column in ("instruction", "input", "output"):
    wikidoc[_column] = wikidoc[_column].apply(clean_text)
# Synthetic data generation
def generate_synthetic_wikidoc(df, n=100, seed=None):
    """Build ``n`` synthetic rows by tagging rows sampled from ``df``.

    Rows are drawn from ``df`` with replacement. For each drawn row one tag is
    picked at random and appended (after a single space) to the row's
    ``instruction``, ``input`` and ``output`` values. A field that is missing
    (NaN/None) or an empty string becomes the tag alone, so the result never
    contains a literal "nan" or a leading space.

    Args:
        df: DataFrame with ``instruction``, ``input`` and ``output`` columns.
        n: Number of synthetic rows to generate.
        seed: Optional seed. When given, both the row sampling and the tag
            choice are reproducible; when ``None`` the global random state is
            used, as before.

    Returns:
        A DataFrame with the columns ``instruction_sintetica``,
        ``input_sintetico`` and ``output_sintetico``.

    Raises:
        ValueError: If ``df`` has no rows but rows were requested.
        KeyError: If one of the required columns is missing.
    """
    # Fail early with a clear message instead of an opaque sampling error.
    if df.empty and n != 0:
        raise ValueError("cannot sample synthetic rows from an empty DataFrame")

    modificaciones = [
        "(modificado sintéticamente)",
        "[datos aumentados]",
        "(versión sintética)",
        "[augmented]",
    ]
    # Keep using the module-level generator when no seed is requested so that
    # callers who seeded ``random`` globally see unchanged behavior.
    rng = random if seed is None else random.Random(seed)
    sampled = df.sample(n, replace=True, random_state=seed)

    def _tagged(value, tag):
        # A missing or empty field carries no text: return the tag alone.
        if pd.isna(value) or str(value) == "":
            return tag
        return str(value) + " " + tag

    instructions, inputs, outputs = [], [], []
    fields = zip(sampled["instruction"], sampled["input"], sampled["output"])
    for instruction, input_text, output_text in fields:
        # One tag per row, shared by all three fields of that row.
        mod = rng.choice(modificaciones)
        instructions.append(_tagged(instruction, mod))
        inputs.append(_tagged(input_text, mod))
        outputs.append(_tagged(output_text, mod))

    return pd.DataFrame({
        "instruction_sintetica": instructions,
        "input_sintetico": inputs,
        "output_sintetico": outputs,
    })
# Generate the synthetic rows and save them.
wikidoc_synthetic = generate_synthetic_wikidoc(wikidoc, n=100)
# to_csv does not create missing directories, so make sure both output
# directories exist before anything is written.
synthetic_path.mkdir(parents=True, exist_ok=True)
processed_path.mkdir(parents=True, exist_ok=True)
wikidoc_synthetic.to_csv(synthetic_path / "wikidoc_synthetic.csv", index=False)

# Combine original and synthetic data.
# pd.concat aligns on column names, and the synthetic frame uses
# "*_sintetica"/"*_sintetico" names. Map them back to the original names so
# the synthetic rows are appended under the same columns instead of producing
# extra, half-empty columns.
wikidoc_combined = pd.concat(
    [
        wikidoc,
        wikidoc_synthetic.rename(columns={
            "instruction_sintetica": "instruction",
            "input_sintetico": "input",
            "output_sintetico": "output",
        }),
    ],
    ignore_index=True,
)
wikidoc_combined.to_csv(processed_path / "wikidoc_combined.csv", index=False)

print("Datos procesados y sintéticos generados exitosamente.")