# Page-export residue kept as a comment so the file parses as Python:
# Spaces: Runtime error | File size: 2,134 Bytes | b61df7c
import pandas as pd
import random
from pathlib import Path
import re
from transformers import pipeline
# Basic configuration: data directories, given relative to the working directory.
raw_path, processed_path, synthetic_path = (
    Path("../data/raw"),
    Path("../data/processed"),
    Path("../data/synthetic"),
)

# Load the raw dataset into a DataFrame.
wikidoc_file = raw_path.joinpath("medical_meadow_wikidoc.csv")
wikidoc = pd.read_csv(wikidoc_file, encoding="utf-8")
# Data cleaning
def clean_text(text):
    """Normalize one text cell for downstream processing.

    The value is stringified, lower-cased, stripped of every character that
    is neither a word character nor whitespace, and has each whitespace run
    collapsed to a single space with no leading or trailing space.

    NOTE: punctuation removal is lossy -- decimal points and hyphens are
    dropped as well (e.g. "2.5 mg" becomes "25 mg").

    Args:
        text: Any scalar cell value; missing values (None/NaN) are accepted.

    Returns:
        The cleaned string, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # Remove punctuation *before* normalizing whitespace: deleting a symbol
    # that sits between or after spaces would otherwise leave doubled or
    # trailing spaces in the result.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
# Run clean_text over each of the three text columns, replacing them in place.
for _column in ("instruction", "input", "output"):
    wikidoc[_column] = wikidoc[_column].apply(clean_text)
# Synthetic data generation
def generate_synthetic_wikidoc(df, n=100, seed=None):
    """Build synthetic rows by resampling ``df`` and tagging each field.

    ``n`` rows are drawn from ``df`` with replacement. For every drawn row
    one marker string is picked at random and appended to the instruction,
    input and output texts (the same marker for all three fields of a row).
    A missing or blank input becomes just the marker.

    Args:
        df: DataFrame with "instruction", "input" and "output" columns.
        n: Number of synthetic rows to produce.
        seed: Optional seed making both the row sampling and the marker
            choice reproducible. When None, pandas' default sampling and the
            global ``random`` module state are used.

    Returns:
        A new DataFrame with columns "instruction_sintetica",
        "input_sintetico" and "output_sintetico", one row per sample.

    Raises:
        ValueError: If ``df`` has no rows and ``n`` is not 0.
    """
    if df.empty and n != 0:
        # Fail with a clear message instead of the low-level sampling error.
        raise ValueError("cannot sample synthetic rows from an empty DataFrame")

    # Fall back to the module-level generator when unseeded so that callers
    # relying on a global random.seed() keep working.
    rng = random if seed is None else random.Random(seed)
    sampled = df.sample(n, replace=True, random_state=seed)
    modificaciones = ["(modificado sintéticamente)", "[datos aumentados]", "(versión sintética)", "[augmented]"]

    instructions, inputs, outputs = [], [], []
    for instruction, input_value, output in zip(
        sampled["instruction"], sampled["input"], sampled["output"]
    ):
        mod = rng.choice(modificaciones)
        # Treat blank input like a missing one; otherwise the result would
        # start with a stray space (" <marker>").
        has_input = pd.notnull(input_value) and str(input_value).strip() != ""
        instructions.append(str(instruction) + " " + mod)
        inputs.append(str(input_value) + " " + mod if has_input else mod)
        outputs.append(str(output) + " " + mod)

    return pd.DataFrame({
        "instruction_sintetica": instructions,
        "input_sintetico": inputs,
        "output_sintetico": outputs,
    })
# Generate and save the synthetic data.
wikidoc_synthetic = generate_synthetic_wikidoc(wikidoc, n=100)
# to_csv does not create missing parent directories, so make sure they exist.
synthetic_path.mkdir(parents=True, exist_ok=True)
wikidoc_synthetic.to_csv(synthetic_path / "wikidoc_synthetic.csv", index=False)

# Combine original and synthetic data.
# The synthetic frame uses its own column names; map them back onto the
# original schema so the rows stack under the same three columns instead of
# producing six half-empty ones.
wikidoc_combined = pd.concat(
    [
        wikidoc,
        wikidoc_synthetic.rename(columns={
            "instruction_sintetica": "instruction",
            "input_sintetico": "input",
            "output_sintetico": "output",
        }),
    ],
    ignore_index=True,
)
processed_path.mkdir(parents=True, exist_ok=True)
wikidoc_combined.to_csv(processed_path / "wikidoc_combined.csv", index=False)
print("Datos procesados y sintéticos generados exitosamente.")