case_study / fine_tuning.py
GurgenGulay's picture
Update fine_tuning.py
1b94a22 verified
raw
history blame
2.61 kB
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Load the tokenizer and the model.
# NOTE: these statements run at import time, so importing this module
# instantiates the "t5-base" tokenizer and model. The resulting module-level
# `tokenizer` and `model` objects are the ones fine_tune_model() trains and saves.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Dataset preparation
def prepare_data(input_texts, target_texts, tokenizer, max_length=512):
    """Tokenize parallel source/target texts into seq2seq training features.

    Args:
        input_texts: Sequence of source strings.
        target_texts: Sequence of target strings aligned with ``input_texts``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is invoked
            as ``tokenizer(texts, max_length=..., truncation=True,
            padding="max_length")`` and must return a mapping containing
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is truncated/padded to
            (defaults to the previously hard-coded 512).

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each holding one list of ints per example.

    Raises:
        ValueError: If ``input_texts`` and ``target_texts`` differ in length.
    """
    if len(input_texts) != len(target_texts):
        raise ValueError(
            "input_texts and target_texts must have the same length "
            f"(got {len(input_texts)} and {len(target_texts)})"
        )

    inputs = tokenizer(
        input_texts, max_length=max_length, truncation=True, padding="max_length"
    )
    targets = tokenizer(
        target_texts, max_length=max_length, truncation=True, padding="max_length"
    )

    # Targets are padded up to max_length. Replace the padding ids with -100
    # (the index the Transformers loss ignores) so the model is not trained
    # to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }
# Paraphrasing function
def paraphrase_with_model(text, model, tokenizer):
    """Generate a sampled rewrite of ``text`` with a seq2seq model.

    The text is prefixed with the fixed prompt
    ``"Teach the following content: "``, tokenized (truncated to 512 tokens),
    and passed to ``model.generate`` using top-k / nucleus sampling.
    Because sampling is enabled, repeated calls may return different strings.

    Args:
        text: Source string to rewrite.
        model: Model exposing ``generate(...)``; if it has a ``device``
            attribute the encoded inputs are moved there first.
        tokenizer: Tokenizer used both to encode the prompt and to decode
            the generated ids.

    Returns:
        The first generated sequence decoded to a string with special
        tokens removed.
    """
    prompt = "Teach the following content: " + text
    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True, max_length=512
    )

    # Keep the inputs on the same device as the model so generation also
    # works once the model has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() guess it.
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        max_length=150,
        no_repeat_ngram_size=2,
        # early_stopping is intentionally not passed: it only applies to
        # beam search and has no effect when sampling with a single beam.
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Training function
def fine_tune_model(
    input_texts,
    target_texts,
    *,
    test_size=0.1,
    num_augmented=10,
    random_state=None,
):
    """Fine-tune the module-level ``model`` on (input, target) text pairs.

    Steps:
      1. Split the pairs into training and validation sets.
      2. Augment the training set with generated rewrites of its first
         ``num_augmented`` inputs and targets.
      3. Train with ``Trainer`` and save the model and tokenizer to
         ``./fine_tuned_model``.

    Args:
        input_texts: Sequence of source strings.
        target_texts: Sequence of target strings aligned with ``input_texts``.
        test_size: Fraction (or count) of pairs held out for validation;
            forwarded to ``train_test_split``.
        num_augmented: How many training pairs to augment with
            ``paraphrase_with_model``.
        random_state: Optional seed forwarded to ``train_test_split`` to make
            the split reproducible.

    Returns:
        None. Checkpoints are written under ``./results``, logs under
        ``./logs`` and the final model/tokenizer under ``./fine_tuned_model``.
    """
    # Split into training and validation data.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(input_texts),
        list(target_texts),
        test_size=test_size,
        random_state=random_state,
    )

    # Augmentation and dataset preparation.
    # Only the training split is used here: building the training set from the
    # full input would put every validation example into training and make the
    # evaluation loss meaningless.
    # NOTE(review): inputs and targets are rewritten independently with
    # sampling, so an augmented pair is only loosely aligned — confirm this is
    # the intended augmentation scheme.
    extra_inputs = [
        paraphrase_with_model(text, model, tokenizer)
        for text in train_texts[:num_augmented]
    ]
    extra_targets = [
        paraphrase_with_model(text, model, tokenizer)
        for text in train_labels[:num_augmented]
    ]
    augmented_input_texts = train_texts + extra_inputs
    augmented_target_texts = train_labels + extra_targets

    train_dataset = Dataset.from_dict(
        prepare_data(augmented_input_texts, augmented_target_texts, tokenizer)
    )
    val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

    # Training arguments.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=10,
    )

    # Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train.
    trainer.train()

    # Save the fine-tuned model together with its tokenizer.
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")