from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prepare the dataset: tokenize inputs and targets, and mask padding
# tokens in the labels with -100 so the loss ignores them
def prepare_data(input_texts, target_texts, tokenizer):
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in targets["input_ids"]
    ]
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Paraphrasing function used for data augmentation
def paraphrase_with_model(text, model, tokenizer):
    prompt = "Teach the following content: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        max_length=150,
        no_repeat_ngram_size=2,
        # early_stopping applies only to beam search, so it is not set here
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Training function
def fine_tune_model(input_texts, target_texts):
    # Split into training and validation data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        input_texts, target_texts, test_size=0.1
    )

    # Augment only the training split; augmenting the full input lists
    # would leak validation examples into the training set
    augmented_input_texts = train_texts + [
        paraphrase_with_model(text, model, tokenizer) for text in train_texts[:10]
    ]
    augmented_target_texts = train_labels + [
        paraphrase_with_model(text, model, tokenizer) for text in train_labels[:10]
    ]
    train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
    val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=10,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train
    trainer.train()

    # Save model and tokenizer
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
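
# --- Inference sketch (illustrative addition, not part of the original script) ---
# A minimal example, assuming fine_tune_model has already run and saved a
# checkpoint to "./fine_tuned_model" (the path used above). The helper name
# generate_from_finetuned is hypothetical, not from the source.
def generate_from_finetuned(text, checkpoint="./fine_tuned_model"):
    ft_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    ft_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    enc = ft_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Greedy decoding here for determinism; sampling parameters as in
    # paraphrase_with_model would also work
    out = ft_model.generate(enc["input_ids"], attention_mask=enc["attention_mask"], max_length=150)
    return ft_tokenizer.decode(out[0], skip_special_tokens=True)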
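
# --- Usage sketch (illustrative addition, not part of the original script) ---
# Hypothetical parallel input/target pairs; with test_size=0.1 the split
# needs enough examples to leave at least one for validation.
if __name__ == "__main__":
    sample_inputs = [f"Teach the following content: topic {i}" for i in range(20)]
    sample_targets = [f"A short lesson about topic {i}." for i in range(20)]
    fine_tune_model(sample_inputs, sample_targets)
    print(generate_from_finetuned("Teach the following content: topic 0"))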