from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prepare the dataset for training
def prepare_data(input_texts, target_texts, tokenizer):
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in targets["input_ids"]
    ]
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Paraphrasing function
def paraphrase_with_model(text, model, tokenizer):
    prompt = "Teach the following content: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,          # Deterministic decoding
        num_beams=4,              # Beam search, so early_stopping has an effect
        max_length=150,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Fine-tuning function
def fine_tune_model(input_texts, target_texts):
    # Split data into training and validation sets
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        input_texts, target_texts, test_size=0.1
    )

    # Data augmentation with paraphrasing (training split only, so validation data does not leak into training)
    augmented_input_texts = train_texts + [paraphrase_with_model(text, model, tokenizer) for text in train_texts[:10]]
    augmented_target_texts = train_labels + [paraphrase_with_model(text, model, tokenizer) for text in train_labels[:10]]

    train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
    val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=10,
    )

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Training
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
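
# --- Example usage (hypothetical) ---
# A minimal sketch showing how the functions above could be called end to end.
# The sample sentences below are placeholder data invented for illustration, not
# from the original source; in practice you would load your own input/target pairs.
if __name__ == "__main__":
    sample_inputs = [
        "Photosynthesis converts light energy into chemical energy stored in glucose.",
        "Newton's second law states that force equals mass times acceleration.",
    ] * 10  # repeated so the train/validation split has enough examples
    sample_targets = [
        "Explain how plants turn sunlight into the sugar they use for energy.",
        "Explain that the force on an object equals its mass times its acceleration.",
    ] * 10

    # Fine-tune and save the model to ./fine_tuned_model
    fine_tune_model(sample_inputs, sample_targets)

    # Reload the saved model and run generation with it
    tuned_tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
    tuned_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
    print(paraphrase_with_model(
        "The water cycle describes how water moves through the environment.",
        tuned_model, tuned_tokenizer,
    ))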