"""Fine-tune the T5 checkpoint in ./t5_small_weights on dataset.json.

Each record of dataset.json is expected to provide an 'input' string (the
question) and a 'response' string (the target text).
"""

from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Load dataset from JSON
dataset = load_dataset('json', data_files='dataset.json')

# Split dataset into training and validation sets: 15% is held out for
# evaluation, and the seed is fixed so the split is the same on every run.
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('./t5_small_weights')


# Preprocess dataset
def preprocess_data(examples, *, max_input_length=128, max_target_length=64):
    """Tokenize a batch of question/response pairs for seq2seq training.

    Args:
        examples: Batch mapping with an 'input' and a 'response' column, each
            a list of strings (the function is mapped with ``batched=True``).
        max_input_length: Length every tokenized question is truncated and
            padded to.
        max_target_length: Length every tokenized response is truncated and
            padded to.

    Returns:
        The tokenizer output for the prefixed questions, with an additional
        'labels' entry holding the response token ids in which every padding
        position has been replaced by -100.
    """
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
    )
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding='max_length',
    )

    # Padding positions must not contribute to the loss; -100 is the label
    # value the model's loss computation ignores. The pad id is read once
    # here instead of once per token inside the nested comprehension.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs


# Apply preprocessing; the raw text columns are dropped once tokenized.
processed_train_dataset = train_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=['input', 'response'],
)
processed_eval_dataset = eval_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=['input', 'response'],
)

# Load model
model = T5ForConditionalGeneration.from_pretrained('./t5_small_weights')

# Define training arguments
# NOTE(review): evaluation runs every 100 steps but save_steps is 500, so
# checkpoints are written far less often than the model is evaluated.
# Confirm this cadence is what load_best_model_at_end is meant to work with.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    # 2 samples per device x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,  # Slightly increased for better convergence
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=100,
    # Select the best checkpoint by lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Train the model
print("Starting training...")
trainer.train()
# Training has returned at this point; report it before writing anything.
print("Training finished.")

# Save the fine-tuned model. The model and the tokenizer are both written to
# the same output directory, model first.
final_model_save_path = './finetuned_t5_improved'
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_save_path)

print(f"Model fine-tuned and saved to '{final_model_save_path}'")