# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt

# Step 1: Log in to Hugging Face
# Students: Replace "YOUR_HUGGING_FACE_TOKEN" with your actual Hugging Face token
# from https://huggingface.co/settings/tokens
hf_token = "YOUR_HUGGING_FACE_TOKEN"
if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":
    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' in the code with your actual Hugging Face token")
login(token=hf_token)
print("Logged in to Hugging Face successfully")

# Step 2: Load and convert the dataset
# Students: Replace "dataset.csv" or "dataset.json" with your dataset file name
dataset_name = "dataset.csv"  # Change to "dataset.json" if using JSON
dataset_path = dataset_name
if dataset_name.endswith('.csv'):
    # Convert CSV to JSON Lines for consistency
    print(f"Converting {dataset_name} to JSON format...")
    df = pd.read_csv(dataset_path)
    df.to_json('dataset.json', orient='records', lines=True)
    dataset_path = 'dataset.json'

# Load the dataset
print(f"Loading dataset from {dataset_path}...")
dataset = load_dataset('json', data_files=dataset_path)

# Step 3: Split the dataset into training and validation sets
# 85% training, 15% validation to monitor model performance
print("Splitting dataset into training and validation sets...")
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Step 4: Download and load the tokenizer and model
print("Downloading T5-small model and tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Save the pretrained weights locally before fine-tuning
model.save_pretrained('./t5_small_weights')
tokenizer.save_pretrained('./t5_small_weights')
print("Model and tokenizer saved to './t5_small_weights'")

# Step 5: Preprocess the dataset
# This ensures the input questions and answers are properly tokenized for T5
def preprocess_data(examples):
    # Add the "question: " prefix to inputs and strip surrounding whitespace
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]
    # Tokenize inputs (questions)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    # Tokenize labels (answers)
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
    # Replace pad token IDs in labels with -100 so they are ignored in the loss calculation
    model_inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels['input_ids']
    ]
    return model_inputs

# Apply preprocessing to the training and validation datasets
print("Preprocessing datasets...")
processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])

# Step 6: Define training arguments
# These settings control how the model is fine-tuned
training_args = TrainingArguments(
    output_dir='./results',             # Directory to save training outputs
    num_train_epochs=10,                # Number of passes over the training set
    per_device_train_batch_size=2,      # Batch size per device (GPU/CPU)
    gradient_accumulation_steps=2,      # Accumulate gradients to simulate a larger batch size
    learning_rate=3e-4,                 # Learning rate for optimization
    save_steps=500,                     # Save a model checkpoint every 500 steps
    save_total_limit=2,                 # Keep only the last 2 checkpoints
    logging_steps=50,                   # Log training metrics every 50 steps
    eval_strategy="steps",              # Evaluate the model during training at regular intervals
    eval_steps=100,                     # Evaluate every 100 steps
    load_best_model_at_end=True,        # Load the best checkpoint (by validation loss) at the end
    metric_for_best_model="eval_loss",  # Use validation loss to select the best model
    greater_is_better=False,            # Lower validation loss is better
    gradient_checkpointing=True,        # Trade extra compute for lower memory use during training
    max_grad_norm=1.0,                  # Clip gradients to prevent exploding gradients
)

# Step 7: Initialize the Trainer
# The Trainer handles the fine-tuning loop
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Step 8: Train the model
print("Starting training...")
trainer.train()
print("Training finished.")

# Step 9: Plot training and validation loss
# This helps students visualize model performance
print("Generating training and validation loss plot...")
logs = trainer.state.log_history
# Pair each loss value with the step it was logged at
train_steps = [log['step'] for log in logs if 'loss' in log]
train_loss = [log['loss'] for log in logs if 'loss' in log]
eval_steps_logged = [log['step'] for log in logs if 'eval_loss' in log]
eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]

plt.figure(figsize=(10, 5))
if train_loss:
    plt.plot(train_steps, train_loss, label='Training Loss')
if eval_loss:
    plt.plot(eval_steps_logged, eval_loss, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True)
plt.savefig('training_metrics.png')
plt.show()

# Step 10: Save the fine-tuned model
final_model_save_path = './finetuned_t5'
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"Model fine-tuned and saved to '{final_model_save_path}'")
print("Training metrics plot saved as 'training_metrics.png'")
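
# Optional: quick inference sanity check (a minimal sketch, not part of the original
# training flow). It assumes the fine-tuned model saved above and reuses the same
# "question: " prefix applied during preprocessing; the sample question and the
# generation settings (max_length, num_beams) are illustrative, not tuned values.
loaded_tokenizer = T5Tokenizer.from_pretrained(final_model_save_path)
loaded_model = T5ForConditionalGeneration.from_pretrained(final_model_save_path)
loaded_model.eval()

sample_question = "question: What topics does this course cover?"  # hypothetical example input
input_ids = loaded_tokenizer(sample_question, return_tensors='pt').input_ids
with torch.no_grad():
    output_ids = loaded_model.generate(input_ids, max_length=64, num_beams=4)
print("Sample answer:", loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))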