# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt
# Step 1: Log in to Hugging Face
# Students: Replace "YOUR_HUGGING_FACE_TOKEN" with your actual Hugging Face token from https://huggingface.co/settings/tokens
hf_token = "YOUR_HUGGING_FACE_TOKEN"
if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":
    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' in the code with your actual Hugging Face token")
login(token=hf_token)
print("Logged in to Hugging Face successfully")
# Step 2: Load and convert dataset
# Students: Replace "dataset.csv" or "dataset.json" with your dataset file name
dataset_name = "dataset.csv" # Change to "dataset.json" if using JSON
dataset_path = dataset_name
if dataset_name.endswith('.csv'):
    # Convert CSV to JSON for consistency
    print(f"Converting {dataset_name} to JSON format...")
    df = pd.read_csv(dataset_path)
    df.to_json('dataset.json', orient='records', lines=True)
    dataset_path = 'dataset.json'
# Load dataset
print(f"Loading dataset from {dataset_path}...")
dataset = load_dataset('json', data_files=dataset_path)
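# Optional sanity check: confirm the dataset loaded with the expected 'input' and 'response'
# columns (the names assumed by the preprocessing step below) before going further.
print(f"Dataset columns: {dataset['train'].column_names}")
print(f"Number of examples: {len(dataset['train'])}")
print(f"First example: {dataset['train'][0]}")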
# Step 3: Split dataset into training and validation
# 85% training, 15% validation to monitor model performance
print("Splitting dataset into training and validation sets...")
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
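# Report the resulting split sizes so you can verify the 85/15 split.
print(f"Training examples: {len(train_dataset)}, validation examples: {len(eval_dataset)}")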
# Step 4: Download and load tokenizer and model
print("Downloading T5-small model and tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Save model weights locally for fine-tuning
model.save_pretrained('./t5_small_weights')
tokenizer.save_pretrained('./t5_small_weights')
print("Model and tokenizer saved to './t5_small_weights'")
# Step 5: Preprocess dataset
# This ensures the input questions and answers are properly tokenized for T5
def preprocess_data(examples):
# Add "question:" prefix to inputs and clean whitespace
inputs = ["question: " + q.strip() for q in examples['input']]
targets = [r.strip() for r in examples['response']]
# Tokenize inputs (questions)
model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
# Tokenize labels (answers)
labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
# Replace pad token IDs in labels with -100 to ignore them in loss calculation
model_inputs['labels'] = [
[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
]
return model_inputs
# Apply preprocessing to training and validation datasets
print("Preprocessing datasets...")
processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
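# Optional check: inspect one preprocessed example to confirm that padded label positions
# were replaced with -100 (and are therefore ignored by the loss).
example = processed_train_dataset[0]
print(f"Input IDs (first 20): {example['input_ids'][:20]}")
print(f"Labels (first 20): {example['labels'][:20]}")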
# Step 6: Define training arguments
# These settings control how the model is fine-tuned
training_args = TrainingArguments(
    output_dir='./results',              # Directory to save training outputs
    num_train_epochs=10,                 # Number of training passes over the dataset
    per_device_train_batch_size=2,       # Batch size per device (GPU/CPU)
    gradient_accumulation_steps=2,       # Accumulate gradients to simulate a larger batch size
    learning_rate=3e-4,                  # Learning rate for optimization
    save_steps=500,                      # Save a model checkpoint every 500 steps
    save_total_limit=2,                  # Keep only the last 2 checkpoints
    logging_steps=50,                    # Log training metrics every 50 steps
    eval_strategy="steps",               # Evaluate during training at regular intervals
                                         # (called evaluation_strategy in older transformers versions)
    eval_steps=100,                      # Evaluate every 100 steps
    load_best_model_at_end=True,         # Load the best model based on validation loss
    metric_for_best_model="eval_loss",   # Use validation loss to select the best model
    greater_is_better=False,             # Lower validation loss is better
    gradient_checkpointing=True,         # Trade compute for memory during training
    max_grad_norm=1.0,                   # Clip gradients to prevent exploding gradients
)
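# Note: with per_device_train_batch_size=2 and gradient_accumulation_steps=2, the effective
# batch size is 4 examples per optimizer update.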
# Step 7: Initialize Trainer
# The Trainer handles the fine-tuning process
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)
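# Note: because preprocessing padded every sequence to a fixed length, the Trainer's default
# data collator can simply stack the examples into tensors. If you switch to dynamic padding,
# pass data_collator=DataCollatorForSeq2Seq(tokenizer, model=model) instead.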
# Step 8: Train the model
print("Starting training...")
trainer.train()
print("Training finished.")
# Step 9: Plot training and validation loss
# This helps students visualize model performance
print("Generating training and validation loss plot...")
logs = trainer.state.log_history
# Collect training-loss and validation-loss points separately so each series is plotted
# against its own steps (mixing them would misalign the x-axis).
train_steps = [log['step'] for log in logs if 'loss' in log]
train_loss = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]
plt.figure(figsize=(10, 5))
if train_loss:
    plt.plot(train_steps, train_loss, label='Training Loss')
if eval_loss:
    plt.plot(eval_steps, eval_loss, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True)
plt.savefig('training_metrics.png')
plt.show()
# Step 10: Save the fine-tuned model
final_model_save_path = './finetuned_t5'
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"Model fine-tuned and saved to '{final_model_save_path}'")
print("Training metrics plot saved as 'training_metrics.png'") |