# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt
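
# Optional sketch (not one of the numbered steps below): report whether a GPU is available.
# The Trainer picks the device automatically, so this line is informational only.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")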

# Step 1: Log in to Hugging Face
# Students: Replace "YOUR_HUGGING_FACE_TOKEN" with your actual Hugging Face token from https://huggingface.co/settings/tokens
hf_token = "YOUR_HUGGING_FACE_TOKEN"
if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":
    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' in the code with your actual Hugging Face token")
login(token=hf_token)
print("Logged in to Hugging Face successfully")

# Step 2: Load and convert dataset
# Students: Replace "dataset.csv" or "dataset.json" with your dataset file name
dataset_name = "dataset.csv"  # Change to "dataset.json" if using JSON
dataset_path = dataset_name
if dataset_name.endswith('.csv'):
    # Convert CSV to JSON for consistency
    print(f"Converting {dataset_name} to JSON format...")
    df = pd.read_csv(dataset_path)
    df.to_json('dataset.json', orient='records', lines=True)
    dataset_path = 'dataset.json'

# Load dataset
print(f"Loading dataset from {dataset_path}...")
dataset = load_dataset('json', data_files=dataset_path)
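
# Optional sanity check (a sketch): the preprocessing step below assumes the dataset has
# 'input' and 'response' columns, so fail early with a clear message if they are missing.
missing_columns = {'input', 'response'} - set(dataset['train'].column_names)
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {missing_columns}")
print(f"Loaded {len(dataset['train'])} examples")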

# Step 3: Split dataset into training and validation
# 85% training, 15% validation to monitor model performance
print("Splitting dataset into training and validation sets...")
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
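# Show the resulting split sizes (optional, for quick verification)
print(f"Training examples: {len(train_dataset)}, validation examples: {len(eval_dataset)}")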

# Step 4: Download and load tokenizer and model
print("Downloading T5-small model and tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('t5-small')  # requires the sentencepiece package
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Save model weights locally for fine-tuning
model.save_pretrained('./t5_small_weights')
tokenizer.save_pretrained('./t5_small_weights')
print("Model and tokenizer saved to './t5_small_weights'")

# Step 5: Preprocess dataset
# This ensures the input questions and answers are properly tokenized for T5
def preprocess_data(examples):
    # Add "question:" prefix to inputs and clean whitespace
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]
    # Tokenize inputs (questions)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    # Tokenize labels (answers)
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
    # Replace pad token IDs in labels with -100 to ignore them in loss calculation
    model_inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]
    return model_inputs

# Apply preprocessing to training and validation datasets
print("Preprocessing datasets...")
processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
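# Optional: inspect the keys of one preprocessed example (a quick check, not required for
# training); you should see 'input_ids', 'attention_mask', and 'labels'.
print(f"Preprocessed example keys: {list(processed_train_dataset[0].keys())}")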

# Step 6: Define training arguments
# These settings control how the model is fine-tuned
training_args = TrainingArguments(
    output_dir='./results',                # Directory to save training outputs
    num_train_epochs=10,                   # Number of full passes (epochs) over the training data
    per_device_train_batch_size=2,         # Batch size per device (GPU/CPU)
    gradient_accumulation_steps=2,         # Accumulate gradients to simulate a larger batch size
    learning_rate=3e-4,                    # Learning rate for optimization
    save_steps=500,                        # Save model checkpoint every 500 steps
    save_total_limit=2,                    # Keep only the last 2 checkpoints
    logging_steps=50,                      # Log training metrics every 50 steps
    eval_strategy="steps",                 # Evaluate model during training at regular intervals
    eval_steps=100,                        # Evaluate every 100 steps
    load_best_model_at_end=True,           # Load the best model based on validation loss
    metric_for_best_model="eval_loss",     # Use validation loss to select best model
    greater_is_better=False,               # Lower validation loss is better
    gradient_checkpointing=True,           # Save memory during training
    max_grad_norm=1.0,                     # Clip gradients to prevent exploding gradients
)
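
# Optional alternative (a sketch): instead of padding every example to max_length in the
# preprocessing step, you could pass a DataCollatorForSeq2Seq to the Trainer for dynamic
# per-batch padding, which is usually faster on short inputs:
#   from transformers import DataCollatorForSeq2Seq
#   data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# and then add data_collator=data_collator to the Trainer call below.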

# Step 7: Initialize Trainer
# The Trainer handles the fine-tuning process
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Step 8: Train the model
print("Starting training...")
trainer.train()
print("Training finished.")

# Step 9: Plot training and validation loss
# This helps students visualize model performance
print("Generating training and validation loss plot...")
logs = trainer.state.log_history
# Collect steps separately for the training and validation curves so each loss value is
# plotted against the step at which it was actually logged
train_steps = [log['step'] for log in logs if 'loss' in log]
train_loss = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]
plt.figure(figsize=(10, 5))
if train_loss:
    plt.plot(train_steps, train_loss, label='Training Loss')
if eval_loss:
    plt.plot(eval_steps, eval_loss, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True)
plt.savefig('training_metrics.png')
plt.show()

# Step 10: Save the fine-tuned model
final_model_save_path = './finetuned_t5'
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"Model fine-tuned and saved to '{final_model_save_path}'")
print("Training metrics plot saved as 'training_metrics.png'")