Upload 5 files
Browse files
- README.md +4 -3
- app.py +30 -0
- document.txt +103 -0
- requirements.txt +7 -0
- t5_project_all_in_one.py +130 -0
README.md
CHANGED
@@ -1,3 +1,4 @@
----
-license:
----
+---
+license: mit
+tags:
+  - t5-small
+  - question-answering
+  - fine-tuning
+  - educational
+language:
+  - en
+emoji: 📚🤖
+---
+T5-Small Project Guide
+
+This repository by RemiAI3 provides a free educational resource for students to fine-tune the T5-small model for question-answering tasks. Detailed instructions are in document.txt.

app.py
ADDED
@@ -0,0 +1,30 @@
from flask import Flask, request, render_template
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

app = Flask(__name__)

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('./finetuned_t5')
tokenizer = T5Tokenizer.from_pretrained('./finetuned_t5')
model.eval()

@app.route('/', methods=['GET', 'POST'])
def index():
    answer = ""
    if request.method == 'POST':
        question = request.form['question']
        input_text = f"question: {question.strip()}"
        inputs = tokenizer(input_text, max_length=128, truncation=True, padding=True, return_tensors="pt")
        # Generate with beam search; run under no_grad since this is inference only
        with torch.no_grad():
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=64,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2
            )
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return render_template('index.html', answer=answer)

if __name__ == '__main__':
    app.run(debug=True)
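
Note: app.py renders a template named index.html that is not part of this upload. The route only requires a form field named "question" and an "answer" variable, so a minimal hypothetical templates/index.html (an illustrative sketch, not the repository's actual file) could look like:

```
<!DOCTYPE html>
<html>
<head><title>T5 Question Answering</title></head>
<body>
  <h1>Ask a question</h1>
  <form method="post">
    <input type="text" name="question" placeholder="Type your question..." required>
    <button type="submit">Ask</button>
  </form>
  {% if answer %}<p><strong>Answer:</strong> {{ answer }}</p>{% endif %}
</body>
</html>
```
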
document.txt
ADDED
@@ -0,0 +1,103 @@
T5-Small Project Guide
======================

Welcome to the T5-Small Project Guide by RemiAI3, a free educational resource for students to learn AI model fine-tuning using Hugging Face's T5-small model. This project enables students to build a question-answering system, such as answering questions about the Chola Empire, using open-source tools.

Objective
---------
Our goal is to provide accessible AI resources for students to experiment with and learn from, advancing RemiAI3's mission of democratizing AI education. The project is deliberately lightweight, avoiding the high costs of deploying large AI models such as text-to-image generators.

Prerequisites
-------------
- Python Version: Python 3.9 or higher (recommended: 3.10)
- Virtual Environment: Use `venv` to isolate dependencies
- Hugging Face Account: Sign up at https://huggingface.co to get an access token
- Dataset: A CSV or JSON file with question-answer pairs (a quick validation sketch follows this list). Example JSON format:
  ```json
  [
    {"input": "Who was the founder of the Chola Empire?", "response": "Vijayalaya Chola"},
    {"input": "What was the main military force of the Cholas?", "response": "Well-organized army and navy"},
    {"input": "What was a key administrative reform by the Cholas?", "response": "Efficient land revenue system"}
  ]
  ```
  CSV format (if used):
  ```csv
  input,response
  "Who was the founder of the Chola Empire?","Vijayalaya Chola"
  "What was the main military force of the Cholas?","Well-organized army and navy"
  ```
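
Before training, it can help to confirm that your dataset parses and uses the column names the scripts expect. Here is a minimal sketch, assuming your file is named dataset.json or dataset.csv; the script name and helper are hypothetical, not part of this repository:

```python
# sanity_check_dataset.py -- hypothetical helper, not included in this repo.
# Verifies that the dataset has the 'input' and 'response' fields that
# t5_project_all_in_one.py expects.
import json
import sys

import pandas as pd

def load_pairs(path):
    """Load question-answer pairs from a CSV, JSON array, or JSON Lines file."""
    if path.endswith('.csv'):
        return pd.read_csv(path).to_dict(orient='records')
    with open(path, encoding='utf-8') as f:
        text = f.read().strip()
    # A JSON array starts with '['; otherwise assume one JSON object per line
    if text.startswith('['):
        return json.loads(text)
    return [json.loads(line) for line in text.splitlines() if line.strip()]

if __name__ == '__main__':
    path = sys.argv[1] if len(sys.argv) > 1 else 'dataset.json'
    pairs = load_pairs(path)
    bad = [p for p in pairs if not p.get('input') or not p.get('response')]
    print(f"{len(pairs)} pairs loaded, {len(bad)} with missing fields")
```
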
Setup Instructions
------------------
1. Install Python: Download Python 3.10 from https://www.python.org/downloads/.
2. Clone the Repository:
   ```
   git clone https://huggingface.co/remiai3/t5-small-project-guide
   cd t5-small-project-guide
   ```
3. Create and Activate a Virtual Environment:
   ```
   python -m venv venv
   source venv/bin/activate  # On Windows: venv\Scripts\activate
   ```
4. Install Dependencies:
   ```
   pip install -r requirements.txt
   ```
5. Prepare Your Dataset: Place your `dataset.csv` or `dataset.json` in the project folder.
6. Set Hugging Face Token: Open `t5_project_all_in_one.py` and replace "YOUR_HUGGING_FACE_TOKEN" with your Hugging Face token (see the note on environment variables after this list).
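
Hardcoding a token in a source file makes it easy to commit by accident. A common alternative, sketched here under the assumption that you export an HF_TOKEN environment variable yourself, is to read it at runtime:

```python
# Hypothetical variant of Step 1 in t5_project_all_in_one.py:
# read the token from the HF_TOKEN environment variable instead of
# hardcoding it in the script.
import os
from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("Set the HF_TOKEN environment variable with your Hugging Face token")
login(token=hf_token)
```
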
Running the Project
-------------------
1. Fine-Tune the Model:
   Run the all-in-one script to convert the dataset (if CSV), preprocess it, download the model, and fine-tune:
   ```
   python t5_project_all_in_one.py
   ```
   This will:
   - Convert CSV to JSON (if needed)
   - Preprocess the dataset
   - Download T5-small weights
   - Fine-tune the model
   - Save the fine-tuned model to `./finetuned_t5`
   - Generate a plot of training and validation loss (`training_metrics.png`)
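
Once training finishes, you can query the saved model directly from Python. This sketch mirrors the inference settings in app.py (the "question:" prefix, beam search parameters, and decoding come from that file); the example question comes from the sample dataset:

```python
# Quick test of the fine-tuned model saved in ./finetuned_t5
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

model = T5ForConditionalGeneration.from_pretrained('./finetuned_t5')
tokenizer = T5Tokenizer.from_pretrained('./finetuned_t5')
model.eval()

question = "Who was the founder of the Chola Empire?"
inputs = tokenizer(f"question: {question}", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=64,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
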
Project Files
-------------
- t5_project_all_in_one.py: Single script for dataset conversion, preprocessing, model downloading, and fine-tuning.
- app.py: Minimal Flask app for querying the fine-tuned model in a browser.
- requirements.txt: Lists the required Python libraries.
- document.txt: This file, with detailed instructions.
- README.md: Model configuration and repo overview.

Libraries and Versions
----------------------
- transformers==4.44.2
- datasets==3.0.1
- torch==2.4.1
- pandas==2.2.3
- matplotlib==3.9.2
- accelerate==1.0.1
- huggingface_hub==0.26.0

Documentation
-------------
- Hugging Face Transformers: https://huggingface.co/docs/transformers
- Datasets Library: https://huggingface.co/docs/datasets
- T5 Model: https://huggingface.co/docs/transformers/model_doc/t5
- Pandas: https://pandas.pydata.org/docs
- Matplotlib: https://matplotlib.org/stable/contents.html
- Accelerate: https://huggingface.co/docs/accelerate

Troubleshooting
---------------
- Inaccurate Answers: Ensure your dataset has 500+ clean question-answer pairs. Increase `num_train_epochs` or tune `learning_rate` in `t5_project_all_in_one.py`.
- Token Errors: Verify that the Hugging Face token in `t5_project_all_in_one.py` is correct.
- Library Issues: Reinstall dependencies with `pip install -r requirements.txt`. Note that `T5Tokenizer` also needs the sentencepiece package; if loading the tokenizer fails with an import error, run `pip install sentencepiece`.

Contributing
------------
Fork the repository, make changes, and submit a pull request at https://huggingface.co/remiai3/t5-small-project-guide.

About RemiAI3
-------------
RemiAI3 is committed to providing free AI educational resources that empower students. By using this project, you're supporting our mission and helping build our brand for future AI innovations.

requirements.txt
ADDED
@@ -0,0 +1,7 @@
transformers==4.44.2
datasets==3.0.1
torch==2.4.1
pandas==2.2.3
matplotlib==3.9.2
accelerate==1.0.1
huggingface_hub==0.26.0

t5_project_all_in_one.py
ADDED
@@ -0,0 +1,130 @@
# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt

# Step 1: Log in to Hugging Face
# Students: Replace "YOUR_HUGGING_FACE_TOKEN" with your actual Hugging Face token from https://huggingface.co/settings/tokens
hf_token = "YOUR_HUGGING_FACE_TOKEN"
if not hf_token or hf_token == "YOUR_HUGGING_FACE_TOKEN":
    raise ValueError("Please replace 'YOUR_HUGGING_FACE_TOKEN' in the code with your actual Hugging Face token")
login(token=hf_token)
print("Logged in to Hugging Face successfully")

# Step 2: Load and convert dataset
# Students: Replace "dataset.csv" or "dataset.json" with your dataset file name
dataset_name = "dataset.csv"  # Change to "dataset.json" if using JSON
dataset_path = dataset_name
if dataset_name.endswith('.csv'):
    # Convert CSV to JSON Lines for consistency
    print(f"Converting {dataset_name} to JSON format...")
    df = pd.read_csv(dataset_path)
    df.to_json('dataset.json', orient='records', lines=True)
    dataset_path = 'dataset.json'

# Load dataset
print(f"Loading dataset from {dataset_path}...")
dataset = load_dataset('json', data_files=dataset_path)

# Step 3: Split dataset into training and validation
# 85% training, 15% validation to monitor model performance
print("Splitting dataset into training and validation sets...")
train_test_split = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Step 4: Download and load tokenizer and model
print("Downloading T5-small model and tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Save model weights locally for fine-tuning
model.save_pretrained('./t5_small_weights')
tokenizer.save_pretrained('./t5_small_weights')
print("Model and tokenizer saved to './t5_small_weights'")

# Step 5: Preprocess dataset
# This ensures the input questions and answers are properly tokenized for T5
def preprocess_data(examples):
    # Add "question:" prefix to inputs and clean whitespace
    inputs = ["question: " + q.strip() for q in examples['input']]
    targets = [r.strip() for r in examples['response']]
    # Tokenize inputs (questions)
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    # Tokenize labels (answers)
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')
    # Replace pad token IDs in labels with -100 to ignore them in loss calculation
    model_inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]
    return model_inputs

# Apply preprocessing to training and validation datasets
print("Preprocessing datasets...")
processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])
processed_eval_dataset = eval_dataset.map(preprocess_data, batched=True, remove_columns=['input', 'response'])

# Step 6: Define training arguments
# These settings control how the model is fine-tuned
training_args = TrainingArguments(
    output_dir='./results',             # Directory to save training outputs
    num_train_epochs=10,                # Number of passes over the dataset
    per_device_train_batch_size=2,      # Batch size per device (GPU/CPU)
    gradient_accumulation_steps=2,      # Accumulate gradients to simulate a larger batch size
    learning_rate=3e-4,                 # Learning rate for optimization
    save_steps=500,                     # Save a checkpoint every 500 steps
    save_total_limit=2,                 # Keep only the last 2 checkpoints
    logging_steps=50,                   # Log training metrics every 50 steps
    eval_strategy="steps",              # Evaluate during training at regular intervals
    eval_steps=100,                     # Evaluate every 100 steps
    load_best_model_at_end=True,        # Reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # Use validation loss to select the best model
    greater_is_better=False,            # Lower validation loss is better
    gradient_checkpointing=True,        # Trade compute for memory during training
    max_grad_norm=1.0,                  # Clip gradients to prevent exploding gradients
)

# Step 7: Initialize Trainer
# The Trainer handles the fine-tuning loop
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Step 8: Train the model
print("Starting training...")
trainer.train()
print("Training finished.")

# Step 9: Plot training and validation loss
# This helps students visualize model performance
print("Generating training and validation loss plot...")
logs = trainer.state.log_history
# Collect train and eval steps separately so each curve lines up with its own x values
train_steps = [log['step'] for log in logs if 'loss' in log]
train_loss = [log['loss'] for log in logs if 'loss' in log]
eval_steps = [log['step'] for log in logs if 'eval_loss' in log]
eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]
plt.figure(figsize=(10, 5))
if train_loss:
    plt.plot(train_steps, train_loss, label='Training Loss')
if eval_loss:
    plt.plot(eval_steps, eval_loss, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True)
plt.savefig('training_metrics.png')
plt.show()

# Step 10: Save the fine-tuned model
final_model_save_path = './finetuned_t5'
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"Model fine-tuned and saved to '{final_model_save_path}'")
print("Training metrics plot saved as 'training_metrics.png'")
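
Note on the CSV conversion in Step 2: `to_json(..., orient='records', lines=True)` writes JSON Lines (one object per line) rather than the JSON array format shown in document.txt; `load_dataset('json', ...)` accepts both. For the sample data, the converted file would contain lines like:

```
{"input":"Who was the founder of the Chola Empire?","response":"Vijayalaya Chola"}
{"input":"What was the main military force of the Cholas?","response":"Well-organized army and navy"}
```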