# t5-small-project-guide / individual_steps / install_and_preprocess.py
# (source: remiai3's Hugging Face upload, commit f9396fc, 929 bytes)
import os
# Install sentencepiece at runtime: T5Tokenizer requires it for its
# SentencePiece vocabulary. NOTE(review): a runtime `pip install` via
# os.system is fragile — prefer listing sentencepiece in requirements.txt.
os.system("pip install sentencepiece")
from datasets import load_dataset
from transformers import T5Tokenizer
# Load dataset from a local JSON file. The records must provide 'input'
# and 'response' fields (read by preprocess_function below). load_dataset
# with data_files returns a DatasetDict — presumably keyed by 'train'.
dataset = load_dataset('json', data_files='dataset.json')
# Load T5 tokenizer
# Downloads/caches the pretrained 't5-small' vocabulary from the HF Hub.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of input/response pairs for T5 fine-tuning.

    Args:
        examples: batch dict (from Dataset.map with batched=True) holding
            parallel string lists under 'input' and 'response'.

    Returns:
        dict with 'input_ids', 'attention_mask', and 'labels'. Label
        positions that correspond to padding are set to -100 so the
        cross-entropy loss (ignore_index=-100) skips them during training.
    """
    # T5 is a text-to-text model; a task prefix tells it what to do.
    inputs = ["question: " + q for q in examples['input']]
    targets = examples['response']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=32, truncation=True, padding='max_length')
    # Bug fix: the original kept pad token ids in the labels, so the loss
    # penalized padded positions. Mask them with -100 (the ignore_index
    # used by transformers' seq2seq loss) per the HF T5 training docs.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels['input_ids']
    ]
    return model_inputs
# Apply preprocessing
# batched=True hands preprocess_function dicts of lists (whole batches),
# which lets the tokenizer encode many strings per call.
processed_dataset = dataset.map(preprocess_function, batched=True)
# Save processed dataset
# Writes the tokenized DatasetDict to disk (Arrow format) so later
# training steps can reload it with load_from_disk.
processed_dataset.save_to_disk('processed_dataset')
print("Dataset preprocessed and saved to 'processed_dataset'")