import subprocess
import sys

# The T5 tokenizer depends on the sentencepiece package; installing it via the
# current interpreter avoids targeting the wrong environment.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentencepiece'])
|
from datasets import load_dataset
|
|
from transformers import T5Tokenizer
|
|
|
|
|
|

# Load the local JSON file; load_dataset returns a DatasetDict with a
# single 'train' split by default.
dataset = load_dataset('json', data_files='dataset.json')

# t5-small ships a SentencePiece-based tokenizer, which is why the
# sentencepiece package installed above is required.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    # Prefix each question with the 'question: ' task prompt T5 expects.
    inputs = ['question: ' + q for q in examples['input']]
    targets = examples['response']

    # Tokenize inputs and targets to fixed lengths so batches stay rectangular.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=32, truncation=True, padding='max_length')

    # Mask padding positions in the labels with -100 so the cross-entropy
    # loss ignores them during fine-tuning.
    model_inputs['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels['input_ids']
    ]
    return model_inputs
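
# Quick sanity check on a single made-up example (the sample text here is
# illustrative, not from the original dataset): every input is padded to
# exactly 128 tokens and every label to exactly 32.
sample = preprocess_function({'input': ['What is T5?'],
                              'response': ['A text-to-text transformer.']})
assert len(sample['input_ids'][0]) == 128
assert len(sample['labels'][0]) == 32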

# batched=True hands the function dicts of lists, so whole batches are
# tokenized in a single tokenizer call.
processed_dataset = dataset.map(preprocess_function, batched=True)

processed_dataset.save_to_disk('processed_dataset')
print("Dataset preprocessed and saved to 'processed_dataset'")
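
# Round-trip check (a minimal sketch; assumes the save above succeeded and the
# default 'train' split): reload the processed dataset and confirm the
# tokenized columns survived the trip to disk.
from datasets import load_from_disk

reloaded = load_from_disk('processed_dataset')
print(reloaded['train'].column_names)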