import subprocess
import sys

# The T5 tokenizer depends on the sentencepiece package; installing it via the
# current interpreter avoids targeting the wrong environment.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentencepiece'])
|
from datasets import load_dataset
|
|
from transformers import T5Tokenizer
|
|
|
|
|
|

# Load the local JSON file; load_dataset returns a DatasetDict with a
# single 'train' split by default.
dataset = load_dataset('json', data_files='dataset.json')

# t5-small ships a SentencePiece-based tokenizer, which is why the
# sentencepiece package installed above is required.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    # Prefix each question with the 'question: ' task prompt T5 expects.
    inputs = ['question: ' + q for q in examples['input']]
    targets = examples['response']

    # Tokenize inputs and targets to fixed lengths so batches stay rectangular.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=32, truncation=True, padding='max_length')

    # Mask padding positions in the labels with -100 so the cross-entropy
    # loss ignores them during fine-tuning.
    model_inputs['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels['input_ids']
    ]
    return model_inputs
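
# Quick sanity check on a single made-up example (the sample text here is
# illustrative, not from the original dataset): every input is padded to
# exactly 128 tokens and every label to exactly 32.
sample = preprocess_function({'input': ['What is T5?'],
                              'response': ['A text-to-text transformer.']})
assert len(sample['input_ids'][0]) == 128
assert len(sample['labels'][0]) == 32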

# batched=True hands the function dicts of lists, so whole batches are
# tokenized in a single tokenizer call.
processed_dataset = dataset.map(preprocess_function, batched=True)

processed_dataset.save_to_disk('processed_dataset')
print("Dataset preprocessed and saved to 'processed_dataset'")
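
# Round-trip check (a minimal sketch; assumes the save above succeeded and the
# default 'train' split): reload the processed dataset and confirm the
# tokenized columns survived the trip to disk.
from datasets import load_from_disk

reloaded = load_from_disk('processed_dataset')
print(reloaded['train'].column_names)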