|
from datasets import load_dataset
|
|
from transformers import AutoTokenizer
|
|
|
|
|
|
# Model/tokenizer source passed to from_pretrained.
# NOTE(review): "/falcon-7b" looks like a local filesystem path; the Hugging
# Face Hub id would be "tiiuae/falcon-7b" — confirm which is intended.
MODEL_NAME = "/falcon-7b"

# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
|
|
|
|
dataset = load_dataset("json", data_files="tax_train_data.json")
|
|
|
|
|
|
def preprocess_function(examples):
    """Tokenize a batch of prompt/response pairs into model inputs and labels.

    Args:
        examples: Batch dict (from ``datasets.map(batched=True)``) with
            ``"prompt"`` and ``"response"`` lists of strings.

    Returns:
        dict: Tokenizer output for the prompts (``input_ids``,
        ``attention_mask``) plus a ``"labels"`` key holding the tokenized
        responses, with every padding position replaced by -100 so it is
        ignored by the cross-entropy loss.
    """
    inputs = examples["prompt"]
    targets = examples["response"]

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)

    # Tokenize the targets via text_target so tokenizers that distinguish
    # source vs. target processing handle the responses correctly.
    labels = tokenizer(text_target=targets, padding="max_length", truncation=True, max_length=512)

    # Mask padding in the labels: pad-token ids would otherwise contribute to
    # the training loss. -100 is the ignore_index used by HF loss functions.
    # NOTE(review): assumes tokenizer.pad_token_id is set (Falcon's tokenizer
    # has no pad token by default — verify it is configured upstream).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]

    return model_inputs
|
|
|
|
|
|
processed_dataset = dataset.map(preprocess_function, batched=True)
|
|
|
|
|
|
processed_dataset.save_to_disk("processed_dataset.json")
|
|
|