Tokenizer causes issues in fine-tuning because of added special tokens like <|X|>

#67
by LazerJesus

I'll run through my setup and then get to the problem.

I am setting up the config, tokenizer, model, and PEFT model:

from peft import LoraConfig, TaskType
import torch

CHATPATH = "/notebooks/starchat-beta"
BASEPATH = "/notebooks/starcoderplus"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

peftconfig = LoraConfig(
    base_model_name_or_path=BASEPATH,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_proj", "c_attn", "q_attn"],
    bias="none",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.01
)

from transformers import AutoTokenizer

system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"

tokenizer = AutoTokenizer.from_pretrained(BASEPATH)
tokenizer.pad_token = tokenizer.eos_token
added_tokens = tokenizer.add_special_tokens({"additional_special_tokens": [system_token, user_token, assistant_token, end_token]})

print("tokenizer.vocab_size", tokenizer.vocab_size, added_tokens)
> tokenizer.vocab_size 49152 0
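Note that vocab_size only reports the base vocabulary and never includes added tokens, while len(tokenizer) does include them. A quick sanity check (a sketch using the standard tokenizer accessors):

print("len(tokenizer):", len(tokenizer))            # base vocab + added tokens
print("added vocab:", tokenizer.get_added_vocab())  # maps each added token to its id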

from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    BASEPATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # device_map already places the model, so no extra .to(DEVICE)
)

freeze_model(model)
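freeze_model is a small helper of mine; roughly it just disables gradients on the base model so only the adapter trains:

def freeze_model(model):
    # turn off gradients for every base-model parameter;
    # only the LoRA adapter weights should train
    for param in model.parameters():
        param.requires_grad = False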

from peft import get_peft_model

peftmodel = get_peft_model(model, peftconfig)
peftmodel.resize_token_embeddings(len(tokenizer))

Now we have the PEFT model and the tokenizer set up. Notice that even though I add the special tokens, the add_special_tokens call reports 0 tokens added.

I continue by setting up the data:

import pandas as pd
from datasets import Dataset

system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"
system_msg = "X"

def prepare_dialogue(row):
    # print(row)
    prompt = system_token + "\n" + system_msg + end_token + "\n"
    prompt += user_token + "\n" + row["prompt"] + end_token + "\n"
    prompt += assistant_token + "\n" + row["completion"] + end_token + "\n"
    row["dialogue"] = prompt
    return row

def strip_quotes(val):
    # strip stray double quotes from string values (currently unused below)
    return val.strip('"') if isinstance(val, str) else val

def prepare_row(row):
    # strip the stray single quotes / semicolons that the csv export left in each field
    for col in row.index:
        row[col] = row[col].strip("'").strip("';")
    return prepare_dialogue(row)

def prepare_data(data):
    # the raw headers come in with quote debris, so normalize the column names first
    data.rename(columns={"'completion';": 'completion', "'prompt'": 'prompt'}, inplace=True)
    data = data.apply(prepare_row, axis=1)
    return data

def load_data(path):
    data = pd.read_csv(path, delimiter=";", quotechar="'", skipinitialspace=True)
    return Dataset.from_pandas(prepare_data(data))

trainingdata = load_data("./data/training.csv")
testingdata = load_data("./data/testing.csv")
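Judging from the rename and strip calls above, the raw file carries stray single quotes and semicolons in its headers and fields; an illustrative (entirely made-up) row would be:

'prompt';'completion';
'write an add function';'def add(a, b): return a + b';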

def tokenize(batch):
    batch_dialogues = batch['dialogue']   # Fetch the 'dialogue' field
    tokenization = tokenizer(batch_dialogues, padding=True, return_token_type_ids=False)
    labels = tokenization.input_ids.copy()
    # mask_user_labels(tokenizer, labels) # not working.
    tokenization['labels'] = labels
    return tokenization
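
As an aside, this copies the padded input_ids straight into labels, so pad positions also contribute to the loss. The usual pattern masks them with -100 via the attention mask (a sketch, not what I am currently running):

def tokenize_masked(batch):
    tokenization = tokenizer(batch['dialogue'], padding=True, return_token_type_ids=False)
    # ignore pad positions in the loss; masking by attention_mask instead of
    # token id matters here because pad_token == eos_token
    tokenization['labels'] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenization.input_ids, tokenization.attention_mask)
    ]
    return tokenization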

from datasets import DatasetDict

dataset = DatasetDict({
    'train': trainingdata.map(tokenize, batched=True),
    'test': testingdata.map(tokenize, batched=True)
})

for key in dataset:
    dataset[key] = dataset[key].remove_columns(['dialogue', 'completion', 'prompt'])

Let me go through the important parts.
The prepare_dialogue function takes the data from my CSV and formats it according to the dialogue template; a concrete example follows below.
The tokenize function takes a batch, tokenizes it, and copies the input_ids over as labels for the dataset.
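
To make the template concrete, a toy row (hypothetical values) comes out like this:

row = {"prompt": "write an add function", "completion": "def add(a, b): return a + b"}
print(prepare_dialogue(row)["dialogue"])
# <|system|>
# X<|end|>
# <|user|>
# write an add function<|end|>
# <|assistant|>
# def add(a, b): return a + b<|end|>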

Here is the crux of the matter:

print(dataset['train'])
print('torch max: ', torch.max(torch.tensor(dataset['train']["labels"])))

final_layer = list(peftmodel.modules())[-1]

if isinstance(final_layer, torch.nn.Linear):
    print(f"The output dimension is {final_layer.out_features}")
else:
    print("Final layer is not a Linear layer.")


> Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 228
})
> torch max:  tensor(49155)
> The output dimension is 49152

Here is the problem: the largest label id found is 49155, but the output dimension is only 49152.

print("system_token_id:", tokenizer.convert_tokens_to_ids(system_token))
print("user_token_id:", tokenizer.convert_tokens_to_ids(user_token))
print("assistant_token_id:", tokenizer.convert_tokens_to_ids(assistant_token))
print("end_token_id:", tokenizer.convert_tokens_to_ids(end_token))

> system_token_id: 49152
> user_token_id: 49154
> assistant_token_id: 49153
> end_token_id: 49155

The four added tokens account exactly for the difference: the base vocabulary covers ids 0-49151, the added tokens take ids 49152-49155, so any label >= 49152 indexes past the 49152-wide output layer.
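
One way to inspect the mismatch directly (a sketch; assuming attribute access forwards through the PEFT wrapper as usual):

print("input embeddings:", peftmodel.get_input_embeddings().weight.shape)
lm_head = peftmodel.get_output_embeddings()
if lm_head is not None:
    print("lm_head:", lm_head.weight.shape)
# both first dimensions need to be at least len(tokenizer) = 49156
# for labels up to 49155 to be valid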

What am I to do here?
Training like this throws errors because of the dimension mismatch, and not adding the tokens makes no sense as per the "documentation" or the code.
