Trainer changes torch shape to 4D instead of 5D
#77 · opened by chwunder1
I'm trying to fine-tune the model but I'm facing a weird issue.
I load the dataset, do the necessary transformations, and pass the images and the prompt to the processor. At that point the image embeds have the correct 5D shape. But as soon as I hand everything to the trainer, the shape is reduced to 4D, and I don't know why! Does anyone have an idea? Here is my code, mostly taken from the example:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from transformers import BatchFeature
class VisionDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        image = data.get("image", None)
        label = data.get("label_name", None)
        print(f"Processing item {idx}: label={label}, image={image is not None}")
        prompt = [
            {"role": "system", "content": "Try to describe this car image with make, model, type and year. Here is an example: Bentley Continental Supersports Conv. Convertible 2012, BMW X6 SUV 2012, Chevrolet Corvette ZR1 2012, Chevrolet Silverado 1500 Classic Extended Cab 2007. Do not respond with anything else!"},
            {"role": "user", "content": "<|image_1|>"},
        ]
        chat_template = self.processor.tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            tokenize=False,
        )
        inputs = self.processor(
            text=chat_template,
            images=image,
            return_tensors="pt",
        )
        answer = f"{label}<|end|>\n<|endoftext|>"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
        labels = torch.full_like(input_ids, -100)
        labels[:, -answer_ids.shape[1]:] = answer_ids
        if input_ids.size(1) > 8192:
            input_ids = input_ids[:, :8192]
            labels = labels[:, :8192]
            if torch.all(labels == -100).item():
                # workaround to make sure loss compute won't fail
                labels[:, -1] = self.processor.tokenizer.eos_token_id
print(f"inputs.input_image_embeds shape: {inputs.input_image_embeds.shape}")
return {
'input_ids': input_ids,
'labels': labels,
'input_image_embeds': inputs.input_image_embeds,
'image_attention_mask': inputs.image_attention_mask,
'image_sizes': inputs.image_sizes,
}
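
The collator below relies on a cat_with_pad helper that I didn't paste above. For reference, here is a sketch of the version I'm using; it should match the helper from the sample fine-tuning script (concatenate along one dim, pad every other dim up to the batch max):

def cat_with_pad(tensors, dim=0, padding_value=0):
    # Concatenate along `dim`; all other dims are padded with `padding_value`
    # up to the largest size found across the list.
    ndim = tensors[0].dim()
    assert all(t.dim() == ndim for t in tensors[1:]), 'all tensors must have the same number of dims'
    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)
    index = 0
    for t in tensors:
        # copy each tensor into its slot along the concat dim, leaving the
        # padded region untouched in every other dim
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        slices[dim] = slice(index, index + t.shape[dim])
        output[tuple(slices)] = t
        index += t.shape[dim]
    return output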
class CustomDataCollatorPhi:
    def __call__(self, batch):
        input_ids_list = []
        labels_list = []
        input_image_embeds_list = []
        image_attention_mask_list = []
        image_sizes_list = []
        for inputs in batch:
            ###### after debugging: at this point the image embeds are already 4D instead of 5D
            input_ids_list.append(inputs['input_ids'][0])  # drop the length-1 batch dim for pad_sequence
            labels_list.append(inputs['labels'][0])
            input_image_embeds_list.append(inputs['input_image_embeds'])
            image_attention_mask_list.append(inputs['image_attention_mask'])
            image_sizes_list.append(inputs['image_sizes'])
        input_ids = pad_sequence(input_ids_list, batch_first=True, padding_side='right', padding_value=0)
        # pad labels with -100 so the padded positions are ignored by the loss
        labels = pad_sequence(labels_list, batch_first=True, padding_side='right', padding_value=-100)
        attention_mask = (input_ids != 0).long()
        input_image_embeds = cat_with_pad(input_image_embeds_list, dim=0)
        image_attention_mask = cat_with_pad(image_attention_mask_list, dim=0)
        image_sizes = torch.cat(image_sizes_list)
        print("done!!!")
        return BatchFeature(
            {
                'input_ids': input_ids,
                'labels': labels,
                'attention_mask': attention_mask,
                'input_image_embeds': input_image_embeds,
                'image_attention_mask': image_attention_mask,
                'image_sizes': image_sizes,
                'input_mode': 1,  # vision mode
            }
        )
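
For completeness, this is roughly how I hand everything to the Trainer (the TrainingArguments values below are placeholders, not my exact config; `model`, `processor`, and `raw_dataset` are assumed to be loaded already):

from transformers import Trainer, TrainingArguments

train_dataset = VisionDataset(raw_dataset, processor)

training_args = TrainingArguments(
    output_dir='./vision-finetune',       # placeholder output path
    per_device_train_batch_size=1,
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=CustomDataCollatorPhi(),
)
trainer.train()

The shape print in __getitem__ shows 5D, but the print inside the collator's loop already shows 4D for the same item.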