Trainer changes torch shape to 4D instead of 5D

#77
by chwunder1 - opened

I'm trying to finetune the model but I'm facing a weird issue.

I'm loading the dataset, doing the necessary transformations, and passing the images and the prompt to the processor.

After this, the image embeds have the correct 5D shape. But as soon as I put this into the Trainer, it is reduced to 4D. I don't know why! Does anyone have an idea? Here is my code, mostly taken from the example:

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from transformers import BatchFeature

class VisionDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        image = data.get("image", None)
        label = data.get("label_name", None)
        print(f"Processing item {idx}: label={label}, image={image is not None}")
        prompt = [
            {"role": "system", "content": "Try to describe this car image with make, model, type and year. Here is an example: Bentley Continental Supersports Conv. Convertible 2012, BMW X6 SUV 2012, Chevrolet Corvette ZR1 2012, Chevrolet Silverado 1500 Classic Extended Cab 2007. Do not respond with anything else!"},
            {"role": "user", "content": "<|image_1|>"}
        ]

        chat_template = self.processor.tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            tokenize=False
        )
        inputs = self.processor(
            text=chat_template,
            images=image,
            return_tensors="pt"
        )
        answer = f"{label}<|end|>\n<|endoftext|>"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)

        # mask out everything except the answer tokens in the labels
        labels = torch.full_like(input_ids, -100)
        labels[:, -answer_ids.shape[1]:] = answer_ids

        if input_ids.size(1) > 8192:
            input_ids = input_ids[:, :8192]
            labels = labels[:, :8192]
            if torch.all(labels == -100).item():
                # workaround to make sure loss compute won't fail
                labels[:, -1] = self.processor.tokenizer.eos_token_id
        print(f"inputs.input_image_embeds shape: {inputs.input_image_embeds.shape}")
        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_image_embeds': inputs.input_image_embeds,
            'image_attention_mask': inputs.image_attention_mask,
            'image_sizes': inputs.image_sizes,
        }
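
Just to show where I check this: straight out of the dataset, before the Trainer or DataLoader touch anything, the embeds are still 5D. A minimal sketch of that check (dataset and processor here are the objects from my training setup):

# quick sanity check on a single item, before the Trainer gets involved
item = VisionDataset(dataset, processor)[0]
print(item['input_image_embeds'].shape)          # 5D at this point, as expected
assert item['input_image_embeds'].dim() == 5     # passes here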

class CustomDataCollatorPhi:
    def __init__(self):
        pass

    def __call__(self, batch):
        input_ids_list = []
        labels_list = []
        input_image_embeds_list = []
        image_attention_mask_list = []
        image_sizes_list = []
        for inputs in batch:
            # after debugging: at this point the image embeds are 4D instead of 5D
            input_ids_list.append(inputs['input_ids'])
            labels_list.append(inputs['labels'])
            input_image_embeds_list.append(inputs['input_image_embeds'])
            image_attention_mask_list.append(inputs['image_attention_mask'])
            image_sizes_list.append(inputs['image_sizes'])

        input_ids = pad_sequence(input_ids_list, padding_side='right', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='right', padding_value=0)
        attention_mask = (input_ids != 0).long()
        input_image_embeds = cat_with_pad(input_image_embeds_list, dim=0)
        image_attention_mask = cat_with_pad(image_attention_mask_list, dim=0)
        image_sizes = torch.cat(image_sizes_list)
        print("done!!!")
        return BatchFeature(
            {
                'input_ids': input_ids,
                'labels': labels,
                'attention_mask': attention_mask,
                'input_image_embeds': input_image_embeds,
                'image_attention_mask': image_attention_mask,
                'image_sizes': image_sizes,
                'input_mode': 1,  # vision mode
            }
        )
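
One more thing: cat_with_pad comes from the example script and isn't shown above. Roughly, it pads every tensor to the largest size on all dims except dim and then concatenates along dim. Here is a minimal sketch of my understanding of it (my reconstruction, not the exact helper):

def cat_with_pad(tensors, dim, padding_value=0):
    # reconstruction of the helper from the example script: pad every tensor
    # to the max size on all dims except `dim`, then concatenate along `dim`
    ndim = tensors[0].dim()
    assert all(t.dim() == ndim for t in tensors), 'all tensors must have the same number of dims'
    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)
    index = 0
    for t in tensors:
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        slices[dim] = slice(index, index + t.shape[dim])
        output[tuple(slices)] = t
        index += t.shape[dim]
    return output

Note that it keeps whatever number of dims it is given, so once the tensors arrive 4D the batched result stays 4D. I could presumably work around this by unsqueezing them back to 5D inside the collator loop, but I'd rather understand why the dimension gets dropped between __getitem__ and the collator in the first place.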