DefOs9 committed
Commit f580f2d · verified · 1 Parent(s): 18812f4

Fix image embedding logic to be mps-compatible


Addresses the assertion error raised on mps machines. Cf. https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/12

- MPS changes:
  - `.bool()` instead of `.type(torch.BoolTensor)`, which keeps the attention mask on its original device (see the sketch after this list).
  - Avoid `index_put` issues by adding an MPS-specific logical block (a standalone sketch of this follows the diff below).
  - The `temp_len` variable checked by the assertion was never used anywhere else, so I removed the variable and the offending assertion.
- Various cleanup of comments and code.
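
A minimal, self-contained sketch (not part of the commit) of why the mask cast matters: `torch.BoolTensor` names the *CPU* bool tensor type, so `Tensor.type(torch.BoolTensor)` always produces a CPU tensor, while `Tensor.bool()` only changes the dtype and leaves the tensor on whatever device it already lives on. The mask shape below is made up purely for illustration.

```python
import torch

# Illustrative mask only; the shape is made up for this sketch.
mask = torch.ones(2, 3, 32, 32)

# Legacy-style cast: torch.BoolTensor is specifically the CPU bool tensor type,
# so the result lands on the CPU no matter where `mask` lives.
legacy = mask.type(torch.BoolTensor)
print(legacy.dtype, legacy.device)  # torch.bool cpu

# Cast used by this commit: only the dtype changes, the device is preserved.
device = "mps" if torch.backends.mps.is_available() else "cpu"
fixed = mask.to(device).bool()
print(fixed.dtype, fixed.device)  # torch.bool mps:0 (or cpu when MPS is unavailable)
```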

Files changed (1)
  1. modeling_phi4mm.py +70 -40
modeling_phi4mm.py CHANGED
@@ -325,7 +325,7 @@ class Phi4MMImageEmbedding(nn.Module):
             bs = img_embeds.shape[0]
             # Nx(HW)xC
             if image_attention_mask is not None and len(image_attention_mask) > 0:
-                img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.type(torch.BoolTensor).flatten(0,1).to(target_device))
+                img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.bool().flatten(0,1).to(target_device))
             else:
                 img_features = self.get_img_features(img_embeds.flatten(0, 1))
 
@@ -337,13 +337,12 @@ class Phi4MMImageEmbedding(nn.Module):
 
             assert base_feat_height == base_feat_height_target and base_feat_width == base_feat_height_target, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect {base_feat_height_target} features for hd transform'
 
-            # bs x max_num_crops x (24x24) x C
+            # bs x max_num_crops x (HxH) x C
             img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
             C = self.image_dim_out
             H = base_feat_height
 
             output_imgs = []
-            output_len = []
             # training is tensor, inference is list
             if isinstance(img_sizes, torch.Tensor):
                 img_sizes = img_sizes.view(-1, 2)
@@ -353,39 +352,71 @@ class Phi4MMImageEmbedding(nn.Module):
                 w = w // base_resolution
                 B_ = h * w
 
-                # 1 x (24x24) x 1024
+                # 1 x (HxH) x C
                 global_img_feature = img_features[_bs, :1]
 
-                # 1 x 12 x 12 x 4096
-                glb_img = global_img_feature.reshape(1,H,H,C).reshape(1,H//base_feat_height_reduction,base_feat_height_reduction,H//base_feat_height_reduction,base_feat_height_reduction,C).contiguous().permute(0,1,3,2,4,5).reshape(1,H//base_feat_height_reduction,H//base_feat_height_reduction,base_feat_height_reduction*base_feat_height_reduction*C).contiguous()
-                temp_glb_GN = self.sub_GN.repeat(1, H//base_feat_height_reduction, 1, 1)
+                # 1 x H x H x C
+                glb_img = (
+                    global_img_feature
+                    .reshape(1, H, H, C)
+                    .reshape(1, H // base_feat_height_reduction, base_feat_height_reduction,
+                             H // base_feat_height_reduction, base_feat_height_reduction, C)
+                    .contiguous()
+                    .permute(0, 1, 3, 2, 4, 5)
+                    .reshape(1, H // base_feat_height_reduction, H // base_feat_height_reduction,
+                             base_feat_height_reduction * base_feat_height_reduction * C)
+                    .contiguous()
+                )
+                temp_glb_GN = self.sub_GN.repeat(1, H // base_feat_height_reduction, 1, 1)
 
-                # 1 x 156 x 4096
-                glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1,-1,base_feat_height_reduction*base_feat_height_reduction*C)
+                # 1 x (HxH+H) x C
+                glb_img = (
+                    torch.cat([glb_img, temp_glb_GN], dim=2)
+                    .reshape(1, -1, base_feat_height_reduction * base_feat_height_reduction * C)
+                )
 
-                # (max_num_crops-1) x (12x12) x C
+                # (max_num_crops-1) x (HxH) x C
                 sub_img = img_features[_bs, 1:]
-                # 16x574x1024
                 # get rid of padding sub_img
                 sub_img = sub_img[:B_]
 
-                # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
-                sub_img = sub_img.reshape(B_,H,H,C).reshape(B_,H//base_feat_height_reduction,base_feat_height_reduction,H//base_feat_height_reduction,base_feat_height_reduction,C).contiguous().permute(0,1,3,2,4,5).reshape(B_,-1,base_feat_height_reduction*base_feat_height_reduction*C).contiguous()
-                sub_img = sub_img.reshape(1, h, w, base_feat_height // base_feat_height_reduction, base_feat_width // base_feat_height_reduction, -1).permute(0,1,3,2,4,5).reshape(1,h*base_feat_height//base_feat_height_reduction,w*base_feat_width//base_feat_height_reduction,base_feat_height_reduction*base_feat_height_reduction*C)
+                sub_img = (
+                    sub_img
+                    .reshape(B_, H, H, C)
+                    .reshape(B_, H // base_feat_height_reduction, base_feat_height_reduction,
+                             H // base_feat_height_reduction, base_feat_height_reduction, C)
+                    .contiguous()
+                    .permute(0, 1, 3, 2, 4, 5)
+                    .reshape(B_, -1, base_feat_height_reduction * base_feat_height_reduction * C)
+                    .contiguous()
+                )
+                sub_img = (
+                    sub_img
+                    .reshape(1, h, w, base_feat_height // base_feat_height_reduction,
+                             base_feat_width // base_feat_height_reduction, -1)
+                    .permute(0, 1, 3, 2, 4, 5)
+                    .reshape(1, h * base_feat_height // base_feat_height_reduction,
+                             w * base_feat_width // base_feat_height_reduction,
+                             base_feat_height_reduction * base_feat_height_reduction * C)
+                )
 
                 if image_attention_mask is not None and len(image_attention_mask) > 0:
-                    reshaped_image_attention_mask = image_attention_mask[_bs,1:B_+1,0::2,0::2].reshape(1, h, w, base_feat_height // base_feat_height_reduction, base_feat_width // base_feat_height_reduction).permute(0,1,3,2,4).reshape(1,h*base_feat_height//base_feat_height_reduction,w*base_feat_width//base_feat_height_reduction)
-                    useful_height = int(reshaped_image_attention_mask[0,:,0].sum().item())
-                    useful_width = int(reshaped_image_attention_mask[0,0,:].sum().item())
-                    sub_img = sub_img[:,:useful_height, :useful_width]
+                    reshaped_image_attention_mask = (
+                        image_attention_mask[_bs, 1:B_ + 1, 0::2, 0::2]
+                        .reshape(1, h, w, base_feat_height // base_feat_height_reduction,
+                                 base_feat_width // base_feat_height_reduction)
+                        .permute(0, 1, 3, 2, 4)
+                        .reshape(1, h * base_feat_height // base_feat_height_reduction,
+                                 w * base_feat_width // base_feat_height_reduction)
+                    )
+                    useful_height = int(reshaped_image_attention_mask[0, :, 0].sum().item())
+                    useful_width = int(reshaped_image_attention_mask[0, 0, :].sum().item())
+                    sub_img = sub_img[:, :useful_height, :useful_width]
                     temp_sub_GN = self.sub_GN.repeat(1, useful_height, 1, 1)
-                    temp_len = int(image_attention_mask[_bs,:B_+1,0::2,0::2].sum().item()) + (useful_height+1) + base_feat_height//base_feat_height_reduction
                 else:
-                    temp_sub_GN = self.sub_GN.repeat(1, h*base_feat_height//base_feat_height_reduction, 1, 1)
-                    temp_len = int((h*w+1)*self.num_img_tokens+ 1 + (h+1)*base_feat_height//base_feat_height_reduction)
+                    temp_sub_GN = self.sub_GN.repeat(1, h * base_feat_height // base_feat_height_reduction, 1, 1)
 
-                sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1,-1,base_feat_height_reduction*base_feat_height_reduction*C)
-                # (1, num_img_tokens, 1024*4)
+                sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1, -1, base_feat_height_reduction * base_feat_height_reduction * C)
 
                 # glb + sub
                 if self.hd_transform_order == 'glb_sub':
@@ -395,17 +426,11 @@ class Phi4MMImageEmbedding(nn.Module):
                 else:
                     raise NotImplementedError(f'hd_transform_order = {self.hd_transform_order}, not implemented')
 
-                #temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
-                assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
-                output_len.append(temp_len)
-
-            num_img_tokens = output_len
             img_set_tensor = []
             for _output_img in output_imgs:
                 img_feature_proj = self.img_projection(_output_img.to(target_device).to(target_dtype))
                 img_set_tensor.append(img_feature_proj)
-            #logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
-            #assert sum(num_img_tokens) == len(g_values), f'(branch 1) sum(num_img_tokens): {sum(num_img_tokens)}, g_values size: {len(g_values)}, g_values {g_values}'
+            # logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
 
         else:
             raise NotImplementedError
@@ -420,7 +445,7 @@ class Phi4MMImageEmbedding(nn.Module):
                 self.get_img_features(img_embeds)
                 .to(target_device)
                 .to(target_dtype)
-                .reshape(-1, 1024)
+                .reshape(-1, self.image_dim_out)
             )
             if self.use_hd_transform:
                 img_set_tensor = self.img_projection(tt.reshape(-1, self.image_dim_out*self.base_feat_height_reduction**2) * self.glb_GN[0] * self.sub_GN[0, 0])
@@ -442,14 +467,19 @@ class Phi4MMImageEmbedding(nn.Module):
             # Shape: (merged_N_tokens, C)
             merged_img_set_tensor = torch.cat(img_set_tensor, dim=1).squeeze(0)
             merged_img_set_tensor = merged_img_set_tensor.to(hidden_states.dtype).to(hidden_states.device)
-            # Temporarily disable autocast to avoid issue on bf16 tensors
-            # Ref: https://github.com/pytorch/pytorch/issues/132715
-            with torch.autocast(device_type=hidden_states.device.type, enabled=False):
-                new_hidden_states = hidden_states.index_put(
-                    indices=positions_tuple,
-                    values=merged_img_set_tensor,
-                    accumulate=False
-                )
+            if hidden_states.device.type == "mps":
+                # For MPS, assign using direct indexing to avoid index_put issues.
+                new_hidden_states = hidden_states.clone()
+                new_hidden_states[positions_tuple] = merged_img_set_tensor
+            else:
+                # Temporarily disable autocast to avoid issue on bf16 tensors
+                # Ref: https://github.com/pytorch/pytorch/issues/132715
+                with torch.autocast(device_type=hidden_states.device.type, enabled=False):
+                    new_hidden_states = hidden_states.index_put(
+                        indices=positions_tuple,
+                        values=merged_img_set_tensor,
+                        accumulate=False
+                    )
             hidden_states = new_hidden_states
         else:
             raise NotImplementedError
@@ -2096,7 +2126,7 @@ class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if isinstance(input_mode, torch.Tensor):
-            # len(input_mode) == num_beams in beam search, and all elements of input_mode should have the same value
+            assert len(input_mode) == 1
            input_mode = input_mode[0].item()
            input_mode = InputMode(input_mode)
 
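
For reference, the MPS-specific replacement for `index_put` introduced above can be exercised in isolation. This is a toy sketch rather than code from the commit: the tensor shapes, values, and `positions_tuple` indices are invented for illustration, while the branch structure mirrors the new logic in `modeling_phi4mm.py`.

```python
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"

# Toy stand-ins: a (seq_len, C) hidden state and two positions to overwrite.
hidden_states = torch.zeros(6, 4, device=device)
merged_img_set_tensor = torch.arange(8, dtype=torch.float32, device=device).reshape(2, 4)
positions_tuple = (torch.tensor([1, 3], device=device),)

if device == "mps":
    # MPS branch from the commit: plain advanced-indexing assignment on a clone.
    new_hidden_states = hidden_states.clone()
    new_hidden_states[positions_tuple] = merged_img_set_tensor
else:
    # Original branch: out-of-place index_put with autocast disabled.
    with torch.autocast(device_type=device, enabled=False):
        new_hidden_states = hidden_states.index_put(
            indices=positions_tuple,
            values=merged_img_set_tensor,
            accumulate=False,
        )

print(new_hidden_states)
```

Both branches write the image features into the same rows; the MPS path simply avoids the `index_put` call that was causing trouble on Apple-silicon machines per the linked discussion.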