Update 0_KananaEmbeddingWrapper/kanana2vec/modeling_kanana2vec.py
0_KananaEmbeddingWrapper/kanana2vec/modeling_kanana2vec.py CHANGED
@@ -97,7 +97,6 @@ class BiLlamaModel(LlamaModel):
         sequence_length: int,
         target_length: int,
         dtype: torch.dtype,
-        device: torch.device,
         cache_position: torch.Tensor,
         batch_size: int,
         **kwargs,
@@ -117,8 +116,6 @@ class BiLlamaModel(LlamaModel):
                 to account for the 0 padding, the part of the cache that is not filled yet.
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
-            device (`torch.device`):
-                The device to plcae the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):
@@ -130,7 +127,7 @@ class BiLlamaModel(LlamaModel):
         else:
             min_dtype = torch.finfo(dtype).min
             causal_mask = torch.zeros(
-                (sequence_length, target_length), dtype=dtype, device=device
+                (sequence_length, target_length), dtype=dtype, device=cache_position.device
             )
             causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
         if attention_mask is not None: