Patching flash-attn
modeling_aero.py  CHANGED  (+75 -1)
@@ -30,9 +30,16 @@ from transformers.modeling_outputs import BaseModelOutput, ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto import AutoModel, AutoModelForCausalLM
 from transformers.utils import logging
+from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioFlashAttention2

 from .configuration_aero import AeroConfig

+
+try:
+    from flash_attn import flash_attn_func
+except ImportError:
+    print("flash_attn is not installed. Please install flash-attn to use flash attention for the audio tower.")
+
 logger = logging.get_logger(__name__)


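Note that the `except ImportError` branch above only prints a warning, so if flash-attn is missing the name `flash_attn_func` is never bound and the patched forward below would fail with a `NameError` on its first call. Callers can pre-check availability before requesting the flash-attention path; a minimal sketch (illustrative only, not part of this commit):

```python
# Illustrative pre-check (not part of the patch): only ask for the patched
# flash-attention path when the flash_attn package is actually importable.
import importlib.util

def flash_attn_available() -> bool:
    return importlib.util.find_spec("flash_attn") is not None

attn_implementation = "flash_attention_2" if flash_attn_available() else "sdpa"
```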
@@ -78,6 +85,72 @@ class AeroCausalLMOutputWithPast(ModelOutput):
     audio_hidden_states: Optional[torch.FloatTensor] = None


+# The flash-attention implementation that transformers ships for the Qwen2Audio encoder is buggy,
+# so patch its forward method with this one.
+def qwen2_audio_flash_attn_forward(
+    self,
+    hidden_states: torch.Tensor,
+    key_value_states=None,
+    past_key_value=None,
+    attention_mask=None,
+    layer_head_mask=None,
+    output_attentions: bool = False,
+    cache_position=None,
+):
+    # Qwen2AudioFlashAttention2 attention does not support output_attentions
+    if output_attentions:
+        raise ValueError("Qwen2AudioFlashAttention2 attention does not support output_attentions")
+
+    bsz, tgt_len, _ = hidden_states.size()
+
+    # get query proj
+    query_states = torch.reshape(self.q_proj(hidden_states), (bsz, tgt_len, self.num_heads, self.head_dim))
+    key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+    value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+    # TODO: These transposes are quite inefficient, but flash attention requires the layout
+    # [batch_size, sequence_length, num_heads, head_dim]. Refactoring the KV cache would avoid
+    # many of these transpose/reshape/view ops.
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+
+    causal_mask = attention_mask  # NOTE: flash_attn_func takes no mask argument, so this is currently unused
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, : key_states.shape[-2]]
+
+    # In PEFT, the layer norms are usually cast to float32 for training stability, so the input
+    # hidden states get silently cast to float32 as well. Cast them back to the correct dtype to
+    # make sure everything works as expected. This might slow down training & inference, so it is
+    # recommended not to cast the LayerNorms to fp32. (LlamaRMSNorm handles it correctly.)
+
+    input_dtype = query_states.dtype
+    if input_dtype == torch.float32:
+        if torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        # Handle the case where the model is quantized
+        elif hasattr(self.config, "_pre_quantization_dtype"):
+            target_dtype = self.config._pre_quantization_dtype
+        else:
+            target_dtype = self.q_proj.weight.dtype
+
+        query_states = query_states.to(target_dtype)
+        key_states = key_states.to(target_dtype)
+        value_states = value_states.to(target_dtype)
+    dropout = self.dropout if self.training else 0.0
+    attn_output = flash_attn_func(
+        query_states, key_states, value_states, dropout, softmax_scale=None, causal=self.is_causal
+    )
+
+    attn_output = attn_output.reshape(bsz, tgt_len, -1)
+    attn_output = self.out_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, None
+
+
+
 class AeroAudioMultiModalProjector(nn.Module):
     def __init__(self, config: AeroConfig):
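The detail that matters in the patched forward is the memory layout: `flash_attn_func` expects `q`, `k`, `v` shaped `(batch, seq_len, num_heads, head_dim)` in fp16/bf16 on a CUDA device, while eager and SDPA attention work on `(batch, num_heads, seq_len, head_dim)`. That is why the query projection is reshaped to `(bsz, tgt_len, num_heads, head_dim)` directly and the `self._shape(...)` outputs for the keys and values are transposed back with `.transpose(1, 2)`. A small sanity-check sketch of that contract (illustrative only; it assumes a CUDA GPU with flash-attn installed and is not part of the commit):

```python
import torch
import torch.nn.functional as F
from flash_attn import flash_attn_func

bsz, seq_len, num_heads, head_dim = 2, 128, 8, 64
q = torch.randn(bsz, seq_len, num_heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# flash-attn layout: (batch, seq_len, num_heads, head_dim)
out_flash = flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False)

# Reference: SDPA uses (batch, num_heads, seq_len, head_dim), so transpose in and out.
out_ref = F.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
).transpose(1, 2)

# Both are (batch, seq_len, num_heads, head_dim) and should agree up to fp16 tolerance.
torch.testing.assert_close(out_flash, out_ref, atol=2e-3, rtol=2e-3)
```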
@@ -136,7 +209,8 @@ class AeroPreTrainedModel(PreTrainedModel):
 class AeroForConditionalGeneration(AeroPreTrainedModel, GenerationMixin):
     def __init__(self, config: AeroConfig):
         super().__init__(config)
-
+        if config._attn_implementation == "flash_attention_2":
+            Qwen2AudioFlashAttention2.forward = qwen2_audio_flash_attn_forward
         self.audio_tower_type = config.audio_config.model_type
         self.audio_tower = AutoModel.from_config(config.audio_config)
         self.audio_modal_projector = AeroAudioMultiModalProjector(config)
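With the constructor change above, requesting flash attention at load time is all that is needed: `from_pretrained(..., attn_implementation="flash_attention_2")` sets `config._attn_implementation`, and `AeroForConditionalGeneration.__init__` then swaps in the patched forward before the audio tower is built. A hedged usage sketch (the repo path is a placeholder, the auto class may differ, and flash-attn plus a CUDA device are assumed):

```python
import torch
from transformers import AutoModel
from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioFlashAttention2

original_forward = Qwen2AudioFlashAttention2.forward

# "path/to/aero-checkpoint" is a placeholder for a checkpoint that ships this modeling_aero.py.
model = AutoModel.from_pretrained(
    "path/to/aero-checkpoint",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# AeroForConditionalGeneration.__init__ has monkey-patched the Qwen2Audio encoder attention.
assert Qwen2AudioFlashAttention2.forward is not original_forward
```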