Update ultravox_model.py
ultravox_model.py  CHANGED  (+112 -43)
@@ -1,6 +1,6 @@
 import logging
 import re
-from typing import Any, Dict, Generator, Optional, Set, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
 
 import peft
 import torch
@@ -56,6 +56,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)
 
+        if self.language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [
+                f"language_model.{k}" for k in self.language_model._tied_weights_keys
+            ]
+
         # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
         # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
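Note: when the wrapped language model ties its output head to its input embeddings, transformers records that fact in `_tied_weights_keys`; inside `UltravoxModel` those parameters live under the `language_model.` prefix, so the keys are re-registered with that prefix to keep weight tying intact across save and load. A tiny illustration of the prefixing (the key name below is a typical example, not taken from this diff):

lm_tied_keys = ["lm_head.weight"]  # typical value; the actual keys depend on the LM
ultravox_tied_keys = [f"language_model.{k}" for k in lm_tied_keys]
print(ultravox_tied_keys)  # ['language_model.lm_head.weight']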
@@ -64,6 +69,39 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.loss_config = LossConfig()
         self.post_init()
 
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        model = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        model._load_child_model_weights(*args, **kwargs)
+        return model
+
+    def _load_child_model_weights(self, *args, **kwargs) -> "UltravoxModel":
+        if (
+            self.config.text_model_id is not None
+            and self.language_model.device.type == "meta"
+        ):
+            # Load the language model weights
+            self.language_model = transformers.AutoModelForCausalLM.from_pretrained(
+                self.config.text_model_id,
+                torch_dtype=self.config.torch_dtype,
+                *args,
+                **kwargs,
+            )
+
+        if (
+            self.config.audio_model_id is not None
+            and self.audio_tower.device.type == "meta"
+        ):
+            # Load the audio tower weights
+            self.audio_tower = transformers.AutoModel.from_pretrained(
+                self.config.audio_model_id,
+                torch_dtype=self.config.torch_dtype,
+                *args,
+                **kwargs,
+            )
+
+        return self
+
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
 
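Note: the `from_pretrained` override above only pulls in language-model and audio-tower weights when those submodules were left on the meta device, i.e. when the checkpoint stores just the Ultravox-specific weights and points at source repos via `text_model_id` / `audio_model_id`. A minimal usage sketch, with a placeholder checkpoint id:

import torch
import transformers

# "fixie-ai/ultravox-some-checkpoint" is a placeholder id; any Ultravox checkpoint
# whose config sets text_model_id / audio_model_id exercises the same path.
model = transformers.AutoModel.from_pretrained(
    "fixie-ai/ultravox-some-checkpoint",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
# from_pretrained above calls _load_child_model_weights() internally, so the
# submodules should no longer sit on the meta device.
assert model.language_model.device.type != "meta"
assert model.audio_tower.device.type != "meta"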
@@ -110,6 +148,30 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.vocab_size = model_embeds.num_embeddings
         return model_embeds
 
+    def _get_prediction_mask(self, labels: Optional[torch.Tensor]) -> torch.Tensor:
+        """Get a boolean mask for positions where we want to compute KL divergence.
+
+        For each label position, we want the position before it since that's where
+        the model makes the prediction for that label.
+
+        Args:
+            labels: Tensor of shape (B, T) where B is batch size and T is sequence length,
+                with -100 for masked positions and token ids for label positions
+
+        Returns:
+            Boolean tensor of shape (B, T) that's True for positions where we want to compute KL divergence
+        """
+        if labels is None:
+            raise ValueError("labels must be provided")
+        # Shift the label mask right by 1 along the sequence dimension
+        # This gives us positions where we make predictions for the next token
+        label_mask = labels != -100
+        pred_mask = torch.zeros_like(label_mask)
+        pred_mask[:, :-1] = label_mask[
+            :, 1:
+        ]  # shift right by 1 along sequence dimension
+        return pred_mask
+
     def _compute_kl_loss(
         self,
         lm_output: transformers.modeling_outputs.CausalLMOutputWithPast,
@@ -134,11 +196,12 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         # compute the KL divergence loss between the two models
         kl_loss = F.kl_div(
             F.log_softmax(
-                lm_output.logits[labels != -100] / self.loss_config.kl_temperature,
+                lm_output.logits[self._get_prediction_mask(labels)]
+                / self.loss_config.kl_temperature,
                 dim=-1,
             ),
             F.softmax(
-                alt_lm_output.logits[alt_labels != -100]
+                alt_lm_output.logits[self._get_prediction_mask(alt_labels)]
                 / self.loss_config.kl_temperature,
                 dim=-1,
             ),
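Note: a tiny worked example of the new prediction mask (not part of the diff). With `labels = [[-100, -100, 7, 9]]` the label mask is `[F, F, T, T]`, and the shifted prediction mask becomes `[F, T, T, F]`: the logits at positions 1 and 2 are the ones that predict tokens 7 and 9, so only those positions enter the KL term.

import torch

labels = torch.tensor([[-100, -100, 7, 9]])
label_mask = labels != -100                    # [[False, False, True,  True]]
pred_mask = torch.zeros_like(label_mask)
pred_mask[:, :-1] = label_mask[:, 1:]          # mask for label i+1 lands on position i
print(pred_mask)                               # tensor([[False,  True,  True, False]])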
@@ -289,7 +352,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
 
         # include audio information in model_input only when it is needed during prefilling
         # audio_token_start_idx should always be relative to the current cache position
-        prefill_start_idx = 0 if cache_position is None else cache_position[0]
+        prefill_start_idx: int | torch.Tensor = (
+            0 if cache_position is None else cache_position[0]
+        )
         if (
             audio_values is not None
             and audio_token_start_idx is not None
@@ -317,23 +382,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     def _create_audio_tower(
         cls, config: UltravoxConfig
     ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
-        if config.audio_model_id is not None:
-            if "whisper" in config.audio_model_id.lower():
-                audio_tower = ModifiedWhisperEncoder.from_pretrained(
-                    config.audio_model_id, torch_dtype=config.torch_dtype
-                )
-                audio_tower.init_latency_mask(
-                    config.audio_latency_block_size, dtype=config.torch_dtype
-                )
-            else:
-                assert config.audio_latency_block_size in (
-                    None,
-                    0,
-                ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
-                audio_tower = transformers.AutoModel.from_pretrained(
-                    config.audio_model_id, torch_dtype=config.torch_dtype
-                )
-        else:
+        with transformers.modeling_utils.no_init_weights():
+            # we only ever use from_config if the weights are retrained, hence initializing is not
+            # required. This makes the model quite creation faster since init on CPU is quite slow.
             if "whisper" in config.audio_config._name_or_path.lower():
                 audio_tower = ModifiedWhisperEncoder(config.audio_config)
                 audio_tower.init_latency_mask(
@@ -344,12 +395,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                     None,
                     0,
                 ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
-
-                # we only ever use from_config if the weights are retrained, hence initializing is not
-                # required. This makes the model quite creation faster since init on CPU is quite slow.
-                audio_tower = transformers.AutoModel.from_config(
-                    config.audio_config
-                )
+                audio_tower = transformers.AutoModel.from_config(config.audio_config)
 
         if isinstance(
             audio_tower,
@@ -367,21 +413,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     def _create_language_model(
         cls, config: UltravoxConfig
     ) -> transformers.LlamaForCausalLM:
-        if config.text_model_id is not None:
-            language_model = transformers.AutoModelForCausalLM.from_pretrained(
-                config.text_model_id,
-                attn_implementation=config._attn_implementation,
+        with transformers.modeling_utils.no_init_weights():
+            # we only ever use from_config if the weights are retrained, hence initializing is not
+            # required. This makes the model quite creation faster since init on CPU is quite slow.
+            language_model = transformers.AutoModelForCausalLM.from_config(
+                config.text_config,
+                attn_implementation=config.text_config._attn_implementation,
                 torch_dtype=config.torch_dtype,
             )
-        else:
-            with transformers.modeling_utils.no_init_weights():
-                # we only ever use from_config if the weights are retrained, hence initializing is not
-                # required. This makes the model quite creation faster since init on CPU is quite slow.
-                language_model = transformers.AutoModelForCausalLM.from_config(
-                    config.text_config,
-                    attn_implementation=config._attn_implementation,
-                    torch_dtype=config.torch_dtype,
-                )
 
         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model
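Note: both `_create_audio_tower` and `_create_language_model` now always build the submodule under `no_init_weights()`, skipping the slow random init on CPU; real weights arrive later, either from the Ultravox checkpoint itself or from `_load_child_model_weights`. A rough sketch of that pattern in isolation (the `gpt2` config is used purely for illustration):

import transformers

# Build an architecture-only submodule: parameters are allocated but not
# randomly initialized, which makes CPU construction much faster.
config = transformers.AutoConfig.from_pretrained("gpt2")  # illustrative config only
with transformers.modeling_utils.no_init_weights():
    skeleton = transformers.AutoModelForCausalLM.from_config(config)
# The skeleton is only usable once real weights are loaded into it,
# e.g. via load_state_dict() or a later from_pretrained() call.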
@@ -495,7 +534,10 @@ def is_cache_empty(
     return past_key_values.get_seq_length() == 0
 
 
-def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
+T = TypeVar("T", bound=torch.nn.Module)
+
+
+def apply_lora(model: T, lora_config: dict) -> T:
     """
     Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
     """
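Note: binding the `TypeVar` to `torch.nn.Module` lets the return annotation track the argument's concrete class instead of widening it to `torch.nn.Module`. A minimal sketch of the typing effect (the function body is elided here):

from typing import TypeVar

import torch

T = TypeVar("T", bound=torch.nn.Module)

def apply_lora(model: T, lora_config: dict) -> T:
    ...  # body as in the file; only the annotations changed

# A type checker now keeps the concrete class of the argument, e.g. a
# LlamaForCausalLM passed in is still typed as LlamaForCausalLM on return,
# instead of being widened to torch.nn.Module.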
@@ -574,11 +616,35 @@ class UltravoxProjector(nn.Module):
         self.ln_post = RMSNorm(dim_out, init=config.norm_init)
 
     def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
+        """
+        Takes in audio features from the audio tower and projects them to the text model's embedding space.
+        It reduces the number of frames by a factor of `stack_factor` and increases the number of channels by the same factor.
+        If the number of audio frames are not a multiple of the stack factor, the last few frames will be padded with zeros.
+
+        Input shape:
+            audio_features: B, T*S, C
+        Output shape:
+            hidden_states: B, T, D
+        Where:
+            B: batch size
+            F: number of frames in the audio tower
+            T: number of output embeddings
+                T = ceil(F / S)
+            S: stack factor
+            C: number of channels out of the encoder (aka audio tower)
+            H: hidden size of the projector (config.hidden_size)
+            D: dimension of the text model (config.text_config.hidden_size)
+
+        """
+        # B, F, C -> B, T, C*S
         audio_features = self._pad_and_stack(audio_features)
         audio_features = self.ln_pre(audio_features)
+        # B, T, C*S -> B, T, H
         hidden_states = self.linear_1(audio_features)
+        # B, T, H -> B, T, H/2 (assuming swiglu)
         hidden_states = self.act(hidden_states)
         hidden_states = self.ln_mid(hidden_states)
+        # B, T, H/2 -> B, T, D
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.ln_post(hidden_states)
         return hidden_states
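Note: a concrete instance of the shapes documented in the docstring above; the numbers are illustrative, and the padding/stacking below is a rough re-implementation of the `_pad_and_stack` step for demonstration, not the projector code itself.

import math
import torch

# Illustrative numbers only: B=2 clips, F=301 encoder frames, C=1280 channels, S=8.
B, F, C, S = 2, 301, 1280, 8
audio_features = torch.randn(B, F, C)

T = math.ceil(F / S)                                           # 38 output embeddings
padded = torch.nn.functional.pad(audio_features, (0, 0, 0, T * S - F))  # pad 3 frames
stacked = padded.reshape(B, T, C * S)                          # fold S frames into channels
print(stacked.shape)                                           # torch.Size([2, 38, 10240])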
@@ -601,6 +667,7 @@ class ModifiedWhisperEncoder(
 
     base_model_prefix = "model.encoder"
     _no_split_modules = ["WhisperEncoderLayer"]
+    _keys_to_ignore_on_load_unexpected = ["model.decoder.*"]
 
     def __init__(self, config: transformers.WhisperConfig):
         super().__init__(config)
@@ -614,7 +681,9 @@ class ModifiedWhisperEncoder(
            * self.conv2.stride[0]
        )
 
-    def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype):
+    def init_latency_mask(
+        self, audio_latency_block_size: int | None, dtype: torch.dtype
+    ):
         if audio_latency_block_size is None:
             self.audio_streaming_mask = None
             return
@@ -781,4 +850,4 @@ UltravoxModel.register_for_auto_class()
 transformers.AutoConfig.register("ultravox", UltravoxConfig)
 transformers.AutoModel.register(UltravoxConfig, UltravoxModel)
 
-transformers.activations.ACT2FN["swiglu"] = SwiGLU
+transformers.activations.ACT2FN["swiglu"] = SwiGLU