Upload ultravox_config.py with huggingface_hub
Changed: ultravox_config.py (+0 −32)
```diff
@@ -9,7 +9,6 @@ import transformers
 class LoraConfigSimplified:
     """
     Low Rank Approximation (LoRA) configuration.
-
     Used for language and audio models separately.
     """
 
@@ -23,17 +22,6 @@ class LoraConfigSimplified:
     unfreeze_layers: Optional[List[str]] = None
 
 
-class LossMaskType(str, Enum):
-    """Type of loss mask to use."""
-
-    LAST_ASSISTANT = "last_assistant"
-    """This applies the loss mask up until the last assistant token"""
-    ALL = "all"  # This does not work with KL loss
-    """No loss mask, all inputs are used for loss"""
-    AFTER_AUDIO = "after_audio"
-    """Applies the loss mask up until the audio token"""
-
-
 class LossFunction(str, Enum):
     CrossEntropy = "ce"
     KL_Divergence = "kl"
```
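The deleted `LossMaskType` enum selected which token positions contribute to the training loss. For readers tracking this removal, here is a minimal sketch of those semantics, assuming labels are masked to the `ignore_index` (-100) that the config still defines; the function and argument names are hypothetical, not Ultravox's actual training code.

```python
IGNORE_INDEX = -100  # the config's ignore_index default


def apply_loss_mask(
    labels: list[int],
    mask_type: str,
    last_assistant_start: int,
    audio_end: int,
) -> list[int]:
    """Mirror the semantics of the removed LossMaskType values."""
    if mask_type == "all":
        # "all": no loss mask, every input token is used for loss
        # (incompatible with KL loss, per the removed inline comment).
        return list(labels)
    if mask_type == "last_assistant":
        # Ignore everything before the last assistant turn.
        cutoff = last_assistant_start
    elif mask_type == "after_audio":
        # Ignore everything up until the audio tokens.
        cutoff = audio_end
    else:
        raise ValueError(f"unknown loss mask type: {mask_type!r}")
    # Positions set to IGNORE_INDEX are skipped by the cross-entropy loss.
    return [IGNORE_INDEX] * cutoff + list(labels[cutoff:])
```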
````diff
@@ -57,10 +45,8 @@ class UltravoxConfig(transformers.PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
     Ultravox model according to the specified arguments, defining the model architecture.
-
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
-
     Args:
         audio_config (`WhisperConfig`,  *optional*):
             Custom audio config or dict
@@ -82,28 +68,19 @@ class UltravoxConfig(transformers.PretrainedConfig):
             The LoRA configuration for finetuning the audio model.
         audio_latency_block_size (`int`, *optional*, defaults to `None`):
             The latency block size for simulating audio streaming.
-
-
     Example:
-
     ```python
     >>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
-
     >>> # Initializing an audio encoder config
     >>> audio_config = WhisperConfig()
-
     >>> # Initializing a Llama config
     >>> text_config = LlamaConfig()
-
     >>> # Initializing a default configuration
     >>> configuration = UltravoxConfig(audio_config, text_config)
-
     >>> # Initializing a completely untrained model from the configuration
     >>> model = UltravoxModel(configuration)
-
     >>> # Accessing the model configuration
     >>> configuration = model.config
-
     >>> # Initialize a model from pretrained checkpoints and random projector weights
     >>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
     ```"""
@@ -117,9 +94,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
         audio_model_id: str | None = None,
         text_model_id: str | None = None,
-        llm_only_training: bool = False,
         ignore_index: int = -100,
-        audio_token_index: int | None = None,
         hidden_size: int = 4096,
         stack_factor: int = 8,
         norm_init: float = 0.4,
````
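With the last hunk above, `llm_only_training` and `audio_token_index` disappear from the constructor signature (their assignments are dropped in the hunks below). A sketch of a post-change call site, using only keyword arguments that appear in this diff; the import path is assumed:

```python
from ultravox_config import UltravoxConfig  # module path assumed

config = UltravoxConfig(
    audio_model_id="openai/whisper-tiny",
    text_model_id="meta-llama/Llama-2-7b-chat-hf",
    ignore_index=-100,
    hidden_size=4096,
    stack_factor=8,
    norm_init=0.4,
)

# Note: transformers.PretrainedConfig absorbs unknown **kwargs, so callers
# still passing llm_only_training=... or audio_token_index=... may see the
# values silently stored rather than a TypeError, depending on how the
# remaining **kwargs are forwarded to super().__init__().
```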
```diff
@@ -135,8 +110,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.audio_model_id = audio_model_id
         self.text_model_id = text_model_id
 
-        self.audio_token_index = audio_token_index
-
         self.hidden_size = hidden_size
         self.stack_factor = stack_factor
         self.norm_init = norm_init
@@ -163,7 +136,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.text_config = text_config
         self.audio_config = audio_config
 
-        self.llm_only_training = llm_only_training
         self.text_model_lora_config = (
             text_model_lora_config
             if isinstance(text_model_lora_config, dict)
@@ -176,10 +148,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         )
         self.audio_latency_block_size = audio_latency_block_size
 
-        if hasattr(text_config, "text_config"):
-            text_config.vocab_size = text_config.text_config.vocab_size
-            text_config.hidden_size = text_config.text_config.hidden_size
-
         self.vocab_size = text_config.vocab_size
 
         self.initializer_range = text_config.initializer_range
```
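The final hunk also drops the shim that hoisted `vocab_size` and `hidden_size` from a nested `text_config`, the shape some multimodal wrapper configs use. A sketch of what that shim handled, with a hypothetical wrapper config:

```python
import transformers


class WrapperConfig(transformers.PretrainedConfig):
    """Hypothetical config that nests its language-model settings under
    .text_config: the shape the removed hasattr(...) shim detected."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.text_config = transformers.LlamaConfig()


wrapper = WrapperConfig()

# Before this commit, UltravoxConfig copied the inner values outward:
#     wrapper.vocab_size  = wrapper.text_config.vocab_size
#     wrapper.hidden_size = wrapper.text_config.hidden_size
# After it, `self.vocab_size = text_config.vocab_size` reads the wrapper
# directly, so wrapper-style configs must expose vocab_size and hidden_size
# at the top level themselves.
```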