AtAndDev committed on
Commit
b950a14
·
verified ·
1 Parent(s): cd1c192

Upload ultravox_config.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ultravox_config.py +0 -32
ultravox_config.py CHANGED
@@ -9,7 +9,6 @@ import transformers
9
  class LoraConfigSimplified:
10
  """
11
  Low Rank Approximation (LoRA) configuration.
12
-
13
  Used for language and audio models separately.
14
  """
15
 
@@ -23,17 +22,6 @@ class LoraConfigSimplified:
23
  unfreeze_layers: Optional[List[str]] = None
24
 
25
 
26
- class LossMaskType(str, Enum):
27
- """Type of loss mask to use."""
28
-
29
- LAST_ASSISTANT = "last_assistant"
30
- """This applies the loss mask up until the last assistant token"""
31
- ALL = "all" # This does not work with KL loss
32
- """No loss mask, all inputs are used for loss"""
33
- AFTER_AUDIO = "after_audio"
34
- """Applies the loss mask up until the audio token"""
35
-
36
-
37
  class LossFunction(str, Enum):
38
  CrossEntropy = "ce"
39
  KL_Divergence = "kl"
@@ -57,10 +45,8 @@ class UltravoxConfig(transformers.PretrainedConfig):
57
  r"""
58
  This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
59
  Ultravox model according to the specified arguments, defining the model architecture.
60
-
61
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
62
  documentation from [`PretrainedConfig`] for more information.
63
-
64
  Args:
65
  audio_config (`WhisperConfig`, *optional*):
66
  Custom audio config or dict
@@ -82,28 +68,19 @@ class UltravoxConfig(transformers.PretrainedConfig):
82
  The LoRA configuration for finetuning the audio model.
83
  audio_latency_block_size (`int`, *optional*, defaults to `None`):
84
  The latency block size for simulating audio streaming.
85
-
86
-
87
  Example:
88
-
89
  ```python
90
  >>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
91
-
92
  >>> # Initializing an audio encoder config
93
  >>> audio_config = WhisperConfig()
94
-
95
  >>> # Initializing a Llama config
96
  >>> text_config = LlamaConfig()
97
-
98
  >>> # Initializing a default configuration
99
  >>> configuration = UltravoxConfig(audio_config, text_config)
100
-
101
  >>> # Initializing a completely untrained model from the configuration
102
  >>> model = UltravoxModel(configuration)
103
-
104
  >>> # Accessing the model configuration
105
  >>> configuration = model.config
106
-
107
  >>> # Initialize a model from pretrained checkpoints and random projector weights
108
  >>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
109
  ```"""
@@ -117,9 +94,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
117
  text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
118
  audio_model_id: str | None = None,
119
  text_model_id: str | None = None,
120
- llm_only_training: bool = False,
121
  ignore_index: int = -100,
122
- audio_token_index: int | None = None,
123
  hidden_size: int = 4096,
124
  stack_factor: int = 8,
125
  norm_init: float = 0.4,
@@ -135,8 +110,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
135
  self.audio_model_id = audio_model_id
136
  self.text_model_id = text_model_id
137
 
138
- self.audio_token_index = audio_token_index
139
-
140
  self.hidden_size = hidden_size
141
  self.stack_factor = stack_factor
142
  self.norm_init = norm_init
@@ -163,7 +136,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
163
  self.text_config = text_config
164
  self.audio_config = audio_config
165
 
166
- self.llm_only_training = llm_only_training
167
  self.text_model_lora_config = (
168
  text_model_lora_config
169
  if isinstance(text_model_lora_config, dict)
@@ -176,10 +148,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
176
  )
177
  self.audio_latency_block_size = audio_latency_block_size
178
 
179
- if hasattr(text_config, "text_config"):
180
- text_config.vocab_size = text_config.text_config.vocab_size
181
- text_config.hidden_size = text_config.text_config.hidden_size
182
-
183
  self.vocab_size = text_config.vocab_size
184
 
185
  self.initializer_range = text_config.initializer_range
 
9
  class LoraConfigSimplified:
10
  """
11
  Low Rank Approximation (LoRA) configuration.
 
12
  Used for language and audio models separately.
13
  """
14
 
 
22
  unfreeze_layers: Optional[List[str]] = None
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
25
  class LossFunction(str, Enum):
26
  CrossEntropy = "ce"
27
  KL_Divergence = "kl"
 
45
  r"""
46
  This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
47
  Ultravox model according to the specified arguments, defining the model architecture.
 
48
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
49
  documentation from [`PretrainedConfig`] for more information.
 
50
  Args:
51
  audio_config (`WhisperConfig`, *optional*):
52
  Custom audio config or dict
 
68
  The LoRA configuration for finetuning the audio model.
69
  audio_latency_block_size (`int`, *optional*, defaults to `None`):
70
  The latency block size for simulating audio streaming.
 
 
71
  Example:
 
72
  ```python
73
  >>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
 
74
  >>> # Initializing an audio encoder config
75
  >>> audio_config = WhisperConfig()
 
76
  >>> # Initializing a Llama config
77
  >>> text_config = LlamaConfig()
 
78
  >>> # Initializing a default configuration
79
  >>> configuration = UltravoxConfig(audio_config, text_config)
 
80
  >>> # Initializing a completely untrained model from the configuration
81
  >>> model = UltravoxModel(configuration)
 
82
  >>> # Accessing the model configuration
83
  >>> configuration = model.config
 
84
  >>> # Initialize a model from pretrained checkpoints and random projector weights
85
  >>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
86
  ```"""
 
94
  text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
95
  audio_model_id: str | None = None,
96
  text_model_id: str | None = None,
 
97
  ignore_index: int = -100,
 
98
  hidden_size: int = 4096,
99
  stack_factor: int = 8,
100
  norm_init: float = 0.4,
 
110
  self.audio_model_id = audio_model_id
111
  self.text_model_id = text_model_id
112
 
 
 
113
  self.hidden_size = hidden_size
114
  self.stack_factor = stack_factor
115
  self.norm_init = norm_init
 
136
  self.text_config = text_config
137
  self.audio_config = audio_config
138
 
 
139
  self.text_model_lora_config = (
140
  text_model_lora_config
141
  if isinstance(text_model_lora_config, dict)
 
148
  )
149
  self.audio_latency_block_size = audio_latency_block_size
150
 
 
 
 
 
151
  self.vocab_size = text_config.vocab_size
152
 
153
  self.initializer_range = text_config.initializer_range