|
from typing import Any |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.models.qwen2 import Qwen2Config |
|
from transformers.models.siglip import SiglipVisionConfig |
|
|
|
|
|
class NVILALiteConfig(PretrainedConfig):
    """Composite configuration for the ``nvila_lite`` model type.

    Bundles two sub-configurations:

    * ``text_config`` -- a :class:`Qwen2Config`.
    * ``vision_config`` -- a :class:`SiglipVisionConfig`.

    Args:
        text_config: Keyword arguments used to build a ``Qwen2Config``, an
            already-instantiated config object (stored as-is), or ``None``
            to use ``Qwen2Config()`` defaults.
        vision_config: Keyword arguments used to build a
            ``SiglipVisionConfig``, an already-instantiated config object
            (stored as-is), or ``None`` to use ``SiglipVisionConfig()``
            defaults.
        image_token_id: Stored on the config; ``-1`` is used when omitted.
            Presumably the vocabulary id of the image placeholder token --
            confirm against the model/processor code.
        video_token_id: Stored on the config; ``-1`` is used when omitted.
            Presumably the vocabulary id of the video placeholder token --
            confirm against the model/processor code.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "nvila_lite"
    # Maps each sub-config attribute name to the class used to build it.
    sub_configs = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    # NOTE(review): appears to pre-register this class for AutoConfig
    # (custom-code export); confirm against the transformers version in use.
    _auto_class = "AutoConfig"

    def __init__(
        self,
        *,
        text_config: dict[str, Any] | Qwen2Config | None = None,
        vision_config: dict[str, Any] | SiglipVisionConfig | None = None,
        image_token_id: int | None = None,
        video_token_id: int | None = None,
        **kwargs,
    ):
        # Accept ready-made config objects in addition to plain dicts:
        # unpacking a config instance with ``**`` would raise TypeError.
        if text_config is None:
            text_config = Qwen2Config()
        elif not isinstance(text_config, PretrainedConfig):
            text_config = Qwen2Config(**text_config)
        self.text_config = text_config

        if vision_config is None:
            vision_config = SiglipVisionConfig()
        elif not isinstance(vision_config, PretrainedConfig):
            vision_config = SiglipVisionConfig(**vision_config)
        self.vision_config = vision_config

        # -1 stands in for "not set" so the attributes are always integers.
        self.image_token_id = image_token_id if image_token_id is not None else -1
        self.video_token_id = video_token_id if video_token_id is not None else -1

        # Sub-configs and token ids are assigned before the base initializer
        # runs, preserving the original statement order.
        super().__init__(**kwargs)
|
|