|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
|
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config |
|
|
|
|
|
class OpenCUAConfig(PretrainedConfig):
    """Configuration for the OpenCUA-2.5-32B model.

    Bundles a vision sub-configuration and a text sub-configuration together
    with a few token-level settings.

    Args:
        vision_config: Settings for the vision model. A ``dict`` is expanded
            as keyword arguments into ``Qwen2_5_VLVisionConfig``; any other
            value (including ``None``) is stored unchanged.
        text_config: Settings for the text model. A ``dict`` is expanded as
            keyword arguments into ``Qwen2Config``; any other value
            (including ``None``) is stored unchanged.
        ignore_index: Stored as ``self.ignore_index``. Presumably the label
            value skipped when computing the loss -- confirm against the
            modeling code.
        media_placeholder_token_id: Stored as
            ``self.media_placeholder_token_id``. Presumably the id of the
            token that marks where media is inserted -- confirm against the
            tokenizer.
        pad_token_id: The token ID to use for padding; forwarded to
            ``PretrainedConfig.__init__``.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "opencua"

    def __init__(
        self,
        vision_config: dict | Qwen2_5_VLVisionConfig | None = None,
        text_config: dict | Qwen2Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 151664,
        pad_token_id: int = 0,
        **kwargs,
    ):
        # Plain dicts are promoted to their config classes; everything else
        # (an already-built config object, or None) is kept as given.
        self.vision_config = (
            Qwen2_5_VLVisionConfig(**vision_config)
            if isinstance(vision_config, dict)
            else vision_config
        )
        self.text_config = (
            Qwen2Config(**text_config)
            if isinstance(text_config, dict)
            else text_config
        )

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        # The base initializer runs last, after the model-specific attributes
        # above are already set on the instance.
        super().__init__(pad_token_id=pad_token_id, **kwargs)
|
|