Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

README.md +20 -24
chat_template.jinja +31 -0
config.json +194 -8
model-00001-of-00002.safetensors +2 -2
model-00002-of-00002.safetensors +2 -2
model.safetensors.index.json +0 -0
modeling_kimi_vl.py +71 -7
preprocessor_config.json +26 -0
processor_config.json +6 -0
tokenization_moonshot.py +3 -0
tokenizer_config.json +2 -1

README.md CHANGED Viewed

@@ -1,36 +1,32 @@
 ---
-base_model: moonshotai/Kimi-VL-A3B-Thinking
-license: mit
-pipeline_tag: text-generation
-library_name: mlx
 tags:
 - mlx
 ---
 # mlx-community/Kimi-VL-A3B-Thinking-4bit
-This model [mlx-community/Kimi-VL-A3B-Thinking-4bit](https://huggingface.co/mlx-community/Kimi-VL-A3B-Thinking-4bit) was
-converted to MLX format from [moonshotai/Kimi-VL-A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking)
-using mlx-lm version **0.22.4**.
 ## Use with mlx
 ```bash
-pip install mlx-lm
 ```
-```python
-from mlx_lm import load, generate
-model, tokenizer = load("mlx-community/Kimi-VL-A3B-Thinking-4bit")
-prompt = "hello"
-if tokenizer.chat_template is not None:
-    messages = [{"role": "user", "content": prompt}]
-    prompt = tokenizer.apply_chat_template(
-        messages, add_generation_prompt=True
-    )
-response = generate(model, tokenizer, prompt=prompt, verbose=True)
 ```

 ---
+license: mit
+pipeline_tag: image-text-to-text
+library_name: transformers
+base_model:
+- moonshotai/Kimi-VL-A3B-Thinking
+base_model_relation: quantized
+datasets:
+- OpenGVLab/MMPR-v1.2
+language:
+- multilingual
 tags:
+- kimi_vl
+- custom_code
 - mlx
 ---
 # mlx-community/Kimi-VL-A3B-Thinking-4bit
+This model was converted to MLX format from [`moonshotai/Kimi-VL-A3B-Thinking`](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) using mlx-vlm version **0.1.23**.
+Refer to the [original model card](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) for more details on the model.
 ## Use with mlx
 ```bash
+pip install -U mlx-vlm
 ```
+```bash
+python -m mlx_vlm.generate --model mlx-community/Kimi-VL-A3B-Thinking-4bit --max-tokens 100 --temperature 0.0 --prompt "Describe this image." --image <path_to_image>
 ```

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,31 @@

+{%- for message in messages -%}
+  {%- if loop.first and messages[0]['role'] != 'system' -%}
+    {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
+  {%- endif -%}
+  {%- if message['role'] == 'system' -%}
+    {{'<|im_system|>'}}
+  {%- endif -%}
+  {%- if message['role'] == 'user' -%}
+    {{'<|im_user|>'}}
+  {%- endif -%}
+  {%- if message['role'] == 'assistant' -%}
+    {{'<|im_assistant|>'}}
+  {%- endif -%}
+  {{- message['role'] -}}
+  {{'<|im_middle|>'}}
+  {%- if message['content'] is string -%}
+    {{- message['content'] + '<|im_end|>' -}}
+  {%- else -%}
+    {%- for content in message['content'] -%}
+      {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+        {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
+      {%- else -%}
+        {{content['text']}}
+      {%- endif -%}
+    {%- endfor -%}
+    {{'<|im_end|>'}}
+  {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+  {{'<|im_assistant|>assistant<|im_middle|>'}}
+{%- endif -%}

config.json CHANGED Viewed

@@ -1,4 +1,6 @@
 {
     "architectures": [
         "KimiVLForConditionalGeneration"
     ],
@@ -7,17 +9,60 @@
         "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
         "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
     },
     "ignore_index": -100,
     "media_placeholder_token_id": 163605,
     "model_type": "kimi_vl",
     "quantization": {
         "group_size": 64,
         "bits": 4
     },
-    "quantization_config": {
-        "group_size": 64,
-        "bits": 4
-    },
     "text_config": {
         "vocab_size": 163840,
         "max_position_embeddings": 131072,
@@ -25,6 +70,7 @@
         "intermediate_size": 11264,
         "moe_intermediate_size": 1408,
         "num_hidden_layers": 27,
         "num_attention_heads": 16,
         "n_shared_experts": 2,
         "n_routed_experts": 64,
@@ -55,14 +101,154 @@
         "rope_scaling": null,
         "attention_bias": false,
         "attention_dropout": 0.0,
         "bos_token_id": 163584,
         "pad_token_id": 163839,
         "eos_token_id": 163585,
-        "torch_dtype": "bfloat16",
-        "tie_word_embeddings": false
     },
     "tie_word_embeddings": false,
-    "torch_dtype": "bfloat16",
-    "transformers_version": "4.50.3",
     "vocab_size": 163840
 }

 {
+    "_attn_implementation_autoset": false,
+    "add_cross_attention": false,
     "architectures": [
         "KimiVLForConditionalGeneration"
     ],
         "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
         "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
     },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+    },
     "ignore_index": -100,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
     "media_placeholder_token_id": 163605,
+    "min_length": 0,
     "model_type": "kimi_vl",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
     "quantization": {
         "group_size": 64,
         "bits": 4
     },
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
     "text_config": {
         "vocab_size": 163840,
         "max_position_embeddings": 131072,
         "intermediate_size": 11264,
         "moe_intermediate_size": 1408,
         "num_hidden_layers": 27,
+        "num_nextn_predict_layers": 1,
         "num_attention_heads": 16,
         "n_shared_experts": 2,
         "n_routed_experts": 64,
         "rope_scaling": null,
         "attention_bias": false,
         "attention_dropout": 0.0,
+        "return_dict": true,
+        "output_hidden_states": false,
+        "output_attentions": false,
+        "torchscript": false,
+        "torch_dtype": "bfloat16",
+        "use_bfloat16": false,
+        "tf_legacy_loss": false,
+        "pruned_heads": {},
+        "tie_word_embeddings": false,
+        "chunk_size_feed_forward": 0,
+        "is_encoder_decoder": false,
+        "is_decoder": false,
+        "cross_attention_hidden_size": null,
+        "add_cross_attention": false,
+        "tie_encoder_decoder": false,
+        "max_length": 20,
+        "min_length": 0,
+        "do_sample": false,
+        "early_stopping": false,
+        "num_beams": 1,
+        "num_beam_groups": 1,
+        "diversity_penalty": 0.0,
+        "temperature": 1.0,
+        "top_k": 50,
+        "top_p": 1.0,
+        "typical_p": 1.0,
+        "repetition_penalty": 1.0,
+        "length_penalty": 1.0,
+        "no_repeat_ngram_size": 0,
+        "encoder_no_repeat_ngram_size": 0,
+        "bad_words_ids": null,
+        "num_return_sequences": 1,
+        "output_scores": false,
+        "return_dict_in_generate": false,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "remove_invalid_values": false,
+        "exponential_decay_length_penalty": null,
+        "suppress_tokens": null,
+        "begin_suppress_tokens": null,
+        "architectures": null,
+        "finetuning_task": null,
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "tokenizer_class": null,
+        "prefix": null,
         "bos_token_id": 163584,
         "pad_token_id": 163839,
         "eos_token_id": 163585,
+        "sep_token_id": null,
+        "decoder_start_token_id": null,
+        "task_specific_params": null,
+        "problem_type": null,
+        "_name_or_path": "",
+        "_attn_implementation_autoset": false,
+        "model_type": "deepseek_v3"
     },
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
     "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.52.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vision_config": {
+        "return_dict": true,
+        "output_hidden_states": false,
+        "output_attentions": false,
+        "torchscript": false,
+        "torch_dtype": "bfloat16",
+        "use_bfloat16": false,
+        "tf_legacy_loss": false,
+        "pruned_heads": {},
+        "tie_word_embeddings": true,
+        "chunk_size_feed_forward": 0,
+        "is_encoder_decoder": false,
+        "is_decoder": false,
+        "cross_attention_hidden_size": null,
+        "add_cross_attention": false,
+        "tie_encoder_decoder": false,
+        "max_length": 20,
+        "min_length": 0,
+        "do_sample": false,
+        "early_stopping": false,
+        "num_beams": 1,
+        "num_beam_groups": 1,
+        "diversity_penalty": 0.0,
+        "temperature": 1.0,
+        "top_k": 50,
+        "top_p": 1.0,
+        "typical_p": 1.0,
+        "repetition_penalty": 1.0,
+        "length_penalty": 1.0,
+        "no_repeat_ngram_size": 0,
+        "encoder_no_repeat_ngram_size": 0,
+        "bad_words_ids": null,
+        "num_return_sequences": 1,
+        "output_scores": false,
+        "return_dict_in_generate": false,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "remove_invalid_values": false,
+        "exponential_decay_length_penalty": null,
+        "suppress_tokens": null,
+        "begin_suppress_tokens": null,
+        "architectures": null,
+        "finetuning_task": null,
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "tokenizer_class": null,
+        "prefix": null,
+        "bos_token_id": null,
+        "pad_token_id": null,
+        "eos_token_id": null,
+        "sep_token_id": null,
+        "decoder_start_token_id": null,
+        "task_specific_params": null,
+        "problem_type": null,
+        "_name_or_path": "",
+        "_attn_implementation_autoset": false,
+        "model_type": "moonvit",
+        "patch_size": 14,
+        "init_pos_emb_height": 64,
+        "init_pos_emb_width": 64,
+        "num_hidden_layers": 27,
+        "num_attention_heads": 16,
+        "hidden_size": 1152,
+        "intermediate_size": 4304,
+        "merge_kernel_size": [
+            2,
+            2
+        ],
+        "skip_vision": true
+    },
     "vocab_size": 163840
 }

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc1b4b9e7cdf18df281b554b721e8b3da7916fd36a7673b1396b83df64de08a7
-size 5284632734

 version https://git-lfs.github.com/spec/v1
+oid sha256:9fa24384962d98e76557bbb00193e2ac40aa6456f617cbd79730eb63787c40a4
+size 5356237611

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:861f9444d7ff364daa0ba01ea1d00f07c54fffc9ff780265caaeb7836b29522c
-size 3698140150

 version https://git-lfs.github.com/spec/v1
+oid sha256:63ce34dfdd6f4ca593e196cfffe6336c304984d7c870112c9df86b6f3b719433
+size 4477574765

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

modeling_kimi_vl.py CHANGED Viewed

@@ -55,10 +55,8 @@ import torch.distributed as dist
 from torch.nn import CrossEntropyLoss
 from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    GenerationMixin,
-)
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import (
@@ -906,6 +904,7 @@ class MoEGate(nn.Module):
         self.n_routed_experts = config.n_routed_experts
         self.routed_scaling_factor = config.routed_scaling_factor
         self.scoring_func = config.scoring_func
         self.seq_aux = config.seq_aux
         self.topk_method = config.topk_method
         self.n_group = config.n_group
@@ -972,6 +971,10 @@ class MoEGate(nn.Module):
             )  # [n, e]
             _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
             topk_weight = scores.gather(1, topk_idx)
         else:
             raise NotImplementedError(
                 f"insupportable TopK function for MoE gating: {self.topk_method}"
@@ -985,7 +988,57 @@ class MoEGate(nn.Module):
             topk_weight * self.routed_scaling_factor
         )  # must multiply the scaling factor
-        return topk_idx, topk_weight
 class DeepseekV3MoE(nn.Module):
@@ -1038,9 +1091,20 @@ class DeepseekV3MoE(nn.Module):
     def forward(self, hidden_states):
         identity = hidden_states
         orig_shape = hidden_states.shape
-        topk_idx, topk_weight = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-        if not self.training:
             y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
         if self.config.n_shared_experts is not None:
             y = y + self.shared_experts(identity)

 from torch.nn import CrossEntropyLoss
 from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
 from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_utils import PreTrainedModel
+from transformers.generation.utils import GenerationMixin
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import (
         self.n_routed_experts = config.n_routed_experts
         self.routed_scaling_factor = config.routed_scaling_factor
         self.scoring_func = config.scoring_func
+        self.alpha = config.aux_loss_alpha
         self.seq_aux = config.seq_aux
         self.topk_method = config.topk_method
         self.n_group = config.n_group
             )  # [n, e]
             _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
             topk_weight = scores.gather(1, topk_idx)
+        elif self.topk_method == "greedy":
+            topk_weight, topk_idx = torch.topk(
+                scores, k=self.top_k, dim=-1, sorted=False
+            )
         else:
             raise NotImplementedError(
                 f"insupportable TopK function for MoE gating: {self.topk_method}"
             topk_weight * self.routed_scaling_factor
         )  # must multiply the scaling factor
+        if self.training and self.alpha > 0.0:
+            scores_for_aux = scores
+            aux_topk = self.top_k
+            # always compute aux loss based on the naive greedy topk method
+            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
+            if self.seq_aux:
+                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
+                ce = torch.zeros(
+                    bsz, self.n_routed_experts, device=hidden_states.device
+                )
+                ce.scatter_add_(
+                    1,
+                    topk_idx_for_aux_loss,
+                    torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device),
+                ).div_(seq_len * aux_topk / self.n_routed_experts)
+                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(
+                    dim=1
+                ).mean() * self.alpha
+            else:
+                mask_ce = F.one_hot(
+                    topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts
+                )
+                ce = mask_ce.float().mean(0)
+                Pi = scores_for_aux.mean(0)
+                fi = ce * self.n_routed_experts
+                aux_loss = (Pi * fi).sum() * self.alpha
+        else:
+            aux_loss = None
+        return topk_idx, topk_weight, aux_loss
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = loss.requires_grad
+        return x
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+        return grad_output, grad_loss
 class DeepseekV3MoE(nn.Module):
     def forward(self, hidden_states):
         identity = hidden_states
         orig_shape = hidden_states.shape
+        topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        if self.training:
+            flat_topk_idx = topk_idx.view(-1)
+            hidden_states = hidden_states.repeat_interleave(
+                self.num_experts_per_tok, dim=0
+            )
+            y = torch.empty_like(hidden_states)
+            for i, expert in enumerate(self.experts):
+                y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
+            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+            y = y.to(hidden_states.dtype).view(*orig_shape)
+            y = AddAuxiliaryLoss.apply(y, aux_loss)
+        else:
             y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
         if self.config.n_shared_experts is not None:
             y = y + self.shared_experts(identity)

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
+    "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+  },
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "KimiVLImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "in_token_limit": 4096,
+  "merge_kernel_size": [
+    2,
+    2
+  ],
+  "num_pooled_tokens": 1024,
+  "pad_input": true,
+  "patch_size": 14,
+  "processor_class": "KimiVLProcessor"
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+  },
+  "processor_class": "KimiVLProcessor"
+}

tokenization_moonshot.py CHANGED Viewed

@@ -16,6 +16,7 @@ from shutil import copyfile
 from tiktoken.load import load_tiktoken_bpe
 from tokenizers import AddedToken
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@@ -229,6 +230,8 @@ class TikTokenTokenizer(PreTrainedTokenizer):
         if len(kwargs) > 0:
             return super().decode(token_ids, **kwargs)
         if type(token_ids) is int:
             token_ids = [token_ids]

 from tiktoken.load import load_tiktoken_bpe
 from tokenizers import AddedToken
 from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import to_py_obj
 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
         if len(kwargs) > 0:
             return super().decode(token_ids, **kwargs)
+        token_ids = to_py_obj(token_ids)
         if type(token_ids) is int:
             token_ids = [token_ids]

tokenizer_config.json CHANGED Viewed

@@ -117,18 +117,19 @@
     "<|media_pad|>"
   ],
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_moonshot.TikTokenTokenizer",
       null
     ]
   },
   "bos_token": "[BOS]",
-  "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "[EOS]",
   "extra_special_tokens": {},
   "model_max_length": 1048576,
   "pad_token": "[PAD]",
   "tokenizer_class": "TikTokenTokenizer",
   "unk_token": "[UNK]"
 }

     "<|media_pad|>"
   ],
   "auto_map": {
+    "AutoProcessor": "processing_kimi_vl.KimiVLProcessor",
     "AutoTokenizer": [
       "tokenization_moonshot.TikTokenTokenizer",
       null
     ]
   },
   "bos_token": "[BOS]",
   "clean_up_tokenization_spaces": false,
   "eos_token": "[EOS]",
   "extra_special_tokens": {},
   "model_max_length": 1048576,
   "pad_token": "[PAD]",
+  "processor_class": "KimiVLProcessor",
   "tokenizer_class": "TikTokenTokenizer",
   "unk_token": "[UNK]"
 }