cpatonn commited on 10 days ago

Commit

7900294

verified ·

1 Parent(s): c27aa70

Upload folder using huggingface_hub

Browse files

Files changed (30) hide show

.gitattributes +2 -0
README.md +258 -0
chat_template.jinja +150 -0
config.json +662 -0
generation_config.json +11 -0
get_size.py +14 -0
images/README +1 -0
images/cogito-v2-109b-benchmarks.png +3 -0
images/deep-cogito-logo.png +0 -0
model-00001-of-00014.safetensors +3 -0
model-00002-of-00014.safetensors +3 -0
model-00003-of-00014.safetensors +3 -0
model-00004-of-00014.safetensors +3 -0
model-00005-of-00014.safetensors +3 -0
model-00006-of-00014.safetensors +3 -0
model-00007-of-00014.safetensors +3 -0
model-00008-of-00014.safetensors +3 -0
model-00009-of-00014.safetensors +3 -0
model-00010-of-00014.safetensors +3 -0
model-00011-of-00014.safetensors +3 -0
model-00012-of-00014.safetensors +3 -0
model-00013-of-00014.safetensors +3 -0
model-00014-of-00014.safetensors +3 -0
model.safetensors.index.json +0 -0
preprocessor_config.json +34 -0
processor_config.json +6 -0
recipe.yaml +11 -0
special_tokens_map.json +23 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/cogito-v2-109b-benchmarks.png filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,258 @@

+---
+license: llama4
+library_name: transformers
+base_model:
+- meta-llama/Llama-4-Scout-17B-16E
+---
+<p align="center">
+  <img src="images/deep-cogito-logo.png" alt="Logo" width="40%">
+</p>
+# Cogito v2 preview - 109B MoE
+[Blog Post](https://www.deepcogito.com/research/cogito-v2-preview)
+The Cogito v2 LLMs are instruction tuned generative models. All models are released under an open license for commercial use.
+- Cogito v2 models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+- The LLMs are trained using **Iterated Distillation and Amplification (IDA)** - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+- The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
+  - In both standard and reasoning modes, Cogito v2-preview models outperform their size equivalent counterparts on common industry benchmarks.
+- This model is trained in over 30 languages and supports long contexts (up to 10M tokens).
+# Evaluations
+Here is the model performance on some standard industry benchmarks:
+<p align="left">
+  <img src="images/cogito-v2-109b-benchmarks.png" alt="Logo" width="90%">
+</p>
+For detailed evaluations, please refer to the [Blog Post](https://www.deepcogito.com/research/cogito-v2-preview).
+# Usage
+Here is a snippet below for usage with Transformers:
+```python
+import transformers
+import torch
+model_id = "deepcogito/cogito-v2-preview-llama-109B-MoE"
+pipeline = transformers.pipeline(
+    "text-generation",
+    model=model_id,
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device_map="auto",
+)
+messages = [
+    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+    {"role": "user", "content": "Give me a short introduction to LLMs."},
+]
+outputs = pipeline(
+    messages,
+    max_new_tokens=512,
+)
+print(outputs[0]["generated_text"][-1])
+```
+## Implementing extended thinking
+- By default, the model will answer in the standard mode.
+- To enable thinking, use either of the following two methods:
+  - Set `enable_thinking=True` while applying the chat template.
+  - Add a specific system prompt, along with prefilling the response with "\<think\>\n".
+**NOTE: Unlike Cogito v1 models, we initiate the response with "\<think\>\n" at the beginning of every output when reasoning is enabled. This is because hybrid models can be brittle at times (<0.1% of the cases), and adding a "\<think\>\n" ensures that the model does indeed respect thinking.**
+### Method 1 - Set enable_thinking=True in the tokenizer
+If you are using Hugging Face tokenizers, then you can simply add the argument `enable_thinking=True` when applying the chat template (this option is supported by the chat template).
+Here is an example -
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "deepcogito/cogito-v2-preview-llama-109B-MoE"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+prompt = "Give me a short introduction to LLMs."
+messages = [
+    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=512
+)
+generated_ids = [
+    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+]
+response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(response)
+```
+### Method 2 - Add a specific system prompt, along with prefilling the response with "\<think\>\n".
+To enable thinking using this method, you need to do two things -
+Step 1 - Simply use this in the system prompt `system_instruction = 'Enable deep thinking subroutine.'`
+If you already have a system_instruction, then use `system_instruction = 'Enable deep thinking subroutine.' + '\n\n' + system_instruction`.
+Step 2 - Prefill the response with the tokens `"<think>\n"`.
+Here is an example -
+```python
+import transformers
+import torch
+model_name = "deepcogito/cogito-v2-preview-llama-109B-MoE"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Step 1 - Add deep thinking instruction.
+DEEP_THINKING_INSTRUCTION = "Enable deep thinking subroutine."
+messages = [
+    {"role": "system", "content": DEEP_THINKING_INSTRUCTION},
+    {"role": "user", "content": "Write a bash script that takes a matrix represented as a string with format '[1,2],[3,4],[5,6]' and prints the transpose in the same format."},
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+# Step 2 - Prefill response with "<think>\n".
+text += "<think>\n"
+# Now, continue as usual.
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=512
+)
+generated_ids = [
+    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+]
+response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(response)
+```
+Similarly, if you have a system prompt, you can prepend the `DEEP_THINKING_INSTRUCTION` to it in this way -
+```python
+DEEP_THINKING_INSTRUCTION = "Enable deep thinking subroutine."
+system_prompt = "Reply to each prompt with only the actual code - no explanations."
+prompt = "Write a bash script that takes a matrix represented as a string with format '[1,2],[3,4],[5,6]' and prints the transpose in the same format."
+messages = [
+    {"role": "system", "content": DEEP_THINKING_INSTRUCTION + '\n\n' + system_prompt},
+    {"role": "user", "content": prompt}
+]
+```
+# Tool Calling
+Cogito models support tool calling (single, parallel, multiple and parallel_multiple) both in standard and extended thinking mode.
+Here is a snippet -
+```python
+# First, define a tool
+def get_current_temperature(location: str) -> float:
+    """
+    Get the current temperature at a location.
+    Args:
+        location: The location to get the temperature for, in the format "City, Country"
+    Returns:
+        The current temperature at the specified location in the specified units, as a float.
+    """
+    return 22.  # A real function should probably actually get the temperature!
+# Next, create a chat and apply the chat template
+messages = [
+  {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
+]
+model_inputs = tokenizer.apply_chat_template(messages, tools=[get_current_temperature], add_generation_prompt=True)
+text = tokenizer.apply_chat_template(messages, tools=[get_current_temperature], add_generation_prompt=True, tokenize=False)
+inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=512)
+output_text = tokenizer.batch_decode(outputs)[0][len(text):]
+print(output_text)
+```
+This will result in the output -
+```
+<tool_call>
+{"name": "get_current_temperature", "arguments": {"location": "Paris, France"}}
+</tool_call><|eot|>
+```
+You can then generate text from this input as normal. If the model generates a tool call, you should add it to the chat like so:
+```python
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
+```
+and then call the tool and append the result, with the `tool` role, like so:
+```python
+messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"})
+```
+After that, you can `generate()` again to let the model use the tool result in the chat:
+```python
+text = tokenizer.apply_chat_template(messages, tools=[get_current_temperature], add_generation_prompt=True, tokenize=False)
+inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=512)
+output_text = tokenizer.batch_decode(outputs)[0][len(text):]
+```
+This should result in the string -
+```
+'The current temperature in Paris is 22.0 degrees.<|eot|>'
+```
+## License
+This repository and the model weights are licensed under the [Llama 4 Community License Agreement](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) (Llama models' default license agreement).
+## Contact
+If you would like to reach out to our team, send an email to [[email protected]]([email protected]).

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,150 @@

+{{- bos_token }}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{%- if not enable_thinking is defined %}
+    {%- set enable_thinking = false %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- if messages[0]['content'] is string %}
+        {%- set system_message = messages[0]['content']|trim %}
+    {%- else %}
+        {%- set system_message = messages[0]['content'][0]['text']|trim %}
+    {%- endif %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- Set the system message. If enable_thinking is true, add the "Enable deep thinking subroutine." #}
+{%- if enable_thinking %}
+    {%- if system_message != "" %}
+        {%- set system_message = "Enable deep thinking subroutine.
+" ~ system_message %}
+    {%- else %}
+        {%- set system_message = "Enable deep thinking subroutine." %}
+    {%- endif %}
+{%- endif %}
+{#- System message + tools #}
+{%- if tools is not none or system_message != '' %}
+    {{- "<|header_start|>system<|header_end|>
+" }}
+    {{- system_message }}
+    {%- if tools is not none %}
+        {%- if system_message != "" %}
+            {{- "
+" }}
+        {%- endif %}
+        {{- "Available Tools:
+" }}
+        {%- for t in tools %}
+            {{- t | tojson(indent=4) }}
+            {{- "
+" }}
+        {%- endfor %}
+    {%- endif %}
+    {{- "<|eot|>" }}
+{%- endif %}
+{#- Rest of the messages #}
+{%- for message in messages %}
+    {#- Case 1 - Usual, non tool related message. #}
+    {%- if not (message.role == "ipython" or message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}
+        {{- '<|header_start|>' + message['role'] + '<|header_end|>
+' }}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "<|eot|>" }}
+    {#- Case 2 - the response is from the assistant, but has a tool call returned. #}
+    {%- elif message.tool_calls is defined and message.tool_calls is not none %}
+        {{- "<|header_start|>assistant<|header_end|>
+" }}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+            {%- if message['content'] | trim != "" %}
+                {{- "
+" }}
+            {%- endif %}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] }}
+                    {%- if content['text'] | trim != "" %}
+                        {{- "
+" }}
+                    {%- endif %}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "[" }}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set out = tool_call.function|tojson %}
+                {%- if not tool_call.id is defined %}
+                    {{- out }}
+                {%- else %}
+                    {{- out[:-1] }}
+                    {{- ', "id": "' + tool_call.id + '"}' }}
+                {%- endif %}
+            {%- else %}
+                {{- tool_call|tojson }}
+            {%- endif %}
+            {%- if not loop.last %}
+                {{- ", " }}
+            {%- else %}
+                {{- "]<|eot|>" }}
+            {%- endif %}
+        {%- endfor %}
+    {#- Case 3 - the response is from a tool call. #}
+    {%- elif message.role == "ipython" or message["role"] == "tool_results" or message["role"] == "tool" %}
+        {{- "<|header_start|>ipython<|header_end|>
+" }}
+        {%- if message.tool_call_id is defined and message.tool_call_id != '' %}
+            {{- '{"content": ' }}
+            {%- if message.content is mapping or (message.content is iterable and not message.content is string) %}
+                {{- message.content | tojson }}
+            {%- else %}
+                {{- '"' ~ message.content ~ '"' }}
+            {%- endif %}
+            {{- ', "call_id": "' ~ message.tool_call_id ~ '"}' }}
+        {%- else %}
+            {%- if message.content is mapping or (message.content is iterable and not message.content is string) %}
+                {{- message.content | tojson }}
+            {%- else %}
+                {{- message.content }}
+            {%- endif %}
+        {%- endif %}
+        {{- "<|eot|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|header_start|>assistant<|header_end|>\n\n' }}
+    {%- if enable_thinking %}
+        {{- '<think>\n' }}
+    {%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,662 @@

+{
+  "architectures": [
+    "Llama4ForConditionalGeneration"
+  ],
+  "boi_token_index": 200080,
+  "eoi_token_index": 200081,
+  "image_token_index": 200092,
+  "model_type": "llama4",
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "input_activations": null,
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": 128,
+          "num_bits": 4,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "group",
+          "symmetric": true,
+          "type": "int"
+        }
+      }
+    },
+    "format": "pack-quantized",
+    "global_compression_ratio": null,
+    "ignore": [
+      "vision_model.patch_embedding.linear",
+      "vision_model.model.layers.0.self_attn.q_proj",
+      "vision_model.model.layers.0.self_attn.k_proj",
+      "vision_model.model.layers.0.self_attn.v_proj",
+      "vision_model.model.layers.0.self_attn.o_proj",
+      "vision_model.model.layers.0.mlp.fc1",
+      "vision_model.model.layers.0.mlp.fc2",
+      "vision_model.model.layers.1.self_attn.q_proj",
+      "vision_model.model.layers.1.self_attn.k_proj",
+      "vision_model.model.layers.1.self_attn.v_proj",
+      "vision_model.model.layers.1.self_attn.o_proj",
+      "vision_model.model.layers.1.mlp.fc1",
+      "vision_model.model.layers.1.mlp.fc2",
+      "vision_model.model.layers.2.self_attn.q_proj",
+      "vision_model.model.layers.2.self_attn.k_proj",
+      "vision_model.model.layers.2.self_attn.v_proj",
+      "vision_model.model.layers.2.self_attn.o_proj",
+      "vision_model.model.layers.2.mlp.fc1",
+      "vision_model.model.layers.2.mlp.fc2",
+      "vision_model.model.layers.3.self_attn.q_proj",
+      "vision_model.model.layers.3.self_attn.k_proj",
+      "vision_model.model.layers.3.self_attn.v_proj",
+      "vision_model.model.layers.3.self_attn.o_proj",
+      "vision_model.model.layers.3.mlp.fc1",
+      "vision_model.model.layers.3.mlp.fc2",
+      "vision_model.model.layers.4.self_attn.q_proj",
+      "vision_model.model.layers.4.self_attn.k_proj",
+      "vision_model.model.layers.4.self_attn.v_proj",
+      "vision_model.model.layers.4.self_attn.o_proj",
+      "vision_model.model.layers.4.mlp.fc1",
+      "vision_model.model.layers.4.mlp.fc2",
+      "vision_model.model.layers.5.self_attn.q_proj",
+      "vision_model.model.layers.5.self_attn.k_proj",
+      "vision_model.model.layers.5.self_attn.v_proj",
+      "vision_model.model.layers.5.self_attn.o_proj",
+      "vision_model.model.layers.5.mlp.fc1",
+      "vision_model.model.layers.5.mlp.fc2",
+      "vision_model.model.layers.6.self_attn.q_proj",
+      "vision_model.model.layers.6.self_attn.k_proj",
+      "vision_model.model.layers.6.self_attn.v_proj",
+      "vision_model.model.layers.6.self_attn.o_proj",
+      "vision_model.model.layers.6.mlp.fc1",
+      "vision_model.model.layers.6.mlp.fc2",
+      "vision_model.model.layers.7.self_attn.q_proj",
+      "vision_model.model.layers.7.self_attn.k_proj",
+      "vision_model.model.layers.7.self_attn.v_proj",
+      "vision_model.model.layers.7.self_attn.o_proj",
+      "vision_model.model.layers.7.mlp.fc1",
+      "vision_model.model.layers.7.mlp.fc2",
+      "vision_model.model.layers.8.self_attn.q_proj",
+      "vision_model.model.layers.8.self_attn.k_proj",
+      "vision_model.model.layers.8.self_attn.v_proj",
+      "vision_model.model.layers.8.self_attn.o_proj",
+      "vision_model.model.layers.8.mlp.fc1",
+      "vision_model.model.layers.8.mlp.fc2",
+      "vision_model.model.layers.9.self_attn.q_proj",
+      "vision_model.model.layers.9.self_attn.k_proj",
+      "vision_model.model.layers.9.self_attn.v_proj",
+      "vision_model.model.layers.9.self_attn.o_proj",
+      "vision_model.model.layers.9.mlp.fc1",
+      "vision_model.model.layers.9.mlp.fc2",
+      "vision_model.model.layers.10.self_attn.q_proj",
+      "vision_model.model.layers.10.self_attn.k_proj",
+      "vision_model.model.layers.10.self_attn.v_proj",
+      "vision_model.model.layers.10.self_attn.o_proj",
+      "vision_model.model.layers.10.mlp.fc1",
+      "vision_model.model.layers.10.mlp.fc2",
+      "vision_model.model.layers.11.self_attn.q_proj",
+      "vision_model.model.layers.11.self_attn.k_proj",
+      "vision_model.model.layers.11.self_attn.v_proj",
+      "vision_model.model.layers.11.self_attn.o_proj",
+      "vision_model.model.layers.11.mlp.fc1",
+      "vision_model.model.layers.11.mlp.fc2",
+      "vision_model.model.layers.12.self_attn.q_proj",
+      "vision_model.model.layers.12.self_attn.k_proj",
+      "vision_model.model.layers.12.self_attn.v_proj",
+      "vision_model.model.layers.12.self_attn.o_proj",
+      "vision_model.model.layers.12.mlp.fc1",
+      "vision_model.model.layers.12.mlp.fc2",
+      "vision_model.model.layers.13.self_attn.q_proj",
+      "vision_model.model.layers.13.self_attn.k_proj",
+      "vision_model.model.layers.13.self_attn.v_proj",
+      "vision_model.model.layers.13.self_attn.o_proj",
+      "vision_model.model.layers.13.mlp.fc1",
+      "vision_model.model.layers.13.mlp.fc2",
+      "vision_model.model.layers.14.self_attn.q_proj",
+      "vision_model.model.layers.14.self_attn.k_proj",
+      "vision_model.model.layers.14.self_attn.v_proj",
+      "vision_model.model.layers.14.self_attn.o_proj",
+      "vision_model.model.layers.14.mlp.fc1",
+      "vision_model.model.layers.14.mlp.fc2",
+      "vision_model.model.layers.15.self_attn.q_proj",
+      "vision_model.model.layers.15.self_attn.k_proj",
+      "vision_model.model.layers.15.self_attn.v_proj",
+      "vision_model.model.layers.15.self_attn.o_proj",
+      "vision_model.model.layers.15.mlp.fc1",
+      "vision_model.model.layers.15.mlp.fc2",
+      "vision_model.model.layers.16.self_attn.q_proj",
+      "vision_model.model.layers.16.self_attn.k_proj",
+      "vision_model.model.layers.16.self_attn.v_proj",
+      "vision_model.model.layers.16.self_attn.o_proj",
+      "vision_model.model.layers.16.mlp.fc1",
+      "vision_model.model.layers.16.mlp.fc2",
+      "vision_model.model.layers.17.self_attn.q_proj",
+      "vision_model.model.layers.17.self_attn.k_proj",
+      "vision_model.model.layers.17.self_attn.v_proj",
+      "vision_model.model.layers.17.self_attn.o_proj",
+      "vision_model.model.layers.17.mlp.fc1",
+      "vision_model.model.layers.17.mlp.fc2",
+      "vision_model.model.layers.18.self_attn.q_proj",
+      "vision_model.model.layers.18.self_attn.k_proj",
+      "vision_model.model.layers.18.self_attn.v_proj",
+      "vision_model.model.layers.18.self_attn.o_proj",
+      "vision_model.model.layers.18.mlp.fc1",
+      "vision_model.model.layers.18.mlp.fc2",
+      "vision_model.model.layers.19.self_attn.q_proj",
+      "vision_model.model.layers.19.self_attn.k_proj",
+      "vision_model.model.layers.19.self_attn.v_proj",
+      "vision_model.model.layers.19.self_attn.o_proj",
+      "vision_model.model.layers.19.mlp.fc1",
+      "vision_model.model.layers.19.mlp.fc2",
+      "vision_model.model.layers.20.self_attn.q_proj",
+      "vision_model.model.layers.20.self_attn.k_proj",
+      "vision_model.model.layers.20.self_attn.v_proj",
+      "vision_model.model.layers.20.self_attn.o_proj",
+      "vision_model.model.layers.20.mlp.fc1",
+      "vision_model.model.layers.20.mlp.fc2",
+      "vision_model.model.layers.21.self_attn.q_proj",
+      "vision_model.model.layers.21.self_attn.k_proj",
+      "vision_model.model.layers.21.self_attn.v_proj",
+      "vision_model.model.layers.21.self_attn.o_proj",
+      "vision_model.model.layers.21.mlp.fc1",
+      "vision_model.model.layers.21.mlp.fc2",
+      "vision_model.model.layers.22.self_attn.q_proj",
+      "vision_model.model.layers.22.self_attn.k_proj",
+      "vision_model.model.layers.22.self_attn.v_proj",
+      "vision_model.model.layers.22.self_attn.o_proj",
+      "vision_model.model.layers.22.mlp.fc1",
+      "vision_model.model.layers.22.mlp.fc2",
+      "vision_model.model.layers.23.self_attn.q_proj",
+      "vision_model.model.layers.23.self_attn.k_proj",
+      "vision_model.model.layers.23.self_attn.v_proj",
+      "vision_model.model.layers.23.self_attn.o_proj",
+      "vision_model.model.layers.23.mlp.fc1",
+      "vision_model.model.layers.23.mlp.fc2",
+      "vision_model.model.layers.24.self_attn.q_proj",
+      "vision_model.model.layers.24.self_attn.k_proj",
+      "vision_model.model.layers.24.self_attn.v_proj",
+      "vision_model.model.layers.24.self_attn.o_proj",
+      "vision_model.model.layers.24.mlp.fc1",
+      "vision_model.model.layers.24.mlp.fc2",
+      "vision_model.model.layers.25.self_attn.q_proj",
+      "vision_model.model.layers.25.self_attn.k_proj",
+      "vision_model.model.layers.25.self_attn.v_proj",
+      "vision_model.model.layers.25.self_attn.o_proj",
+      "vision_model.model.layers.25.mlp.fc1",
+      "vision_model.model.layers.25.mlp.fc2",
+      "vision_model.model.layers.26.self_attn.q_proj",
+      "vision_model.model.layers.26.self_attn.k_proj",
+      "vision_model.model.layers.26.self_attn.v_proj",
+      "vision_model.model.layers.26.self_attn.o_proj",
+      "vision_model.model.layers.26.mlp.fc1",
+      "vision_model.model.layers.26.mlp.fc2",
+      "vision_model.model.layers.27.self_attn.q_proj",
+      "vision_model.model.layers.27.self_attn.k_proj",
+      "vision_model.model.layers.27.self_attn.v_proj",
+      "vision_model.model.layers.27.self_attn.o_proj",
+      "vision_model.model.layers.27.mlp.fc1",
+      "vision_model.model.layers.27.mlp.fc2",
+      "vision_model.model.layers.28.self_attn.q_proj",
+      "vision_model.model.layers.28.self_attn.k_proj",
+      "vision_model.model.layers.28.self_attn.v_proj",
+      "vision_model.model.layers.28.self_attn.o_proj",
+      "vision_model.model.layers.28.mlp.fc1",
+      "vision_model.model.layers.28.mlp.fc2",
+      "vision_model.model.layers.29.self_attn.q_proj",
+      "vision_model.model.layers.29.self_attn.k_proj",
+      "vision_model.model.layers.29.self_attn.v_proj",
+      "vision_model.model.layers.29.self_attn.o_proj",
+      "vision_model.model.layers.29.mlp.fc1",
+      "vision_model.model.layers.29.mlp.fc2",
+      "vision_model.model.layers.30.self_attn.q_proj",
+      "vision_model.model.layers.30.self_attn.k_proj",
+      "vision_model.model.layers.30.self_attn.v_proj",
+      "vision_model.model.layers.30.self_attn.o_proj",
+      "vision_model.model.layers.30.mlp.fc1",
+      "vision_model.model.layers.30.mlp.fc2",
+      "vision_model.model.layers.31.self_attn.q_proj",
+      "vision_model.model.layers.31.self_attn.k_proj",
+      "vision_model.model.layers.31.self_attn.v_proj",
+      "vision_model.model.layers.31.self_attn.o_proj",
+      "vision_model.model.layers.31.mlp.fc1",
+      "vision_model.model.layers.31.mlp.fc2",
+      "vision_model.model.layers.32.self_attn.q_proj",
+      "vision_model.model.layers.32.self_attn.k_proj",
+      "vision_model.model.layers.32.self_attn.v_proj",
+      "vision_model.model.layers.32.self_attn.o_proj",
+      "vision_model.model.layers.32.mlp.fc1",
+      "vision_model.model.layers.32.mlp.fc2",
+      "vision_model.model.layers.33.self_attn.q_proj",
+      "vision_model.model.layers.33.self_attn.k_proj",
+      "vision_model.model.layers.33.self_attn.v_proj",
+      "vision_model.model.layers.33.self_attn.o_proj",
+      "vision_model.model.layers.33.mlp.fc1",
+      "vision_model.model.layers.33.mlp.fc2",
+      "vision_model.vision_adapter.mlp.fc1",
+      "vision_model.vision_adapter.mlp.fc2",
+      "multi_modal_projector.linear_1",
+      "language_model.model.layers.0.self_attn.q_proj",
+      "language_model.model.layers.0.self_attn.k_proj",
+      "language_model.model.layers.0.self_attn.v_proj",
+      "language_model.model.layers.0.self_attn.o_proj",
+      "language_model.model.layers.1.self_attn.q_proj",
+      "language_model.model.layers.1.self_attn.k_proj",
+      "language_model.model.layers.1.self_attn.v_proj",
+      "language_model.model.layers.1.self_attn.o_proj",
+      "language_model.model.layers.2.self_attn.q_proj",
+      "language_model.model.layers.2.self_attn.k_proj",
+      "language_model.model.layers.2.self_attn.v_proj",
+      "language_model.model.layers.2.self_attn.o_proj",
+      "language_model.model.layers.3.self_attn.q_proj",
+      "language_model.model.layers.3.self_attn.k_proj",
+      "language_model.model.layers.3.self_attn.v_proj",
+      "language_model.model.layers.3.self_attn.o_proj",
+      "language_model.model.layers.4.self_attn.q_proj",
+      "language_model.model.layers.4.self_attn.k_proj",
+      "language_model.model.layers.4.self_attn.v_proj",
+      "language_model.model.layers.4.self_attn.o_proj",
+      "language_model.model.layers.5.self_attn.q_proj",
+      "language_model.model.layers.5.self_attn.k_proj",
+      "language_model.model.layers.5.self_attn.v_proj",
+      "language_model.model.layers.5.self_attn.o_proj",
+      "language_model.model.layers.6.self_attn.q_proj",
+      "language_model.model.layers.6.self_attn.k_proj",
+      "language_model.model.layers.6.self_attn.v_proj",
+      "language_model.model.layers.6.self_attn.o_proj",
+      "language_model.model.layers.7.self_attn.q_proj",
+      "language_model.model.layers.7.self_attn.k_proj",
+      "language_model.model.layers.7.self_attn.v_proj",
+      "language_model.model.layers.7.self_attn.o_proj",
+      "language_model.model.layers.8.self_attn.q_proj",
+      "language_model.model.layers.8.self_attn.k_proj",
+      "language_model.model.layers.8.self_attn.v_proj",
+      "language_model.model.layers.8.self_attn.o_proj",
+      "language_model.model.layers.9.self_attn.q_proj",
+      "language_model.model.layers.9.self_attn.k_proj",
+      "language_model.model.layers.9.self_attn.v_proj",
+      "language_model.model.layers.9.self_attn.o_proj",
+      "language_model.model.layers.10.self_attn.q_proj",
+      "language_model.model.layers.10.self_attn.k_proj",
+      "language_model.model.layers.10.self_attn.v_proj",
+      "language_model.model.layers.10.self_attn.o_proj",
+      "language_model.model.layers.11.self_attn.q_proj",
+      "language_model.model.layers.11.self_attn.k_proj",
+      "language_model.model.layers.11.self_attn.v_proj",
+      "language_model.model.layers.11.self_attn.o_proj",
+      "language_model.model.layers.12.self_attn.q_proj",
+      "language_model.model.layers.12.self_attn.k_proj",
+      "language_model.model.layers.12.self_attn.v_proj",
+      "language_model.model.layers.12.self_attn.o_proj",
+      "language_model.model.layers.13.self_attn.q_proj",
+      "language_model.model.layers.13.self_attn.k_proj",
+      "language_model.model.layers.13.self_attn.v_proj",
+      "language_model.model.layers.13.self_attn.o_proj",
+      "language_model.model.layers.14.self_attn.q_proj",
+      "language_model.model.layers.14.self_attn.k_proj",
+      "language_model.model.layers.14.self_attn.v_proj",
+      "language_model.model.layers.14.self_attn.o_proj",
+      "language_model.model.layers.15.self_attn.q_proj",
+      "language_model.model.layers.15.self_attn.k_proj",
+      "language_model.model.layers.15.self_attn.v_proj",
+      "language_model.model.layers.15.self_attn.o_proj",
+      "language_model.model.layers.16.self_attn.q_proj",
+      "language_model.model.layers.16.self_attn.k_proj",
+      "language_model.model.layers.16.self_attn.v_proj",
+      "language_model.model.layers.16.self_attn.o_proj",
+      "language_model.model.layers.17.self_attn.q_proj",
+      "language_model.model.layers.17.self_attn.k_proj",
+      "language_model.model.layers.17.self_attn.v_proj",
+      "language_model.model.layers.17.self_attn.o_proj",
+      "language_model.model.layers.18.self_attn.q_proj",
+      "language_model.model.layers.18.self_attn.k_proj",
+      "language_model.model.layers.18.self_attn.v_proj",
+      "language_model.model.layers.18.self_attn.o_proj",
+      "language_model.model.layers.19.self_attn.q_proj",
+      "language_model.model.layers.19.self_attn.k_proj",
+      "language_model.model.layers.19.self_attn.v_proj",
+      "language_model.model.layers.19.self_attn.o_proj",
+      "language_model.model.layers.20.self_attn.q_proj",
+      "language_model.model.layers.20.self_attn.k_proj",
+      "language_model.model.layers.20.self_attn.v_proj",
+      "language_model.model.layers.20.self_attn.o_proj",
+      "language_model.model.layers.21.self_attn.q_proj",
+      "language_model.model.layers.21.self_attn.k_proj",
+      "language_model.model.layers.21.self_attn.v_proj",
+      "language_model.model.layers.21.self_attn.o_proj",
+      "language_model.model.layers.22.self_attn.q_proj",
+      "language_model.model.layers.22.self_attn.k_proj",
+      "language_model.model.layers.22.self_attn.v_proj",
+      "language_model.model.layers.22.self_attn.o_proj",
+      "language_model.model.layers.23.self_attn.q_proj",
+      "language_model.model.layers.23.self_attn.k_proj",
+      "language_model.model.layers.23.self_attn.v_proj",
+      "language_model.model.layers.23.self_attn.o_proj",
+      "language_model.model.layers.24.self_attn.q_proj",
+      "language_model.model.layers.24.self_attn.k_proj",
+      "language_model.model.layers.24.self_attn.v_proj",
+      "language_model.model.layers.24.self_attn.o_proj",
+      "language_model.model.layers.25.self_attn.q_proj",
+      "language_model.model.layers.25.self_attn.k_proj",
+      "language_model.model.layers.25.self_attn.v_proj",
+      "language_model.model.layers.25.self_attn.o_proj",
+      "language_model.model.layers.26.self_attn.q_proj",
+      "language_model.model.layers.26.self_attn.k_proj",
+      "language_model.model.layers.26.self_attn.v_proj",
+      "language_model.model.layers.26.self_attn.o_proj",
+      "language_model.model.layers.27.self_attn.q_proj",
+      "language_model.model.layers.27.self_attn.k_proj",
+      "language_model.model.layers.27.self_attn.v_proj",
+      "language_model.model.layers.27.self_attn.o_proj",
+      "language_model.model.layers.28.self_attn.q_proj",
+      "language_model.model.layers.28.self_attn.k_proj",
+      "language_model.model.layers.28.self_attn.v_proj",
+      "language_model.model.layers.28.self_attn.o_proj",
+      "language_model.model.layers.29.self_attn.q_proj",
+      "language_model.model.layers.29.self_attn.k_proj",
+      "language_model.model.layers.29.self_attn.v_proj",
+      "language_model.model.layers.29.self_attn.o_proj",
+      "language_model.model.layers.30.self_attn.q_proj",
+      "language_model.model.layers.30.self_attn.k_proj",
+      "language_model.model.layers.30.self_attn.v_proj",
+      "language_model.model.layers.30.self_attn.o_proj",
+      "language_model.model.layers.31.self_attn.q_proj",
+      "language_model.model.layers.31.self_attn.k_proj",
+      "language_model.model.layers.31.self_attn.v_proj",
+      "language_model.model.layers.31.self_attn.o_proj",
+      "language_model.model.layers.32.self_attn.q_proj",
+      "language_model.model.layers.32.self_attn.k_proj",
+      "language_model.model.layers.32.self_attn.v_proj",
+      "language_model.model.layers.32.self_attn.o_proj",
+      "language_model.model.layers.33.self_attn.q_proj",
+      "language_model.model.layers.33.self_attn.k_proj",
+      "language_model.model.layers.33.self_attn.v_proj",
+      "language_model.model.layers.33.self_attn.o_proj",
+      "language_model.model.layers.34.self_attn.q_proj",
+      "language_model.model.layers.34.self_attn.k_proj",
+      "language_model.model.layers.34.self_attn.v_proj",
+      "language_model.model.layers.34.self_attn.o_proj",
+      "language_model.model.layers.35.self_attn.q_proj",
+      "language_model.model.layers.35.self_attn.k_proj",
+      "language_model.model.layers.35.self_attn.v_proj",
+      "language_model.model.layers.35.self_attn.o_proj",
+      "language_model.model.layers.36.self_attn.q_proj",
+      "language_model.model.layers.36.self_attn.k_proj",
+      "language_model.model.layers.36.self_attn.v_proj",
+      "language_model.model.layers.36.self_attn.o_proj",
+      "language_model.model.layers.37.self_attn.q_proj",
+      "language_model.model.layers.37.self_attn.k_proj",
+      "language_model.model.layers.37.self_attn.v_proj",
+      "language_model.model.layers.37.self_attn.o_proj",
+      "language_model.model.layers.38.self_attn.q_proj",
+      "language_model.model.layers.38.self_attn.k_proj",
+      "language_model.model.layers.38.self_attn.v_proj",
+      "language_model.model.layers.38.self_attn.o_proj",
+      "language_model.model.layers.39.self_attn.q_proj",
+      "language_model.model.layers.39.self_attn.k_proj",
+      "language_model.model.layers.39.self_attn.v_proj",
+      "language_model.model.layers.39.self_attn.o_proj",
+      "language_model.model.layers.40.self_attn.q_proj",
+      "language_model.model.layers.40.self_attn.k_proj",
+      "language_model.model.layers.40.self_attn.v_proj",
+      "language_model.model.layers.40.self_attn.o_proj",
+      "language_model.model.layers.41.self_attn.q_proj",
+      "language_model.model.layers.41.self_attn.k_proj",
+      "language_model.model.layers.41.self_attn.v_proj",
+      "language_model.model.layers.41.self_attn.o_proj",
+      "language_model.model.layers.42.self_attn.q_proj",
+      "language_model.model.layers.42.self_attn.k_proj",
+      "language_model.model.layers.42.self_attn.v_proj",
+      "language_model.model.layers.42.self_attn.o_proj",
+      "language_model.model.layers.43.self_attn.q_proj",
+      "language_model.model.layers.43.self_attn.k_proj",
+      "language_model.model.layers.43.self_attn.v_proj",
+      "language_model.model.layers.43.self_attn.o_proj",
+      "language_model.model.layers.44.self_attn.q_proj",
+      "language_model.model.layers.44.self_attn.k_proj",
+      "language_model.model.layers.44.self_attn.v_proj",
+      "language_model.model.layers.44.self_attn.o_proj",
+      "language_model.model.layers.45.self_attn.q_proj",
+      "language_model.model.layers.45.self_attn.k_proj",
+      "language_model.model.layers.45.self_attn.v_proj",
+      "language_model.model.layers.45.self_attn.o_proj",
+      "language_model.model.layers.46.self_attn.q_proj",
+      "language_model.model.layers.46.self_attn.k_proj",
+      "language_model.model.layers.46.self_attn.v_proj",
+      "language_model.model.layers.46.self_attn.o_proj",
+      "language_model.model.layers.47.self_attn.q_proj",
+      "language_model.model.layers.47.self_attn.k_proj",
+      "language_model.model.layers.47.self_attn.v_proj",
+      "language_model.model.layers.47.self_attn.o_proj",
+      "language_model.lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed"
+  },
+  "text_config": {
+    "attention_bias": false,
+    "attention_chunk_size": 8192,
+    "attention_dropout": 0.0,
+    "attn_scale": 0.1,
+    "attn_temperature_tuning": true,
+    "bos_token_id": 200000,
+    "eos_token_id": [
+      200001,
+      200007,
+      200008
+    ],
+    "floor_scale": 8192,
+    "for_llm_compressor": false,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 5120,
+    "initializer_range": 0.02,
+    "interleave_moe_layer_step": 1,
+    "intermediate_size": 8192,
+    "intermediate_size_mlp": 16384,
+    "layer_types": [
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "chunked_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 262144,
+    "model_type": "llama4_text",
+    "moe_layers": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      16,
+      17,
+      18,
+      19,
+      20,
+      21,
+      22,
+      23,
+      24,
+      25,
+      26,
+      27,
+      28,
+      29,
+      30,
+      31,
+      32,
+      33,
+      34,
+      35,
+      36,
+      37,
+      38,
+      39,
+      40,
+      41,
+      42,
+      43,
+      44,
+      45,
+      46,
+      47
+    ],
+    "no_rope_layers": [
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0
+    ],
+    "num_attention_heads": 40,
+    "num_experts_per_tok": 1,
+    "num_hidden_layers": 48,
+    "num_key_value_heads": 8,
+    "num_local_experts": 16,
+    "output_router_logits": false,
+    "pad_token_id": 200018,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+      "factor": 16.0,
+      "high_freq_factor": 1.0,
+      "low_freq_factor": 1.0,
+      "original_max_position_embeddings": 8192,
+      "rope_type": "llama3"
+    },
+    "rope_theta": 500000.0,
+    "router_aux_loss_coef": 0.001,
+    "router_jitter_noise": 0.0,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_qk_norm": true,
+    "vocab_size": 201135
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.1",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1408,
+    "image_size": 336,
+    "initializer_range": 0.02,
+    "intermediate_size": 5632,
+    "model_type": "llama4_vision_model",
+    "multi_modal_projector_bias": false,
+    "norm_eps": 1e-05,
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 34,
+    "patch_size": 14,
+    "pixel_shuffle_ratio": 0.5,
+    "projector_dropout": 0.0,
+    "projector_input_dim": 4096,
+    "projector_output_dim": 4096,
+    "rope_theta": 10000,
+    "vision_feature_layer": -1,
+    "vision_feature_select_strategy": "default",
+    "vision_output_dim": 4096
+  }
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 200000,
+  "eos_token_id": [
+    200001,
+    200007,
+    200008
+  ],
+  "pad_token_id": 200018,
+  "transformers_version": "4.54.1"
+}

get_size.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import os, json
+# 1) Load the existing index JSON into `idx`. NOTE(review): `idx` is never read again in this script; confirm whether it was meant to be updated with the computed total.
+index_path = "model.safetensors.index.json"  # relative path, so the script must run from the model directory
+with open(index_path, "r") as f:
+    idx = json.load(f)
+# 2) Collect the shard files: entries of the current directory named model-*.safetensors, sorted by name.
+shards = sorted(f for f in os.listdir(".")
+                if f.startswith("model-") and f.endswith(".safetensors"))
+# 3) Add up the size in bytes of every shard and print the shard count and the total.
+total_size = sum(os.path.getsize(s) for s in shards)
+print(f"Found {len(shards)} shards, total_size = {total_size} bytes")

images/README ADDED Viewed

	@@ -0,0 +1 @@


1	+ Directory for images associated with the model.

images/cogito-v2-109b-benchmarks.png ADDED Viewed

Git LFS Details

SHA256: b55cb70cf26c01b4bfbe2f7b006bae249cacb0bb318c9a9b4f157b6ce7373c32
Pointer size: 131 Bytes
Size of remote file: 279 kB

images/deep-cogito-logo.png ADDED Viewed

model-00001-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55183a0814b59a545bae3fca480e45b37c6ebdb5e7273fbb8d5bf61cc29fcf7b
+size 4989289528

model-00002-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9431faffe2702aa668774db9f637dd2bcf502805b0c5ccdf3104b2f49cc027ce
+size 4959308680

model-00003-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b55e715913966c598035e1b5fbfd334cfedd6c3bb9b55bbce9931c1dcc11e79f
+size 4989434120

model-00004-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55737b49d896dd40ede5c567c5355095dc81ce9414b6e5127b908f1ddf2e73cd
+size 4990090304

model-00005-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfa2f3e5c8b10553bfaa00b87d408293f56adab075707bb4e130f88414f6f022
+size 4980916048

model-00006-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f43b6af23fcb97c89dfa33a253069d74fdcf42f9c0428c84055eef705d3f8e2
+size 4980916048

model-00007-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e3a625c5ac2631dfaf5b705e0ff88c12b0c101828b17961268111a67ff5be22
+size 4980916048

model-00008-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2732f629c6b7bd9f23143c87977a8b89ee8b96cece1c2a67fc06242c2c241feb
+size 4980916048

model-00009-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff473bccdd6d79041ceb42a35dfa6451a523978a0be024b4507c0a8a25be1d07
+size 4980916048

model-00010-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ce66771aa3547f51cc64acf5169eab90aa0f060bee745594e0c8669d812365f
+size 4980916048

model-00011-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd97df8999bf7d2de510860eefd7f38666146f2266bd7eab2474e9a22266a77
+size 4980916048

model-00012-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54cd51307229a07b9b6b6514746783d8ca2a8a14217a3eec7a7a593fcd7dd0dc
+size 4980916048

model-00013-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:120b86b0bad7c9ae7840b302223a26525cdeeeba3ff098a2215ffa43ff0d7d93
+size 3020522640

model-00014-of-00014.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5880f575e94ea288fbc0d9014548bc951e13678f2921194a629c217d0112752e
+size 2059622544

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Llama4ImageProcessorFast",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "max_patches": 16,
+  "processor_class": "Llama4Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "resize_to_max_canvas": false,
+  "return_tensors": null,
+  "size": {
+    "height": 336,
+    "width": 336
+  }
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "fake_image_token": "<|image|>",
+  "image_token": "<|image|>",
+  "patch_size": 14,
+  "processor_class": "Llama4Processor"
+}

recipe.yaml ADDED Viewed

	@@ -0,0 +1,11 @@

+default_stage:
+  default_modifiers:
+    GPTQModifier:  # the only modifier configured in this stage
+      targets: [Linear]  # NOTE(review): presumably a module class name selecting which layers are quantized; confirm
+      ignore: ['re:.*lm_head', 're:.*self_attn', 're:.*router', 're:vision_model.*', 're:multi_modal_projector.*',
+        Llama4TextAttention]  # NOTE(review): 're:'-prefixed entries look like regex patterns and the last entry is a bare name; confirm the matching rules
+      scheme: W4A16  # NOTE(review): presumably 4-bit weights with 16-bit activations; confirm against the tool's scheme names
+      sequential_update: true
+      block_size: 128
+      dampening_frac: 0.01
+      offload_hessians: false

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|finetune_right_pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:172c9eb4beafc72601690da3ccfcede5c2e6806a8d5ec1fca33e22acea8023a4
+size 27948578

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff