---
library_name: transformers
pipeline_tag: image-text-to-text
inference: true
widget:
  - text: Hello!
    example_title: Hello world
    group: Python
base_model:
- google/gemma-3n-E4B-it
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it).

| Model ID                                                                              | Notes                           |
| ------------------------------------------------------------------------------------- | ------------------------------- |
| [tiny-random/gemma-3n](https://huggingface.co/tiny-random/gemma-3n) | hidden size is 32 |
| [tiny-random/gemma-3n-dim4](https://huggingface.co/tiny-random/gemma-3n-dim4)    | hidden size is 4; potentially not supported in paged attention kernels|

### Example usage:

```python
import torch

from transformers import pipeline

model_id = "tiny-random/gemma-3n"
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    device=0,
    torch_dtype=torch.bfloat16
)

# temporary patch for audio tower
from accelerate.hooks import ModelHook, add_hook_to_module

class EnsureDtype(ModelHook):
    def pre_forward(self, module, *args, **kwargs):
        args = list(args)
        args[0] = args[0].to(module.dtype)
        return super().pre_forward(module, *args, **kwargs)
add_hook_to_module(pipe.model.audio_tower, EnsureDtype())

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful assistant."}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            # audio is buggy for now: bf16 x fp32
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
            {"type": "text", "text": "Which image is cuter?"},
        ]
    },
]
result = pipe(messages, min_new_tokens=512, max_new_tokens=512, do_sample=True)
print(result)
```

### Codes to create this repo:

```python
import json
from pathlib import Path

import torch

import accelerate
from huggingface_hub import file_exists, hf_hub_download
from timm.models.mobilenetv5 import decode_arch_def
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Gemma3nForConditionalGeneration,
    GenerationConfig,
    set_seed,
)

source_model_id = "google/gemma-3n-E4B-it"
save_folder = "/tmp/tiny-random/gemma-3n"

processor = AutoProcessor.from_pretrained(source_model_id)
processor.save_pretrained(save_folder)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

config_json['audio_config'].update({
    "conf_num_attention_heads": 2,
    "conf_num_hidden_layers": 2,
    "hidden_size": 64,
})
config_json['text_config'].update({
    "activation_sparsity_pattern": [0.95, 0.95, 0.0, 0.0],
    "head_dim": 32,  # required by vllm
    "hidden_size": 32,
    "hidden_size_per_layer_input": 2,
    "intermediate_size": 64,
    "laurel_rank": 8,
    "layer_types": ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention'],
    "num_attention_heads": 1,
    "num_hidden_layers": 4,
    "num_key_value_heads": 1,
    "num_kv_shared_layers": 2,
    "sliding_window": 512,
})
block_args = decode_arch_def(
    [
        # Stage 0: 128x128 in
        [
            'er_r1_k3_s2_e4_c32',
            'er_r1_k3_s1_e4_c32',
        ],
        # Stage 1: 256x256 in
        [
            'uir_r1_a3_k5_s2_e6_c32',
            'uir_r1_a5_k0_s1_e4_c32',
            'uir_r1_a3_k0_s1_e4_c32',
        ],
        # Stage 2: 640x640 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "uir_r1_a0_k0_s1_e1_c32",
            "mqa_r1_k3_h2_v2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
        # Stage 3: 1280x1280 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "mqa_r1_k3_h2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
    ]
)
config_json['vision_config'].update({
    "hidden_size": 2048,  # hard-coded in timm
    "model_args": {
        "block_args": block_args,
    }
})
config_json['tie_word_embeddings'] = True

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)

torch.set_default_dtype(torch.bfloat16)
model = Gemma3nForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()
all_numels = 0
for name, p in sorted(model.named_parameters()):
    all_numels += p.numel()
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape, f'{p.numel() / all_numels * 100: .4f}%')
model.save_pretrained(save_folder)
```

### Printing the model:

```text
Gemma3nForConditionalGeneration(
  (model): Gemma3nModel(
    (vision_tower): TimmWrapperModel(
      (timm_model): MobileNetV5Encoder(
        (conv_stem): ConvNormAct(
          (conv): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (bn): RmsNormAct2d(
            (drop): Identity()
            (act): GELU(approximate='none')
          )
        )
        (blocks): Sequential(
          (0): Sequential(
            (0): EdgeResidual(
              (conv_exp): Conv2dSame(64, 256, kernel_size=(3, 3), stride=(2, 2), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
            (1): EdgeResidual(
              (conv_exp): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
          )
          (1): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (2): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (3): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (3): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
        )
        (msfa): MobileNetV5MultiScaleFusionAdapter(
          (ffn): UniversalInvertedResidual(
            (dw_start): Identity()
            (pw_exp): ConvNormAct(
              (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
            )
            (dw_mid): Identity()
            (se): Identity()
            (pw_proj): ConvNormAct(
              (conv): Conv2d(128, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
            )
            (dw_end): Identity()
            (layer_scale): Identity()
            (drop_path): Identity()
          )
          (norm): RmsNorm2d()
        )
      )
    )
    (language_model): Gemma3nTextModel(
      (embed_tokens): Gemma3nTextScaledWordEmbedding(262400, 32, padding_idx=0)
      (layers): ModuleList(
        (0-3): 4 x Gemma3nTextDecoderLayer(
          (self_attn): Gemma3nTextAttention(
            (q_proj): Linear(in_features=32, out_features=32, bias=False)
            (k_proj): Linear(in_features=32, out_features=32, bias=False)
            (v_proj): Linear(in_features=32, out_features=32, bias=False)
            (o_proj): Linear(in_features=32, out_features=32, bias=False)
            (q_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (k_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (v_norm): Gemma3nRMSNorm((), eps=1e-06)
          )
          (mlp): Gemma3nTextMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): PytorchGELUTanh()
          )
          (input_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_attention_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (pre_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (act_fn): PytorchGELUTanh()
          (altup): Gemma3nTextAltUp(
            (correction_coefs): Linear(in_features=4, out_features=4, bias=False)
            (prediction_coefs): Linear(in_features=4, out_features=16, bias=False)
            (modality_router): Linear(in_features=32, out_features=4, bias=False)
            (router_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (laurel): Gemma3nTextLaurelBlock(
            (linear_left): Linear(in_features=32, out_features=8, bias=False)
            (linear_right): Linear(in_features=8, out_features=32, bias=False)
            (post_laurel_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (per_layer_input_gate): Linear(in_features=32, out_features=2, bias=False)
          (per_layer_projection): Linear(in_features=2, out_features=32, bias=False)
          (post_per_layer_input_norm): Gemma3nRMSNorm((32,), eps=1e-06)
        )
      )
      (norm): Gemma3nRMSNorm((32,), eps=1e-06)
      (rotary_emb): Gemma3nTextRotaryEmbedding()
      (rotary_emb_local): Gemma3nTextRotaryEmbedding()
      (embed_tokens_per_layer): Gemma3nTextScaledWordEmbedding(262144, 8, padding_idx=0)
      (per_layer_model_projection): Linear(in_features=32, out_features=8, bias=False)
      (per_layer_projection_norm): Gemma3nRMSNorm((2,), eps=1e-06)
      (altup_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
      (altup_unembed_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
    )
    (audio_tower): Gemma3nAudioEncoder(
      (subsample_conv_projection): Gemma3nAudioSubSampleConvProjection(
        (conv_0): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (conv_1): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
      )
      (conformer): ModuleList(
        (0-1): 2 x Gemma3nAudioConformerBlock(
          (ffw_layer_start): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (attention): Gemma3nAudioConformerAttention(
            (pre_attn_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (attn): Gemma3nAudioAttention(
              (relative_position_embedding): Gemma3nAudioRelativePositionEmbedding(
                (pos_proj): Linear(in_features=64, out_features=64, bias=False)
              )
              (q_proj): Linear(in_features=64, out_features=64, bias=False)
              (k_proj): Linear(in_features=64, out_features=64, bias=False)
              (v_proj): Linear(in_features=64, out_features=64, bias=False)
            )
            (post): Linear(in_features=64, out_features=64, bias=False)
            (post_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (lconv1d): Gemma3nAudioConformerLightConv1d(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_start): Linear(in_features=64, out_features=128, bias=False)
            (depthwise_conv1d): Conv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64, bias=False)
            (conv_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_end): Linear(in_features=64, out_features=64, bias=False)
          )
          (ffw_layer_end): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (norm): Gemma3nRMSNorm((64,), eps=1e-06)
        )
      )
    )
    (embed_vision): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 2048)
      (hard_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (embedding_projection): Linear(in_features=2048, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
    (embed_audio): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 64)
      (hard_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (embedding_projection): Linear(in_features=64, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
  )
  (lm_head): Linear(in_features=32, out_features=262400, bias=False)
)
```