--- library_name: transformers pipeline_tag: image-text-to-text inference: true widget: - text: Hello! example_title: Hello world group: Python base_model: - google/gemma-3n-E4B-it --- This tiny model is for debugging. It is randomly initialized with the config adapted from [google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it). | Model ID | Notes | | ------------------------------------------------------------------------------------- | ------------------------------- | | [tiny-random/gemma-3n](https://huggingface.co/tiny-random/gemma-3n) | hidden size is 32 | | [tiny-random/gemma-3n-dim4](https://huggingface.co/tiny-random/gemma-3n-dim4) | hidden size is 4; potentially not supported in paged attention kernels| ### Example usage: ```python import torch from transformers import pipeline model_id = "tiny-random/gemma-3n" pipe = pipeline( task="image-text-to-text", model=model_id, device=0, torch_dtype=torch.bfloat16 ) # temporary patch for audio tower from accelerate.hooks import ModelHook, add_hook_to_module class EnsureDtype(ModelHook): def pre_forward(self, module, *args, **kwargs): args = list(args) args[0] = args[0].to(module.dtype) return super().pre_forward(module, *args, **kwargs) add_hook_to_module(pipe.model.audio_tower, EnsureDtype()) messages = [ { "role": "system", "content": [ {"type": "text", "text": "You are a helpful assistant."} ] }, { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, # audio is buggy for now: bf16 x fp32 {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"}, {"type": "text", "text": "Which image is cuter?"}, ] }, ] result = pipe(messages, min_new_tokens=512, max_new_tokens=512, do_sample=True) print(result) ``` ### Codes to create this repo: ```python import json from pathlib import Path import torch import accelerate from huggingface_hub import file_exists, hf_hub_download from timm.models.mobilenetv5 import decode_arch_def from transformers import ( AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Gemma3nForConditionalGeneration, GenerationConfig, set_seed, ) source_model_id = "google/gemma-3n-E4B-it" save_folder = "/tmp/tiny-random/gemma-3n" processor = AutoProcessor.from_pretrained(source_model_id) processor.save_pretrained(save_folder) with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f: config_json = json.load(f) config_json['audio_config'].update({ "conf_num_attention_heads": 2, "conf_num_hidden_layers": 2, "hidden_size": 64, }) config_json['text_config'].update({ "activation_sparsity_pattern": [0.95, 0.95, 0.0, 0.0], "head_dim": 32, # required by vllm "hidden_size": 32, "hidden_size_per_layer_input": 2, "intermediate_size": 64, "laurel_rank": 8, "layer_types": ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention'], "num_attention_heads": 1, "num_hidden_layers": 4, "num_key_value_heads": 1, "num_kv_shared_layers": 2, "sliding_window": 512, }) block_args = decode_arch_def( [ # Stage 0: 128x128 in [ 'er_r1_k3_s2_e4_c32', 'er_r1_k3_s1_e4_c32', ], # Stage 1: 256x256 in [ 'uir_r1_a3_k5_s2_e6_c32', 'uir_r1_a5_k0_s1_e4_c32', 'uir_r1_a3_k0_s1_e4_c32', ], # Stage 2: 640x640 in [ "uir_r1_a5_k5_s2_e6_c32", "uir_r1_a0_k0_s1_e1_c32", "mqa_r1_k3_h2_v2_s1_d64_c32", "uir_r1_a0_k0_s1_e2_c32", ], # Stage 3: 1280x1280 in [ "uir_r1_a5_k5_s2_e6_c32", "mqa_r1_k3_h2_s1_d64_c32", "uir_r1_a0_k0_s1_e2_c32", ], ] ) config_json['vision_config'].update({ "hidden_size": 2048, # hard-coded in timm "model_args": { "block_args": block_args, } }) config_json['tie_word_embeddings'] = True with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f: json.dump(config_json, f, indent=2) config = AutoConfig.from_pretrained( save_folder, trust_remote_code=True, ) print(config) torch.set_default_dtype(torch.bfloat16) model = Gemma3nForConditionalGeneration(config) torch.set_default_dtype(torch.float32) if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'): model.generation_config = GenerationConfig.from_pretrained( source_model_id, trust_remote_code=True, ) set_seed(42) model = model.cpu() all_numels = 0 for name, p in sorted(model.named_parameters()): all_numels += p.numel() with torch.no_grad(): for name, p in sorted(model.named_parameters()): torch.nn.init.normal_(p, 0, 0.2) print(name, p.shape, f'{p.numel() / all_numels * 100: .4f}%') model.save_pretrained(save_folder) ``` ### Printing the model: ```text Gemma3nForConditionalGeneration( (model): Gemma3nModel( (vision_tower): TimmWrapperModel( (timm_model): MobileNetV5Encoder( (conv_stem): ConvNormAct( (conv): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (blocks): Sequential( (0): Sequential( (0): EdgeResidual( (conv_exp): Conv2dSame(64, 256, kernel_size=(3, 3), stride=(2, 2), bias=False) (bn1): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) (aa): Identity() (se): Identity() (conv_pwl): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn2): RmsNormAct2d( (drop): Identity() (act): Identity() ) (drop_path): Identity() ) (1): EdgeResidual( (conv_exp): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) (aa): Identity() (se): Identity() (conv_pwl): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn2): RmsNormAct2d( (drop): Identity() (act): Identity() ) (drop_path): Identity() ) ) (1): Sequential( (0): UniversalInvertedResidual( (dw_start): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (pw_exp): ConvNormAct( (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): ConvNormAct( (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) (1): UniversalInvertedResidual( (dw_start): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (pw_exp): ConvNormAct( (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) (2): UniversalInvertedResidual( (dw_start): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (pw_exp): ConvNormAct( (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) ) (2): Sequential( (0): UniversalInvertedResidual( (dw_start): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (pw_exp): ConvNormAct( (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): ConvNormAct( (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) (1): UniversalInvertedResidual( (dw_start): Identity() (pw_exp): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) (2): MobileAttention( (norm): RmsNormAct2d( (drop): Identity() (act): Identity() ) (attn): MultiQueryAttention2d( (query): Sequential( (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (key): Sequential( (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False) (norm): RmsNorm2d() (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (value): Sequential( (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False) (norm): RmsNorm2d() (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (attn_drop): Dropout(p=0.0, inplace=False) (output): Sequential( (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (drop): Dropout(p=0.0, inplace=False) ) ) (layer_scale): LayerScale2d() (drop_path): Identity() ) (3): UniversalInvertedResidual( (dw_start): Identity() (pw_exp): ConvNormAct( (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) ) (3): Sequential( (0): UniversalInvertedResidual( (dw_start): ConvNormAct( (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (pw_exp): ConvNormAct( (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): ConvNormAct( (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) (1): MobileAttention( (norm): RmsNormAct2d( (drop): Identity() (act): Identity() ) (attn): MultiQueryAttention2d( (query): Sequential( (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (key): Sequential( (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (value): Sequential( (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) ) (attn_drop): Dropout(p=0.0, inplace=False) (output): Sequential( (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (drop): Dropout(p=0.0, inplace=False) ) ) (layer_scale): LayerScale2d() (drop_path): Identity() ) (2): UniversalInvertedResidual( (dw_start): Identity() (pw_exp): ConvNormAct( (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): LayerScale2d() (drop_path): Identity() ) ) ) (msfa): MobileNetV5MultiScaleFusionAdapter( (ffn): UniversalInvertedResidual( (dw_start): Identity() (pw_exp): ConvNormAct( (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): GELU(approximate='none') ) ) (dw_mid): Identity() (se): Identity() (pw_proj): ConvNormAct( (conv): Conv2d(128, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn): RmsNormAct2d( (drop): Identity() (act): Identity() ) ) (dw_end): Identity() (layer_scale): Identity() (drop_path): Identity() ) (norm): RmsNorm2d() ) ) ) (language_model): Gemma3nTextModel( (embed_tokens): Gemma3nTextScaledWordEmbedding(262400, 32, padding_idx=0) (layers): ModuleList( (0-3): 4 x Gemma3nTextDecoderLayer( (self_attn): Gemma3nTextAttention( (q_proj): Linear(in_features=32, out_features=32, bias=False) (k_proj): Linear(in_features=32, out_features=32, bias=False) (v_proj): Linear(in_features=32, out_features=32, bias=False) (o_proj): Linear(in_features=32, out_features=32, bias=False) (q_norm): Gemma3nRMSNorm((32,), eps=1e-06) (k_norm): Gemma3nRMSNorm((32,), eps=1e-06) (v_norm): Gemma3nRMSNorm((), eps=1e-06) ) (mlp): Gemma3nTextMLP( (gate_proj): Linear(in_features=32, out_features=64, bias=False) (up_proj): Linear(in_features=32, out_features=64, bias=False) (down_proj): Linear(in_features=64, out_features=32, bias=False) (act_fn): PytorchGELUTanh() ) (input_layernorm): Gemma3nRMSNorm((32,), eps=1e-06) (post_attention_layernorm): Gemma3nRMSNorm((32,), eps=1e-06) (pre_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06) (post_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06) (act_fn): PytorchGELUTanh() (altup): Gemma3nTextAltUp( (correction_coefs): Linear(in_features=4, out_features=4, bias=False) (prediction_coefs): Linear(in_features=4, out_features=16, bias=False) (modality_router): Linear(in_features=32, out_features=4, bias=False) (router_norm): Gemma3nRMSNorm((32,), eps=1e-06) ) (laurel): Gemma3nTextLaurelBlock( (linear_left): Linear(in_features=32, out_features=8, bias=False) (linear_right): Linear(in_features=8, out_features=32, bias=False) (post_laurel_norm): Gemma3nRMSNorm((32,), eps=1e-06) ) (per_layer_input_gate): Linear(in_features=32, out_features=2, bias=False) (per_layer_projection): Linear(in_features=2, out_features=32, bias=False) (post_per_layer_input_norm): Gemma3nRMSNorm((32,), eps=1e-06) ) ) (norm): Gemma3nRMSNorm((32,), eps=1e-06) (rotary_emb): Gemma3nTextRotaryEmbedding() (rotary_emb_local): Gemma3nTextRotaryEmbedding() (embed_tokens_per_layer): Gemma3nTextScaledWordEmbedding(262144, 8, padding_idx=0) (per_layer_model_projection): Linear(in_features=32, out_features=8, bias=False) (per_layer_projection_norm): Gemma3nRMSNorm((2,), eps=1e-06) (altup_projections): ModuleList( (0-2): 3 x Linear(in_features=32, out_features=32, bias=False) ) (altup_unembed_projections): ModuleList( (0-2): 3 x Linear(in_features=32, out_features=32, bias=False) ) ) (audio_tower): Gemma3nAudioEncoder( (subsample_conv_projection): Gemma3nAudioSubSampleConvProjection( (conv_0): Gemma3nAudioSSCPConvBlock( (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), bias=False) (norm): Gemma3nAudioCumulativeGroupNorm() (activation): ReLU() ) (conv_1): Gemma3nAudioSSCPConvBlock( (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), bias=False) (norm): Gemma3nAudioCumulativeGroupNorm() (activation): ReLU() ) (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False) ) (conformer): ModuleList( (0-1): 2 x Gemma3nAudioConformerBlock( (ffw_layer_start): Gemma3nAudioConformerFeedForward( (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06) (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False) (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False) (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06) ) (attention): Gemma3nAudioConformerAttention( (pre_attn_norm): Gemma3nRMSNorm((64,), eps=1e-06) (attn): Gemma3nAudioAttention( (relative_position_embedding): Gemma3nAudioRelativePositionEmbedding( (pos_proj): Linear(in_features=64, out_features=64, bias=False) ) (q_proj): Linear(in_features=64, out_features=64, bias=False) (k_proj): Linear(in_features=64, out_features=64, bias=False) (v_proj): Linear(in_features=64, out_features=64, bias=False) ) (post): Linear(in_features=64, out_features=64, bias=False) (post_norm): Gemma3nRMSNorm((64,), eps=1e-06) ) (lconv1d): Gemma3nAudioConformerLightConv1d( (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06) (linear_start): Linear(in_features=64, out_features=128, bias=False) (depthwise_conv1d): Conv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64, bias=False) (conv_norm): Gemma3nRMSNorm((64,), eps=1e-06) (linear_end): Linear(in_features=64, out_features=64, bias=False) ) (ffw_layer_end): Gemma3nAudioConformerFeedForward( (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06) (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False) (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False) (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06) ) (norm): Gemma3nRMSNorm((64,), eps=1e-06) ) ) ) (embed_vision): Gemma3nMultimodalEmbedder( (embedding): Embedding(128, 2048) (hard_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06) (soft_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06) (embedding_projection): Linear(in_features=2048, out_features=32, bias=False) (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06) ) (embed_audio): Gemma3nMultimodalEmbedder( (embedding): Embedding(128, 64) (hard_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06) (soft_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06) (embedding_projection): Linear(in_features=64, out_features=32, bias=False) (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06) ) ) (lm_head): Linear(in_features=32, out_features=262400, bias=False) ) ```