This is an HQQ-quantized version (4-bit, group-size=64) of the gemma-3-12b-it model.
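For reference, a quantization like this can typically be reproduced with the HQQ integration in transformers. The sketch below is an assumption based on the stated settings (4-bit, group-size=64) and the public base checkpoint google/gemma-3-12b-it; it is not necessarily the exact script used to produce this repository.

import torch
from transformers import Gemma3ForConditionalGeneration, AutoProcessor, HqqConfig

base_model_id = 'google/gemma-3-12b-it'  #assumed base checkpoint

#4-bit weights with group-size 64, matching this repo's settings
quant_config = HqqConfig(nbits=4, group_size=64)

model = Gemma3ForConditionalGeneration.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
    device_map="cuda",
)
processor = AutoProcessor.from_pretrained(base_model_id)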

Usage

import torch
backend       = "gemlite"        #low-bit inference backend used by hqq's prepare_for_inference
compute_dtype = torch.bfloat16   #activation / compute dtype
cache_dir     = None
model_id      = 'mobiuslabsgmbh/gemma-3-12b-it_4bitgs64_bfp16_hqq_hf'

#Load model
from transformers import Gemma3ForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    attn_implementation="sdpa",
    cache_dir=cache_dir,
    device_map="cuda",
)

#Optimize: patch the quantized language-model layers to use the selected low-bit backend
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(model.language_model, backend=backend, verbose=True)


############################################################################
#Inference
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Describe this image in detail."}
        ]
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=compute_dtype)

#Prompt length, used to strip the prompt tokens from the generated sequence
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=128, do_sample=False)[0][input_len:]
    decoded    = processor.decode(generation, skip_special_tokens=True)

print(decoded)
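Note: the gemlite backend assumes the gemlite package is installed alongside hqq. If it is not available, the prepare_for_inference call can be skipped (or pointed at another backend supported by your hqq version); the quantized model still runs through the default HQQ layers, just without the fused low-bit kernels.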
