This is an HQQ-quantized version (4-bit, group-size=64) of the gemma-3-12b-it model.
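"4-bit, group-size=64" means each weight is stored in 4 bits and every group of 64 weights shares its own scale/zero-point. As a point of reference only (this is a minimal sketch, not the exact recipe used to produce this checkpoint), an equivalent on-the-fly HQQ setting can be expressed with the `HqqConfig` class in `transformers`:

```python
import torch
from transformers import Gemma3ForConditionalGeneration, HqqConfig

# 4-bit weights, 64 weights per quantization group (matches this repo's settings).
quant_config = HqqConfig(nbits=4, group_size=64)

# Assumption: quantizing the official base checkpoint while loading;
# this repo instead ships the weights already quantized.
model = Gemma3ForConditionalGeneration.from_pretrained(
    "google/gemma-3-12b-it",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    quantization_config=quant_config,
)
```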
## Usage

```python
import torch

backend       = "gemlite"
compute_dtype = torch.bfloat16
cache_dir     = None
model_id      = 'mobiuslabsgmbh/gemma-3-12b-it_4bitgs64_bfp16_hqq_hf'

# Load model
from transformers import Gemma3ForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    attn_implementation="sdpa",
    cache_dir=cache_dir,
    device_map="cuda",
)

# Optimize
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(model.language_model, backend=backend, verbose=True)

############################################################################
# Inference
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    },
]

inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, dtype=compute_dtype)

input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=128, do_sample=False)[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)
```
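Note that the snippet above needs the `hqq` package (for the quantized layers and `prepare_for_inference`), and the `gemlite` backend additionally requires the `gemlite` kernel package and a CUDA GPU.

The same loaded model and processor also handle text-only prompts. A minimal sketch (the question string is just an illustrative placeholder):

```python
# Text-only chat with the already-loaded model/processor (no image entry needed).
messages = [
    {"role": "user", "content": [{"type": "text", "text": "Explain what 4-bit quantization does to model memory usage."}]},
]

inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, dtype=compute_dtype)

with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Strip the prompt tokens and decode only the newly generated text.
print(processor.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```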