Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit
Community Article, published March 24, 2025
Converting Llama-3.1-Nemotron-Nano-8B-v1 to bnb 4-bit
tobit4
System: Ubuntu 22.04
Install Software
pip install transformers bitsandbytes accelerate
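Optionally, confirm the packages import and print their versions before converting (a minimal sanity check):

# Optional sanity check: confirm the required packages import and show their versions
import transformers, bitsandbytes, accelerate
print(transformers.__version__, bitsandbytes.__version__, accelerate.__version__)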
Convert to bnb 4-bit
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import bitsandbytes as bnb  # not used directly below; verifies that bitsandbytes is installed
# Define the model name and path
model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
# Configure quantization parameters
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4",              # Use the "nf4" quantization type
    bnb_4bit_use_double_quant=True,         # Enable double quantization
    llm_int8_skip_modules=[                 # Modules to keep unquantized; names not present in the model are simply ignored
        "lm_head",
        "multi_modal_projector",
        "merger",
        "modality_projection",
        "model.layers.1.mlp"
    ],
)
# Load the pre-trained model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"  # Automatically allocate devices
)
# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Save the quantized model and tokenizer to a specified directory
model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
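As a rough check that quantization worked, the in-memory footprint of the loaded model can be printed. get_memory_footprint() is a standard transformers model method; the exact number will vary by environment:

# Optional: report the approximate in-memory size of the quantized model (in GiB).
# A 4-bit NF4 copy of an 8B model should be far smaller than the ~16 GB bf16 original.
print(f"Quantized footprint: {model.get_memory_footprint() / 1024**3:.2f} GiB")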
Chat Test
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
# Configure quantization parameters (a checkpoint saved with save_pretrained usually
# already stores its quantization config, so this may be redundant)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4",              # Use the "nf4" quantization type
    bnb_4bit_use_double_quant=True,         # Enable double quantization
)
# Define the model name and path for the quantized model
model_name = "./Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit"
# Load the quantized model with the specified configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"  # Automatically allocate devices
)
# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Determine the device where the model is located
device = model.device
# Prepare input text and move it to the same device as the model
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt").to(device)
# Perform inference
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=50)  # max_length counts prompt tokens; use max_new_tokens to bound only new tokens
# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
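Since this is billed as a chat test, the prompt can also be wrapped with the model's chat template rather than sent as raw text. The sketch below assumes the saved tokenizer ships a chat template; the message content is only an example:

# Optional: run the same quantized model through the chat template (if the tokenizer provides one)
messages = [{"role": "user", "content": "Tell me a short story."}]
chat_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant-turn marker so the model responds
    return_tensors="pt"
).to(device)
with torch.no_grad():
    chat_outputs = model.generate(chat_inputs, max_new_tokens=100)
print(tokenizer.decode(chat_outputs[0], skip_special_tokens=True))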