Qwen3-4B-Instruct-2507 - W8A8 Quantized

This is a W8A8 (8-bit weights and 8-bit activations) quantized version of Qwen/Qwen3-4B-Instruct-2507, created with LLM Compressor using a SmoothQuant + GPTQ recipe. The checkpoint is stored as safetensors with mixed BF16/INT8 tensors (4.41B parameters).

This model was quantized by itroot.

Quantization Recipe

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

model_id = "Qwen/Qwen3-4B-Instruct-2507"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
    offload_folder="./offload_tmp",
    # for 2x 3090s.
    max_memory={0: "22GB", 1: "22GB", "cpu": "64GB"},
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

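# Calibration data: SmoothQuant and GPTQ both compute their scales from
# sample activations, so a few hundred representative chat samples suffice.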
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

print("Loading and preprocessing calibration dataset...")
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)

def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }

ds = ds.map(preprocess)

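# Tokenize the rendered text without special tokens; the chat template
# above already produced the complete prompt string.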
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)
print("Dataset ready.")

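# SmoothQuant (strength 0.8) migrates activation outliers into the weights
# so activations quantize more cleanly; GPTQ then quantizes all Linear
# layers to the W8A8 scheme. lm_head is skipped to preserve output quality.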
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

output_dir = "./Qwen3-4B-Instruct-2507-W8A8"
print(f"Starting one-shot quantization. Output will be in '{output_dir}'")

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    output_dir=output_dir,
)
print("Quantization complete.")

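# oneshot() already wrote the quantized model to output_dir; saving again
# with save_compressed=True keeps the compressed-tensors INT8 layout on disk.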
SAVE_DIR = "Qwen3-4B-Instruct-2507-W8A8"
print(f"Saving compressed model and tokenizer to '{SAVE_DIR}'...")
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
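
Usage (vLLM)

The checkpoint is saved in the compressed-tensors format, which vLLM loads directly and runs with INT8 kernels. A minimal sketch; the sampling settings below are illustrative, not tuned recommendations:

from vllm import LLM, SamplingParams

# vLLM picks up the W8A8 compressed-tensors scheme from the checkpoint's
# quantization_config; no extra flags are needed.
llm = LLM(model="itroot/Qwen3-4B-Instruct-2507-W8A8")

sampling = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# chat() applies the model's chat template before generating.
messages = [{"role": "user", "content": "Summarize W8A8 quantization in one sentence."}]
outputs = llm.chat(messages, sampling)
print(outputs[0].outputs[0].text)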