Model Card

High-quality quantization of Qwen3-Coder-480B-A35B-Instruct, produced without an imatrix.

Run

ik_llama.cpp

See this detailed guide on how to set up ik_llama and make custom quants.
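As a rough reference, ik_llama.cpp builds the same way as mainline llama.cpp via CMake. A minimal sketch for a CUDA build (the GGML_CUDA flag name is an assumption and may differ between versions; follow the guide above for authoritative steps):

git clone https://github.com/ikawrakow/ik_llama.cpp
cd ik_llama.cpp
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j $(nproc)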

./build/bin/llama-server \
    --alias anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K/Qwen3-Coder-480B-A35B-Instruct-HQ4_K-00001-of-00007.gguf \
    --no-mmap -rtr \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 51000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    -fmoe \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090

llama.cpp
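Same configuration as above, minus the ik_llama-specific -rtr (run-time repacking) and -fmoe (fused MoE) flags.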

./build/bin/llama-server \
    --alias anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K/Qwen3-Coder-480B-A35B-Instruct-HQ4_K-00001-of-00007.gguf \
    --no-mmap \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 51000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
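
Either server exposes llama-server's OpenAI-compatible HTTP API, so it can be smoke-tested once loaded. A minimal sketch (the prompt and max_tokens are arbitrary):

curl http://127.0.0.1:8090/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "Write hello world in C."}],
        "max_tokens": 128
    }'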

Quantization Recipe

Quantized with ik_llama, but should work with any GGUF-compatible inference framework.

#!/usr/bin/env bash

custom="
# Token embedding and output tensors
output\.weight=bf16
output_norm\.weight=f32
token_embd\.weight=bf16

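# Blocks 0-9: attention at q8_0, expert down-projection at q6_K,
# expert gate/up at q4_K, norms and router at f32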
blk\.[0-9]\.attn_k\.weight=q8_0
blk\.[0-9]\.attn_k_norm\.weight=f32
blk\.[0-9]\.attn_norm\.weight=f32
blk\.[0-9]\.attn_output\.weight=q8_0
blk\.[0-9]\.attn_q\.weight=q8_0
blk\.[0-9]\.attn_q_norm\.weight=f32
blk\.[0-9]\.attn_v\.weight=q8_0
blk\.[0-9]\.ffn_down_exps\.weight=q6_K
blk\.[0-9]\.ffn_gate_exps\.weight=q4_K
blk\.[0-9]\.ffn_up_exps\.weight=q4_K
blk\.[0-9]\.ffn_gate_inp\.weight=f32
blk\.[0-9]\.ffn_norm\.weight=f32
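
# Blocks 10-59: same assignments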
blk\.[1-5][0-9]\.attn_k\.weight=q8_0
blk\.[1-5][0-9]\.attn_k_norm\.weight=f32
blk\.[1-5][0-9]\.attn_norm\.weight=f32
blk\.[1-5][0-9]\.attn_output\.weight=q8_0
blk\.[1-5][0-9]\.attn_q\.weight=q8_0
blk\.[1-5][0-9]\.attn_q_norm\.weight=f32
blk\.[1-5][0-9]\.attn_v\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_exps\.weight=q6_K
blk\.[1-5][0-9]\.ffn_gate_exps\.weight=q4_K
blk\.[1-5][0-9]\.ffn_up_exps\.weight=q4_K
blk\.[1-5][0-9]\.ffn_gate_inp\.weight=f32
blk\.[1-5][0-9]\.ffn_norm\.weight=f32
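
# Blocks 60-61: same assignments (the recipe covers blocks 0-61)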
blk\.6[0-1]\.attn_k\.weight=q8_0
blk\.6[0-1]\.attn_k_norm\.weight=f32
blk\.6[0-1]\.attn_norm\.weight=f32
blk\.6[0-1]\.attn_output\.weight=q8_0
blk\.6[0-1]\.attn_q\.weight=q8_0
blk\.6[0-1]\.attn_q_norm\.weight=f32
blk\.6[0-1]\.attn_v\.weight=q8_0
blk\.6[0-1]\.ffn_down_exps\.weight=q6_K
blk\.6[0-1]\.ffn_gate_exps\.weight=q4_K
blk\.6[0-1]\.ffn_up_exps\.weight=q4_K
blk\.6[0-1]\.ffn_gate_inp\.weight=f32
blk\.6[0-1]\.ffn_norm\.weight=f32
"

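# Strip comment lines and join the remaining patterns with commas,
# producing the single argument that --custom-q expects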
custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

echo "Running with: -custom-q $custom"

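# Quantize from the BF16 source GGUF; Q4_K is the fallback type for any
# tensor not matched by the custom rules above, and 32 is the thread count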
TARGET_MODEL="Qwen3-Coder-480B-A35B-Instruct-HQ4_K"
mkdir -p ~/Env/models/anikifoss/$TARGET_MODEL
./build/bin/llama-quantize \
    --custom-q "$custom" \
    /mnt/data/Models/Qwen/Qwen3-Coder-480B-A35B-Instruct-GGUF/Qwen3-Coder-480B-A35B-Instruct-BF16-00001-of-00021.gguf \
    ~/Env/models/anikifoss/$TARGET_MODEL/$TARGET_MODEL.gguf \
    Q4_K \
    32
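
To sanity-check the result, the per-tensor quantization types can be listed with the gguf-dump script from the gguf Python package (pip install gguf); treat the exact invocation as an assumption that may vary by version. Each ffn_down_exps tensor should report q6_K:

gguf-dump ~/Env/models/anikifoss/$TARGET_MODEL/$TARGET_MODEL.gguf | grep ffn_down_exps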