justinjja committed
Commit 553fefa (verified)
Parent(s): 3b72bb5

Create quantization_script.py

Files changed (1)
  1. quantization_script.py +66 -0
quantization_script.py ADDED
@@ -0,0 +1,66 @@
+ #!/usr/bin/env python
+ """
+ Quantize Qwen/Qwen3-235B-A22B (MoE) to INT4-W4A16 on a CPU-only machine.
+ Output: Qwen3-235B-A22B-INT4-W4A16
+ """
+
+ import os, warnings
+ import torch
+ from accelerate import init_empty_weights, infer_auto_device_map
+ from transformers import AutoModelForCausalLM
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.quantization import QuantizationModifier
+
+ # --------------------------------------------------------------------
+ # Optional: silence CUDA warnings on machines without a GPU
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
+ warnings.filterwarnings("ignore", message="Can't initialize NVML")
+
+ model_id = "Qwen/Qwen3-235B-A22B"
+ output_dir = "Qwen3-235B-A22B-INT4-W4A16"
+
+ # --------------------------------------------------------------------
+ # 1) Build a dummy model (no weights) to infer a device map
+ with init_empty_weights():
+     dummy = AutoModelForCausalLM.from_pretrained(
+         model_id, torch_dtype=torch.bfloat16, trust_remote_code=True
+     )
+ device_map = infer_auto_device_map(
+     dummy, no_split_module_classes=dummy._no_split_modules
+ )
+ del dummy
+
+ # force every sub-module onto CPU
+ device_map = {name: "cpu" for name in device_map}
+
+ # --------------------------------------------------------------------
+ # 2) Load the full model weights (BF16) on CPU
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map=device_map,
+     torch_dtype=torch.bfloat16,
+     trust_remote_code=True,
+ )
+
+ # --------------------------------------------------------------------
+ # 3) Quantization recipe: keep only router gates + lm_head in BF16
+ recipe = QuantizationModifier(
+     targets="Linear",
+     scheme="W4A16",
+     ignore=[
+         "lm_head",
+         r"re:.*\.mlp\.gate$",  # router gates (tiny but accuracy-critical)
+     ],
+     dampening_frac=0.1,  # mitigates INT4 noise
+ )
+
+ # --------------------------------------------------------------------
+ # 4) One-shot quantization
+ oneshot(
+     model=model,
+     recipe=recipe,
+     output_dir=output_dir,
+ )
+
+ print(f"\n✅ Quantized model written to: {output_dir}")
+ print("   (router gates & lm_head remain in BF16; everything else INT4 W4A16)")
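For reference, the output directory written above is a compressed-tensors checkpoint in the format llm-compressor produces for vLLM. The snippet below is a minimal serving sketch, not part of the commit: it assumes a vLLM installation with compressed-tensors W4A16 support and enough GPU memory for the 235B MoE, uses an illustrative tensor_parallel_size, and points the tokenizer back at the base repo in case the tokenizer files were not copied into output_dir.

from vllm import LLM, SamplingParams

# Sketch only: serve the quantized checkpoint with vLLM (assumed setup, adjust to your hardware)
llm = LLM(
    model="Qwen3-235B-A22B-INT4-W4A16",   # output_dir from the script above
    tokenizer="Qwen/Qwen3-235B-A22B",     # base tokenizer, in case it was not saved locally
    tensor_parallel_size=4,               # illustrative value, not specified by the commit
)

params = SamplingParams(temperature=0.7, max_tokens=64)
outputs = llm.generate(["Briefly explain INT4 weight-only quantization."], params)
print(outputs[0].outputs[0].text)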