from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Model to quantize and the directory the compressed checkpoint is written to.
MODEL_ID = "moonshotai/Kimi-VL-A3B-Thinking-2506"
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

# Load the model and its processor. trust_remote_code is required because
# Kimi-VL ships custom modeling code on the Hub; torch_dtype/device_map "auto"
# let transformers pick the native dtype and spread layers across devices.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure simple PTQ: dynamic FP8 on all Linear layers, keeping the
# lm_head, the multimodal projector, and the vision tower in full precision.
# NOTE: "re:multi.*" (any chars) replaces the former "re:multi_*", which only
# matched "multi" followed by underscores and could miss projector submodules
# such as "multi_modal_projector.linear_1".
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:multi.*", "re:vision.*"],
)

# Apply the quantization algorithm; FP8_DYNAMIC needs no calibration data.
# output_dir makes oneshot write the compressed model to SAVE_DIR.
oneshot(
    model=model,
    recipe=recipe,
    tokenizer=processor,  # pass the loaded processor here
    output_dir=SAVE_DIR,
)

# Save the processor alongside the model so the exported directory is
# self-contained and can be reloaded for inference.
processor.save_pretrained(SAVE_DIR)