Problem with model quantization

#12
by bialykostek - opened

Hi,
I've fine-tuned the model, and before deployment I wanted to quantize it. I tried llm-compressor, since AutoAWQ is deprecated, but the calibration step fails with an error in the get_vllm_embedding function. To reproduce the problem, just run this code (taken from the llm-compressor examples):

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "openbmb/MiniCPM-V-4_5"            # Qwen/Qwen3-8B works well 
# trust_remote_code is needed to load the custom MiniCPM-V architecture.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)

def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }

ds = ds.map(preprocess)

# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure algorithms. 
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# Apply algorithms and save to output_dir
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True
)

Traceback:

Traceback (most recent call last):
  File "/home/bialykostek/4oc/MiniCPM-V/finetune/LLaMA-Factory/models/comp3.py", line 63, in <module>
    oneshot(
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/entrypoints/oneshot.py", line 319, in oneshot
    one_shot()
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/entrypoints/oneshot.py", line 149, in __call__
    self.apply_recipe_modifiers(
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/entrypoints/oneshot.py", line 192, in apply_recipe_modifiers
    pipeline(
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/pipelines/independent/pipeline.py", line 45, in __call__
    pipeline(model, dataloader, dataset_args)
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/pipelines/sequential/pipeline.py", line 72, in __call__
    subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/llmcompressor/pipelines/sequential/helpers.py", line 125, in trace_subgraphs
    tracer.trace(
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/transformers/utils/fx.py", line 1315, in trace
    self.graph = super().trace(root, concrete_args=concrete_args)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 838, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/bialykostek/4oc/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 837, in trace
    (self.create_arg(fn(*args)),),
                     ^^^^^^^^^
  File "MiniCPMV_8730936129375_autowrapped", line -1, in forward
  File "/home/bialykostek/.cache/huggingface/modules/transformers_modules/openbmb/MiniCPM-V-4_5/0fe9c69d46b5539b14521791f38e96e9ed007ff9/modeling_minicpmv.py", line 79, in get_vllm_embedding
    if 'vision_hidden_states' not in data:
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: argument of type 'type' is not iterable

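From the error it looks like the symbolic tracer does not pass a real dict into get_vllm_embedding, so the membership check blows up. A minimal, hypothetical illustration of the same TypeError (this is just my guess at the mechanism, not the actual tracer internals):

# If data ends up being a class object instead of a dict instance,
# the membership test raises exactly this error:
data = dict
'vision_hidden_states' not in data  # TypeError: argument of type 'type' is not iterable
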
I've tried different configurations, but I always end up with the same error. Is the problem in the model wrapper? Can you help me with that? Alternatively, could you tell me which software supports quantization of your model?

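In case it helps frame the question: the only workaround I can think of is calibrating just the language backbone and skipping the vision tower entirely. A rough sketch of what I mean, assuming the wrapper exposes the language model as model.llm (I may be wrong about the attribute name):

# Sketch only: quantize the text backbone on its own, leaving the vision
# encoder and resampler untouched. model.llm is an assumption on my side.
llm_backbone = model.llm
oneshot(
    model=llm_backbone,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

Would that even make sense for this model, or does the quantized checkpoint need the full multimodal wrapper to be usable afterwards?
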
Thanks!
