HW requirements

#6
by vladciocan88 - opened

Hello, thank you again for the best models for the open-source community. I have a question: what are the minimum VRAM requirements to run this model?

It runs on 24 GB in nf4 with an image resolution of 1024x1024, but the visuals will be slightly degraded.

How did you get it to run in nf4?

import os
from PIL import Image
import torch
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers import QwenImageEditPipeline
model_cache = "/media/kurama/1TB NVME SSD/huggingface_models/"

model_name = "Qwen/Qwen-Image"

# Pick dtype and device
if torch.cuda.is_available():
    torch_dtype = torch.bfloat16
    device = "cuda:0"
else:
    torch_dtype = torch.float32
    device = "cpu"
# Quantize the transformer and text encoder to 4-bit NF4 with bitsandbytes
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True
    },
    components_to_quantize=["transformer", "text_encoder"],  # names depend on pipeline
)
pipeline = QwenImageEditPipeline.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    cache_dir=model_cache,
    quantization_config=pipeline_quant_config,
)
# Lightning LoRA (download link below)
pipeline.load_lora_weights(
    "LORA/Qwen-Image-Lightning-4steps-V1.0.safetensors"
)
print("pipeline loaded")
pipeline.to("cuda")
image = Image.open("./example.png").convert("RGB")
prompt = "render it in a topdown perspective"
inputs = {
    "image": image,
    "prompt": prompt,
    "generator": torch.manual_seed(1),
    "true_cfg_scale": 2,
    "negative_prompt": " ",
    "num_inference_steps": 4,
}

with torch.inference_mode():
    output = pipeline(**inputs)
    output_image = output.images[0]
    output_image.save("output_image_edit.png")
    print("image saved at", os.path.abspath("output_image_edit.png"))

You can download the LoRA from here: https://huggingface.co/lightx2v/Qwen-Image-Lightning
Without it, it won't work, because the model doesn't quantize that well.
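
If you prefer to grab the LoRA programmatically instead of through the browser, something like the following should work. This is just a sketch using huggingface_hub; the "LORA" target folder is only an example chosen to match the relative path in the script above.

# Sketch: fetch the Lightning LoRA used in the script above.
# The repo id comes from the link above; local_dir is an arbitrary example.
from huggingface_hub import hf_hub_download

lora_path = hf_hub_download(
    repo_id="lightx2v/Qwen-Image-Lightning",
    filename="Qwen-Image-Lightning-4steps-V1.0.safetensors",
    local_dir="LORA",
)
print("LoRA downloaded to", lora_path)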

Here is an image generated with Qwen-Image to show how it looks in nf4:
example.png

So this works, but there are most definitely some artifacts.
In the Qwen-Image repo, @OzzyGT uses TorchAO. Could this be a solution to reduce the artifacts? 🤕🤔

Both are correct. You can use TorchAO, but if you want to use nf4 you should follow the same method I was using: you have to skip the transformer_blocks.0.img_mod layer or you will get degradation. It works with or without the Lightning LoRA. It uses a little more than 17 GB of VRAM with bitsandbytes and takes about 36 s on a 3090 with the 8-step LoRA.

prompt = "change the dog plushie for a cat preserving the background, the lighting, colors, shadows, also the cat plushie should have the same style of the dog plushie with the same eyes and lines."

source: dog_plushie.png
Lightning 8 steps: qwenimageedit.png
50 steps: cat_qwenimageedit_50steps.png

code:

import torch
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
from transformers import Qwen2_5_VLForConditionalGeneration

from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers import QwenImageEditPipeline, QwenImageTransformer2DModel
from diffusers.utils import load_image


model_id = "Qwen/Qwen-Image-Edit"
torch_dtype = torch.bfloat16
device = "cuda"

quantization_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
)
transformer = QwenImageTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)
transformer = transformer.to("cpu")

quantization_config = TransformersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)
text_encoder = text_encoder.to("cpu")

pipe = QwenImageEditPipeline.from_pretrained(
    model_id, transformer=transformer, text_encoder=text_encoder, torch_dtype=torch_dtype
)

# optionally load LoRA weights to speed up inference
pipe.load_lora_weights("lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors")
# pipe.load_lora_weights(
#     "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors"
# )
pipe.enable_model_cpu_offload()

generator = torch.Generator(device="cuda").manual_seed(42)
image = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/resources/dog_plushie.png"
).convert("RGB")

prompt = "change the dog plushie for a cat preserving the background, the lighting, colors, shadows, also the cat plushie should have the same style of the dog plushie with the same eyes and lines."

# use 8 or 4 steps if you loaded one of the Lightning LoRAs
image = pipe(image, prompt, num_inference_steps=8, generator=generator).images[0]

image.save("qwenimageedit.png")

Been at it for hours. OMG, hype. Going to test it out now!! First I'll test the code you just dropped, though.
Thank you both for all your hard work 🤕🫱🏽‍🫲🏻

Here is the TorchAO variant.
It works and takes about 22-23 GB of VRAM.
HUGE SHOUT OUT to OzzyGT (didn't @ because I did earlier and don't want to be annoying).

🤕🫱🏽‍🫲🏻 Much love to everyone.

import torch
from PIL import Image
from diffusers import AutoModel, DiffusionPipeline, TorchAoConfig

model_cache = "/path/to/weights/Qwen-image-edit"
model_id = "/path/to/weights/Qwen-Image-Edit"
torch_dtype = torch.bfloat16
device = "cuda"

# TorchAO int8 weight-only on transformer
quantization_config = TorchAoConfig("int8wo")

transformer = AutoModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)
pipe = DiffusionPipeline.from_pretrained(
    model_id, 
    transformer=transformer, 
    torch_dtype=torch_dtype,
)
pipe.enable_model_cpu_offload()

# optional LoRA (works with or without)
pipe.load_lora_weights("/path/to/weights/Qwen-Lora/Qwen-Image-Lightning-8steps-V1.1.safetensors")

prompt = "change the pickle in her hand to an eggplant while preserving the background, the lighting, colors, shadows, also the eggplant should have the same style as the cucumber"

generator = torch.Generator(device="cuda").manual_seed(42)
image = Image.open("./input.jpeg").convert("RGB")

# use 8 (or 4) steps if you're using the Lightning LoRA
image = pipe(
    image=image,
    prompt=prompt,
    num_inference_steps=8,
    generator=generator,
).images[0]

image.save("qwenimageedit_torchao.png")

I would wait for an FP8 scaled checkpoint from Kijai.
Way better than naively truncated FP8.
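
For anyone wondering what "scaled vs. naively truncated" means in practice, here is a rough illustration in plain PyTorch. It is not Kijai's actual recipe, just the general idea behind per-tensor scaling:

import torch

# Typical small-magnitude weights, like most model layers.
w = torch.randn(1024, 1024) * 0.01

# Naive truncation: cast directly to FP8, wasting most of e4m3's dynamic range.
naive = w.to(torch.float8_e4m3fn).to(torch.float32)

# Scaled FP8: rescale so the largest weight sits near e4m3's max (~448),
# quantize, then multiply the scale back in when the weight is used.
scale = w.abs().max() / 448.0
scaled = (w / scale).to(torch.float8_e4m3fn).to(torch.float32) * scale

print("naive  mean abs error:", (w - naive).abs().mean().item())
print("scaled mean abs error:", (w - scaled).abs().mean().item())

The scaled version should come out with a noticeably smaller error, which is the point of a properly scaled FP8 checkpoint.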

? 🤔... It's literally the precursor to quantizing it yourself.

God forbid... doing something on your own 🫱🏽‍🫲🏻
acc/acc

H100: 57.59 GB
wechat_2025-08-19_102329_926.png

Results with the nf4 code above:

image.png

23 s on a 4090 Suprim X at 18.8 GB of VRAM

test1.png
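
For anyone who wants to reproduce these timings, a minimal sketch, assuming the pipe, image and prompt objects from one of the scripts above:

import time
import torch

# Time a single edit; synchronize so the clock only stops once the GPU work is done.
torch.cuda.synchronize()
start = time.perf_counter()
result = pipe(image=image, prompt=prompt, num_inference_steps=8).images[0]
torch.cuda.synchronize()
print(f"edit took {time.perf_counter() - start:.1f} s")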

Hey @vladciocan88, thanks for the code and the attached results. By the way, I am pretty new to quantization, so I was wondering: how do you know what to quantize and what to skip so that quality isn't affected much? Could you also suggest some resources that would help me learn quantization/pruning better?
