HW requirements
Hello, thank you again for the best models for the open-source community. I have a question: what are the minimum VRAM requirements to run this model?
It runs on 24GB in nf4 with an image resolution of 1024x1024, but the visuals will be degraded a bit.
How did you get it to run in nf4?
import os

from PIL import Image
import torch
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers import QwenImageEditPipeline

model_cache = "/media/kurama/1TB NVME SSD/huggingface_models/"
model_name = "Qwen/Qwen-Image"

# Load the pipeline
if torch.cuda.is_available():
    torch_dtype = torch.bfloat16
    device = "cuda:0"
else:
    torch_dtype = torch.float32
    device = "cpu"

pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
    },
    components_to_quantize=["transformer", "text_encoder"],  # names depend on pipeline
)

pipeline = QwenImageEditPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit",
    torch_dtype=torch_dtype,
    cache_dir=model_cache,
    quantization_config=pipeline_quant_config,
)
pipeline.load_lora_weights(
    "LORA/Qwen-Image-Lightning-4steps-V1.0.safetensors"
)
print("pipeline loaded")
pipeline.to(device)

image = Image.open("./example.png").convert("RGB")
prompt = "render it in a topdown perspective"

inputs = {
    "image": image,
    "prompt": prompt,
    "generator": torch.manual_seed(1),
    "true_cfg_scale": 2,
    "negative_prompt": " ",
    "num_inference_steps": 4,
}

with torch.inference_mode():
    output = pipeline(**inputs)

output_image = output.images[0]
output_image.save("output_image_edit.png")
print("image saved at", os.path.abspath("output_image_edit.png"))
You can download the LoRA from here: https://huggingface.co/lightx2v/Qwen-Image-Lightning
Without it, it won't work, because the model doesn't quantize that well on its own.
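If you'd rather fetch the LoRA straight from the Hub instead of keeping a local copy, a small sketch using huggingface_hub (the filename is the same one used in the snippet above; check the repo in case it changes):

from huggingface_hub import hf_hub_download

# Download the Lightning LoRA from the Hub; the filename mirrors the one
# used in the snippet above -- verify it against the repo before relying on it.
lora_path = hf_hub_download(
    repo_id="lightx2v/Qwen-Image-Lightning",
    filename="Qwen-Image-Lightning-4steps-V1.0.safetensors",
)
pipeline.load_lora_weights(lora_path)

load_lora_weights can also take the repo id plus a weight_name directly, as shown further down the thread.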
Here is an image generated with Qwen-Image to show how it looks in nf4.
So this works, but there are most definitely some artifacts.
In the Qwen-Image repo, @OzzyGT uses TorchAO. Could this be a solution to reduce the artifacts?
Both are correct: you can use TorchAO, but if you want to use nf4 you should follow the same method I was using. You have to skip the transformer_blocks.0.img_mod layer or you will get degradation. It works with or without the Lightning LoRA. It uses a little more than 17GB of VRAM with bitsandbytes and takes about 36s on a 3090 with the 8-steps LoRA.
prompt = "change the dog plushie for a cat preserving the background, the lighting, colors, shadows, also the cat plushie should have the same style of the dog plushie with the same eyes and lines."
code:
import torch
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
from transformers import Qwen2_5_VLForConditionalGeneration
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers import QwenImageEditPipeline, QwenImageTransformer2DModel
from diffusers.utils import load_image

model_id = "Qwen/Qwen-Image-Edit"
torch_dtype = torch.bfloat16
device = "cuda"

# nf4 config for the transformer; skip the first block's img_mod layer
quantization_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
)
transformer = QwenImageTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)
transformer = transformer.to("cpu")

# nf4 config for the text encoder
quantization_config = TransformersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)
text_encoder = text_encoder.to("cpu")

pipe = QwenImageEditPipeline.from_pretrained(
    model_id, transformer=transformer, text_encoder=text_encoder, torch_dtype=torch_dtype
)

# optionally load LoRA weights to speed up inference
pipe.load_lora_weights("lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors")
# pipe.load_lora_weights(
#     "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors"
# )

pipe.enable_model_cpu_offload()

generator = torch.Generator(device="cuda").manual_seed(42)
image = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/resources/dog_plushie.png"
).convert("RGB")
prompt = "change the dog plushie for a cat preserving the background, the lighting, colors, shadows, also the cat plushie should have the same style of the dog plushie with the same eyes and lines."

# change steps to 8 or 4 if you used the Lightning LoRAs
image = pipe(image, prompt, num_inference_steps=8).images[0]
image.save("qwenimageedit.png")
Been at it for hours. OMG, hype. Going to test it out now!! First I'll test the code you just dropped, though.
Thank you both for all your hard work!
Here is the TorchAO variant.
Works and takes about 22GB-23GB of VRAM.
HUGE SHOUT OUT TO OzzyGT (didn't @ because I did earlier and don't want to be annoying).
Much love to everyone.
import torch
from PIL import Image
from diffusers import AutoModel, DiffusionPipeline, TorchAoConfig

model_cache = "/path/to/weights/Qwen-image-edit"
model_id = "/path/to/weights/Qwen-Image-Edit"
torch_dtype = torch.bfloat16
device = "cuda"

# TorchAO int8 weight-only on transformer
quantization_config = TorchAoConfig("int8wo")
transformer = AutoModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
)

pipe = DiffusionPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch_dtype,
)
pipe.enable_model_cpu_offload()

# optional LoRA (works with or without)
pipe.load_lora_weights("/path/to/weights/Qwen-Lora/Qwen-Image-Lightning-8steps-V1.1.safetensors")

prompt = "change the pickle in her hand to an eggplant while preserving the background, the lighting, colors, shadows, also the eggplant should have the same style as the cucumber"
generator = torch.Generator(device="cuda").manual_seed(42)
image = Image.open("./input.jpeg").convert("RGB")

# use 8 (or 4) steps if you're using the Lightning LoRA
image = pipe(
    image=image,
    prompt=prompt,
    num_inference_steps=8,
    generator=generator,
).images[0]
image.save("qwenimageedit_torchao.png")
I would wait for an FP8 scaled version from Kijai. Way better than a naive truncated FP8.
? ... it's like the literal precursor to quantizing it yourself.
God forbid... doing something on your own.
acc/acc
23s on a 4090 Suprim X @ 18.8GB
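For anyone wanting to reproduce timings like these, a rough sketch (assumes a loaded pipe, image and prompt as in the snippets above; time a second call so one-off warm-up costs don't skew the number):

import time

# Time a single edit; run it twice and keep the second measurement to
# exclude warm-up overhead (CUDA init, first-call setup, etc.).
start = time.perf_counter()
_ = pipe(image=image, prompt=prompt, num_inference_steps=8).images[0]
print(f"generation took {time.perf_counter() - start:.1f}s")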
Hey @vladciocan88, thanks for the code and the attached results. Btw, I'm pretty new to this quantization thing, so I was wondering: how do you know what to quantize and what to skip in a way that doesn't affect quality much? And can you please suggest some resources or anything else that can help me learn quantization/pruning better?