Exception has occurred: KeyError 'layers.0.attn.to_q.weight'

#12
by dkackman - opened

I get this error with diffusers v0.32.1 and pytorch 2.5.1:

Exception has occurred: KeyError 'layers.0.attn.to_q.weight'

when calling load_ip_adapter

This is on a 3090, so I am also calling:

# Add image_encoder to the pipeline's CPU-offload exclusion list before
# turning on sequential CPU offload.
pipe._exclude_from_cpu_offload.append("image_encoder")
pipe.enable_sequential_cpu_offload()

Minimal repro code:

# Minimal reproduction of the KeyError raised by load_ip_adapter
# (reported with diffusers v0.32.1 and pytorch 2.5.1).
import torch
from diffusers.models.transformers import SD3Transformer2DModel
from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import (
    StableDiffusion3Pipeline,
)
from diffusers.utils import load_image
from transformers import SiglipVisionModel, SiglipImageProcessor

# Hub repositories for the base model, the image encoder and the IP-Adapter.
model_path = "stabilityai/stable-diffusion-3.5-large"
image_encoder_path = "google/siglip-so400m-patch14-384"
ip_adapter_path = "InstantX/SD3.5-Large-IP-Adapter"

device = "cuda"

# Load the transformer separately so it can be handed to the pipeline below.
transformer = SD3Transformer2DModel.from_pretrained(
    model_path, subfolder="transformer", torch_dtype=torch.bfloat16
)

# The image processor and vision model both come from the same SigLIP checkpoint.
feature_extractor = SiglipImageProcessor.from_pretrained(
    image_encoder_path, torch_dtype=torch.bfloat16
)

image_encoder = SiglipVisionModel.from_pretrained(
    image_encoder_path, torch_dtype=torch.bfloat16
)
pipe = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
    feature_extractor=feature_extractor,
    image_encoder=image_encoder,
)
# Running on a 3090: image_encoder is added to the offload exclusion list and
# sequential CPU offload is enabled before the IP-Adapter is loaded.
pipe._exclude_from_cpu_offload.append("image_encoder")
pipe.enable_sequential_cpu_offload()

# Exception has occurred: KeyError
# 'layers.0.attn.to_q.weight'
# NOTE: this call is the one that fails; no revision argument is passed here.
pipe.load_ip_adapter(ip_adapter_path, subfolder="", weight_name="ip-adapter.bin")
pipe.set_ip_adapter_scale(0.6)

# Reference image passed to the pipeline as ip_adapter_image.
ref_img = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
)

# please note that SD3.5 Large is sensitive to highres generation like 1536x1536
image = pipe(
    width=1024,
    height=1024,
    prompt="a cat",
    negative_prompt="lowres, low quality, worst quality",
    num_inference_steps=24,
    guidance_scale=5.0,
    generator=torch.Generator(device).manual_seed(42),  # fixed seed (42)
    ip_adapter_image=ref_img,
).images[0]

image.save("result.jpg")

We have a PR open with the keys updated for the diffusers implementation, so in the meantime you need to pass the argument revision="f1f54ca369ae759f9278ae9c87d46def9f133c78" to pipe.load_ip_adapter.

Thanks. That works!

I wasn't able to run it with sequential offload as it then threw:
RuntimeError: Tensor on device meta is not on the expected device cuda:0!

I was able to get this to run in 24GB of VRAM by quantizing the transformer to 4 bits (bitsandbytes) instead of using sequential CPU offload:

import torch
from diffusers.models.transformers import SD3Transformer2DModel
from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import (
    StableDiffusion3Pipeline,
)
from diffusers import BitsAndBytesConfig
from diffusers.utils import load_image
from transformers import SiglipVisionModel, SiglipImageProcessor

# Hub repositories for the base model, the image encoder and the IP-Adapter.
model_path = "stabilityai/stable-diffusion-3.5-large"
image_encoder_path = "google/siglip-so400m-patch14-384"
ip_adapter_path = "InstantX/SD3.5-Large-IP-Adapter"

device = "cuda"
dtype = torch.bfloat16  # single dtype shared by every component below

# 4-bit NF4 quantization (bitsandbytes) for the transformer, used here instead
# of sequential CPU offload.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
)
transformer = SD3Transformer2DModel.from_pretrained(
    model_path,
    subfolder="transformer",
    torch_dtype=dtype,
    quantization_config=nf4_config,
)

# The image processor and vision model both come from the same SigLIP checkpoint.
feature_extractor = SiglipImageProcessor.from_pretrained(
    image_encoder_path,
    torch_dtype=dtype,
)
image_encoder = SiglipVisionModel.from_pretrained(
    image_encoder_path,
    torch_dtype=dtype,
)

# Assemble the pipeline around the pre-loaded components, then move it to the GPU.
pipe = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    transformer=transformer,
    torch_dtype=dtype,
    feature_extractor=feature_extractor,
    image_encoder=image_encoder,
)
pipe = pipe.to(device)

# Load the IP-Adapter weights from the pinned revision.
pipe.load_ip_adapter(
    ip_adapter_path,
    subfolder="",
    weight_name="ip-adapter.bin",
    revision="f1f54ca369ae759f9278ae9c87d46def9f133c78",
)
pipe.set_ip_adapter_scale(0.6)

# Reference image passed to the pipeline as ip_adapter_image.
ref_img = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
)

# please note that SD3.5 Large is sensitive to highres generation like 1536x1536
generator = torch.Generator(device).manual_seed(42)
output = pipe(
    prompt="a marmot drinks a milkshake",
    negative_prompt="lowres, low quality, worst quality",
    num_inference_steps=24,
    guidance_scale=5.0,
    generator=generator,
    ip_adapter_image=ref_img,
)
image = output.images[0]

image.save("result.jpg")

image.png

Sign up or log in to comment