InstantStyle-Hyper-SD

Runtime error

File size: 12,455 Bytes

import cv2
import torch
import random
import tempfile
import numpy as np
from pathlib import Path
from PIL import Image
from diffusers import (
    ControlNetModel,
    StableDiffusionControlNetPipeline,
    TCDScheduler,
)
import spaces
import gradio as gr
from huggingface_hub import hf_hub_download, snapshot_download
from ip_adapter import IPAdapter

# Download every file matching "models/*" from the h94/IP-Adapter repo into the
# current directory; presumably this is what the relative model paths below
# ("models/...") resolve against.
snapshot_download(repo_id="h94/IP-Adapter", allow_patterns="models/*", local_dir=".")

# global variable
MAX_SEED = np.iinfo(np.int32).max  # upper bound for the seed slider / random seeds
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): `dtype` is not read anywhere in the code visible in this file;
# the loaders below pass torch.float16 explicitly regardless of the device.
dtype = torch.float16 if "cuda" in device else torch.float32

# initialization
base_model_path = "runwayml/stable-diffusion-v1-5"
image_encoder_path = "models/image_encoder"
ip_ckpt = "models/ip-adapter_sd15.bin"

# Canny ControlNet, loaded in half precision and moved to `device`.
controlnet_path = "lllyasviel/control_v11p_sd15_canny"
controlnet = ControlNetModel.from_pretrained(
    controlnet_path, use_safetensors=False, torch_dtype=torch.float16
).to(device)

# load Hyper SD

# Base pipeline with the ControlNet attached (fp16 weights variant).
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
).to(device)
pipe.set_progress_bar_config(disable=True)
# Replace the scheduler with a TCDScheduler built from the existing scheduler config.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
# Load the LoRA file "Hyper-SD15-1step-lora.safetensors" from ByteDance/Hyper-SD.
pipe.load_lora_weights(
    hf_hub_download("ByteDance/Hyper-SD", "Hyper-SD15-1step-lora.safetensors")
)
pipe.enable_vae_tiling()

# NOTE(review): `create_image` passes the literal eta=1.0 to `generate`
# instead of reading this variable.
eta = 1.0

# NOTE(review): `create_image` builds its own IPAdapter on every call and binds
# it to a local named `ip_model`, so this module-level instance is not the one
# it uses.
ip_model = IPAdapter(
    pipe,
    image_encoder_path,
    ip_ckpt,
    device,
)


def resize_img(
    input_image,
    max_side=1280,
    min_side=512,
    size=None,
    pad_to_max_side=False,
    mode=Image.BILINEAR,
    base_pixel_number=64,
):
    """Resize a PIL image, optionally centring it on a white square canvas.

    Args:
        input_image: PIL image to resize.
        max_side: Target length of the longer side when ``size`` is not given;
            also the edge length of the padding canvas.
        min_side: Used for an intermediate scaling of the dimensions before
            the ``max_side`` scaling is applied.
        size: Optional explicit ``(width, height)``. When given, the image is
            resized straight to it and the side-based scaling is skipped.
        pad_to_max_side: If true, paste the resized image in the centre of a
            white ``max_side`` x ``max_side`` RGB canvas.
        mode: Resampling filter passed to ``Image.resize``.
        base_pixel_number: When ``size`` is not given, the final width and
            height are rounded down to a multiple of this value.

    Returns:
        The resized (and optionally padded) PIL image.
    """
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        # Clamp to 1 px: for very elongated images the shorter side can round
        # to 0, which Image.resize rejects.
        scaled_w = max(1, round(ratio * w))
        scaled_h = max(1, round(ratio * h))
        input_image = input_image.resize([scaled_w, scaled_h], mode)
        # Round down to a multiple of base_pixel_number, but never below one
        # multiple, so the final resize always gets a positive size.
        w_resize_new = max(scaled_w // base_pixel_number, 1) * base_pixel_number
        h_resize_new = max(scaled_h // base_pixel_number, 1) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        # The canvas has 3 channels, so force RGB; grayscale or RGBA pixel
        # arrays would not fit the slice below.
        res[offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new] = (
            np.array(input_image.convert("RGB"))
        )
        input_image = Image.fromarray(res)
    return input_image


# Rows for gr.Examples, in the order of its `inputs` list:
# [style image, source image, prompt, scale, control scale].
examples = [
    [
        f"./assets/{index}.jpg",
        None,
        "a cat, masterpiece, best quality, high quality",
        1.0,
        0.0,
    ]
    for index in range(4)
]
# The last row is the only one that supplies a source image and a non-zero
# control scale.
examples.append(
    [
        "./assets/2.jpg",
        "./assets/yann-lecun.jpg",
        "a man, masterpiece, best quality, high quality",
        1.0,
        0.6,
    ]
)


def run_for_examples(style_image, source_image, prompt, scale, control_scale):
    """Run `create_image` for one example row with fixed generation settings.

    The five arguments come from the example row; everything else (negative
    prompt, guidance scale, step count, seed, style mode, negative-content
    settings) is pinned here.
    """
    fixed_settings = {
        "n_prompt": "text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
        "guidance_scale": 0.0,
        "num_inference_steps": 2,
        "seed": 42,
        "target": "Load only style blocks",
        "neg_content_prompt": "",
        "neg_content_scale": 0,
    }
    return create_image(
        image_pil=style_image,
        input_image=source_image,
        prompt=prompt,
        scale=scale,
        control_scale=control_scale,
        **fixed_settings,
    )


# `target_blocks` value handed to IPAdapter for each "Style mode" choice.
_TARGET_BLOCKS = {
    "Load original IP-Adapter": ["blocks"],
    "Load only style blocks": ["up_blocks.1"],
    "Load style+layout block": ["down_blocks.2", "mid_block", "up_blocks.1"],
}


@spaces.GPU(enable_queue=True)
def create_image(
    image_pil,
    input_image,
    prompt,
    n_prompt,
    scale,
    control_scale,
    guidance_scale,
    num_inference_steps,
    seed,
    target="Load only style blocks",
    neg_content_prompt=None,
    neg_content_scale=0,
):
    """Generate one 512x512 image and return the path of a temporary JPEG.

    Args:
        image_pil: Style image, forwarded to ``generate`` as ``pil_image``.
        input_image: Optional source PIL image. When given, it is resized,
            run through Canny edge detection and used as the ControlNet
            conditioning image. When ``None``, a blank white image is used
            and ``control_scale`` is forced to 0.
        prompt: Text prompt.
        n_prompt: Negative text prompt.
        scale: Forwarded to ``generate`` as ``scale``.
        control_scale: ControlNet conditioning scale.
        guidance_scale: Forwarded to ``generate`` as ``guidance_scale``.
        num_inference_steps: Number of inference steps.
        seed: Seed for generation; ``-1`` picks a random seed in
            ``[0, MAX_SEED]``.
        target: One of the keys of ``_TARGET_BLOCKS``; selects which
            ``target_blocks`` the IPAdapter is built with.
        neg_content_prompt: Optional negative-content prompt. Only forwarded
            when it is non-empty and ``neg_content_scale`` is non-zero.
        neg_content_scale: Scale for the negative-content prompt.

    Returns:
        ``pathlib.Path`` of a ``.jpg`` temp file holding the generated image.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        ValueError: If ``target`` is not a known style mode.
    """
    # Fail with a clear message instead of hitting an unbound `ip_model` later.
    try:
        target_blocks = _TARGET_BLOCKS[target]
    except KeyError:
        raise ValueError(
            f"Unknown style mode {target!r}; expected one of {list(_TARGET_BLOCKS)}"
        ) from None

    seed = random.randint(0, MAX_SEED) if seed == -1 else seed

    # A fresh adapter is built per call; pass a copy so the shared table above
    # cannot be mutated through it.
    ip_model = IPAdapter(
        pipe,
        image_encoder_path,
        ip_ckpt,
        device,
        target_blocks=list(target_blocks),
    )

    if input_image is not None:
        input_image = resize_img(input_image, max_side=512)
        cv_input_image = pil_to_cv2(input_image)
        detected_map = cv2.Canny(cv_input_image, 50, 200)
        # Canny yields a single-channel edge map; expand it to three channels
        # for the RGB conditioning image.
        canny_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_GRAY2RGB))
    else:
        canny_map = Image.new("RGB", (512, 512), color=(255, 255, 255))
        control_scale = 0

    if float(control_scale) == 0:
        canny_map = canny_map.resize((512, 512))

    generate_kwargs = {
        "width": 512,
        "height": 512,
        "pil_image": image_pil,
        "prompt": prompt,
        "negative_prompt": n_prompt,
        "scale": scale,
        "guidance_scale": guidance_scale,
        "num_samples": 1,
        "num_inference_steps": num_inference_steps,
        "seed": seed,
        "image": canny_map,
        "controlnet_conditioning_scale": float(control_scale),
        "eta": 1.0,
    }
    # Truthiness check also covers the default `None`, which `len()` cannot take.
    if neg_content_prompt and neg_content_scale != 0:
        generate_kwargs["neg_content_prompt"] = neg_content_prompt
        generate_kwargs["neg_content_scale"] = neg_content_scale

    images = ip_model.generate(**generate_kwargs)
    image = images[0]
    # delete=False keeps the file on disk after the handle closes so the
    # returned path stays valid for the caller.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmpfile:
        image.save(tmpfile, "JPEG", quality=80, optimize=True, progressive=True)
        return Path(tmpfile.name)


def pil_to_cv2(image_pil):
    """Convert *image_pil* to a NumPy array with channels swapped RGB -> BGR."""
    rgb_pixels = np.array(image_pil)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)


# Description
# Static HTML/Markdown snippets rendered with gr.Markdown in the UI below.
title = r"""
<h1 align="center">InstantStyle + Hyper-SD</h1>
"""

# Each line wraps its text in <b>...</b>; the first line's tag is closed here
# so the bold markup is balanced.
description = r"""
<b>Forked from <a href='https://github.com/InstantStyle/InstantStyle' target='_blank'>InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation</a>.</b><br>
<b>Model by <a href='https://huggingface.co/ByteDance/Hyper-SD' target='_blank'>Hyper-SD</a> and <a href='https://huggingface.co/h94/IP-Adapter' target='_blank'>IP-Adapter</a>.</b><br>
"""

# NOTE(review): the contact address below reads "[email protected]", which looks
# like a redacted placeholder rather than a real address; confirm against the
# upstream project before publishing.
article = r"""
---
📝 **Citation**
<br>
If our work is helpful for your research or applications, please cite us via:
```bibtex
@article{wang2024instantstyle,
  title={InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation},
  author={Wang, Haofan and Wang, Qixun and Bai, Xu and Qin, Zekui and Chen, Anthony},
  journal={arXiv preprint arXiv:2404.02733},
  year={2024}
}
```
📧 **Contact**
<br>
If you have any questions, please feel free to open an issue or directly reach us out at <b>[email protected]</b>.
"""

# Build the Gradio UI and wire its controls to `create_image`.
block = gr.Blocks()
with block:
    # description
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Tabs():
        with gr.Row():
            # First column: all inputs plus the generate button.
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        image_pil = gr.Image(label="Style Image", type="pil")
                    with gr.Column():
                        prompt = gr.Textbox(
                            label="Prompt",
                            value="a cat, masterpiece, best quality, high quality",
                        )

                        scale = gr.Slider(
                            minimum=0, maximum=2.0, step=0.01, value=1.0, label="Scale"
                        )

                with gr.Accordion(open=False, label="Advanced Options"):
                    # Choices must match the `target` strings that
                    # `create_image` recognises.
                    target = gr.Radio(
                        [
                            "Load only style blocks",
                            "Load style+layout block",
                            "Load original IP-Adapter",
                        ],
                        value="Load only style blocks",
                        label="Style mode",
                    )
                    with gr.Column():
                        src_image_pil = gr.Image(
                            label="Source Image (optional)", type="pil"
                        )
                    control_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.01,
                        value=0.5,
                        label="Controlnet conditioning scale",
                    )

                    n_prompt = gr.Textbox(
                        label="Neg Prompt",
                        value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
                    )

                    neg_content_prompt = gr.Textbox(
                        label="Neg Content Prompt", value=""
                    )
                    neg_content_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.01,
                        value=0.5,
                        label="Neg Content Scale",
                    )

                    guidance_scale = gr.Slider(
                        minimum=0,
                        maximum=10.0,
                        step=0.01,
                        value=0.0,
                        label="guidance scale",
                    )
                    num_inference_steps = gr.Slider(
                        minimum=1,
                        maximum=10.0,
                        step=1.0,
                        value=3,
                        label="num inference steps",
                    )
                    # A value of -1 makes `create_image` draw a random seed.
                    seed = gr.Slider(
                        minimum=-1,
                        maximum=MAX_SEED,
                        value=-1,
                        step=1,
                        label="Seed Value",
                    )

                generate_button = gr.Button("Generate Image")

            # Second column: the generated result.
            with gr.Column():
                generated_image = gr.Image(label="Generated Image")

    # Order matches the positional parameters of `create_image`.
    inputs = [
        image_pil,
        src_image_pil,
        prompt,
        n_prompt,
        scale,
        control_scale,
        guidance_scale,
        num_inference_steps,
        seed,
        target,
        neg_content_prompt,
        neg_content_scale,
    ]
    outputs = [generated_image]

    # Run `create_image` on a button click or on the `input` event of the
    # listed controls. Note that neither image component nor the negative
    # prompt textbox is listed as a trigger.
    gr.on(
        triggers=[
            prompt.input,
            generate_button.click,
            guidance_scale.input,
            scale.input,
            control_scale.input,
            seed.input,
            num_inference_steps.input,
            target.input,
            neg_content_prompt.input,
            neg_content_scale.input,
        ],
        fn=create_image,
        inputs=inputs,
        outputs=outputs,
        show_progress="minimal",
        show_api=False,
        trigger_mode="always_last",
    )

    # Example rows feed `run_for_examples`, which fills in the remaining
    # generation settings itself.
    # NOTE(review): with cache_examples=True Gradio presumably runs the
    # examples ahead of time to cache their outputs — confirm for the
    # installed Gradio version.
    gr.Examples(
        examples=examples,
        inputs=[image_pil, src_image_pil, prompt, scale, control_scale],
        fn=run_for_examples,
        outputs=[generated_image],
        cache_examples=True,
    )

    gr.Markdown(article)

# Enable the request queue and start the app; both calls disable the API surface.
block.queue(api_open=False)
block.launch(show_api=False)