Spaces:
Running
on
Zero
Running
on
Zero
| import spaces | |
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import AutoProcessor, AutoModelForCausalLM, pipeline | |
| from diffusers import DiffusionPipeline | |
| import random | |
| import numpy as np | |
| import os | |
| import subprocess | |
| from huggingface_hub import hf_hub_download | |
| from llm_inference import LLMInferenceNode | |
# Install flash-attn at startup.
# The current environment is merged into `env` on purpose: `subprocess.run`
# uses `env` *instead of* the inherited environment, so passing only the single
# override would start the shell without PATH, HOME, proxy settings, etc.
# FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn build to skip
# compiling its CUDA kernels -- confirm against the flash-attn build docs.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Initialize models
# Use the GPU with bfloat16 when CUDA is available, otherwise CPU with float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Access token read from the environment; None when the variable is unset.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# SD3.5 model
# NOTE(review): variant="fp16" is requested while torch_dtype may be bfloat16
# -- confirm this combination is intended.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    torch_dtype=dtype,
    use_safetensors=True,
    variant="fp16",
    token=huggingface_token,
).to(device)

# Initialize Florence model (image captioner) and its processor.
florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)

# Prompt Enhancer (called through the "summarization" pipeline task).
enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance-Long", device=device)

# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max
# Upper bound for the width/height sliders.
MAX_IMAGE_SIZE = 1024

# Fetch LICENSE.md from the *turbo* repository into ./models.
# NOTE(review): the pipeline above loads the non-turbo checkpoint; presumably
# this download only verifies that the token has gated access -- confirm.
hf_hub_download(
    repo_id="stabilityai/stable-diffusion-3.5-large-turbo",
    filename="LICENSE.md",
    local_dir="./models",
    token=huggingface_token,
)

# Initialize LLMInferenceNode
llm_node = LLMInferenceNode()
| # Florence caption function | |
def florence_caption(image):
    """Caption *image* using Florence-2's ``<MORE_DETAILED_CAPTION>`` task.

    Args:
        image: A PIL image, or any object accepted by ``Image.fromarray``.

    Returns:
        The entry stored under the task key in the processor's
        post-processed generation output.
    """
    task = "<MORE_DETAILED_CAPTION>"

    # Normalise the input to a PIL image before handing it to the processor.
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

    model_inputs = florence_processor(text=task, images=pil_image, return_tensors="pt").to(device)

    # Beam search with three beams, no sampling.
    output_ids = florence_model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text that is passed on to
    # post_process_generation.
    decoded = florence_processor.batch_decode(output_ids, skip_special_tokens=False)[0]
    size = (pil_image.width, pil_image.height)
    parsed = florence_processor.post_process_generation(decoded, task=task, image_size=size)
    return parsed[task]
| # Prompt Enhancer function | |
def enhance_prompt(input_prompt):
    """Run *input_prompt* through the prompt-enhancer pipeline.

    The prompt is prefixed with a fixed instruction, and the
    ``summary_text`` of the first pipeline result is returned.
    """
    request = "Enhance the description: " + input_prompt
    outputs = enhancer_long(request)
    return outputs[0]['summary_text']
def process_workflow(
    image,
    text_prompt,
    use_enhancer,
    use_llm_generator,
    llm_provider,
    llm_model,
    prompt_type,
    seed,
    randomize_seed,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    negative_prompt="",
    progress=gr.Progress(track_tqdm=True),
):
    """Build a prompt and render one image with the diffusion pipeline.

    Args:
        image: Optional input image. When given, it is captioned with
            Florence-2 and the caption becomes the prompt source.
        text_prompt: Prompt used when no image is supplied.
        use_enhancer: Pass the prompt through ``enhance_prompt`` when true.
        use_llm_generator: With an image, rewrite its caption through
            ``generate_llm_prompt`` when true.
        llm_provider, llm_model, prompt_type: Forwarded to
            ``generate_llm_prompt``.
        seed: Seed for the generator; replaced when ``randomize_seed`` is true.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        width, height, guidance_scale, num_inference_steps, negative_prompt:
            Forwarded to the diffusion pipeline.
        progress: Gradio progress tracker (default tracks tqdm); not
            referenced directly in the body.

    Returns:
        A ``(image, prompt, seed)`` tuple: the first generated image, the
        prompt that was actually used, and the seed that was actually used.
    """
    if image is None:
        prompt = text_prompt
    else:
        # Convert image to PIL if it's not already
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        caption = florence_caption(image)
        print(f"Florence caption: {caption}")

        prompt = (
            generate_llm_prompt(caption, llm_provider, llm_model, prompt_type)
            if use_llm_generator
            else caption
        )

    if use_enhancer:
        prompt = enhance_prompt(prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    rng = torch.Generator(device=device).manual_seed(seed)
    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        generator=rng,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        guidance_scale=guidance_scale,
    )
    return output.images[0], prompt, seed
def generate_llm_prompt(input_text, provider, model, prompt_type):
    """Ask the LLM node to turn *input_text* into an image prompt.

    Args:
        input_text: Source text (a caption or user prompt) to rewrite.
        provider: LLM provider name forwarded to ``llm_node.generate``.
        model: Model identifier forwarded to ``llm_node.generate``.
        prompt_type: Prompt style forwarded to ``llm_node.generate``.

    Returns:
        Whatever ``llm_node.generate`` returns, or *input_text* unchanged if
        the call raises.
    """
    try:
        return llm_node.generate(
            input_text=input_text,
            long_talk=True,
            compress=False,
            compression_level="medium",
            poster=False,
            prompt_type=prompt_type,
            provider=provider,
            model=model,
        )
    except Exception as e:
        # Deliberate best-effort: an LLM failure must not abort the workflow,
        # so report it and fall back to the original text.
        print(f"An error occurred in generate_llm_prompt: {e}")
        return input_text  # Return original input if there's an error
# HTML header shown at the top of the page (rendered with gr.HTML(title)):
# a centered heading, links to the three model pages, and a one-line
# description of the app.
# NOTE(review): the description <p> sits inside the outer <p><center> wrapper;
# nested <p> elements are not valid HTML -- confirm the rendering is as intended.
title = """<h1 align="center">Stable Diffusion 3.5 with Florence-2 Captioner and Prompt Enhancer</h1>
<p><center>
<a href="https://huggingface.co/stabilityai/stable-diffusion-3.5-large" target="_blank">[Stable Diffusion 3.5 Model]</a>
<a href="https://huggingface.co/microsoft/Florence-2-base" target="_blank">[Florence-2 Model]</a>
<a href="https://huggingface.co/gokaygokay/Lamini-Prompt-Enchance-Long" target="_blank">[Prompt Enhancer Long]</a>
<p align="center">Create long prompts from images or enhance your short prompts with prompt enhancer</p>
</center></p>
"""
# Stylesheet passed to gr.Blocks(css=custom_css). It styles the classes that
# the UI attaches through `elem_classes`:
#   .input-group / .output-group -- the bordered gr.Group panels
#   .submit-btn                  -- the "Generate Prompt" / "Generate Image" buttons
#   .custom-slider               -- the range inputs of the settings sliders
custom_css = """
.input-group, .output-group {
border: 1px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
background-color: #f9f9f9;
}
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
/* Updated styles for sliders */
.custom-slider input[type="range"] {
-webkit-appearance: none;
width: 100%;
height: 10px;
border-radius: 5px;
background: #d3d3d3;
outline: none;
opacity: 0.7;
transition: opacity .2s;
}
.custom-slider input[type="range"]:hover {
opacity: 1;
}
.custom-slider input[type="range"]::-webkit-slider-thumb {
-webkit-appearance: none;
appearance: none;
width: 20px;
height: 20px;
border-radius: 50%;
background: #2980b9;
cursor: pointer;
}
.custom-slider input[type="range"]::-moz-range-thumb {
width: 20px;
height: 20px;
border-radius: 50%;
background: #2980b9;
cursor: pointer;
}
"""
# Gradio UI: three columns (image + generation settings, prompt controls,
# result) followed by the event wiring.
# NOTE(review): the container nesting below is reconstructed from statement
# order -- confirm it matches the deployed layout.
with gr.Blocks(theme='bethecloud/storj_theme', css=custom_css) as demo:
    gr.HTML(title)

    with gr.Row():
        # Column 1: optional input image and image-generation settings.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                input_image = gr.Image(label="Input Image (Florence-2 Captioner)", height=512)

            with gr.Accordion("Image Settings", open=False):
                width = gr.Slider(label="Width", minimum=512, maximum=MAX_IMAGE_SIZE, step=32, value=1024, elem_classes="custom-slider")
                height = gr.Slider(label="Height", minimum=512, maximum=MAX_IMAGE_SIZE, step=32, value=1024, elem_classes="custom-slider")
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=7.5, step=0.1, value=4.5, elem_classes="custom-slider")
                num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=40, elem_classes="custom-slider")
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, elem_classes="custom-slider")
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                negative_prompt = gr.Textbox(label="Negative Prompt")

        # Column 2: text prompt, enhancer/LLM toggles, and the action buttons.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                text_prompt = gr.Textbox(label="Text Prompt (optional, used if no image is uploaded)")
                use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
                use_llm_generator = gr.Checkbox(label="Use LLM Prompt Generator", value=False)

            # The three LLM dropdowns start hidden; update_llm_visibility
            # shows them when "Use LLM Prompt Generator" is ticked.
            with gr.Accordion("LLM Settings", open=False):
                llm_provider = gr.Dropdown(
                    choices=["Hugging Face", "SambaNova"],
                    label="LLM Provider",
                    value="Hugging Face",
                    visible=False
                )
                llm_model = gr.Dropdown(
                    label="LLM Model",
                    choices=["Qwen/Qwen2.5-72B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3"],
                    value="Qwen/Qwen2.5-72B-Instruct",
                    visible=False
                )
                prompt_type = gr.Dropdown(
                    choices=["Random", "Long", "Short", "Medium", "OnlyObjects", "NoFigure", "Landscape", "Fantasy"],
                    label="Prompt Type",
                    value="Short",
                    visible=False
                )

            generate_prompt_btn = gr.Button("Generate Prompt", elem_classes="submit-btn")
            # Read-only box holding the prompt that will be sent to the pipeline.
            final_prompt = gr.Textbox(label="Final Prompt", interactive=False)
            generate_btn = gr.Button("Generate Image", elem_classes="submit-btn")

        # Column 3: generated image and the seed that produced it.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="output-group"):
                output_image = gr.Image(label="Result", elem_id="gallery", show_label=False)
                used_seed = gr.Number(label="Seed Used")

    def update_model_choices(provider):
        """Return a model dropdown populated for *provider*.

        The first model of the provider's list is preselected; an unknown
        provider yields an empty choice list and an empty-string value.
        """
        provider_models = {
            "Hugging Face": [
                "Qwen/Qwen2.5-72B-Instruct",
                "meta-llama/Meta-Llama-3.1-70B-Instruct",
                "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "mistralai/Mistral-7B-Instruct-v0.3"
            ],
            "SambaNova": [
                "Meta-Llama-3.1-70B-Instruct",
                "Meta-Llama-3.1-405B-Instruct",
                "Meta-Llama-3.1-8B-Instruct"
            ],
        }
        models = provider_models.get(provider, [])
        return gr.Dropdown(choices=models, value=models[0] if models else "")

    def update_llm_visibility(use_llm):
        """Return visibility updates for the three LLM dropdowns, keyed by component."""
        return {
            llm_provider: gr.update(visible=use_llm),
            llm_model: gr.update(visible=use_llm),
            prompt_type: gr.update(visible=use_llm)
        }

    # Show or hide the LLM dropdowns when the LLM checkbox changes.
    use_llm_generator.change(
        update_llm_visibility,
        inputs=[use_llm_generator],
        outputs=[llm_provider, llm_model, prompt_type]
    )

    # Refresh the model list whenever the provider changes.
    llm_provider.change(
        update_model_choices,
        inputs=[llm_provider],
        outputs=[llm_model]
    )

    def generate_prompt(image, text_prompt, use_enhancer, use_llm_generator, llm_provider, llm_model, prompt_type):
        """Build the final prompt without generating an image.

        The starting text is the Florence caption when an image is given,
        otherwise *text_prompt*. It is then optionally rewritten by the LLM
        generator and optionally passed through the prompt enhancer, in that
        order.
        """
        if image is not None:
            caption = florence_caption(image)
            initial_prompt = caption
        else:
            initial_prompt = text_prompt

        if use_llm_generator:
            prompt = generate_llm_prompt(initial_prompt, llm_provider, llm_model, prompt_type)
        else:
            prompt = initial_prompt

        if use_enhancer:
            prompt = enhance_prompt(prompt)

        return prompt

    generate_prompt_btn.click(
        fn=generate_prompt,
        inputs=[
            input_image, text_prompt, use_enhancer, use_llm_generator, llm_provider, llm_model, prompt_type
        ],
        outputs=[final_prompt]
    )

    # `final_prompt` fills process_workflow's `text_prompt` slot and is also
    # overwritten with the prompt that process_workflow returns.
    # NOTE(review): the `text_prompt` textbox is not an input here, so typed
    # text only reaches image generation after "Generate Prompt" has copied it
    # into `final_prompt`. With an image present, process_workflow re-captions
    # it and does not use `final_prompt`. Confirm both are intended.
    generate_btn.click(
        fn=process_workflow,
        inputs=[
            input_image, final_prompt, use_enhancer, use_llm_generator, llm_provider, llm_model, prompt_type,
            seed, randomize_seed, width, height, guidance_scale, num_inference_steps, negative_prompt
        ],
        outputs=[output_image, final_prompt, used_seed]
    )

# Start the app with debug=True.
demo.launch(debug=True)