Spaces:

THUDM-HF-SPACE
/

CogView4-Control

Runtime error

CogView4-Control / app.py

Yuxuan Zhang

change

905475d 9 days ago

19.3 kB

	import os
	import re
	import threading
	import time
	from datetime import datetime, timedelta

	import gradio as gr
	import random
	import spaces
	from diffusers import CogView4ControlPipeline
	import torch
	from PIL import Image
	from openai import OpenAI

	# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
	device = "cpu"
	if torch.cuda.is_available():
	    device = "cuda"

	# Load the CogView4 control pipeline in bfloat16 and move it to the chosen device.
	pipe = CogView4ControlPipeline.from_pretrained(
	    "THUDM/CogView4-6B-Control",
	    torch_dtype=torch.bfloat16,
	).to(device)


	def clean_string(s):
	    """Flatten a prompt onto a single line.

	    Newlines become spaces, leading and trailing whitespace is trimmed, and
	    every run of two or more whitespace characters collapses to one space.
	    """
	    single_line = s.replace("\n", " ").strip()
	    return re.sub(r"\s{2,}", " ", single_line)


	def _build_enhancement_messages(prompt):
	    """Return the system prompt, the few-shot examples and the user's request as chat messages."""
	    return [
	        {
	            "role": "system",
	            "content": "You are a bilingual image description assistant that works with an image generation bot. You work with an assistant bot that will draw anything you say . \n For example ,For example, outputting \"a beautiful morning in the woods with the sun peaking through the trees\" or \"阳光透过树林的美丽清晨\" will trigger your partner bot to output an image of a forest morning, as described . \n You will be prompted by people looking to create detailed , amazing images . The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive . \n There are a few rules to follow : \n - Input can be in Chinese or English. If input is in English, prompt should be written in English. If input is in Chinese, prompt should be written in Chinese.\n - You will only ever output a single image description per user request .\n - Image descriptions must be detailed and specific, including keyword categories such as subject, medium, style, additional details, color, and lighting. \n - When generating descriptions, focus on portraying the visual elements rather than delving into abstract psychological and emotional aspects. Provide clear and concise details that vividly depict the scene and its composition, capturing the tangible elements that make up the setting.\n - Do not provide the process and explanation, just return the modified description . \n "
	        },
	        {
	            "role": "user",
	            "content": "Create an imaginative image descriptive caption for the user input : \"An anime girl stands amidst a dense flower bush.\""
	        },
	        {
	            "role": "assistant",
	            "content": "This image is a beautifully crafted digital illustration in an anime style. It features Suzuran, a character from the video game Arknights. She stands gracefully amidst a picturesque meadow with lush green grass and scattered wildflowers that gently sway in the breeze. Her attire includes a detailed outfit with layered ruffles and intricate fastenings, reflecting both elegance and functionality typical of the game's aesthetic. Suzuran holds a small bouquet of flowers delicately in her hands, adding to the serene atmosphere.\n\nProminent fox ears extend from her head, signifying her animal spirit form or persona within the game. The background showcases rolling hills covered in dense foliage under a brilliant blue sky dotted with fluffy white clouds. Golden leaves float whimsically through the air, enhancing the magical quality of the scene. A distant building partially obscured by vegetation hints at human presence without detracting from the natural beauty surrounding Suzuran. Overall, this artwork captures a moment of tranquility and charm, embodying the essence of the character and the enchanting world she inhabits."
	        },
	        {
	            "role": "user",
	            "content": "Create an imaginative image descriptive caption for the user input : \"Draw a bright convertible car with a sense of artistic design.\""
	        },
	        {
	            "role": "assistant",
	            "content": "The image showcases a meticulously crafted Delahaye 135 M.S. Special Roadster from 1937 in a highly polished and realistic rendering style that highlights its luxurious design and impeccable details. The car's body is an elegant deep brown with a glossy finish, exuding sophistication and timeless beauty. Its aerodynamic, streamlined shape features smooth curves accentuated by chrome detailing on the fenders and running boards. The front grille has three prominent circular headlights, adding to its classic charm while the dual exhaust pipes are tastefully integrated into the rear fenders.\n\nThe open cockpit area reveals sumptuous tan leather seats, emphasizing both comfort and elegance for the driver and passenger. The vehicle’s number plate reads \"MB 175\". Photographed against a dark gradient background, the focus remains solely on this automotive masterpiece, allowing viewers to appreciate every intricate detail without distraction. This image captures not just a car but a piece of art, reflecting the luxury and craftsmanship emblematic of pre-war European automobiles."
	        },
	        {
	            "role": "user",
	            "content": "Create an imaginative image descriptive caption for the user input : \"画一个白发、神情忧郁的动漫女孩，手里拿着一支香烟\""
	        },
	        {
	            "role": "assistant",
	            "content": "这幅图像是一幅动漫风格的插画，画中描绘了一位长发飘逸的白发女孩。她神情忧郁，双眼低垂，脸上带着微微的愁容。女孩穿着浅色外套，里面搭配深色衬衫和领带，增添了她沉静却时尚的外表。她的手靠近嘴边，似乎在拿着一支香烟，画作的说明也表示她在吸烟。背景描绘了一个冬季的城市场景，地面和建筑物上覆盖着积雪。几株稀疏的植物从雪地中探出，街边停着一辆车，似乎是空荡荡的街道，增强了场景的荒凉氛围。这幅作品整体风格细致精美，典型的现代动漫插画风格，既捕捉了角色的情感表达，也呈现了周围环境的静谧氛围"
	        },
	        {
	            "role": "user",
	            "content": "Create an imaginative image descriptive caption for the user input : \"一张红色的海报，中间写有“开门大吉”\""
	        },
	        {
	            "role": "assistant",
	            "content": "这张图片采用了海报风格，色彩鲜艳，主要以红色和金色为主，寓意吉祥如意。在画面的正中间是一块红色的大匾，上面用白色大字写着“开门大吉”，四角点缀着金色的装饰图案，显得格外喜庆。匾额上方悬挂着一对红色的灯笼，增添了节日的气氛。背景左右两侧是传统的绿色中式建筑，屋顶呈现出典型的飞檐翘角设计。底部有祥云朵朵和可爱的卡通福袋，象征着好运福气满满。整张海报传达出浓厚的节日氛围。"
	        },
	        {
	            "role": "user",
	            "content": f"Create an imaginative image descriptive caption for the user input : {prompt}"
	        },
	    ]


	def convert_prompt(
	    prompt: str,
	    retry_times: int = 5,
	) -> str:
	    """Expand a short prompt into a detailed image description via a chat model.

	    When the OPENAI_API_KEY environment variable is unset or empty the prompt
	    is returned untouched and no request is made. Otherwise the prompt is
	    whitespace-normalized and sent, together with a fixed set of few-shot
	    examples, to the chat completion endpoint.

	    Args:
	        prompt: The user's prompt text.
	        retry_times: Maximum number of requests to attempt.

	    Returns:
	        The first non-empty model reply (whitespace-normalized). If every
	        attempt raises or yields an empty reply, the whitespace-normalized
	        original prompt is returned so the caller never loses the user's text.
	    """
	    if not os.environ.get("OPENAI_API_KEY"):
	        return prompt
	    client = OpenAI()
	    prompt = clean_string(prompt)
	    # The request is identical on every attempt, so build it once.
	    messages = _build_enhancement_messages(prompt)
	    for attempt in range(1, retry_times + 1):
	        try:
	            response = client.chat.completions.create(
	                messages=messages,
	                model="glm-4-plus",
	                temperature=0.01,
	                top_p=0.7,
	                stream=False,
	                max_tokens=300,
	            )
	            enhanced = response.choices[0].message.content
	        except Exception as exc:
	            # Enhancement is best-effort: report the failure and try again
	            # instead of failing the UI action.
	            print(f"Prompt enhancement attempt {attempt}/{retry_times} failed: {exc}")
	            continue
	        # Keep the reply in its own variable so an empty/None answer can never
	        # replace the user's prompt.
	        cleaned = clean_string(enhanced) if enhanced else ""
	        if cleaned:
	            return cleaned

	    return prompt


	def _remove_stale_files(directory, cutoff):
	    """Delete regular files in `directory` whose modification time is before `cutoff`."""
	    for filename in os.listdir(directory):
	        file_path = os.path.join(directory, filename)
	        try:
	            if not os.path.isfile(file_path):
	                continue
	            modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
	            if modified_at < cutoff:
	                os.remove(file_path)
	        except OSError as exc:
	            # A file may disappear or be locked between listing and removal;
	            # skip it rather than abort the whole sweep.
	            print(f"Could not clean up {file_path}: {exc}")


	def delete_old_files():
	    """Periodically purge stale files from ./gradio_tmp. Never returns.

	    Every 600 seconds, regular files in ./gradio_tmp that were last modified
	    more than 5 minutes ago are removed. The directory is created if missing.
	    Filesystem errors are reported and skipped so that a single failure does
	    not end the loop (and with it all future cleanup).
	    """
	    directories = ["./gradio_tmp"]
	    while True:
	        cutoff = datetime.now() - timedelta(minutes=5)
	        for directory in directories:
	            try:
	                os.makedirs(directory, exist_ok=True)
	                _remove_stale_files(directory, cutoff)
	            except OSError as exc:
	                print(f"Could not sweep {directory}: {exc}")
	        time.sleep(600)


	# Run the temp-file cleanup loop in the background. It is a daemon thread,
	# so it does not keep the process alive at shutdown.
	_cleanup_thread = threading.Thread(target=delete_old_files, daemon=True)
	_cleanup_thread.start()


	# NOTE(review): this helper only does CPU-side PIL work, yet it carries the
	# ZeroGPU decorator - confirm whether the decorator was meant for the function
	# that actually runs the pipeline.
	@spaces.GPU(duration=180)  # [uncomment to use ZeroGPU]
	def resize_image_to_valid_dimensions(image, min_size=512, max_size=2048, divisible_by=32, max_pixels=2 ** 21):
	    """Resize an image so its dimensions are acceptable to the model.

	    The target size is derived from the input size in this order:
	      1. each side is raised to at least min_size (keeping the aspect ratio),
	      2. each side is capped at max_size (keeping the aspect ratio),
	      3. both sides are rounded down to a multiple of divisible_by,
	      4. the longer side is shrunk in divisible_by steps until
	         width * height <= max_pixels,
	      5. each side is finally clamped to be no smaller than min_size.

	    Non-PIL inputs are converted with Image.fromarray. Returns None when
	    image is None, otherwise a new LANCZOS-resampled PIL image.
	    """
	    if image is None:
	        return None

	    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

	    def snap(value):
	        # Round down to the nearest multiple of divisible_by.
	        return (value // divisible_by) * divisible_by

	    width, height = pil_image.size
	    ratio = width / height

	    # Enforce the lower bound, width first and then height.
	    if width < min_size:
	        width, height = min_size, int(min_size / ratio)
	    if height < min_size:
	        width, height = int(min_size * ratio), min_size

	    # Enforce the upper bound, width first and then height.
	    if width > max_size:
	        width, height = max_size, int(max_size / ratio)
	    if height > max_size:
	        width, height = int(max_size * ratio), max_size

	    width, height = snap(width), snap(height)

	    # Shrink along the longer side until the pixel budget is met.
	    landscape = ratio > 1
	    while width * height > max_pixels:
	        if landscape:
	            width -= divisible_by
	            height = snap(int(width / ratio))
	        else:
	            height -= divisible_by
	            width = snap(int(height * ratio))

	    target_size = (max(min_size, width), max(min_size, height))
	    return pil_image.resize(target_size, Image.Resampling.LANCZOS)


	def show_image_info(image):
	    """Describe an uploaded image's size before and after model resizing.

	    Returns a prompt-to-upload message when image is None; otherwise a string
	    with the original size and the size produced by
	    resize_image_to_valid_dimensions.
	    """
	    if image is None:
	        return "Must Upload a Image"

	    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
	    source_w, source_h = pil_image.size
	    target_w, target_h = resize_image_to_valid_dimensions(pil_image).size

	    return f"Original Size: {source_w}x{source_h}, Resize: {target_w}x{target_h}"


	# On ZeroGPU Spaces a GPU is only attached while a @spaces.GPU-decorated
	# function is running; this is the function that executes the pipeline.
	@spaces.GPU(duration=180)
	def infer(prompt, control_image, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, num_images,
	          progress=gr.Progress(track_tqdm=True)):
	    """Generate images from a text prompt guided by a control image.

	    Args:
	        prompt: Text description passed to the pipeline.
	        control_image: PIL image or array; arrays are converted with
	            Image.fromarray. The image is resized with
	            resize_image_to_valid_dimensions before use.
	        seed: Seed for the random generator; ignored when randomize_seed is set.
	        randomize_seed: When true, a seed is drawn from [0, 65536].
	        width, height: Accepted for interface compatibility but not used; the
	            output size is taken from the resized control image.
	        guidance_scale: Forwarded to the pipeline.
	        num_inference_steps: Forwarded to the pipeline.
	        num_images: Forwarded as num_images_per_prompt.
	        progress: Gradio progress tracker (tqdm tracking enabled).

	    Returns:
	        (images, seed, error): the generated images and None as error on
	        success, or (None, seed, message) when no control image was given.
	    """
	    if randomize_seed:
	        seed = random.randint(0, 65536)
	    # torch.Generator.manual_seed requires an int; coerce defensively in case
	    # the UI hands the slider value back as a float.
	    seed = int(seed)
	    if control_image is None:
	        return None, seed, "Must Upload an Image"
	    if not isinstance(control_image, Image.Image):
	        control_image = Image.fromarray(control_image)

	    control_image = resize_image_to_valid_dimensions(control_image)
	    actual_width, actual_height = control_image.size
	    print(f"Change Image Size to: {actual_width}x{actual_height}")

	    images = pipe(
	        prompt=prompt,
	        control_image=control_image,
	        guidance_scale=guidance_scale,
	        num_images_per_prompt=num_images,
	        num_inference_steps=num_inference_steps,
	        width=actual_width,
	        height=actual_height,
	        generator=torch.Generator().manual_seed(seed),
	    ).images
	    return images, seed, None


	# Prompts shown in the "Examples" section of the UI.
	examples = [
	    "这幅图像展现了一幅令人宁静的夜景壁纸。画面上方是深邃的天空，点缀着多颗星星和几道流星，给人一种无限辽阔的感觉。天际线下方是一片平静的水面，清晰地映射出上方的景象，形成完美的倒影。这种对称的视觉效果增强了画面的宁静氛围。在水面上方，有一个站在岩石上的人物剪影，似乎在凝视着夜空，剪影增添了一丝神秘感。整幅图像采用了动画风格，色彩鲜明，层次分明，给人以强烈的视觉冲击力，同时又营造出一种静谧和谐的空间感",
	    "这张图片充满了魔幻色彩，展示了“哈利·波特”系列中的经典地标。画面中央是一块古朴的路牌，上面分别写着\"HOGGSMEADE\"和\"HOGWARTS\"，字体独特且具有古老的魔法风格。路牌的材质仿佛是经过岁月洗礼的铁质，表面略显斑驳。背景中矗立着宏伟的霍格沃茨城堡，其高耸的塔楼和石墙透露出神秘与庄严的气息。一盏复古的灯笼装在路牌旁边，微微发光，为整个场景增添了一丝温暖和梦幻的氛围。这幅图像采用了高清摄影风格，细节丰富，使人仿佛置身于魔法世界之中",
	]

	# One entry per example prompt: the control image, the expected output image
	# and the prompt itself. Files are numbered from 1 in prompt order.
	example_pairs = [
	    {
	        "input": f"img_control/img_{number}_input.png",
	        "output": f"img_control/img_{number}_output.png",
	        "prompt": text,
	    }
	    for number, text in enumerate(examples, start=1)
	]

	# Gradio UI: prompt box, control-image upload, result gallery, advanced
	# settings, clickable examples, and the event wiring between them.
	with gr.Blocks() as demo:
	    # Page header: title, project links, a one-line description and a disclaimer.
	    gr.HTML("""
	        <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
	            CogView4-6B-Control Demo 🚀
	        </div>
	        <div style="text-align: center;">
	            <a href="https://huggingface.co/THUDM/CogView4-6B-Control">🤗 Model Hub</a> |
	            <a href="https://github.com/THUDM/CogView4">🌐 Github</a> |
	            <a href="https://arxiv.org/abs/2403.05121">📜 arxiv</a>
	        </div>
	        <div style="text-align: center; margin-top: 10px;">
	            <span>基于控制图像和文本提示词生成新图像</span>
	        </div>
	        <div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px; margin-top: 10px;">
	            ⚠️ 此演示仅用于学术研究和体验目的。
	        </div>
	    """)

	    with gr.Column():
	        with gr.Row():
	            prompt = gr.Text(
	                label="Prompt",
	                show_label=True,
	                max_lines=15,
	                placeholder="Enter your prompt",
	                container=True,
	            )
	        with gr.Row():
	            enhance = gr.Button("Enhance Prompt (Strongly Suggest)", scale=1)
	            run_button = gr.Button("Run", scale=1)
	            # The enhanced prompt replaces the text in the prompt box.
	            enhance.click(
	                convert_prompt,
	                inputs=[prompt],
	                outputs=[prompt]
	            )

	        with gr.Row():
	            with gr.Column(scale=1):
	                control_image_input = gr.Image(
	                    label="Control Image",
	                    type="pil",
	                    height=400
	                )
	                image_info = gr.Textbox(
	                    label="Image Info",
	                    interactive=False
	                )
	                # Show original vs. resized dimensions whenever the image changes.
	                control_image_input.change(
	                    fn=show_image_info,
	                    inputs=[control_image_input],
	                    outputs=[image_info]
	                )
	            with gr.Column(scale=1):
	                result = gr.Gallery(label="Results", show_label=True, columns=2, rows=2)
	                # Hidden until a run reports an error message.
	                error_output = gr.Text(label="Errors", visible=False)

	        with gr.Row():
	            num_images = gr.Radio(
	                choices=[1, 2, 4],
	                label="Number of Images",
	                value=1
	            )

	        MAX_PIXELS = 2 ** 21

	        # The two helpers below are not attached to any event in this block.
	        def update_max_height(width):
	            """Return an update capping a slider's maximum at MAX_PIXELS // width."""
	            max_height = MAX_PIXELS // width
	            return gr.update(maximum=max_height)

	        def update_max_width(height):
	            """Return an update capping a slider's maximum at MAX_PIXELS // height."""
	            max_width = MAX_PIXELS // height
	            return gr.update(maximum=max_width)

	        with gr.Accordion("Advanced Settings", open=False):
	            seed = gr.Slider(
	                label="Seed",
	                minimum=0,
	                maximum=65536,
	                step=1,
	                value=0,
	            )

	            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	            with gr.Row():
	                width = gr.Slider(
	                    label="Width",
	                    minimum=512,
	                    maximum=2048,
	                    step=32,
	                    value=1024,
	                )

	                height = gr.Slider(
	                    label="Height",
	                    minimum=512,
	                    maximum=2048,
	                    step=32,
	                    value=1024,
	                )

	            gr.Markdown("Warning: The width and height must be divisible by 32 and Image size must not exceed 2^21 pixels.")
	            with gr.Row():
	                guidance_scale = gr.Slider(
	                    label="Guidance scale",
	                    minimum=0.0,
	                    maximum=10.0,
	                    step=0.1,
	                    value=3.5,
	                )

	                num_inference_steps = gr.Slider(
	                    label="Number of inference steps",
	                    minimum=10,
	                    maximum=100,
	                    step=1,
	                    value=50,
	                )
	        with gr.Column():
	            gr.Markdown("### Examples")
	            for i, pair in enumerate(example_pairs):
	                with gr.Row():
	                    ex_input_img = gr.Image(
	                        value=pair["input"],
	                        label="Input",
	                        interactive=False,
	                        height=130,
	                        width=130,
	                        scale=1
	                    )

	                    prompt_text = pair["prompt"]
	                    ex_btn = gr.Button(
	                        value=prompt_text,
	                        variant="secondary",
	                        elem_id=f"ex_btn_{i}",
	                        scale=3
	                    )
	                    ex_output_img = gr.Image(
	                        value=pair["output"],
	                        label="Output",
	                        interactive=False,
	                        height=130,
	                        width=130,
	                        scale=1
	                    )
	                    # Default arguments bind this iteration's values, so each
	                    # button loads its own prompt and control image.
	                    ex_btn.click(
	                        fn=lambda p=pair["prompt"], img=pair["input"]: (p, img),
	                        inputs=[],
	                        outputs=[prompt, control_image_input]
	                    )

	    def update_gallery_layout(num_images):
	        """Pick a gallery grid that matches the requested number of images."""
	        if num_images == 1:
	            return gr.update(columns=1, rows=1)
	        elif num_images == 2:
	            return gr.update(columns=2, rows=1)
	        elif num_images == 4:
	            return gr.update(columns=2, rows=2)
	        return gr.update(columns=2, rows=2)

	    num_images.change(update_gallery_layout, inputs=[num_images], outputs=[result])

	    def process_output(images, seed, error):
	        """Map an (images, seed, error) triple onto gallery, seed and error-box values.

	        Returns updates for the error box instead of calling a method on the
	        component and discarding the result. Not attached to any event in this
	        block.
	        """
	        if error:
	            return None, seed, gr.update(value=error, visible=True)
	        return images, seed, gr.update(value="", visible=False)

	    def toggle_error_visibility(message):
	        """Show the error box only when it holds a message."""
	        return gr.update(visible=bool(message))

	    run_event = gr.on(
	        triggers=[run_button.click, prompt.submit],
	        fn=infer,
	        inputs=[prompt, control_image_input, seed, randomize_seed, width, height, guidance_scale, num_inference_steps,
	                num_images],
	        outputs=[result, seed, error_output]
	    )
	    # infer writes its error message (or None) into the hidden error box;
	    # reveal the box afterwards only if a message is present.
	    run_event.then(
	        fn=toggle_error_visibility,
	        inputs=[error_output],
	        outputs=[error_output],
	        api_name=False,
	    )

	# Enable request queuing, then start the Gradio server.
	queued_demo = demo.queue()
	queued_demo.launch()