import gradio as gr
import os
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler, DiffusionPipeline, EulerAncestralDiscreteScheduler
from transformers import AutoModelForImageSegmentation
from torchvision.transforms import v2
from einops import rearrange
from omegaconf import OmegaConf
from huggingface_hub import hf_hub_download

from src.utils.mesh_util import save_obj, save_glb
from src.utils.train_util import instantiate_from_config
from src.utils.camera_util import (
    FOV_to_intrinsics,
    get_zero123plus_input_cameras,
    get_circular_camera_poses,
)

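# Cache directory for downloaded weights, plus a scratch directory for the
# intermediate images and meshes written by each stage.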
model_cache_dir = 'ckpts/'
os.makedirs(model_cache_dir, exist_ok=True)
os.makedirs('src/tmp', exist_ok=True)

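# Load the InstantMesh configuration (model architecture and inference settings).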
config_path = 'configs/instant-mesh-large.yaml'
config = OmegaConf.load(config_path)
config_name = os.path.basename(config_path).replace('.yaml', '')
model_config = config.model_config
infer_config = config.infer_config

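# All stages run on the GPU; pipeline_1 is CPU-offloaded between calls.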
device = torch.device('cuda')

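# Stage 1: sketch -> image. Stable Diffusion v1.5 guided by a scribble
# ControlNet turns the line drawing into a rendered object.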
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble",
    torch_dtype=torch.float16,
    use_safetensors=True,
    cache_dir=model_cache_dir
)
pipeline_1 = StableDiffusionControlNetPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
    cache_dir=model_cache_dir
)
pipeline_1.scheduler = UniPCMultistepScheduler.from_config(pipeline_1.scheduler.config)
pipeline_1.enable_model_cpu_offload()

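# Stage 2: background removal with the RMBG-2.0 segmentation model.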
pipeline_2 = AutoModelForImageSegmentation.from_pretrained(
    'briaai/RMBG-2.0',
    trust_remote_code=True,
    cache_dir=model_cache_dir
)
pipeline_2.to(device)
pipeline_2.eval()

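# Stage 3a: single image -> multi-view grid. Zero123++ generates six fixed
# viewpoints, using the UNet fine-tuned by InstantMesh.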
pipeline_3 = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
    cache_dir=model_cache_dir
)
pipeline_3.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipeline_3.scheduler.config, timestep_spacing='trailing'
)
unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model", cache_dir=model_cache_dir)
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipeline_3.unet.load_state_dict(state_dict, strict=True)
pipeline_3.to(device)

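# Stage 3b: multi-view images -> mesh. Load the InstantMesh (large)
# reconstruction model, keeping only the LRM generator weights.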
model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_mesh_large.ckpt", repo_type="model", cache_dir=model_cache_dir)
model = instantiate_from_config(model_config)
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
# Keep only the LRM generator weights, stripping the 'lrm_generator.' prefix (14 chars).
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
model.load_state_dict(state_dict, strict=True)
model = model.to(device)
model.init_flexicubes_geometry(device, fovy=30.0)
model.eval()

print('----------------------Loading Finished-----------------------')

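# Persist the uploaded or hand-drawn sketch so every stage's input is on disk.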
def input_image(input_img):
    if input_img is None:
        raise gr.Error("No image uploaded!")
    input_img.save("src/tmp/sketch.png")

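# Stage 1 wrapper: the style suffix pushes the output toward a single,
# shadeless object on a white background, which later stages expect.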
def sketch_to_image(
    input_img,
    prompt,
    negative_prompt="low quality, black and white image",
    add_prompt=", 3d rendered, shadeless, white background, intact and single object",
    controlnet_conditioning_scale=0.75,
    num_inference_steps=50
):
    output = pipeline_1(
        prompt + add_prompt,
        num_inference_steps=int(num_inference_steps),
        guidance_scale=10,
        negative_prompt=negative_prompt,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        image=input_img
    ).images[0]

    output.save("src/tmp/image.png")

    return output

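# Stage 2 wrapper. RMBG-2.0 is a plain segmentation model rather than a
# callable pipeline, so the PIL image is normalized to a tensor first and the
# predicted mask is applied as an alpha channel, following the usage shown on
# the briaai/RMBG-2.0 model card.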
def background_remove(input_img):
    image = input_img.convert("RGB")
    transform = v2.Compose([
        v2.Resize((1024, 1024)),
        v2.ToTensor(),
        v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    with torch.no_grad():
        preds = pipeline_2(transform(image).unsqueeze(0).to(device))[-1].sigmoid().cpu()
    # Use the predicted foreground mask as the alpha channel.
    image.putalpha(v2.functional.to_pil_image(preds[0].squeeze()).resize(image.size))
    image.save("src/tmp/image_nobg.png")

    return image

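# Build cameras on a circular orbit for rendering; kept for parity with
# InstantMesh, though the Gradio flow below does not render turntable videos.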
def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
    c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
    if is_flexicubes:
        cameras = torch.linalg.inv(c2ws)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
    else:
        extrinsics = c2ws.flatten(-2)
        intrinsics = FOV_to_intrinsics(30.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
        cameras = torch.cat([extrinsics, intrinsics], dim=-1)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
    return cameras

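# Extract a textured mesh from the triplane features and export OBJ and GLB.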
def make_mesh(model_path, model_glb_path, planes):
    with torch.no_grad():
        mesh_out = model.extract_mesh(
            planes,
            use_texture_map=False,
            **infer_config,
        )

    vertices, faces, vertex_colors = mesh_out
    # Swap axes so the exported mesh matches the viewer's orientation convention.
    vertices = vertices[:, [1, 2, 0]]

    save_obj(vertices, faces, vertex_colors, model_path)
    save_glb(vertices, faces, vertex_colors, model_glb_path)

    return model_path, model_glb_path

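# Stage 3 wrapper: Zero123++ returns the six views tiled in a 3x2 grid, which
# is split back into individual 320x320 images before feed-forward
# reconstruction.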
def image_to_model(input_img):
    generator = torch.Generator(device=device)
    z123_image = pipeline_3(
        input_img,
        generator=generator,
    ).images[0]

    # Split the 3x2 view grid into six separate images: (6, 3, h, w).
    input_img = np.asarray(z123_image, dtype=np.float32) / 255.0
    input_img = torch.from_numpy(input_img).permute(2, 0, 1).contiguous().float()
    input_img = rearrange(input_img, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)

    input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)

    input_img = input_img.unsqueeze(0).to(device)
    input_img = v2.functional.resize(input_img, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

    model_path = "src/tmp/model.obj"
    model_glb_path = "src/tmp/model.glb"

    with torch.no_grad():
        planes = model.forward_planes(input_img, input_cameras)

    model_path, model_glb_path = make_mesh(model_path, model_glb_path, planes)

    return model_path, model_glb_path

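# Gradio UI: sketch/image input and intermediate results on the left, the
# reconstructed model on the right. The Run button chains the four stages.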
with gr.Blocks() as demo:
    gr.Markdown("""
    # SketchModeling: From Sketch to 3D Model

    **SketchModeling** is a method for 3D mesh generation from a sketch.

    It has three steps:
    1. It generates an image from the sketch using Stable Diffusion and ControlNet.
    2. It removes the background of the image using RMBG.
    3. It reconstructs a 3D model from the image using InstantMesh.

    Below, you can either upload a sketch image or draw the sketch yourself. Then press Run and wait for the model to be generated.

    **ATTENTION:** The first time you run SketchModeling, it may take some time to download the models from the Internet.
    """)
    with gr.Row(variant="panel"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Tab("Sketch Pad"):
                        input_img = gr.Sketchpad(
                            crop_size=(640, 640), type="pil", label="Sketch Pad", image_mode="RGBA"
                        )
                    with gr.Tab("Input Image"):
                        input_img = gr.Image(
                            type="pil", label="Input Image", sources="upload", image_mode="RGBA"
                        )
                with gr.Column():
                    with gr.Tab("Generated Image"):
                        generated_img = gr.Image(
                            type="pil", label="Generated Image", image_mode="RGBA", interactive=False
                        )
                    with gr.Tab("Processed Image"):
                        processed_img = gr.Image(
                            type="pil", label="Processed Image", image_mode="RGBA", interactive=False
                        )
            with gr.Row():
                prompt = gr.Textbox(label="Prompt", interactive=True)
                controlnet_conditioning_scale = gr.Slider(
                    label="Controlnet Conditioning Scale",
                    minimum=0.5,
                    maximum=1.5,
                    value=0.85,
                    step=0.05,
                    interactive=True
                )
            with gr.Accordion('Advanced options', open=False):
                with gr.Row():
                    negative_prompt = gr.Textbox(label="Negative Prompt", value="low quality, black and white image", interactive=True)
                    add_prompt = gr.Textbox(label="Styles", value=", 3d rendered, shadeless, white background, intact and single object", interactive=True)
                    num_inference_steps = gr.Number(label="Inference Steps", value=50, interactive=True)
            run_btn = gr.Button("Run", variant="primary")

        with gr.Column():
            with gr.Tab("OBJ"):
                output_obj = gr.Model3D(
                    label="Output Model (OBJ Format)",
                    interactive=False
                )
            with gr.Tab("GLB"):
                output_glb = gr.Model3D(
                    label="Output Model (GLB Format)",
                    interactive=False
                )

    run_btn.click(fn=input_image, inputs=[input_img]).success(
        fn=sketch_to_image,
        inputs=[input_img, prompt, negative_prompt, add_prompt, controlnet_conditioning_scale, num_inference_steps],
        outputs=[generated_img]
    ).success(
        fn=background_remove,
        inputs=[generated_img],
        outputs=[processed_img]
    ).success(
        fn=image_to_model,
        inputs=[processed_img],
        outputs=[output_obj, output_glb]
    )

demo.launch()