import os

import gradio as gr
import numpy as np
import torch
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler,
    DiffusionPipeline,
    EulerAncestralDiscreteScheduler,
)
from transformers import AutoModelForImageSegmentation
from torchvision.transforms import v2
from einops import rearrange
from omegaconf import OmegaConf
from huggingface_hub import hf_hub_download

from src.utils.mesh_util import save_obj, save_glb
from src.utils.train_util import instantiate_from_config
from src.utils.camera_util import (
    FOV_to_intrinsics,
    get_zero123plus_input_cameras,
    get_circular_camera_poses,
)

# ----------------------Settings-----------------------

# Cache directory for downloaded model weights
model_cache_dir = 'ckpts/'
os.makedirs(model_cache_dir, exist_ok=True)

# Directory for intermediate files written by the pipeline stages
os.makedirs('src/tmp', exist_ok=True)

# Configuration
config_path = 'configs/instant-mesh-large.yaml'
config = OmegaConf.load(config_path)
config_name = os.path.basename(config_path).replace('.yaml', '')
model_config = config.model_config
infer_config = config.infer_config

# Device
device = torch.device('cuda')

# ----------------------Load Models-----------------------

# Load models for sketch-to-image
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble",
    torch_dtype=torch.float16,
    use_safetensors=True,
    cache_dir=model_cache_dir
)
pipeline_1 = StableDiffusionControlNetPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
    cache_dir=model_cache_dir
)
pipeline_1.scheduler = UniPCMultistepScheduler.from_config(pipeline_1.scheduler.config)
pipeline_1.enable_model_cpu_offload()

# Load model for background removal
pipeline_2 = AutoModelForImageSegmentation.from_pretrained(
    'briaai/RMBG-2.0',
    trust_remote_code=True,
    cache_dir=model_cache_dir
)
pipeline_2.to(device)
pipeline_2.eval()

# Load models for image-to-model (step 1: generate multi-view images)
pipeline_3 = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
    cache_dir=model_cache_dir
)
pipeline_3.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipeline_3.scheduler.config, timestep_spacing='trailing'
)
unet_ckpt_path = hf_hub_download(
    repo_id="TencentARC/InstantMesh",
    filename="diffusion_pytorch_model.bin",
    repo_type="model",
    cache_dir=model_cache_dir
)
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipeline_3.unet.load_state_dict(state_dict, strict=True)
pipeline_3.to(device)

# Load model for image-to-model (step 2: 3D model reconstruction)
model_ckpt_path = hf_hub_download(
    repo_id="TencentARC/InstantMesh",
    filename="instant_mesh_large.ckpt",
    repo_type="model",
    cache_dir=model_cache_dir
)
model = instantiate_from_config(model_config)
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
# Keep only the LRM generator weights and drop the baked-in source cameras
state_dict = {
    k[14:]: v for k, v in state_dict.items()
    if k.startswith('lrm_generator.') and 'source_camera' not in k
}
model.load_state_dict(state_dict, strict=True)
model = model.to(device)
model.init_flexicubes_geometry(device, fovy=30.0)
model.eval()

print('----------------------Loading Finished-----------------------')

# ----------------------Define functions-----------------------

def input_image(input_img):
    # Persist the uploaded/drawn sketch so later stages can reuse it
    if input_img is None:
        raise gr.Error("No image uploaded!")
    input_img.save("src/tmp/sketch.png")
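
# Optional reproducibility helper (not part of the original pipeline): seeds
# torch/numpy and returns a seeded generator that could be passed to the
# diffusion pipelines above. A minimal sketch; adjust or drop as needed.
def seed_everything(seed: int = 42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    return torch.Generator(device=device).manual_seed(seed)
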
add_prompt=", 3d rendered, shadeless, white background, intact and single object", controlnet_conditioning_scale=0.75, num_inference_steps=50 ): output = pipeline_1( prompt+add_prompt, num_inference_steps=int(num_inference_steps), guidance_scale=10, negative_prompt=negative_prompt, controlnet_conditioning_scale=float(controlnet_conditioning_scale), image=input_img ).images[0] output.save("src/tmp/image.png") return output def background_remove(input_img): output = pipeline_2(input_img) output.save("src/tmp/image_nobg.png") return output def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False): c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation) if is_flexicubes: cameras = torch.linalg.inv(c2ws) cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1) else: extrinsics = c2ws.flatten(-2) intrinsics = FOV_to_intrinsics(30.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2) cameras = torch.cat([extrinsics, intrinsics], dim=-1) cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1) return cameras def make_mesh(model_path, model_glb_path, planes): with torch.no_grad(): mesh_out = model.extract_mesh( planes, use_texture_map=False, **infer_config, ) vertices, faces, vertex_colors = mesh_out vertices = vertices[:, [1, 2, 0]] save_obj(vertices, faces, vertex_colors, model_path) save_glb(vertices, faces, vertex_colors, model_glb_path) return model_path, model_glb_path def image_to_model(input_img): generator = torch.Generator(device=device) z123_image = pipeline_3( input_img, generator=generator, ).images[0] input_img = np.asarray(z123_image, dtype=np.float32) / 255.0 input_img = torch.from_numpy(input_img ).permute(2, 0, 1).contiguous().float() # (3, 960, 640) input_img = rearrange(input_img, 'c (n h) (m w) -> (n m) c h w', n=3, m=2) # (6, 3, 320, 320) input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device) input_img = input_img.unsqueeze(0).to(device) input_img = v2.functional.resize(input_img, (320, 320), interpolation=3, antialias=True).clamp(0, 1) model_path = "src/tmp/model.obj" model_glb_path = "src/tmp/model.glb" with torch.no_grad(): planes = model.forward_planes(input_img, input_cameras) model_path, model_glb_path = make_mesh(model_path, model_glb_path, planes) return model_path, model_glb_path # ----------------------Build Gradio Interfaces----------------------- with gr.Blocks() as demo: gr.Markdown(""" # SketchModeling: From Sketch to 3D Model **SketchModeling** is a method for 3D mesh generation from a sketch. It has three steps: 1. It generates image from sketch using Stable Diffusion and ControlNet. 2. It removes the background of the image using RMBG. 3. It reconsturcted the 3D model of the image using InstantMesh. On below, you can either upload a sketch image or draw the sketch yourself. Then press Run and wait for the model to be generated. **ATTENTION:** If it's the first time you run SketchModeling, it could take some time to download models from the Internet. 
""") with gr.Row(variant="panel"): with gr.Column(): with gr.Row(): with gr.Column(): with gr.Tab("Sketch Pad"): input_img = gr.Sketchpad( crop_size=(640, 640), type="pil", label="Sketch Pad", image_mode="RGBA" ) with gr.Tab("Input Image"): input_img = gr.Image( type="pil", label="Input Image", sources="upload", image_mode="RGBA" ) with gr.Column(): with gr.Tab("Generated Image"): generated_img = gr.Image( type="pil", label="Gnerated Image", image_mode="RGBA", interactive=False ) with gr.Tab("Processed Image"): processed_img = gr.Image( type="pil", label="Processed Image", image_mode="RGBA", interactive=False ) with gr.Row(): prompt = gr.Textbox(label="Pompt", interactive=True) controlnet_conditioning_scale = gr.Slider( label="Controlnet Conditioning Scale", minimum=0.5, maximum=1.5, value=0.85, step=0.05, interactive=True ) with gr.Accordion('Advanced options', open=False): with gr.Row(): negative_prompt = gr.Textbox(label="Negative Prompt", value="low quality, black and white image", interactive=True) add_prompt = gr.Textbox(label="Styles", value=", 3d rendered, shadeless, white background, intact and single object", interactive=True) num_inference_steps = gr.Number(label="Inference Steps", value=50, interactive=True) run_btn = gr.Button("Run", variant="primary") with gr.Column(): with gr.Tab("OBJ"): output_obj = gr.Model3D( label="Output Model (OBJ Format)", interactive=False ) with gr.Tab("GLB"): output_glb = gr.Model3D( label="Output Model (GLB Format)", interactive=False ) run_btn.click(fn=input_image, inputs=[input_img]).success( fn=sketch_to_image, inputs=[input_img, prompt, negative_prompt, add_prompt, controlnet_conditioning_scale, num_inference_steps], outputs=[generated_img] ).success( fn=background_remove, inputs=[generated_img], outputs=[processed_img] ).success( fn=image_to_model, inputs=[processed_img], outputs=[output_obj, output_glb] ) demo.launch()