yuyutsu07 committed on
Commit 6c8993e · verified · 1 Parent(s): aef98ce

Update app.py

Files changed (1)
  1. app.py +96 -317
app.py CHANGED
@@ -2,359 +2,138 @@ import torch
  import gradio as gr
  import imageio
  import numpy as np
- import cv2
  from PIL import Image
- from torchvision.transforms import ToTensor, Resize, Compose, ToPILImage
  import spaces
  import tempfile
- import os
- import gc
- import warnings
- import traceback
- from huggingface_hub import hf_hub_download
- from transformers import pipeline
- from diffusers import DPTForDepthEstimation, DPTImageProcessor
- from accelerate import Accelerator
-
- # Suppress warnings
- warnings.filterwarnings("ignore")
-
- # Global model cache
- DEPTH_MODEL = None
- DEPTH_PROCESSOR = None
-
- class DepthModelManager:
-     @staticmethod
-     def get_depth_model():
-         """Lazy-loads the depth estimation model on first use"""
-         global DEPTH_MODEL, DEPTH_PROCESSOR
-
-         if DEPTH_MODEL is None:
-             try:
-                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-                 model_id = "Intel/dpt-large"
-
-                 print(f"Loading depth model on {device}...")
-                 DEPTH_MODEL = DPTForDepthEstimation.from_pretrained(model_id).to(device)
-                 DEPTH_PROCESSOR = DPTImageProcessor.from_pretrained(model_id)
-                 print("Depth model loaded successfully")
-             except Exception as e:
-                 print(f"Error loading depth model: {e}")
-                 raise
-
-         return DEPTH_MODEL, DEPTH_PROCESSOR
-
-     @staticmethod
-     def generate_depth_map(image):
-         """Generate a depth map from an input image"""
-         model, processor = DepthModelManager.get_depth_model()
-         device = next(model.parameters()).device
-
-         # Preprocess the image
-         image_size = image.size
-         inputs = processor(images=image, return_tensors="pt").to(device)
-
-         # Get depth prediction
-         with torch.no_grad():
-             outputs = model(**inputs)
-             predicted_depth = outputs.predicted_depth
-
-         # Postprocess the depth map
-         prediction = torch.nn.functional.interpolate(
-             predicted_depth.unsqueeze(1),
-             size=image_size[::-1],
-             mode="bicubic",
-             align_corners=False,
-         ).squeeze()
-
-         # Normalize the depth map
-         depth_map = (prediction - prediction.min()) / (prediction.max() - prediction.min())
-         depth_map = ToPILImage()(depth_map.cpu())
-
-         return depth_map
 
  @spaces.GPU
- def generate_parallax_video(image, depth_map=None, use_auto_depth=False, animation_style="horizontal",
-                             amplitude=2.0, k=5.0, fps=30, duration=5.0, smooth_edges=True,
-                             invert_depth=False, progress=gr.Progress()):
      """
      Generate a 3D parallax video from an image and depth map with the selected animation style.
-
      Args:
          image (PIL.Image): Input RGB image.
-         depth_map (PIL.Image, optional): Grayscale depth map (white = closer, black = farther).
-         use_auto_depth (bool): Whether to auto-generate the depth map.
          animation_style (str): Animation type ("horizontal", "vertical", "circle", "perspective").
          amplitude (float): Intensity of camera movement.
          k (float): Depth displacement scale factor.
          fps (int): Frames per second.
          duration (float): Video duration in seconds.
-         smooth_edges (bool): Whether to apply edge smoothing to reduce artifacts.
-         invert_depth (bool): Whether to invert the depth map.
-         progress (gr.Progress): Gradio progress indicator.
-
      Returns:
-         str: Path to the generated video file or error message.
      """
-     try:
-         if image is None:
-             return "Error: Please upload an input image"
-
-         # Generate depth map if auto-mode is selected
-         if use_auto_depth or depth_map is None:
-             progress(0.1, desc="Generating depth map...")
-             depth_map = DepthModelManager.generate_depth_map(image)
-
-         # Validate input dimensions
-         original_size = image.size
-         if depth_map.size != image.size:
-             depth_map = depth_map.resize(image.size, Image.BICUBIC)
-
-         # Handle depth map inversion if requested
-         if invert_depth:
-             depth_map = Image.fromarray(255 - np.array(depth_map))
-
-         # Convert to device tensors
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         progress(0.2, desc="Processing inputs...")
-
-         # Optimize for memory - use 16-bit precision
-         torch.set_grad_enabled(False)
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-
-         image_tensor = ToTensor()(image).unsqueeze(0).to(device, dtype=torch.float16)  # (1, 3, H, W)
-         depth_tensor = ToTensor()(depth_map.convert('L')).to(device, dtype=torch.float16)  # (1, H, W)
-
-         # Normalize depth (min-max)
-         depth_min = depth_tensor.min()
-         depth_max = depth_tensor.max()
-         if depth_max - depth_min < 1e-5:  # Handle flat depth maps
-             depth_tensor = torch.ones_like(depth_tensor) * 0.5
          else:
-             depth_tensor = (depth_tensor - depth_min) / (depth_max - depth_min + 1e-6)
-
-         depth_tensor = depth_tensor.squeeze(0)  # (H, W)
-
-         # Apply optional mild gaussian blur to depth for smoother transitions
-         if smooth_edges:
-             kernel_size = max(3, min(int(min(image.size) / 100) * 2 + 1, 11))
-             depth_np = depth_tensor.cpu().numpy()
-             depth_np = cv2.GaussianBlur(depth_np, (kernel_size, kernel_size), 0)
-             depth_tensor = torch.tensor(depth_np, device=device, dtype=torch.float16)
-
-         # Extract dimensions
-         H, W = image_tensor.shape[2], image_tensor.shape[3]
-
-         # Create coordinate grid for warping
-         x = torch.arange(0, W, device=device, dtype=torch.float16)
-         y = torch.arange(0, H, device=device, dtype=torch.float16)
-         xx, yy = torch.meshgrid(x, y, indexing='xy')
-         pixel_grid = torch.stack((xx, yy), dim=-1)  # (H, W, 2)
-
-         # Calculate number of frames
-         num_frames = int(fps * duration)
-
-         # Prepare video writer
-         output_path = os.path.join(tempfile.gettempdir(), "parallax_video.mp4")
-         writer = imageio.get_writer(output_path, fps=fps, codec='libx264', quality=8,
-                                     pixelformat='yuv420p', bitrate='8000k')
-
-         # Define easing function for smoother animation
-         def ease_in_out(t):
-             return 0.5 * (1 - np.cos(np.pi * t))
-
-         # Animation and rendering
-         progress(0.3, desc="Generating frames...")
-         frame_count = 0
-
-         for frame in range(num_frames):
-             # Report progress
-             frame_progress = 0.3 + (0.65 * (frame / num_frames))
-             progress(frame_progress, desc=f"Rendering frame {frame+1}/{num_frames}")
-
-             # Normalized time with easing
-             t = frame / (num_frames - 1)  # [0, 1]
-             t_eased = ease_in_out(t)
-
-             # Calculate camera position based on animation style
-             if animation_style == "horizontal":
-                 camera_x = amplitude * np.sin(2 * np.pi * t_eased)
-                 camera_y = 0
-                 displacement_scale = 1
-             elif animation_style == "vertical":
-                 camera_x = 0
-                 camera_y = amplitude * np.sin(2 * np.pi * t_eased)
-                 displacement_scale = 1
-             elif animation_style == "circle":
-                 camera_x = amplitude * np.sin(2 * np.pi * t_eased)
-                 camera_y = amplitude * np.cos(2 * np.pi * t_eased)
-                 displacement_scale = 1
-             elif animation_style == "perspective":
-                 # Better perspective effect
-                 zoom_factor = 0.1 * np.sin(2 * np.pi * t_eased) + 1.0  # [0.9, 1.1]
-                 camera_x = amplitude * 0.5 * np.sin(2 * np.pi * t_eased)
-                 camera_y = amplitude * 0.3 * np.sin(2 * np.pi * t_eased)
-                 displacement_scale = zoom_factor
-             else:
-                 camera_x = 0
-                 camera_y = 0
-                 displacement_scale = 1
-
-             # Compute displacements with a more natural depth response
-             displacement_x = displacement_scale * k * camera_x * depth_tensor
-             displacement_y = displacement_scale * k * camera_y * depth_tensor
-
-             # Calculate source coordinates for warping
-             source_pixel_x = pixel_grid[:, :, 0] + displacement_x
-             source_pixel_y = pixel_grid[:, :, 1] + displacement_y
-
-             # Normalize coordinates to [-1, 1] for grid_sample
-             grid_x = 2 * source_pixel_x / (W - 1) - 1
-             grid_y = 2 * source_pixel_y / (H - 1) - 1
-             grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0)  # (1, H, W, 2)
-
-             # Warp the image using grid sampling with improved border handling
-             warped = torch.nn.functional.grid_sample(
-                 image_tensor,
-                 grid,
-                 align_corners=True,
-                 mode='bilinear',
-                 padding_mode='reflection'  # Using reflection padding for smoother edges
-             )
-
-             # Convert warped tensor to numpy image
-             warped_np = warped.squeeze(0).permute(1, 2, 0).cpu().numpy()
-             # Convert to 8-bit for video
-             frame_img = (warped_np * 255).clip(0, 255).astype(np.uint8)
-
-             # Apply a mild vignette effect to hide edge artifacts
-             if smooth_edges:
-                 h, w = frame_img.shape[:2]
-                 center_x, center_y = w // 2, h // 2
-                 max_dist = np.sqrt(center_x**2 + center_y**2)
-                 y, x = np.ogrid[:h, :w]
-                 dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
-                 vignette = np.clip(1.0 - dist / max_dist * 0.15, 0.95, 1.0)
-                 frame_img = (frame_img * vignette[:, :, np.newaxis]).astype(np.uint8)
-
-             # Add frame to video
-             writer.append_data(frame_img)
-             frame_count += 1
-
-             # Prevent memory issues by cleaning up tensors
-             del grid, warped
-             if frame % 10 == 0 and torch.cuda.is_available():
-                 torch.cuda.empty_cache()
-
-         # Ensure all frames are written and close the writer
          writer.close()
 
-         # Clean up tensors
-         del image_tensor, depth_tensor, pixel_grid
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-         gc.collect()
-
-         progress(1.0, desc="Processing complete")
-
-         if frame_count > 0:
-             return output_path
-         else:
-             return "Error: No frames were generated. Please adjust your parameters."
-
-     except Exception as e:
-         # Clean up any resources
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-
-         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
-         print(error_msg)
-         return f"An error occurred: {str(e)}"
 
  # Define Gradio interface
- with gr.Blocks(title="3D Parallax Video Generator", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# Advanced 3D Parallax Video Generator")
-
-     with gr.Accordion("About this app", open=False):
-         gr.Markdown("""
-         This application converts 2D images into 3D parallax motion videos. Upload an image and
-         either provide a depth map or use our built-in depth estimation model to automatically
-         generate one. Customize the animation style and parameters to create your desired effect.
-
-         ### Tips for best results:
-         - Start with small amplitude and k values (2-5) to avoid extreme distortions
-         - The depth map should have white areas for objects closer to camera, black for farther objects
-         - For automatic depth generation, images with clear foreground/background separation work best
-         - If you see artifacts at the edges, enable the "Smooth edges" option
-         """)
 
      # Input section
      with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(label="Upload Image", type="pil")
-
-             with gr.Row():
-                 use_auto_depth = gr.Checkbox(label="Auto-generate depth map", value=True)
-                 invert_depth = gr.Checkbox(label="Invert depth map", value=False)
-
-             depth_input = gr.Image(label="Upload Depth Map (optional)", type="pil")
 
      # Parameter controls
      with gr.Row():
-         with gr.Column():
-             animation_style = gr.Dropdown(
-                 choices=["horizontal", "vertical", "circle", "perspective"],
-                 label="Animation Style",
-                 value="horizontal"
-             )
-             amplitude_slider = gr.Slider(0.5, 10, value=2, label="Movement Amplitude", step=0.1)
-             k_slider = gr.Slider(1, 20, value=5, label="Depth Effect Strength", step=0.1)
-
-         with gr.Column():
-             fps_slider = gr.Slider(15, 60, value=30, label="Frames Per Second", step=1)
-             duration_slider = gr.Slider(1, 10, value=3, label="Duration (seconds)", step=0.1)
-             smooth_edges = gr.Checkbox(label="Smooth edges (reduces artifacts)", value=True)
 
      # Output and interaction
-     with gr.Row():
-         generate_btn = gr.Button("Generate Video", variant="primary")
-
      video_output = gr.Video(label="Parallax Video")
 
-     # Handle automatic depth map generation
-     def update_depth_visibility(auto_generate):
-         return gr.update(visible=not auto_generate)
-
-     use_auto_depth.change(update_depth_visibility, inputs=[use_auto_depth], outputs=[depth_input])
-
      # Connect button to function
      generate_btn.click(
          fn=generate_parallax_video,
-         inputs=[
-             image_input,
-             depth_input,
-             use_auto_depth,
-             animation_style,
-             amplitude_slider,
-             k_slider,
-             fps_slider,
-             duration_slider,
-             smooth_edges,
-             invert_depth
-         ],
          outputs=video_output
      )
 
-     # Add examples
-     gr.Examples(
-         examples=[
-             ["https://huggingface.co/spaces/stabilityai/stable-diffusion/resolve/main/images/lincoln.jpg"],
-             ["https://images.unsplash.com/photo-1546614042-7df3c24c9e5d"],
-             ["https://images.unsplash.com/photo-1563473213013-de2a0133c100"],
-         ],
-         inputs=[image_input],
-     )
-
  # Launch the application
  demo.launch()
 
  import gradio as gr
  import imageio
  import numpy as np
  from PIL import Image
+ from torchvision.transforms import ToTensor
  import spaces
  import tempfile
 
  @spaces.GPU
+ def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps, duration):
      """
      Generate a 3D parallax video from an image and depth map with the selected animation style.
+
      Args:
          image (PIL.Image): Input RGB image.
+         depth_map (PIL.Image): Grayscale depth map (white = closer, black = farther).
          animation_style (str): Animation type ("horizontal", "vertical", "circle", "perspective").
          amplitude (float): Intensity of camera movement.
          k (float): Depth displacement scale factor.
          fps (int): Frames per second.
          duration (float): Video duration in seconds.
+
      Returns:
+         str: Path to the generated video file.
      """
+     # Validate input dimensions
+     if image.size != depth_map.size:
+         raise ValueError("Image and depth map must have the same dimensions")
+
+     # Convert inputs to PyTorch tensors on GPU
+     image_tensor = ToTensor()(image).unsqueeze(0).to('cuda')  # Shape: (1, 3, H, W)
+     depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda')  # Shape: (1, 1, H, W)
+     depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)
+     depth_tensor = depth_tensor.squeeze(0).squeeze(0)  # Shape: (H, W)
+
+     H, W = image_tensor.shape[2], image_tensor.shape[3]
+
+     # Create coordinate grid for warping
+     x = torch.arange(0, W).float().to('cuda')
+     y = torch.arange(0, H).float().to('cuda')
+     xx, yy = torch.meshgrid(x, y, indexing='xy')
+     pixel_grid = torch.stack((xx, yy), dim=-1)  # Shape: (H, W, 2)
+
+     # Calculate number of frames
+     num_frames = int(fps * duration)
+     frames = []
+
+     # Generate frames based on animation style
+     for frame in range(num_frames):
+         t = frame / num_frames  # Normalized time [0, 1]
+         if animation_style == "horizontal":
+             camera_x = amplitude * np.sin(2 * np.pi * t)
+             camera_y = 0
+             displacement_scale = 1
+         elif animation_style == "vertical":
+             camera_x = 0
+             camera_y = amplitude * np.sin(2 * np.pi * t)
+             displacement_scale = 1
+         elif animation_style == "circle":
+             camera_x = amplitude * np.sin(2 * np.pi * t)
+             camera_y = amplitude * np.cos(2 * np.pi * t)
+             displacement_scale = 1
+         elif animation_style == "perspective":
+             camera_x = amplitude  # Fixed horizontal base for consistency
+             camera_y = 0
+             displacement_scale = 1 + 0.5 * np.sin(2 * np.pi * t)  # Scales displacement for zoom effect
          else:
+             raise ValueError(f"Unsupported animation style: {animation_style}")
+
+         # Compute displacements in both x and y directions
+         displacement_x = displacement_scale * k * camera_x * depth_tensor
+         displacement_y = displacement_scale * k * camera_y * depth_tensor
+
+         # Calculate source coordinates for warping
+         source_pixel_x = pixel_grid[:, :, 0] + displacement_x
+         source_pixel_y = pixel_grid[:, :, 1] + displacement_y
+
+         # Normalize coordinates to [-1, 1] for grid_sample
+         grid_x = 2 * source_pixel_x / (W - 1) - 1
+         grid_y = 2 * source_pixel_y / (H - 1) - 1
+         grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0)  # Shape: (1, H, W, 2)
+
+         # Warp the image using grid sampling
+         warped = torch.nn.functional.grid_sample(image_tensor, grid, align_corners=True)
+
+         # Convert warped tensor to numpy image
+         warped_np = warped.squeeze(0).permute(1, 2, 0).cpu().numpy()
+         frame_img = (warped_np * 255).astype(np.uint8)
+         frames.append(frame_img)
+
+     # Save frames as a video
+     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+         output_path = tmpfile.name
+         writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
+         for frame in frames:
+             writer.append_data(frame)
          writer.close()
 
+     return output_path
 
  # Define Gradio interface
+ with gr.Blocks(title="3D Parallax Video Generator") as demo:
+     gr.Markdown("# 3D Parallax Video Generator")
+     gr.Markdown("""
+     Upload an image and its depth map (white = closer, black = farther) to create a 3D parallax video.
+     Select an animation style and adjust parameters below. Start with small amplitude and k values to avoid empty frames.
+     """)
 
      # Input section
      with gr.Row():
+         image_input = gr.Image(type="pil", label="Upload Image")
+         depth_input = gr.Image(type="pil", label="Upload Depth Map")
 
      # Parameter controls
      with gr.Row():
+         animation_style = gr.Dropdown(
+             choices=["horizontal", "vertical", "circle", "perspective"],
+             label="Animation Style",
+             value="horizontal"
+         )
+         amplitude_slider = gr.Slider(0, 10, value=2, label="Amplitude", step=0.1)
+         k_slider = gr.Slider(1, 20, value=5, label="Depth Scale (k)", step=0.1)
+         fps_slider = gr.Slider(10, 60, value=30, label="Frames Per Second", step=1)
+         duration_slider = gr.Slider(1, 10, value=5, label="Duration (seconds)", step=0.1)
 
      # Output and interaction
+     generate_btn = gr.Button("Generate Video")
      video_output = gr.Video(label="Parallax Video")
 
      # Connect button to function
      generate_btn.click(
          fn=generate_parallax_video,
+         inputs=[image_input, depth_input, animation_style, amplitude_slider, k_slider, fps_slider, duration_slider],
          outputs=video_output
      )
 
  # Launch the application
  demo.launch()
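
For readers skimming the diff: the heart of the new app.py is a depth-weighted warp. Each pixel is displaced in proportion to its normalized depth value, the displaced coordinates are rescaled to grid_sample's [-1, 1] convention, and the frame is resampled bilinearly. Below is a minimal, CPU-only sketch of that single step on synthetic inputs; the helper name demo_parallax_frame and the test data are illustrative, not part of the commit.

import numpy as np
import torch
from PIL import Image

def demo_parallax_frame(image, depth, dx, dy):
    """Shift each pixel by (dx, dy) pixels scaled by its depth, then resample."""
    # Image to (1, 3, H, W) float tensor in [0, 1]; depth to (H, W), 1.0 = near.
    img = torch.from_numpy(np.asarray(image, dtype=np.float32) / 255.0).permute(2, 0, 1).unsqueeze(0)
    d = torch.from_numpy(np.asarray(depth.convert("L"), dtype=np.float32) / 255.0)
    H, W = d.shape
    xx, yy = torch.meshgrid(torch.arange(W, dtype=torch.float32),
                            torch.arange(H, dtype=torch.float32), indexing="xy")  # each (H, W)
    # Near pixels (depth ~ 1) move the farthest -- this is the parallax effect.
    src_x = xx + dx * d
    src_y = yy + dy * d
    # grid_sample expects sampling coordinates normalized to [-1, 1]; coordinates
    # pushed outside that range sample as black with the default zero padding,
    # which is why the UI text above warns against large amplitude and k values.
    grid = torch.stack((2 * src_x / (W - 1) - 1, 2 * src_y / (H - 1) - 1), dim=-1).unsqueeze(0)
    warped = torch.nn.functional.grid_sample(img, grid, mode="bilinear", align_corners=True)
    return (warped.squeeze(0).permute(1, 2, 0).numpy() * 255).clip(0, 255).astype(np.uint8)

# A flat depth map shifts the image rigidly; a left-to-right ramp shears it.
rgb = Image.fromarray(np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8))
ramp = Image.fromarray(np.tile(np.linspace(0, 255, 64, dtype=np.uint8), (64, 1)))
print(demo_parallax_frame(rgb, ramp, dx=5.0, dy=0.0).shape)  # (64, 64, 3)

Called directly (outside Gradio, and assuming a CUDA device, since the committed code moves its tensors to 'cuda'), the Space's function would be invoked as generate_parallax_video(image, depth_map, "horizontal", 2.0, 5.0, 30, 5.0).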