jbilcke-hf
/

LTX-Video-0.9.1-HFIE

@@ -66,229 +66,37 @@ def print_directory_structure(startpath):
 logger.info("💡 Applying a dirty hack (patch ""/repository"" to fix file extensions):")
 apply_dirty_hack_to_patch_file_extensions_and_bypass_filter("/repository")
-logger.info("💡 Printing directory structure of ""/repository"":")
-print_directory_structure("/repository")
 @dataclass
 class GenerationConfig:
     """Configuration for video generation"""
-    width: int = 768
-    height: int = 512
-    fps: int = 24
-    duration_sec: float = 4.0
-    num_inference_steps: int = 30
-    guidance_scale: float = 7.5
-    upscale_factor: float = 2.0
-    enable_interpolation: bool = False
-    seed: int = -1  # -1 means random seed
-    @property
-    def num_frames(self) -> int:
-        """Calculate number of frames based on fps and duration"""
-        return int(self.duration_sec * self.fps) + 1
-    def validate_and_adjust(self) -> 'GenerationConfig':
-        """Validate and adjust parameters to meet constraints"""
-        # Round dimensions to nearest multiple of 32
-        self.width = max(32, min(MAX_WIDTH, round(self.width / 32) * 32))
-        self.height = max(32, min(MAX_HEIGHT, round(self.height / 32) * 32))
-        # Adjust number of frames to be in format 8k + 1
-        k = (self.num_frames - 1) // 8
-        num_frames = min((k * 8) + 1, MAX_FRAMES)
-        self.duration_sec = (num_frames - 1) / self.fps
-        # Set random seed if not specified
-        if self.seed == -1:
-            self.seed = random.randint(0, 2**32 - 1)
-        return self
-class EndpointHandler:
-    """Handles video generation requests using LTX models and Varnish post-processing"""
-    def __init__(self, model_path: str = ""):
-        """Initialize the handler with LTX models and Varnish
-        Args:
-            model_path: Path to LTX model weights
-        """
-        # Enable TF32 for potential speedup on Ampere GPUs
-        #torch.backends.cuda.matmul.allow_tf32 = True
-        # Initialize models with bfloat16 precision
-        self.text_to_video = LTXPipeline.from_pretrained(
-            model_path,
-            torch_dtype=torch.bfloat16
-        ).to("cuda")
-        self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
-            model_path,
-            torch_dtype=torch.bfloat16
-        ).to("cuda")
-        # Enable CPU offload for memory efficiency
-        #self.text_to_video.enable_model_cpu_offload()
-        #self.image_to_video.enable_model_cpu_offload()
-        # Initialize Varnish for post-processing
-        self.varnish = Varnish(
-            device="cuda" if torch.cuda.is_available() else "cpu",
-            output_format="mp4",
-            output_codec="h264",
-            output_quality=23,
-            enable_mmaudio=False,
-            #model_base_dir=os.path.abspath(os.path.join(os.getcwd(), "varnish"))
-            model_base_dir="/repository/varnish",
-        )
-    async def process_frames(
-        self,
-        frames: torch.Tensor,
-        config: GenerationConfig
-    ) -> tuple[str, dict]:
-        """Post-process generated frames using Varnish
-        Args:
-            frames: Generated video frames tensor
-            config: Generation configuration
-        Returns:
-            Tuple of (video data URI, metadata dictionary)
-        """
-        try:
-            logger.info(f"Original frames shape: {frames.shape}")
-            # Remove batch dimension if present
-            if len(frames.shape) == 5:
-                frames = frames.squeeze(0)  # Remove batch dimension
-            logger.info(f"Processed frames shape: {frames.shape}")
-            # Process video with Varnish
-            result = await self.varnish(
-                input_data=frames,
-                input_fps=config.fps,
-                output_fps=config.fps,
-                upscale_factor=config.upscale_factor if config.upscale_factor > 1 else None,
-                enable_interpolation=config.enable_interpolation
-            )
-            # Convert to data URI
-            video_uri = await result.write(
-                output_type="data-uri",
-                output_format="mp4",
-                output_codec="h264",
-                output_quality=23
-            )
-            # Collect metadata
-            metadata = {
-                "width": result.metadata.width,
-                "height": result.metadata.height,
-                "num_frames": result.metadata.frame_count,
-                "fps": result.metadata.fps,
-                "duration": result.metadata.duration,
-                "num_inference_steps": config.num_inference_steps,
-                "seed": config.seed,
-                "upscale_factor": config.upscale_factor,
-                "interpolation_enabled": config.enable_interpolation
-            }
-            return video_uri, metadata
-        except Exception as e:
-            logger.error(f"Error in process_frames: {str(e)}")
-            raise RuntimeError(f"Failed to process frames: {str(e)}")
-    from dataclasses import dataclass
-from pathlib import Path
-import pathlib
-from typing import Dict, Any, Optional, Tuple
-import asyncio
-import base64
-import io
-import pprint
-import logging
-import random
-import traceback
-import os
-import numpy as np
-import torch
-from diffusers import LTXPipeline, LTXImageToVideoPipeline
-from PIL import Image
-from varnish import Varnish
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Constraints
-MAX_WIDTH = 1280
-MAX_HEIGHT = 720
-MAX_FRAMES = 257
-# this is only a temporary solution (famous last words)
-def apply_dirty_hack_to_patch_file_extensions_and_bypass_filter(directory):
-    """
-    Recursively rename all '.wut' files to '.pth' in the given directory
-    Args:
-        directory (str): Path to the directory to process
-    """
-    # Convert the directory path to absolute path
-    directory = os.path.abspath(directory)
-    # Walk through directory and its subdirectories
-    for root, _, files in os.walk(directory):
-        for filename in files:
-            if filename.endswith('.wut'):
-                # Get full path of the file
-                old_path = os.path.join(root, filename)
-                # Create new filename by replacing the extension
-                new_filename = filename.replace('.wut', '.pth')
-                new_path = os.path.join(root, new_filename)
-                try:
-                    os.rename(old_path, new_path)
-                    print(f"Renamed: {old_path} -> {new_path}")
-                except OSError as e:
-                    print(f"Error renaming {old_path}: {e}")
-def print_directory_structure(startpath):
-    """Print the directory structure starting from the given path."""
-    for root, dirs, files in os.walk(startpath):
-        level = root.replace(startpath, '').count(os.sep)
-        indent = ' ' * 4 * level
-        logger.info(f"{indent}{os.path.basename(root)}/")
-        subindent = ' ' * 4 * (level + 1)
-        for f in files:
-            logger.info(f"{subindent}{f}")
-logger.info("💡 Applying a dirty hack (patch ""/repository"" to fix file extensions):")
-apply_dirty_hack_to_patch_file_extensions_and_bypass_filter("/repository")
-logger.info("💡 Printing directory structure of ""/repository"":")
-print_directory_structure("/repository")
-@dataclass
-class GenerationConfig:
-    """Configuration for video generation"""
     width: int = 768
     height: int = 512
-    fps: int = 24
-    duration_sec: float = 4.0
-    num_inference_steps: int = 30
     guidance_scale: float = 7.5
-    upscale_factor: float = 2.0
-    enable_interpolation: bool = False
     seed: int = -1  # -1 means random seed
-    @property
-    def num_frames(self) -> int:
-        """Calculate number of frames based on fps and duration"""
-        return int(self.duration_sec * self.fps) + 1
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""
@@ -299,7 +107,7 @@ class GenerationConfig:
         # Adjust number of frames to be in format 8k + 1
         k = (self.num_frames - 1) // 8
         num_frames = min((k * 8) + 1, MAX_FRAMES)
-        self.duration_sec = (num_frames - 1) / self.fps
         # Set random seed if not specified
         if self.seed == -1:
@@ -339,9 +147,8 @@ class EndpointHandler:
             device="cuda" if torch.cuda.is_available() else "cpu",
             output_format="mp4",
             output_codec="h264",
-            output_quality=23,
             enable_mmaudio=False,
-            #model_base_dir=os.path.abspath(os.path.join(os.getcwd(), "varnish"))
             model_base_dir="/repository/varnish",
         )
@@ -367,22 +174,22 @@ class EndpointHandler:
                 frames = frames.squeeze(0)  # Remove batch dimension
             logger.info(f"Processed frames shape: {frames.shape}")
             # Process video with Varnish
             result = await self.varnish(
                 input_data=frames,
-                input_fps=config.fps,
-                output_fps=config.fps,
-                upscale_factor=config.upscale_factor if config.upscale_factor > 1 else None,
-                enable_interpolation=config.enable_interpolation
             )
             # Convert to data URI
             video_uri = await result.write(
-                output_type="data-uri",
-                output_format="mp4",
-                output_codec="h264",
-                output_quality=23
             )
             # Collect metadata
@@ -392,10 +199,7 @@ class EndpointHandler:
                 "num_frames": result.metadata.frame_count,
                 "fps": result.metadata.fps,
                 "duration": result.metadata.duration,
-                "num_inference_steps": config.num_inference_steps,
                 "seed": config.seed,
-                "upscale_factor": config.upscale_factor,
-                "interpolation_enabled": config.enable_interpolation
             }
             return video_uri, metadata
@@ -404,45 +208,72 @@ class EndpointHandler:
             logger.error(f"Error in process_frames: {str(e)}")
             raise RuntimeError(f"Failed to process frames: {str(e)}")
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Process incoming requests for video generation
         Args:
             data: Request data containing:
-                - inputs (str): Text prompt or image
-                - width (optional): Video width
-                - height (optional): Video height
-                - fps (optional): Frames per second
-                - duration_sec (optional): Video duration
-                - num_inference_steps (optional): Inference steps
-                - guidance_scale (optional): Guidance scale
-                - upscale_factor (optional): Upscaling factor
-                - enable_interpolation (optional): Enable frame interpolation
-                - seed (optional): Random seed
         Returns:
             Dictionary containing:
                 - video: Base64 encoded MP4 data URI
                 - content-type: MIME type
                 - metadata: Generation metadata
         """
-        # Extract prompt
-        prompt = data.get("inputs")
-        if not prompt:
-            raise ValueError("No prompt provided in the 'inputs' field")
         # Create and validate configuration
         config = GenerationConfig(
-            width=data.get("width", GenerationConfig.width),
-            height=data.get("height", GenerationConfig.height),
-            fps=data.get("fps", GenerationConfig.fps),
-            duration_sec=data.get("duration_sec", GenerationConfig.duration_sec),
-            num_inference_steps=data.get("num_inference_steps", GenerationConfig.num_inference_steps),
-            guidance_scale=data.get("guidance_scale", GenerationConfig.guidance_scale),
-            upscale_factor=data.get("upscale_factor", GenerationConfig.upscale_factor),
-            enable_interpolation=data.get("enable_interpolation", GenerationConfig.enable_interpolation),
-            seed=data.get("seed", GenerationConfig.seed)
         ).validate_and_adjust()
         try:
             with torch.no_grad():
@@ -451,28 +282,35 @@ class EndpointHandler:
                 np.random.seed(config.seed)
                 generator = torch.manual_seed(config.seed)
-                # Prepare generation parameters
                 generation_kwargs = {
-                    "prompt": prompt,
-                    "height": config.height,
-                    "width": config.width,
-                    "num_frames": config.num_frames,
-                    "guidance_scale": config.guidance_scale,
-                    "num_inference_steps": config.num_inference_steps,
                     "output_type": "pt",
                     "generator": generator
                 }
-                logger.info(f"Parameters:")
                 pprint.pprint(generation_kwargs)
                 # Check if image-to-video generation is requested
-                image_data = data.get("image")
-                if image_data:
                     # Process base64 image
-                    if image_data.startswith('data:'):
-                        image_data = image_data.split(',', 1)[1]
-                    image_bytes = base64.b64decode(image_data)
                     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                     generation_kwargs["image"] = image
                     frames = self.image_to_video(**generation_kwargs).frames

 logger.info("💡 Applying a dirty hack (patch ""/repository"" to fix file extensions):")
 apply_dirty_hack_to_patch_file_extensions_and_bypass_filter("/repository")
+#logger.info("💡 Printing directory structure of ""/repository"":")
+#print_directory_structure("/repository")
 @dataclass
 class GenerationConfig:
     """Configuration for video generation"""
+    # general content settings
+    prompt: str = ""
+    negative_prompt: str = "worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres",
+    # video model settings (will be used during generation of the initial raw video clip)
     width: int = 768
     height: int = 512
+    # users may tend to always set this to the max, to get as much useable content as possible (which is MAX_FRAMES ie. 257).
+    # The value must be a multiple of 8, plus 1 frame.
+    num_frames: int = 129
     guidance_scale: float = 7.5
+    num_inference_steps: int = 50
+    # reproducible generation settings
     seed: int = -1  # -1 means random seed
+    # varnish settings (will be used for post-processing after the raw video clip has been generated
+    fps: int = 24 # FPS of the final video (only applied at the the very end, when converting to mp4)
+    double_num_frames: bool = True # if True, the number of frames will be multiplied by 2 using RIFE
+    super_resolution: bool = True # if True, the resolution will be multiplied by 2 using Real_ESRGAN
+    grain_amount: float = 0.0
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""
         # Adjust number of frames to be in format 8k + 1 (rounded down, never below 1 frame)
         k = max(0, (self.num_frames - 1) // 8)
         # Store the adjusted value on the config: a plain local here would be discarded
         self.num_frames = min((k * 8) + 1, MAX_FRAMES)
         # Set random seed if not specified
         if self.seed == -1:
             device="cuda" if torch.cuda.is_available() else "cpu",
             output_format="mp4",
             output_codec="h264",
+            output_quality=17,
             enable_mmaudio=False,
             model_base_dir="/repository/varnish",
         )
                 frames = frames.squeeze(0)  # Remove batch dimension
             logger.info(f"Processed frames shape: {frames.shape}")
             # Process video with Varnish
             result = await self.varnish(
                 input_data=frames,
+                double_num_frames=config.double_num_frames, # if True, the number of frames will be multiplied by 2 using RIFE
+                super_resolution=config.grain_amount_config, # if True, the resolution will be multiplied by 2 using Real_ESRGAN
+                grain_amount_config.grain_amount,
             )
             # Convert to data URI
             video_uri = await result.write(
+                type="data-uri",
+                format="mp4",
+                codec="h264",
+                fps=config.fps,
+                quality=23
             )
             # Collect metadata
                 "num_frames": result.metadata.frame_count,
                 "fps": result.metadata.fps,
                 "duration": result.metadata.duration,
                 "seed": config.seed,
             }
             return video_uri, metadata
             logger.error(f"Error in process_frames: {str(e)}")
             raise RuntimeError(f"Failed to process frames: {str(e)}")
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Process incoming requests for video generation
         Args:
             data: Request data containing:
+                - inputs (dict): Dictionary containing the input:
+                    - prompt (required, string): list of concepts to keep in the video.
+                    - image (optional, string): base64-encoded input image, optionally given as a "data:" URI; when present, image-to-video generation is used.
+                - parameters (dict):
+                    - negative_prompt (optional, string): list of concepts to ignore in the video.
+                    - width (optional, int, default to 768): width, or horizontal size in pixels.
+                    - height (optional, int, default to 512): height, or vertical size in pixels.
+                    - num_frames (optional, int, default to 129): the number of frames must be a multiple of 8, plus 1 frame.
+                    - guidance_scale (optional, float, default to 7.5): Guidance scale
+                    - num_inference_steps (optional, int, default to 50): number of inference steps
+                    - seed (optional, int, default to -1): set a random number generator seed, -1 means random seed.
+                    - fps (optional, int, default to 24): FPS of the final video
+                    - double_num_frames (optional, bool, default to True): if enabled, the number of frames will be multiplied by 2 using RIFE
+                    - super_resolution (optional, bool, default to True): if enabled, the resolution will be multiplied by 2 using Real_ESRGAN
+                    - grain_amount (optional, float, default to 0.0): amount of film grain to add to the output video
         Returns:
             Dictionary containing:
                 - video: Base64 encoded MP4 data URI
                 - content-type: MIME type
                 - metadata: Generation metadata
+        Raises:
+            ValueError: if the prompt is missing or empty
         """
+        inputs = data.get("inputs", dict())
+        input_prompt = inputs.get("prompt", "")
+        input_image = inputs.get("image")
+        params = data.get("parameters", dict())
+        if not input_prompt:
+            raise ValueError("The prompt should not be empty")
+        logger.info(f"Prompt: {input_prompt}")
+        logger.info(f"Raw parameters:")
+        pprint.pprint(params)
         # Create and validate configuration
         config = GenerationConfig(
+            # general content settings
+            prompt: input_prompt,
+            negative_prompt=params.get("negative_prompt", GenerationConfig.negative_prompt),
+            # video model settings (will be used during generation of the initial raw video clip)
+            width=params.get("width", GenerationConfig.width),
+            height=params.get("height", GenerationConfig.height),
+            num_frames=params.get"num_frames", GenerationConfig.num_frames),
+            guidance_scale=params.get("guidance_scale", GenerationConfig.guidance_scale),
+            num_inference_steps=params.get("num_inference_steps", GenerationConfig.num_inference_steps),
+            # reproducible generation settings
+            seed=params.get("seed", GenerationConfig.seed)
+            # varnish settings (will be used for post-processing after the raw video clip has been generated)
+            fps=params.get("fps", GenerationConfig.fps), # FPS of the final video (only applied at the the very end, when converting to mp4)
+            double_num_frames=params.get("double_num_frames", GenerationConfig.double_num_frames), # if True, the number of frames will be multiplied by 2 using RIFE
+            super_resolution=params.get("super_resolution", GenerationConfig.super_resolution), # if True, the resolution will be multiplied by 2 using Real_ESRGAN
+            grain_amount=params.get("grain_amount", GenerationConfig.grain_amount),
         ).validate_and_adjust()
+        logger.info(f"Global request settings:")
+        pprint.pprint(config)
         try:
             with torch.no_grad():
                 np.random.seed(config.seed)
                 generator = torch.manual_seed(config.seed)
+                # Prepare generation parameters for the video model (we omit params that are destined to Varnish)
                 generation_kwargs = {
+                   # general content settings
+                    prompt: config.prompt,
+                    negative_prompt=config.negative_prompt,
+                    # video model settings (will be used during generation of the initial raw video clip)
+                    width=params.config.width,
+                    height=config.height,
+                    num_frames=config.num_frames,
+                    guidance_scale=config.guidance_scale,
+                    num_inference_steps=config.num_inference_steps,
+                    # reproducible generation settings
+                    seed=config.seed,
+                    # constants
                     "output_type": "pt",
                     "generator": generator
                 }
+                logger.info(f"Video model generation settings:")
                 pprint.pprint(generation_kwargs)
                 # Check if image-to-video generation is requested
+                if input_image:
                     # Process base64 image
+                    if input_image.startswith('data:'):
+                        input_image = image_data.split(',', 1)[1]
+                    image_bytes = base64.b64decode(input_image)
                     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                     generation_kwargs["image"] = image
                     frames = self.image_to_video(**generation_kwargs).frames