# !sudo apt-get install -y ffmpeg
import os
import random
import subprocess
import uuid
from glob import glob
from pathlib import Path
from typing import Generator, Iterable, List, Optional

import mediapy as media
import numpy as np
import requests
import tensorflow as tf
import tensorflow_hub as hub

# Install runtime-only wheels BEFORE importing the packages that need them.
# FIX: pass the current environment through -- the original replaced the whole
# env with a single variable, dropping PATH/HOME so `pip` could fail to resolve.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
# subprocess.run('pip install bitsandbytes', shell=True)
subprocess.run('pip install av==12.0.0', shell=True)

import gradio as gr
import gradio.helpers
import spaces
import torch
# from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from huggingface_hub import hf_hub_download
from PIL import Image

from kandinsky import get_T2V_pipeline
from src.gigachat import giga_generate

# Pre-download the FILM frame-interpolation model so the later load inside
# Interpolator.__init__ hits the local TFHub cache.
model = hub.load("https://tfhub.dev/google/film/1")

# gradio.helpers.CACHED_FOLDER = '/data/cache'

# pipe = StableVideoDiffusionPipeline.from_pretrained(
#     "multimodalart/stable-video-diffusion", torch_dtype=torch.float16, variant="fp16"
# )
# pipe.to("cuda")
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
# pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

# Every stage of the Kandinsky pipeline runs on the GPU.
device_map = {
    "dit": torch.device('cuda'),
    "vae": torch.device('cuda'),
    "text_embedder": torch.device('cuda'),
}
pipe = get_T2V_pipeline(device_map)

max_64_bit_int = 2**63 - 1


@spaces.GPU(duration=120)
def sample(
    # image: Image,
    prompt,
    resolution,
    seed: Optional[int] = -1,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 30,
    # version: str = "svd_xt",
    # cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
    progress=gr.Progress(track_tqdm=True),
):
    """Generate a text-to-video clip with the Kandinsky pipeline.

    Args:
        prompt: Text prompt describing the desired video.
        resolution: Aspect-ratio label from the UI dropdown, e.g. '16:9 (672x384)'.
        seed: RNG seed; ignored when ``randomize_seed`` is True.
        randomize_seed: Draw a fresh random seed when True.
        motion_bucket_id, fps_id, decoding_t, device: kept for interface
            compatibility with the old SVD demo; unused by this pipeline.
        output_folder: Directory where the resulting .mp4 is written.
        progress: Gradio progress tracker (tracks tqdm inside the pipeline).

    Returns:
        Filesystem path of the generated .mp4 file.
    """
    # if image.mode == "RGBA":
    #     image = image.convert("RGB")

    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # Sequential output names: 000000.mp4, 000001.mp4, ...
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    res_variants = {
        '16:9 (672x384)': '672x384',
        '9:16 (384x672)': '384x672',
        '1:1 (512x512)': '512x512',
        '1:2 (352x736)': '352x736',
        '2:1 (736x352)': '736x352',
    }
    width = int(res_variants[resolution].split('x')[0])
    height = int(res_variants[resolution].split('x')[1])

    # frames = pipe(image, decode_chunk_size=decoding_t, generator=generator,
    #               motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1,
    #               num_frames=25).frames[0]
    frames = pipe(
        seed=seed,
        time_length=12,
        width=width,
        height=height,
        save_path=video_path,
        text=prompt,
    )
    # export_to_video(frames, video_path, fps=15)
    torch.manual_seed(seed)
    # FIX: the original left this return commented out, so the click handler
    # received None and the video component never updated.
    return video_path


def resize_image(image, output_size=(672, 384)):
    """Resize then center-crop ``image`` to exactly ``output_size`` (w, h).

    FIX: the original computed the crop box but never applied it and returned
    None implicitly; it now returns the cropped PIL image.
    """
    # Aspect ratio of the desired size vs. the original image.
    target_aspect = output_size[0] / output_size[1]
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        # Image is wider than target: match height, crop excess width.
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) / 2
        top = 0
        right = (new_width + output_size[0]) / 2
        bottom = output_size[1]
    else:
        # Image is taller than target: match width, crop excess height.
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = 0
        top = (new_height - output_size[1]) / 2
        right = output_size[0]
        bottom = (new_height + output_size[1]) / 2

    return resized_image.crop((int(left), int(top), int(right), int(bottom)))


# A wrapper for running frame interpolation based on the FILM model on TFHub.
#
# Usage:
#   interpolator = Interpolator()
#   result_batch = interpolator(image_batch_0, image_batch_1, batch_dt)
# Where image_batch_0 and image_batch_1 are numpy tensors with TF standard
# (B,H,W,C) layout and batch_dt is the sub-frame time in range [0..1],
# (B,) layout.


def _pad_to_align(x, align):
    """Pads image batch x so width and height divide by align.

    Args:
        x: Image batch to align.
        align: Number to align to.

    Returns:
        1) An image padded so width % align == 0 and height % align == 0.
        2) A bounding box that can be fed readily to
           tf.image.crop_to_bounding_box to undo the padding.
    """
    # Input checking.
    assert np.ndim(x) == 4
    assert align > 0, 'align must be a positive number.'

    height, width = x.shape[-3:-1]
    height_to_pad = (align - height % align) if height % align != 0 else 0
    width_to_pad = (align - width % align) if width % align != 0 else 0

    bbox_to_pad = {
        'offset_height': height_to_pad // 2,
        'offset_width': width_to_pad // 2,
        'target_height': height + height_to_pad,
        'target_width': width + width_to_pad,
    }
    padded_x = tf.image.pad_to_bounding_box(x, **bbox_to_pad)
    bbox_to_crop = {
        'offset_height': height_to_pad // 2,
        'offset_width': width_to_pad // 2,
        'target_height': height,
        'target_width': width,
    }
    return padded_x, bbox_to_crop


class Interpolator:
    """A class for generating interpolated frames between two input frames.

    Uses the Film model from TFHub.
    """

    def __init__(self, align: int = 64) -> None:
        """Loads a saved model.

        Args:
            align: If >1, pad the input size so it divides with this before
                inference.
        """
        self._model = hub.load("https://tfhub.dev/google/film/1")
        self._align = align

    def __call__(self, x0: np.ndarray, x1: np.ndarray, dt: np.ndarray) -> np.ndarray:
        """Generates an interpolated frame between given two batches of frames.

        All inputs should be np.float32 datatype.

        Args:
            x0: First image batch. Dimensions: (batch_size, height, width, channels)
            x1: Second image batch. Dimensions: (batch_size, height, width, channels)
            dt: Sub-frame time. Range [0,1]. Dimensions: (batch_size,)

        Returns:
            The result with dimensions (batch_size, height, width, channels).
        """
        if self._align is not None:
            x0, bbox_to_crop = _pad_to_align(x0, self._align)
            x1, _ = _pad_to_align(x1, self._align)

        inputs = {'x0': x0, 'x1': x1, 'time': dt[..., np.newaxis]}
        result = self._model(inputs, training=False)
        image = result['image']

        if self._align is not None:
            image = tf.image.crop_to_bounding_box(image, **bbox_to_crop)
        return image.numpy()


def _recursive_generator(
        frame1: np.ndarray, frame2: np.ndarray, num_recursions: int,
        interpolator: Interpolator) -> Generator[np.ndarray, None, None]:
    """Splits halfway to repeatedly generate more frames.

    Args:
        frame1: Input image 1.
        frame2: Input image 2.
        num_recursions: How many times to interpolate the consecutive image pairs.
        interpolator: The frame interpolator instance.

    Yields:
        The interpolated frames, including the first frame (frame1), but
        excluding the final frame2.
    """
    if num_recursions == 0:
        yield frame1
    else:
        # Adds the batch dimension to all inputs before calling the
        # interpolator, and removes it afterwards.
        time = np.full(shape=(1,), fill_value=0.5, dtype=np.float32)
        mid_frame = interpolator(
            np.expand_dims(frame1, axis=0), np.expand_dims(frame2, axis=0), time)[0]
        yield from _recursive_generator(frame1, mid_frame, num_recursions - 1, interpolator)
        yield from _recursive_generator(mid_frame, frame2, num_recursions - 1, interpolator)


def interpolate_recursively(
        frames: List[np.ndarray], num_recursions: int,
        interpolator: Interpolator) -> Iterable[np.ndarray]:
    """Generates interpolated frames by repeatedly interpolating the midpoint.

    Args:
        frames: List of input frames. Expected shape (H, W, 3). The colors
            should be in the range [0, 1] and in gamma space.
        num_recursions: Number of times to do recursive midpoint interpolation.
        interpolator: The frame interpolation model to use.

    Yields:
        The interpolated frames (including the inputs).
    """
    n = len(frames)
    for i in range(1, n):
        # FIX: the original passed the module-level global
        # `times_to_interpolate` here, silently ignoring the
        # `num_recursions` argument.
        yield from _recursive_generator(
            frames[i - 1], frames[i], num_recursions, interpolator)
    # Separately yield the final frame.
    yield frames[-1]


times_to_interpolate = 6
interpolator = Interpolator()

# NOTE(review): the demo snippet below referenced the undefined name `image`
# (the image input was commented out above) and raised NameError at import
# time, so it is disabled until a frame source is wired in.
# input_frames = [image[0], image[1]]
# frames = list(
#     interpolate_recursively(input_frames, times_to_interpolate, interpolator))
# print(f'video with {len(frames)} frames')
# media.show_video(frames, fps=30, title='FILM interpolated video')


with gr.Blocks() as demo:
    gr.Markdown('''# Kandinsky 4.0 T2V Flash''')
    with gr.Row():
        with gr.Column():
            # image = gr.Image(label="Upload your image", type="pil")
            # Video preview sits centered above the controls.
            video = gr.Video()
            with gr.Row():
                # Left side under the video: the prompt text box.
                with gr.Column():
                    prompt = gr.Text(
                        label="Prompt",
                        show_label=False,
                        lines=3,
                        max_lines=5,
                        placeholder="Enter your prompt",
                        container=False,
                    )
                    with gr.Row():
                        with gr.Column():
                            # Below the text box, left.
                            gr.Markdown(
                                "Prompt beautification 🪄 powered by "
                                "[GigaChat-Max](https://giga.chat), LLM created by Sber")
                        with gr.Column():
                            # Below the text box, right.
                            enhance_button = gr.Button("Beautify Your Prompt")
                # Right side under the video: aspect-ratio selector.
                with gr.Column():
                    aspect_ratio = gr.Dropdown(
                        label="Aspect ratio",
                        choices=["16:9 (672x384)", "9:16 (384x672)", "1:1 (512x512)",
                                 "1:2 (352x736)", "2:1 (736x352)"],
                        value="16:9 (672x384)",
                    )
            generate_btn = gr.Button("Generate Video")
        with gr.Accordion("Advanced options", open=False):
            seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0,
                             maximum=max_64_bit_int, step=1)
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            motion_bucket_id = gr.Slider(
                label="Motion bucket id",
                info="Controls how much motion to add/remove from the image",
                value=127, minimum=1, maximum=255)
            fps_id = gr.Slider(
                label="Frames per second",
                info="The length of your video in seconds will be 25/fps",
                value=6, minimum=5, maximum=30)

    # image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)

    def beautify_prompt(prompt: str, max_attempts: int = 5) -> str:
        """Rewrite the user prompt with GigaChat, retrying up to max_attempts."""
        prompt = giga_generate(prompt, max_attempts=max_attempts)
        return prompt

    def enhance_prompt_func(prompt):
        return beautify_prompt(prompt, max_attempts=5)

    # def enhance_prompt_func(prompt):
    #     return giga_generate(prompt, max_attempts=5)
    #     ... (previous OpenAI/GLM-based enhancement retained in history)

    generate_btn.click(
        fn=sample,
        inputs=[prompt, aspect_ratio],
        outputs=[video],
        api_name="video",
    )
    enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])

    # gr.Examples(
    #     examples=[
    #         "images/blink_meme.png",
    #         "images/confused2_meme.png",
    #         "images/disaster_meme.png",
    #         "images/distracted_meme.png",
    #         "images/hide_meme.png",
    #         "images/nazare_meme.png",
    #         "images/success_meme.png",
    #         "images/willy_meme.png",
    #         "images/wink_meme.png"
    #     ],
    #     inputs=image,
    #     outputs=[video, seed],
    #     fn=sample,
    #     cache_examples="lazy",
    # )

if __name__ == "__main__":
    # demo.queue(max_size=20, api_open=False)
    demo.launch(share=True, show_api=False)
# end