q8-ltx-video / app.py
sayakpaul's picture
sayakpaul HF staff
L4 fix (#1)
abd07b6 verified
import shlex
import subprocess
import tempfile
# Install the bundled q8_kernels wheel before the remaining imports execute.
# NOTE(review): presumably app_utils / inference need q8_kernels at import
# time, which is why this runs in the middle of the import block -- confirm.
# check=True aborts startup with a CalledProcessError when pip fails, instead
# of continuing silently and failing later with a less obvious error.
subprocess.run(shlex.split("pip install q8_kernels-0.0.0-cp310-cp310-linux_x86_64.whl"), check=True)
import gradio as gr
import torch
from diffusers.utils import export_to_video
from app_utils import prepare_pipeline, compute_hash
from inference import load_text_encoding_pipeline
# Build both pipelines once, at import time, as module-level objects; the
# request handler generate_video_from_text reuses them on every call.
# NOTE(review): presumably both calls load model weights and are expensive --
# confirm in inference.py / app_utils.py before reordering or lazy-loading.
text_encoding_pipeline = load_text_encoding_pipeline()
inference_pipeline = prepare_pipeline()
def create_advanced_options():
    """Render the collapsed "Advanced Options" accordion and return its sliders.

    Returns:
        A list of the five ``gr.Slider`` components in the order
        ``[seed, inference_steps, guidance_scale, max_sequence_length, fps]``.
        The order matters: the list is unpacked positionally into the
        trailing arguments of ``generate_video_from_text``.
    """
    with gr.Accordion("Advanced Options (Optional)", open=False):
        # Components are instantiated in list order, which is also the order
        # in which they appear inside the accordion.
        sliders = [
            gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=646373),
            gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=30),
            gr.Slider(label="Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0),
            gr.Slider(label="Maximum sequence length", minimum=128, maximum=512, step=1, value=128),
            gr.Slider(label="FPS", minimum=21, maximum=30, step=1, value=24),
        ]
    return sliders
@torch.no_grad()
def generate_video_from_text(prompt, negative_prompt, seed, steps, guidance_scale, max_sequence_length, fps):
    """Generate a video for a text prompt and return the path of the MP4 file.

    The prompts are first encoded with the module-level
    ``text_encoding_pipeline``; the resulting embeddings and attention masks
    are then passed to the module-level ``inference_pipeline``. Output size is
    fixed at 768x512 with 121 frames.

    Args:
        prompt: Text describing the desired video.
        negative_prompt: Text describing what should be avoided.
        seed: Seed passed to ``torch.manual_seed``; the returned generator is
            handed to the inference pipeline.
        steps: Forwarded as ``num_inference_steps``.
        guidance_scale: Forwarded unchanged to the inference pipeline.
        max_sequence_length: Forwarded to both the prompt encoder and the
            inference pipeline.
        fps: Frame rate used when writing the MP4.

    Returns:
        Filesystem path of a newly created temporary ``.mp4`` file. The file
        is not deleted by this function.
    """
    (
        prompt_embeds,
        prompt_attention_mask,
        negative_prompt_embeds,
        negative_prompt_attention_mask,
    ) = text_encoding_pipeline.encode_prompt(
        prompt=prompt,
        negative_prompt=negative_prompt,
        max_sequence_length=max_sequence_length,
    )

    video = inference_pipeline(
        prompt_embeds=prompt_embeds,
        prompt_attention_mask=prompt_attention_mask,
        negative_prompt_embeds=negative_prompt_embeds,
        negative_prompt_attention_mask=negative_prompt_attention_mask,
        guidance_scale=guidance_scale,
        width=768,
        height=512,
        num_frames=121,
        num_inference_steps=steps,
        max_sequence_length=max_sequence_length,
        generator=torch.manual_seed(seed),
    ).frames[0]

    # Reserve a unique output path. The context manager closes the handle
    # right away (mkstemp's raw descriptor was previously never closed, leaking
    # one descriptor per request); delete=False keeps the file on disk so it
    # can be written below and served afterwards.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        out_path = tmp_file.name
    export_to_video(video, out_path, fps=fps)
    return out_path
# Build the Gradio UI: title, description, collapsible tips, and a single
# "Text to Video" tab with prompt inputs, advanced sliders, an output player,
# and clickable examples. The generate button is wired to
# generate_video_from_text at the end of the block.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    with gr.Row(elem_id="title-row"):
        gr.Markdown(
            """
            <div style="text-align: center; margin-bottom: 1em">
            <h1 style="font-size: 2.5em; font-weight: 600; margin: 0.5em 0;">Fast Video Generation with <a href="https://github.com/sayakpaul/q8-ltx-video">Q8 LTX Video</a></h1>
            </div>
            """
        )
    with gr.Row(elem_id="title-row"):
        gr.HTML(  # add technical report link
            """
            <div style="display:flex;column-gap:4px;">
            <span>This space is modified from the original <a href="https://huggingface.co/spaces/Lightricks/LTX-Video-Playground">LTX-Video playground</a>. It uses optimized Q8 kernels along with torch.compile to allow for ultra-fast video
            generation. As a result, it restricts generations to 121 frames with 512x768 resolution. For more details, refer to <a href="https://github.com/sayakpaul/q8-ltx-video">this link</a>.</span>
            </div>
            """
        )
    with gr.Accordion(" 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"):
        # Blank lines separate paragraphs from lists so that section titles
        # and closing sentences are not rendered as part of a list item.
        gr.Markdown(
            """
            📝 Prompt Engineering

            When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.

            For best results, build your prompts using this structure:

            - Start with main action in a single sentence
            - Add specific details about movements and gestures
            - Describe character/object appearances precisely
            - Include background and environment details
            - Specify camera angles and movements
            - Describe lighting and colors
            - Note any changes or sudden events

            See examples for more inspiration.

            🎮 Parameter Guide

            - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
            - Seed: Save seed values to recreate specific styles or compositions you like
            - Guidance Scale: 3-3.5 are the recommended values
            - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
            - When using detailed prompts, use a higher `max_sequence_length` value.
            """
        )

    with gr.Tabs():
        # Text to Video Tab
        with gr.TabItem("Text to Video"):
            with gr.Row():
                with gr.Column():
                    txt2vid_prompt = gr.Textbox(
                        label="Enter Your Prompt",
                        placeholder="Describe the video you want to generate (minimum 50 characters)...",
                        value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                        lines=5,
                    )
                    txt2vid_negative_prompt = gr.Textbox(
                        label="Enter Negative Prompt",
                        placeholder="Describe what you don't want in the video...",
                        value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                        lines=2,
                    )
                    # Sliders come back in the positional order expected by
                    # generate_video_from_text after the two prompts.
                    txt2vid_advanced = create_advanced_options()
                    txt2vid_generate = gr.Button(
                        "Generate Video",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column():
                    txt2vid_output = gr.Video(label="Generated Output")
            with gr.Row():
                # Each example fills both prompt boxes and shows a
                # pre-rendered clip in the output player.
                gr.Examples(
                    examples=[
                        [
                            "A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.",
                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                            "assets/river.mp4",
                        ],
                        [
                            "The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature.",
                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                            "assets/mountain.mp4",
                        ],
                    ],
                    inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
                    label="Example Text-to-Video Generations",
                )

    # concurrency_limit=1 lets only one generation request run at a time for
    # this event; additional clicks wait in the queue.
    txt2vid_generate.click(
        fn=generate_video_from_text,
        inputs=[
            txt2vid_prompt,
            txt2vid_negative_prompt,
            *txt2vid_advanced,
        ],
        outputs=txt2vid_output,
        concurrency_limit=1,
        concurrency_id="generate_video_from_text",
    )
if __name__ == "__main__":
    # Script entry point: enable the request queue (at most 64 waiting
    # requests, one running per event by default, queue API closed), then
    # start the server with a share link and the API page hidden.
    queued_iface = iface.queue(max_size=64, default_concurrency_limit=1, api_open=False)
    queued_iface.launch(share=True, show_api=False)