jacobitterman committed on
Commit 119f89b · verified · 1 Parent(s): ea3b205

Update README.md

Files changed (1)
  1. README.md +183 -21
README.md CHANGED
@@ -121,58 +121,220 @@ Make sure you install `diffusers` before trying out the examples below.
  pip install -U git+https://github.com/huggingface/diffusers
  ```

- Now, you can run the examples below:

  ```py
  import torch
- from diffusers import LTXPipeline
  from diffusers.utils import export_to_video

- pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
  pipe.to("cuda")

- prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-
  video = pipe(
      prompt=prompt,
      negative_prompt=negative_prompt,
-     width=704,
-     height=480,
-     num_frames=161,
-     num_inference_steps=50,
  ).frames[0]
  export_to_video(video, "output.mp4", fps=24)
  ```

- For image-to-video:

  ```py
  import torch
- from diffusers import LTXImageToVideoPipeline
  from diffusers.utils import export_to_video, load_image

- pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
  pipe.to("cuda")

- image = load_image(
-     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
- )
- prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
  video = pipe(
-     image=image,
      prompt=prompt,
      negative_prompt=negative_prompt,
-     width=704,
-     height=480,
-     num_frames=161,
-     num_inference_steps=50,
  ).frames[0]
  export_to_video(video, "output.mp4", fps=24)
  ```

  To learn more, check out the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).

  Diffusers also supports directly loading from the original LTX checkpoints using the `from_single_file()` method. Check out [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video#loading-single-files) to learn more.

  pip install -U git+https://github.com/huggingface/diffusers
  ```

+ Now, you can run the examples below (note that the upsampling stage is optional but recommended; a minimal single-pass sketch follows the text-to-video example):

+ ### For text-to-video:
  ```py
  import torch
+ from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+ from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
  from diffusers.utils import export_to_video

+ pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+ pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
  pipe.to("cuda")
+ pipe_upsample.to("cuda")
+ pipe.vae.enable_tiling()
+
+ def round_to_nearest_resolution_acceptable_by_vae(height, width):
+     # height/width must be multiples of the VAE's spatial compression ratio
+     height = height - (height % pipe.vae_spatial_compression_ratio)
+     width = width - (width % pipe.vae_spatial_compression_ratio)
+     return height, width
+
+ prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+ expected_height, expected_width = 704, 512
+ downscale_factor = 2 / 3
+ num_frames = 121
+
+ # Part 1. Generate video at smaller resolution
+ downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+ downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+ latents = pipe(
+     conditions=None,
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     width=downscaled_width,
+     height=downscaled_height,
+     num_frames=num_frames,
+     num_inference_steps=30,
+     generator=torch.Generator().manual_seed(0),
+     output_type="latent",
+ ).frames
+
+ # Part 2. Upscale generated video using latent upsampler with fewer inference steps
+ # The available latent upsampler upscales the height/width by 2x
+ upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+ upscaled_latents = pipe_upsample(
+     latents=latents,
+     output_type="latent"
+ ).frames
+
+ # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
  video = pipe(
      prompt=prompt,
      negative_prompt=negative_prompt,
+     width=upscaled_width,
+     height=upscaled_height,
+     num_frames=num_frames,
+     denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+     num_inference_steps=10,
+     latents=upscaled_latents,
+     decode_timestep=0.05,
+     image_cond_noise_scale=0.025,
+     generator=torch.Generator().manual_seed(0),
+     output_type="pil",
  ).frames[0]
+
+ # Part 4. Downscale the video to the expected resolution
+ video = [frame.resize((expected_width, expected_height)) for frame in video]
+
  export_to_video(video, "output.mp4", fps=24)
  ```
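
The upsampling passes above (Parts 2-4) are the optional part mentioned earlier: if you only need a quick result, a single denoising pass at the final resolution also works. Here is a minimal sketch that reuses the `pipe`, `prompt`, and `negative_prompt` from the example above, with height and width kept divisible by 32 to match the VAE's spatial compression:

```py
# Minimal single-pass generation without the latent upsampler (sketch, reusing
# `pipe`, `prompt`, and `negative_prompt` from the example above).
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=512,    # divisible by 32
    height=704,   # divisible by 32
    num_frames=121,
    num_inference_steps=50,
    generator=torch.Generator().manual_seed(0),
).frames[0]
export_to_video(video, "output_single_pass.mp4", fps=24)
```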

+ ### For image-to-video:

  ```py
  import torch
+ from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+ from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
  from diffusers.utils import export_to_video, load_image

+ pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+ pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
  pipe.to("cuda")
+ pipe_upsample.to("cuda")
+ pipe.vae.enable_tiling()
+
+ def round_to_nearest_resolution_acceptable_by_vae(height, width):
+     # height/width must be multiples of the VAE's spatial compression ratio
+     height = height - (height % pipe.vae_spatial_compression_ratio)
+     width = width - (width % pipe.vae_spatial_compression_ratio)
+     return height, width
+
+ image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png")
+ video = [image]
+ condition1 = LTXVideoCondition(video=video, frame_index=0)

+ prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+ expected_height, expected_width = 832, 480
+ downscale_factor = 2 / 3
+ num_frames = 96
+
+ # Part 1. Generate video at smaller resolution
+ downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+ downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+ latents = pipe(
+     conditions=[condition1],
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     width=downscaled_width,
+     height=downscaled_height,
+     num_frames=num_frames,
+     num_inference_steps=30,
+     generator=torch.Generator().manual_seed(0),
+     output_type="latent",
+ ).frames
+
+ # Part 2. Upscale generated video using latent upsampler with fewer inference steps
+ # The available latent upsampler upscales the height/width by 2x
+ upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+ upscaled_latents = pipe_upsample(
+     latents=latents,
+     output_type="latent"
+ ).frames
+
+ # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
+ video = pipe(
+     conditions=[condition1],
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     width=upscaled_width,
+     height=upscaled_height,
+     num_frames=num_frames,
+     denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+     num_inference_steps=10,
+     latents=upscaled_latents,
+     decode_timestep=0.05,
+     image_cond_noise_scale=0.025,
+     generator=torch.Generator().manual_seed(0),
+     output_type="pil",
+ ).frames[0]
+
+ # Part 4. Downscale the video to the expected resolution
+ video = [frame.resize((expected_width, expected_height)) for frame in video]
+
+ export_to_video(video, "output.mp4", fps=24)
+ ```

+ ### For video-to-video:

+ ```py
+ import torch
+ from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+ from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
+ from diffusers.utils import export_to_video, load_video
+
+ pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+ pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
+ pipe.to("cuda")
+ pipe_upsample.to("cuda")
+ pipe.vae.enable_tiling()
+
+ def round_to_nearest_resolution_acceptable_by_vae(height, width):
+     # height/width must be multiples of the VAE's spatial compression ratio
+     height = height - (height % pipe.vae_spatial_compression_ratio)
+     width = width - (width % pipe.vae_spatial_compression_ratio)
+     return height, width

+ video = load_video(
+     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
+ )[:21]  # Use only the first 21 frames as conditioning
+ condition1 = LTXVideoCondition(video=video, frame_index=0)
+
+ prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
+ negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+ expected_height, expected_width = 768, 1152
+ downscale_factor = 2 / 3
+ num_frames = 161
+
+ # Part 1. Generate video at smaller resolution
+ downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+ downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+ latents = pipe(
+     conditions=[condition1],
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     width=downscaled_width,
+     height=downscaled_height,
+     num_frames=num_frames,
+     num_inference_steps=30,
+     generator=torch.Generator().manual_seed(0),
+     output_type="latent",
+ ).frames
+
+ # Part 2. Upscale generated video using latent upsampler with fewer inference steps
+ # The available latent upsampler upscales the height/width by 2x
+ upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+ upscaled_latents = pipe_upsample(
+     latents=latents,
+     output_type="latent"
+ ).frames
+
+ # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
  video = pipe(
+     conditions=[condition1],
      prompt=prompt,
      negative_prompt=negative_prompt,
+     width=upscaled_width,
+     height=upscaled_height,
+     num_frames=num_frames,
+     denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+     num_inference_steps=10,
+     latents=upscaled_latents,
+     decode_timestep=0.05,
+     image_cond_noise_scale=0.025,
+     generator=torch.Generator().manual_seed(0),
+     output_type="pil",
  ).frames[0]
+
+ # Part 4. Downscale the video to the expected resolution
+ video = [frame.resize((expected_width, expected_height)) for frame in video]
+
  export_to_video(video, "output.mp4", fps=24)
  ```

  To learn more, check out the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).

  Diffusers also supports directly loading from the original LTX checkpoints using the `from_single_file()` method. Check out [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video#loading-single-files) to learn more.
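
As an illustration, a single-file load can be combined with `from_pretrained()` roughly as in the sketch below; the checkpoint URL is a placeholder, so check the linked section for the actual filenames and supported components:

```py
# Sketch of single-file loading. The checkpoint URL is a placeholder; see the
# single-file loading section linked above for the real filenames.
import torch
from diffusers import LTXConditionPipeline, LTXVideoTransformer3DModel

ckpt_url = "https://huggingface.co/Lightricks/LTX-Video/<checkpoint>.safetensors"  # placeholder
transformer = LTXVideoTransformer3DModel.from_single_file(ckpt_url, torch_dtype=torch.bfloat16)
pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-dev", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
```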