Upload folder using huggingface_hub
- eval/eval_sync_conf.py +1 -1
- eval/eval_syncnet_acc.py +1 -1
- eval/fvd.py +1 -1
- inference.bat +10 -0
- latentsync/models/attention.py +6 -6
- latentsync/pipelines/lipsync_pipeline.py +1 -1
- latentsync/trepa/__init__.py +4 -4
- latentsync/whisper/whisper/__init__.py +1 -1
- latentsync/whisper/whisper/transcribe.py +1 -1
- scripts/inference.py +2 -2
- scripts/train_syncnet.py +2 -2
- scripts/train_unet.py +2 -2
eval/eval_sync_conf.py
CHANGED
@@ -50,7 +50,7 @@ def main():

    args = parser.parse_args()

-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"

    syncnet = SyncNetEval(device=device)
    syncnet.loadParameters(args.initial_model)
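The edited line keeps the ternary, but both branches now return "cpu", so the script runs CPU-only regardless of whether CUDA is present. The same one-line substitution recurs in most of the files below. A minimal sketch of a single selector that would express the same intent in one place (a hypothetical helper, not part of this commit; the force_cpu flag is an assumption):

    import torch

    def select_device(force_cpu: bool = True) -> torch.device:
        """Pick the compute device; force_cpu mirrors this commit's CPU-only behaviour."""
        if force_cpu or not torch.cuda.is_available():
            return torch.device("cpu")
        return torch.device("cuda")

    device = select_device()  # torch.device("cpu") on a CPU-only machine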
eval/eval_syncnet_acc.py
CHANGED
@@ -27,7 +27,7 @@ from accelerate.utils import set_seed
def main(config):
    set_seed(config.run.seed)

-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"

    if config.data.latent_space:
        vae = AutoencoderKL.from_pretrained(
eval/fvd.py
CHANGED
@@ -25,7 +25,7 @@ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:


@torch.no_grad()
-def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cuda") -> float:
+def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cpu") -> float:
    i3d_path = "checkpoints/auxiliary/i3d_torchscript.pt"
    i3d_kwargs = dict(
        rescale=False, resize=False, return_features=True
inference.bat
ADDED
@@ -0,0 +1,10 @@
+@echo off
+
+python -m scripts.inference ^
+--unet_config_path "configs\unet\second_stage.yaml" ^
+--inference_ckpt_path "checkpoints\latentsync_unet.pt" ^
+--inference_steps 20 ^
+--guidance_scale 1.5 ^
+--video_path "assets\demo1_video.mp4" ^
+--audio_path "assets\demo1_audio.wav" ^
+--video_out_path "video_out.mp4"
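inference.bat is a new Windows launcher for the inference entry point; the trailing caret (^) continues the command across lines. To drive several clips from a script, the same invocation can be issued from Python (a sketch only; the run_latentsync helper and the batching idea are assumptions, paths mirror the .bat file):

    import subprocess

    def run_latentsync(video: str, audio: str, out: str) -> None:
        # Invoke the same CLI the batch file calls, one video/audio pair at a time.
        subprocess.run(
            [
                "python", "-m", "scripts.inference",
                "--unet_config_path", "configs/unet/second_stage.yaml",
                "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
                "--inference_steps", "20",
                "--guidance_scale", "1.5",
                "--video_path", video,
                "--audio_path", audio,
                "--video_out_path", out,
            ],
            check=True,
        )

    run_latentsync("assets/demo1_video.mp4", "assets/demo1_audio.wav", "video_out.mp4")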
latentsync/models/attention.py
CHANGED
@@ -269,9 +269,9 @@ class BasicTransformerBlock(nn.Module):
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
                )
            except Exception as e:
                raise e
@@ -402,9 +402,9 @@ class AudioTransformerBlock(nn.Module):
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
                )
            except Exception as e:
                raise e
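The probe tensors are now created on the CPU, but xformers' memory-efficient attention kernels are CUDA-only in typical builds, so on a CPU-only machine this probe will still raise (and be re-raised) unless memory-efficient attention is left disabled. A sketch of a guard that only attempts the probe when a GPU is present (hypothetical, not part of this commit):

    import torch

    def can_use_xformers() -> bool:
        # Only probe memory-efficient attention when a CUDA device exists;
        # on CPU-only machines report False instead of raising.
        if not torch.cuda.is_available():
            return False
        try:
            import xformers.ops
            _ = xformers.ops.memory_efficient_attention(
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
            )
            return True
        except Exception:
            return False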
latentsync/pipelines/lipsync_pipeline.py
CHANGED
@@ -322,7 +322,7 @@ class LipsyncPipeline(DiffusionPipeline):
        # 0. Define call parameters
        batch_size = 1
        device = self._execution_device
-        self.image_processor = ImageProcessor(height, mask=mask, device="cuda")
+        self.image_processor = ImageProcessor(height, mask=mask, device="cpu")
        self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")

        faces, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)
latentsync/trepa/__init__.py
CHANGED
@@ -21,7 +21,7 @@ from .third_party.VideoMAEv2.utils import load_videomae_model
class TREPALoss:
    def __init__(
        self,
-        device="cuda",
+        device="cpu",
        ckpt_path="checkpoints/auxiliary/vit_g_hybrid_pt_1200e_ssv2_ft.pth",
    ):
        self.model = load_videomae_model(device, ckpt_path).eval().to(dtype=torch.float16)
@@ -54,9 +54,9 @@ class TREPALoss:

if __name__ == "__main__":
    # input shape: (b, c, f, h, w)
-    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
-    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
+    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)
+    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)

-    trepa_loss = TREPALoss(device="cuda")
+    trepa_loss = TREPALoss(device="cpu")
    loss = trepa_loss(videos_fake, videos_real)
    print(loss)
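TREPALoss still casts the VideoMAE backbone (and the tensors in the __main__ example) to float16 even though it now runs on the CPU; half precision brings no speed-up without a GPU and some CPU ops may lack float16 kernels. One alternative is to make the dtype follow the device (a sketch under that assumption, not part of this commit):

    import torch

    device = "cpu"
    # Assumption: keep float16 only when a GPU is used; fall back to float32 on CPU.
    dtype = torch.float16 if device == "cuda" else torch.float32
    videos_fake = torch.randn(2, 3, 16, 256, 256).to(device=device, dtype=dtype)
    videos_real = torch.randn(2, 3, 16, 256, 256).to(device=device, dtype=dtype)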
latentsync/whisper/whisper/__init__.py
CHANGED
@@ -97,7 +97,7 @@ def load_model(
    """

    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = "cpu" if torch.cuda.is_available() else "cpu"
    if download_root is None:
        download_root = os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))

latentsync/whisper/whisper/transcribe.py
CHANGED
@@ -135,7 +135,7 @@ def cli():
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
+    parser.add_argument("--device", default="cpu" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

scripts/inference.py
CHANGED
@@ -41,7 +41,7 @@ def main(config, args):
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

-    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
+    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cpu", num_frames=config.data.num_frames)

    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
    vae.config.scaling_factor = 0.18215
@@ -64,7 +64,7 @@ def main(config, args):
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
-    ).to("cuda")
+    ).to("cpu")

    if args.seed != -1:
        set_seed(args.seed)
scripts/train_syncnet.py
CHANGED
@@ -209,7 +209,7 @@ def main(config):
            # audio_embeds = wav2vec_encoder(audio_samples).last_hidden_state

            # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                vision_embeds, audio_embeds = syncnet(frames, audio_samples)

            loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()
@@ -309,7 +309,7 @@ def validation(val_dataloader, device, syncnet, cosine_loss, latent_space, lower
        height = frames.shape[2]
        frames = frames[:, :, height // 2 :, :]

-        with torch.autocast(device_type="cuda", dtype=torch.float16):
+        with torch.autocast(device_type="cpu", dtype=torch.float16):
            vision_embeds, audio_embeds = syncnet(frames, audio_samples)

        loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()
scripts/train_unet.py
CHANGED
@@ -342,7 +342,7 @@ def main(config):

            # Predict the noise and compute loss
            # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                pred_noise = unet(unet_input, timesteps, encoder_hidden_states=audio_embeds).sample

            if config.run.recon_loss_weight != 0:
@@ -456,7 +456,7 @@ def main(config):
                validation_video_out_path = os.path.join(output_dir, f"val_videos/val_video_{global_step}.mp4")
                validation_video_mask_path = os.path.join(output_dir, f"val_videos/val_video_mask.mp4")

-                with torch.autocast(device_type="cuda", dtype=torch.float16):
+                with torch.autocast(device_type="cpu", dtype=torch.float16):
                    pipeline(
                        config.data.val_video_path,
                        config.data.val_audio_path,
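Both training scripts (scripts/train_syncnet.py and scripts/train_unet.py) now request float16 autocast with device_type="cpu". Depending on the installed PyTorch release, CPU autocast may only accept bfloat16 and reject float16; a minimal sketch of the bfloat16 form (an assumption about the target environment, not part of this commit):

    import torch

    x = torch.randn(4, 8)
    w = torch.randn(8, 2)
    # bfloat16 is the CPU autocast dtype accepted across PyTorch releases.
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        y = x @ w  # matmul runs in bfloat16 under CPU autocast
    print(y.dtype)  # torch.bfloat16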