pillaryao committed
Commit c1c867a · verified · 1 Parent(s): c3c5a50

Upload folder using huggingface_hub

eval/eval_sync_conf.py CHANGED
@@ -50,7 +50,7 @@ def main():
 
     args = parser.parse_args()
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"
 
     syncnet = SyncNetEval(device=device)
     syncnet.loadParameters(args.initial_model)

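Note on the new line (an observation, not a change made by this commit): `"cpu" if torch.cuda.is_available() else "cpu"` evaluates to "cpu" on both branches, so the CUDA check is now a no-op. If the intent is to pin everything to CPU (e.g. for a GPU-less deployment) while keeping an easy way to switch back, a small hypothetical helper could replace the repeated expression here and in the other files edited below; a minimal sketch:

    import torch

    # Hypothetical helper, not part of this commit.
    def get_device(force_cpu: bool = True) -> str:
        """Return "cpu" when forced or when no GPU is present, else "cuda"."""
        if force_cpu or not torch.cuda.is_available():
            return "cpu"
        return "cuda"

Each `device = "cpu" if torch.cuda.is_available() else "cpu"` line would then read `device = get_device()`.
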
eval/eval_syncnet_acc.py CHANGED
@@ -27,7 +27,7 @@ from accelerate.utils import set_seed
 def main(config):
     set_seed(config.run.seed)
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"
 
     if config.data.latent_space:
         vae = AutoencoderKL.from_pretrained(

eval/fvd.py CHANGED
@@ -25,7 +25,7 @@ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
 
 
 @torch.no_grad()
-def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cuda") -> float:
+def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cpu") -> float:
     i3d_path = "checkpoints/auxiliary/i3d_torchscript.pt"
     i3d_kwargs = dict(
         rescale=False, resize=False, return_features=True

inference.bat ADDED
@@ -0,0 +1,10 @@
+@echo off
+
+python -m scripts.inference ^
+--unet_config_path "configs\unet\second_stage.yaml" ^
+--inference_ckpt_path "checkpoints\latentsync_unet.pt" ^
+--inference_steps 20 ^
+--guidance_scale 1.5 ^
+--video_path "assets\demo1_video.mp4" ^
+--audio_path "assets\demo1_audio.wav" ^
+--video_out_path "video_out.mp4"

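For reference, a hypothetical cross-platform equivalent of this batch file (same flags and paths, with forward slashes so it also runs on Linux/macOS; not part of the commit), sketched in Python:

    # Sketch only: invokes scripts/inference.py with the same arguments as inference.bat.
    import subprocess
    import sys

    subprocess.run(
        [
            sys.executable, "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/second_stage.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--inference_steps", "20",
            "--guidance_scale", "1.5",
            "--video_path", "assets/demo1_video.mp4",
            "--audio_path", "assets/demo1_audio.wav",
            "--video_out_path", "video_out.mp4",
        ],
        check=True,
    )
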
latentsync/models/attention.py CHANGED
@@ -269,9 +269,9 @@ class BasicTransformerBlock(nn.Module):
         try:
             # Make sure we can run the memory efficient attention
             _ = xformers.ops.memory_efficient_attention(
-                torch.randn((1, 2, 40), device="cuda"),
-                torch.randn((1, 2, 40), device="cuda"),
-                torch.randn((1, 2, 40), device="cuda"),
+                torch.randn((1, 2, 40), device="cpu"),
+                torch.randn((1, 2, 40), device="cpu"),
+                torch.randn((1, 2, 40), device="cpu"),
             )
         except Exception as e:
             raise e
@@ -402,9 +402,9 @@ class AudioTransformerBlock(nn.Module):
         try:
             # Make sure we can run the memory efficient attention
             _ = xformers.ops.memory_efficient_attention(
-                torch.randn((1, 2, 40), device="cuda"),
-                torch.randn((1, 2, 40), device="cuda"),
-                torch.randn((1, 2, 40), device="cuda"),
+                torch.randn((1, 2, 40), device="cpu"),
+                torch.randn((1, 2, 40), device="cpu"),
+                torch.randn((1, 2, 40), device="cpu"),
             )
         except Exception as e:
             raise e

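A caveat on these two hunks (an observation, not part of the commit): xformers' memory-efficient attention kernels are generally CUDA-oriented, so this probe is likely to raise with CPU tensors, and the surrounding code re-raises the exception rather than falling back. A hedged sketch of a guard that only attempts the probe when a GPU is present (the helper name is hypothetical):

    import torch

    def can_use_xformers() -> bool:
        """Probe xformers memory-efficient attention only when CUDA is available."""
        if not torch.cuda.is_available():
            return False
        try:
            import xformers.ops
            xformers.ops.memory_efficient_attention(
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
            )
            return True
        except Exception:
            return False
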
latentsync/pipelines/lipsync_pipeline.py CHANGED
@@ -322,7 +322,7 @@ class LipsyncPipeline(DiffusionPipeline):
         # 0. Define call parameters
         batch_size = 1
         device = self._execution_device
-        self.image_processor = ImageProcessor(height, mask=mask, device="cuda")
+        self.image_processor = ImageProcessor(height, mask=mask, device="cpu")
         self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
 
         faces, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)

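Since `device = self._execution_device` is already computed two lines above, an alternative sketch (not what this commit does) is to pass that variable through so the image processor simply follows wherever the pipeline has been placed:

    # Hypothetical alternative: reuse the pipeline's execution device instead of a hard-coded string.
    self.image_processor = ImageProcessor(height, mask=mask, device=device)
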
latentsync/trepa/__init__.py CHANGED
@@ -21,7 +21,7 @@ from .third_party.VideoMAEv2.utils import load_videomae_model
 class TREPALoss:
     def __init__(
         self,
-        device="cuda",
+        device="cpu",
         ckpt_path="checkpoints/auxiliary/vit_g_hybrid_pt_1200e_ssv2_ft.pth",
     ):
         self.model = load_videomae_model(device, ckpt_path).eval().to(dtype=torch.float16)
@@ -54,9 +54,9 @@ class TREPALoss:
 
 if __name__ == "__main__":
     # input shape: (b, c, f, h, w)
-    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
-    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
+    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)
+    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)
 
-    trepa_loss = TREPALoss(device="cuda")
+    trepa_loss = TREPALoss(device="cpu")
    loss = trepa_loss(videos_fake, videos_real)
    print(loss)

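One more observation (not part of the commit): `load_videomae_model(...).to(dtype=torch.float16)` keeps the model in half precision, and float16 kernel coverage on CPU is limited in PyTorch, so CPU runs may be slow or fail for some ops. A minimal sketch of picking the dtype from the device (the helper name is hypothetical):

    import torch

    def trepa_dtype(device: str) -> torch.dtype:
        # Half precision is the usual choice on CUDA; fall back to float32 on CPU.
        return torch.float16 if str(device).startswith("cuda") else torch.float32
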
latentsync/whisper/whisper/__init__.py CHANGED
@@ -97,7 +97,7 @@ def load_model(
     """
 
     if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = "cpu" if torch.cuda.is_available() else "cpu"
     if download_root is None:
         download_root = os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
 

latentsync/whisper/whisper/transcribe.py CHANGED
@@ -135,7 +135,7 @@ def cli():
     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
     parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
     parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
+    parser.add_argument("--device", default="cpu" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
     parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
     parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
 

scripts/inference.py CHANGED
@@ -41,7 +41,7 @@ def main(config, args):
     else:
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
-    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
+    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cpu", num_frames=config.data.num_frames)
 
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
     vae.config.scaling_factor = 0.18215
@@ -64,7 +64,7 @@ def main(config, args):
         audio_encoder=audio_encoder,
         unet=unet,
         scheduler=scheduler,
-    ).to("cuda")
+    ).to("cpu")
 
     if args.seed != -1:
         set_seed(args.seed)

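If the goal of these two hunks is CPU-only inference, the weight dtype usually has to follow the device as well, since half-precision pipelines are poorly supported on CPU. A minimal sketch, assuming the existing `dtype` variable in scripts/inference.py (visible in the `AutoencoderKL.from_pretrained(..., torch_dtype=dtype)` call above) controls weight precision:

    import torch

    # Hypothetical selection, not part of this commit: one place to flip between CPU and GPU.
    device = "cpu"  # or: "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32

`Audio2Feature(..., device=device, ...)` and the final `.to(device)` call could then share the same value.
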
scripts/train_syncnet.py CHANGED
@@ -209,7 +209,7 @@ def main(config):
             # audio_embeds = wav2vec_encoder(audio_samples).last_hidden_state
 
             # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                 vision_embeds, audio_embeds = syncnet(frames, audio_samples)
 
             loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()
@@ -309,7 +309,7 @@ def validation(val_dataloader, device, syncnet, cosine_loss, latent_space, lower
     height = frames.shape[2]
     frames = frames[:, :, height // 2 :, :]
 
-    with torch.autocast(device_type="cuda", dtype=torch.float16):
+    with torch.autocast(device_type="cpu", dtype=torch.float16):
         vision_embeds, audio_embeds = syncnet(frames, audio_samples)
 
     loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()

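A note on these autocast changes (and the matching ones in scripts/train_unet.py below): CPU autocast in PyTorch has traditionally supported bfloat16 rather than float16, with float16-on-CPU only added in recent releases, so the dtype likely needs to change along with the device_type. A hedged sketch, adapting the lines above:

    import torch

    # Hypothetical adaptation, not part of this commit: pick the autocast dtype from the device.
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    amp_dtype = torch.float16 if device_type == "cuda" else torch.bfloat16

    with torch.autocast(device_type=device_type, dtype=amp_dtype,
                        enabled=config.run.mixed_precision_training):
        vision_embeds, audio_embeds = syncnet(frames, audio_samples)
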
scripts/train_unet.py CHANGED
@@ -342,7 +342,7 @@ def main(config):
 
             # Predict the noise and compute loss
             # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                 pred_noise = unet(unet_input, timesteps, encoder_hidden_states=audio_embeds).sample
 
             if config.run.recon_loss_weight != 0:
@@ -456,7 +456,7 @@ def main(config):
             validation_video_out_path = os.path.join(output_dir, f"val_videos/val_video_{global_step}.mp4")
             validation_video_mask_path = os.path.join(output_dir, f"val_videos/val_video_mask.mp4")
 
-            with torch.autocast(device_type="cuda", dtype=torch.float16):
+            with torch.autocast(device_type="cpu", dtype=torch.float16):
                 pipeline(
                     config.data.val_video_path,
                     config.data.val_audio_path,