Upload folder using huggingface_hub
- eval/eval_sync_conf.py +1 -1
- eval/eval_syncnet_acc.py +1 -1
- eval/fvd.py +1 -1
- inference.bat +10 -0
- latentsync/models/attention.py +6 -6
- latentsync/pipelines/lipsync_pipeline.py +1 -1
- latentsync/trepa/__init__.py +4 -4
- latentsync/whisper/whisper/__init__.py +1 -1
- latentsync/whisper/whisper/transcribe.py +1 -1
- scripts/inference.py +2 -2
- scripts/train_syncnet.py +2 -2
- scripts/train_unet.py +2 -2
eval/eval_sync_conf.py
CHANGED
@@ -50,7 +50,7 @@ def main():

    args = parser.parse_args()

-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"

    syncnet = SyncNetEval(device=device)
    syncnet.loadParameters(args.initial_model)
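The edited line keeps the ternary, but both branches now return "cpu", so the script runs CPU-only regardless of whether CUDA is present. The same one-line substitution recurs in most of the files below. A minimal sketch of a single selector that would express the same intent in one place (a hypothetical helper, not part of this commit; the force_cpu flag is an assumption):

    import torch

    def select_device(force_cpu: bool = True) -> torch.device:
        """Pick the compute device; force_cpu mirrors this commit's CPU-only behaviour."""
        if force_cpu or not torch.cuda.is_available():
            return torch.device("cpu")
        return torch.device("cuda")

    device = select_device()  # torch.device("cpu") on a CPU-only machine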
eval/eval_syncnet_acc.py
CHANGED
@@ -27,7 +27,7 @@ from accelerate.utils import set_seed
def main(config):
    set_seed(config.run.seed)

-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cpu" if torch.cuda.is_available() else "cpu"

    if config.data.latent_space:
        vae = AutoencoderKL.from_pretrained(
eval/fvd.py
CHANGED
@@ -25,7 +25,7 @@ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:


@torch.no_grad()
-def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cuda") -> float:
+def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cpu") -> float:
    i3d_path = "checkpoints/auxiliary/i3d_torchscript.pt"
    i3d_kwargs = dict(
        rescale=False, resize=False, return_features=True
inference.bat
ADDED
@@ -0,0 +1,10 @@
+@echo off
+
+python -m scripts.inference ^
+--unet_config_path "configs\unet\second_stage.yaml" ^
+--inference_ckpt_path "checkpoints\latentsync_unet.pt" ^
+--inference_steps 20 ^
+--guidance_scale 1.5 ^
+--video_path "assets\demo1_video.mp4" ^
+--audio_path "assets\demo1_audio.wav" ^
+--video_out_path "video_out.mp4"
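inference.bat is a new Windows launcher for the inference entry point; the trailing caret (^) continues the command across lines. To drive several clips from a script, the same invocation can be issued from Python (a sketch only; the run_latentsync helper and the batching idea are assumptions, paths mirror the .bat file):

    import subprocess

    def run_latentsync(video: str, audio: str, out: str) -> None:
        # Invoke the same CLI the batch file calls, one video/audio pair at a time.
        subprocess.run(
            [
                "python", "-m", "scripts.inference",
                "--unet_config_path", "configs/unet/second_stage.yaml",
                "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
                "--inference_steps", "20",
                "--guidance_scale", "1.5",
                "--video_path", video,
                "--audio_path", audio,
                "--video_out_path", out,
            ],
            check=True,
        )

    run_latentsync("assets/demo1_video.mp4", "assets/demo1_audio.wav", "video_out.mp4")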
latentsync/models/attention.py
CHANGED
@@ -269,9 +269,9 @@ class BasicTransformerBlock(nn.Module):
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
                )
            except Exception as e:
                raise e
@@ -402,9 +402,9 @@ class AudioTransformerBlock(nn.Module):
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
+                    torch.randn((1, 2, 40), device="cpu"),
                )
            except Exception as e:
                raise e
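The probe tensors are now created on the CPU, but xformers' memory-efficient attention kernels are CUDA-only in typical builds, so on a CPU-only machine this probe will still raise (and be re-raised) unless memory-efficient attention is left disabled. A sketch of a guard that only attempts the probe when a GPU is present (hypothetical, not part of this commit):

    import torch

    def can_use_xformers() -> bool:
        # Only probe memory-efficient attention when a CUDA device exists;
        # on CPU-only machines report False instead of raising.
        if not torch.cuda.is_available():
            return False
        try:
            import xformers.ops
            _ = xformers.ops.memory_efficient_attention(
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
            )
            return True
        except Exception:
            return False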
latentsync/pipelines/lipsync_pipeline.py
CHANGED
@@ -322,7 +322,7 @@ class LipsyncPipeline(DiffusionPipeline):
        # 0. Define call parameters
        batch_size = 1
        device = self._execution_device
-        self.image_processor = ImageProcessor(height, mask=mask, device="cuda")
+        self.image_processor = ImageProcessor(height, mask=mask, device="cpu")
        self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")

        faces, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)
latentsync/trepa/__init__.py
CHANGED
@@ -21,7 +21,7 @@ from .third_party.VideoMAEv2.utils import load_videomae_model
class TREPALoss:
    def __init__(
        self,
-        device="cuda",
+        device="cpu",
        ckpt_path="checkpoints/auxiliary/vit_g_hybrid_pt_1200e_ssv2_ft.pth",
    ):
        self.model = load_videomae_model(device, ckpt_path).eval().to(dtype=torch.float16)
@@ -54,9 +54,9 @@ class TREPALoss:

if __name__ == "__main__":
    # input shape: (b, c, f, h, w)
-    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
-    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
+    videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)
+    videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cpu", dtype=torch.float16)

-    trepa_loss = TREPALoss(device="cuda")
+    trepa_loss = TREPALoss(device="cpu")
    loss = trepa_loss(videos_fake, videos_real)
    print(loss)
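TREPALoss still casts the VideoMAE backbone (and the tensors in the __main__ example) to float16 even though it now runs on the CPU; half precision brings no speed-up without a GPU and some CPU ops may lack float16 kernels. One alternative is to make the dtype follow the device (a sketch under that assumption, not part of this commit):

    import torch

    device = "cpu"
    # Assumption: keep float16 only when a GPU is used; fall back to float32 on CPU.
    dtype = torch.float16 if device == "cuda" else torch.float32
    videos_fake = torch.randn(2, 3, 16, 256, 256).to(device=device, dtype=dtype)
    videos_real = torch.randn(2, 3, 16, 256, 256).to(device=device, dtype=dtype)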
latentsync/whisper/whisper/__init__.py
CHANGED
@@ -97,7 +97,7 @@ def load_model(
    """

    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = "cpu" if torch.cuda.is_available() else "cpu"
    if download_root is None:
        download_root = os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))

latentsync/whisper/whisper/transcribe.py
CHANGED
@@ -135,7 +135,7 @@ def cli():
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
+    parser.add_argument("--device", default="cpu" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

scripts/inference.py
CHANGED
@@ -41,7 +41,7 @@ def main(config, args):
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

-    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
+    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cpu", num_frames=config.data.num_frames)

    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
    vae.config.scaling_factor = 0.18215
@@ -64,7 +64,7 @@ def main(config, args):
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
-    ).to("cuda")
+    ).to("cpu")

    if args.seed != -1:
        set_seed(args.seed)
scripts/train_syncnet.py
CHANGED
@@ -209,7 +209,7 @@ def main(config):
            # audio_embeds = wav2vec_encoder(audio_samples).last_hidden_state

            # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                vision_embeds, audio_embeds = syncnet(frames, audio_samples)

            loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()
@@ -309,7 +309,7 @@ def validation(val_dataloader, device, syncnet, cosine_loss, latent_space, lower
        height = frames.shape[2]
        frames = frames[:, :, height // 2 :, :]

-        with torch.autocast(device_type="cuda", dtype=torch.float16):
+        with torch.autocast(device_type="cpu", dtype=torch.float16):
            vision_embeds, audio_embeds = syncnet(frames, audio_samples)

        loss = cosine_loss(vision_embeds.float(), audio_embeds.float(), y).mean()
scripts/train_unet.py
CHANGED
@@ -342,7 +342,7 @@ def main(config):

            # Predict the noise and compute loss
            # Mixed-precision training
-            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=config.run.mixed_precision_training):
+            with torch.autocast(device_type="cpu", dtype=torch.float16, enabled=config.run.mixed_precision_training):
                pred_noise = unet(unet_input, timesteps, encoder_hidden_states=audio_embeds).sample

            if config.run.recon_loss_weight != 0:
@@ -456,7 +456,7 @@ def main(config):
                validation_video_out_path = os.path.join(output_dir, f"val_videos/val_video_{global_step}.mp4")
                validation_video_mask_path = os.path.join(output_dir, f"val_videos/val_video_mask.mp4")

-                with torch.autocast(device_type="cuda", dtype=torch.float16):
+                with torch.autocast(device_type="cpu", dtype=torch.float16):
                    pipeline(
                        config.data.val_video_path,
                        config.data.val_audio_path,
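Both training scripts (scripts/train_syncnet.py and scripts/train_unet.py) now request float16 autocast with device_type="cpu". Depending on the installed PyTorch release, CPU autocast may only accept bfloat16 and reject float16; a minimal sketch of the bfloat16 form (an assumption about the target environment, not part of this commit):

    import torch

    x = torch.randn(4, 8)
    w = torch.randn(8, 2)
    # bfloat16 is the CPU autocast dtype accepted across PyTorch releases.
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        y = x @ w  # matmul runs in bfloat16 under CPU autocast
    print(y.dtype)  # torch.bfloat16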