Spaces:

OpenSound
/

SSR-Speech

Running on Zero

App Files Community

OpenSound committed on Dec 22, 2024

Commit

6d01598

1 Parent(s): 2c71769

Update inference_scale.py

Browse files

Files changed (1) hide show

inference_scale.py +3 -6

inference_scale.py CHANGED Viewed

@@ -15,7 +15,7 @@ import time
 @torch.no_grad()
-def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_tokenizer, audio_fn, prompt_text, target_text, mask_interval, cfg_coef, aug_text, aug_context, use_watermark, tts, device, decode_config):
     # phonemize
     text_tokens = [phn2num[phn] for phn in
             tokenize_text(
@@ -54,6 +54,7 @@ def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_token
         stop_repetition=decode_config['stop_repetition'],
         kvcache=decode_config['kvcache'],
         cfg_coef=cfg_coef,
         aug_text=aug_text,
     ) # output is [1,K,T]
     logging.info(f"inference on one sample take: {time.time() - stime:.4f} sec.")
@@ -69,23 +70,19 @@ def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_token
         padding_length = (multiple - (current_length % multiple)) % multiple
         if padding_length > 0:
             wav = F.pad(wav, (0, padding_length), "constant", 0)
-        # new_emb = torch.zeros((1, emb.shape[1], encoded_frames.shape[-1])).to(encoded_frames.device)
         new_wav = torch.zeros(1, encoded_frames.shape[-1]*320) # codec hz
         ori_non_mask_intervals = [(max(item[0],0), item[1]) for item in ori_masks]
         non_mask_intervals = [(max(item[0],0), item[1]) for item in masks]
         for i in range(len(ori_non_mask_intervals)):
-            # new_emb[..., non_mask_intervals[i][0]:non_mask_intervals[i][1]] = emb[..., ori_non_mask_intervals[i][0]:ori_non_mask_intervals[i][1]]
             new_wav[:, non_mask_intervals[i][0]*320:non_mask_intervals[i][1]*320] = wav[:, ori_non_mask_intervals[i][0]*320:ori_non_mask_intervals[i][1]*320]
-        # generated_sample = audio_tokenizer.wmdecode(encoded_frames, marks.to(encoded_frames.device), new_emb, scale)
         generated_sample = audio_tokenizer.wmdecode(encoded_frames, marks.to(encoded_frames.device), new_wav.unsqueeze(0).to(encoded_frames.device), scale)
     else:
         generated_sample = audio_tokenizer.decode(encoded_frames, scale)
     if tts:
-        wav, sr = torchaudio.load(audio_fn)
         generated_sample = generated_sample[:,:, masks[0][1]*320:]
     return generated_sample
@@ -118,4 +115,4 @@ def get_mask_interval(ali_fn, word_span):
     return (start, end)
 if __name__ == "__main__":
-    pass

 @torch.no_grad()
+def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_tokenizer, audio_fn, prompt_text, target_text, mask_interval, cfg_coef, cfg_stride, aug_text, aug_context, use_watermark, tts, device, decode_config):
     # phonemize
     text_tokens = [phn2num[phn] for phn in
             tokenize_text(
         stop_repetition=decode_config['stop_repetition'],
         kvcache=decode_config['kvcache'],
         cfg_coef=cfg_coef,
+        cfg_stride=cfg_stride,
         aug_text=aug_text,
     ) # output is [1,K,T]
     logging.info(f"inference on one sample take: {time.time() - stime:.4f} sec.")
         padding_length = (multiple - (current_length % multiple)) % multiple
         if padding_length > 0:
             wav = F.pad(wav, (0, padding_length), "constant", 0)
         new_wav = torch.zeros(1, encoded_frames.shape[-1]*320) # codec hz
         ori_non_mask_intervals = [(max(item[0],0), item[1]) for item in ori_masks]
         non_mask_intervals = [(max(item[0],0), item[1]) for item in masks]
         for i in range(len(ori_non_mask_intervals)):
             new_wav[:, non_mask_intervals[i][0]*320:non_mask_intervals[i][1]*320] = wav[:, ori_non_mask_intervals[i][0]*320:ori_non_mask_intervals[i][1]*320]
         generated_sample = audio_tokenizer.wmdecode(encoded_frames, marks.to(encoded_frames.device), new_wav.unsqueeze(0).to(encoded_frames.device), scale)
     else:
         generated_sample = audio_tokenizer.decode(encoded_frames, scale)
     if tts:
         generated_sample = generated_sample[:,:, masks[0][1]*320:]
     return generated_sample
     return (start, end)
 if __name__ == "__main__":
+    pass