"""End-to-end TTS inference pipeline: tokenize a reference wav, generate
speech tokens from input text, then vocode the tokens into a waveform."""

import argparse
import logging
import sys
from pathlib import Path

import librosa
import soundfile as sf

# Make the bundled submodules importable as top-level packages.
sub_modules = ["", "thirdparty/G2P", "semantic_tokenizer/f40ms", "text2token", "semantic_detokenizer"]
for sub in sub_modules:
    sys.path.append(str((Path(__file__).parent / sub).absolute()))

# These imports depend on the sys.path entries added above.
from semantic_tokenizer.f40ms.simple_tokenizer_infer import SpeechTokenizer, TOKENIZER_CFG_NAME
from text2token.simple_infer import Text2TokenGenerator
from semantic_detokenizer.chunk_infer import SpeechDetokenizer


class TTSPipeline:
    def __init__(
        self,
        detok_vocoder: str,
        tokenizer_cfg_name: str = TOKENIZER_CFG_NAME,
        tokenizer_cfg_path: str = str(
            (Path(__file__).parent / "semantic_tokenizer/f40ms/config").absolute()
        ),
        tokenizer_ckpt: str = str(
            (Path(__file__).parent / "semantic_tokenizer/f40ms/ckpt/model.pt").absolute()
        ),
        max_seg_len: int = 0,
        detok_model_cfg: str = str(
            (Path(__file__).parent / "semantic_detokenizer/ckpt/model.yaml").absolute()
        ),
        detok_ckpt: str = str(
            (Path(__file__).parent / "semantic_detokenizer/ckpt/model.pt").absolute()
        ),
        detok_vocab: str = str(
            (Path(__file__).parent / "semantic_detokenizer/ckpt/vocab_4096.txt").absolute()
        ),
    ):
        # Speech tokenizer: reference wav -> semantic token sequence.
        self.tokenizer_cfg_name = tokenizer_cfg_name
        self.tokenizer = SpeechTokenizer(
            ckpt_path=tokenizer_ckpt,
            cfg_path=tokenizer_cfg_path,
            cfg_name=self.tokenizer_cfg_name,
        )

        # Text-to-token generator; max_seg_len is forwarded to
        # generate_for_long_input_text() in synthesize().
        self.t2u_max_seg_len = max_seg_len
        self.t2u = Text2TokenGenerator()

        # Detokenizer: semantic tokens -> waveform, via the given vocoder.
        self.device = "cuda:0"
        self.detoker = SpeechDetokenizer(
            vocoder_path=detok_vocoder,
            model_cfg=detok_model_cfg,
            ckpt_file=detok_ckpt,
            vocab_file=detok_vocab,
            device=self.device,
        )

        # Chunked-generation parameters for the detokenizer.
        self.token_chunk_len = 75
        self.chunk_cond_proportion = 0.3
        self.chunk_look_ahead = 10
        self.max_ref_duration = 4.5
        self.ref_audio_cut_from_head = False
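
    # synthesize() chains the three stages end to end: reference-audio
    # tokenization -> text-to-token generation -> chunked detokenization.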
    def synthesize(self, ref_wav, input_text):
        # Stage 1: tokenize the reference audio (resampled to 16 kHz).
        raw_wav, _ = librosa.load(ref_wav, sr=16000)
        ref_wavs_list = [raw_wav]

        _token_list, token_info_list = self.tokenizer.extract(ref_wavs_list)
        ref_token_list = token_info_list[0]["reduced_unit_sequence"]
        logging.info("tokens for ref wav %s are [%s]", ref_wav, ref_token_list)

        # Stage 2: convert the input text to phonemes, then to speech tokens.
        phones = self.t2u.text2phone(input_text.strip())
        logging.info("phonemes of input text %s are [%s]", input_text, phones)

        speech_tokens_info = self.t2u.generate_for_long_input_text(
            [phones], max_segment_len=self.t2u_max_seg_len
        )

        # Stage 3: vocode the speech tokens chunk by chunk, conditioned on the
        # reference audio and its token sequence.
        generated_wave, target_sample_rate = self.detoker.chunk_generate(
            ref_wav,
            ref_token_list.split(),
            speech_tokens_info[0][0],
            self.token_chunk_len,
            self.chunk_cond_proportion,
            self.chunk_look_ahead,
            self.max_ref_duration,
            self.ref_audio_cut_from_head,
        )

        if generated_wave is None:
            logging.error("generation FAILED for input text: %s", input_text)
            return None, None
        return generated_wave, target_sample_rate
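
# A minimal programmatic-usage sketch (the paths below are illustrative
# placeholders, not shipped assets):
#
#   pipeline = TTSPipeline(detok_vocoder="/path/to/vocoder.pt")
#   wave, sr = pipeline.synthesize("ref.wav", "Hello there.")
#   if wave is not None:
#       sf.write("out.wav", wave, sr)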


def main(args):
    # Forward only the overrides the user actually set, so the TTSPipeline
    # defaults still apply for any flag that was omitted.
    overrides = {
        k: v
        for k, v in {
            "tokenizer_ckpt": args.tokenizer_ckpt,
            "tokenizer_cfg_path": args.tokenizer_cfg_path,
            "detok_ckpt": args.detok_ckpt,
            "detok_model_cfg": args.detok_model_cfg,
            "detok_vocab": args.detok_vocab,
        }.items()
        if v is not None
    }
    tts = TTSPipeline(
        detok_vocoder=args.detok_vocoder,
        max_seg_len=args.max_seg_len,
        **overrides,
    )

    generated_wave, target_sample_rate = tts.synthesize(args.ref_wav, args.input_text)
    if generated_wave is None:
        logging.error("synthesis failed; no output written")
        sys.exit(1)

    sf.write(args.output_wav, generated_wave, target_sample_rate)
    logging.info("wrote output to: %s", args.output_wav)
    logging.info("Finished")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="End-to-end TTS inference")
    parser.add_argument(
        "--tokenizer-ckpt",
        required=False,
        help="path to the speech tokenizer checkpoint (defaults to the bundled model.pt)",
    )
    parser.add_argument(
        "--tokenizer-cfg-path",
        required=False,
        help="path to the tokenizer config directory (defaults to the bundled config)",
    )
    parser.add_argument(
        "--detok-ckpt",
        required=False,
        help="path to the detokenizer checkpoint (defaults to the bundled model.pt)",
    )
    parser.add_argument(
        "--detok-model-cfg",
        required=False,
        help="path to the detokenizer model config (defaults to the bundled model.yaml)",
    )
    parser.add_argument(
        "--detok-vocab",
        required=False,
        help="path to the detokenizer vocab file (defaults to the bundled vocab_4096.txt)",
    )
    parser.add_argument(
        "--detok-vocoder",
        required=True,
        help="path to the vocoder used by the detokenizer",
    )
    parser.add_argument(
        "--ref-wav",
        required=True,
        help="path to the reference wav used as the voice prompt",
    )
    parser.add_argument(
        "--max-seg-len",
        required=False,
        default=0,
        type=int,
        help="max segment length passed to the text-to-token generator",
    )
    parser.add_argument(
        "--output-wav",
        required=True,
        help="path to the output synthesized wav",
    )
    parser.add_argument(
        "--input-text",
        required=True,
        help="input text to synthesize",
    )

    args = parser.parse_args()
    main(args)
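
# Example invocation (script name and paths are illustrative placeholders):
#
#   python tts_pipeline.py \
#       --detok-vocoder /path/to/vocoder.pt \
#       --ref-wav /path/to/ref.wav \
#       --input-text "Text to synthesize." \
#       --output-wav /path/to/output.wav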