# zerovox-demo / app.py
# Guenter Bartsch
# adapt to zerovox 0.3.0
# ebf5126
import tempfile
import time
import librosa
import streamlit as st
from zerovox.tts.synthesize import ZeroVoxTTS
# Sample rate (Hz) used by the UI for loading uploaded voices and for playback.
# FIXME (kept from the original): hard-coded here, while do_synth() reads the
# rate from the loaded model config -- presumably the two should agree.
SAMPLE_RATE = 24000

# Speaker reference selected when a session starts.
DEFAULT_SPEAKER = 'en_kevin.wav'

# Sample sentences pre-filled into the text area, one per supported language.
SAMPLE_SENTENCE_EN = (
    "A rainbow is an optical phenomenon caused by refraction, internal "
    "reflection and dispersion of light in water droplets resulting in a "
    "continuous spectrum of light appearing in the sky."
)
SAMPLE_SENTENCE_DE = (
    "Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als "
    "kreisbogenförmiges farbiges Lichtband in einem von der Sonne "
    "beschienenen Regenschauer erscheint."
)

# Language selected when a session starts.
DEFAULT_LANGUAGE = 'en'
# Seed the Streamlit session state. Every key is only written when it is not
# present yet, so values already stored in the session are left untouched.

# The language has to be seeded first: the default text depends on it.
if "lang" not in st.session_state:
    st.session_state["lang"] = DEFAULT_LANGUAGE

if "text" not in st.session_state:
    if st.session_state["lang"] == 'en':
        st.session_state["text"] = SAMPLE_SENTENCE_EN
    else:
        st.session_state["text"] = SAMPLE_SENTENCE_DE

# The remaining defaults are independent of each other.
for _key, _initial in (
    ("message", "READY."),
    ("autoplay", False),
    ("speakerref", DEFAULT_SPEAKER),
    ("custom_voice", False),
    ("voice_wav", None),
):
    if _key not in st.session_state:
        st.session_state[_key] = _initial
def update_text_input():
    """Replace the text-area content with the sample sentence of the chosen language.

    Registered as the ``on_change`` callback of the language selectbox. For a
    language other than "en" or "de" the current text is left unchanged.
    """
    samples = {
        "en": SAMPLE_SENTENCE_EN,
        "de": SAMPLE_SENTENCE_DE,
    }
    sample = samples.get(st.session_state['lang'])
    if sample is not None:
        st.session_state.text = sample
def do_synth():
    """Synthesize ``st.session_state.text`` and publish the result via session state.

    Registered as the ``on_change`` callback of the text area and as the
    ``on_click`` callback of the "Synthesize!" button.

    Steps:
      1. Reuse the synthesizer cached in ``st.session_state.synth`` unless its
         ``meldec_model`` or ``language`` differs from the current widget
         values; otherwise (re)load the default model for the language.
      2. Compute the speaker embedding from the uploaded voice sample when
         "Custom voice" is enabled and a sample is stored, else from the
         selected built-in speaker reference.
      3. Run TTS and store ``wav``, a timing ``message`` and ``autoplay = True``
         in ``st.session_state``.

    Progress is reported through the module-level ``status`` element, which
    therefore has to exist by the time the callback fires.
    """
    # Only reading module globals here, so no ``global`` declaration is needed.
    synth = st.session_state.synth if 'synth' in st.session_state else None

    # A cached synthesizer is only usable if it was loaded for the currently
    # selected MEL decoder and language; otherwise trigger a reload.
    if synth is not None and (
        synth.meldec_model != st.session_state['meldec']
        or synth.language != st.session_state.lang
    ):
        synth = None

    # Explicit ``is None`` test: do not rely on the truthiness of the model object.
    if synth is None:
        status.update(label="loading the model...", state="running")
        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(st.session_state.lang),
            meldec_model=st.session_state['meldec'],
            infer_device='cpu',
            num_threads=-1,
            verbose=True,
        )
        synth = st.session_state.synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    if st.session_state.custom_voice and st.session_state.voice_wav is not None:
        # Voice sample uploaded by the user.
        speakerref = st.session_state.voice_wav
    else:
        # Built-in speaker reference chosen in the "Voice" tab.
        speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, sampling_rate)
    spkemb = synth.speaker_embed(speakerref)

    status.update(label="synthesizing...", state="running")

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is not meant for measuring intervals.
    start_time = time.perf_counter()
    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # Audio seconds produced per second of computation; guard against a zero
    # interval so the status message can never fail with ZeroDivisionError.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    message = f"synth time: {elapsed_time:.2f} sec"
    message += f", voice length: {wav_len:.2f} sec"
    message += f", rtf: {real_time_factor:.2f}"

    st.session_state.message = message
    st.session_state.wav = wav
    st.session_state.autoplay = True
# Page configuration followed by the introductory markdown shown at the top.
st.set_page_config(
    page_title="ZeroVOX TTS Demo",
    page_icon=':speech_balloon:',
    layout="centered",
    initial_sidebar_state="auto",
    menu_items=None,
)

st.markdown(
    "# ZeroVOX TTS Demo\n\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)
# Two tabs: voice selection and MEL decoder selection.
tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped -- confirm the layout against the running app.
with tab1:
    st.checkbox("Custom voice", key='custom_voice')

    col1, col2 = st.columns([0.6, 0.4], vertical_alignment="bottom")

    with col1:
        # Placeholder that receives either the file uploader or the voice selector.
        speakerref = st.empty()

    if st.session_state.custom_voice:
        # Create a file uploader that accepts only .wav files
        uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])

        # Process the uploaded file
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile() as f:
                f.write(uploaded_file.read())
                # The file is re-opened by name below; without flushing, the
                # tail of the data may still sit in Python's write buffer and
                # librosa would read a truncated file.
                f.flush()
                wav, sr = librosa.load(f.name, sr=SAMPLE_RATE)

            # Keep the decoded sample for do_synth() and offer it for playback.
            st.session_state.voice_wav = wav
            st.audio(wav, sample_rate=SAMPLE_RATE)
    else:
        speakers = list(ZeroVoxTTS.available_speakerrefs())
        speakerref.selectbox("Voice", speakers, key='speakerref')

        with col2:
            # Preview of the currently selected built-in speaker reference.
            st.audio(
                ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE),
                sample_rate=SAMPLE_RATE,
            )
with tab2:
    # The selection is stored under the 'meldec' key, which do_synth() reads
    # to decide whether the model has to be reloaded.
    #
    # Entries that were disabled (commented out) in the original list:
    #   ../models/meldec-zerovox-de-hifigan-v1-0
    #   ../models/meldec-libritts-hifigan-v1
    #   ../models/meldec-libritts-multi-band-melgan-v2
    #   ../models/meldec-libritts-parallel-wavegan-v1
    #   ../models/meldec-libritts-parallel-wavegan-v1-long
    #   ../models/meldec-libritts-style-melgan-v1
    #   ../models/meldec-vctk-hifigan-v1
    #   ../models/meldec-vctk-multi-band-melgan-v2
    #   ../models/meldec-vctk-style-melgan-v1
    meldec = st.selectbox(
        label="MEL decoder",
        options=[
            "meldec-libritts-multi-band-melgan-v2",
            "meldec-libritts-hifigan-v1",
        ],
        key='meldec',
    )
# Status element showing the message stored by the last synthesis run;
# do_synth() updates it while it is working.
status = st.status(st.session_state.message, state="complete")

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped -- in particular, confirm that the button
# belongs at the top level rather than inside the second column.
col1, col2 = st.columns([0.8, 0.2])

with col1:
    # Editing the text triggers a synthesis run.
    text = st.text_area(
        "Text to synthesize",
        key='text',
        on_change=do_synth,
        height=128,
    )

with col2:
    # Switching the language swaps in the matching sample sentence.
    lang = st.selectbox(
        "Language",
        ["en", "de"],
        on_change=update_text_input,
        key='lang',
    )

st.button("Synthesize!", type="primary", on_click=do_synth)

# Show a player once audio has been synthesized, otherwise an empty placeholder.
if 'wav' not in st.session_state:
    playback = st.empty()
else:
    playback = st.audio(
        st.session_state.wav,
        sample_rate=SAMPLE_RATE,
        autoplay=st.session_state.autoplay,
    )