# app.py — "TalktoAI" Hugging Face Space by Kevin676 (commit 0770c01, ~5.19 kB).
# NOTE: the lines above this header in the scraped page were website chrome
# (avatar, "raw / history / blame / delete" links), not part of the program.
import gradio as gr
import os
os.system('pip install paddlespeech')
os.system('pip install paddlepaddle')
from transformers import AutoModel, AutoTokenizer
from TTS.api import TTS
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
tts1 = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement
enhance_model = SpectralMaskEnhancement.from_hparams(
source="speechbrain/metricgan-plus-voicebank",
savedir="pretrained_models/metricgan-plus-voicebank",
run_opts={"device":"cuda"},
)
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()
def inference(text):
    """Synthesize Chinese speech for ``text`` via the PaddleSpeech CLI.

    Writes the result to ``output.wav`` in the working directory and returns
    that path (the CLI's exit status is not checked, matching the original
    best-effort behavior).

    Fix: the original concatenated user text directly into a shell command
    (``"... --input '" + text + "' ..."``), so a single quote in the input
    could break out of the quoting and execute arbitrary shell commands.
    ``shlex.quote`` renders the argument shell-safe.
    """
    import shlex  # local import so the file's top-level import block is untouched
    os.system("paddlespeech tts --input " + shlex.quote(text) + " --output output.wav")
    return "output.wav"
def predict(input, history=None):
    """Run one ChatGLM chat turn.

    Feeds ``input`` together with the prior conversation ``history`` to the
    global model and returns the updated history twice (once for the chatbot
    widget, once for the state component) plus the latest reply text.
    """
    history = [] if history is None else history
    response, history = model.chat(tokenizer, input, history)
    return history, history, response
def chinese(text_cn, upload1, VoiceMicrophone1):
    """Chinese "voice cloning": TTS the text, convert it toward the user's
    reference voice, then denoise the result.

    The uploaded file takes precedence; the microphone recording is the
    fallback. Returns the path of the enhanced wav.
    """
    # Pick the reference voice: file upload wins over microphone capture.
    reference_voice = upload1 if upload1 is not None else VoiceMicrophone1
    tts.voice_conversion_to_file(
        source_wav=inference(text_cn),
        target_wav=reference_voice,
        file_path="output0.wav",
    )
    # MetricGAN+ expects a (batch, time) tensor; lengths=[1.] means full-length clip.
    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def english(text_en, upload, VoiceMicrophone):
    """English voice cloning: synthesize the text directly in the user's
    reference voice with YourTTS, then denoise the result.

    The uploaded file takes precedence; the microphone recording is the
    fallback. Returns the path of the enhanced wav.
    """
    # Pick the reference voice: file upload wins over microphone capture.
    reference_voice = upload if upload is not None else VoiceMicrophone
    tts1.tts_to_file(
        text_en.strip(),
        speaker_wav=reference_voice,
        language="en",
        file_path="output.wav",
    )
    # MetricGAN+ expects a (batch, time) tensor; lengths=[1.] means full-length clip.
    noisy = enhance_model.load_audio("output.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
with gr.Blocks() as demo:
gr.Markdown(
""" # <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
### <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
"""
)
state = gr.State([])
chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
res = gr.Textbox(lines=1, placeholder="最新的回答在这里", show_label = False).style(container=False)
with gr.Row():
# with gr.Column(scale=4):
txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
# with gr.Column(scale=1):
button = gr.Button("开始对话吧")
txt.submit(predict, [txt, state], [chatbot, state, res])
button.click(predict, [txt, state], [chatbot, state, res])
with gr.Row().style(mobile_collapse=False, equal_height=True):
inp3 = res
inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件);长语音(90s左右)效果更好", type="filepath")
inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音,与文件上传二选一即可')
btn1 = gr.Button("用喜欢的声音听一听吧(中文)")
btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
with gr.Row():
out1 = gr.Audio(label="为您合成的专属声音(中文)")
out2 = gr.Audio(label="为您合成的专属声音(英文)")
btn1.click(chinese, [inp3, inp4, inp5], [out1])
btn2.click(english, [inp3, inp4, inp5], [out2])
gr.Markdown(
""" ### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
"""
)
gr.HTML('''
<div class="footer">
<p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
</p>
<p>注:中文声音克隆实际上是通过声音转换(Voice Conversion)实现,所以输出结果可能更像是一种新的声音,效果不一定很理想,希望大家多多包涵,之后我们也会不断迭代该程序的!为了实现更好的效果,使用中文声音克隆时请尽量上传女声。
</p>
</div>
''')
demo.queue().launch(show_error=True)