# TTS-GPT-SoVITS / main.py
# lijiacai's picture
# add stt
# 3a17e04
from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
import gradio as gr
import os
from enum import Enum
import uvicorn
import time
import tempfile
try:
    from model import text_to_speech, speech_to_text
except Exception:
    # The real model could not be imported, so fall back to lightweight stubs
    # that let the API and the demo page start anyway.  ``Exception`` (not a
    # bare ``except``) keeps KeyboardInterrupt / SystemExit propagating.

    def text_to_speech(voice, text):
        """Stub TTS: return the path of a pre-recorded sample for ``voice``.

        Args:
            voice: Voice name, either a plain string or an ``Enum`` member
                whose ``value`` is the name.
            text: Ignored by the stub.

        Returns:
            The path ``static/zh/<voice name>.mp3``.
        """
        # Formatting a (str, Enum) member directly yields "Class.member" on
        # Python 3.11+, so always format the underlying value.
        voice_name = getattr(voice, "value", voice)
        return f"static/zh/{voice_name}.mp3"

    def speech_to_text(voice: str):
        """Stub STT: ignore the audio path and return a fixed 2-tuple.

        Returns:
            ``("文本测试", "")`` — a placeholder transcript and an empty
            second element.
        """
        return "文本测试", ""
# Markdown blurb passed to FastAPI as the API description.
description = """
## [接口文档](/docs)
## [效果演示](/)
## 功能:
- 零样本文本到语音(TTS): 输入 5 秒的声音样本,即刻体验文本到语音转换。
- 少样本 TTS: 仅需 1 分钟的训练数据即可微调模型,提升声音相似度和真实感。
- 跨语言支持: 支持与训练数据集不同语言的推理,目前支持英语、日语和中文。
- 支持语音转文本/文本转语音
"""

# The ASGI application object that routes and mounts are attached to.
app = FastAPI(
    title="text to speech",
    description=description,
)
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Attach the time spent handling the request to every response.

    Args:
        request: The incoming HTTP request.
        call_next: Awaitable callable that forwards the request and returns
            the response.

    Returns:
        The response, with an ``X-Process-Time`` header holding the elapsed
        seconds as a string.
    """
    # perf_counter() is monotonic: unlike time.time(), the measured duration
    # cannot jump or go negative when the system clock is adjusted.
    started = time.perf_counter()
    response = await call_next(request)
    response.headers["X-Process-Time"] = str(time.perf_counter() - started)
    return response
# Serve files from the local "static" directory under the /static URL prefix.
app.mount(path="/static", app=StaticFiles(directory="static"), name="static")
class Language(str, Enum):
    """Language choices (English and Chinese); members are also ``str`` values."""
    en = "English"
    zh = "中文"
class DefaultVoice(str, Enum):
    """Voice presets accepted by the /tts endpoint; members are also ``str`` values."""
    voice1 = "新闻小说主播-女士"
    voice2 = "温柔女士"
@app.post("/tts")
async def tts(
    # The default must be a real DefaultVoice member; the previous literal
    # ("新闻女士") matched none of the enum values.
    voice: DefaultVoice = Form(DefaultVoice.voice1),
    text: str = Form(..., description="转换文本"),
):
    """Convert ``text`` to speech with the chosen voice and return the audio.

    Args:
        voice: Voice preset; defaults to the first ``DefaultVoice`` member.
        text: Text to synthesise (required form field).

    Returns:
        A file download response for the audio file produced by
        ``text_to_speech``, labelled ``audio/wav``.
    """
    # Hand over the plain string, the same form the demo page passes.
    voice_name = voice.value if isinstance(voice, Enum) else voice
    wav_path = text_to_speech(voice=voice_name, text=text)
    # FileResponse sends the file without a blocking read in this coroutine
    # and encodes non-ASCII file names in Content-Disposition; only the base
    # name is exposed rather than the server-side path.
    return FileResponse(
        wav_path,
        media_type="audio/wav",
        filename=os.path.basename(wav_path),
    )
@app.post("/stt")
async def stt(
    voice: UploadFile = File(...),
):
    """Transcribe an uploaded audio file.

    Args:
        voice: Uploaded audio (required multipart file field).

    Returns:
        ``{"text": <transcript>}``; the second value returned by
        ``speech_to_text`` is discarded.
    """
    contents = await voice.read()
    # Keep the upload's extension on the temporary file.
    # NOTE(review): presumably speech_to_text can use it to detect the audio
    # format — confirm against the model code.
    suffix = os.path.splitext(voice.filename or "")[1]
    with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
        tmp.write(contents)
        # speech_to_text only receives the path, so make sure every byte has
        # reached the file before handing it over.
        tmp.flush()
        text, _ = speech_to_text(tmp.name)
    return {"text": text}
class Demo:
    """Gradio demo page with a text-to-speech and a speech-to-text panel.

    The assembled ``gr.Blocks`` UI is stored on ``self.page``.
    """

    title = "text to speech"
    description = description

    def __init__(self):
        """Build the two-column layout and wire both buttons to their handlers."""
        # Offer exactly the voices declared in DefaultVoice instead of a
        # second hard-coded copy of the same names.
        voices = [member.value for member in DefaultVoice]

        with gr.Blocks(theme=gr.themes.Soft()) as self.page:
            with gr.Row():
                gr.Markdown(value=self.description)
            with gr.Row():
                # Left column: text -> speech.
                with gr.Column(scale=2):
                    with gr.Row():
                        text_tts = gr.Textbox(label="请输入需要转换的文本")
                    with gr.Row():
                        # Preselect the first voice so the handler never
                        # receives an empty selection by default.
                        voice_tts = gr.Dropdown(
                            voices,
                            value=voices[0],
                            label="选择音色",
                        )
                    with gr.Row():
                        audio_tts = gr.Audio(
                            label="转换后的音频", type="filepath", scale=3)
                    with gr.Row():
                        button_tts = gr.Button(value="文本转语音")
                # Right column: speech -> text.
                with gr.Column(scale=2):
                    audio_stt = gr.Audio(
                        label="上传语音", type="filepath", scale=3)
                    with gr.Row():
                        # This button runs speech-to-text; it previously
                        # carried the text-to-speech label.
                        button_stt = gr.Button(value="语音转文本")
                    # NOTE(review): assumed to sit below the button row
                    # rather than inside it — confirm the intended layout.
                    text_stt = gr.Text(label="结果")

            # Events
            button_tts.click(
                self.click_run_button_tts,
                inputs=[voice_tts, text_tts],
                outputs=[audio_tts],
            )
            button_stt.click(
                self.click_run_button_stt,
                inputs=[audio_stt],
                outputs=[text_stt],
            )

    def click_run_button_tts(self, voice, text):
        """Synthesise ``text`` with ``voice`` and return the audio file path.

        Returns ``None`` (no audio) when the voice or the text is empty.
        """
        if not voice or not text:
            return None
        return text_to_speech(voice=voice, text=text)

    def click_run_button_stt(self, audio):
        """Transcribe the audio file at path ``audio`` and return the text.

        Returns an empty string when no audio has been provided.
        """
        if not audio:
            return ""
        text, _ = speech_to_text(voice=audio)
        return text
# Mount the Gradio demo page at the site root.
gr.mount_gradio_app(app, Demo().page, path="/")

if __name__ == "__main__":
    # Pass the app object itself: the "main:app" import string makes uvicorn
    # import this file a second time (as module ``main``) when it is run as a
    # script, executing all module-level setup twice.
    uvicorn.run(
        app,
        host="0.0.0.0",
        # PORT from the environment wins; 7860 is the fallback.
        port=int(os.environ.get("PORT", 7860)),
    )