Spaces:

leeoxiang
/

tts-streaming-latency

Runtime error

App Files Files Community

leolxliu committed on Oct 24, 2023

Commit

4958e9a

1 Parent(s): 7d2db7e

add more code

Browse files

Files changed (2) hide show

app.py +187 -4
requirement.txt +6 -0

app.py CHANGED Viewed

@@ -1,7 +1,190 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import os
+import time
+import azure.cognitiveservices.speech as speechsdk
+from pyht import Client
+from pyht.client import TTSOptions
+import requests
+# Sample sentence for exercising the TTS services (same wording as the
+# gr.Examples entry below).
+# NOTE(review): nothing in the visible code reads this module-level name --
+# every probe takes its own `text` parameter -- confirm it is still needed.
+text = 'Today is Sunday, the weather is sunny. I am here to test the delay of various TTS services thoroughly'
+def azure_tts(text):
+    """Measure Azure Speech time-to-first-audio for *text*.
+
+    Credentials come from the SPEECH_KEY and SPEECH_REGION environment
+    variables. Synthesis is requested as 16 kHz mono MP3 into a pull stream,
+    and the function times how long it takes until the first read of up to
+    512 bytes from that stream returns.
+
+    Args:
+        text: The sentence to synthesize.
+
+    Returns:
+        Seconds elapsed from issuing the synthesis request until the first
+        read from the output stream returned.
+
+    Raises:
+        RuntimeError: If SPEECH_KEY or SPEECH_REGION is not set.
+    """
+    speech_key = os.getenv('SPEECH_KEY')
+    speech_region = os.getenv('SPEECH_REGION')
+    if speech_key is None or speech_region is None:
+        # Raise rather than call exit(1) so a missing setting surfaces as an
+        # ordinary error to the caller instead of a SystemExit.
+        raise RuntimeError(
+            'Please set the environment variables SPEECH_KEY and SPEECH_REGION')
+    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
+    speech_config.speech_synthesis_voice_name = 'en-US-JennyNeural'
+    speech_config.speech_synthesis_language = "en-US"
+    speech_config.set_speech_synthesis_output_format(
+        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
+    pull_stream = speechsdk.audio.PullAudioOutputStream()
+    stream_config = speechsdk.audio.AudioOutputConfig(stream=pull_stream)
+    speech_synthesizer = speechsdk.SpeechSynthesizer(
+        speech_config=speech_config, audio_config=stream_config)
+    audio_buffer = bytes(512)
+    # Start the clock before the request is issued so the figure covers the
+    # whole request-to-first-audio interval, like the other providers' probes.
+    start = time.perf_counter()
+    speech_synthesizer.speak_text_async(text)
+    pull_stream.read(audio_buffer)
+    return time.perf_counter() - start
+def coqui_tts(text):
+    """Measure Coqui XTTS streaming time-to-first-chunk for *text*.
+
+    The API token is read from the COQUI_TOKEN environment variable. The
+    clock starts just before the POST is sent and stops when the first
+    chunk (up to 512 bytes) of the streamed response body arrives.
+
+    Args:
+        text: The sentence to synthesize.
+
+    Returns:
+        Seconds until the first body chunk arrived, or 0 if the endpoint did
+        not answer with status 201 or the body contained no data.
+
+    Raises:
+        RuntimeError: If COQUI_TOKEN is not set.
+    """
+    voice_id = 'c791b5b5-0558-42b8-bb0b-602ac5efc0b9'
+    # os.getenv is a function; subscripting it raised TypeError on every call.
+    coqui_api_token = os.getenv("COQUI_TOKEN")
+    if coqui_api_token is None:
+        raise RuntimeError('Please set the environment variable COQUI_TOKEN')
+    start = time.perf_counter()
+    # The context manager releases the streamed connection even though only
+    # the first chunk is consumed.
+    with requests.post(
+        "https://app.coqui.ai/api/v2/samples/xtts/stream",
+        json={
+            "text": text,
+            "language": 'en',
+            "voice_id": voice_id,
+        },
+        headers={"Authorization": f"Bearer {coqui_api_token}"},
+        stream=True,
+    ) as res:
+        if res.status_code != 201:
+            print(f"Endpoint failed with status code {res.status_code}:",
+                  res.content.decode("utf-8"))
+            return 0
+        for _ in res.iter_content(chunk_size=512):
+            # Only the arrival time of the first chunk matters.
+            return time.perf_counter() - start
+    # Empty body: report the same sentinel as the failure path.
+    return 0
+def elevenlab_tts(text):
+    """Measure ElevenLabs streaming time-to-first-chunk for *text*.
+
+    The API key is read from the ELEVENLAB_KEY environment variable. The
+    clock starts just before the POST is sent and stops when the first
+    chunk (up to 512 bytes) of the streamed response body arrives.
+
+    Args:
+        text: The sentence to synthesize.
+
+    Returns:
+        Seconds until the first body chunk arrived, or 0 if the endpoint
+        answered with an error status or the body contained no data.
+
+    Raises:
+        RuntimeError: If ELEVENLAB_KEY is not set.
+    """
+    voice_id = '21m00Tcm4TlvDq8ikWAM'
+    chunk_size = 512
+    url = f'https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream'
+    # os.getenv is a function; subscripting it raised TypeError on every call.
+    xi_api_key = os.getenv('ELEVENLAB_KEY')
+    if xi_api_key is None:
+        # Raise rather than call exit(1) so a missing setting surfaces as an
+        # ordinary error to the caller instead of a SystemExit.
+        raise RuntimeError('Please set the environment variable ELEVENLAB_KEY')
+    headers = {
+        "Accept": "audio/mpeg",
+        "Content-Type": "application/json",
+        "xi-api-key": xi_api_key,
+    }
+    data = {
+        "text": text,
+        "model_id": "eleven_multilingual_v2",
+        "voice_settings": {
+            "stability": 0.5,
+            "similarity_boost": 0.5,
+        },
+    }
+    start = time.perf_counter()
+    # The context manager releases the streamed connection even though only
+    # the first chunk is consumed.
+    with requests.post(url, json=data, headers=headers, stream=True) as response:
+        if not response.ok:
+            # Without this check an error payload would be timed as if it
+            # were audio.
+            print(f"Endpoint failed with status code {response.status_code}:",
+                  response.content.decode("utf-8"))
+            return 0
+        for _ in response.iter_content(chunk_size=chunk_size):
+            # Only the arrival time of the first chunk matters.
+            return time.perf_counter() - start
+    # Empty body: report the same sentinel as the failure path.
+    return 0
+def playht_tts(text):
+    """Measure PlayHT streaming time-to-first-chunk for *text*.
+
+    Credentials are read from the PLAY_HT_USER_ID and PLAY_HT_API_KEY
+    environment variables. The clock starts just before the TTS call and
+    stops when the first audio chunk is yielded by the client.
+
+    Args:
+        text: The sentence to synthesize.
+
+    Returns:
+        Seconds until the first chunk was yielded, or 0 if the stream
+        produced no chunks.
+
+    Raises:
+        RuntimeError: If PLAY_HT_USER_ID or PLAY_HT_API_KEY is not set.
+    """
+    userid = os.getenv("PLAY_HT_USER_ID")
+    api_key = os.getenv("PLAY_HT_API_KEY")
+    if userid is None or api_key is None:
+        # Raise rather than call exit(1) so a missing setting surfaces as an
+        # ordinary error to the caller instead of a SystemExit.
+        raise RuntimeError(
+            'Please set the environment variables PLAY_HT_USER_ID and PLAY_HT_API_KEY')
+    client = Client(
+        user_id=userid,
+        api_key=api_key)
+    options = TTSOptions(
+        voice="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
+        speed=5.0)
+    start = time.perf_counter()
+    for _ in client.tts(text, options):
+        # Only the arrival time of the first chunk matters.
+        return time.perf_counter() - start
+    # No audio was produced: use the 0 sentinel instead of an implicit None.
+    return 0
+# HTML heading passed to gr.HTML(title) when the page is built below.
+# NOTE(review): the heading text reads "TRTC documentation bot", which does
+# not describe the TTS latency comparison this app performs -- presumably
+# carried over from another project; confirm the intended title.
+title = """<h1 align="center">🔥TRTC 文档机器人🚀</h1>"""
+def greet(input):
+    """Run all four TTS latency probes on *input* and report the results.
+
+    The probes run in the order Azure, Coqui, ElevenLabs, PlayHT. The
+    combined report is printed to stdout and also returned so the UI can
+    display it.
+    """
+    azure_s = azure_tts(input)
+    coqui_s = coqui_tts(input)
+    eleven_s = elevenlab_tts(input)
+    playht_s = playht_tts(input)
+    # Build the report once; adjacent f-strings concatenate into one string.
+    report = (
+        f'Elevenlab TTS Delay, Time to first chunk {eleven_s}s \n'
+        f' Azure TTS Delay, Time to first chunk {azure_s}s \n'
+        f' Coqui TTS Delay, Time to first chunk {coqui_s}s \n'
+        f' Pyht TTS Delay, Time to first chunk {playht_s}s'
+    )
+    print(report)
+    return report
+# Build the page: heading, an input box, an output box and a submit button.
+# Pressing Enter in the input box and clicking the button both call greet().
+compact_theme = gr.themes.Default(
+    spacing_size=gr.themes.sizes.spacing_sm,
+    radius_size=gr.themes.sizes.radius_sm,
+    text_size=gr.themes.sizes.text_sm,
+)
+with gr.Blocks(theme=compact_theme) as demo:
+    gr.HTML(title)
+    with gr.Row():
+        txt = gr.Textbox(show_label=False, lines=1,
+                         placeholder='input the text to run ')
+        outtxt = gr.Textbox(show_label=False, lines=4,
+                            placeholder='the output text')
+        txt.submit(greet, [txt], [outtxt])
+        # Label typo fixed ("Submmit" -> "Submit").
+        submit = gr.Button(value="Submit", variant="secondary").style(
+            full_width=False)
+        submit.click(greet, [txt], [outtxt])
+    gr.Examples(
+        label="for example",
+        examples=[
+            "Today is Sunday, the weather is sunny. I am here to test the delay of various TTS services thoroughly",
+        ],
+        inputs=txt,
+    )
+demo.launch()

requirement.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+azure-cognitiveservices-speech==1.28.0
+xtts
+pyht==0.0.16
+requests==2.31.0
+gradio==3.32.0
+gradio_client==0.2.5