TTS: llasa; gradio 5.13;
Files changed:
- README.md +2 -1
- app/models.py +18 -4
- app/sample_caching.py +32 -0
- app/synth.py +5 -36
- test_tts_cosyvoice.py +11 -5
- test_tts_oute.py +5 -3
- voice_samples/EN_B00004_S00051_W000125.json +1 -0
README.md
CHANGED
@@ -19,6 +19,7 @@ models:
 - lj1995/GPT-SoVITS
 - metavoiceio/metavoice-1B-v0.1
 - myshell-ai/MeloTTS-English-v2
+- myshell-ai/MeloTTS-English-v3
 - myshell-ai/OpenVoice
 - myshell-ai/OpenVoiceV2
 - OuteAI/OuteTTS-0.2-500M
@@ -28,7 +29,7 @@ models:
 - Pendrokar/xvapitch_expresso
 - SWivid/F5-TTS
 - WhisperSpeech/WhisperSpeech
-sdk_version: 5.
+sdk_version: 5.13.0
 ---
 
 [Saved votes dataset](https://huggingface.co/datasets/Pendrokar/TTS_Arena)
app/models.py
CHANGED
@@ -61,11 +61,14 @@ AVAILABLE_MODELS = {
     # GPT-SoVITS
     'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
 
-    # OuteTTS
+    # OuteTTS 500M
     # 'OuteAI/OuteTTS-0.2-500M-Demo': 'OuteAI/OuteTTS-0.2-500M-Demo',
     'ameerazam08/OuteTTS-0.2-500M-Demo': 'ameerazam08/OuteTTS-0.2-500M-Demo', # ZeroGPU Space
     # OuteTTS 1B
-    'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+    # 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+
+    # llasa 3b TTS
+    'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts',
 
     # HF TTS w issues
     # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
@@ -114,7 +117,7 @@ HF_SPACES = {
     },
     # MetaVoice
     'mrfakename/MetaVoice-1B-v0.1': {
-        'name':'MetaVoice
+        'name':'MetaVoice',
         'function': '/tts',
         'text_param_index': 0,
        'return_audio_index': 0,
@@ -186,7 +189,7 @@ HF_SPACES = {
 
     # Microsoft Edge TTS
     'innoai/Edge-TTS-Text-to-Speech': {
-        'name': 'Edge TTS',
+        'name': 'Microsoft™ Edge TTS',
         'function': '/predict',
         'text_param_index': 0,
         'return_audio_index': 0,
@@ -297,6 +300,14 @@ HF_SPACES = {
         'is_zero_gpu_space': True,
         'series': 'OuteTTS',
     },
+    'srinivasbilla/llasa-3b-tts': {
+        'name': 'llasa 3b',
+        'function': '/infer',
+        'text_param_index': 'target_text',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'llasa 3b',
+    },
 }
 
 # for zero-shot TTS - voice sample used by XTTS (11 seconds)
@@ -470,6 +481,9 @@ OVERRIDE_INPUTS = {
         'speaker_selection': "en_female_1",
         'reference_audio': None,
     },
+    'srinivasbilla/llasa-3b-tts': {
+        'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+    },
 }

Note: the new HF_SPACES entry was keyed 'OuteAI/OuteTTS-0.3-1B-Demo' in the commit as pushed, evidently a copy-paste leftover from the OuteTTS block; it is corrected to 'srinivasbilla/llasa-3b-tts' above so that the model enabled in AVAILABLE_MODELS resolves in HF_SPACES.
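The HF_SPACES entry is generic per-Space config that the arena resolves at call time. Below is a minimal sketch of how such an entry could drive a gradio_client call to the new llasa Space; the dispatch is an illustration, not the actual code in app/synth.py, and note that 'text_param_index' here holds a keyword name rather than a positional index:

import os
from gradio_client import Client, handle_file

# config mirrors the HF_SPACES / OVERRIDE_INPUTS entries added above
space = {
    'function': '/infer',
    'text_param_index': 'target_text',
    'return_audio_index': 0,
}
overrides = {
    'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
}

client = Client('srinivasbilla/llasa-3b-tts', hf_token=os.getenv('HF_TOKEN'))
kwargs = dict(overrides)
kwargs[space['text_param_index']] = 'Hello from the TTS Arena.'
result = client.predict(api_name=space['function'], **kwargs)

# the endpoint may return a single path or a tuple; pick the audio output
audio = result[space['return_audio_index']] if isinstance(result, tuple) else result
print(audio)

Passing the overrides as keyword arguments keeps per-Space quirks in config rather than in the dispatch loop.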
app/sample_caching.py
CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import itertools
 import random
 import json
+from .config import sents
 from typing import List, Tuple, Set, Dict
 from hashlib import md5, sha1
 import spaces
@@ -72,6 +73,37 @@ def get_userid(session_hash: str, request):
     # by browser session hash - Not a cookie, session hash changes on page reload
     return sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest()
 
+
+def cache_sample(path, text, model):
+    # skip caching if not a hardcoded sentence
+    if (text not in sents):
+        return False
+
+    already_cached = False
+    # check if already cached
+    for cached_sample in cached_samples:
+        # TODO: replace cached sample with a newer version?
+        if (cached_sample.transcript == text and cached_sample.modelName == model):
+            already_cached = True
+            return True
+
+    if (already_cached):
+        return False
+
+    try:
+        cached_samples.append(Sample(path, text, model))
+    except:
+        print('Error when trying to cache sample')
+        return False
+
+    # save list to JSON file
+    cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
+    try:
+        with open("_cached_samples.json", "w") as write:
+            json.dump(cached_sample_dict, write)
+    except:
+        pass
+
 # Give user a cached audio sample pair they have yet to vote on
 def give_cached_sample(session_hash: str, autoplay: bool, request: gr.Request):
     # add new userid to voting_users from Browser session hash
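cache_sample() only ever writes _cached_samples.json; this commit adds no reader. A hedged sketch of loading the file back into cached_samples at startup, assuming the module's Sample(path, text, model) class and guessing the to_dict() key names from the attribute accesses above ('filename' in particular is an assumption):

import json

def load_cached_samples(json_path="_cached_samples.json"):
    # read the list written by cache_sample(); start empty if absent or corrupt
    try:
        with open(json_path) as f:
            entries = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    # key names assumed from cached_sample.transcript / .modelName usage;
    # adjust to whatever Sample.to_dict() actually emits
    return [Sample(e['filename'], e['transcript'], e['modelName']) for e in entries]

# e.g. at module import time:
# cached_samples = load_cached_samples()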
app/synth.py
CHANGED
@@ -266,36 +266,6 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
         return inputs
 
-    def _cache_sample(text, model):
-        # skip caching if not hardcoded sentence
-        if (text not in sents):
-            return False
-
-        already_cached = False
-        # check if already cached
-        for cached_sample in cached_samples:
-            # TODO: replace cached with newer version
-            if (cached_sample.transcript == text and cached_sample.modelName == model):
-                already_cached = True
-                return True
-
-        if (already_cached):
-            return False
-
-        try:
-            cached_samples.append(Sample(results[model], text, model))
-        except:
-            print('Error when trying to cache sample')
-            return False
-
-        # save list to JSON file
-        cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
-        try:
-            with open("_cached_samples.json", "w") as write:
-                json.dump(cached_sample_dict, write)
-        except:
-            pass
-
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
@@ -306,19 +276,18 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
     # do not use multithreading when both spaces are ZeroGPU type
     if (
-        # exists
         'is_zero_gpu_space' in HF_SPACES[mdl1]
-        # is True
         and HF_SPACES[mdl1]['is_zero_gpu_space']
+
        and 'is_zero_gpu_space' in HF_SPACES[mdl2]
        and HF_SPACES[mdl2]['is_zero_gpu_space']
     ):
         # run Zero-GPU spaces one at a time
         predict_and_update_result(text, mdl1k, results, request)
-
+        cache_sample(results[mdl1k], text, mdl1k)
 
         predict_and_update_result(text, mdl2k, results, request)
-
+        cache_sample(results[mdl2k], text, mdl2k)
     else:
         # use multithreading
         thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
@@ -332,9 +301,9 @@ def synthandreturn(text, autoplay, request: gr.Request):
     thread1.join(120)
     thread2.join(120)
 
-    # cache
+    # cache each result
     for model in [mdl1k, mdl2k]:
-        _cache_sample(text, model)
+        cache_sample(results[model], text, model)
 
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
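The condition above checks both that 'is_zero_gpu_space' exists and that it is truthy, for each contender; when both are ZeroGPU Spaces the requests run one at a time, presumably to avoid contending for the shared ZeroGPU quota. The same check could be factored into a small helper; a sketch (the helper is illustrative, not repo code):

def is_zero_gpu(model: str, spaces: dict) -> bool:
    # a missing key or falsy value both mean "not a ZeroGPU Space",
    # mirroring the `in` check plus truth test in synthandreturn()
    return bool(spaces.get(model, {}).get('is_zero_gpu_space', False))

# serialize when both contenders are ZeroGPU; otherwise thread them
run_sequentially = is_zero_gpu(mdl1, HF_SPACES) and is_zero_gpu(mdl2, HF_SPACES)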
test_tts_cosyvoice.py
CHANGED
@@ -1,19 +1,25 @@
 import os
 from gradio_client import Client, handle_file
 
-client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+# client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+client = Client("tanbw/CosyVoice", hf_token=os.getenv('HF_TOKEN'))
 endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
 
 result = client.predict(
-    tts_text="CosyVoice is undergoing a comprehensive upgrade
-    mode_checkbox_group=
+    tts_text="CosyVoice is undergoing a comprehensive upgrade.",
+    # mode_checkbox_group=None,
+    mode_checkbox_group="3s极速复刻",  # "3-second rapid voice cloning" mode
     prompt_text='The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory.',
     prompt_wav_upload=handle_file("https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav"),
     prompt_wav_record=None,
     instruct_text=None,
     seed=0,
-    stream=
-    api_name="/generate_audio"
+    stream=False,
+    api_name="/generate_audio",
+
+    # tanbw/CosyVoice-specific inputs
+    sft_dropdown=None,
+    speed=1,
 )
 print(result)
test_tts_oute.py
CHANGED
@@ -12,9 +12,11 @@ result = client.predict(
     temperature=0.1,
     repetition_penalty=1.1,
     language="en",
-    speaker_selection=
-    reference_audio=
-
+    speaker_selection=None,
+    reference_audio=handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+    reference_text='Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we\'ll call Kruger Bern.',
+    # reference_audio=None,
+    # reference_text=None,
     # reference_audio=handle_file('EN_B00004_S00051_W000213.wav'),
     # reference_text="Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we'll call",
     api_name="/generate_tts"
voice_samples/EN_B00004_S00051_W000125.json
ADDED
@@ -0,0 +1 @@
+{"id": "EN_B00004_S00051_W000125", "wav": "EN_B00004/EN_B00004_S00051/mp3/EN_B00004_S00051_W000125.mp3", "text": " Unfortunately, there is a flip side. Small losses or setbacks can have an extremely negative effect on inner work life. In fact,", "duration": 10.1535, "speaker": "EN_B00004_S00051", "language": "en", "dnsmos": 3.4373}
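The added metadata pairs a reference mp3 with its transcript, duration, and a DNSMOS quality score. A minimal reader, using only field names present in the committed file:

import json

with open('voice_samples/EN_B00004_S00051_W000125.json') as f:
    meta = json.load(f)

# fields taken directly from the committed file
print(meta['id'], meta['language'], f"{meta['duration']}s", f"DNSMOS {meta['dnsmos']}")
print(meta['text'].strip())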