Spaces: Running on Zero
new TTS: orpheus; sesame voice sample
Browse files
- README.md +2 -0
- app/models.py +28 -7
- test_tts_orpheus.py +50 -0
- test_tts_sesame.py +50 -0
- voice_samples/read_speech_a.wav +3 -0
README.md
CHANGED
@@ -12,6 +12,7 @@ pinned: true
|
|
12 |
short_description: Blind vote on HF TTS models!
|
13 |
models:
|
14 |
- amphion/MaskGCT
|
|
|
15 |
- coqui/XTTS-v2
|
16 |
- fishaudio/fish-speech-1.4
|
17 |
- fishaudio/fish-speech-1.5
|
@@ -31,6 +32,7 @@ models:
|
|
31 |
- parler-tts/parler-tts-mini-v1
|
32 |
- parler-tts/parler-tts-mini-expresso
|
33 |
- Pendrokar/xvapitch_expresso
|
|
|
34 |
- SparkAudio/Spark-TTS-0.5B
|
35 |
- SWivid/F5-TTS
|
36 |
- WhisperSpeech/WhisperSpeech
|
|
|
12 |
short_description: Blind vote on HF TTS models!
|
13 |
models:
|
14 |
- amphion/MaskGCT
|
15 |
+
- canopylabs/orpheus-3b-0.1-ft
|
16 |
- coqui/XTTS-v2
|
17 |
- fishaudio/fish-speech-1.4
|
18 |
- fishaudio/fish-speech-1.5
|
|
|
32 |
- parler-tts/parler-tts-mini-v1
|
33 |
- parler-tts/parler-tts-mini-expresso
|
34 |
- Pendrokar/xvapitch_expresso
|
35 |
+
- sesame/csm-1b
|
36 |
- SparkAudio/Spark-TTS-0.5B
|
37 |
- SWivid/F5-TTS
|
38 |
- WhisperSpeech/WhisperSpeech
|
app/models.py
CHANGED
@@ -104,6 +104,9 @@ AVAILABLE_MODELS = {
|
|
104 |
# Sesame
|
105 |
'sesame/csm-1b' : 'sesame/csm-1b',
|
106 |
|
|
|
|
|
|
|
107 |
# HF TTS w issues
|
108 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
109 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
@@ -486,12 +489,21 @@ HF_SPACES = {
|
|
486 |
},
|
487 |
|
488 |
'sesame/csm-1b' : {
|
489 |
-
'name': '
|
490 |
'function': '/infer',
|
491 |
'text_param_index': 'gen_conversation_input',
|
492 |
'return_audio_index': 0,
|
493 |
'is_zero_gpu_space': True,
|
494 |
-
'series': '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
495 |
},
|
496 |
}
|
497 |
|
@@ -779,13 +791,22 @@ OVERRIDE_INPUTS = {
|
|
779 |
'prompt_wav_record': None,
|
780 |
},
|
781 |
|
782 |
-
#
|
783 |
'sesame/csm-1b' : {
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
789 |
}
|
790 |
|
791 |
# minor mods to model from the same space
|
|
|
104 |
# Sesame
|
105 |
'sesame/csm-1b' : 'sesame/csm-1b',
|
106 |
|
107 |
+
# Orpheus
|
108 |
+
'MohamedRashad/Orpheus-TTS' : 'MohamedRashad/Orpheus-TTS',
|
109 |
+
|
110 |
# HF TTS w issues
|
111 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
112 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
489 |
},
|
490 |
|
491 |
'sesame/csm-1b' : {
|
492 |
+
'name': 'CSM 1B',
|
493 |
'function': '/infer',
|
494 |
'text_param_index': 'gen_conversation_input',
|
495 |
'return_audio_index': 0,
|
496 |
'is_zero_gpu_space': True,
|
497 |
+
'series': 'CSM-1B',
|
498 |
+
},
|
499 |
+
|
500 |
+
'MohamedRashad/Orpheus-TTS' : {
|
501 |
+
'name': 'Orpheus 3B 0.1',
|
502 |
+
'function': '/generate_speech',
|
503 |
+
'text_param_index': 'text',
|
504 |
+
'return_audio_index': 0,
|
505 |
+
'is_zero_gpu_space': True,
|
506 |
+
'series': 'Orpheus',
|
507 |
},
|
508 |
}
|
509 |
|
|
|
791 |
'prompt_wav_record': None,
|
792 |
},
|
793 |
|
794 |
+
# csm-1b
|
795 |
'sesame/csm-1b' : {
|
796 |
+
'text_prompt_speaker_a': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.',
|
797 |
+
'text_prompt_speaker_b': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.', #second speaker unused
|
798 |
+
'audio_prompt_speaker_a': handle_file('voice_samples/read_speech_a.wav'),
|
799 |
+
'audio_prompt_speaker_b': handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
|
800 |
},
|
801 |
+
|
802 |
+
# Orpheus 3B 0.1
|
803 |
+
'MohamedRashad/Orpheus-TTS' : {
|
804 |
+
'voice': 'tara',
|
805 |
+
'temperature': 0.6,
|
806 |
+
'top_p': 0.95,
|
807 |
+
'repetition_penalty': 1.1,
|
808 |
+
'max_new_tokens': 1200,
|
809 |
+
}
|
810 |
}
|
811 |
|
812 |
# minor mods to model from the same space
|
test_tts_orpheus.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from test_overrides import _get_param_examples, _override_params
|
3 |
+
from gradio_client import Client, file
|
4 |
+
|
5 |
+
model = "MohamedRashad/Orpheus-TTS"
|
6 |
+
client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
7 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
8 |
+
# print(endpoints)
|
9 |
+
|
10 |
+
api_name = '/generate_speech'
|
11 |
+
fn_index = None
|
12 |
+
end_parameters = None
|
13 |
+
text = 'This is what my voice sounds like.'
|
14 |
+
|
15 |
+
end_parameters = _get_param_examples(
|
16 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
17 |
+
)
|
18 |
+
print(end_parameters)
|
19 |
+
|
20 |
+
|
21 |
+
space_inputs = end_parameters
|
22 |
+
# override some or all default parameters
|
23 |
+
space_inputs = _override_params(end_parameters, model)
|
24 |
+
|
25 |
+
if(type(space_inputs) == dict):
|
26 |
+
space_inputs['text'] = text
|
27 |
+
result = client.predict(
|
28 |
+
**space_inputs,
|
29 |
+
api_name=api_name,
|
30 |
+
fn_index=fn_index
|
31 |
+
)
|
32 |
+
else:
|
33 |
+
space_inputs[0] = text
|
34 |
+
result = client.predict(
|
35 |
+
*space_inputs,
|
36 |
+
api_name=api_name,
|
37 |
+
fn_index=fn_index
|
38 |
+
)
|
39 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
40 |
+
|
41 |
+
print(space_inputs)
|
42 |
+
# print(*space_inputs)
|
43 |
+
# print(**space_inputs)
|
44 |
+
|
45 |
+
# result = client.predict(
|
46 |
+
# **space_inputs,
|
47 |
+
# api_name=api_name,
|
48 |
+
# fn_index=fn_index
|
49 |
+
# )
|
50 |
+
print(result)
|
test_tts_sesame.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from test_overrides import _get_param_examples, _override_params
|
3 |
+
from gradio_client import Client, file
|
4 |
+
|
5 |
+
model = "sesame/csm-1b"
|
6 |
+
client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
7 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
8 |
+
# print(endpoints)
|
9 |
+
|
10 |
+
api_name = '/infer'
|
11 |
+
fn_index = None
|
12 |
+
end_parameters = None
|
13 |
+
text = 'This is what my voice sounds like.'
|
14 |
+
|
15 |
+
end_parameters = _get_param_examples(
|
16 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
17 |
+
)
|
18 |
+
print(end_parameters)
|
19 |
+
|
20 |
+
|
21 |
+
space_inputs = end_parameters
|
22 |
+
# override some or all default parameters
|
23 |
+
space_inputs = _override_params(end_parameters, model)
|
24 |
+
|
25 |
+
if(type(space_inputs) == dict):
|
26 |
+
space_inputs['gen_conversation_input'] = text
|
27 |
+
result = client.predict(
|
28 |
+
**space_inputs,
|
29 |
+
api_name=api_name,
|
30 |
+
fn_index=fn_index
|
31 |
+
)
|
32 |
+
else:
|
33 |
+
space_inputs[0] = text
|
34 |
+
result = client.predict(
|
35 |
+
*space_inputs,
|
36 |
+
api_name=api_name,
|
37 |
+
fn_index=fn_index
|
38 |
+
)
|
39 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
40 |
+
|
41 |
+
print(space_inputs)
|
42 |
+
# print(*space_inputs)
|
43 |
+
# print(**space_inputs)
|
44 |
+
|
45 |
+
# result = client.predict(
|
46 |
+
# **space_inputs,
|
47 |
+
# api_name=api_name,
|
48 |
+
# fn_index=fn_index
|
49 |
+
# )
|
50 |
+
print(result)
|
voice_samples/read_speech_a.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59480708f84c77ab2967d14d821c2ccade9d7761685d060575121f49a149005b
|
3 |
+
size 831412
|