Pendrokar committed on
Commit
39c25b5
·
1 Parent(s): ad6af40

new TTS: orpheus; sesame voice sample

Browse files
README.md CHANGED
@@ -12,6 +12,7 @@ pinned: true
12
  short_description: Blind vote on HF TTS models!
13
  models:
14
  - amphion/MaskGCT
 
15
  - coqui/XTTS-v2
16
  - fishaudio/fish-speech-1.4
17
  - fishaudio/fish-speech-1.5
@@ -31,6 +32,7 @@ models:
31
  - parler-tts/parler-tts-mini-v1
32
  - parler-tts/parler-tts-mini-expresso
33
  - Pendrokar/xvapitch_expresso
 
34
  - SparkAudio/Spark-TTS-0.5B
35
  - SWivid/F5-TTS
36
  - WhisperSpeech/WhisperSpeech
 
12
  short_description: Blind vote on HF TTS models!
13
  models:
14
  - amphion/MaskGCT
15
+ - canopylabs/orpheus-3b-0.1-ft
16
  - coqui/XTTS-v2
17
  - fishaudio/fish-speech-1.4
18
  - fishaudio/fish-speech-1.5
 
32
  - parler-tts/parler-tts-mini-v1
33
  - parler-tts/parler-tts-mini-expresso
34
  - Pendrokar/xvapitch_expresso
35
+ - sesame/csm-1b
36
  - SparkAudio/Spark-TTS-0.5B
37
  - SWivid/F5-TTS
38
  - WhisperSpeech/WhisperSpeech
app/models.py CHANGED
@@ -104,6 +104,9 @@ AVAILABLE_MODELS = {
104
  # Sesame
105
  'sesame/csm-1b' : 'sesame/csm-1b',
106
 
 
 
 
107
  # HF TTS w issues
108
  # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
109
  # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
@@ -486,12 +489,21 @@ HF_SPACES = {
486
  },
487
 
488
  'sesame/csm-1b' : {
489
- 'name': 'sesame/csm-1b',
490
  'function': '/infer',
491
  'text_param_index': 'gen_conversation_input',
492
  'return_audio_index': 0,
493
  'is_zero_gpu_space': True,
494
- 'series': 'Spark-TTS',
 
 
 
 
 
 
 
 
 
495
  },
496
  }
497
 
@@ -779,13 +791,22 @@ OVERRIDE_INPUTS = {
779
  'prompt_wav_record': None,
780
  },
781
 
782
- # sesame/csm-1b
783
  'sesame/csm-1b' : {
784
- "text_prompt_speaker_a": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.",
785
- "text_prompt_speaker_b": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.", #second speaker unused
786
- "audio_prompt_speaker_a": handle_file('voice_samples/read_speech_a.wav'),
787
- "audio_prompt_speaker_b": handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
788
  },
 
 
 
 
 
 
 
 
 
789
  }
790
 
791
  # minor mods to model from the same space
 
104
  # Sesame
105
  'sesame/csm-1b' : 'sesame/csm-1b',
106
 
107
+ # Orpheus
108
+ 'MohamedRashad/Orpheus-TTS' : 'MohamedRashad/Orpheus-TTS',
109
+
110
  # HF TTS w issues
111
  # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
112
  # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
 
489
  },
490
 
491
  'sesame/csm-1b' : {
492
+ 'name': 'CSM 1B',
493
  'function': '/infer',
494
  'text_param_index': 'gen_conversation_input',
495
  'return_audio_index': 0,
496
  'is_zero_gpu_space': True,
497
+ 'series': 'CSM-1B',
498
+ },
499
+
500
+ 'MohamedRashad/Orpheus-TTS' : {
501
+ 'name': 'Orpheus 3B 0.1',
502
+ 'function': '/generate_speech',
503
+ 'text_param_index': 'text',
504
+ 'return_audio_index': 0,
505
+ 'is_zero_gpu_space': True,
506
+ 'series': 'Orpheus',
507
  },
508
  }
509
 
 
791
  'prompt_wav_record': None,
792
  },
793
 
794
+ # csm-1b
795
  'sesame/csm-1b' : {
796
+ 'text_prompt_speaker_a': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.',
797
+ 'text_prompt_speaker_b': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.', #second speaker unused
798
+ 'audio_prompt_speaker_a': handle_file('voice_samples/read_speech_a.wav'),
799
+ 'audio_prompt_speaker_b': handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
800
  },
801
+
802
+ # Orpheus 3B 0.1
803
+ 'MohamedRashad/Orpheus-TTS' : {
804
+ 'voice': 'tara',
805
+ 'temperature': 0.6,
806
+ 'top_p': 0.95,
807
+ 'repetition_penalty': 1.1,
808
+ 'max_new_tokens': 1200,
809
+ }
810
  }
811
 
812
  # minor mods to model from the same space
test_tts_orpheus.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from test_overrides import _get_param_examples, _override_params
3
+ from gradio_client import Client, file
4
+
5
+ model = "MohamedRashad/Orpheus-TTS"
6
+ client = Client(model, hf_token=os.getenv('HF_TOKEN'))
7
+ endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
8
+ # print(endpoints)
9
+
10
+ api_name = '/generate_speech'
11
+ fn_index = None
12
+ end_parameters = None
13
+ text = 'This is what my voice sounds like.'
14
+
15
+ end_parameters = _get_param_examples(
16
+ endpoints['named_endpoints'][api_name]['parameters']
17
+ )
18
+ print(end_parameters)
19
+
20
+
21
+ space_inputs = end_parameters
22
+ # override some or all default parameters
23
+ space_inputs = _override_params(end_parameters, model)
24
+
25
+ if(type(space_inputs) == dict):
26
+ space_inputs['text'] = text
27
+ result = client.predict(
28
+ **space_inputs,
29
+ api_name=api_name,
30
+ fn_index=fn_index
31
+ )
32
+ else:
33
+ space_inputs[0] = text
34
+ result = client.predict(
35
+ *space_inputs,
36
+ api_name=api_name,
37
+ fn_index=fn_index
38
+ )
39
+ # space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
40
+
41
+ print(space_inputs)
42
+ # print(*space_inputs)
43
+ # print(**space_inputs)
44
+
45
+ # result = client.predict(
46
+ # **space_inputs,
47
+ # api_name=api_name,
48
+ # fn_index=fn_index
49
+ # )
50
+ print(result)
test_tts_sesame.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from test_overrides import _get_param_examples, _override_params
3
+ from gradio_client import Client, file
4
+
5
+ model = "sesame/csm-1b"
6
+ client = Client(model, hf_token=os.getenv('HF_TOKEN'))
7
+ endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
8
+ # print(endpoints)
9
+
10
+ api_name = '/infer'
11
+ fn_index = None
12
+ end_parameters = None
13
+ text = 'This is what my voice sounds like.'
14
+
15
+ end_parameters = _get_param_examples(
16
+ endpoints['named_endpoints'][api_name]['parameters']
17
+ )
18
+ print(end_parameters)
19
+
20
+
21
+ space_inputs = end_parameters
22
+ # override some or all default parameters
23
+ space_inputs = _override_params(end_parameters, model)
24
+
25
+ if(type(space_inputs) == dict):
26
+ space_inputs['gen_conversation_input'] = text
27
+ result = client.predict(
28
+ **space_inputs,
29
+ api_name=api_name,
30
+ fn_index=fn_index
31
+ )
32
+ else:
33
+ space_inputs[0] = text
34
+ result = client.predict(
35
+ *space_inputs,
36
+ api_name=api_name,
37
+ fn_index=fn_index
38
+ )
39
+ # space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
40
+
41
+ print(space_inputs)
42
+ # print(*space_inputs)
43
+ # print(**space_inputs)
44
+
45
+ # result = client.predict(
46
+ # **space_inputs,
47
+ # api_name=api_name,
48
+ # fn_index=fn_index
49
+ # )
50
+ print(result)
voice_samples/read_speech_a.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59480708f84c77ab2967d14d821c2ccade9d7761685d060575121f49a149005b
3
+ size 831412