Pendrokar committed
Commit 51adc6d · 1 Parent(s): 780b4ef

TTS: llasa; gradio 5.13;

README.md CHANGED
@@ -19,6 +19,7 @@ models:
 - lj1995/GPT-SoVITS
 - metavoiceio/metavoice-1B-v0.1
 - myshell-ai/MeloTTS-English-v2
+- myshell-ai/MeloTTS-English-v3
 - myshell-ai/OpenVoice
 - myshell-ai/OpenVoiceV2
 - OuteAI/OuteTTS-0.2-500M
@@ -28,7 +29,7 @@ models:
 - Pendrokar/xvapitch_expresso
 - SWivid/F5-TTS
 - WhisperSpeech/WhisperSpeech
-sdk_version: 5.4.0
+sdk_version: 5.13.0
 ---
 
 [Saved votes dataset](https://huggingface.co/datasets/Pendrokar/TTS_Arena)
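
Note: sdk_version in the README front matter pins the Gradio build that Hugging Face runs the Space on, so this bump is what actually moves the app to the Gradio 5.13 named in the commit message.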
app/models.py CHANGED
@@ -61,11 +61,14 @@ AVAILABLE_MODELS = {
     # GPT-SoVITS
     'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
 
-    # OuteTTS
+    # OuteTTS 500M
     # 'OuteAI/OuteTTS-0.2-500M-Demo': 'OuteAI/OuteTTS-0.2-500M-Demo',
     'ameerazam08/OuteTTS-0.2-500M-Demo': 'ameerazam08/OuteTTS-0.2-500M-Demo', # ZeroGPU Space
     # OuteTTS 1B
-    'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+    # 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+
+    # llasa 3b TTS
+    'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts',
 
     # HF TTS w issues
     # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
@@ -114,7 +117,7 @@ HF_SPACES = {
     },
     # MetaVoice
     'mrfakename/MetaVoice-1B-v0.1': {
-        'name':'MetaVoice-1B',
+        'name':'MetaVoice',
         'function': '/tts',
         'text_param_index': 0,
         'return_audio_index': 0,
@@ -186,7 +189,7 @@ HF_SPACES = {
 
     # Microsoft Edge TTS
     'innoai/Edge-TTS-Text-to-Speech': {
-        'name': 'Edge TTS',
+        'name': 'Microsoft™ Edge TTS',
         'function': '/predict',
         'text_param_index': 0,
         'return_audio_index': 0,
@@ -297,6 +300,14 @@ HF_SPACES = {
         'is_zero_gpu_space': True,
         'series': 'OuteTTS',
     },
+    'srinivasbilla/llasa-3b-tts': {  # llasa 3b ZeroGPU Space; key matches the AVAILABLE_MODELS entry
+        'name': 'llasa 3b',
+        'function': '/infer',
+        'text_param_index': 'target_text',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'llasa 3b',
+    },
 }
 
 # for zero-shot TTS - voice sample used by XTTS (11 seconds)
@@ -470,6 +481,9 @@ OVERRIDE_INPUTS = {
         'speaker_selection': "en_female_1",
         'reference_audio': None,
     },
+    'srinivasbilla/llasa-3b-tts': {
+        'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3')
+    },
 }
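
A rough sketch of how the new entry would be driven from gradio_client, assuming the srinivasbilla/llasa-3b-tts Space really exposes an /infer endpoint with a target_text keyword and accepts the sample_audio_path override declared above (the parameter names come from this config, not from a verified inspection of the Space's API):

import os
from gradio_client import Client, handle_file

# hypothetical driver for the llasa entry above
client = Client("srinivasbilla/llasa-3b-tts", hf_token=os.getenv('HF_TOKEN'))
result = client.predict(
    target_text="Hello from the TTS Arena.",  # 'text_param_index': 'target_text'
    sample_audio_path=handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),  # from OVERRIDE_INPUTS
    api_name="/infer",  # 'function': '/infer'
)
print(result)  # 'return_audio_index': 0 -> the audio is the first value returned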
 
app/sample_caching.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import itertools
 import random
 import json
+from .config import sents
 from typing import List, Tuple, Set, Dict
 from hashlib import md5, sha1
 import spaces
@@ -72,6 +73,37 @@ def get_userid(session_hash: str, request):
     # by browser session hash - Not a cookie, session hash changes on page reload
     return sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest()
 
+
+def cache_sample(path, text, model):
+    # skip caching if not a hardcoded sentence
+    if (text not in sents):
+        return False
+
+    already_cached = False
+    # check if already cached
+    for cached_sample in cached_samples:
+        # TODO: replace cached sample with a newer version?
+        if (cached_sample.transcript == text and cached_sample.modelName == model):
+            already_cached = True
+            return True
+
+    if (already_cached):
+        return False
+
+    try:
+        cached_samples.append(Sample(path, text, model))
+    except:
+        print('Error when trying to cache sample')
+        return False
+
+    # save list to JSON file
+    cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
+    try:
+        with open("_cached_samples.json", "w") as write:
+            json.dump(cached_sample_dict, write)
+    except:
+        pass
+
 # Give user a cached audio sample pair they have yet to vote on
 def give_cached_sample(session_hash: str, autoplay: bool, request: gr.Request):
     # add new userid to voting_users from Browser session hash
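
For context, a minimal usage sketch of the relocated helper; it assumes cached_samples, Sample, and the hardcoded sents list are module-level names in app/sample_caching.py, as the function body implies:

# hypothetical call site: cache a finished synthesis if its transcript
# is one of the arena's hardcoded sentences
audio_path = 'cached_samples/llasa-3b-tts_0001.wav'  # illustrative path
cache_sample(audio_path, sents[0], 'srinivasbilla/llasa-3b-tts')
# side effect: cached_samples grows and _cached_samples.json is rewritten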
app/synth.py CHANGED
@@ -266,36 +266,6 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
         return inputs
 
-    def _cache_sample(text, model):
-        # skip caching if not hardcoded sentence
-        if (text not in sents):
-            return False
-
-        already_cached = False
-        # check if already cached
-        for cached_sample in cached_samples:
-            # TODO: replace cached with newer version
-            if (cached_sample.transcript == text and cached_sample.modelName == model):
-                already_cached = True
-                return True
-
-        if (already_cached):
-            return False
-
-        try:
-            cached_samples.append(Sample(results[model], text, model))
-        except:
-            print('Error when trying to cache sample')
-            return False
-
-        # save list to JSON file
-        cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
-        try:
-            with open("_cached_samples.json", "w") as write:
-                json.dump(cached_sample_dict, write)
-        except:
-            pass
-
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
@@ -306,19 +276,18 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
     # do not use multithreading when both spaces are ZeroGPU type
     if (
-        # exists
         'is_zero_gpu_space' in HF_SPACES[mdl1]
-        # is True
         and HF_SPACES[mdl1]['is_zero_gpu_space']
+
         and 'is_zero_gpu_space' in HF_SPACES[mdl2]
         and HF_SPACES[mdl2]['is_zero_gpu_space']
     ):
         # run Zero-GPU spaces one at a time
         predict_and_update_result(text, mdl1k, results, request)
-        _cache_sample(text, mdl1k)
+        cache_sample(results[mdl1k], text, mdl1k)
 
         predict_and_update_result(text, mdl2k, results, request)
-        _cache_sample(text, mdl2k)
+        cache_sample(results[mdl2k], text, mdl2k)
     else:
         # use multithreading
         thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
@@ -332,9 +301,9 @@ def synthandreturn(text, autoplay, request: gr.Request):
         thread1.join(120)
         thread2.join(120)
 
-        # cache the result
+        # cache each result
         for model in [mdl1k, mdl2k]:
-            _cache_sample(text, model)
+            cache_sample(results[model], text, model)
 
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
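
After the move, the dispatch logic that remains in synthandreturn reduces to the following pattern (a simplified standalone sketch, not the arena's literal code; HF_SPACES, predict_and_update_result and cache_sample are the names used in the diffs above):

import threading

def run_pair(text, mdl1, mdl2, results, request):
    # two ZeroGPU Spaces are run one at a time (presumably to avoid
    # contending for the shared ZeroGPU quota); any other pairing is
    # synthesized on two threads with a 120 s join timeout
    both_zero_gpu = all(
        HF_SPACES.get(m, {}).get('is_zero_gpu_space', False) for m in (mdl1, mdl2)
    )
    if both_zero_gpu:
        for m in (mdl1, mdl2):
            predict_and_update_result(text, m, results, request)
            cache_sample(results[m], text, m)
    else:
        threads = [
            threading.Thread(target=predict_and_update_result, args=(text, m, results, request))
            for m in (mdl1, mdl2)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(120)
        for m in (mdl1, mdl2):
            cache_sample(results[m], text, m)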
test_tts_cosyvoice.py CHANGED
@@ -1,19 +1,25 @@
 import os
 from gradio_client import Client, handle_file
 
-client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+# client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+client = Client("tanbw/CosyVoice", hf_token=os.getenv('HF_TOKEN'))
 endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
 
 result = client.predict(
-    tts_text="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.",
-    mode_checkbox_group="3s Voice Clone",
+    tts_text="CosyVoice is undergoing a comprehensive upgrade.",
+    # mode_checkbox_group=None,
+    mode_checkbox_group="3s极速复刻",  # Chinese UI label, roughly "3s rapid voice clone"
    prompt_text='The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory.',
    prompt_wav_upload=handle_file("https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav"),
    prompt_wav_record=None,
    instruct_text=None,
    seed=0,
-    stream="false",
-    api_name="/generate_audio"
+    stream=False,
+    api_name="/generate_audio",
+
+    # tanbw fork parameters
+    sft_dropdown=None,
+    speed=1,
 )
 print(result)
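
Forks of the same Space can expose different signatures (this tanbw fork adds sft_dropdown and speed), so the endpoints dict fetched above is the reliable way to check parameter names before calling predict. A small sketch, assuming the dict layout gradio_client currently returns from view_api:

# list each named endpoint with the keyword names predict() expects
for name, info in endpoints.get('named_endpoints', {}).items():
    print(name, [p.get('parameter_name') for p in info.get('parameters', [])])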
test_tts_oute.py CHANGED
@@ -12,9 +12,11 @@ result = client.predict(
     temperature=0.1,
     repetition_penalty=1.1,
     language="en",
-    speaker_selection="female_1",
-    reference_audio=None,
-    reference_text=None,
+    speaker_selection=None,
+    reference_audio=handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+    reference_text='Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we\'ll call Kruger Bern.',
+    # reference_audio=None,
+    # reference_text=None,
     # reference_audio=handle_file('EN_B00004_S00051_W000213.wav'),
     # reference_text="Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we'll call",
     api_name="/generate_tts"
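
Note the switch from the named preset (speaker_selection="female_1") to zero-shot cloning: reference_text is presumably meant to be the verbatim transcript of reference_audio, which is why the full Kruger Bern sentence now travels along with the mp3.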
voice_samples/EN_B00004_S00051_W000125.json ADDED
@@ -0,0 +1 @@
+{"id": "EN_B00004_S00051_W000125", "wav": "EN_B00004/EN_B00004_S00051/mp3/EN_B00004_S00051_W000125.mp3", "text": " Unfortunately, there is a flip side. Small losses or setbacks can have an extremely negative effect on inner work life. In fact,", "duration": 10.1535, "speaker": "EN_B00004_S00051", "language": "en", "dnsmos": 3.4373}
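
This metadata sits alongside the existing voice_samples entries and appears to follow the same layout: relative mp3 path, transcript, duration in seconds, speaker and language IDs, and what looks like a DNSMOS speech-quality estimate in the dnsmos field.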