TTS: llasa; gradio 5.13;
Files changed:
- README.md +2 -1
- app/models.py +18 -4
- app/sample_caching.py +32 -0
- app/synth.py +5 -36
- test_tts_cosyvoice.py +11 -5
- test_tts_oute.py +5 -3
- voice_samples/EN_B00004_S00051_W000125.json +1 -0
README.md
CHANGED
@@ -19,6 +19,7 @@ models:
 - lj1995/GPT-SoVITS
 - metavoiceio/metavoice-1B-v0.1
 - myshell-ai/MeloTTS-English-v2
+- myshell-ai/MeloTTS-English-v3
 - myshell-ai/OpenVoice
 - myshell-ai/OpenVoiceV2
 - OuteAI/OuteTTS-0.2-500M
@@ -28,7 +29,7 @@ models:
 - Pendrokar/xvapitch_expresso
 - SWivid/F5-TTS
 - WhisperSpeech/WhisperSpeech
-sdk_version: 5.
+sdk_version: 5.13.0
 ---
 
 [Saved votes dataset](https://huggingface.co/datasets/Pendrokar/TTS_Arena)
app/models.py
CHANGED
@@ -61,11 +61,14 @@ AVAILABLE_MODELS = {
     # GPT-SoVITS
     'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
 
-    # OuteTTS
+    # OuteTTS 500M
     # 'OuteAI/OuteTTS-0.2-500M-Demo': 'OuteAI/OuteTTS-0.2-500M-Demo',
     'ameerazam08/OuteTTS-0.2-500M-Demo': 'ameerazam08/OuteTTS-0.2-500M-Demo', # ZeroGPU Space
     # OuteTTS 1B
-    'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+    # 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
+
+    # llasa 3b TTS
+    'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts',
 
     # HF TTS w issues
     # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
@@ -114,7 +117,7 @@ HF_SPACES = {
     },
     # MetaVoice
     'mrfakename/MetaVoice-1B-v0.1': {
-        'name':'MetaVoice
+        'name':'MetaVoice',
         'function': '/tts',
         'text_param_index': 0,
        'return_audio_index': 0,
@@ -186,7 +189,7 @@ HF_SPACES = {
 
     # Microsoft Edge TTS
     'innoai/Edge-TTS-Text-to-Speech': {
-        'name': 'Edge TTS',
+        'name': 'Microsoft™ Edge TTS',
         'function': '/predict',
         'text_param_index': 0,
         'return_audio_index': 0,
@@ -297,6 +300,14 @@ HF_SPACES = {
         'is_zero_gpu_space': True,
         'series': 'OuteTTS',
     },
+    'srinivasbilla/llasa-3b-tts': {
+        'name': 'llasa 3b',
+        'function': '/infer',
+        'text_param_index': 'target_text',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'llasa 3b',
+    },
 }
 
 # for zero-shot TTS - voice sample used by XTTS (11 seconds)
@@ -470,6 +481,9 @@ OVERRIDE_INPUTS = {
         'speaker_selection': "en_female_1",
         'reference_audio': None,
     },
+    'srinivasbilla/llasa-3b-tts': {
+        'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+    },
 }

Note: the new HF_SPACES entry was keyed 'OuteAI/OuteTTS-0.3-1B-Demo' in the commit as pushed, evidently a copy-paste leftover from the OuteTTS block; it is corrected to 'srinivasbilla/llasa-3b-tts' above so that the model enabled in AVAILABLE_MODELS resolves in HF_SPACES.
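The HF_SPACES entry is generic per-Space config that the arena resolves at call time. Below is a minimal sketch of how such an entry could drive a gradio_client call to the new llasa Space; the dispatch is an illustration, not the actual code in app/synth.py, and note that 'text_param_index' here holds a keyword name rather than a positional index:

import os
from gradio_client import Client, handle_file

# config mirrors the HF_SPACES / OVERRIDE_INPUTS entries added above
space = {
    'function': '/infer',
    'text_param_index': 'target_text',
    'return_audio_index': 0,
}
overrides = {
    'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
}

client = Client('srinivasbilla/llasa-3b-tts', hf_token=os.getenv('HF_TOKEN'))
kwargs = dict(overrides)
kwargs[space['text_param_index']] = 'Hello from the TTS Arena.'
result = client.predict(api_name=space['function'], **kwargs)

# the endpoint may return a single path or a tuple; pick the audio output
audio = result[space['return_audio_index']] if isinstance(result, tuple) else result
print(audio)

Passing the overrides as keyword arguments keeps per-Space quirks in config rather than in the dispatch loop.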
app/sample_caching.py
CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import itertools
 import random
 import json
+from .config import sents
 from typing import List, Tuple, Set, Dict
 from hashlib import md5, sha1
 import spaces
@@ -72,6 +73,37 @@ def get_userid(session_hash: str, request):
     # by browser session hash - Not a cookie, session hash changes on page reload
     return sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest()
 
+
+def cache_sample(path, text, model):
+    # skip caching if not a hardcoded sentence
+    if (text not in sents):
+        return False
+
+    already_cached = False
+    # check if already cached
+    for cached_sample in cached_samples:
+        # TODO: replace cached sample with a newer version?
+        if (cached_sample.transcript == text and cached_sample.modelName == model):
+            already_cached = True
+            return True
+
+    if (already_cached):
+        return False
+
+    try:
+        cached_samples.append(Sample(path, text, model))
+    except:
+        print('Error when trying to cache sample')
+        return False
+
+    # save list to JSON file
+    cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
+    try:
+        with open("_cached_samples.json", "w") as write:
+            json.dump(cached_sample_dict, write)
+    except:
+        pass
+
 # Give user a cached audio sample pair they have yet to vote on
 def give_cached_sample(session_hash: str, autoplay: bool, request: gr.Request):
     # add new userid to voting_users from Browser session hash
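cache_sample() only ever writes _cached_samples.json; this commit adds no reader. A hedged sketch of loading the file back into cached_samples at startup, assuming the module's Sample(path, text, model) class and guessing the to_dict() key names from the attribute accesses above ('filename' in particular is an assumption):

import json

def load_cached_samples(json_path="_cached_samples.json"):
    # read the list written by cache_sample(); start empty if absent or corrupt
    try:
        with open(json_path) as f:
            entries = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    # key names assumed from cached_sample.transcript / .modelName usage;
    # adjust to whatever Sample.to_dict() actually emits
    return [Sample(e['filename'], e['transcript'], e['modelName']) for e in entries]

# e.g. at module import time:
# cached_samples = load_cached_samples()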
app/synth.py
CHANGED
@@ -266,36 +266,6 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
         return inputs
 
-    def _cache_sample(text, model):
-        # skip caching if not hardcoded sentence
-        if (text not in sents):
-            return False
-
-        already_cached = False
-        # check if already cached
-        for cached_sample in cached_samples:
-            # TODO: replace cached with newer version
-            if (cached_sample.transcript == text and cached_sample.modelName == model):
-                already_cached = True
-                return True
-
-        if (already_cached):
-            return False
-
-        try:
-            cached_samples.append(Sample(results[model], text, model))
-        except:
-            print('Error when trying to cache sample')
-            return False
-
-        # save list to JSON file
-        cached_sample_dict = [cached_sample.to_dict() for cached_sample in cached_samples]
-        try:
-            with open("_cached_samples.json", "w") as write:
-                json.dump(cached_sample_dict, write)
-        except:
-            pass
-
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
@@ -306,19 +276,18 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
     # do not use multithreading when both spaces are ZeroGPU type
     if (
-        # exists
         'is_zero_gpu_space' in HF_SPACES[mdl1]
-        # is True
         and HF_SPACES[mdl1]['is_zero_gpu_space']
+
        and 'is_zero_gpu_space' in HF_SPACES[mdl2]
        and HF_SPACES[mdl2]['is_zero_gpu_space']
     ):
         # run Zero-GPU spaces one at a time
         predict_and_update_result(text, mdl1k, results, request)
-
+        cache_sample(results[mdl1k], text, mdl1k)
 
         predict_and_update_result(text, mdl2k, results, request)
-
+        cache_sample(results[mdl2k], text, mdl2k)
     else:
         # use multithreading
         thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
@@ -332,9 +301,9 @@ def synthandreturn(text, autoplay, request: gr.Request):
     thread1.join(120)
     thread2.join(120)
 
-    # cache
+    # cache each result
     for model in [mdl1k, mdl2k]:
-        _cache_sample(text, model)
+        cache_sample(results[model], text, model)
 
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
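The condition above checks both that 'is_zero_gpu_space' exists and that it is truthy, for each contender; when both are ZeroGPU Spaces the requests run one at a time, presumably to avoid contending for the shared ZeroGPU quota. The same check could be factored into a small helper; a sketch (the helper is illustrative, not repo code):

def is_zero_gpu(model: str, spaces: dict) -> bool:
    # a missing key or falsy value both mean "not a ZeroGPU Space",
    # mirroring the `in` check plus truth test in synthandreturn()
    return bool(spaces.get(model, {}).get('is_zero_gpu_space', False))

# serialize when both contenders are ZeroGPU; otherwise thread them
run_sequentially = is_zero_gpu(mdl1, HF_SPACES) and is_zero_gpu(mdl2, HF_SPACES)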
test_tts_cosyvoice.py
CHANGED
@@ -1,19 +1,25 @@
 import os
 from gradio_client import Client, handle_file
 
-client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+# client = Client("FunAudioLLM/CosyVoice2-0.5B", hf_token=os.getenv('HF_TOKEN'))
+client = Client("tanbw/CosyVoice", hf_token=os.getenv('HF_TOKEN'))
 endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
 
 result = client.predict(
-    tts_text="CosyVoice is undergoing a comprehensive upgrade
-    mode_checkbox_group=
+    tts_text="CosyVoice is undergoing a comprehensive upgrade.",
+    # mode_checkbox_group=None,
+    mode_checkbox_group="3s极速复刻",  # "3-second rapid voice cloning" mode
     prompt_text='The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory.',
     prompt_wav_upload=handle_file("https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav"),
     prompt_wav_record=None,
     instruct_text=None,
     seed=0,
-    stream=
-    api_name="/generate_audio"
+    stream=False,
+    api_name="/generate_audio",
+
+    # tanbw/CosyVoice-specific inputs
+    sft_dropdown=None,
+    speed=1,
 )
 print(result)
test_tts_oute.py
CHANGED
@@ -12,9 +12,11 @@ result = client.predict(
     temperature=0.1,
     repetition_penalty=1.1,
     language="en",
-    speaker_selection=
-    reference_audio=
-
+    speaker_selection=None,
+    reference_audio=handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+    reference_text='Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we\'ll call Kruger Bern.',
+    # reference_audio=None,
+    # reference_text=None,
     # reference_audio=handle_file('EN_B00004_S00051_W000213.wav'),
     # reference_text="Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we'll call",
     api_name="/generate_tts"
voice_samples/EN_B00004_S00051_W000125.json
ADDED
@@ -0,0 +1 @@
+{"id": "EN_B00004_S00051_W000125", "wav": "EN_B00004/EN_B00004_S00051/mp3/EN_B00004_S00051_W000125.mp3", "text": " Unfortunately, there is a flip side. Small losses or setbacks can have an extremely negative effect on inner work life. In fact,", "duration": 10.1535, "speaker": "EN_B00004_S00051", "language": "en", "dnsmos": 3.4373}
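The added metadata pairs a reference mp3 with its transcript, duration, and a DNSMOS quality score. A minimal reader, using only field names present in the committed file:

import json

with open('voice_samples/EN_B00004_S00051_W000125.json') as f:
    meta = json.load(f)

# fields taken directly from the committed file
print(meta['id'], meta['language'], f"{meta['duration']}s", f"DNSMOS {meta['dnsmos']}")
print(meta['text'].strip())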