Spaces:
Build error
Build error
Upload 22 files
Browse files- .gitattributes +2 -0
- lib/__init__.py +57 -0
- lib/__pycache__/__init__.cpython-312.pyc +0 -0
- lib/__pycache__/conf.cpython-312.pyc +0 -0
- lib/__pycache__/functions.cpython-312.pyc +3 -0
- lib/__pycache__/lang.cpython-312.pyc +3 -0
- lib/__pycache__/models.cpython-312.pyc +0 -0
- lib/classes/__pycache__/background_detector.cpython-312.pyc +0 -0
- lib/classes/__pycache__/tts_manager.cpython-312.pyc +0 -0
- lib/classes/__pycache__/voice_extractor.cpython-312.pyc +0 -0
- lib/classes/argos_translator.py +122 -0
- lib/classes/background_detector.py +37 -0
- lib/classes/redirect_console.py +51 -0
- lib/classes/tts_engines/.template.py +232 -0
- lib/classes/tts_engines/common/audio_filters.py +107 -0
- lib/classes/tts_engines/common/utils.py +57 -0
- lib/classes/tts_engines/coqui.py +810 -0
- lib/classes/tts_manager.py +37 -0
- lib/classes/voice_extractor.py +286 -0
- lib/conf.py +78 -0
- lib/functions.py +0 -0
- lib/lang.py +0 -0
- lib/models.py +493 -0
.gitattributes
CHANGED
@@ -47,3 +47,5 @@ ebook2audiobook.egg-info/assets/gui_1.png filter=lfs diff=lfs merge=lfs -text
|
|
47 |
ebook2audiobook.egg-info/assets/gui_2.png filter=lfs diff=lfs merge=lfs -text
|
48 |
ebook2audiobook.egg-info/assets/gui_3.png filter=lfs diff=lfs merge=lfs -text
|
49 |
ebook2audiobook.egg-info/assets/Rainy_Day_voice_Demo.mp4 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
47 |
ebook2audiobook.egg-info/assets/gui_2.png filter=lfs diff=lfs merge=lfs -text
|
48 |
ebook2audiobook.egg-info/assets/gui_3.png filter=lfs diff=lfs merge=lfs -text
|
49 |
ebook2audiobook.egg-info/assets/Rainy_Day_voice_Demo.mp4 filter=lfs diff=lfs merge=lfs -text
|
50 |
+
lib/__pycache__/functions.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
51 |
+
lib/__pycache__/lang.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
lib/__init__.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .models import (
|
2 |
+
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
|
3 |
+
default_engine_settings, default_vc_model, default_voice_detection_model,
|
4 |
+
loaded_tts, max_custom_model, max_custom_voices,
|
5 |
+
max_tts_in_memory, max_upload_size, models, os, voices_dir
|
6 |
+
)
|
7 |
+
|
8 |
+
from .conf import (
|
9 |
+
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
|
10 |
+
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
|
11 |
+
default_audio_proc_format, default_device, default_gpu_wiki,
|
12 |
+
default_output_format, device_list, ebook_formats,
|
13 |
+
ebooks_dir, interface_component_options, interface_concurrency_limit,
|
14 |
+
interface_host, interface_port, interface_shared_tmp_expire,
|
15 |
+
max_python_version, min_python_version, models_dir, os,
|
16 |
+
output_formats, platform, prog_version, python_env_dir,
|
17 |
+
requirements_file, tmp_dir, tmp_expire, tts_dir, voice_formats,
|
18 |
+
voices_dir, default_output_split, default_output_split_hours
|
19 |
+
)
|
20 |
+
|
21 |
+
from .lang import (
|
22 |
+
abbreviations_mapping, chapter_word_mapping, default_language_code,
|
23 |
+
roman_numbers_tuples, emojis_list, install_info, language_mapping,
|
24 |
+
language_math_phonemes, language_clock, language_tts, os, punctuation_list,
|
25 |
+
punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
|
26 |
+
punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
|
27 |
+
specialchars_mapping, specialchars_remove, year_to_decades_languages
|
28 |
+
)
|
29 |
+
|
30 |
+
# Public names re-exported by ``from lib import *``.
# Every name appears exactly once: "os" and "voices_dir" are imported from
# more than one submodule above, but listing them repeatedly adds nothing.
__all__ = [
    # from models
    "TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
    "default_engine_settings", "default_vc_model", "default_voice_detection_model",
    "loaded_tts", "max_custom_model",
    "max_custom_voices", "max_tts_in_memory", "max_upload_size",
    "models", "os", "voices_dir",

    # from conf
    "FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
    "audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
    "default_audio_proc_format", "default_device", "default_gpu_wiki",
    "default_output_format", "device_list", "ebook_formats", "ebooks_dir",
    "interface_component_options", "interface_concurrency_limit",
    "interface_host", "interface_port", "interface_shared_tmp_expire",
    "max_python_version", "min_python_version", "models_dir",
    "output_formats", "platform", "prog_version", "python_env_dir",
    "requirements_file", "tmp_dir", "tmp_expire", "tts_dir",
    "voice_formats", "default_output_split", "default_output_split_hours",

    # from lang
    "abbreviations_mapping", "chapter_word_mapping", "default_language_code",
    "roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
    "language_math_phonemes", "language_clock", "language_tts", "punctuation_list",
    "punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
    "punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
    "specialchars_mapping", "specialchars_remove", "year_to_decades_languages",
]
|
lib/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (2.64 kB). View file
|
|
lib/__pycache__/conf.cpython-312.pyc
ADDED
Binary file (4.98 kB). View file
|
|
lib/__pycache__/functions.cpython-312.pyc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59b1809dd2e4e86864d8ff51fbdade7548389b92cd6f3b24d9e9a54235eb0de2
|
3 |
+
size 236223
|
lib/__pycache__/lang.cpython-312.pyc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff1e8d413d7881648a9aa7ffae42617ebc430ee61b2523706c9eb8315889c86e
|
3 |
+
size 228874
|
lib/__pycache__/models.cpython-312.pyc
ADDED
Binary file (20.8 kB). View file
|
|
lib/classes/__pycache__/background_detector.cpython-312.pyc
ADDED
Binary file (2.32 kB). View file
|
|
lib/classes/__pycache__/tts_manager.cpython-312.pyc
ADDED
Binary file (2.15 kB). View file
|
|
lib/classes/__pycache__/voice_extractor.cpython-312.pyc
ADDED
Binary file (14.3 kB). View file
|
|
lib/classes/argos_translator.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import tempfile
|
3 |
+
import argostranslate.package
|
4 |
+
import argostranslate.translate
|
5 |
+
|
6 |
+
from iso639 import languages
|
7 |
+
from lib.conf import models_dir
|
8 |
+
from lib.lang import language_mapping
|
9 |
+
|
10 |
+
# NOTE: source_lang and target_lang must be iso639-1 (2 letters)
|
11 |
+
|
12 |
+
class ArgosTranslator:
    """Wrapper around argostranslate for translating text between two languages.

    Language codes given to the methods are compared directly against the
    ``from_code`` / ``to_code`` / ``code`` attributes of argostranslate
    packages and languages (ISO 639-1, two letters, per the module note).

    Methods that can fail report it through a ``(message, ok)`` tuple rather
    than by raising.
    """

    def __init__(self, neural_machine="argostranslate"):
        self.neural_machine = neural_machine
        # Bound by start(); process() returns an error tuple until then.
        self.translation = None

    def get_language_iso3(self, lang_iso1):
        """Return the ISO 639-3 code for ``lang_iso1``, or ``lang_iso1`` itself if the lookup fails."""
        try:
            lang_array = languages.get(part1=lang_iso1)
            if lang_array:
                return lang_array.part3
        except Exception:
            # Deliberate best effort: an unknown code is handed back unchanged.
            pass
        return lang_iso1

    def get_all_sources_lang(self):
        """Return the sorted, de-duplicated source codes of all available packages."""
        available_packages = argostranslate.package.get_available_packages()
        return sorted({pkg.from_code for pkg in available_packages})

    def get_all_targets_lang(self, source_lang):
        """Return ``(label, iso3)`` options for every target reachable from ``source_lang``.

        Only targets whose ISO 639-3 code is present in ``language_mapping``
        are kept. The label is ``"name - native_name"``, or just ``name`` when
        both are equal.
        """
        available_packages = argostranslate.package.get_available_packages()
        list_iso1 = sorted({pkg.to_code for pkg in available_packages if pkg.from_code == source_lang})
        language_translate_mapping = {}
        for iso1 in list_iso1:
            iso3 = self.get_language_iso3(iso1)
            if iso3 in language_mapping:
                # Copy so the shared language_mapping entry is not mutated.
                details = dict(language_mapping[iso3])
                details["iso1"] = iso1
                language_translate_mapping[iso3] = details
        return [
            (
                f"{details['name']} - {details['native_name']}" if details['name'] != details['native_name'] else details['name'],
                lang
            )
            for lang, details in language_translate_mapping.items()
        ]

    def get_all_target_packages(self, source_lang):
        """Return every available package whose source code is ``source_lang``."""
        available_packages = argostranslate.package.get_available_packages()
        return [pkg for pkg in available_packages if pkg.from_code == source_lang]

    def is_package_installed(self, source_lang, target_lang):
        """Return True when both languages appear among the installed languages.

        Any failure while querying argostranslate is printed and reported as
        ``False``.
        """
        try:
            installed_languages = argostranslate.translate.get_installed_languages()
            source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
            target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
            return source_language is not None and target_language is not None
        except Exception as e:
            error = f'is_package_installed() error: {e}'
            print(error)
            return False

    def download_and_install_argos_package(self, source_lang, target_lang):
        """Install the ``source_lang`` -> ``target_lang`` package if it is missing.

        Returns:
            ``(message, True)`` when the package was already installed,
            ``(None, True)`` after a successful install, and
            ``(message, False)`` when no package exists or an error occurred.
        """
        try:
            if self.is_package_installed(source_lang, target_lang):
                msg = f"Package for translation from {source_lang} to {target_lang} is already installed."
                print(msg)
                return msg, True
            target_package = next(
                (
                    pkg for pkg in self.get_all_target_packages(source_lang)
                    if pkg.from_code == source_lang and pkg.to_code == target_lang
                ),
                None
            )
            if not target_package:
                msg = f"No available package found for translation from {source_lang} to {target_lang}."
                return msg, False
            print(f"Downloading package for translation from {source_lang} to {target_lang}...")
            package_path = target_package.download()
            argostranslate.package.install_from_path(package_path)
            print(f"Package installed for translation from {source_lang} to {target_lang}")
            return None, True
        except Exception as e:
            error = f'download_and_install_argos_package() error: {e}'
            return error, False

    def process(self, text):
        """Translate ``text`` with the translation bound by start().

        Returns ``(translated_text, True)`` or ``(error_message, False)``.
        """
        try:
            return self.translation.translate(text), True
        except Exception as e:
            error = f'ArgosTranslator.process() error: {e}'
            return error, False

    def start(self, source_lang, target_lang):
        """Prepare ``self.translation`` for ``source_lang`` -> ``target_lang``.

        Installs the language package first when it is missing.

        Returns ``(None, True)`` on success or ``(error_message, False)``.
        """
        try:
            if self.neural_machine != "argostranslate":
                error = f"Neural machine '{self.neural_machine}' is not supported."
                return error, False
            error = None
            status = True
            if not self.is_package_installed(source_lang, target_lang):
                error, status = self.download_and_install_argos_package(source_lang, target_lang)
            if not status:
                return error, status
            installed_languages = argostranslate.translate.get_installed_languages()
            source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
            target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
            if not source_language or not target_language:
                error = f"Translation languages not installed: {source_lang} to {target_lang}"
                return error, False
            self.translation = source_language.get_translation(target_language)
            return None, True
        except Exception as e:
            error = f'ArgosTranslator.start() error: {e}'
            return error, False
|
lib/classes/background_detector.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import librosa
|
4 |
+
|
5 |
+
from pyannote.audio import Model
|
6 |
+
from pyannote.audio.pipelines import VoiceActivityDetection
|
7 |
+
from lib.conf import tts_dir
|
8 |
+
from lib.models import default_voice_detection_model
|
9 |
+
|
10 |
+
class BackgroundDetector:
    """Measure how much of a WAV file is not speech, using a pyannote VAD pipeline."""

    def __init__(self, wav_file: str):
        """Load the voice-detection model and configure the pipeline for ``wav_file``."""
        self.wav_file = wav_file
        segmentation_model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
        self.pipeline = VoiceActivityDetection(segmentation=segmentation_model)
        self.pipeline.instantiate({
            # onset/offset activation thresholds
            "onset": 0.5, "offset": 0.5,
            # remove speech regions shorter than that many seconds.
            "min_duration_on": 0.0,
            # fill non-speech regions shorter than that many seconds.
            "min_duration_off": 0.0
        })

    def detect(self, vad_ratio_thresh: float=0.05):
        """Run the pipeline and compare the non-speech share with ``vad_ratio_thresh``.

        Returns:
            ``(status, report)`` where ``status`` is True when the non-speech
            ratio exceeds the threshold and ``report`` holds that ratio and
            the same flag.
        """
        annotation = self.pipeline(self.wav_file)
        segment_bounds = [(segment.start, segment.end) for segment in annotation.get_timeline()]
        total_duration = librosa.get_duration(path=self.wav_file)
        speech_time = sum(stop - begin for begin, stop in segment_bounds)
        non_speech_ratio = 1 - (speech_time / total_duration)
        background_detected = non_speech_ratio > vad_ratio_thresh
        return background_detected, {
            'non_speech_ratio': non_speech_ratio,
            'background_detected': background_detected
        }
|
lib/classes/redirect_console.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from queue import Queue, Empty
|
2 |
+
import time
|
3 |
+
import logging
|
4 |
+
|
5 |
+
|
6 |
+
class RedirectConsole:
    """File-like object that mirrors writes to a real stream and to a queue.

    Constructing an instance also attaches it as a handler on the
    ``transformers`` logger.
    """

    def __init__(self, log_buffer: Queue, real_output):
        # Queue receiving a copy of every message written.
        self.log_buffer = log_buffer
        # Underlying stream (the original comment names sys.__stdout__ / sys.__stderr__).
        self.real_output = real_output
        self.setup_transformers_logger()

    def write(self, message: str):
        """Write ``message`` to the real stream, flush it, then enqueue it."""
        stream = self.real_output
        stream.write(message)
        stream.flush()
        self.log_buffer.put(message)

    def flush(self):
        """Flush the real stream."""
        self.real_output.flush()

    def isatty(self) -> bool:
        """Report whether the real stream is a TTY."""
        return self.real_output.isatty()

    def poll_logs(self, stop_event):
        """Yield ``(logs, errors)`` accumulated so far, about every 0.1 s.

        The loop runs until ``stop_event`` is set and the queue is empty.
        Messages containing "An error occurred" are added to ``errors`` as
        well as to ``logs``.
        """
        collected = ""
        failures = ""
        while True:
            if stop_event.is_set() and self.log_buffer.empty():
                break
            try:
                # Non-blocking read; an empty queue just means nothing new.
                chunk = self.log_buffer.get_nowait()
            except Empty:
                pass
            else:
                if "An error occurred" in chunk:
                    failures += chunk
                collected += chunk
            yield collected, failures
            time.sleep(0.1)  # Prevent tight looping

    def setup_transformers_logger(self):
        """Send WARNING-and-above records of the ``transformers`` logger to this object."""
        target_logger = logging.getLogger("transformers")
        target_logger.setLevel(logging.WARNING)
        stream_handler = logging.StreamHandler(self)
        stream_handler.setFormatter(logging.Formatter("%(message)s"))
        target_logger.addHandler(stream_handler)
|
lib/classes/tts_engines/.template.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import hashlib
|
2 |
+
import math
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
import subprocess
|
6 |
+
import tempfile
|
7 |
+
import threading
|
8 |
+
import uuid
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import regex as re
|
12 |
+
import soundfile as sf
|
13 |
+
import torch
|
14 |
+
import torchaudio
|
15 |
+
|
16 |
+
from huggingface_hub import hf_hub_download
|
17 |
+
from pathlib import Path
|
18 |
+
from pprint import pprint
|
19 |
+
|
20 |
+
from lib import *
|
21 |
+
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
|
22 |
+
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
23 |
+
|
24 |
+
#import logging
|
25 |
+
#logging.basicConfig(level=logging.DEBUG)
|
26 |
+
|
27 |
+
# Serialises engine construction in _load_api() / _load_checkpoint().
lock = threading.Lock()

class Coqui:
    """Skeleton TTS engine wrapper.

    NOTE(review): this appears to be a template for adding a new engine;
    every ``NEW_TTS`` reference is a placeholder that must be replaced by the
    real engine API before the class can run.
    """

    def __init__(self, session):
        """Store per-session state and try to load the engine via _build().

        Failures are printed, not raised, so the instance may be only
        partially initialised afterwards.
        """
        try:
            self.session = session
            self.cache_dir = tts_dir
            self.speakers_path = None
            self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
            self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
            self.is_bf16 = bool(self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported())
            self.npz_path = None
            self.npz_data = None
            self.sentences_total_time = 0.0
            self.sentence_idx = 1
            self.params = {TTS_ENGINES['NEW_TTS']: {}}
            self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
            self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
            self.resampler_cache = {}
            self.audio_segments = []
            self._build()
        except Exception as e:
            error = f'__init__() error: {e}'
            print(error)

    def _build(self):
        """Return the cached engine for ``self.tts_key``, loading it first if needed.

        Returns False when the engine is unavailable or a custom model was
        requested (not implemented in this template).
        """
        try:
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if not tts:
                if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
                    if self.session['custom_model'] is not None:
                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
                        print(msg)
                        return False
                    model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
                    self._load_api(self.tts_key, model_path, self.session['device'])
            return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
        except Exception as e:
            error = f'build() error: {e}'
            print(error)
            return False

    def _load_api(self, key, model_path, device):
        """Create the engine from ``model_path`` and cache it under ``key``.

        Returns the engine, or False when it could not be created.
        """
        global lock
        try:
            if key in loaded_tts.keys():
                return loaded_tts[key]['engine']
            unload_tts(device, [self.tts_key, self.tts_vc_key])
            with lock:
                tts = NEW_TTS(model_path)
                if tts:
                    # Placeholders: move the engine to the requested device here.
                    if device == 'cuda':
                        NEW_TTS.WITH_CUDA
                    else:
                        NEW_TTS.WITHOUT_CUDA
                    loaded_tts[key] = {"engine": tts, "config": None}
                    msg = f'{model_path} Loaded!'
                    print(msg)
                    return tts
                else:
                    error = 'TTS engine could not be created!'
                    print(error)
        except Exception as e:
            error = f'_load_api() error: {e}'
            print(error)
        return False

    def _load_checkpoint(self, **kwargs):
        """Create the engine from a checkpoint directory and cache it under ``key``.

        Reads ``key``, ``tts_engine``, ``device``, ``checkpoint_dir`` and
        ``config`` from ``kwargs``. Returns the engine, or False on failure.
        """
        global lock
        try:
            key = kwargs.get('key')
            if key in loaded_tts.keys():
                return loaded_tts[key]['engine']
            tts_engine = kwargs.get('tts_engine')
            device = kwargs.get('device')
            unload_tts(device, [self.tts_key])
            with lock:
                checkpoint_dir = kwargs.get('checkpoint_dir')
                # NOTE(review): placeholder — a real engine normally builds its
                # own config object here; confirm how it should be supplied.
                config = kwargs.get('config')
                tts = NEW_TTS.LOAD_CHECKPOINT(
                    config,
                    checkpoint_dir=checkpoint_dir,
                    eval=True
                )
                if tts:
                    # Placeholders: move the engine to the requested device here.
                    if device == 'cuda':
                        NEW_TTS.WITH_CUDA
                    else:
                        NEW_TTS.WITHOUT_CUDA
                    loaded_tts[key] = {"engine": tts, "config": config}
                    msg = f'{tts_engine} Loaded!'
                    print(msg)
                    return tts
                else:
                    error = 'TTS engine could not be created!'
                    print(error)
        except Exception as e:
            error = f'_load_checkpoint() error: {e}'
            print(error)
        return False

    def _tensor_type(self, audio_data):
        """Return ``audio_data`` as a torch tensor.

        Tensors are returned unchanged; numpy arrays and lists are converted
        to float32.

        Raises:
            TypeError: for any other input type.
        """
        if isinstance(audio_data, torch.Tensor):
            return audio_data
        elif isinstance(audio_data, np.ndarray):
            return torch.from_numpy(audio_data).float()
        elif isinstance(audio_data, list):
            return torch.tensor(audio_data, dtype=torch.float32)
        else:
            raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")

    def _get_resampler(self, orig_sr, target_sr):
        """Return a cached torchaudio Resample transform for the given rate pair."""
        key = (orig_sr, target_sr)
        if key not in self.resampler_cache:
            self.resampler_cache[key] = torchaudio.transforms.Resample(
                orig_freq=orig_sr, new_freq=target_sr
            )
        return self.resampler_cache[key]

    def _resample_wav(self, wav_path, expected_sr):
        """Return a path to a mono version of ``wav_path`` at ``expected_sr``.

        The input path is returned unchanged when it already matches.
        Otherwise a new 16-bit PCM temporary file is written; it is not
        deleted here, so the caller owns it.
        """
        waveform, orig_sr = torchaudio.load(wav_path)
        if orig_sr == expected_sr and waveform.size(0) == 1:
            return wav_path
        if waveform.size(0) > 1:
            # Down-mix to mono by averaging the channels.
            waveform = waveform.mean(dim=0, keepdim=True)
        if orig_sr != expected_sr:
            resampler = self._get_resampler(orig_sr, expected_sr)
            waveform = resampler(waveform)
        wav_numpy = waveform.squeeze(0).cpu().numpy()
        # Reserve a unique file name, then let soundfile write to it.
        tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp_fh.name
        tmp_fh.close()
        sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
        return tmp_path

    def convert(self, sentence_number, sentence):
        """Synthesise ``sentence`` and save it as the audio file for ``sentence_number``.

        The break and pause markers from ``TTS_SML`` only queue a stretch of
        silence and return True. For any other text the audio is produced by
        the engine, joined with the queued segments, recorded in the VTT file
        and written to disk.

        Returns:
            True on success, False when the engine is missing or the output
            file was not created.

        Raises:
            ValueError: wrapping any exception raised during conversion.
        """
        try:
            speaker = None
            trim_audio_buffer = 0.004
            settings = self.params[self.session['tts_engine']]
            final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
            sentence = sentence.strip()
            # Voice priority: session voice, then the custom model's ref.wav,
            # then the default voice of the selected model.
            settings['voice_path'] = (
                self.session['voice'] if self.session['voice'] is not None
                else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
                else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
            )
            if settings['voice_path'] is not None:
                # Not used further in this template.
                speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if tts:
                if sentence[-1].isalnum():
                    sentence = f'{sentence} —'
                if sentence == TTS_SML['break']:
                    break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100))) # 0.3 to 0.6 seconds
                    self.audio_segments.append(break_tensor.clone())
                    return True
                elif sentence == TTS_SML['pause']:
                    pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100))) # 1.0 to 1.8 seconds
                    self.audio_segments.append(pause_tensor.clone())
                    return True
                else:
                    if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
                        audio_sentence = NEW_TTS.CONVERT() # audio_sentence must be torch.Tensor or (list, tuple) or np.ndarray
                    if is_audio_data_valid(audio_sentence):
                        sourceTensor = self._tensor_type(audio_sentence)
                        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                        if sentence[-1].isalnum() or sentence[-1] == '—':
                            audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
                        self.audio_segments.append(audio_tensor)
                        if not re.search(r'\w$', sentence, flags=re.UNICODE):
                            break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
                            self.audio_segments.append(break_tensor.clone())
                        if self.audio_segments:
                            audio_tensor = torch.cat(self.audio_segments, dim=-1)
                            start_time = self.sentences_total_time
                            duration = audio_tensor.shape[-1] / settings['samplerate']
                            end_time = start_time + duration
                            self.sentences_total_time = end_time
                            sentence_obj = {
                                "start": start_time,
                                "end": end_time,
                                "text": sentence,
                                "resume_check": self.sentence_idx
                            }
                            self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
                            if self.sentence_idx:
                                torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                                del audio_tensor
                        self.audio_segments = []
                        if os.path.exists(final_sentence_file):
                            return True
                        else:
                            error = f"Cannot create {final_sentence_file}"
                            print(error)
            else:
                error = f"convert() error: {self.session['tts_engine']} is None"
                print(error)
        except Exception as e:
            raise ValueError(e)
        return False
|
lib/classes/tts_engines/common/audio_filters.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import subprocess
|
4 |
+
import shutil
|
5 |
+
|
6 |
+
from scipy.io import wavfile as wav
|
7 |
+
from scipy.signal import find_peaks
|
8 |
+
|
9 |
+
def detect_gender(voice_path):
    """Guess a speaker's gender from the lowest strong spectral peak of a WAV file.

    The file is read with scipy, stereo is averaged to mono, and the FFT
    magnitude is searched for peaks at least 20% as high as the maximum. The
    first such peak between 75 Hz and 300 Hz is taken as the pitch.

    Args:
        voice_path: Path of the WAV file to analyse.

    Returns:
        "female" when the pitch is above 135 Hz, "male" otherwise, or None
        when no peak lies in the voice range or the file cannot be processed
        (the error is printed).
    """
    try:
        samplerate, signal = wav.read(voice_path)
        # Convert stereo to mono if needed
        if len(signal.shape) > 1:
            signal = np.mean(signal, axis=1)
        fft_spectrum = np.abs(np.fft.fft(signal))
        freqs = np.fft.fftfreq(len(fft_spectrum), d=1/samplerate)
        # Consider only positive frequencies
        positive_freqs = freqs[:len(freqs)//2]
        positive_magnitude = fft_spectrum[:len(fft_spectrum)//2]
        peaks, _ = find_peaks(positive_magnitude, height=np.max(positive_magnitude) * 0.2)
        # Peaks come back in ascending frequency order, so the first hit
        # inside the voice range is the lowest one.
        for peak in peaks:
            pitch = positive_freqs[peak]
            if 75 <= pitch <= 300:
                return "female" if pitch > 135 else "male"
        return None
    except Exception as e:
        error = f"detect_gender() error: {voice_path}: {e}"
        print(error)
        return None
|
37 |
+
|
38 |
+
def trim_audio(audio_data, samplerate, silence_threshold=0.003, buffer_sec=0.005):
    """Cut leading and trailing silence from mono audio, keeping a small buffer.

    Args:
        audio_data: 1D torch tensor, or a list of numbers (converted to a
            float32 tensor).
        samplerate: Samples per second, used to turn ``buffer_sec`` into a
            sample count.
        silence_threshold: Samples whose absolute value is at or below this
            count as silence.
        buffer_sec: Seconds of audio kept before the first and after the last
            non-silent sample.

    Returns:
        The trimmed 1D tensor (on CPU), or an empty tensor of the same dtype
        when every sample is silent.

    Raises:
        ValueError: if the tensor is not 1D.
        TypeError: if ``audio_data`` is neither a tensor nor a list.
    """
    if isinstance(audio_data, list):
        audio_data = torch.tensor(audio_data, dtype=torch.float32)
    if not isinstance(audio_data, torch.Tensor):
        error = "audio_data must be a PyTorch tensor or a list of numerical values."
        raise TypeError(error)
    if audio_data.ndim != 1:
        error = "audio_data must be a 1D tensor (mono audio)."
        raise ValueError(error)
    if audio_data.is_cuda:
        audio_data = audio_data.cpu()
    non_silent_indices = torch.where(audio_data.abs() > silence_threshold)[0]
    if len(non_silent_indices) == 0:
        return torch.tensor([], dtype=audio_data.dtype)
    buffer_samples = int(buffer_sec * samplerate)
    start_index = max(non_silent_indices[0].item() - buffer_samples, 0)
    # The slice end is exclusive, so add 1: the last non-silent sample is
    # always kept and the trailing buffer is as long as the leading one.
    end_index = min(non_silent_indices[-1].item() + buffer_samples + 1, audio_data.size(0))
    return audio_data[start_index:end_index]
|
59 |
+
|
60 |
+
def normalize_audio(input_file, output_file, samplerate):
    """Run ``input_file`` through an ffmpeg filter chain and write ``output_file``.

    The chain applies a gate, FFT denoise, compression, loudness
    normalisation, five equaliser bands and a high-pass filter, then
    resamples to ``samplerate``.

    Args:
        input_file: Path of the audio file to process.
        output_file: Destination path; overwritten if it exists (``-y``).
        samplerate: Output sample rate passed to ``-ar``.

    Returns:
        True when ffmpeg exits successfully. False when ffmpeg is not on
        PATH, cannot be started, or exits with a non-zero status (the error
        is printed).
    """
    ffmpeg_path = shutil.which('ffmpeg')
    if ffmpeg_path is None:
        error = f"normalize_audio() error: {input_file}: ffmpeg executable not found"
        print(error)
        return False
    filter_complex = (
        'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
        'afftdn=nf=-70,'
        'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
        'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
        'equalizer=f=150:t=q:w=2:g=1,'
        'equalizer=f=250:t=q:w=2:g=-3,'
        'equalizer=f=3000:t=q:w=2:g=2,'
        'equalizer=f=5500:t=q:w=2:g=-4,'
        'equalizer=f=9000:t=q:w=2:g=-2,'
        'highpass=f=63[audio]'
    )
    ffmpeg_cmd = [
        ffmpeg_path, '-hide_banner', '-nostats', '-i', input_file,
        '-filter_complex', filter_complex,
        '-map', '[audio]',
        '-ar', str(samplerate),
        '-y', output_file
    ]
    try:
        subprocess.run(
            ffmpeg_cmd,
            # NOTE(review): ffmpeg is started with an empty environment — confirm this is intended.
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            errors='ignore',
            # Without check=True a failing ffmpeg would still be reported as success.
            check=True
        )
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        error = f"normalize_audio() error: {input_file}: {e}"
        print(error)
        return False
|
94 |
+
|
95 |
+
def is_audio_data_valid(audio_data):
    """Return True when ``audio_data`` is a non-empty tensor, list, tuple or numpy array.

    ``None`` and every other type give False.
    """
    if isinstance(audio_data, torch.Tensor):
        has_samples = audio_data.numel() > 0
    elif isinstance(audio_data, (list, tuple)):
        has_samples = len(audio_data) > 0
    elif isinstance(audio_data, np.ndarray):
        has_samples = audio_data.size > 0
    else:
        # Covers None and any unsupported type.
        has_samples = False
    return has_samples
|
lib/classes/tts_engines/common/utils.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import regex as re
|
4 |
+
import stanza
|
5 |
+
|
6 |
+
from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
|
7 |
+
|
8 |
+
def unload_tts(device, reserved_keys=None, tts_key=None):
    """Evict engines from ``loaded_tts`` once the in-memory limit is reached.

    Nothing is evicted while ``len(loaded_tts)`` is below
    ``max_tts_in_memory``.

    Args:
        device: Device name of the session; ``'cuda'`` triggers a CUDA cache
            release after an eviction.
        reserved_keys: Keys that must survive a bulk eviction. Ignored when
            ``tts_key`` is given.
        tts_key: When not None, only this entry is removed (if present).
            When None, every entry not listed in ``reserved_keys`` is removed.

    Returns:
        False when an exception was raised; otherwise None.
    """
    try:
        if len(loaded_tts) < max_tts_in_memory:
            return None
        if reserved_keys is None:
            reserved_keys = []
        evicted = False
        if tts_key is not None:
            if tts_key in loaded_tts:
                del loaded_tts[tts_key]
                evicted = True
        else:
            for key in list(loaded_tts.keys()):
                if key not in reserved_keys:
                    del loaded_tts[key]
                    evicted = True
        # Release cached GPU memory after any eviction, not only after the
        # single-key one, so a bulk unload actually frees VRAM for the next model.
        if evicted and device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    except Exception as e:
        error = f'unload_tts() error: {e}'
        print(error)
        return False
|
27 |
+
|
28 |
+
def append_sentence2vtt(sentence_obj, path):
    """Append one sentence as a WebVTT cue to the file at ``path``.

    The file is created with a ``WEBVTT`` header when it does not exist. The
    index of the next cue is derived from the number of ``-->`` lines already
    in the file.

    Args:
        sentence_obj: Mapping with ``start`` and ``end`` (seconds) and ``text``.
            An optional ``resume_check`` index lower than the next cue index
            marks the sentence as already written.
        path: Path of the ``.vtt`` file.

    Returns:
        The next cue index: unchanged when the sentence was skipped because of
        ``resume_check``, incremented by one after a cue was written. False
        when an exception was raised.
    """

    def format_timestamp(seconds):
        # Round once to whole milliseconds and split afterwards, so a value
        # such as 59.9996 carries into the minute field instead of rendering
        # as the invalid "00:00:60.000".
        total_ms = int(round(seconds * 1000))
        total_s, ms = divmod(total_ms, 1000)
        m, s = divmod(total_s, 60)
        h, m = divmod(m, 60)
        return f"{h:02}:{m:02}:{s:02}.{ms:03}"

    try:
        index = 1
        file_exists = os.path.exists(path)
        if file_exists:
            with open(path, "r", encoding="utf-8") as f:
                # Stream the file; each cue contributes exactly one "-->" line.
                index += sum(1 for line in f if "-->" in line)
        if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
            return index  # Already written
        with open(path, "a", encoding="utf-8") as f:
            if not file_exists:
                f.write("WEBVTT\n\n")
            start = format_timestamp(sentence_obj["start"])
            end = format_timestamp(sentence_obj["end"])
            # A cue payload must not contain line breaks.
            text = re.sub(r'[\r\n]+', ' ', sentence_obj["text"]).strip()
            f.write(f"{start} --> {end}\n{text}\n\n")
        return index + 1
    except Exception as e:
        error = f'append_sentence2vtt() error: {e}'
        print(error)
        return False
|
lib/classes/tts_engines/coqui.py
ADDED
@@ -0,0 +1,810 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
|
2 |
+
import numpy as np, regex as re, soundfile as sf, torch, torchaudio
|
3 |
+
|
4 |
+
from huggingface_hub import hf_hub_download
|
5 |
+
from pathlib import Path
|
6 |
+
from pprint import pprint
|
7 |
+
|
8 |
+
from lib import *
|
9 |
+
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
|
10 |
+
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
11 |
+
|
12 |
+
#import logging
|
13 |
+
#logging.basicConfig(level=logging.DEBUG)
|
14 |
+
|
15 |
+
lock = threading.Lock()
|
16 |
+
xtts_builtin_speakers_list = None
|
17 |
+
|
18 |
+
class Coqui:
|
19 |
+
|
20 |
+
def __init__(self, session):
    """Store the session, prepare per-engine parameters and load the engine.

    Args:
        session: Mapping holding the conversion settings read here
            (``tts_engine``, ``fine_tuned``, ``device``, ``process_dir``,
            ``final_name``).

    Any exception raised during setup is printed and swallowed, so the
    instance may be left partially initialized.
    """
    try:
        self.session = session
        self.cache_dir = tts_dir
        self.speakers_path = None
        engine = self.session['tts_engine']
        self.tts_key = f"{engine}-{self.session['fine_tuned']}"
        # Key of the voice-conversion model: last path component of its name.
        self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
        self.is_bf16 = (
            self.session['device'] == 'cuda'
            and torch.cuda.is_bf16_supported() == True
        )
        self.npz_path = None
        self.npz_data = None
        self.sentences_total_time = 0.0
        self.sentence_idx = 1
        self.params = {
            TTS_ENGINES['XTTSv2']: {"latent_embedding": {}},
            TTS_ENGINES['BARK']: {},
            TTS_ENGINES['VITS']: {"semitones": {}},
            TTS_ENGINES['FAIRSEQ']: {"semitones": {}},
            TTS_ENGINES['TACOTRON2']: {"semitones": {}},
            TTS_ENGINES['YOURTTS']: {},
        }
        engine = self.session['tts_engine']
        model_entry = models[engine][self.session['fine_tuned']]
        self.params[engine]['samplerate'] = model_entry['samplerate']
        vtt_name = Path(self.session['final_name']).stem + '.vtt'
        self.vtt_path = os.path.join(self.session['process_dir'], vtt_name)
        self.resampler_cache = {}
        self.audio_segments = []
        self._build()
    except Exception as e:
        error = f'__init__() error: {e}'
        print(error)
|
42 |
+
|
43 |
+
def _build(self):
    """Load the engine selected by the session unless it is already cached.

    Also loads the XTTSv2 builtin-speaker table once per process and, for
    VITS / FAIRSEQ / TACOTRON2 sessions with a voice set, the voice-conversion
    model.

    Returns:
        The engine stored in ``loaded_tts`` under ``self.tts_key``, or False
        when it is not there, when a custom model is requested for an engine
        that does not support it, when no checkpoint matches the language, or
        when an exception was raised.
    """
    global xtts_builtin_speakers_list
    try:
        engine = self.session['tts_engine']
        load_zeroshot = engine in [TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']]
        tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
        if not tts:
            if xtts_builtin_speakers_list is None:
                self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
                # NOTE(review): torch.load unpickles the downloaded file; only
                # safe while the repository is trusted.
                xtts_builtin_speakers_list = torch.load(self.speakers_path)
            if engine == TTS_ENGINES['XTTSv2']:
                msg = f"Loading TTS {engine} model, it takes a while, please be patient..."
                print(msg)
                if self.session['custom_model'] is not None:
                    xtts_files = default_engine_settings[TTS_ENGINES['XTTSv2']]['files']
                    model_dir = os.path.join(self.session['custom_model_dir'], engine, self.session['custom_model'])
                    config_path = os.path.join(model_dir, xtts_files[0])
                    checkpoint_path = os.path.join(model_dir, xtts_files[1])
                    vocab_path = os.path.join(model_dir, xtts_files[2])
                    self.tts_key = f"{engine}-{self.session['custom_model']}"
                    tts = self._load_checkpoint(tts_engine=engine, key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
                else:
                    model_cfg = models[engine][self.session['fine_tuned']]
                    hf_repo = model_cfg['repo']
                    if self.session['fine_tuned'] == 'internal':
                        hf_sub = ''
                        if self.speakers_path is None:
                            self.speakers_path = hf_hub_download(repo_id=hf_repo, filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
                    else:
                        hf_sub = model_cfg['sub']
                    config_path, checkpoint_path, vocab_path = [
                        hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{model_cfg['files'][i]}", cache_dir=self.cache_dir)
                        for i in range(3)
                    ]
                    tts = self._load_checkpoint(tts_engine=engine, key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
            elif self.session['custom_model'] is not None and engine in [TTS_ENGINES['BARK'], TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2'], TTS_ENGINES['YOURTTS']]:
                # Only XTTSv2 supports custom models.
                msg = f"{engine} custom model not implemented yet!"
                print(msg)
                return False
            elif engine == TTS_ENGINES['BARK']:
                model_cfg = models[engine][self.session['fine_tuned']]
                hf_repo = model_cfg['repo']
                hf_sub = model_cfg['sub']
                # All three model files are fetched so they sit next to each
                # other; only their common directory is handed to the loader.
                bark_paths = [
                    hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{model_cfg['files'][i]}", cache_dir=self.cache_dir)
                    for i in range(3)
                ]
                checkpoint_dir = os.path.dirname(bark_paths[0])
                tts = self._load_checkpoint(tts_engine=engine, key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
            elif engine == TTS_ENGINES['VITS']:
                model_cfg = models[engine][self.session['fine_tuned']]
                iso_dir = language_tts[engine][self.session['language']]
                sub_dict = model_cfg['sub']
                sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                if sub is not None:
                    self.params[engine]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
                    model_path = model_cfg['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                    msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
                    print(msg)
                    self.tts_key = model_path
                    tts = self._load_api(self.tts_key, model_path, self.session['device'])
                else:
                    msg = f"{engine} checkpoint for {self.session['language']} not found!"
                    print(msg)
                    return False
            elif engine == TTS_ENGINES['FAIRSEQ']:
                model_path = models[engine][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
                self.tts_key = model_path
                tts = self._load_api(self.tts_key, model_path, self.session['device'])
            elif engine == TTS_ENGINES['TACOTRON2']:
                model_cfg = models[engine][self.session['fine_tuned']]
                iso_dir = language_tts[engine][self.session['language']]
                sub_dict = model_cfg['sub']
                sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                if sub is None:
                    # Retry with the session language code itself.
                    iso_dir = self.session['language']
                    sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                if sub is not None:
                    # Resolve the sample rate only once a checkpoint is known;
                    # indexing with sub=None used to raise before the retry
                    # above could run.
                    self.params[engine]['samplerate'] = models[TTS_ENGINES['TACOTRON2']][self.session['fine_tuned']]['samplerate'][sub]
                    model_path = model_cfg['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                    msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
                    print(msg)
                    self.tts_key = model_path
                    tts = self._load_api(self.tts_key, model_path, self.session['device'])
                else:
                    msg = f"{engine} checkpoint for {self.session['language']} not found!"
                    print(msg)
                    return False
            elif engine == TTS_ENGINES['YOURTTS']:
                model_path = models[engine][self.session['fine_tuned']]['repo']
                tts = self._load_api(self.tts_key, model_path, self.session['device'])
        if load_zeroshot:
            tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
            if not tts_vc and self.session['voice'] is not None:
                msg = f"Loading TTS {self.tts_vc_key} zeroshot model, it takes a while, please be patient..."
                print(msg)
                self._load_api(self.tts_vc_key, default_vc_model, self.session['device'])
        return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
    except Exception as e:
        error = f'build() error: {e}'
        print(error)
    return False
|
158 |
+
|
159 |
+
def _load_api(self, key, model_path, device):
    """Create a Coqui ``TTS.api.TTS`` engine and cache it in ``loaded_tts``.

    Args:
        key: Cache key in ``loaded_tts``.
        model_path: Model name or path handed to the Coqui API constructor.
        device: Target device; ``'cuda'`` uses ``.cuda()``, anything else
            ``.to(device)``.

    Returns:
        The cached or newly created engine, or False when creation failed or
        an exception was raised.
    """
    try:
        if key in loaded_tts:
            return loaded_tts[key]['engine']
        # Make room first; the session's own engines are kept.
        unload_tts(device, [self.tts_key, self.tts_vc_key])
        from TTS.api import TTS as coquiAPI
        with lock:
            engine = coquiAPI(model_path)
            if not engine:
                error = 'TTS engine could not be created!'
                print(error)
                return False
            if device == 'cuda':
                engine.cuda()
            else:
                engine.to(device)
            loaded_tts[key] = {"engine": engine, "config": None}
            msg = f'{model_path} Loaded!'
            print(msg)
            return engine
    except Exception as e:
        error = f'_load_api() error: {e}'
        print(error)
    return False
|
184 |
+
|
185 |
+
def _load_checkpoint(self, **kwargs):
    """Build an XTTSv2 or Bark model from checkpoint files and cache it.

    Keyword Args:
        key: Cache key in ``loaded_tts``.
        tts_engine: ``TTS_ENGINES['XTTSv2']`` or ``TTS_ENGINES['BARK']``.
        device: Target device; ``'cuda'`` uses ``.cuda()``, anything else
            ``.to(device)``.
        checkpoint_path, config_path, vocab_path: XTTSv2 files.
        checkpoint_dir: Bark checkpoint directory.

    Returns:
        The cached or newly loaded model, or False when the engine is not
        supported, could not be created, or an exception was raised.
    """
    try:
        key = kwargs.get('key')
        if key in loaded_tts:
            return loaded_tts[key]['engine']
        tts_engine = kwargs.get('tts_engine')
        device = kwargs.get('device')
        unload_tts(device, [self.tts_key, self.tts_vc_key])
        # Pre-bind so an unsupported engine reports a clean error instead of
        # hitting an unbound local.
        tts = None
        config = None
        with lock:
            if tts_engine == TTS_ENGINES['XTTSv2']:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path', None)
                vocab_path = kwargs.get('vocab_path', None)
                config = XttsConfig()
                config.models_dir = os.path.join("models", "tts")
                config.load_json(config_path)
                tts = Xtts.init_from_config(config)
                tts.load_checkpoint(
                    config,
                    checkpoint_path=checkpoint_path,
                    vocab_path=vocab_path,
                    use_deepspeed=default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'],
                    eval=True
                )
            elif tts_engine == TTS_ENGINES['BARK']:
                from TTS.tts.configs.bark_config import BarkConfig
                from TTS.tts.models.bark import Bark
                checkpoint_dir = kwargs.get('checkpoint_dir')
                config = BarkConfig()
                config.CACHE_DIR = self.cache_dir
                config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
                tts = Bark.init_from_config(config)
                tts.load_checkpoint(
                    config,
                    checkpoint_dir=checkpoint_dir,
                    eval=True
                )
            if tts:
                if device == 'cuda':
                    tts.cuda()
                else:
                    tts.to(device)
                loaded_tts[key] = {"engine": tts, "config": config}
                msg = f'{tts_engine} Loaded!'
                print(msg)
                return tts
            error = 'TTS engine could not be created!'
            print(error)
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        # The message used to be built and then dropped, hiding load failures.
        print(error)
    return False
|
240 |
+
|
241 |
+
def _check_xtts_builtin_speakers(self, voice_path, speaker, device):
    """Make sure the reference voice exists in the session language.

    When the voice path has no component equal to the session language, the
    speaker is not a Bark builtin voice, the language is not ``'eng'`` and
    XTTSv2 supports the language, the internal XTTSv2 model speaks the
    language's ``default.txt`` with that voice and the normalized result is
    written next to the original, with the ``eng`` path component replaced by
    the language.

    Args:
        voice_path: Path of the reference voice file.
        speaker: Speaker name (voice file name without ``.wav``).
        device: Device used to load the internal XTTSv2 model.

    Returns:
        ``voice_path`` unchanged when no conversion applies, the path of the
        generated voice on success, or False on any failure.
    """
    try:
        language = self.session['language']
        voice_parts = Path(voice_path).parts
        needs_conversion = (
            language not in voice_parts
            and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys()
            and language != 'eng'
        )
        if not needs_conversion:
            return voice_path
        if language not in language_tts[TTS_ENGINES['XTTSv2']].keys():
            return voice_path
        default_text_file = os.path.join(voices_dir, language, 'default.txt')
        if not os.path.exists(default_text_file):
            error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
            print(error)
            return False
        msg = f"Converting builtin eng voice to {language}..."
        print(msg)
        tts_internal_key = f"{TTS_ENGINES['XTTSv2']}-internal"
        default_text = Path(default_text_file).read_text(encoding="utf-8")
        internal_cfg = models[TTS_ENGINES['XTTSv2']]['internal']
        hf_repo = internal_cfg['repo']
        hf_sub = ''
        tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
        if not tts:
            for key in list(loaded_tts.keys()):
                unload_tts(device, None, key)
            config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][0]}", cache_dir=self.cache_dir)
            checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][1]}", cache_dir=self.cache_dir)
            vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][2]}", cache_dir=self.cache_dir)
            tts = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=tts_internal_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
        if not tts:
            error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
            print(error)
            return False
        xtts_voices = default_engine_settings[TTS_ENGINES['XTTSv2']]['voices']
        if speaker in xtts_voices.keys():
            gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[xtts_voices[speaker]].values()
        else:
            gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=[voice_path])
        # Forward only the tuning values present in the session, cast to the
        # type listed here.
        fine_tuned_params = {
            key: cast_type(self.session[key])
            for key, cast_type in {
                "temperature": float,
                "length_penalty": float,
                "num_beams": int,
                "repetition_penalty": float,
                "top_k": int,
                "top_p": float,
                "speed": float,
                "enable_text_splitting": bool
            }.items()
            if self.session.get(key) is not None
        }
        with torch.no_grad():
            result = tts.inference(
                text=default_text,
                language=self.session['language_iso1'],
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                **fine_tuned_params
            )
        audio_data = result.get('wav')
        if audio_data is None:
            error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
            print(error)
            return False
        audio_data = audio_data.tolist()
        sourceTensor = self._tensor_type(audio_data)
        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
        lang_dir = 'con-' if language == 'con' else language
        new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
        # Derive the scratch name from the extension only; str.replace would
        # rewrite every '.wav' occurring anywhere in the path.
        proc_voice_path = os.path.splitext(new_voice_path)[0] + '_temp.wav'
        torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
        try:
            normalized = normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate)
        finally:
            # The raw synthesis is only an intermediate; never leave it behind.
            if os.path.exists(proc_voice_path):
                os.remove(proc_voice_path)
        if not normalized:
            error = 'normalize_audio() error:'
            print(error)
            return False
        del audio_data, sourceTensor, audio_tensor
        if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
            # The internal model was loaded for this conversion only.
            del tts
            unload_tts(device, None, tts_internal_key)
        return new_voice_path
    except Exception as e:
        error = f'_check_xtts_builtin_speakers() error: {e}'
        print(error)
    return False
|
320 |
+
|
321 |
+
def _check_bark_npz(self, voice_path, bark_dir, speaker, device):
    """Make sure a Bark ``.npz`` voice file exists for ``speaker``.

    When the session language is supported by Bark and
    ``<bark_dir>/<speaker>/<speaker>.npz`` is missing, the internal Bark model
    synthesizes the language's ``default.txt`` with a temporary copy of the
    voice placed next to the expected ``.npz`` path.

    Args:
        voice_path: Path of the reference voice file to copy.
        bark_dir: Directory holding one sub-directory per speaker.
        speaker: Speaker name.
        device: Device used to load the internal Bark model.

    Returns:
        True when the language is not supported by Bark, the ``.npz`` already
        exists, or the synthesis completed; False when the model could not be
        loaded or an exception was raised.
    """
    try:
        if self.session['language'] not in language_tts[TTS_ENGINES['BARK']].keys():
            return True
        npz_dir = os.path.join(bark_dir, speaker)
        npz_file = os.path.join(npz_dir, f'{speaker}.npz')
        if os.path.exists(npz_file):
            return True
        os.makedirs(npz_dir, exist_ok=True)
        tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
        internal_cfg = models[TTS_ENGINES['BARK']]['internal']
        hf_repo = internal_cfg['repo']
        hf_sub = internal_cfg['sub']
        tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
        if not tts:
            for key in list(loaded_tts.keys()):
                unload_tts(device, None, key)
            text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][0]}", cache_dir=self.cache_dir)
            # Fetched for their download side effect; only the directory of
            # the first file is handed to the loader.
            hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][1]}", cache_dir=self.cache_dir)
            hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal_cfg['files'][2]}", cache_dir=self.cache_dir)
            checkpoint_dir = os.path.dirname(text_model_path)
            tts = self._load_checkpoint(tts_engine=TTS_ENGINES['BARK'], key=tts_internal_key, checkpoint_dir=checkpoint_dir, device=device)
        if not tts:
            error = f'_check_bark_npz() error: {tts_internal_key} is False'
            print(error)
            return False
        voice_temp = os.path.splitext(npz_file)[0] + '.wav'
        shutil.copy(voice_path, voice_temp)
        try:
            default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
            default_text = Path(default_text_file).read_text(encoding="utf-8")
            fine_tuned_params = {
                key: cast_type(self.session[key])
                for key, cast_type in {
                    "text_temp": float,
                    "waveform_temp": float
                }.items()
                if self.session.get(key) is not None
            }
            with torch.no_grad():
                # Fixed seed, kept from the original implementation.
                torch.manual_seed(67878789)
                audio_data = tts.synthesize(
                    default_text,
                    loaded_tts[tts_internal_key]['config'],
                    speaker_id=speaker,
                    voice_dirs=bark_dir,
                    silent=True,
                    **fine_tuned_params
                )
            del audio_data
        finally:
            # Remove the scratch copy even when reading the text or the
            # synthesis fails.
            if os.path.exists(voice_temp):
                os.remove(voice_temp)
        if self.session['tts_engine'] != TTS_ENGINES['BARK']:
            # The internal model was loaded for this step only.
            del tts
            unload_tts(device, None, tts_internal_key)
        msg = f"Saved NPZ file: {npz_file}"
        print(msg)
        return True
    except Exception as e:
        error = f'_check_bark_npz() error: {e}'
        print(error)
    return False
|
381 |
+
|
382 |
+
def _tensor_type(self, audio_data):
|
383 |
+
if isinstance(audio_data, torch.Tensor):
|
384 |
+
return audio_data
|
385 |
+
elif isinstance(audio_data, np.ndarray):
|
386 |
+
return torch.from_numpy(audio_data).float()
|
387 |
+
elif isinstance(audio_data, list):
|
388 |
+
return torch.tensor(audio_data, dtype=torch.float32)
|
389 |
+
else:
|
390 |
+
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
391 |
+
|
392 |
+
def _get_resampler(self, orig_sr, target_sr):
|
393 |
+
key = (orig_sr, target_sr)
|
394 |
+
if key not in self.resampler_cache:
|
395 |
+
self.resampler_cache[key] = torchaudio.transforms.Resample(
|
396 |
+
orig_freq=orig_sr, new_freq=target_sr
|
397 |
+
)
|
398 |
+
return self.resampler_cache[key]
|
399 |
+
|
400 |
+
def _resample_wav(self, wav_path, expected_sr):
    """Return a path to a single-channel version of ``wav_path`` at ``expected_sr``.

    Args:
        wav_path: Path of the audio file to load.
        expected_sr: Required sample rate.

    Returns:
        ``wav_path`` itself when the file already has one channel at the
        expected rate; otherwise the path of a new temporary 16-bit PCM WAV
        file, which this method does not delete.
    """
    waveform, orig_sr = torchaudio.load(wav_path)
    channels = waveform.size(0)
    rate_differs = orig_sr != expected_sr
    if not rate_differs and channels == 1:
        return wav_path
    if channels > 1:
        # Downmix by averaging the channels.
        waveform = waveform.mean(dim=0, keepdim=True)
    if rate_differs:
        waveform = self._get_resampler(orig_sr, expected_sr)(waveform)
    samples = waveform.squeeze(0).cpu().numpy()
    # Reserve a unique file name; the handle is closed before soundfile writes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_fh:
        tmp_path = tmp_fh.name
    sf.write(tmp_path, samples, expected_sr, subtype="PCM_16")
    return tmp_path
|
416 |
+
|
417 |
+
def convert(self, s_n, s):
    """Synthesize one sentence and write it to the session's sentence folder.

    Args:
        s_n: sentence number; used as the output file name.
        s: sentence text, or one of the ``TTS_SML`` 'break' / 'pause'
            markers, which only queue a silence segment.

    Returns:
        True when a break/pause was queued or the sentence audio file
        exists after synthesis; False on any reported failure.

    Raises:
        ValueError: wraps any unexpected exception raised while converting.
    """
    global xtts_builtin_speakers_list
    try:
        sentence_number = s_n
        sentence = s
        speaker = None
        trim_audio_buffer = 0.004
        engine = self.session['tts_engine']
        settings = self.params[engine]
        final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
        # Voice priority: explicit session voice, then the custom model's
        # ref.wav, then the default voice of the selected fine-tuned model.
        settings['voice_path'] = (
            self.session['voice'] if self.session['voice'] is not None
            else os.path.join(self.session['custom_model_dir'], engine, self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
            else models[engine][self.session['fine_tuned']]['voice']
        )
        if settings['voice_path'] is not None:
            speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
            if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and os.path.basename(settings['voice_path']) != 'ref.wav':
                self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker, self.session['device'])
                if not settings['voice_path']:
                    msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
                    print(msg)
                    return False
        tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
        if not tts:
            error = f"convert() error: {self.session['tts_engine']} is None"
            print(error)
            return False
        if sentence == TTS_SML['break']:
            silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
            break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.3 to 0.6 seconds
            self.audio_segments.append(break_tensor.clone())
            return True
        if sentence == TTS_SML['pause']:
            silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
            pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 1.0 to 1.8 seconds
            self.audio_segments.append(pause_tensor.clone())
            return True
        # A sentence ending on a letter/digit gets a trailing dash appended.
        if sentence[-1].isalnum():
            sentence = f'{sentence} —'
        if engine == TTS_ENGINES['XTTSv2']:
            trim_audio_buffer = 0.008
            if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
                settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
            else:
                msg = 'Computing speaker latents...'
                print(msg)
                if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
                else:
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = tts.get_conditioning_latents(audio_path=[settings['voice_path']])
                # Cache the latents so later sentences skip the computation.
                settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
            # Only forward the tuning values that are set in the session.
            fine_tuned_params = {
                key: cast_type(self.session[key])
                for key, cast_type in {
                    "temperature": float,
                    "length_penalty": float,
                    "num_beams": int,
                    "repetition_penalty": float,
                    "top_k": int,
                    "top_p": float,
                    "speed": float,
                    "enable_text_splitting": bool
                }.items()
                if self.session.get(key) is not None
            }
            with torch.no_grad():
                result = tts.inference(
                    text=sentence.replace('.', ' —'),
                    language=self.session['language_iso1'],
                    gpt_cond_latent=settings['gpt_cond_latent'],
                    speaker_embedding=settings['speaker_embedding'],
                    **fine_tuned_params
                )
            audio_sentence = result.get('wav')
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif engine == TTS_ENGINES['BARK']:
            trim_audio_buffer = 0.002
            # Bark text markup, for reference:
            #   [laughter] [laughs] [sighs] [music] [gasps] [clears throat]
            #   — or ... for hesitations
            #   ♪ for song lyrics
            #   CAPITALIZATION for emphasis of a word
            #   [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively
            if speaker in default_engine_settings[engine]['voices'].keys():
                bark_dir = default_engine_settings[engine]['speakers_path']
            else:
                bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
                if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker, self.session['device']):
                    error = 'Could not create npz file!'
                    print(error)
                    return False
            npz_file = os.path.join(bark_dir, speaker, f'{speaker}.npz')
            fine_tuned_params = {
                key: cast_type(self.session[key])
                for key, cast_type in {
                    "text_temp": float,
                    "waveform_temp": float
                }.items()
                if self.session.get(key) is not None
            }
            if self.npz_path is None or self.npz_path != npz_file:
                self.npz_path = npz_file
                # SECURITY NOTE: allow_pickle=True executes pickled objects
                # on load; only safe for npz files this application created.
                self.npz_data = np.load(self.npz_path, allow_pickle=True)
            history_prompt = [
                self.npz_data["semantic_prompt"],
                self.npz_data["coarse_prompt"],
                self.npz_data["fine_prompt"]
            ]
            with torch.no_grad():
                # Same seed for every sentence.
                torch.manual_seed(67878789)
                audio_sentence, _ = tts.generate_audio(
                    sentence,
                    history_prompt=history_prompt,
                    silent=True,
                    **fine_tuned_params
                )
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif engine == TTS_ENGINES['VITS']:
            speaker_argument = {}
            if self.session['language'] == 'eng' and 'vctk/vits' in models[engine]['internal']['sub']:
                if self.session['language'] in models[engine]['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in models[engine]['internal']['sub']['vctk/vits']:
                    speaker_argument = {"speaker": 'p262'}
            elif self.session['language'] == 'cat' and 'custom/vits' in models[engine]['internal']['sub']:
                if self.session['language'] in models[engine]['internal']['sub']['custom/vits'] or self.session['language_iso1'] in models[engine]['internal']['sub']['custom/vits']:
                    speaker_argument = {"speaker": '09901'}
            if settings['voice_path'] is not None:
                converted, audio_sentence = self._synthesize_with_voice_conversion(tts, settings, sentence, speaker_argument)
                if not converted:
                    return False
            else:
                audio_sentence = tts.tts(
                    text=sentence,
                    **speaker_argument
                )
        elif engine == TTS_ENGINES['FAIRSEQ']:
            speaker_argument = {}
            not_supported_punc_pattern = re.compile(r"[.:—]")
            text = re.sub(not_supported_punc_pattern, ' ', sentence)
            if settings['voice_path'] is not None:
                converted, audio_sentence = self._synthesize_with_voice_conversion(tts, settings, text, speaker_argument)
                if not converted:
                    return False
            else:
                audio_sentence = tts.tts(
                    text=text,
                    **speaker_argument
                )
        elif engine == TTS_ENGINES['TACOTRON2']:
            speaker_argument = {}
            not_supported_punc_pattern = re.compile(r'["—]')
            text = re.sub(not_supported_punc_pattern, '', sentence)
            if settings['voice_path'] is not None:
                converted, audio_sentence = self._synthesize_with_voice_conversion(tts, settings, text, speaker_argument)
                if not converted:
                    return False
            else:
                audio_sentence = tts.tts(
                    text=text,
                    **speaker_argument
                )
        elif engine == TTS_ENGINES['YOURTTS']:
            speaker_argument = {}
            language = self.session['language_iso1'] if self.session['language_iso1'] == 'en' else 'fr-fr' if self.session['language_iso1'] == 'fr' else 'pt-br' if self.session['language_iso1'] == 'pt' else 'en'
            if settings['voice_path'] is not None:
                speaker_wav = settings['voice_path']
                speaker_argument = {"speaker_wav": speaker_wav}
            else:
                voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
                speaker_argument = {"speaker": voice_key}
            with torch.no_grad():
                audio_sentence = tts.tts(
                    text=sentence.replace('—', '').strip(),
                    language=language,
                    **speaker_argument
                )
        # NOTE(review): the nesting of this tail was reconstructed from a
        # copy of the file that had lost its indentation; verify it against
        # the repository before merging.
        if is_audio_data_valid(audio_sentence):
            source_tensor = self._tensor_type(audio_sentence)
            audio_tensor = source_tensor.clone().detach().unsqueeze(0).cpu()
            if sentence[-1].isalnum() or sentence[-1] == '—':
                audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
            self.audio_segments.append(audio_tensor)
            # Sentence does not end on a word character: add a short silence.
            if not re.search(r'\w$', sentence, flags=re.UNICODE):
                silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
                self.audio_segments.append(break_tensor.clone())
            if self.audio_segments:
                # Queued break/pause segments are merged into this sentence.
                audio_tensor = torch.cat(self.audio_segments, dim=-1)
                start_time = self.sentences_total_time
                duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
                end_time = start_time + duration
                self.sentences_total_time = end_time
                sentence_obj = {
                    "start": start_time,
                    "end": end_time,
                    "text": sentence,
                    "resume_check": self.sentence_idx
                }
                self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
                if self.sentence_idx:
                    torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                    del audio_tensor
            self.audio_segments = []
            if os.path.exists(final_sentence_file):
                return True
            error = f"Cannot create {final_sentence_file}"
            print(error)
    except Exception as e:
        error = f'Coqui.convert(): {e}'
        raise ValueError(error) from e
    return False

def _synthesize_with_voice_conversion(self, tts, settings, text, speaker_argument):
    """Speak ``text`` with the builtin voice, then convert it to the cloned voice.

    Shared by the VITS, FAIRSEQ and TACOTRON2 branches of ``convert``.
    The builtin-voice audio is written to a temporary wav, pitch-shifted
    with sox when the detected genders differ, and passed through the
    voice-conversion engine using ``settings['voice_path']`` as target.

    Returns:
        ``(True, audio)`` on success, ``(False, None)`` when sox or the
        voice-conversion engine is unavailable (the error is printed).
    """
    proc_dir = os.path.join(self.session['voice_dir'], 'proc')
    os.makedirs(proc_dir, exist_ok=True)
    tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    source_wav = None
    target_wav = None
    try:
        tts.tts_to_file(
            text=text,
            file_path=tmp_in_wav,
            **speaker_argument
        )
        if settings['voice_path'] in settings['semitones'].keys():
            semitones = settings['semitones'][settings['voice_path']]
        else:
            voice_path_gender = detect_gender(settings['voice_path'])
            voice_builtin_gender = detect_gender(tmp_in_wav)
            msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
            print(msg)
            if voice_builtin_gender != voice_path_gender:
                semitones = -4 if voice_path_gender == 'male' else 4
                msg = f"Adapting builtin voice frequencies from the clone voice..."
                print(msg)
            else:
                semitones = 0
            # Remember the decision so gender detection runs once per voice.
            settings['semitones'][settings['voice_path']] = semitones
        # Shift in both directions: the previous "> 0" test skipped the
        # -4 semitone shift computed for male target voices.
        if semitones != 0:
            try:
                sox_bin = shutil.which('sox')
                if sox_bin is None:
                    raise FileNotFoundError('sox executable not found in PATH')
                cmd = [
                    sox_bin, tmp_in_wav,
                    "-r", str(settings['samplerate']), tmp_out_wav,
                    "pitch", str(semitones * 100)
                ]
                # check=True so a failing sox reaches the handler below;
                # stderr is captured so the message carries sox's output.
                subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True, check=True)
            except subprocess.CalledProcessError as e:
                error = f"Subprocess error: {e.stderr}"
                print(error)
                # Constructed but not raised, as in the original code;
                # presumably the constructor reports the error — TODO confirm.
                DependencyError(e)
                return False, None
            except FileNotFoundError as e:
                error = f"File not found: {e}"
                print(error)
                DependencyError(e)
                return False, None
        else:
            tmp_out_wav = tmp_in_wav
        tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
        if not tts_vc:
            error = f'Engine {self.tts_vc_key} is None'
            print(error)
            return False, None
        # From here on the engine settings use the converter's sample rate.
        settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
        source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
        target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
        audio_sentence = tts_vc.voice_conversion(
            source_wav=source_wav,
            target_wav=target_wav
        )
        return True, audio_sentence
    finally:
        # Remove every intermediate file on all exit paths, but never the
        # reference voice itself (_resample_wav may return it unchanged).
        leftovers = {tmp_in_wav, tmp_out_wav, source_wav, target_wav}
        leftovers.discard(None)
        leftovers.discard(settings['voice_path'])
        for path in leftovers:
            if os.path.exists(path):
                os.remove(path)
|
lib/classes/tts_manager.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from lib.models import TTS_ENGINES
|
4 |
+
|
5 |
+
class TTSManager:
    """Create the engine object for a session and route sentences to it."""

    def __init__(self, session):
        self.session = session
        # Stays None when no engine could be created.
        self.tts = None
        self._build()

    def _build(self):
        """Instantiate the engine selected by ``session['tts_engine']``.

        Returns True when ``self.tts`` was created, False otherwise
        (a message is printed in that case).
        """
        if self.session['tts_engine'] not in TTS_ENGINES.values():
            print('Other TTS engines coming soon!')
            return False
        coqui_engines = [
            TTS_ENGINES['XTTSv2'],
            TTS_ENGINES['BARK'],
            TTS_ENGINES['VITS'],
            TTS_ENGINES['FAIRSEQ'],
            TTS_ENGINES['TACOTRON2'],
            TTS_ENGINES['YOURTTS'],
        ]
        if self.session['tts_engine'] in coqui_engines:
            # Imported here, only when a Coqui engine is actually selected.
            from lib.classes.tts_engines.coqui import Coqui
            self.tts = Coqui(self.session)
        # Additional engine families would be dispatched here.
        if not self.tts:
            error = 'TTS engine could not be created!'
            print(error)
            return False
        return True

    def convert_sentence2audio(self, sentence_number, sentence):
        """Forward one sentence to the engine's ``convert``.

        Returns the engine's result, or False when the session engine is
        not a known ``TTS_ENGINES`` value.

        Raises:
            ValueError: wraps any exception raised during conversion.
        """
        try:
            engine_known = self.session['tts_engine'] in TTS_ENGINES.values()
            if engine_known:
                return self.tts.convert(sentence_number, sentence)
            print('Other TTS engines coming soon!')
        except Exception as e:
            error = f'convert_sentence2audio(): {e}'
            raise ValueError(e)
        return False
|
lib/classes/voice_extractor.py
ADDED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import regex as re
|
4 |
+
import scipy.fftpack
|
5 |
+
import soundfile as sf
|
6 |
+
import subprocess
|
7 |
+
import shutil
|
8 |
+
|
9 |
+
from io import BytesIO
|
10 |
+
from pydub import AudioSegment, silence
|
11 |
+
from pydub.silence import detect_silence
|
12 |
+
|
13 |
+
from lib.conf import voice_formats, default_audio_proc_samplerate
|
14 |
+
from lib.models import TTS_ENGINES, models
|
15 |
+
from lib.classes.background_detector import BackgroundDetector
|
16 |
+
|
17 |
+
class VoiceExtractor:
|
18 |
+
|
19 |
+
def __init__(self, session, voice_file, voice_name):
|
20 |
+
self.wav_file = None
|
21 |
+
self.session = session
|
22 |
+
self.voice_file = voice_file
|
23 |
+
self.voice_name = voice_name
|
24 |
+
self.voice_track = 'vocals.wav'
|
25 |
+
self.samplerate = models[session['tts_engine']][session['fine_tuned']]['samplerate']
|
26 |
+
self.output_dir = self.session['voice_dir']
|
27 |
+
self.demucs_dir = os.path.join(self.output_dir, 'htdemucs', voice_name)
|
28 |
+
self.silence_threshold = -60
|
29 |
+
|
30 |
+
def _validate_format(self):
|
31 |
+
file_extension = os.path.splitext(self.voice_file)[1].lower()
|
32 |
+
if file_extension in voice_formats:
|
33 |
+
msg = 'Input file valid'
|
34 |
+
return True, msg
|
35 |
+
error = f'Unsupported file format: {file_extension}. Supported formats are: {", ".join(voice_formats)}'
|
36 |
+
return False, error
|
37 |
+
|
38 |
+
def _convert2wav(self):
|
39 |
+
try:
|
40 |
+
self.wav_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
|
41 |
+
ffmpeg_cmd = [
|
42 |
+
shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_file,
|
43 |
+
'-ac', '1',
|
44 |
+
'-y', self.wav_file
|
45 |
+
]
|
46 |
+
process = subprocess.Popen(
|
47 |
+
ffmpeg_cmd,
|
48 |
+
env={},
|
49 |
+
stdout=subprocess.PIPE,
|
50 |
+
stderr=subprocess.STDOUT,
|
51 |
+
text=True,
|
52 |
+
universal_newlines=True,
|
53 |
+
encoding='utf-8'
|
54 |
+
)
|
55 |
+
for line in process.stdout:
|
56 |
+
print(line, end='') # Print each line of stdout
|
57 |
+
process.wait()
|
58 |
+
if process.returncode != 0:
|
59 |
+
error = f'_convert2wav(): process.returncode: {process.returncode}'
|
60 |
+
elif not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
|
61 |
+
error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
|
62 |
+
else:
|
63 |
+
msg = 'Conversion to .wav format for processing successful'
|
64 |
+
return True, msg
|
65 |
+
except subprocess.CalledProcessError as e:
|
66 |
+
error = f'convert2wav fmpeg.Error: {e.stderr.decode()}'
|
67 |
+
raise ValueError(error)
|
68 |
+
except Exception as e:
|
69 |
+
error = f'_convert2wav() error: {e}'
|
70 |
+
raise ValueError(error)
|
71 |
+
return False, error
|
72 |
+
|
73 |
+
def _detect_background(self):
|
74 |
+
try:
|
75 |
+
msg = 'Detecting any background noise or music...'
|
76 |
+
print(msg)
|
77 |
+
detector = BackgroundDetector(wav_file=self.wav_file)
|
78 |
+
status, report = detector.detect(vad_ratio_thresh=0.15)
|
79 |
+
print(report)
|
80 |
+
if status:
|
81 |
+
msg = 'Background noise or music detected. Proceeding voice extraction...'
|
82 |
+
else:
|
83 |
+
msg = 'No background noise or music detected. Skipping separation...'
|
84 |
+
return True, status, msg
|
85 |
+
except Exception as e:
|
86 |
+
error = f'_detect_background() error: {e}'
|
87 |
+
raise ValueError(error)
|
88 |
+
return False, False, error
|
89 |
+
|
90 |
+
def _demucs_voice(self):
|
91 |
+
try:
|
92 |
+
cmd = [
|
93 |
+
"demucs",
|
94 |
+
"--verbose",
|
95 |
+
"--two-stems=vocals",
|
96 |
+
"--out", self.output_dir,
|
97 |
+
self.wav_file
|
98 |
+
]
|
99 |
+
try:
|
100 |
+
process = subprocess.run(cmd, check=True)
|
101 |
+
self.voice_track = os.path.join(self.demucs_dir, self.voice_track)
|
102 |
+
msg = 'Voice track isolation successful'
|
103 |
+
return True, msg
|
104 |
+
except subprocess.CalledProcessError as e:
|
105 |
+
error = (
|
106 |
+
f'_demucs_voice() subprocess CalledProcessError error: {e.returncode}\n\n'
|
107 |
+
f'stdout: {e.output}\n\n'
|
108 |
+
f'stderr: {e.stderr}'
|
109 |
+
)
|
110 |
+
raise ValueError(error)
|
111 |
+
except FileNotFoundError:
|
112 |
+
error = f'_demucs_voice() subprocess FileNotFoundError error: The "demucs" command was not found. Ensure it is installed and in PATH.'
|
113 |
+
raise ValueError(error)
|
114 |
+
except Exception as e:
|
115 |
+
error = f'_demucs_voice() subprocess Exception error: {str(e)}'
|
116 |
+
raise ValueError(error)
|
117 |
+
except Exception as e:
|
118 |
+
error = f'_demucs_voice() error: {e}'
|
119 |
+
raise ValueError(error)
|
120 |
+
return False, error
|
121 |
+
|
122 |
+
def _remove_silences(self, audio, silence_threshold, min_silence_len=200, keep_silence=300):
|
123 |
+
final_audio = AudioSegment.silent(duration=0)
|
124 |
+
chunks = silence.split_on_silence(
|
125 |
+
audio,
|
126 |
+
min_silence_len=min_silence_len,
|
127 |
+
silence_thresh=silence_threshold,
|
128 |
+
keep_silence=keep_silence
|
129 |
+
)
|
130 |
+
for chunk in chunks:
|
131 |
+
final_audio += chunk
|
132 |
+
final_audio.export(self.voice_track, format='wav')
|
133 |
+
|
134 |
+
def _trim_and_clean(self,silence_threshold, min_silence_len=200, chunk_size=100):
|
135 |
+
try:
|
136 |
+
audio = AudioSegment.from_file(self.voice_track)
|
137 |
+
total_duration = len(audio) # Total duration in milliseconds
|
138 |
+
min_required_duration = 20000 if self.session['tts_engine'] == TTS_ENGINES['BARK'] else 12000
|
139 |
+
msg = f"Removing long pauses..."
|
140 |
+
print(msg)
|
141 |
+
self._remove_silences(audio, silence_threshold)
|
142 |
+
if total_duration <= min_required_duration:
|
143 |
+
msg = f"Audio is only {total_duration/1000:.2f}s long; skipping audio trimming..."
|
144 |
+
return True, msg
|
145 |
+
else:
|
146 |
+
if total_duration > (min_required_duration * 2):
|
147 |
+
msg = f"Audio longer than the max allowed. Proceeding to audio trimming..."
|
148 |
+
print(msg)
|
149 |
+
window = min_required_duration
|
150 |
+
hop = max(1, window // 4)
|
151 |
+
best_var = -float("inf")
|
152 |
+
best_start = 0
|
153 |
+
sr = audio.frame_rate
|
154 |
+
for start in range(0, total_duration - window + 1, hop):
|
155 |
+
chunk = audio[start : start + window]
|
156 |
+
samples = np.array(chunk.get_array_of_samples()).astype(float)
|
157 |
+
# 1) FFT + magnitude
|
158 |
+
spectrum = np.abs(scipy.fftpack.fft(samples))
|
159 |
+
# 2) turn into a probability distribution
|
160 |
+
p = spectrum / (np.sum(spectrum) + 1e-10)
|
161 |
+
# 3) spectral entropy
|
162 |
+
entropy = -np.sum(p * np.log2(p + 1e-10))
|
163 |
+
if entropy > best_var:
|
164 |
+
best_var = entropy
|
165 |
+
best_start = start
|
166 |
+
best_end = best_start + window
|
167 |
+
msg = (
|
168 |
+
f"Selected most‐diverse‐spectrum window "
|
169 |
+
f"{best_start/1000:.2f}s–{best_end/1000:.2f}s "
|
170 |
+
f"(@ entropy {best_var:.2f} bits)"
|
171 |
+
)
|
172 |
+
print(msg)
|
173 |
+
# 1) find all silent spans in the file
|
174 |
+
silence_spans = detect_silence(
|
175 |
+
audio,
|
176 |
+
min_silence_len=min_silence_len,
|
177 |
+
silence_thresh=silence_threshold
|
178 |
+
)
|
179 |
+
# silence_spans = [ [start_ms, end_ms], … ]
|
180 |
+
# 2) snap best_start *backward* to the end of the last silence before it
|
181 |
+
prev_ends = [end for (start, end) in silence_spans if end <= best_start]
|
182 |
+
if prev_ends:
|
183 |
+
new_start = max(prev_ends)
|
184 |
+
else:
|
185 |
+
new_start = 0
|
186 |
+
# 3) snap best_end *forward* to the start of the first silence after it
|
187 |
+
next_starts = [start for (start, end) in silence_spans if start >= best_end]
|
188 |
+
if next_starts:
|
189 |
+
new_end = min(next_starts)
|
190 |
+
else:
|
191 |
+
new_end = total_duration
|
192 |
+
# 4) update your slice bounds
|
193 |
+
best_start, best_end = new_start, new_end
|
194 |
+
else:
|
195 |
+
best_start = 0
|
196 |
+
best_end = total_duration
|
197 |
+
trimmed_audio = audio[best_start:best_end]
|
198 |
+
trimmed_audio.export(self.voice_track, format='wav')
|
199 |
+
msg = 'Audio trimmed and cleaned!'
|
200 |
+
return True, msg
|
201 |
+
except Exception as e:
|
202 |
+
error = f'_trim_and_clean() error: {e}'
|
203 |
+
raise ValueError(error)
|
204 |
+
|
205 |
+
def _normalize_audio(self):
|
206 |
+
error = ''
|
207 |
+
try:
|
208 |
+
proc_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
|
209 |
+
final_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
|
210 |
+
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
|
211 |
+
filter_complex = (
|
212 |
+
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
|
213 |
+
'afftdn=nf=-70,'
|
214 |
+
'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
|
215 |
+
'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
|
216 |
+
'equalizer=f=150:t=q:w=2:g=1,'
|
217 |
+
'equalizer=f=250:t=q:w=2:g=-3,'
|
218 |
+
'equalizer=f=3000:t=q:w=2:g=2,'
|
219 |
+
'equalizer=f=5500:t=q:w=2:g=-4,'
|
220 |
+
'equalizer=f=9000:t=q:w=2:g=-2,'
|
221 |
+
'highpass=f=63[audio]'
|
222 |
+
)
|
223 |
+
ffmpeg_cmd += [
|
224 |
+
'-filter_complex', filter_complex,
|
225 |
+
'-map', '[audio]',
|
226 |
+
'-ar', f'{default_audio_proc_samplerate}',
|
227 |
+
'-y', proc_voice_file
|
228 |
+
]
|
229 |
+
try:
|
230 |
+
process = subprocess.Popen(
|
231 |
+
ffmpeg_cmd,
|
232 |
+
env={},
|
233 |
+
stdout=subprocess.PIPE,
|
234 |
+
stderr=subprocess.PIPE,
|
235 |
+
encoding='utf-8',
|
236 |
+
errors='ignore'
|
237 |
+
)
|
238 |
+
for line in process.stdout:
|
239 |
+
print(line, end='') # Print each line of stdout
|
240 |
+
process.wait()
|
241 |
+
if process.returncode != 0:
|
242 |
+
error = f'_normalize_audio(): process.returncode: {process.returncode}'
|
243 |
+
elif not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
|
244 |
+
error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
|
245 |
+
else:
|
246 |
+
os.replace(proc_voice_file, final_voice_file)
|
247 |
+
shutil.rmtree(self.demucs_dir, ignore_errors=True)
|
248 |
+
msg = 'Audio normalization successful!'
|
249 |
+
return True, msg
|
250 |
+
except subprocess.CalledProcessError as e:
|
251 |
+
error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
|
252 |
+
except FileNotFoundError as e:
|
253 |
+
error = '_normalize_audio() FileNotFoundError: {e} Input file or FFmpeg PATH not found!'
|
254 |
+
except Exception as e:
|
255 |
+
error = f'_normalize_audio() error: {e}'
|
256 |
+
return False, error
|
257 |
+
|
258 |
+
def extract_voice(self):
    """Run the voice-extraction pipeline step by step.

    The steps run in order: format validation, conversion to WAV,
    background detection, optional Demucs separation, trimming/cleaning
    and finally normalization. Each step returns a success flag and a
    message; the message is printed and the pipeline stops at the first
    step that reports failure.

    Returns:
        tuple: ``(success, msg)`` where ``success`` is the flag of the last
        step that ran and ``msg`` is its message.

    Raises:
        ValueError: if any step raises; the original exception is chained
        as the cause.
    """
    success = False
    msg = None
    try:
        success, msg = self._validate_format()
        print(msg)
        if success:
            success, msg = self._convert2wav()
            print(msg)
        if success:
            # `status` tells whether the separation step is needed.
            success, status, msg = self._detect_background()
            print(msg)
        if success:
            if status:
                success, msg = self._demucs_voice()
                print(msg)
            else:
                # Nothing to separate: use the converted WAV as voice track.
                self.voice_track = self.wav_file
        if success:
            success, msg = self._trim_and_clean(self.silence_threshold)
            print(msg)
        if success:
            success, msg = self._normalize_audio()
            print(msg)
    except Exception as e:
        msg = f'extract_voice() error: {e}'
        raise ValueError(msg) from e
    finally:
        # Remove the working directory on every exit path, including the
        # error path, which previously skipped this cleanup.
        shutil.rmtree(self.demucs_dir, ignore_errors=True)
    return success, msg
|
lib/conf.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import platform
|
3 |
+
|
4 |
+
# Working directories, resolved against the current working directory.
tmp_dir = os.path.abspath('tmp')
tmp_expire = 7  # days

models_dir = os.path.abspath('models')
ebooks_dir = os.path.abspath('ebooks')
voices_dir = os.path.abspath('voices')
tts_dir = os.path.join(models_dir, 'tts')

# Process-wide environment, applied in one call. The cache/home variables
# all point inside models_dir so downloaded assets stay in the project tree.
os.environ.update({
    'PYTHONUTF8': '1',
    'PYTHONIOENCODING': 'utf-8',
    'COQUI_TOS_AGREED': '1',
    'CALIBRE_NO_NATIVE_FILEDIALOGS': '1',
    'GRADIO_DEBUG': '1',
    'DO_NOT_TRACK': 'true',
    'CALIBRE_TEMP_DIR': tmp_dir,
    'CALIBRE_CACHE_DIRECTORY': tmp_dir,
    'HUGGINGFACE_HUB_CACHE': tts_dir,
    'HF_HOME': tts_dir,
    'HF_DATASETS_CACHE': tts_dir,
    'BARK_CACHE_DIR': tts_dir,
    'TTS_CACHE': tts_dir,
    'TORCH_HOME': tts_dir,
    'TTS_HOME': models_dir,
    'XDG_CACHE_HOME': models_dir,
    'STANZA_RESOURCES_DIR': os.path.join(models_dir, 'stanza'),
    'ARGOS_TRANSLATE_PACKAGE_PATH': os.path.join(models_dir, 'argostranslate'),
    'TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD': '1',
    'PYTORCH_ENABLE_MPS_FALLBACK': '1',
    'SUNO_OFFLOAD_CPU': 'False',  # BARK option: False needs A GPU
    'SUNO_USE_SMALL_MODELS': 'False',  # BARK option: False needs a GPU with VRAM > 4GB
})

if platform.system() == 'Windows':
    # espeak-ng data location of a scoop-based install under the user profile.
    os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
|
37 |
+
|
38 |
+
# Program version, read once at import time from VERSION.txt in the current
# working directory. The context manager closes the file handle right away
# instead of leaving it open until garbage collection.
with open('VERSION.txt', encoding='utf-8') as _version_file:
    prog_version = _version_file.read().strip()
|
39 |
+
|
40 |
+
# Minimum and maximum Python versions as (major, minor) tuples.
min_python_version = (3, 10)
max_python_version = (3, 12)

# Run-mode identifiers (presumably compared against a script-mode option
# elsewhere; verify against callers).
NATIVE = 'native'
FULL_DOCKER = 'full_docker'

debug_mode = True

# Processing devices offered to the user and the one used by default.
device_list = ['cpu', 'gpu', 'mps']
default_device = 'cpu'
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">howto wiki</a>'

python_env_dir = os.path.abspath(os.path.join('.', 'python_env'))
requirements_file = os.path.abspath(os.path.join('.', 'requirements.txt'))

# Web interface settings.
interface_host = '0.0.0.0'
interface_port = 7860
interface_shared_tmp_expire = 3  # in days
interface_concurrency_limit = 1  # or None for unlimited

# Per-component switches for the interface, keyed by component name.
interface_component_options = {
    "gr_tab_xtts_params": True,
    "gr_tab_bark_params": True,
    "gr_group_voice_file": True,
    "gr_group_custom_model": True,
}

# Output locations, one per front end.
audiobooks_gradio_dir = os.path.abspath(os.path.join('audiobooks', 'gui', 'gradio'))
audiobooks_host_dir = os.path.abspath(os.path.join('audiobooks', 'gui', 'host'))
audiobooks_cli_dir = os.path.abspath(os.path.join('audiobooks', 'cli'))

# Add or remove the formats you accept as input.
ebook_formats = [
    '.epub', '.mobi', '.azw3', '.fb2', '.lrf', '.rb', '.snb', '.tcr',
    '.pdf', '.txt', '.rtf', '.doc', '.docx', '.html', '.odt', '.azw',
]
# Add or remove the formats you accept as input.
voice_formats = [
    '.mp4', '.m4b', '.m4a', '.mp3', '.wav', '.aac', '.flac', '.alac', '.ogg',
    '.aiff', '.aif', '.wma', '.dsd', '.opus', '.pcmu', '.pcma', '.gsm',
]
output_formats = [
    'aac', 'flac', 'mp3', 'm4b', 'm4a',
    'mp4', 'mov', 'ogg', 'wav', 'webm',
]

default_audio_proc_samplerate = 24000
# or 'mp3', 'aac', 'm4a', 'm4b', 'amr', '3gp', 'alac'.
# 'wav' format is ok but limited to process files < 4GB
default_audio_proc_format = 'flac'
default_output_format = 'm4b'
default_output_split = False
# If the final output exceeds output_split_hours * 2 hours, the final file is
# split into parts of output_split_hours each, plus the remainder if any.
default_output_split_hours = '6'
|
lib/functions.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lib/lang.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lib/models.py
ADDED
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from lib.conf import tts_dir, voices_dir
|
4 |
+
# Starts empty; presumably filled elsewhere with the TTS engines currently
# held in memory (verify against callers).
loaded_tts = {}

# Engine display name -> internal engine identifier.
TTS_ENGINES = {
    'XTTSv2': 'xtts',
    'BARK': 'bark',
    'VITS': 'vits',
    'FAIRSEQ': 'fairseq',
    'TACOTRON2': 'tacotron',
    'YOURTTS': 'yourtts',
}

# Voice-conversion models: model path and the sample rate listed for it.
TTS_VOICE_CONVERSION = {
    'freevc24': {
        'path': 'voice_conversion_models/multilingual/vctk/freevc24',
        'samplerate': 24000,
    },
    'knnvc': {
        'path': 'voice_conversion_models/multilingual/multi-dataset/knnvc',
        'samplerate': 16000,
    },
    'openvoice_v1': {
        'path': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
        'samplerate': 22050,
    },
    'openvoice_v2': {
        'path': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
        'samplerate': 22050,
    },
}

# Markup keyword -> in-text marker. '###' maps to the same marker as 'pause'.
TTS_SML = {
    'break': '‡break‡',
    'pause': '‡pause‡',
    '###': '‡pause‡',
}

default_tts_engine = TTS_ENGINES['XTTSv2']
default_fine_tuned = 'internal'
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
default_voice_detection_model = 'drewThomasson/segmentation'

# TTS engines to keep in memory (1 tts engine ~= 4GB to 8GB RAM).
max_tts_in_memory = 2
max_custom_model = 100
max_custom_voices = 1000
max_upload_size = '6GB'
38 |
+
# Default settings per TTS engine, keyed by the engine identifiers from
# TTS_ENGINES. Every entry carries "samplerate", "files", "voices" and
# "rating"; XTTSv2 and BARK add engine-specific parameters.
default_engine_settings = {
    TTS_ENGINES['XTTSv2']: {
        "samplerate": 24000,
        "temperature": 0.75,
        "length_penalty": 1.0,
        "num_beams": 1,
        "repetition_penalty": 3.0,
        "top_k": 50,
        "top_p": 0.85,
        "speed": 1.0,
        "enable_text_splitting": False,
        # to enable deepspeed, you must install it first:
        # conda activate ./python_env (linux/mac) or .\python_env (windows)
        # pip install deepspeed
        # conda deactivate
        "use_deepspeed": False,
        "files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav', 'speakers_xtts.pth'],
        # Each key is the display name with its space removed.
        "voices": {
            speaker.replace(' ', ''): speaker
            for speaker in (
                "Claribel Dervla", "Daisy Studious", "Gracie Wise",
                "Tammie Ema", "Alison Dietlinde", "Ana Florence",
                "Annmarie Nele", "Asya Anara", "Brenda Stern",
                "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
                "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie",
                "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
                "Royston Min", "Viktor Eka", "Abrahan Mack",
                "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
                "Damien Black", "Gilberto Mathias", "Ilkin Urbano",
                "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
                "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios",
                "Nova Hogarth", "Maja Ruoho", "Uta Obando",
                "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger",
                "Camilla Holmström", "Lilya Stainthorpe", "Zofija Kendrick",
                "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa",
                "Alma María", "Rosemary Okafor", "Ige Behringer",
                "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro",
                "Aaron Dreschner", "Kumar Dahl", "Eugenio Mataracı",
                "Ferran Simen", "Xavier Hayasaka", "Luis Moray",
                "Marcos Rudaski",
            )
        },
        "rating": {"GPU VRAM": 4, "CPU": 3, "RAM": 8, "Realism": 4},
    },
    TTS_ENGINES['BARK']: {
        "samplerate": 24000,
        "text_temp": 0.50,
        "waveform_temp": 0.50,
        "files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
        "speakers_path": os.path.join(voices_dir, '__bark'),
        # Ten speakers (0-9) per language prefix, in language order.
        "voices": {
            f"{lang}_speaker_{idx}": f"Speaker {idx}"
            for lang in (
                "de", "en", "es", "fr", "hi", "it", "ja",
                "ko", "pl", "pt", "ru", "tr", "zh",
            )
            for idx in range(10)
        },
        "rating": {"GPU VRAM": 4, "CPU": 1, "RAM": 16, "Realism": 3},
    },
    TTS_ENGINES['VITS']: {
        "samplerate": 22050,
        "files": ['config.json', 'model_file.pth', 'language_ids.json'],
        "voices": {},
        "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2},
    },
    TTS_ENGINES['FAIRSEQ']: {
        "samplerate": 16000,
        "files": ['config.json', 'G_100000.pth', 'vocab.json'],
        "voices": {},
        "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2},
    },
    TTS_ENGINES['TACOTRON2']: {
        "samplerate": 22050,
        "files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
        "voices": {},
        "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2},
    },
    TTS_ENGINES['YOURTTS']: {
        "samplerate": 16000,
        "files": ['config.json', 'model_file.pth'],
        # NOTE(review): the trailing '\n' on the two "pt" speaker ids is kept
        # exactly as found; presumably it matches the ids stored with the
        # model. Confirm before stripping it.
        "voices": {
            "Machinella-5": "female-en-5",
            "ElectroMale-2": "male-en-2",
            'Machinella-4': 'female-pt-4\n',
            'ElectroMale-3': 'male-pt-3\n',
        },
        "rating": {"GPU VRAM": 1, "CPU": 5, "RAM": 4, "Realism": 1},
    },
}
|
158 |
+
models = {
|
159 |
+
TTS_ENGINES['XTTSv2']: {
|
160 |
+
"internal": {
|
161 |
+
"lang": "multi",
|
162 |
+
"repo": "coqui/XTTS-v2",
|
163 |
+
"sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
|
164 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
165 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
166 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
167 |
+
},
|
168 |
+
"AiExplained": {
|
169 |
+
"lang": "eng",
|
170 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
171 |
+
"sub": "xtts-v2/eng/AiExplained/",
|
172 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
|
173 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
174 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
175 |
+
},
|
176 |
+
"AsmrRacoon": {
|
177 |
+
"lang": "eng",
|
178 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
179 |
+
"sub": "xtts-v2/eng/AsmrRacoon/",
|
180 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
|
181 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
182 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
183 |
+
},
|
184 |
+
"Awkwafina": {
|
185 |
+
"lang": "eng",
|
186 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
187 |
+
"sub": "xtts-v2/eng/Awkwafina/",
|
188 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
|
189 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
190 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
191 |
+
},
|
192 |
+
"BobOdenkirk": {
|
193 |
+
"lang": "eng",
|
194 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
195 |
+
"sub": "xtts-v2/eng/BobOdenkirk/",
|
196 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
|
197 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
198 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
199 |
+
},
|
200 |
+
"BobRoss": {
|
201 |
+
"lang": "eng",
|
202 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
203 |
+
"sub": "xtts-v2/eng/BobRoss/",
|
204 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
|
205 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
206 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
207 |
+
},
|
208 |
+
"BrinaPalencia": {
|
209 |
+
"lang": "eng",
|
210 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
211 |
+
"sub": "xtts-v2/eng/BrinaPalencia/",
|
212 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
|
213 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
214 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
215 |
+
},
|
216 |
+
"BryanCranston": {
|
217 |
+
"lang": "eng",
|
218 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
219 |
+
"sub": "xtts-v2/eng/BryanCranston/",
|
220 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
|
221 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
222 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
223 |
+
},
|
224 |
+
"DavidAttenborough": {
|
225 |
+
"lang": "eng",
|
226 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
227 |
+
"sub": "xtts-v2/eng/DavidAttenborough/",
|
228 |
+
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
|
229 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
230 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
231 |
+
},
|
232 |
+
"DeathPussInBoots": {
|
233 |
+
"lang": "eng",
|
234 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
235 |
+
"sub": "xtts-v2/eng/DeathPussInBoots/",
|
236 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
|
237 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
238 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
239 |
+
},
|
240 |
+
"DermotCrowley": {
|
241 |
+
"lang": "eng",
|
242 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
243 |
+
"sub": "xtts-v2/eng/DermotCrowley/",
|
244 |
+
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
|
245 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
246 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
247 |
+
},
|
248 |
+
"EvaSeymour": {
|
249 |
+
"lang": "eng",
|
250 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
251 |
+
"sub": "xtts-v2/eng/EvaSeymour/",
|
252 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
|
253 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
254 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
255 |
+
},
|
256 |
+
"GideonOfnirEldenRing": {
|
257 |
+
"lang": "eng",
|
258 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
259 |
+
"sub": "xtts-v2/eng/GideonOfnirEldenRing/",
|
260 |
+
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
|
261 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
262 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
263 |
+
},
|
264 |
+
"GhostMW2": {
|
265 |
+
"lang": "eng",
|
266 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
267 |
+
"sub": "xtts-v2/eng/GhostMW2/",
|
268 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
|
269 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
270 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
271 |
+
},
|
272 |
+
"JhonButlerASMR": {
|
273 |
+
"lang": "eng",
|
274 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
275 |
+
"sub": "xtts-v2/eng/JhonButlerASMR/",
|
276 |
+
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JhonButlerASMR.wav'),
|
277 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
278 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
279 |
+
},
|
280 |
+
"JhonMulaney": {
|
281 |
+
"lang": "eng",
|
282 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
283 |
+
"sub": "xtts-v2/eng/JhonMulaney/",
|
284 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JhonMulaney.wav'),
|
285 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
286 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
287 |
+
},
|
288 |
+
"JillRedfield": {
|
289 |
+
"lang": "eng",
|
290 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
291 |
+
"sub": "xtts-v2/eng/JillRedfield/",
|
292 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
|
293 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
294 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
295 |
+
},
|
296 |
+
"JuliaWhenlan": {
|
297 |
+
"lang": "eng",
|
298 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
299 |
+
"sub": "xtts-v2/eng/JuliaWhenlan/",
|
300 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
|
301 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
302 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
303 |
+
},
|
304 |
+
"LeeHorsley": {
|
305 |
+
"lang": "eng",
|
306 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
307 |
+
"sub": "xtts-v2/eng/LeeHorsley/",
|
308 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
|
309 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
310 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
311 |
+
},
|
312 |
+
"MelinaEldenRing": {
|
313 |
+
"lang": "eng",
|
314 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
315 |
+
"sub": "xtts-v2/eng/MelinaEldenRing/",
|
316 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
|
317 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
318 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
319 |
+
},
|
320 |
+
"MorganFreeman": {
|
321 |
+
"lang": "eng",
|
322 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
323 |
+
"sub": "xtts-v2/eng/MorganFreeman/",
|
324 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
|
325 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
326 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
327 |
+
},
|
328 |
+
"NeilGaiman": {
|
329 |
+
"lang": "eng",
|
330 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
331 |
+
"sub": "xtts-v2/eng/NeilGaiman/",
|
332 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
|
333 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
334 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
335 |
+
},
|
336 |
+
"RainyDayHeadSpace": {
|
337 |
+
"lang": "eng",
|
338 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
339 |
+
"sub": "xtts-v2/eng/RainyDayHeadSpace/",
|
340 |
+
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
|
341 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
342 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
343 |
+
},
|
344 |
+
"RayPorter": {
|
345 |
+
"lang": "eng",
|
346 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
347 |
+
"sub": "xtts-v2/eng/RayPorter/",
|
348 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
|
349 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
350 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
351 |
+
},
|
352 |
+
"RelaxForAWhile": {
|
353 |
+
"lang": "eng",
|
354 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
355 |
+
"sub": "xtts-v2/eng/RelaxForAWhile/",
|
356 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
|
357 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
358 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
359 |
+
},
|
360 |
+
"RosamundPike": {
|
361 |
+
"lang": "eng",
|
362 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
363 |
+
"sub": "xtts-v2/eng/RosamundPike/",
|
364 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
|
365 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
366 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
367 |
+
},
|
368 |
+
"ScarlettJohansson": {
|
369 |
+
"lang": "eng",
|
370 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
371 |
+
"sub": "xtts-v2/eng/ScarlettJohansson/",
|
372 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
|
373 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
374 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
375 |
+
},
|
376 |
+
"SladeTeenTitans": {
|
377 |
+
"lang": "eng",
|
378 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
379 |
+
"sub": "xtts-v2/eng/SladeTeenTitans/",
|
380 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
|
381 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
382 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
383 |
+
},
|
384 |
+
"StanleyParable": {
|
385 |
+
"lang": "eng",
|
386 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
387 |
+
"sub": "xtts-v2/eng/StanleyParable/",
|
388 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
|
389 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
390 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
391 |
+
},
|
392 |
+
"WhisperSalemASMR": {
|
393 |
+
"lang": "eng",
|
394 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
395 |
+
"sub": "xtts-v2/eng/WhisperSalemASMR/",
|
396 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
|
397 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
398 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
399 |
+
},
|
400 |
+
"Konishev": {
|
401 |
+
"lang": "rus",
|
402 |
+
"repo": "drewThomasson/fineTunedTTSModels",
|
403 |
+
"sub": "xtts-v2/rus/Konishev/",
|
404 |
+
"voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
|
405 |
+
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
406 |
+
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
407 |
+
}
|
408 |
+
},
|
409 |
+
TTS_ENGINES['BARK']: {
|
410 |
+
"internal": {
|
411 |
+
"lang": "multi",
|
412 |
+
"repo": "erogol/bark", # suno/bark, rsxdalv/suno, tts_models/multilingual/multi-dataset/bark
|
413 |
+
"sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
|
414 |
+
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
415 |
+
"files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
|
416 |
+
"samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
|
417 |
+
}
|
418 |
+
},
|
419 |
+
TTS_ENGINES['VITS']: {
|
420 |
+
"internal": {
|
421 |
+
"lang": "multi",
|
422 |
+
"repo": "tts_models/[lang_iso1]/[xxx]",
|
423 |
+
"sub": {
|
424 |
+
"css10/vits": ['es','hu','fi','fr','nl','ru','el'],
|
425 |
+
"custom/vits": ['ca'],
|
426 |
+
"custom/vits-female": ['bn', 'fa'],
|
427 |
+
"cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
|
428 |
+
"mai/vits": ['uk'],
|
429 |
+
"mai_female/vits": ['pl'],
|
430 |
+
"mai_male/vits": ['it'],
|
431 |
+
"openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
|
432 |
+
"vctk/vits": ['en'],
|
433 |
+
"thorsten/vits": ['de']
|
434 |
+
},
|
435 |
+
"voice": None,
|
436 |
+
"files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
|
437 |
+
"samplerate": {
|
438 |
+
"css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
439 |
+
"custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
440 |
+
"custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
441 |
+
"cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
442 |
+
"mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
443 |
+
"mai_female/vits": 24000,
|
444 |
+
"mai_male/vits": 16000,
|
445 |
+
"openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
446 |
+
"vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
447 |
+
"thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
|
448 |
+
}
|
449 |
+
}
|
450 |
+
},
|
451 |
+
TTS_ENGINES['FAIRSEQ']: {
|
452 |
+
"internal": {
|
453 |
+
"lang": "multi",
|
454 |
+
"repo": "tts_models/[lang]/fairseq/vits",
|
455 |
+
"sub": "",
|
456 |
+
"voice": None,
|
457 |
+
"files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
|
458 |
+
"samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
|
459 |
+
}
|
460 |
+
},
|
461 |
+
TTS_ENGINES['TACOTRON2']: {
|
462 |
+
"internal": {
|
463 |
+
"lang": "multi",
|
464 |
+
"repo": "tts_models/[lang_iso1]/[xxx]",
|
465 |
+
"sub": {
|
466 |
+
"mai/tacotron2-DDC": ['fr', 'es', 'nl'],
|
467 |
+
"thorsten/tacotron2-DDC": ['de'],
|
468 |
+
"kokoro/tacotron2-DDC": ['ja'],
|
469 |
+
"ljspeech/tacotron2-DDC": ['en'],
|
470 |
+
"baker/tacotron2-DDC-GST": ['zh-CN']
|
471 |
+
},
|
472 |
+
"voice": None,
|
473 |
+
"files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
|
474 |
+
"samplerate": {
|
475 |
+
"mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
476 |
+
"thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
477 |
+
"kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
478 |
+
"ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
479 |
+
"baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
|
480 |
+
},
|
481 |
+
}
|
482 |
+
},
|
483 |
+
TTS_ENGINES['YOURTTS']: {
|
484 |
+
"internal": {
|
485 |
+
"lang": "multi",
|
486 |
+
"repo": "tts_models/multilingual/multi-dataset/your_tts",
|
487 |
+
"sub": "",
|
488 |
+
"voice": None,
|
489 |
+
"files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
|
490 |
+
"samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
|
491 |
+
}
|
492 |
+
}
|
493 |
+
}
|