Commit aa7ea23 (verified), committed by priteshmistry
1 Parent: d800e94

Upload 22 files
.gitattributes CHANGED
@@ -47,3 +47,5 @@ ebook2audiobook.egg-info/assets/gui_1.png filter=lfs diff=lfs merge=lfs -text
  ebook2audiobook.egg-info/assets/gui_2.png filter=lfs diff=lfs merge=lfs -text
  ebook2audiobook.egg-info/assets/gui_3.png filter=lfs diff=lfs merge=lfs -text
  ebook2audiobook.egg-info/assets/Rainy_Day_voice_Demo.mp4 filter=lfs diff=lfs merge=lfs -text
+ lib/__pycache__/functions.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+ lib/__pycache__/lang.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
lib/__init__.py ADDED
@@ -0,0 +1,57 @@
+ from .models import (
+     TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
+     default_engine_settings, default_vc_model, default_voice_detection_model,
+     loaded_tts, max_custom_model, max_custom_voices,
+     max_tts_in_memory, max_upload_size, models, os, voices_dir
+ )
+
+ from .conf import (
+     FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
+     audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
+     default_audio_proc_format, default_device, default_gpu_wiki,
+     default_output_format, device_list, ebook_formats,
+     ebooks_dir, interface_component_options, interface_concurrency_limit,
+     interface_host, interface_port, interface_shared_tmp_expire,
+     max_python_version, min_python_version, models_dir, os,
+     output_formats, platform, prog_version, python_env_dir,
+     requirements_file, tmp_dir, tmp_expire, tts_dir, voice_formats,
+     voices_dir, default_output_split, default_output_split_hours
+ )
+
+ from .lang import (
+     abbreviations_mapping, chapter_word_mapping, default_language_code,
+     roman_numbers_tuples, emojis_list, install_info, language_mapping,
+     language_math_phonemes, language_clock, language_tts, os, punctuation_list,
+     punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
+     punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
+     specialchars_mapping, specialchars_remove, year_to_decades_languages
+ )
+
+ __all__ = [
+     # from models
+     "TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
+     "default_engine_settings", "default_vc_model", "default_voice_detection_model",
+     "loaded_tts", "max_custom_model",
+     "max_custom_voices", "max_tts_in_memory", "max_upload_size",
+     "models", "os", "voices_dir",
+
+     # from conf
+     "FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
+     "audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
+     "default_audio_proc_format", "default_device", "default_gpu_wiki",
+     "default_output_format", "device_list", "ebook_formats", "ebooks_dir",
+     "interface_component_options", "interface_concurrency_limit",
+     "interface_host", "interface_port", "interface_shared_tmp_expire",
+     "max_python_version", "min_python_version", "models_dir", "os",
+     "output_formats", "platform", "prog_version", "python_env_dir",
+     "requirements_file", "tmp_dir", "tmp_expire", "tts_dir",
+     "voice_formats", "voices_dir", "default_output_split", "default_output_split_hours",
+
+     # from lang
+     "abbreviations_mapping", "chapter_word_mapping", "default_language_code",
+     "roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
+     "language_math_phonemes", "language_clock", "language_tts", "os", "punctuation_list",
+     "punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
+     "punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
+     "specialchars_mapping", "specialchars_remove", "year_to_decades_languages"
+ ]
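
Usage note (illustrative, not part of this commit): the re-exports above let any module in the project do flat imports from `lib`, which the engine files below rely on via `from lib import *`. A minimal sketch:

    # Hypothetical consumer module; all names are re-exported by lib/__init__.py above.
    from lib import TTS_ENGINES, default_tts_engine, voices_dir

    print(default_tts_engine)   # engine key used when none is selected
    print(voices_dir)           # directory voice files are resolved from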
lib/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (2.64 kB)
 
lib/__pycache__/conf.cpython-312.pyc ADDED
Binary file (4.98 kB)
 
lib/__pycache__/functions.cpython-312.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59b1809dd2e4e86864d8ff51fbdade7548389b92cd6f3b24d9e9a54235eb0de2
+ size 236223
lib/__pycache__/lang.cpython-312.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff1e8d413d7881648a9aa7ffae42617ebc430ee61b2523706c9eb8315889c86e
+ size 228874
lib/__pycache__/models.cpython-312.pyc ADDED
Binary file (20.8 kB)
 
lib/classes/__pycache__/background_detector.cpython-312.pyc ADDED
Binary file (2.32 kB)
 
lib/classes/__pycache__/tts_manager.cpython-312.pyc ADDED
Binary file (2.15 kB)
 
lib/classes/__pycache__/voice_extractor.cpython-312.pyc ADDED
Binary file (14.3 kB)
 
lib/classes/argos_translator.py ADDED
@@ -0,0 +1,122 @@
+ import os
+ import tempfile
+ import argostranslate.package
+ import argostranslate.translate
+
+ from iso639 import languages
+ from lib.conf import models_dir
+ from lib.lang import language_mapping
+
+ # NOTE: source_lang and target_lang must be ISO 639-1 codes (2 letters)
+
+ class ArgosTranslator:
+
+     def __init__(self, neural_machine="argostranslate"):
+         self.neural_machine = neural_machine
+         self.translation = None
+
+     def get_language_iso3(self, lang_iso1):
+         lang = lang_iso1
+         try:
+             lang_array = languages.get(part1=lang_iso1)
+             if lang_array:
+                 lang = lang_array.part3
+         except Exception:
+             pass
+         return lang
+
+     def get_all_sources_lang(self):
+         available_packages = argostranslate.package.get_available_packages()
+         return sorted(set(pkg.from_code for pkg in available_packages))
+
+     def get_all_targets_lang(self, source_lang):
+         available_packages = argostranslate.package.get_available_packages()
+         list_iso1 = sorted(set(pkg.to_code for pkg in available_packages if pkg.from_code == source_lang))
+         language_translate_mapping = {}
+         for iso1 in list_iso1:
+             try:
+                 iso3 = self.get_language_iso3(iso1)
+                 if iso3 in language_mapping:
+                     language_translate_mapping[iso3] = dict(language_mapping[iso3])
+                     language_translate_mapping[iso3]["iso1"] = iso1
+             except KeyError:
+                 pass
+         language_translate_options = [
+             (
+                 f"{details['name']} - {details['native_name']}" if details['name'] != details['native_name'] else details['name'],
+                 lang
+             )
+             for lang, details in language_translate_mapping.items()
+         ]
+         return language_translate_options
+
+     def get_all_target_packages(self, source_lang):
+         available_packages = argostranslate.package.get_available_packages()
+         return [pkg for pkg in available_packages if pkg.from_code == source_lang]
+
+     def is_package_installed(self, source_lang, target_lang):
+         try:
+             installed_languages = argostranslate.translate.get_installed_languages()
+             source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
+             target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
+             return source_language is not None and target_language is not None
+         except Exception as e:
+             error = f'is_package_installed() error: {e}'
+             print(error)
+             return False
+
+     def download_and_install_argos_package(self, source_lang, target_lang):
+         try:
+             if self.is_package_installed(source_lang, target_lang):
+                 msg = f"Package for translation from {source_lang} to {target_lang} is already installed."
+                 print(msg)
+                 return msg, True
+             available_packages = self.get_all_target_packages(source_lang)
+             target_package = None
+             for pkg in available_packages:
+                 if pkg.from_code == source_lang and pkg.to_code == target_lang:
+                     target_package = pkg
+                     break
+             if target_package:
+                 with tempfile.TemporaryDirectory() as tmpdirname:
+                     print(f"Downloading package for translation from {source_lang} to {target_lang}...")
+                     package_path = target_package.download()
+                     argostranslate.package.install_from_path(package_path)
+                     print(f"Package installed for translation from {source_lang} to {target_lang}")
+                 return None, True
+             else:
+                 msg = f"No available package found for translation from {source_lang} to {target_lang}."
+                 return msg, False
+         except Exception as e:
+             error = f'download_and_install_argos_package() error: {e}'
+             return error, False
+
+     def process(self, text):
+         try:
+             return self.translation.translate(text), True
+         except Exception as e:
+             error = f'ArgosTranslator.process() error: {e}'
+             return error, False
+
+     def start(self, source_lang, target_lang):
+         try:
+             if self.neural_machine != "argostranslate":
+                 error = f"Neural machine '{self.neural_machine}' is not supported."
+                 return error, False
+             error = None
+             status = True
+             if not self.is_package_installed(source_lang, target_lang):
+                 error, status = self.download_and_install_argos_package(source_lang, target_lang)
+             if status:
+                 installed_languages = argostranslate.translate.get_installed_languages()
+                 source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
+                 target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
+                 if not source_language or not target_language:
+                     error = f"Translation languages not installed: {source_lang} to {target_lang}"
+                     return error, False
+                 self.translation = source_language.get_translation(target_language)
+                 return None, True
+             return error, status
+         except Exception as e:
+             error = f'ArgosTranslator.start() error: {e}'
+             return error, False
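
Usage sketch (illustrative, not part of this commit; the language codes are examples):

    translator = ArgosTranslator()
    error, ok = translator.start("en", "fr")      # downloads the en->fr package if missing
    if ok:
        text, ok = translator.process("Hello world")
        print(text)                               # e.g. "Bonjour le monde"
    else:
        print(error)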
lib/classes/background_detector.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import numpy as np
+ import librosa
+
+ from pyannote.audio import Model
+ from pyannote.audio.pipelines import VoiceActivityDetection
+ from lib.conf import tts_dir
+ from lib.models import default_voice_detection_model
+
+ class BackgroundDetector:
+
+     def __init__(self, wav_file: str):
+         self.wav_file = wav_file
+         model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
+         self.pipeline = VoiceActivityDetection(segmentation=model)
+         hyper_params = {
+             # onset/offset activation thresholds
+             "onset": 0.5, "offset": 0.5,
+             # remove speech regions shorter than this many seconds
+             "min_duration_on": 0.0,
+             # fill non-speech regions shorter than this many seconds
+             "min_duration_off": 0.0
+         }
+         self.pipeline.instantiate(hyper_params)
+
+     def detect(self, vad_ratio_thresh: float = 0.05):
+         vad_annotation = self.pipeline(self.wav_file)
+         speech_segments = [(s.start, s.end) for s in vad_annotation.get_timeline()]
+         total_duration = librosa.get_duration(path=self.wav_file)
+         speech_time = sum(end - start for start, end in speech_segments)
+         non_speech_ratio = 1 - (speech_time / total_duration)
+         status = non_speech_ratio > vad_ratio_thresh
+         report = {
+             'non_speech_ratio': non_speech_ratio,
+             'background_detected': status
+         }
+         return status, report
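
Usage sketch (illustrative, not part of this commit; 'narration.wav' is a placeholder path):

    detector = BackgroundDetector("narration.wav")
    has_background, report = detector.detect(vad_ratio_thresh=0.05)
    print(report)   # e.g. {'non_speech_ratio': 0.12, 'background_detected': True}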
lib/classes/redirect_console.py ADDED
@@ -0,0 +1,51 @@
+ from queue import Queue, Empty
+ import time
+ import logging
+
+
+ class RedirectConsole:
+     def __init__(self, log_buffer: Queue, real_output):
+         self.log_buffer = log_buffer    # Queue buffer for the log
+         self.real_output = real_output  # Real terminal (sys.__stdout__ or sys.__stderr__)
+
+         # Setup for transformers logging
+         self.setup_transformers_logger()
+
+     def write(self, message: str):
+         # Write to the real terminal
+         self.real_output.write(message)
+         self.real_output.flush()
+
+         # Write to the log buffer
+         self.log_buffer.put(message)
+
+     def flush(self):
+         self.real_output.flush()
+
+     def isatty(self) -> bool:
+         return self.real_output.isatty()
+
+     def poll_logs(self, stop_event):
+         logs = ""
+         errors = ""
+         while not stop_event.is_set() or not self.log_buffer.empty():
+             try:
+                 # Read logs from the buffer without blocking
+                 log = self.log_buffer.get_nowait()
+                 if "An error occurred" in log:
+                     errors += log  # Capture error messages separately
+                 logs += log
+             except Empty:
+                 pass  # No logs in the buffer
+             yield logs, errors  # Yield updated logs and errors
+             time.sleep(0.1)  # Prevent tight looping
+
+     def setup_transformers_logger(self):
+         # Configure the `transformers` logger
+         transformers_logger = logging.getLogger("transformers")
+         transformers_logger.setLevel(logging.WARNING)  # Capture warnings and above
+
+         # Create a handler that writes to this instance
+         handler = logging.StreamHandler(self)
+         handler.setFormatter(logging.Formatter("%(message)s"))  # Simplified format
+         transformers_logger.addHandler(handler)
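
Wiring sketch (illustrative, not part of this commit): redirecting sys.stdout through the class above so a UI thread can poll the buffered output.

    import sys
    import threading
    from queue import Queue

    log_buffer = Queue()                       # shared buffer polled by the UI
    console = RedirectConsole(log_buffer, sys.__stdout__)
    sys.stdout = console                       # printed output is now mirrored

    stop_event = threading.Event()
    print("conversion started...")             # reaches the terminal and log_buffer
    logs, errors = next(console.poll_logs(stop_event))  # one poll iteration
    stop_event.set()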
lib/classes/tts_engines/.template.py ADDED
@@ -0,0 +1,232 @@
+ import hashlib
+ import math
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ import threading
+ import uuid
+
+ import numpy as np
+ import regex as re
+ import soundfile as sf
+ import torch
+ import torchaudio
+
+ from huggingface_hub import hf_hub_download
+ from pathlib import Path
+ from pprint import pprint
+
+ from lib import *
+ from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
+ from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
+
+ #import logging
+ #logging.basicConfig(level=logging.DEBUG)
+
+ lock = threading.Lock()
+
+ class Coqui:
+
+     def __init__(self, session):
+         try:
+             self.session = session
+             self.cache_dir = tts_dir
+             self.speakers_path = None
+             self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
+             self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
+             self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
+             self.npz_path = None
+             self.npz_data = None
+             self.sentences_total_time = 0.0
+             self.sentence_idx = 1
+             self.params = {TTS_ENGINES['NEW_TTS']: {}}
+             self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
+             self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
+             self.resampler_cache = {}
+             self.audio_segments = []
+             self._build()
+         except Exception as e:
+             error = f'__init__() error: {e}'
+             print(error)
+             return None
+
+     def _build(self):
+         try:
+             tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+             if not tts:
+                 if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
+                     if self.session['custom_model'] is not None:
+                         msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                         print(msg)
+                         return False
+                     else:
+                         model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
+                         tts = self._load_api(self.tts_key, model_path, self.session['device'])
+             return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+         except Exception as e:
+             error = f'build() error: {e}'
+             print(error)
+             return False
+
+     def _load_api(self, key, model_path, device):
+         global lock
+         try:
+             if key in loaded_tts.keys():
+                 return loaded_tts[key]['engine']
+             unload_tts(device, [self.tts_key, self.tts_vc_key])
+             with lock:
+                 tts = NEW_TTS(model_path)  # placeholder: instantiate your engine here
+                 if tts:
+                     if device == 'cuda':
+                         NEW_TTS.WITH_CUDA  # placeholder: move the engine to CUDA
+                     else:
+                         NEW_TTS.WITHOUT_CUDA  # placeholder: keep the engine on the target device
+                     loaded_tts[key] = {"engine": tts, "config": None}
+                     msg = f'{model_path} Loaded!'
+                     print(msg)
+                     return tts
+                 else:
+                     error = 'TTS engine could not be created!'
+                     print(error)
+         except Exception as e:
+             error = f'_load_api() error: {e}'
+             print(error)
+         return False
+
+     def _load_checkpoint(self, **kwargs):
+         global lock
+         try:
+             key = kwargs.get('key')
+             if key in loaded_tts.keys():
+                 return loaded_tts[key]['engine']
+             tts_engine = kwargs.get('tts_engine')
+             device = kwargs.get('device')
+             unload_tts(device, [self.tts_key])
+             with lock:
+                 checkpoint_dir = kwargs.get('checkpoint_dir')
+                 config = kwargs.get('config')  # placeholder: engine-specific config object
+                 tts = NEW_TTS.LOAD_CHECKPOINT(  # placeholder: load your engine checkpoint here
+                     config,
+                     checkpoint_dir=checkpoint_dir,
+                     eval=True
+                 )
+             if tts:
+                 if device == 'cuda':
+                     NEW_TTS.WITH_CUDA  # placeholder
+                 else:
+                     NEW_TTS.WITHOUT_CUDA  # placeholder
+                 loaded_tts[key] = {"engine": tts, "config": config}
+                 msg = f'{tts_engine} Loaded!'
+                 print(msg)
+                 return tts
+             else:
+                 error = 'TTS engine could not be created!'
+                 print(error)
+         except Exception as e:
+             error = f'_load_checkpoint() error: {e}'
+             print(error)
+         return False
+
+     def _tensor_type(self, audio_data):
+         if isinstance(audio_data, torch.Tensor):
+             return audio_data
+         elif isinstance(audio_data, np.ndarray):
+             return torch.from_numpy(audio_data).float()
+         elif isinstance(audio_data, list):
+             return torch.tensor(audio_data, dtype=torch.float32)
+         else:
+             raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
+
+     def _get_resampler(self, orig_sr, target_sr):
+         key = (orig_sr, target_sr)
+         if key not in self.resampler_cache:
+             self.resampler_cache[key] = torchaudio.transforms.Resample(
+                 orig_freq=orig_sr, new_freq=target_sr
+             )
+         return self.resampler_cache[key]
+
+     def _resample_wav(self, wav_path, expected_sr):
+         waveform, orig_sr = torchaudio.load(wav_path)
+         if orig_sr == expected_sr and waveform.size(0) == 1:
+             return wav_path
+         if waveform.size(0) > 1:
+             waveform = waveform.mean(dim=0, keepdim=True)
+         if orig_sr != expected_sr:
+             resampler = self._get_resampler(orig_sr, expected_sr)
+             waveform = resampler(waveform)
+         wav_tensor = waveform.squeeze(0)
+         wav_numpy = wav_tensor.cpu().numpy()
+         tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+         tmp_path = tmp_fh.name
+         tmp_fh.close()
+         sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
+         return tmp_path
+
+     def convert(self, sentence_number, sentence):
+         global xtts_builtin_speakers_list
+         try:
+             speaker = None
+             audio_data = False
+             trim_audio_buffer = 0.004
+             settings = self.params[self.session['tts_engine']]
+             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
+             sentence = sentence.strip()
+             settings['voice_path'] = (
+                 self.session['voice'] if self.session['voice'] is not None
+                 else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
+                 else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
+             )
+             if settings['voice_path'] is not None:
+                 speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
+             tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+             if tts:
+                 if sentence == TTS_SML['break']:
+                     break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))  # 0.3 to 0.6 seconds
+                     self.audio_segments.append(break_tensor.clone())
+                     return True
+                 elif sentence == TTS_SML['pause']:
+                     pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100)))  # 1.0 to 1.8 seconds
+                     self.audio_segments.append(pause_tensor.clone())
+                     return True
+                 else:
+                     if sentence[-1].isalnum():
+                         sentence = f'{sentence} —'
+                     if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
+                         audio_sentence = NEW_TTS.CONVERT()  # placeholder: must return torch.Tensor, list, tuple or np.ndarray
+                     if is_audio_data_valid(audio_sentence):
+                         sourceTensor = self._tensor_type(audio_sentence)
+                         audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                         if sentence[-1].isalnum() or sentence[-1] == '—':
+                             audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
+                         self.audio_segments.append(audio_tensor)
+                         if not re.search(r'\w$', sentence, flags=re.UNICODE):
+                             break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
+                             self.audio_segments.append(break_tensor.clone())
+                         if self.audio_segments:
+                             audio_tensor = torch.cat(self.audio_segments, dim=-1)
+                             start_time = self.sentences_total_time
+                             duration = audio_tensor.shape[-1] / settings['samplerate']
+                             end_time = start_time + duration
+                             self.sentences_total_time = end_time
+                             sentence_obj = {
+                                 "start": start_time,
+                                 "end": end_time,
+                                 "text": sentence,
+                                 "resume_check": self.sentence_idx
+                             }
+                             self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
+                             if self.sentence_idx:
+                                 torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
+                                 del audio_tensor
+                                 self.audio_segments = []
+                         if os.path.exists(final_sentence_file):
+                             return True
+                         else:
+                             error = f"Cannot create {final_sentence_file}"
+                             print(error)
+             else:
+                 error = f"convert() error: {self.session['tts_engine']} is None"
+                 print(error)
+         except Exception as e:
+             error = f'Coqui.convert(): {e}'
+             raise ValueError(e)
+         return False
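
Driver sketch (illustrative, not part of this commit): how an engine file created from this template is exercised, assuming a `session` dict prepared by the app with the keys read above.

    # Hypothetical driver; `session` keys mirror those the template reads.
    engine = Coqui(session)
    sentences = ["Hello world.", TTS_SML['pause'], "Chapter one."]
    for number, text in enumerate(sentences, start=1):
        if not engine.convert(number, text):   # writes <number>.<default_audio_proc_format>
            break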
lib/classes/tts_engines/common/audio_filters.py ADDED
@@ -0,0 +1,107 @@
+ import numpy as np
+ import torch
+ import subprocess
+ import shutil
+
+ from scipy.io import wavfile as wav
+ from scipy.signal import find_peaks
+
+ def detect_gender(voice_path):
+     try:
+         samplerate, signal = wav.read(voice_path)
+         # Convert stereo to mono if needed
+         if len(signal.shape) > 1:
+             signal = np.mean(signal, axis=1)
+         # Compute FFT
+         fft_spectrum = np.abs(np.fft.fft(signal))
+         freqs = np.fft.fftfreq(len(fft_spectrum), d=1/samplerate)
+         # Consider only positive frequencies
+         positive_freqs = freqs[:len(freqs)//2]
+         positive_magnitude = fft_spectrum[:len(fft_spectrum)//2]
+         # Find peaks in the frequency spectrum
+         peaks, _ = find_peaks(positive_magnitude, height=np.max(positive_magnitude) * 0.2)
+         if len(peaks) == 0:
+             return None
+         # Find the first strong peak within the human voice range (75Hz - 300Hz)
+         for peak in peaks:
+             if 75 <= positive_freqs[peak] <= 300:
+                 pitch = positive_freqs[peak]
+                 gender = "female" if pitch > 135 else "male"
+                 return gender
+         return None
+     except Exception as e:
+         error = f"detect_gender() error: {voice_path}: {e}"
+         print(error)
+         return None
+
+ def trim_audio(audio_data, samplerate, silence_threshold=0.003, buffer_sec=0.005):
+     # Ensure audio_data is a PyTorch tensor
+     if isinstance(audio_data, list):
+         audio_data = torch.tensor(audio_data, dtype=torch.float32)  # Always float32 for audio
+     if isinstance(audio_data, torch.Tensor):
+         if audio_data.ndim != 1:
+             error = "audio_data must be a 1D tensor (mono audio)."
+             raise ValueError(error)
+         if audio_data.is_cuda:
+             audio_data = audio_data.cpu()
+         # Detect non-silent indices
+         non_silent_indices = torch.where(audio_data.abs() > silence_threshold)[0]
+         if len(non_silent_indices) == 0:
+             return torch.tensor([], dtype=audio_data.dtype)  # Preserves dtype
+         # Calculate start and end trimming indices with buffer
+         start_index = max(non_silent_indices[0].item() - int(buffer_sec * samplerate), 0)
+         end_index = min(non_silent_indices[-1].item() + int(buffer_sec * samplerate), audio_data.size(0))  # Clamp end to signal length
+         trimmed_audio = audio_data[start_index:end_index]
+         return trimmed_audio
+     error = "audio_data must be a PyTorch tensor or a list of numerical values."
+     raise TypeError(error)
+
+ def normalize_audio(input_file, output_file, samplerate):
+     filter_complex = (
+         'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
+         'afftdn=nf=-70,'
+         'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
+         'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
+         'equalizer=f=150:t=q:w=2:g=1,'
+         'equalizer=f=250:t=q:w=2:g=-3,'
+         'equalizer=f=3000:t=q:w=2:g=2,'
+         'equalizer=f=5500:t=q:w=2:g=-4,'
+         'equalizer=f=9000:t=q:w=2:g=-2,'
+         'highpass=f=63[audio]'
+     )
+     ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
+     ffmpeg_cmd += [
+         '-filter_complex', filter_complex,
+         '-map', '[audio]',
+         '-ar', str(samplerate),
+         '-y', output_file
+     ]
+     try:
+         subprocess.run(
+             ffmpeg_cmd,
+             env={},
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             encoding='utf-8',
+             errors='ignore',
+             check=True  # raise CalledProcessError on a non-zero exit so the except below can report it
+         )
+         return True
+     except subprocess.CalledProcessError as e:
+         error = f"normalize_audio() error: {input_file}: {e}"
+         print(error)
+         return False
+
+ def is_audio_data_valid(audio_data):
+     if audio_data is None:
+         return False
+     if isinstance(audio_data, torch.Tensor):
+         return audio_data.numel() > 0
+     if isinstance(audio_data, (list, tuple)):
+         return len(audio_data) > 0
+     if isinstance(audio_data, np.ndarray):
+         return audio_data.size > 0
+     return False
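
Usage sketch (illustrative, not part of this commit; 'voice.wav' is a placeholder path):

    import torchaudio

    waveform, sr = torchaudio.load("voice.wav")
    # trim_audio expects a 1D mono tensor
    trimmed = trim_audio(waveform.mean(dim=0), sr, silence_threshold=0.003, buffer_sec=0.005)
    print(detect_gender("voice.wav"))                   # 'male', 'female' or None
    normalize_audio("voice.wav", "voice_norm.wav", sr)  # requires ffmpeg on PATH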
lib/classes/tts_engines/common/utils.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import torch
+ import regex as re
+ import stanza
+
+ from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
+
+ def unload_tts(device, reserved_keys=None, tts_key=None):
+     try:
+         if len(loaded_tts) >= max_tts_in_memory:
+             if reserved_keys is None:
+                 reserved_keys = []
+             if tts_key is not None:
+                 if tts_key in loaded_tts.keys():
+                     del loaded_tts[tts_key]
+                     if device == 'cuda':
+                         torch.cuda.empty_cache()
+                         torch.cuda.ipc_collect()
+             else:
+                 for key in list(loaded_tts.keys()):
+                     if key not in reserved_keys:
+                         del loaded_tts[key]
+     except Exception as e:
+         error = f'unload_tts() error: {e}'
+         print(error)
+         return False
+
+ def append_sentence2vtt(sentence_obj, path):
+
+     def format_timestamp(seconds):
+         m, s = divmod(seconds, 60)
+         h, m = divmod(m, 60)
+         return f"{int(h):02}:{int(m):02}:{s:06.3f}"
+
+     try:
+         index = 1
+         if os.path.exists(path):
+             with open(path, "r", encoding="utf-8") as f:
+                 lines = f.readlines()
+             for line in lines:
+                 if "-->" in line:
+                     index += 1
+         if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
+             return index  # Already written
+         if not os.path.exists(path):
+             with open(path, "w", encoding="utf-8") as f:
+                 f.write("WEBVTT\n\n")
+         with open(path, "a", encoding="utf-8") as f:
+             start = format_timestamp(sentence_obj["start"])
+             end = format_timestamp(sentence_obj["end"])
+             text = re.sub(r'[\r\n]+', ' ', sentence_obj["text"]).strip()
+             f.write(f"{start} --> {end}\n{text}\n\n")
+         return index + 1
+     except Exception as e:
+         error = f'append_sentence2vtt() error: {e}'
+         print(error)
+         return False
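
Usage sketch (illustrative, not part of this commit; 'out.vtt' is a placeholder path):

    cue = {"start": 0.0, "end": 2.5, "text": "Hello world.", "resume_check": 1}
    next_idx = append_sentence2vtt(cue, "out.vtt")
    # out.vtt now reads:
    # WEBVTT
    #
    # 00:00:00.000 --> 00:00:02.500
    # Hello world.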
lib/classes/tts_engines/coqui.py ADDED
@@ -0,0 +1,810 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
2
+ import numpy as np, regex as re, soundfile as sf, torch, torchaudio
3
+
4
+ from huggingface_hub import hf_hub_download
5
+ from pathlib import Path
6
+ from pprint import pprint
7
+
8
+ from lib import *
9
+ from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
10
+ from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
11
+
12
+ #import logging
13
+ #logging.basicConfig(level=logging.DEBUG)
14
+
15
+ lock = threading.Lock()
16
+ xtts_builtin_speakers_list = None
17
+
18
+ class Coqui:
19
+
20
+ def __init__(self, session):
21
+ try:
22
+ self.session = session
23
+ self.cache_dir = tts_dir
24
+ self.speakers_path = None
25
+ self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
26
+ self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
27
+ self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
28
+ self.npz_path = None
29
+ self.npz_data = None
30
+ self.sentences_total_time = 0.0
31
+ self.sentence_idx = 1
32
+ self.params = {TTS_ENGINES['XTTSv2']: {"latent_embedding":{}}, TTS_ENGINES['BARK']: {},TTS_ENGINES['VITS']: {"semitones": {}}, TTS_ENGINES['FAIRSEQ']: {"semitones": {}}, TTS_ENGINES['TACOTRON2']: {"semitones": {}}, TTS_ENGINES['YOURTTS']: {}}
33
+ self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
34
+ self.vtt_path = os.path.join(self.session['process_dir'], Path(self.session['final_name']).stem + '.vtt')
35
+ self.resampler_cache = {}
36
+ self.audio_segments = []
37
+ self._build()
38
+ except Exception as e:
39
+ error = f'__init__() error: {e}'
40
+ print(error)
41
+ return None
42
+
43
+ def _build(self):
44
+ try:
45
+ global xtts_builtin_speakers_list
46
+ load_zeroshot = True if self.session['tts_engine'] in [TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']] else False
47
+ tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
48
+ if not tts:
49
+ if xtts_builtin_speakers_list is None:
50
+ self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
51
+ xtts_builtin_speakers_list = torch.load(self.speakers_path)
52
+ if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
53
+ msg = f"Loading TTS {self.session['tts_engine']} model, it takes a while, please be patient..."
54
+ print(msg)
55
+ if self.session['custom_model'] is not None:
56
+ config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
57
+ checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
58
+ vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
59
+ self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
60
+ tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
61
+ else:
62
+ hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
63
+ if self.session['fine_tuned'] == 'internal':
64
+ hf_sub = ''
65
+ if self.speakers_path is None:
66
+ self.speakers_path = hf_hub_download(repo_id=hf_repo, filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
67
+ else:
68
+ hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
69
+ config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
70
+ checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
71
+ vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
72
+ tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
73
+ elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
74
+ if self.session['custom_model'] is not None:
75
+ msg = f"{self.session['tts_engine']} custom model not implemented yet!"
76
+ print(msg)
77
+ return False
78
+ else:
79
+ hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
80
+ hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
81
+ text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
82
+ coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
83
+ fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
84
+ checkpoint_dir = os.path.dirname(text_model_path)
85
+ tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
86
+ elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
87
+ if self.session['custom_model'] is not None:
88
+ msg = f"{self.session['tts_engine']} custom model not implemented yet!"
89
+ print(msg)
90
+ return False
91
+ else:
92
+ iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
93
+ sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
94
+ sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
95
+ if sub is not None:
96
+ self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
97
+ model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
98
+ msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
99
+ print(msg)
100
+ self.tts_key = model_path
101
+ tts = self._load_api(self.tts_key, model_path, self.session['device'])
102
+ else:
103
+ msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
104
+ print(msg)
105
+ return False
106
+ elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
107
+ if self.session['custom_model'] is not None:
108
+ msg = f"{self.session['tts_engine']} custom model not implemented yet!"
109
+ print(msg)
110
+ return False
111
+ else:
112
+ model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
113
+ self.tts_key = model_path
114
+ tts = self._load_api(self.tts_key, model_path, self.session['device'])
115
+ elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
116
+ if self.session['custom_model'] is not None:
117
+ msg = f"{self.session['tts_engine']} custom model not implemented yet!"
118
+ print(msg)
119
+ return False
120
+ else:
121
+ iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
122
+ sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
123
+ sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
124
+ self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['TACOTRON2']][self.session['fine_tuned']]['samplerate'][sub]
125
+ if sub is None:
126
+ iso_dir = self.session['language']
127
+ sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
128
+ if sub is not None:
129
+ model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
130
+ msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
131
+ print(msg)
132
+ self.tts_key = model_path
133
+ tts = self._load_api(self.tts_key, model_path, self.session['device'])
134
+ else:
135
+ msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
136
+ print(msg)
137
+ return False
138
+ elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
139
+ if self.session['custom_model'] is not None:
140
+ msg = f"{self.session['tts_engine']} custom model not implemented yet!"
141
+ print(msg)
142
+ return False
143
+ else:
144
+ model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
145
+ tts = self._load_api(self.tts_key, model_path, self.session['device'])
146
+ if load_zeroshot:
147
+ tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
148
+ if not tts_vc:
149
+ if self.session['voice'] is not None:
150
+ msg = f"Loading TTS {self.tts_vc_key} zeroshot model, it takes a while, please be patient..."
151
+ print(msg)
152
+ tts_vc = self._load_api(self.tts_vc_key, default_vc_model, self.session['device'])
153
+ return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
154
+ except Exception as e:
155
+ error = f'build() error: {e}'
156
+ print(error)
157
+ return False
158
+
159
+ def _load_api(self, key, model_path, device):
160
+ global lock
161
+ try:
162
+ if key in loaded_tts.keys():
163
+ return loaded_tts[key]['engine']
164
+ unload_tts(device, [self.tts_key, self.tts_vc_key])
165
+ from TTS.api import TTS as coquiAPI
166
+ with lock:
167
+ tts = coquiAPI(model_path)
168
+ if tts:
169
+ if device == 'cuda':
170
+ tts.cuda()
171
+ else:
172
+ tts.to(device)
173
+ loaded_tts[key] = {"engine": tts, "config": None}
174
+ msg = f'{model_path} Loaded!'
175
+ print(msg)
176
+ return tts
177
+ else:
178
+ error = 'TTS engine could not be created!'
179
+ print(error)
180
+ except Exception as e:
181
+ error = f'_load_api() error: {e}'
182
+ print(error)
183
+ return False
184
+
185
+ def _load_checkpoint(self, **kwargs):
186
+ global lock
187
+ try:
188
+ key = kwargs.get('key')
189
+ if key in loaded_tts.keys():
190
+ return loaded_tts[key]['engine']
191
+ tts_engine = kwargs.get('tts_engine')
192
+ device = kwargs.get('device')
193
+ unload_tts(device, [self.tts_key, self.tts_vc_key])
194
+ with lock:
195
+ if tts_engine == TTS_ENGINES['XTTSv2']:
196
+ from TTS.tts.configs.xtts_config import XttsConfig
197
+ from TTS.tts.models.xtts import Xtts
198
+ checkpoint_path = kwargs.get('checkpoint_path')
199
+ config_path = kwargs.get('config_path', None)
200
+ vocab_path = kwargs.get('vocab_path', None)
201
+ config = XttsConfig()
202
+ config.models_dir = os.path.join("models", "tts")
203
+ config.load_json(config_path)
204
+ tts = Xtts.init_from_config(config)
205
+ tts.load_checkpoint(
206
+ config,
207
+ checkpoint_path=checkpoint_path,
208
+ vocab_path=vocab_path,
209
+ use_deepspeed=default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'],
210
+ eval=True
211
+ )
212
+ elif tts_engine == TTS_ENGINES['BARK']:
213
+ from TTS.tts.configs.bark_config import BarkConfig
214
+ from TTS.tts.models.bark import Bark
215
+ checkpoint_dir = kwargs.get('checkpoint_dir')
216
+ config = BarkConfig()
217
+ config.CACHE_DIR = self.cache_dir
218
+ config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
219
+ tts = Bark.init_from_config(config)
220
+ tts.load_checkpoint(
221
+ config,
222
+ checkpoint_dir=checkpoint_dir,
223
+ eval=True
224
+ )
225
+ if tts:
226
+ if device == 'cuda':
227
+ tts.cuda()
228
+ else:
229
+ tts.to(device)
230
+ loaded_tts[key] = {"engine": tts, "config": config}
231
+ msg = f'{tts_engine} Loaded!'
232
+ print(msg)
233
+ return tts
234
+ else:
235
+ error = 'TTS engine could not be created!'
236
+ print(error)
237
+ except Exception as e:
238
+ error = f'_load_checkpoint() error: {e}'
239
+ return False
240
+
241
+ def _check_xtts_builtin_speakers(self, voice_path, speaker, device):
242
+ try:
243
+ voice_parts = Path(voice_path).parts
244
+ if self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng':
245
+ if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
246
+ default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
247
+ if os.path.exists(default_text_file):
248
+ msg = f"Converting builtin eng voice to {self.session['language']}..."
249
+ print(msg)
250
+ tts_internal_key = f"{TTS_ENGINES['XTTSv2']}-internal"
251
+ default_text = Path(default_text_file).read_text(encoding="utf-8")
252
+ hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
253
+ hf_sub = ''
254
+ tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
255
+ if not tts:
256
+ for key in list(loaded_tts.keys()): unload_tts(device, None, key)
257
+ config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
258
+ checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
259
+ vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
260
+ tts = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=tts_internal_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
261
+ if tts:
262
+ if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
263
+ gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
264
+ else:
265
+ gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=[voice_path])
266
+ fine_tuned_params = {
267
+ key: cast_type(self.session[key])
268
+ for key, cast_type in {
269
+ "temperature": float,
270
+ "length_penalty": float,
271
+ "num_beams": int,
272
+ "repetition_penalty": float,
273
+ "top_k": int,
274
+ "top_p": float,
275
+ "speed": float,
276
+ "enable_text_splitting": bool
277
+ }.items()
278
+ if self.session.get(key) is not None
279
+ }
280
+ with torch.no_grad():
281
+ result = tts.inference(
282
+ text=default_text,
283
+ language=self.session['language_iso1'],
284
+ gpt_cond_latent=gpt_cond_latent,
285
+ speaker_embedding=speaker_embedding,
286
+ **fine_tuned_params
287
+ )
288
+ audio_data = result.get('wav')
289
+ if audio_data is not None:
290
+ audio_data = audio_data.tolist()
291
+ sourceTensor = self._tensor_type(audio_data)
292
+ audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
293
+ lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
294
+ new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
295
+ proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
296
+ torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
297
+ if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate):
298
+ del audio_data, sourceTensor, audio_tensor
299
+ if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
300
+ del tts
301
+ unload_tts(device, None, tts_internal_key)
302
+ return new_voice_path
303
+ else:
304
+ error = 'normalize_audio() error:'
305
+ else:
306
+ error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
307
+ else:
308
+ error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
309
+ else:
310
+ error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
311
+ print(error)
312
+ else:
313
+ return voice_path
314
+ else:
315
+ return voice_path
316
+ except Exception as e:
317
+ error = f'_check_xtts_builtin_speakers() error: {e}'
318
+ print(error)
319
+ return False
320
+
321
+ def _check_bark_npz(self, voice_path, bark_dir, speaker, device):
322
+ try:
323
+ if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
324
+ npz_dir = os.path.join(bark_dir, speaker)
325
+ npz_file = os.path.join(npz_dir, f'{speaker}.npz')
326
+ if os.path.exists(npz_file):
327
+ return True
328
+ else:
329
+ os.makedirs(npz_dir, exist_ok=True)
330
+ tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
331
+ hf_repo = models[TTS_ENGINES['BARK']]['internal']['repo']
332
+ hf_sub = models[TTS_ENGINES['BARK']]['internal']['sub']
333
+ tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
334
+ if not tts:
335
+ for key in list(loaded_tts.keys()): unload_tts(device, None, key)
336
+ text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][0]}", cache_dir=self.cache_dir)
337
+ coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][1]}", cache_dir=self.cache_dir)
338
+ fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][2]}", cache_dir=self.cache_dir)
339
+ checkpoint_dir = os.path.dirname(text_model_path)
340
+ tts = self._load_checkpoint(tts_engine=TTS_ENGINES['BARK'], key=tts_internal_key, checkpoint_dir=checkpoint_dir, device=device)
341
+ if tts:
342
+ voice_temp = os.path.splitext(npz_file)[0]+'.wav'
343
+ shutil.copy(voice_path, voice_temp)
344
+ default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
345
+ default_text = Path(default_text_file).read_text(encoding="utf-8")
346
+ fine_tuned_params = {
347
+ key: cast_type(self.session[key])
348
+ for key, cast_type in {
349
+ "text_temp": float,
350
+ "waveform_temp": float
351
+ }.items()
352
+ if self.session.get(key) is not None
353
+ }
354
+ with torch.no_grad():
355
+ torch.manual_seed(67878789)
356
+ audio_data = tts.synthesize(
357
+ default_text,
358
+ loaded_tts[tts_internal_key]['config'],
359
+ speaker_id=speaker,
360
+ voice_dirs=bark_dir,
361
+ silent=True,
362
+ **fine_tuned_params
363
+ )
364
+ os.remove(voice_temp)
365
+ del audio_data
366
+ if self.session['tts_engine'] != TTS_ENGINES['BARK']:
367
+ del tts
368
+ unload_tts(device, None, tts_internal_key)
369
+ msg = f"Saved NPZ file: {npz_file}"
370
+ print(msg)
371
+ return True
372
+ else:
373
+ error = f'_check_bark_npz() error: {tts_internal_key} is False'
374
+ print(error)
375
+ else:
376
+ return True
377
+ except Exception as e:
378
+ error = f'_check_bark_npz() error: {e}'
379
+ print(error)
380
+ return False
381
+
382
+ def _tensor_type(self, audio_data):
383
+ if isinstance(audio_data, torch.Tensor):
384
+ return audio_data
385
+ elif isinstance(audio_data, np.ndarray):
386
+ return torch.from_numpy(audio_data).float()
387
+ elif isinstance(audio_data, list):
388
+ return torch.tensor(audio_data, dtype=torch.float32)
389
+ else:
390
+ raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
391
+
392
+ def _get_resampler(self, orig_sr, target_sr):
393
+ key = (orig_sr, target_sr)
394
+ if key not in self.resampler_cache:
395
+ self.resampler_cache[key] = torchaudio.transforms.Resample(
396
+ orig_freq=orig_sr, new_freq=target_sr
397
+ )
398
+ return self.resampler_cache[key]
399
+
400
+ def _resample_wav(self, wav_path, expected_sr):
401
+ waveform, orig_sr = torchaudio.load(wav_path)
402
+ if orig_sr == expected_sr and waveform.size(0) == 1:
403
+ return wav_path
404
+ if waveform.size(0) > 1:
405
+ waveform = waveform.mean(dim=0, keepdim=True)
406
+ if orig_sr != expected_sr:
407
+ resampler = self._get_resampler(orig_sr, expected_sr)
408
+ waveform = resampler(waveform)
409
+ wav_tensor = waveform.squeeze(0)
410
+ wav_numpy = wav_tensor.cpu().numpy()
411
+ tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
412
+ tmp_path = tmp_fh.name
413
+ tmp_fh.close()
414
+ sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
415
+ return tmp_path
416
+
417
+ def convert(self, s_n, s):
418
+ global xtts_builtin_speakers_list
419
+ try:
420
+ sentence_number = s_n
421
+ sentence = s
422
+ speaker = None
423
+ audio_data = False
424
+ trim_audio_buffer = 0.004
425
+ settings = self.params[self.session['tts_engine']]
426
+ final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
427
+ settings['voice_path'] = (
428
+ self.session['voice'] if self.session['voice'] is not None
429
+ else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
430
+ else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
431
+ )
432
+ if settings['voice_path'] is not None:
433
+ speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
434
+ if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and os.path.basename(settings['voice_path']) != 'ref.wav':
435
+ self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker, self.session['device'])
436
+ if not settings['voice_path']:
437
+ msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
438
+ print(msg)
439
+ return False
440
+ tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
441
+ if tts:
442
+ if sentence == TTS_SML['break']:
443
+ silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
444
+ break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
445
+ self.audio_segments.append(break_tensor.clone())
446
+ return True
447
+ elif sentence == TTS_SML['pause']:
448
+ silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
449
+ pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 1.0 to 1.8 seconds
450
+ self.audio_segments.append(pause_tensor.clone())
451
+ return True
452
+ else:
453
+ if sentence[-1].isalnum():
454
+ sentence = f'{sentence} —'
455
+ if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
456
+ trim_audio_buffer = 0.008
457
+ if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
458
+ settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
459
+ else:
460
+ msg = 'Computing speaker latents...'
461
+ print(msg)
462
+ if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
463
+ settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
464
+ else:
465
+ settings['gpt_cond_latent'], settings['speaker_embedding'] = tts.get_conditioning_latents(audio_path=[settings['voice_path']])
466
+ settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
467
+ fine_tuned_params = {
468
+ key: cast_type(self.session[key])
469
+ for key, cast_type in {
470
+ "temperature": float,
471
+ "length_penalty": float,
472
+ "num_beams": int,
473
+ "repetition_penalty": float,
474
+ "top_k": int,
475
+ "top_p": float,
476
+ "speed": float,
477
+ "enable_text_splitting": bool
478
+ }.items()
479
+ if self.session.get(key) is not None
480
+ }
481
+ with torch.no_grad():
482
+ result = tts.inference(
483
+ text=sentence.replace('.', ' —'),
484
+ language=self.session['language_iso1'],
485
+ gpt_cond_latent=settings['gpt_cond_latent'],
486
+ speaker_embedding=settings['speaker_embedding'],
487
+ **fine_tuned_params
488
+ )
489
+ audio_sentence = result.get('wav')
490
+ if is_audio_data_valid(audio_sentence):
491
+ audio_sentence = audio_sentence.tolist()
492
+ elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
493
+ trim_audio_buffer = 0.002
494
+ '''
495
+ [laughter]
496
+ [laughs]
497
+ [sighs]
498
+ [music]
499
+ [gasps]
500
+ [clears throat]
501
+ — or ... for hesitations
502
+ ♪ for song lyrics
503
+ CAPITALIZATION for emphasis of a word
504
+ [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively
505
+ '''
506
+ if speaker in default_engine_settings[self.session['tts_engine']]['voices'].keys():
507
+ bark_dir = default_engine_settings[self.session['tts_engine']]['speakers_path']
508
+ else:
509
+ bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
510
+ if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker, self.session['device']):
511
+ error = 'Could not create npz file!'
512
+ print(error)
513
+ return False
514
+ npz_file = os.path.join(bark_dir, speaker, f'{speaker}.npz')
515
+ fine_tuned_params = {
516
+ key: cast_type(self.session[key])
517
+ for key, cast_type in {
518
+ "text_temp": float,
519
+ "waveform_temp": float
520
+ }.items()
521
+ if self.session.get(key) is not None
522
+ }
523
+ if self.npz_path is None or self.npz_path != npz_file:
524
+ self.npz_path = npz_file
525
+ self.npz_data = np.load(self.npz_path, allow_pickle=True)
526
+ history_prompt = [
527
+ self.npz_data["semantic_prompt"],
528
+ self.npz_data["coarse_prompt"],
529
+ self.npz_data["fine_prompt"]
530
+ ]
531
+ with torch.no_grad():
532
+ torch.manual_seed(67878789)
533
+ audio_sentence, _ = tts.generate_audio(
534
+ sentence,
535
+ history_prompt=history_prompt,
536
+ silent=True,
537
+ **fine_tuned_params
538
+ )
539
+ if is_audio_data_valid(audio_sentence):
540
+ audio_sentence = audio_sentence.tolist()
541
+ elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
542
+ speaker_argument = {}
543
+ if self.session['language'] == 'eng' and 'vctk/vits' in models[self.session['tts_engine']]['internal']['sub']:
544
+ if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits']:
545
+ speaker_argument = {"speaker": 'p262'}
546
+ elif self.session['language'] == 'cat' and 'custom/vits' in models[self.session['tts_engine']]['internal']['sub']:
547
+ if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits']:
548
+ speaker_argument = {"speaker": '09901'}
549
+ if settings['voice_path'] is not None:
550
+ proc_dir = os.path.join(self.session['voice_dir'], 'proc')
551
+ os.makedirs(proc_dir, exist_ok=True)
552
+ tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
553
+ tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
554
+ tts.tts_to_file(
555
+ text=sentence,
556
+ file_path=tmp_in_wav,
557
+ **speaker_argument
558
+ )
559
+ if settings['voice_path'] in settings['semitones'].keys():
560
+ semitones = settings['semitones'][settings['voice_path']]
561
+ else:
562
+ voice_path_gender = detect_gender(settings['voice_path'])
563
+ voice_builtin_gender = detect_gender(tmp_in_wav)
564
+ msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
565
+ print(msg)
566
+ if voice_builtin_gender != voice_path_gender:
567
+ semitones = -4 if voice_path_gender == 'male' else 4
568
+ msg = f"Adapting builtin voice frequencies from the clone voice..."
569
+ print(msg)
570
+ else:
571
+ semitones = 0
572
+ settings['semitones'][settings['voice_path']] = semitones
573
+ if semitones != 0:
574
+ try:
575
+ cmd = [
576
+ shutil.which('sox'), tmp_in_wav,
577
+ "-r", str(settings['samplerate']), tmp_out_wav,
578
+ "pitch", str(semitones * 100)
579
+ ]
580
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
581
+ except subprocess.CalledProcessError as e:
582
+ error = f"Subprocess error: {e.stderr}"
583
+ print(error)
584
+ DependencyError(e)
585
+ return False
586
+ except FileNotFoundError as e:
587
+ error = f"File not found: {e}"
588
+ print(error)
589
+ DependencyError(e)
590
+ return False
591
+ else:
592
+ tmp_out_wav = tmp_in_wav
593
+ tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
594
+ if tts_vc:
595
+ settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
596
+ source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
597
+ target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
598
+ audio_sentence = tts_vc.voice_conversion(
599
+ source_wav=source_wav,
600
+ target_wav=target_wav
601
+ )
602
+ else:
603
+ error = f'Engine {self.tts_vc_key} is None'
604
+ print(error)
605
+ return False
606
+ if os.path.exists(tmp_in_wav):
607
+ os.remove(tmp_in_wav)
608
+ if os.path.exists(tmp_out_wav):
609
+ os.remove(tmp_out_wav)
610
+ if os.path.exists(source_wav):
611
+ os.remove(source_wav)
612
+ else:
613
+ audio_sentence = tts.tts(
614
+ text=sentence,
615
+ **speaker_argument
616
+ )
617
+ elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
618
+ speaker_argument = {}
619
+ not_supported_punc_pattern = re.compile(r"[.:—]")
620
+ if settings['voice_path'] is not None:
621
+ proc_dir = os.path.join(self.session['voice_dir'], 'proc')
622
+ os.makedirs(proc_dir, exist_ok=True)
623
+ tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
624
+ tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
625
+ tts.tts_to_file(
626
+ text=re.sub(not_supported_punc_pattern, ' ', sentence),
627
+ file_path=tmp_in_wav,
628
+ **speaker_argument
629
+ )
630
+ if settings['voice_path'] in settings['semitones'].keys():
631
+ semitones = settings['semitones'][settings['voice_path']]
632
+ else:
633
+ voice_path_gender = detect_gender(settings['voice_path'])
634
+ voice_builtin_gender = detect_gender(tmp_in_wav)
635
+ msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
636
+ print(msg)
637
+ if voice_builtin_gender != voice_path_gender:
638
+ semitones = -4 if voice_path_gender == 'male' else 4
639
+ msg = f"Adapting builtin voice frequencies from the clone voice..."
640
+ print(msg)
641
+ else:
642
+ semitones = 0
643
+ settings['semitones'][settings['voice_path']] = semitones
644
+ if semitones != 0:
645
+ try:
646
+ cmd = [
647
+ shutil.which('sox'), tmp_in_wav,
648
+ "-r", str(settings['samplerate']), tmp_out_wav,
649
+ "pitch", str(semitones * 100)
650
+ ]
651
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
652
+ except subprocess.CalledProcessError as e:
653
+ print(f"Subprocess error: {e.stderr}")
654
+ DependencyError(e)
655
+ return False
656
+ except FileNotFoundError as e:
657
+ print(f"File not found: {e}")
658
+ DependencyError(e)
659
+ return False
660
+ else:
661
+ tmp_out_wav = tmp_in_wav
662
+ tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
663
+ if tts_vc:
664
+ settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
665
+ source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
666
+ target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
667
+ audio_sentence = tts_vc.voice_conversion(
668
+ source_wav=source_wav,
669
+ target_wav=target_wav
670
+ )
671
+ else:
672
+ error = f'Engine {self.tts_vc_key} is None'
673
+ print(error)
674
+ return False
675
+ if os.path.exists(tmp_in_wav):
676
+ os.remove(tmp_in_wav)
677
+ if os.path.exists(tmp_out_wav):
678
+ os.remove(tmp_out_wav)
679
+ if os.path.exists(source_wav):
680
+ os.remove(source_wav)
681
+ else:
682
+ audio_sentence = tts.tts(
683
+ text=re.sub(not_supported_punc_pattern, ' ', sentence),
684
+ **speaker_argument
685
+ )
686
+ elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
687
+ speaker_argument = {}
688
+ not_supported_punc_pattern = re.compile(r'["—]')
689
+ if settings['voice_path'] is not None:
690
+ proc_dir = os.path.join(self.session['voice_dir'], 'proc')
691
+ os.makedirs(proc_dir, exist_ok=True)
692
+ tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
693
+ tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
694
+ tts.tts_to_file(
695
+ text=re.sub(not_supported_punc_pattern, '', sentence),
696
+ file_path=tmp_in_wav,
697
+ **speaker_argument
698
+ )
699
+ if settings['voice_path'] in settings['semitones'].keys():
700
+ semitones = settings['semitones'][settings['voice_path']]
701
+ else:
702
+ voice_path_gender = detect_gender(settings['voice_path'])
703
+ voice_builtin_gender = detect_gender(tmp_in_wav)
704
+ msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
705
+ print(msg)
706
+ if voice_builtin_gender != voice_path_gender:
707
+ semitones = -4 if voice_path_gender == 'male' else 4
708
+ msg = f"Adapting builtin voice frequencies from the clone voice..."
709
+ print(msg)
710
+ else:
711
+ semitones = 0
712
+ settings['semitones'][settings['voice_path']] = semitones
713
+ if semitones != 0:
714
+ try:
715
+ cmd = [
716
+ shutil.which('sox'), tmp_in_wav,
717
+ "-r", str(settings['samplerate']), tmp_out_wav,
718
+ "pitch", str(semitones * 100)
719
+ ]
720
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
721
+ except subprocess.CalledProcessError as e:
722
+ error = f"Subprocess error: {e.stderr}"
723
+ print(error)
724
+ DependencyError(e)
725
+ return False
726
+ except FileNotFoundError as e:
727
+ error = f"File not found: {e}"
728
+ print(error)
729
+ DependencyError(e)
730
+ return False
731
+ else:
732
+ tmp_out_wav = tmp_in_wav
733
+ tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
734
+ if tts_vc:
735
+ settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
736
+ source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
737
+ target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
738
+ audio_sentence = tts_vc.voice_conversion(
739
+ source_wav=source_wav,
740
+ target_wav=target_wav
741
+ )
742
+ else:
743
+ error = f'Engine {self.tts_vc_key} is None'
744
+ print(error)
745
+ return False
746
+ if os.path.exists(tmp_in_wav):
747
+ os.remove(tmp_in_wav)
748
+ if os.path.exists(tmp_out_wav):
749
+ os.remove(tmp_out_wav)
750
+ if os.path.exists(source_wav):
751
+ os.remove(source_wav)
752
+ else:
753
+ audio_sentence = tts.tts(
754
+ text=re.sub(not_supported_punc_pattern, '', sentence),
755
+ **speaker_argument
756
+ )
757
+ elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
758
+ speaker_argument = {}
759
+ language = {'en': 'en', 'fr': 'fr-fr', 'pt': 'pt-br'}.get(self.session['language_iso1'], 'en')
760
+ if settings['voice_path'] is not None:
761
+ speaker_wav = settings['voice_path']
762
+ speaker_argument = {"speaker_wav": speaker_wav}
763
+ else:
764
+ voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
765
+ speaker_argument = {"speaker": voice_key}
766
+ with torch.no_grad():
767
+ audio_sentence = tts.tts(
768
+ text=sentence.replace('—', '').strip(),
769
+ language=language,
770
+ **speaker_argument
771
+ )
772
+ if is_audio_data_valid(audio_sentence):
773
+ sourceTensor = self._tensor_type(audio_sentence)
774
+ audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
775
+ if sentence[-1].isalnum() or sentence[-1] == '—':
776
+ audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
777
+ self.audio_segments.append(audio_tensor)
778
+ if not re.search(r'\w$', sentence, flags=re.UNICODE):
779
+ silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
780
+ break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
781
+ self.audio_segments.append(break_tensor.clone())
782
+ if self.audio_segments:
783
+ audio_tensor = torch.cat(self.audio_segments, dim=-1)
784
+ start_time = self.sentences_total_time
785
+ duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
786
+ end_time = start_time + duration
787
+ self.sentences_total_time = end_time
788
+ sentence_obj = {
789
+ "start": start_time,
790
+ "end": end_time,
791
+ "text": sentence,
792
+ "resume_check": self.sentence_idx
793
+ }
794
+ self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
795
+ if self.sentence_idx:
796
+ torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
797
+ del audio_tensor
798
+ self.audio_segments = []
799
+ if os.path.exists(final_sentence_file):
800
+ return True
801
+ else:
802
+ error = f"Cannot create {final_sentence_file}"
803
+ print(error)
804
+ else:
805
+ error = f"convert() error: {self.session['tts_engine']} is None"
806
+ print(error)
807
+ except Exception as e:
808
+ error = f'Coqui.convert(): {e}'
809
+ raise ValueError(error)
810
+ return False
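
The VITS, FAIRSEQ and TACOTRON2 branches above repeat the same gender-match and sox pitch-shift step. A minimal standalone sketch of that step, assuming sox is installed and on PATH (the helper name match_pitch_to_clone is hypothetical, not part of this commit):

import shutil
import subprocess

def match_pitch_to_clone(tmp_in_wav, tmp_out_wav, clone_gender, builtin_gender, samplerate):
    if builtin_gender == clone_gender:
        return tmp_in_wav  # registers already match; nothing to do
    # shift down 4 semitones toward a male clone, up 4 toward a female clone
    semitones = -4 if clone_gender == 'male' else 4
    cmd = [
        shutil.which('sox'), tmp_in_wav,
        '-r', str(samplerate), tmp_out_wav,
        'pitch', str(semitones * 100)  # sox "pitch" takes cents: 100 cents = 1 semitone
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return tmp_out_wav

Hoisting this into a helper would also remove the triplicated block in the engine branches above.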
lib/classes/tts_manager.py ADDED
@@ -0,0 +1,37 @@
1
+ import os
2
+
3
+ from lib.models import TTS_ENGINES
4
+
5
+ class TTSManager:
6
+ def __init__(self, session):
7
+ self.session = session
8
+ self.tts = None
9
+ self._build()
10
+
11
+ def _build(self):
12
+ if self.session['tts_engine'] in TTS_ENGINES.values():
13
+ if self.session['tts_engine'] in [TTS_ENGINES['XTTSv2'], TTS_ENGINES['BARK'], TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2'], TTS_ENGINES['YOURTTS']]:
14
+ from lib.classes.tts_engines.coqui import Coqui
15
+ self.tts = Coqui(self.session)
16
+ #elif self.session['tts_engine'] in [TTS_ENGINES['NEW_TTS']]:
17
+ # from lib.classes.tts_engines.new_tts import NewTts
18
+ # self.tts = NewTts(self.session)
19
+ if self.tts:
20
+ return True
21
+ else:
22
+ error = 'TTS engine could not be created!'
23
+ print(error)
24
+ else:
25
+ print('Other TTS engines coming soon!')
26
+ return False
27
+
28
+ def convert_sentence2audio(self, sentence_number, sentence):
29
+ try:
30
+ if self.session['tts_engine'] in TTS_ENGINES.values():
31
+ return self.tts.convert(sentence_number, sentence)
32
+ else:
33
+ print('Other TTS engines coming soon!')
34
+ except Exception as e:
35
+ error = f'convert_sentence2audio(): {e}'
36
+ raise ValueError(error)
37
+ return False
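
A minimal usage sketch for the manager. The session dict here is illustrative and incomplete; a real session carries many more keys (language, device, voice and model settings) and is built elsewhere in the app:

from lib.models import TTS_ENGINES
from lib.classes.tts_manager import TTSManager

session = {'tts_engine': TTS_ENGINES['XTTSv2'], 'fine_tuned': 'internal'}  # illustrative only
manager = TTSManager(session)
if manager.tts:
    # returns True and writes the sentence audio when conversion succeeds
    ok = manager.convert_sentence2audio(1, 'Hello, audiobook world.')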
lib/classes/voice_extractor.py ADDED
@@ -0,0 +1,286 @@
1
+ import os
2
+ import numpy as np
3
+ import regex as re
4
+ import scipy.fftpack
5
+ import soundfile as sf
6
+ import subprocess
7
+ import shutil
8
+
9
+ from io import BytesIO
10
+ from pydub import AudioSegment, silence
11
+ from pydub.silence import detect_silence
12
+
13
+ from lib.conf import voice_formats, default_audio_proc_samplerate
14
+ from lib.models import TTS_ENGINES, models
15
+ from lib.classes.background_detector import BackgroundDetector
16
+
17
+ class VoiceExtractor:
18
+
19
+ def __init__(self, session, voice_file, voice_name):
20
+ self.wav_file = None
21
+ self.session = session
22
+ self.voice_file = voice_file
23
+ self.voice_name = voice_name
24
+ self.voice_track = 'vocals.wav'
25
+ self.samplerate = models[session['tts_engine']][session['fine_tuned']]['samplerate']
26
+ self.output_dir = self.session['voice_dir']
27
+ self.demucs_dir = os.path.join(self.output_dir, 'htdemucs', voice_name)
28
+ self.silence_threshold = -60
29
+
30
+ def _validate_format(self):
31
+ file_extension = os.path.splitext(self.voice_file)[1].lower()
32
+ if file_extension in voice_formats:
33
+ msg = 'Input file valid'
34
+ return True, msg
35
+ error = f'Unsupported file format: {file_extension}. Supported formats are: {", ".join(voice_formats)}'
36
+ return False, error
37
+
38
+ def _convert2wav(self):
39
+ try:
40
+ self.wav_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
41
+ ffmpeg_cmd = [
42
+ shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_file,
43
+ '-ac', '1',
44
+ '-y', self.wav_file
45
+ ]
46
+ process = subprocess.Popen(
47
+ ffmpeg_cmd,
48
+ env={},
49
+ stdout=subprocess.PIPE,
50
+ stderr=subprocess.STDOUT,
51
+ text=True,
52
+ universal_newlines=True,
53
+ encoding='utf-8'
54
+ )
55
+ for line in process.stdout:
56
+ print(line, end='') # Print each line of stdout
57
+ process.wait()
58
+ if process.returncode != 0:
59
+ error = f'_convert2wav(): process.returncode: {process.returncode}'
60
+ elif not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
61
+ error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
62
+ else:
63
+ msg = 'Conversion to .wav format for processing successful'
64
+ return True, msg
65
+ except subprocess.CalledProcessError as e:
66
+ error = f'_convert2wav() ffmpeg.Error: {e.stderr.decode()}'
67
+ raise ValueError(error)
68
+ except Exception as e:
69
+ error = f'_convert2wav() error: {e}'
70
+ raise ValueError(error)
71
+ return False, error
72
+
73
+ def _detect_background(self):
74
+ try:
75
+ msg = 'Detecting any background noise or music...'
76
+ print(msg)
77
+ detector = BackgroundDetector(wav_file=self.wav_file)
78
+ status, report = detector.detect(vad_ratio_thresh=0.15)
79
+ print(report)
80
+ if status:
81
+ msg = 'Background noise or music detected. Proceeding with voice extraction...'
82
+ else:
83
+ msg = 'No background noise or music detected. Skipping separation...'
84
+ return True, status, msg
85
+ except Exception as e:
86
+ error = f'_detect_background() error: {e}'
87
+ raise ValueError(error)
88
+ return False, False, error
89
+
90
+ def _demucs_voice(self):
91
+ try:
92
+ cmd = [
93
+ "demucs",
94
+ "--verbose",
95
+ "--two-stems=vocals",
96
+ "--out", self.output_dir,
97
+ self.wav_file
98
+ ]
99
+ try:
100
+ process = subprocess.run(cmd, check=True)
101
+ self.voice_track = os.path.join(self.demucs_dir, self.voice_track)
102
+ msg = 'Voice track isolation successful'
103
+ return True, msg
104
+ except subprocess.CalledProcessError as e:
105
+ error = (
106
+ f'_demucs_voice() subprocess CalledProcessError error: {e.returncode}\n\n'
107
+ f'stdout: {e.output}\n\n'
108
+ f'stderr: {e.stderr}'
109
+ )
110
+ raise ValueError(error)
111
+ except FileNotFoundError:
112
+ error = f'_demucs_voice() subprocess FileNotFoundError error: The "demucs" command was not found. Ensure it is installed and in PATH.'
113
+ raise ValueError(error)
114
+ except Exception as e:
115
+ error = f'_demucs_voice() subprocess Exception error: {str(e)}'
116
+ raise ValueError(error)
117
+ except Exception as e:
118
+ error = f'_demucs_voice() error: {e}'
119
+ raise ValueError(error)
120
+ return False, error
121
+
122
+ def _remove_silences(self, audio, silence_threshold, min_silence_len=200, keep_silence=300):
123
+ final_audio = AudioSegment.silent(duration=0)
124
+ chunks = silence.split_on_silence(
125
+ audio,
126
+ min_silence_len=min_silence_len,
127
+ silence_thresh=silence_threshold,
128
+ keep_silence=keep_silence
129
+ )
130
+ for chunk in chunks:
131
+ final_audio += chunk
132
+ final_audio.export(self.voice_track, format='wav')
133
+
134
+ def _trim_and_clean(self, silence_threshold, min_silence_len=200, chunk_size=100):
135
+ try:
136
+ audio = AudioSegment.from_file(self.voice_track)
137
+ total_duration = len(audio) # Total duration in milliseconds
138
+ min_required_duration = 20000 if self.session['tts_engine'] == TTS_ENGINES['BARK'] else 12000
139
+ msg = f"Removing long pauses..."
140
+ print(msg)
141
+ self._remove_silences(audio, silence_threshold)
142
+ if total_duration <= min_required_duration:
143
+ msg = f"Audio is only {total_duration/1000:.2f}s long; skipping audio trimming..."
144
+ return True, msg
145
+ else:
146
+ if total_duration > (min_required_duration * 2):
147
+ msg = f"Audio longer than the max allowed. Proceeding to audio trimming..."
148
+ print(msg)
149
+ window = min_required_duration
150
+ hop = max(1, window // 4)
151
+ best_var = -float("inf")
152
+ best_start = 0
153
+ sr = audio.frame_rate
154
+ for start in range(0, total_duration - window + 1, hop):
155
+ chunk = audio[start : start + window]
156
+ samples = np.array(chunk.get_array_of_samples()).astype(float)
157
+ # 1) FFT + magnitude
158
+ spectrum = np.abs(scipy.fftpack.fft(samples))
159
+ # 2) turn into a probability distribution
160
+ p = spectrum / (np.sum(spectrum) + 1e-10)
161
+ # 3) spectral entropy
162
+ entropy = -np.sum(p * np.log2(p + 1e-10))
163
+ if entropy > best_var:
164
+ best_var = entropy
165
+ best_start = start
166
+ best_end = best_start + window
167
+ msg = (
168
+ f"Selected most‐diverse‐spectrum window "
169
+ f"{best_start/1000:.2f}s–{best_end/1000:.2f}s "
170
+ f"(@ entropy {best_var:.2f} bits)"
171
+ )
172
+ print(msg)
173
+ # 1) find all silent spans in the file
174
+ silence_spans = detect_silence(
175
+ audio,
176
+ min_silence_len=min_silence_len,
177
+ silence_thresh=silence_threshold
178
+ )
179
+ # silence_spans = [ [start_ms, end_ms], … ]
180
+ # 2) snap best_start *backward* to the end of the last silence before it
181
+ prev_ends = [end for (start, end) in silence_spans if end <= best_start]
182
+ if prev_ends:
183
+ new_start = max(prev_ends)
184
+ else:
185
+ new_start = 0
186
+ # 3) snap best_end *forward* to the start of the first silence after it
187
+ next_starts = [start for (start, end) in silence_spans if start >= best_end]
188
+ if next_starts:
189
+ new_end = min(next_starts)
190
+ else:
191
+ new_end = total_duration
192
+ # 4) update your slice bounds
193
+ best_start, best_end = new_start, new_end
194
+ else:
195
+ best_start = 0
196
+ best_end = total_duration
197
+ trimmed_audio = audio[best_start:best_end]
198
+ trimmed_audio.export(self.voice_track, format='wav')
199
+ msg = 'Audio trimmed and cleaned!'
200
+ return True, msg
201
+ except Exception as e:
202
+ error = f'_trim_and_clean() error: {e}'
203
+ raise ValueError(error)
204
+
205
+ def _normalize_audio(self):
206
+ error = ''
207
+ try:
208
+ proc_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
209
+ final_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
210
+ ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
211
+ filter_complex = (
212
+ 'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
213
+ 'afftdn=nf=-70,'
214
+ 'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
215
+ 'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
216
+ 'equalizer=f=150:t=q:w=2:g=1,'
217
+ 'equalizer=f=250:t=q:w=2:g=-3,'
218
+ 'equalizer=f=3000:t=q:w=2:g=2,'
219
+ 'equalizer=f=5500:t=q:w=2:g=-4,'
220
+ 'equalizer=f=9000:t=q:w=2:g=-2,'
221
+ 'highpass=f=63[audio]'
222
+ )
223
+ ffmpeg_cmd += [
224
+ '-filter_complex', filter_complex,
225
+ '-map', '[audio]',
226
+ '-ar', f'{default_audio_proc_samplerate}',
227
+ '-y', proc_voice_file
228
+ ]
229
+ try:
230
+ process = subprocess.Popen(
231
+ ffmpeg_cmd,
232
+ env={},
233
+ stdout=subprocess.PIPE,
234
+ stderr=subprocess.PIPE,
235
+ encoding='utf-8',
236
+ errors='ignore'
237
+ )
238
+ for line in process.stdout:
239
+ print(line, end='') # Print each line of stdout
240
+ process.wait()
241
+ if process.returncode != 0:
242
+ error = f'_normalize_audio(): process.returncode: {process.returncode}'
243
+ elif not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
244
+ error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
245
+ else:
246
+ os.replace(proc_voice_file, final_voice_file)
247
+ shutil.rmtree(self.demucs_dir, ignore_errors=True)
248
+ msg = 'Audio normalization successful!'
249
+ return True, msg
250
+ except subprocess.CalledProcessError as e:
251
+ error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
252
+ except FileNotFoundError as e:
253
+ error = f'_normalize_audio() FileNotFoundError: {e} Input file or FFmpeg PATH not found!'
254
+ except Exception as e:
255
+ error = f'_normalize_audio() error: {e}'
256
+ return False, error
257
+
258
+ def extract_voice(self):
259
+ success = False
260
+ msg = None
261
+ try:
262
+ success, msg = self._validate_format()
263
+ print(msg)
264
+ if success:
265
+ success, msg = self._convert2wav()
266
+ print(msg)
267
+ if success:
268
+ success, status, msg = self._detect_background()
269
+ print(msg)
270
+ if success:
271
+ if status:
272
+ success, msg = self._demucs_voice()
273
+ print(msg)
274
+ else:
275
+ self.voice_track = self.wav_file
276
+ if success:
277
+ success, msg = self._trim_and_clean(self.silence_threshold)
278
+ print(msg)
279
+ if success:
280
+ success, msg = self._normalize_audio()
281
+ print(msg)
282
+ except Exception as e:
283
+ msg = f'extract_voice() error: {e}'
284
+ raise ValueError(msg)
285
+ shutil.rmtree(self.demucs_dir, ignore_errors=True)
286
+ return success, msg
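
A sketch of the intended call sequence (validate, convert to wav, detect background, demucs if needed, trim, normalize), assuming a hypothetical minimal session; the extractor shells out to ffmpeg and demucs, so both must be installed:

from lib.classes.voice_extractor import VoiceExtractor

session = {'tts_engine': 'xtts', 'fine_tuned': 'internal', 'voice_dir': '/tmp/voices'}  # illustrative only
extractor = VoiceExtractor(session, voice_file='interview.mp3', voice_name='narrator')
success, msg = extractor.extract_voice()
print(success, msg)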
lib/conf.py ADDED
@@ -0,0 +1,78 @@
1
+ import os
2
+ import platform
3
+
4
+ tmp_dir = os.path.abspath('tmp')
5
+ tmp_expire = 7 # days
6
+
7
+ models_dir = os.path.abspath('models')
8
+ ebooks_dir = os.path.abspath('ebooks')
9
+ voices_dir = os.path.abspath('voices')
10
+ tts_dir = os.path.join(models_dir, 'tts')
11
+
12
+ os.environ['PYTHONUTF8'] = '1'
13
+ os.environ['PYTHONIOENCODING'] = 'utf-8'
14
+ os.environ['COQUI_TOS_AGREED'] = '1'
15
+ os.environ['PYTHONIOENCODING'] = 'utf-8'
16
+ os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
17
+ os.environ['GRADIO_DEBUG'] = '1'
18
+ os.environ['DO_NOT_TRACK'] = 'true'
19
+ os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
20
+ os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
21
+ os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
22
+ os.environ['HF_HOME'] = tts_dir
23
+ os.environ['HF_DATASETS_CACHE'] = tts_dir
24
+ os.environ['BARK_CACHE_DIR'] = tts_dir
25
+ os.environ['TTS_CACHE'] = tts_dir
26
+ os.environ['TORCH_HOME'] = tts_dir
27
+ os.environ['TTS_HOME'] = models_dir
28
+ os.environ['XDG_CACHE_HOME'] = models_dir
29
+ os.environ['STANZA_RESOURCES_DIR'] = os.path.join(models_dir, 'stanza')
30
+ os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
31
+ os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
32
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
33
+ os.environ['SUNO_OFFLOAD_CPU'] = 'False' # BARK option: False needs a GPU
34
+ os.environ['SUNO_USE_SMALL_MODELS'] = 'False' # BARK option: False needs a GPU with VRAM > 4GB
35
+ if platform.system() == 'Windows':
36
+ os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
37
+
38
+ prog_version = (lambda: open('VERSION.txt').read().strip())()
39
+
40
+ min_python_version = (3,10)
41
+ max_python_version = (3,12)
42
+
43
+ NATIVE = 'native'
44
+ FULL_DOCKER = 'full_docker'
45
+
46
+ debug_mode = True
47
+
48
+ device_list = ['cpu', 'gpu', 'mps']
49
+ default_device = 'cpu'
50
+ default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">howto wiki</a>'
51
+
52
+ python_env_dir = os.path.abspath(os.path.join('.','python_env'))
53
+ requirements_file = os.path.abspath(os.path.join('.','requirements.txt'))
54
+
55
+ interface_host = '0.0.0.0'
56
+ interface_port = 7860
57
+ interface_shared_tmp_expire = 3 # in days
58
+ interface_concurrency_limit = 1 # or None for unlimited
59
+
60
+ interface_component_options = {
61
+ "gr_tab_xtts_params": True,
62
+ "gr_tab_bark_params": True,
63
+ "gr_group_voice_file": True,
64
+ "gr_group_custom_model": True
65
+ }
66
+
67
+ audiobooks_gradio_dir = os.path.abspath(os.path.join('audiobooks','gui','gradio'))
68
+ audiobooks_host_dir = os.path.abspath(os.path.join('audiobooks','gui','host'))
69
+ audiobooks_cli_dir = os.path.abspath(os.path.join('audiobooks','cli'))
70
+
71
+ ebook_formats = ['.epub', '.mobi', '.azw3', '.fb2', '.lrf', '.rb', '.snb', '.tcr', '.pdf', '.txt', '.rtf', '.doc', '.docx', '.html', '.odt', '.azw'] # Add or remove the format you accept as input
72
+ voice_formats = ['.mp4', '.m4b', '.m4a', '.mp3', '.wav', '.aac', '.flac', '.alac', '.ogg', '.aiff', '.aif', '.wma', '.dsd', '.opus', '.pcmu', '.pcma', '.gsm'] # Add or remove the format you accept as input
73
+ output_formats = ['aac', 'flac', 'mp3', 'm4b', 'm4a', 'mp4', 'mov', 'ogg', 'wav', 'webm']
74
+ default_audio_proc_samplerate = 24000
75
+ default_audio_proc_format = 'flac' # or 'mp3', 'aac', 'm4a', 'm4b', 'amr', '3gp', 'alac'. 'wav' works too but is limited to files < 4GB
76
+ default_output_format = 'm4b'
77
+ default_output_split = False
78
+ default_output_split_hours = '6' # if the final output exceeds default_output_split_hours * 2 hours, it is split into chunks of default_output_split_hours each, plus the remainder if any.
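
A worked illustration of the split rule described in the comment above (the actual splitting presumably lives in lib/functions.py, whose diff is not rendered here; note the setting is stored as the string '6', so callers must cast it):

def plan_split(total_hours, split_hours=6):
    # split only when the output exceeds twice the configured chunk length
    if total_hours <= split_hours * 2:
        return [total_hours]
    chunks = [split_hours] * int(total_hours // split_hours)
    remainder = total_hours % split_hours
    if remainder:
        chunks.append(remainder)
    return chunks

print(plan_split(5))   # [5]        -> one file
print(plan_split(14))  # [6, 6, 2]  -> three files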
lib/functions.py ADDED
The diff for this file is too large to render. See raw diff
 
lib/lang.py ADDED
The diff for this file is too large to render. See raw diff
 
lib/models.py ADDED
@@ -0,0 +1,493 @@
1
+ import os
2
+
3
+ from lib.conf import tts_dir, voices_dir
4
+ loaded_tts = {}
5
+
6
+ TTS_ENGINES = {
7
+ "XTTSv2": "xtts",
8
+ "BARK": "bark",
9
+ "VITS": "vits",
10
+ "FAIRSEQ": "fairseq",
11
+ "TACOTRON2": "tacotron",
12
+ "YOURTTS": "yourtts"
13
+ }
14
+
15
+ TTS_VOICE_CONVERSION = {
16
+ "freevc24": {"path": "voice_conversion_models/multilingual/vctk/freevc24", "samplerate": 24000},
17
+ "knnvc": {"path": "voice_conversion_models/multilingual/multi-dataset/knnvc", "samplerate": 16000},
18
+ "openvoice_v1": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1", "samplerate": 22050},
19
+ "openvoice_v2": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2", "samplerate": 22050}
20
+ }
21
+
22
+ TTS_SML = {
23
+ "break": "‡break‡",
24
+ "pause": "‡pause‡",
25
+ "###": "‡pause‡"
26
+ }
27
+
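
The SML markers above are consumed elsewhere (lib/functions.py is too large to render in this diff). A hedged sketch of one way such markers can be tokenized, as an illustration rather than the project's actual parser:

import re

SML_TOKENS = {'‡break‡', '‡pause‡'}  # i.e. set(TTS_SML.values())
sml_pattern = re.compile('(' + '|'.join(map(re.escape, SML_TOKENS)) + ')')

def split_sml(text):
    # keep the control tokens as standalone items between speakable chunks
    return [part for part in sml_pattern.split(text) if part.strip()]

print(split_sml('Chapter one.‡pause‡It was a dark night.‡break‡He ran.'))
# ['Chapter one.', '‡pause‡', 'It was a dark night.', '‡break‡', 'He ran.']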
28
+ default_tts_engine = TTS_ENGINES['XTTSv2']
29
+ default_fine_tuned = 'internal'
30
+ default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
31
+ default_voice_detection_model = 'drewThomasson/segmentation'
32
+
33
+ max_tts_in_memory = 2 # TTS engines to keep in memory (1 tts engine ~= 4GB to 8GB RAM).
34
+ max_custom_model = 100
35
+ max_custom_voices = 1000
36
+ max_upload_size = '6GB'
37
+
38
+ default_engine_settings = {
39
+ TTS_ENGINES['XTTSv2']: {
40
+ "samplerate": 24000,
41
+ "temperature": 0.75,
42
+ "length_penalty": 1.0,
43
+ "num_beams": 1,
44
+ "repetition_penalty": 3.0,
45
+ "top_k": 50,
46
+ "top_p": 0.85,
47
+ "speed": 1.0,
48
+ "enable_text_splitting": False,
49
+ # to enable deepspeed, you must install it first:
50
+ # conda activate ./python_env (linux/mac) or .\python_env (windows)
51
+ # pip install deepspeed
52
+ # conda deactivate
53
+ "use_deepspeed": False,
54
+ "files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav', 'speakers_xtts.pth'],
55
+ "voices": {
56
+ "ClaribelDervla": "Claribel Dervla", "DaisyStudious": "Daisy Studious", "GracieWise": "Gracie Wise",
57
+ "TammieEma": "Tammie Ema", "AlisonDietlinde": "Alison Dietlinde", "AnaFlorence": "Ana Florence",
58
+ "AnnmarieNele": "Annmarie Nele", "AsyaAnara": "Asya Anara", "BrendaStern": "Brenda Stern",
59
+ "GittaNikolina": "Gitta Nikolina", "HenrietteUsha": "Henriette Usha", "SofiaHellen": "Sofia Hellen",
60
+ "TammyGrit": "Tammy Grit", "TanjaAdelina": "Tanja Adelina", "VjollcaJohnnie": "Vjollca Johnnie",
61
+ "AndrewChipper": "Andrew Chipper", "BadrOdhiambo": "Badr Odhiambo", "DionisioSchuyler": "Dionisio Schuyler",
62
+ "RoystonMin": "Royston Min", "ViktorEka": "Viktor Eka", "AbrahanMack": "Abrahan Mack",
63
+ "AddeMichal": "Adde Michal", "BaldurSanjin": "Baldur Sanjin", "CraigGutsy": "Craig Gutsy",
64
+ "DamienBlack": "Damien Black", "GilbertoMathias": "Gilberto Mathias", "IlkinUrbano": "Ilkin Urbano",
65
+ "KazuhikoAtallah": "Kazuhiko Atallah", "LudvigMilivoj": "Ludvig Milivoj", "SuadQasim": "Suad Qasim",
66
+ "TorcullDiarmuid": "Torcull Diarmuid", "ViktorMenelaos": "Viktor Menelaos", "ZacharieAimilios": "Zacharie Aimilios",
67
+ "NovaHogarth": "Nova Hogarth", "MajaRuoho": "Maja Ruoho", "UtaObando": "Uta Obando",
68
+ "LidiyaSzekeres": "Lidiya Szekeres", "ChandraMacFarland": "Chandra MacFarland", "SzofiGranger": "Szofi Granger",
69
+ "CamillaHolmström": "Camilla Holmström", "LilyaStainthorpe": "Lilya Stainthorpe", "ZofijaKendrick": "Zofija Kendrick",
70
+ "NarelleMoon": "Narelle Moon", "BarboraMacLean": "Barbora MacLean", "AlexandraHisakawa": "Alexandra Hisakawa",
71
+ "AlmaMaría": "Alma María", "RosemaryOkafor": "Rosemary Okafor", "IgeBehringer": "Ige Behringer",
72
+ "FilipTraverse": "Filip Traverse", "DamjanChapman": "Damjan Chapman", "WulfCarlevaro": "Wulf Carlevaro",
73
+ "AaronDreschner": "Aaron Dreschner", "KumarDahl": "Kumar Dahl", "EugenioMataracı": "Eugenio Mataracı",
74
+ "FerranSimen": "Ferran Simen", "XavierHayasaka": "Xavier Hayasaka", "LuisMoray": "Luis Moray",
75
+ "MarcosRudaski": "Marcos Rudaski"
76
+ },
77
+ "rating": {"GPU VRAM": 4, "CPU": 3, "RAM": 8, "Realism": 4}
78
+ },
79
+ TTS_ENGINES['BARK']: {
80
+ "samplerate": 24000,
81
+ "text_temp": 0.50,
82
+ "waveform_temp": 0.50,
83
+ "files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
84
+ "speakers_path": os.path.join(voices_dir, '__bark'),
85
+ "voices": {
86
+ "de_speaker_0": "Speaker 0", "de_speaker_1": "Speaker 1", "de_speaker_2": "Speaker 2",
87
+ "de_speaker_3": "Speaker 3", "de_speaker_4": "Speaker 4", "de_speaker_5": "Speaker 5",
88
+ "de_speaker_6": "Speaker 6", "de_speaker_7": "Speaker 7", "de_speaker_8": "Speaker 8",
89
+ "de_speaker_9": "Speaker 9", "en_speaker_0": "Speaker 0", "en_speaker_1": "Speaker 1",
90
+ "en_speaker_2": "Speaker 2", "en_speaker_3": "Speaker 3", "en_speaker_4": "Speaker 4",
91
+ "en_speaker_5": "Speaker 5", "en_speaker_6": "Speaker 6", "en_speaker_7": "Speaker 7",
92
+ "en_speaker_8": "Speaker 8", "en_speaker_9": "Speaker 9", "es_speaker_0": "Speaker 0",
93
+ "es_speaker_1": "Speaker 1", "es_speaker_2": "Speaker 2", "es_speaker_3": "Speaker 3",
94
+ "es_speaker_4": "Speaker 4", "es_speaker_5": "Speaker 5", "es_speaker_6": "Speaker 6",
95
+ "es_speaker_7": "Speaker 7", "es_speaker_8": "Speaker 8", "es_speaker_9": "Speaker 9",
96
+ "fr_speaker_0": "Speaker 0", "fr_speaker_1": "Speaker 1", "fr_speaker_2": "Speaker 2",
97
+ "fr_speaker_3": "Speaker 3", "fr_speaker_4": "Speaker 4", "fr_speaker_5": "Speaker 5",
98
+ "fr_speaker_6": "Speaker 6", "fr_speaker_7": "Speaker 7", "fr_speaker_8": "Speaker 8",
99
+ "fr_speaker_9": "Speaker 9", "hi_speaker_0": "Speaker 0", "hi_speaker_1": "Speaker 1",
100
+ "hi_speaker_2": "Speaker 2", "hi_speaker_3": "Speaker 3", "hi_speaker_4": "Speaker 4",
101
+ "hi_speaker_5": "Speaker 5", "hi_speaker_6": "Speaker 6", "hi_speaker_7": "Speaker 7",
102
+ "hi_speaker_8": "Speaker 8", "hi_speaker_9": "Speaker 9", "it_speaker_0": "Speaker 0",
103
+ "it_speaker_1": "Speaker 1", "it_speaker_2": "Speaker 2", "it_speaker_3": "Speaker 3",
104
+ "it_speaker_4": "Speaker 4", "it_speaker_5": "Speaker 5", "it_speaker_6": "Speaker 6",
105
+ "it_speaker_7": "Speaker 7", "it_speaker_8": "Speaker 8", "it_speaker_9": "Speaker 9",
106
+ "ja_speaker_0": "Speaker 0", "ja_speaker_1": "Speaker 1", "ja_speaker_2": "Speaker 2",
107
+ "ja_speaker_3": "Speaker 3", "ja_speaker_4": "Speaker 4", "ja_speaker_5": "Speaker 5",
108
+ "ja_speaker_6": "Speaker 6", "ja_speaker_7": "Speaker 7", "ja_speaker_8": "Speaker 8",
109
+ "ja_speaker_9": "Speaker 9", "ko_speaker_0": "Speaker 0", "ko_speaker_1": "Speaker 1",
110
+ "ko_speaker_2": "Speaker 2", "ko_speaker_3": "Speaker 3", "ko_speaker_4": "Speaker 4",
111
+ "ko_speaker_5": "Speaker 5", "ko_speaker_6": "Speaker 6", "ko_speaker_7": "Speaker 7",
112
+ "ko_speaker_8": "Speaker 8", "ko_speaker_9": "Speaker 9", "pl_speaker_0": "Speaker 0",
113
+ "pl_speaker_1": "Speaker 1", "pl_speaker_2": "Speaker 2", "pl_speaker_3": "Speaker 3",
114
+ "pl_speaker_4": "Speaker 4", "pl_speaker_5": "Speaker 5", "pl_speaker_6": "Speaker 6",
115
+ "pl_speaker_7": "Speaker 7", "pl_speaker_8": "Speaker 8", "pl_speaker_9": "Speaker 9",
116
+ "pt_speaker_0": "Speaker 0", "pt_speaker_1": "Speaker 1", "pt_speaker_2": "Speaker 2",
117
+ "pt_speaker_3": "Speaker 3", "pt_speaker_4": "Speaker 4", "pt_speaker_5": "Speaker 5",
118
+ "pt_speaker_6": "Speaker 6", "pt_speaker_7": "Speaker 7", "pt_speaker_8": "Speaker 8",
119
+ "pt_speaker_9": "Speaker 9", "ru_speaker_0": "Speaker 0", "ru_speaker_1": "Speaker 1",
120
+ "ru_speaker_2": "Speaker 2", "ru_speaker_3": "Speaker 3", "ru_speaker_4": "Speaker 4",
121
+ "ru_speaker_5": "Speaker 5", "ru_speaker_6": "Speaker 6", "ru_speaker_7": "Speaker 7",
122
+ "ru_speaker_8": "Speaker 8", "ru_speaker_9": "Speaker 9", "tr_speaker_0": "Speaker 0",
123
+ "tr_speaker_1": "Speaker 1", "tr_speaker_2": "Speaker 2", "tr_speaker_3": "Speaker 3",
124
+ "tr_speaker_4": "Speaker 4", "tr_speaker_5": "Speaker 5", "tr_speaker_6": "Speaker 6",
125
+ "tr_speaker_7": "Speaker 7", "tr_speaker_8": "Speaker 8", "tr_speaker_9": "Speaker 9",
126
+ "zh_speaker_0": "Speaker 0", "zh_speaker_1": "Speaker 1", "zh_speaker_2": "Speaker 2",
127
+ "zh_speaker_3": "Speaker 3", "zh_speaker_4": "Speaker 4", "zh_speaker_5": "Speaker 5",
128
+ "zh_speaker_6": "Speaker 6", "zh_speaker_7": "Speaker 7", "zh_speaker_8": "Speaker 8",
129
+ "zh_speaker_9": "Speaker 9"
130
+ },
131
+ "rating": {"GPU VRAM": 4, "CPU": 1, "RAM": 16, "Realism": 3}
132
+ },
133
+ TTS_ENGINES['VITS']: {
134
+ "samplerate": 22050,
135
+ "files": ['config.json', 'model_file.pth', 'language_ids.json'],
136
+ "voices": {},
137
+ "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
138
+ },
139
+ TTS_ENGINES['FAIRSEQ']: {
140
+ "samplerate": 16000,
141
+ "files": ['config.json', 'G_100000.pth', 'vocab.json'],
142
+ "voices": {},
143
+ "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
144
+ },
145
+ TTS_ENGINES['TACOTRON2']: {
146
+ "samplerate": 22050,
147
+ "files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
148
+ "voices": {},
149
+ "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
150
+ },
151
+ TTS_ENGINES['YOURTTS']: {
152
+ "samplerate": 16000,
153
+ "files": ['config.json', 'model_file.pth'],
154
+ "voices": {"Machinella-5": "female-en-5", "ElectroMale-2": "male-en-2", 'Machinella-4': 'female-pt-4\n', 'ElectroMale-3': 'male-pt-3\n'},
155
+ "rating": {"GPU VRAM": 1, "CPU": 5, "RAM": 4, "Realism": 1}
156
+ }
157
+ }
158
+ models = {
159
+ TTS_ENGINES['XTTSv2']: {
160
+ "internal": {
161
+ "lang": "multi",
162
+ "repo": "coqui/XTTS-v2",
163
+ "sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
164
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
165
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
166
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
167
+ },
168
+ "AiExplained": {
169
+ "lang": "eng",
170
+ "repo": "drewThomasson/fineTunedTTSModels",
171
+ "sub": "xtts-v2/eng/AiExplained/",
172
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
173
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
174
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
175
+ },
176
+ "AsmrRacoon": {
177
+ "lang": "eng",
178
+ "repo": "drewThomasson/fineTunedTTSModels",
179
+ "sub": "xtts-v2/eng/AsmrRacoon/",
180
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
181
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
182
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
183
+ },
184
+ "Awkwafina": {
185
+ "lang": "eng",
186
+ "repo": "drewThomasson/fineTunedTTSModels",
187
+ "sub": "xtts-v2/eng/Awkwafina/",
188
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
189
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
190
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
191
+ },
192
+ "BobOdenkirk": {
193
+ "lang": "eng",
194
+ "repo": "drewThomasson/fineTunedTTSModels",
195
+ "sub": "xtts-v2/eng/BobOdenkirk/",
196
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
197
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
198
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
199
+ },
200
+ "BobRoss": {
201
+ "lang": "eng",
202
+ "repo": "drewThomasson/fineTunedTTSModels",
203
+ "sub": "xtts-v2/eng/BobRoss/",
204
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
205
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
206
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
207
+ },
208
+ "BrinaPalencia": {
209
+ "lang": "eng",
210
+ "repo": "drewThomasson/fineTunedTTSModels",
211
+ "sub": "xtts-v2/eng/BrinaPalencia/",
212
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
213
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
214
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
215
+ },
216
+ "BryanCranston": {
217
+ "lang": "eng",
218
+ "repo": "drewThomasson/fineTunedTTSModels",
219
+ "sub": "xtts-v2/eng/BryanCranston/",
220
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
221
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
222
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
223
+ },
224
+ "DavidAttenborough": {
225
+ "lang": "eng",
226
+ "repo": "drewThomasson/fineTunedTTSModels",
227
+ "sub": "xtts-v2/eng/DavidAttenborough/",
228
+ "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
229
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
230
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
231
+ },
232
+ "DeathPussInBoots": {
233
+ "lang": "eng",
234
+ "repo": "drewThomasson/fineTunedTTSModels",
235
+ "sub": "xtts-v2/eng/DeathPussInBoots/",
236
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
237
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
238
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
239
+ },
240
+ "DermotCrowley": {
241
+ "lang": "eng",
242
+ "repo": "drewThomasson/fineTunedTTSModels",
243
+ "sub": "xtts-v2/eng/DermotCrowley/",
244
+ "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
245
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
246
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
247
+ },
248
+ "EvaSeymour": {
249
+ "lang": "eng",
250
+ "repo": "drewThomasson/fineTunedTTSModels",
251
+ "sub": "xtts-v2/eng/EvaSeymour/",
252
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
253
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
254
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
255
+ },
256
+ "GideonOfnirEldenRing": {
257
+ "lang": "eng",
258
+ "repo": "drewThomasson/fineTunedTTSModels",
259
+ "sub": "xtts-v2/eng/GideonOfnirEldenRing/",
260
+ "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
261
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
262
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
263
+ },
264
+ "GhostMW2": {
265
+ "lang": "eng",
266
+ "repo": "drewThomasson/fineTunedTTSModels",
267
+ "sub": "xtts-v2/eng/GhostMW2/",
268
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
269
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
270
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
271
+ },
272
+ "JhonButlerASMR": {
273
+ "lang": "eng",
274
+ "repo": "drewThomasson/fineTunedTTSModels",
275
+ "sub": "xtts-v2/eng/JhonButlerASMR/",
276
+ "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JhonButlerASMR.wav'),
277
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
278
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
279
+ },
280
+ "JhonMulaney": {
281
+ "lang": "eng",
282
+ "repo": "drewThomasson/fineTunedTTSModels",
283
+ "sub": "xtts-v2/eng/JhonMulaney/",
284
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JhonMulaney.wav'),
285
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
286
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
287
+ },
288
+ "JillRedfield": {
289
+ "lang": "eng",
290
+ "repo": "drewThomasson/fineTunedTTSModels",
291
+ "sub": "xtts-v2/eng/JillRedfield/",
292
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
293
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
294
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
295
+ },
296
+ "JuliaWhenlan": {
297
+ "lang": "eng",
298
+ "repo": "drewThomasson/fineTunedTTSModels",
299
+ "sub": "xtts-v2/eng/JuliaWhenlan/",
300
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
301
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
302
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
303
+ },
304
+ "LeeHorsley": {
305
+ "lang": "eng",
306
+ "repo": "drewThomasson/fineTunedTTSModels",
307
+ "sub": "xtts-v2/eng/LeeHorsley/",
308
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
309
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
310
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
311
+ },
312
+ "MelinaEldenRing": {
313
+ "lang": "eng",
314
+ "repo": "drewThomasson/fineTunedTTSModels",
315
+ "sub": "xtts-v2/eng/MelinaEldenRing/",
316
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
317
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
318
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
319
+ },
320
+ "MorganFreeman": {
321
+ "lang": "eng",
322
+ "repo": "drewThomasson/fineTunedTTSModels",
323
+ "sub": "xtts-v2/eng/MorganFreeman/",
324
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
325
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
326
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
327
+ },
328
+ "NeilGaiman": {
329
+ "lang": "eng",
330
+ "repo": "drewThomasson/fineTunedTTSModels",
331
+ "sub": "xtts-v2/eng/NeilGaiman/",
332
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
333
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
334
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
335
+ },
336
+ "RainyDayHeadSpace": {
337
+ "lang": "eng",
338
+ "repo": "drewThomasson/fineTunedTTSModels",
339
+ "sub": "xtts-v2/eng/RainyDayHeadSpace/",
340
+ "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
341
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
342
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
343
+ },
344
+ "RayPorter": {
345
+ "lang": "eng",
346
+ "repo": "drewThomasson/fineTunedTTSModels",
347
+ "sub": "xtts-v2/eng/RayPorter/",
348
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
349
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
350
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
351
+ },
352
+ "RelaxForAWhile": {
353
+ "lang": "eng",
354
+ "repo": "drewThomasson/fineTunedTTSModels",
355
+ "sub": "xtts-v2/eng/RelaxForAWhile/",
356
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
357
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
358
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
359
+ },
360
+ "RosamundPike": {
361
+ "lang": "eng",
362
+ "repo": "drewThomasson/fineTunedTTSModels",
363
+ "sub": "xtts-v2/eng/RosamundPike/",
364
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
365
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
366
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
367
+ },
368
+ "ScarlettJohansson": {
369
+ "lang": "eng",
370
+ "repo": "drewThomasson/fineTunedTTSModels",
371
+ "sub": "xtts-v2/eng/ScarlettJohansson/",
372
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
373
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
374
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
375
+ },
376
+ "SladeTeenTitans": {
377
+ "lang": "eng",
378
+ "repo": "drewThomasson/fineTunedTTSModels",
379
+ "sub": "xtts-v2/eng/SladeTeenTitans/",
380
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
381
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
382
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
383
+ },
384
+ "StanleyParable": {
385
+ "lang": "eng",
386
+ "repo": "drewThomasson/fineTunedTTSModels",
387
+ "sub": "xtts-v2/eng/StanleyParable/",
388
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
389
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
390
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
391
+ },
392
+ "WhisperSalemASMR": {
393
+ "lang": "eng",
394
+ "repo": "drewThomasson/fineTunedTTSModels",
395
+ "sub": "xtts-v2/eng/WhisperSalemASMR/",
396
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
397
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
398
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
399
+ },
400
+ "Konishev": {
401
+ "lang": "rus",
402
+ "repo": "drewThomasson/fineTunedTTSModels",
403
+ "sub": "xtts-v2/rus/Konishev/",
404
+ "voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
405
+ "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
406
+ "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
407
+ }
408
+ },
409
+ TTS_ENGINES['BARK']: {
410
+ "internal": {
411
+ "lang": "multi",
412
+ "repo": "erogol/bark", # suno/bark, rsxdalv/suno, tts_models/multilingual/multi-dataset/bark
413
+ "sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
414
+ "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
415
+ "files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
416
+ "samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
417
+ }
418
+ },
419
+ TTS_ENGINES['VITS']: {
420
+ "internal": {
421
+ "lang": "multi",
422
+ "repo": "tts_models/[lang_iso1]/[xxx]",
423
+ "sub": {
424
+ "css10/vits": ['es','hu','fi','fr','nl','ru','el'],
425
+ "custom/vits": ['ca'],
426
+ "custom/vits-female": ['bn', 'fa'],
427
+ "cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
428
+ "mai/vits": ['uk'],
429
+ "mai_female/vits": ['pl'],
430
+ "mai_male/vits": ['it'],
431
+ "openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
432
+ "vctk/vits": ['en'],
433
+ "thorsten/vits": ['de']
434
+ },
435
+ "voice": None,
436
+ "files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
437
+ "samplerate": {
438
+ "css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
439
+ "custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
440
+ "custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
441
+ "cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
442
+ "mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
443
+ "mai_female/vits": 24000,
444
+ "mai_male/vits": 16000,
445
+ "openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
446
+ "vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
447
+ "thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
448
+ }
449
+ }
450
+ },
451
+ TTS_ENGINES['FAIRSEQ']: {
452
+ "internal": {
453
+ "lang": "multi",
454
+ "repo": "tts_models/[lang]/fairseq/vits",
455
+ "sub": "",
456
+ "voice": None,
457
+ "files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
458
+ "samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
459
+ }
460
+ },
461
+ TTS_ENGINES['TACOTRON2']: {
462
+ "internal": {
463
+ "lang": "multi",
464
+ "repo": "tts_models/[lang_iso1]/[xxx]",
465
+ "sub": {
466
+ "mai/tacotron2-DDC": ['fr', 'es', 'nl'],
467
+ "thorsten/tacotron2-DDC": ['de'],
468
+ "kokoro/tacotron2-DDC": ['ja'],
469
+ "ljspeech/tacotron2-DDC": ['en'],
470
+ "baker/tacotron2-DDC-GST": ['zh-CN']
471
+ },
472
+ "voice": None,
473
+ "files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
474
+ "samplerate": {
475
+ "mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
476
+ "thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
477
+ "kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
478
+ "ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
479
+ "baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
480
+ },
481
+ }
482
+ },
483
+ TTS_ENGINES['YOURTTS']: {
484
+ "internal": {
485
+ "lang": "multi",
486
+ "repo": "tts_models/multilingual/multi-dataset/your_tts",
487
+ "sub": "",
488
+ "voice": None,
489
+ "files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
490
+ "samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
491
+ }
492
+ }
493
+ }
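
For VITS and TACOTRON2, "sub" and "samplerate" are dicts keyed by sub-model path, while the other engines store scalars. A hedged helper sketch showing how a caller can normalize the lookup (resolve_samplerate is hypothetical, not part of this commit):

from lib.models import TTS_ENGINES, models

def resolve_samplerate(engine, fine_tuned='internal', lang_iso1=None):
    entry = models[engine][fine_tuned]
    samplerate = entry['samplerate']
    if isinstance(samplerate, dict):
        # dict-valued engines (VITS, TACOTRON2): pick the sub-model
        # whose language list contains the requested iso1 code
        for sub, langs in entry['sub'].items():
            if lang_iso1 in langs:
                return samplerate[sub]
        return None  # language not covered by any sub-model
    return samplerate

print(resolve_samplerate(TTS_ENGINES['XTTSv2']))                # 24000
print(resolve_samplerate(TTS_ENGINES['VITS'], lang_iso1='pl'))  # 24000 (mai_female/vits)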