|
""" |
|
This module defines the 'SubtitleToSpeech' class, which converts subtitle files into speech audio files.

It supports several Text-to-Speech (TTS) back ends: the Harpo SAPI voice (via pyttsx3), Balabolka (IVONA), Microsoft Edge (edge-tts) and externally generated ElevenLabs audio.

* Usage:

Create an instance of 'SubtitleToSpeech' and call its 'generate_audio' method with a 'Settings' object.
|
|
|
* Example usage:

from subtitle_to_speech import SubtitleToSpeech

# Create an instance of SubtitleToSpeech
converter = SubtitleToSpeech(filename="example.srt")

# Generate audio ('settings' is a populated data.settings.Settings instance)
converter.generate_audio(settings)
|
|
|
|
* Example usage (ElevenLabs, audio generated externally):

if __name__ == '__main__':
    if 'TTS - *Głos* - ElevenLabs' in settings.tts:
        audio_generator = SubtitleToSpeech(filename="")
        audio_generator.srt_to_eac3_elevenlabs()  # For Alt Subs
|
""" |
|
|
|
from dataclasses import dataclass |
|
from msvcrt import getch |
|
from os import listdir, path, remove |
|
from subprocess import call, Popen |
|
from threading import Thread |
|
from time import sleep |
|
import wave |
|
from asyncio import create_task, gather, run, sleep as asyncio_sleep
|
from typing import Dict, List, Optional |
|
|
|
import pyttsx3 |
|
import pysrt |
|
from edge_tts import Communicate |
|
from pydub import AudioSegment |
|
from pydub.utils import mediainfo |
|
|
|
from constants import (WORKING_SPACE, |
|
WORKING_SPACE_OUTPUT, |
|
WORKING_SPACE_TEMP, |
|
WORKING_SPACE_TEMP_MAIN_SUBS, |
|
WORKING_SPACE_TEMP_ALT_SUBS, |
|
BALABOLKA_PATH, |
|
FFMPEG_PATH, |
|
console) |
|
from data.settings import Settings |
|
|
|
|
|
@dataclass(slots=True) |
|
class SubtitleToSpeech: |
|
""" |
|
This class provides methods to convert subtitle files to speech audio files. |
|
|
|
Attributes: |
|
- filename (str): The name of the subtitle file to convert. |
|
- working_space (str): The path to the working directory. |
|
- working_space_output (str): The path to the output directory. |
|
- working_space_temp (str): The path to the temporary directory. |
|
- working_space_temp_main_subs (str): The path to the main subtitles directory. |
|
- working_space_temp_alt_subs (str): The path to the alternative subtitles directory. |
|
- balabolka_path (str): The path to the Balabolka executable. |
|
- ffmpeg_path (str): The path to the FFmpeg executable. |
|
|
|
Methods: |
|
- ansi_srt(self) -> None: |
|
Converts the encoding of the subtitle file to ANSI. |
|
|
|
- srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None: |
|
Converts the subtitle file to a WAV audio file using Harpo TTS. |
|
|
|
- srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None: |
|
Converts the subtitle file to a WAV audio file using Balabolka TTS. |
|
|
|
- srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None: |
|
Converts the subtitle file to a WAV audio file using Edge TTS. |
|
|
|
- merge_tts_audio(self) -> None: |
|
Merges the generated TTS audio files. |
|
|
|
- generate_audio(self, settings: Settings) -> None: |
|
Generates the audio file from the subtitle file using the specified TTS settings. |
|
|
|
- srt_to_eac3_elevenlabs(self) -> None: |
|
Opens the main_subs folder for the user to add audio files generated by ElevenLabs. |
|
""" |
|
|
|
filename: str |
|
working_space: str = WORKING_SPACE |
|
working_space_output: str = WORKING_SPACE_OUTPUT |
|
working_space_temp: str = WORKING_SPACE_TEMP |
|
working_space_temp_main_subs: str = WORKING_SPACE_TEMP_MAIN_SUBS |
|
working_space_temp_alt_subs: str = WORKING_SPACE_TEMP_ALT_SUBS |
|
balabolka_path: str = BALABOLKA_PATH |
|
ffmpeg_path: str = FFMPEG_PATH |
|
|
|
def ansi_srt(self) -> None: |
|
""" |
|
Converts the encoding of the subtitle file to ANSI. |
|
|
|
        Notes:

        - Reads the file as UTF-8 first and falls back to the Windows ANSI code page if a UnicodeDecodeError occurs.
|
""" |
|
try: |
|
with open(path.join(self.working_space_temp_main_subs, self.filename), "r", encoding="utf-8") as source_file: |
|
content: str = source_file.read() |
|
except UnicodeDecodeError: |
|
            with open(path.join(self.working_space_temp_main_subs, self.filename), "r", encoding="mbcs") as source_file:
|
content: str = source_file.read() |
|
|
|
        with open(path.join(self.working_space_temp_main_subs, self.filename), "w", encoding="mbcs", errors="ignore") as target_file:
|
target_file.write(content) |
|
|
|
console.print("Zamieniono kodowanie na ANSI:", |
|
style='green_bold', end=' ') |
|
console.print(self.filename) |
|
|
|
def srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None: |
|
""" |
|
Converts the subtitle file to a WAV audio file using Harpo TTS. |
|
|
|
Args: |
|
- tts_speed (str): The speed of the TTS voice. |
|
- tts_volume (str): The volume of the TTS voice. |
|
""" |
|
self.ansi_srt() |
|
engine = self._init_engine(tts_speed, tts_volume) |
|
subtitles: pysrt.SubRipFile = pysrt.open(path.join( |
|
            self.working_space_temp_main_subs, self.filename), encoding='mbcs')
|
output_file: str = path.splitext(path.join( |
|
self.working_space_temp_main_subs, self.filename))[0] + '.wav' |
|
self._generate_wav_file(engine, subtitles, output_file) |
|
remove(path.join(self.working_space_temp, "temp.wav")) |
|
|
|
def _init_engine(self, tts_speed: str, tts_volume: str) -> pyttsx3.Engine: |
|
""" |
|
Initializes the TTS engine with the specified speed and volume. |
|
|
|
Args: |
|
- tts_speed (str): The speed of the TTS voice. |
|
- tts_volume (str): The volume of the TTS voice. |
|
|
|
Returns: |
|
- pyttsx3.Engine: The initialized TTS engine. |
|
""" |
|
engine: pyttsx3.Engine = pyttsx3.init() |
|
voices: List[pyttsx3.Voice] = engine.getProperty('voices') |
|
for voice in voices: |
|
if voice.name == 'Vocalizer Expressive Zosia Harpo 22kHz': |
|
engine.setProperty('voice', voice.id) |
|
engine.setProperty('rate', int(tts_speed)) |
|
engine.setProperty('volume', float(tts_volume)) |
|
return engine |
|
|
|
def _generate_wav_file(self, engine: pyttsx3.Engine, subtitles: pysrt.SubRipFile, output_file: str) -> None: |
|
""" |
|
Generates a WAV audio file from the given subtitles using the specified TTS engine. |
|
|
|
Args: |
|
- engine (pyttsx3.Engine): The TTS engine to use for speech synthesis. |
|
- subtitles (pysrt.SubRipFile): The subtitles to convert to speech. |
|
- output_file (str): The path to the output WAV file. |
|
""" |
|
with wave.open(output_file, 'wb') as wav_file: |
|
wav_file.setnchannels(1) |
|
wav_file.setsampwidth(2) |
|
            wav_file.setframerate(22050)  # 22.05 kHz, the native rate of the 22 kHz SAPI voice
|
|
|
for i, subtitle in enumerate(subtitles, start=1): |
|
print( |
|
f"{i}\n{subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitle.text}\n") |
|
start_time: float = subtitle.start.ordinal / 1000.0 |
|
self._save_subtitle_to_wav(engine, subtitle.text) |
|
self._add_empty_frame_if_needed(wav_file, start_time) |
|
self._add_subtitle_to_wav(wav_file) |
|
|
|
def _save_subtitle_to_wav(self, engine: pyttsx3.Engine, text: str) -> None: |
|
""" |
|
Saves a single subtitle to a temporary WAV file. |
|
|
|
Args: |
|
- engine (pyttsx3.Engine): The TTS engine to use for speech synthesis. |
|
- text (str): The text of the subtitle to convert to speech. |
|
""" |
|
engine.save_to_file(text, path.join( |
|
self.working_space_temp, "temp.wav")) |
|
engine.runAndWait() |
|
|
|
def _add_empty_frame_if_needed(self, wav_file: wave.Wave_write, start_time: float) -> None: |
|
""" |
|
Adds an empty frame to the WAV file if the start time of the next subtitle is later than the current time in the audio. |
|
|
|
Args: |
|
- wav_file (wave.Wave_write): The WAV file to add the empty frame to. |
|
- start_time (float): The start time of the next subtitle. |
|
""" |
|
framerate: int = wav_file.getframerate() |
|
nframes: int = wav_file.getnframes() |
|
current_time: float = nframes / float(framerate) |
|
if start_time > current_time: |
|
empty_frame_duration: int = int( |
|
(start_time - current_time) * framerate) |
|
empty_frame: bytes = b'\x00' * empty_frame_duration * 2 |
|
wav_file.writeframes(empty_frame) |
|
|
|
def _add_subtitle_to_wav(self, wav_file: wave.Wave_write) -> None: |
|
""" |
|
Adds a subtitle to the WAV file. |
|
|
|
Args: |
|
- wav_file (wave.Wave_write): The WAV file to add the subtitle to. |
|
""" |
|
with wave.open(path.join(self.working_space_temp, "temp.wav"), 'rb') as temp_file: |
|
data: bytes = temp_file.readframes(temp_file.getnframes()) |
|
wav_file.writeframes(data) |
|
|
|
def srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None: |
|
""" |
|
Converts the subtitle file to a WAV audio file using Balabolka TTS. |
|
|
|
Args: |
|
- tts_speed (str): The speed of the TTS voice. |
|
- tts_volume (str): The volume of the TTS voice. |
|
""" |
|
self.ansi_srt() |
|
balcon_path: str = self.balabolka_path |
|
file_path: str = path.join( |
|
self.working_space_temp_main_subs, self.filename) |
|
output_wav_path: str = path.join( |
|
self.working_space_temp_main_subs, path.splitext(self.filename)[0] + ".wav") |
|
command: List[str] = self._prepare_balabolka_command( |
|
balcon_path, file_path, output_wav_path, tts_speed, tts_volume) |
|
|
|
command_thread: Thread = Thread( |
|
target=call, args=(command,)) |
|
command_thread.start() |
|
|
|
        subtitles: pysrt.SubRipFile = pysrt.open(file_path, encoding='mbcs')
|
for subtitle in subtitles: |
|
self.process_subtitle(subtitle) |
|
|
|
command_thread.join() |
|
|
|
def _prepare_balabolka_command(self, balcon_path: str, file_path: str, output_wav_path: str, tts_speed: str, tts_volume: str) -> List[str]: |
|
""" |
|
Prepares the command to run Balabolka TTS. |
|
|
|
Args: |
|
- balcon_path (str): The path to the Balabolka executable. |
|
- file_path (str): The path to the subtitle file. |
|
- output_wav_path (str): The path to the output WAV file. |
|
- tts_speed (str): The speed of the TTS voice. |
|
- tts_volume (str): The volume of the TTS voice. |
|
|
|
Returns: |
|
- List[str]: The prepared command. |
|
""" |
|
return [ |
|
balcon_path, |
|
"-fr", "48", |
|
"-f", file_path, |
|
"-w", output_wav_path, |
|
"-n", "IVONA 2 Agnieszka", |
|
"-s", tts_speed, |
|
"-v", tts_volume |
|
] |
|
|
|
def process_subtitle(self, subtitle: pysrt.SubRipItem) -> None: |
|
""" |
|
Processes a single subtitle. |
|
|
|
Args: |
|
- subtitle (pysrt.SubRipItem): The subtitle to process. |
|
""" |
|
i: int = subtitle.index |
|
start_time: str = subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3] |
|
end_time: str = subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3] |
|
text: str = subtitle.text |
|
print(f"{i}\n{start_time} --> {end_time}\n{text}\n") |
|
sleep(0.02) |
|
|
|
async def generate_speech(self, subtitle: pysrt.SubRipItem, voice: str, output_file: str, rate: str, volume: str) -> None: |
|
""" |
|
Generates speech from a single subtitle using the specified TTS voice. |
|
|
|
Args: |
|
- subtitle (pysrt.SubRipItem): The subtitle to convert to speech. |
|
- voice (str): The TTS voice to use. |
|
- output_file (str): The path to the output audio file. |
|
- rate (str): The speed of the TTS voice. |
|
- volume (str): The volume of the TTS voice. |
|
""" |
|
communicate = Communicate( |
|
subtitle.text, voice, rate=rate, volume=volume) |
|
await communicate.save(output_file) |
|
|
|
async def generate_wav_files(self, subtitles: pysrt.SubRipFile, voice: str, rate: str, volume: str) -> List[str]: |
|
""" |
|
Generates WAV audio files from the given subtitles using the specified TTS voice. |
|
|
|
Args: |
|
- subtitles (pysrt.SubRipFile): The subtitles to convert to speech. |
|
- voice (str): The TTS voice to use. |
|
- rate (str): The speed of the TTS voice. |
|
- volume (str): The volume of the TTS voice. |
|
|
|
Returns: |
|
- List[str]: The paths to the generated WAV files. |
|
""" |
|
tasks = [] |
|
mp3_files = [] |
|
file_name = path.splitext(subtitles.path)[0] |
|
for i, subtitle in enumerate(subtitles, start=1): |
|
output_file = f"{file_name}_{i}.mp3" |
|
mp3_files.append(output_file) |
|
tasks.append(create_task(self.generate_speech( |
|
subtitle, voice, output_file, rate, volume))) |
|
if i % 50 == 0: |
|
await gather(*tasks) |
|
tasks = [] |
|
                await asyncio_sleep(2)
|
await gather(*tasks) |
|
return mp3_files |
|
|
|
def merge_audio_files(self, mp3_files: List[str], subtitles: pysrt.SubRipFile, dir_path: str) -> None: |
|
""" |
|
Merges the given MP3 audio files into a single WAV file. |
|
|
|
Args: |
|
- mp3_files (List[str]): The paths to the MP3 files to merge. |
|
- subtitles (pysrt.SubRipFile): The subtitles corresponding to the audio files. |
|
- dir_path (str): The directory where the audio files are located. |
|
""" |
|
file_name: str = path.splitext(subtitles.path)[0] |
|
with wave.open(f"{file_name}.wav", 'wb') as wav_file: |
|
wav_file.setnchannels(1) |
|
wav_file.setsampwidth(2) |
|
wav_file.setframerate(24000) |
|
|
|
for i, mp3_file in enumerate(mp3_files, start=1): |
|
print( |
|
f"{i}\n{subtitles[i-1].start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitles[i-1].end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitles[i-1].text}\n") |
|
mp3_file_path: str = path.join(dir_path, mp3_file) |
|
if path.isfile(mp3_file_path): |
|
start_time: float = subtitles[i-1].start.ordinal / 1000.0 |
|
sound: AudioSegment = AudioSegment.from_file( |
|
mp3_file_path, format="mp3") |
|
remove(mp3_file_path) |
|
self._add_empty_frame_if_needed(wav_file, start_time) |
|
sound_data: bytes = sound.raw_data |
|
                    wav_file.writeframes(sound_data)
|
|
|
def srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None: |
|
""" |
|
Converts the subtitle file to a WAV audio file using Edge TTS. |
|
|
|
Args: |
|
- tts (str): The TTS service to use. |
|
- tts_speed (str): The speed of the TTS voice. |
|
- tts_volume (str): The volume of the TTS voice. |
|
""" |
|
self.ansi_srt() |
|
voice = "pl-PL-ZofiaNeural" if tts == "TTS - Zofia - Edge" else "pl-PL-MarekNeural" |
|
|
|
subtitles: pysrt.SubRipFile = pysrt.open(path.join( |
|
            self.working_space_temp_main_subs, self.filename), encoding='mbcs')
|
mp3_files: List[str] = run(self.generate_wav_files( |
|
subtitles, voice, tts_speed, tts_volume)) |
|
self.merge_audio_files(mp3_files, subtitles, |
|
self.working_space_temp_main_subs) |
|
|
|
def merge_tts_audio(self) -> None: |
|
""" |
|
        Mixes each generated TTS track with the matching audio track from the temporary directory (if any) and writes the result to the output directory as an EAC3 file.
|
""" |
|
main_subs_files_dict: Dict[str, str] = self._get_files_dict( |
|
self.working_space_temp_main_subs) |
|
tmp_files_dict: Dict[str, str] = self._get_files_dict( |
|
self.working_space_temp) |
|
|
|
for file_name, main_subs_file in main_subs_files_dict.items(): |
|
main_subs_file_path: str = path.join( |
|
self.working_space_temp_main_subs, main_subs_file) |
|
output_file: str = path.join( |
|
self.working_space_output, file_name + ".eac3") |
|
|
|
if file_name in tmp_files_dict: |
|
tmp_file: str = tmp_files_dict[file_name] |
|
tmp_file_path: str = path.join( |
|
self.working_space_temp, tmp_file) |
|
|
|
main_subs_file_duration: float = self._get_file_duration( |
|
main_subs_file_path) |
|
tmp_file_duration: float = self._get_file_duration( |
|
tmp_file_path) |
|
|
|
input_file_1: str |
|
input_file_2: str |
|
if main_subs_file_duration > tmp_file_duration: |
|
input_file_1, input_file_2 = main_subs_file_path, tmp_file_path |
|
else: |
|
input_file_1, input_file_2 = tmp_file_path, main_subs_file_path |
|
|
|
self._merge_files(input_file_1, input_file_2, output_file) |
|
|
|
remove(main_subs_file_path) |
|
remove(tmp_file_path) |
|
else: |
|
self._convert_to_eac3(main_subs_file_path, output_file) |
|
remove(main_subs_file_path) |
|
self._remove_same_name_files( |
|
self.working_space_temp_main_subs, file_name) |
|
|
|
def _get_files_dict(self, directory: str) -> Dict[str, str]: |
|
""" |
|
        Builds a dictionary of the files in the given directory, keyed by base name, excluding subtitle files (.srt, .ass).
|
|
|
Args: |
|
- directory (str): The directory to get the files from. |
|
|
|
Returns: |
|
        - Dict[str, str]: A mapping of base file name (without extension) to full file name.
|
""" |
|
excluded_extensions: List[str] = ["srt", "ass"] |
|
return {path.splitext(f)[0]: f for f in listdir(directory) if path.splitext(f)[1][1:].lower() not in excluded_extensions} |
|
|
|
def _get_file_duration(self, file_path: str) -> float: |
|
""" |
|
Gets the duration of the file at the given path. |
|
|
|
Args: |
|
- file_path (str): The path to the file. |
|
|
|
Returns: |
|
- float: The duration of the file in seconds. |
|
""" |
|
return float(mediainfo(file_path)['duration']) |
|
|
|
def _merge_files(self, input_file_1: str, input_file_2: str, output_file: str): |
|
""" |
|
Merges two audio files into a single file. |
|
|
|
Args: |
|
- input_file_1 (str): The path to the first input file. |
|
- input_file_2 (str): The path to the second input file. |
|
- output_file (str): The path to the output file. |
|
""" |
|
if 'main_subs' in input_file_1: |
|
command: List[str] = [ |
|
self.ffmpeg_path, |
|
"-i", input_file_1, |
|
"-i", input_file_2, |
|
"-filter_complex", "[0:a]volume=7dB[a1];[a1][1:a]amix=inputs=2:duration=first", |
|
output_file |
|
] |
|
else: |
|
command: List[str] = [ |
|
self.ffmpeg_path, |
|
"-i", input_file_1, |
|
"-i", input_file_2, |
|
"-filter_complex", "[1:a]volume=7dB[a1];[0:a][a1]amix=inputs=2:duration=first", |
|
output_file |
|
] |
|
call(command) |
|
|
|
def _convert_to_eac3(self, input_file: str, output_file: str): |
|
""" |
|
Converts an audio file to EAC3 format. |
|
|
|
Args: |
|
- input_file (str): The path to the input file. |
|
- output_file (str): The path to the output file. |
|
""" |
|
command: List[str] = [ |
|
self.ffmpeg_path, |
|
"-i", input_file, |
|
"-c:a", "eac3", |
|
output_file |
|
] |
|
call(command) |
|
|
|
def _remove_same_name_files(self, directory: str, file_name: str): |
|
""" |
|
Removes files with the same name as the given file name from the specified directory. |
|
|
|
Args: |
|
- directory (str): The directory to remove the files from. |
|
- file_name (str): The name of the files to remove. |
|
""" |
|
for file in listdir(directory): |
|
if path.splitext(file)[0] == file_name: |
|
remove(path.join(directory, file)) |
|
|
|
def generate_audio(self, settings: Settings): |
|
""" |
|
Generates the audio file from the subtitle file using the specified TTS settings. |
|
|
|
Args: |
|
- settings (Settings): The TTS settings to use. |
|
""" |
|
tts: Optional[str] = settings.tts |
|
tts_speed: Optional[str] = settings.tts_speed |
|
tts_volume: Optional[str] = settings.tts_volume |
|
|
|
console.print("Rozpoczynam generowanie pliku audio...", |
|
style='green_bold', end=' ') |
|
console.print(self.filename, style='white_bold') |
|
if tts == "TTS - Zosia - Harpo": |
|
self.srt_to_wav_harpo(tts_speed, tts_volume) |
|
elif tts == "TTS - Agnieszka - Ivona": |
|
self.srt_to_wav_balabolka(tts_speed, tts_volume) |
|
elif tts in ["TTS - Zofia - Edge", "TTS - Marek - Edge"]: |
|
self.srt_to_wav_edge_online(tts, tts_speed, tts_volume) |
|
console.print( |
|
"Generowanie pliku audio zakończone.", style='green_bold') |
|
|
|
self.merge_tts_audio() |
|
|
|
def srt_to_eac3_elevenlabs(self) -> None: |
|
""" |
|
Opens the main_subs folder for the user to add audio files generated by ElevenLabs. |
|
""" |
|
Popen(['explorer', path.realpath(self.working_space_temp_main_subs)]) |
|
|
|
console.print("\nWygeneruj pliki audio z plików .srt za pomocą 11Labs_TTS_Colab,\na następnie dodaj je do folderu main_subs.", |
|
style='yellow_bold') |
|
console.print( |
|
"11Labs_TTS_Colab: https://github.com/MattyMroz/11Labs_TTS_Colab", style='yellow_bold') |
|
console.print( |
|
"\n[green_italic]Naciśnij dowolny klawisz, aby kontynuować...", end=' ') |
|
getch() |
|
console.print() |
|
self.merge_tts_audio() |
|
|