# MM_AVH / mm_avh_working_space / modules / subtitle_to_speech.py
"""
This module defines the 'SubtitleToSpeech' class, which converts subtitle files to speech audio files.
It supports multiple Text-to-Speech (TTS) services including Harpo, Balabolka, Edge, and ElevenLabs.
* Usage:
To use this module, create an instance of the 'SubtitleToSpeech' class and call the 'generate_audio' method.
* Example usage:
from subtitle_to_speech import SubtitleToSpeech
# Create an instance of SubtitleToSpeech
converter = SubtitleToSpeech(filename="example.srt")
# Generate audio
converter.generate_audio(settings)
* Example usage:
if __name__ == '__main__':
converter = SubtitleToSpeech(filename="example.srt")
converter.generate_audio(settings)
* Example usage:
if __name__ == '__main__':
        if 'TTS - *Głos* - ElevenLabs' in settings.tts:
audio_generator = SubtitleToSpeech(filename="")
audio_generator.srt_to_eac3_elevenlabs() # For Alt Subs
"""
import wave
from asyncio import create_task, gather, run, sleep as async_sleep
from dataclasses import dataclass
from msvcrt import getch
from os import listdir, path, remove
from subprocess import Popen, call
from threading import Thread
from time import sleep
from typing import Dict, List, Optional

import pysrt
import pyttsx3
from edge_tts import Communicate
from pydub import AudioSegment
from pydub.utils import mediainfo

from constants import (WORKING_SPACE,
                       WORKING_SPACE_OUTPUT,
                       WORKING_SPACE_TEMP,
                       WORKING_SPACE_TEMP_MAIN_SUBS,
                       WORKING_SPACE_TEMP_ALT_SUBS,
                       BALABOLKA_PATH,
                       FFMPEG_PATH,
                       console)
from data.settings import Settings
@dataclass(slots=True)
class SubtitleToSpeech:
    """
    Converts subtitle (.srt) files to speech audio files.

    Supported Text-to-Speech back-ends:
        - Harpo: local SAPI voice driven through pyttsx3.
        - Balabolka: external balcon command-line tool.
        - Edge: Microsoft online TTS via edge-tts.
        - ElevenLabs: manual flow - the user drops pre-generated audio
          files into the main_subs folder.

    Attributes:
        - filename (str): The name of the subtitle file to convert.
        - working_space (str): The path to the working directory.
        - working_space_output (str): The path to the output directory.
        - working_space_temp (str): The path to the temporary directory.
        - working_space_temp_main_subs (str): The path to the main subtitles directory.
        - working_space_temp_alt_subs (str): The path to the alternative subtitles directory.
        - balabolka_path (str): The path to the Balabolka (balcon) executable.
        - ffmpeg_path (str): The path to the FFmpeg executable.
    """
    filename: str
    working_space: str = WORKING_SPACE
    working_space_output: str = WORKING_SPACE_OUTPUT
    working_space_temp: str = WORKING_SPACE_TEMP
    working_space_temp_main_subs: str = WORKING_SPACE_TEMP_MAIN_SUBS
    working_space_temp_alt_subs: str = WORKING_SPACE_TEMP_ALT_SUBS
    balabolka_path: str = BALABOLKA_PATH
    ffmpeg_path: str = FFMPEG_PATH

    def ansi_srt(self) -> None:
        """
        Re-encodes the subtitle file to ANSI (the Windows code page).

        The file is first read as UTF-8; if that fails it is assumed to
        already be ANSI-encoded. Characters that cannot be represented in
        ANSI are dropped (errors="ignore"). The "ANSI" codec alias is
        Windows-only, consistent with the rest of this module (msvcrt).
        """
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source_file:
                content: str = source_file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 - assume the file is already ANSI-encoded.
            with open(file_path, "r", encoding="ANSI") as source_file:
                content: str = source_file.read()
        with open(file_path, "w", encoding="ANSI", errors="ignore") as target_file:
            target_file.write(content)
        console.print("Zamieniono kodowanie na ANSI:",
                      style='green_bold', end=' ')
        console.print(self.filename)

    def srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using the Harpo TTS voice.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        engine: pyttsx3.Engine = self._init_engine(tts_speed, tts_volume)
        subtitles: pysrt.SubRipFile = pysrt.open(path.join(
            self.working_space_temp_main_subs, self.filename), encoding='ANSI')
        output_file: str = path.splitext(path.join(
            self.working_space_temp_main_subs, self.filename))[0] + '.wav'
        self._generate_wav_file(engine, subtitles, output_file)
        # temp.wav exists only if at least one subtitle was synthesized,
        # so guard the cleanup against an empty subtitle file.
        temp_wav: str = path.join(self.working_space_temp, "temp.wav")
        if path.isfile(temp_wav):
            remove(temp_wav)

    def _init_engine(self, tts_speed: str, tts_volume: str) -> pyttsx3.Engine:
        """
        Initializes the pyttsx3 TTS engine with the specified speed and volume.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.

        Returns:
            - pyttsx3.Engine: The initialized TTS engine.
        """
        engine: pyttsx3.Engine = pyttsx3.init()
        voices: List[pyttsx3.Voice] = engine.getProperty('voices')
        # Select the Harpo voice when installed; otherwise the system
        # default voice remains active.
        for voice in voices:
            if voice.name == 'Vocalizer Expressive Zosia Harpo 22kHz':
                engine.setProperty('voice', voice.id)
        engine.setProperty('rate', int(tts_speed))
        engine.setProperty('volume', float(tts_volume))
        return engine

    def _generate_wav_file(self, engine: pyttsx3.Engine, subtitles: pysrt.SubRipFile, output_file: str) -> None:
        """
        Generates a WAV audio file from the given subtitles using the TTS engine.

        Each subtitle is synthesized into a temporary WAV file, silence is
        inserted so the speech begins at the subtitle's start timestamp,
        and the audio is appended to the output file.

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - output_file (str): The path to the output WAV file.
        """
        with wave.open(output_file, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            # NOTE(review): the voice is nominally 22 kHz (22050 Hz);
            # 22500 is kept unchanged to preserve existing output timing.
            wav_file.setframerate(22500)
            for i, subtitle in enumerate(subtitles, start=1):
                print(
                    f"{i}\n{subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitle.text}\n")
                start_time: float = subtitle.start.ordinal / 1000.0
                self._save_subtitle_to_wav(engine, subtitle.text)
                self._add_empty_frame_if_needed(wav_file, start_time)
                self._add_subtitle_to_wav(wav_file)

    def _save_subtitle_to_wav(self, engine: pyttsx3.Engine, text: str) -> None:
        """
        Synthesizes a single subtitle into the temporary WAV file.

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - text (str): The text of the subtitle to convert to speech.
        """
        engine.save_to_file(text, path.join(
            self.working_space_temp, "temp.wav"))
        engine.runAndWait()

    def _add_empty_frame_if_needed(self, wav_file: wave.Wave_write, start_time: float) -> None:
        """
        Pads the WAV file with silence up to the given start time.

        Args:
            - wav_file (wave.Wave_write): The WAV file to pad.
            - start_time (float): The time (in seconds) at which the next
              audio chunk should start.
        """
        framerate: int = wav_file.getframerate()
        nframes: int = wav_file.getnframes()
        current_time: float = nframes / float(framerate)
        if start_time > current_time:
            empty_frame_duration: int = int(
                (start_time - current_time) * framerate)
            # Two zero bytes per frame: mono, 16-bit samples.
            empty_frame: bytes = b'\x00' * empty_frame_duration * 2
            wav_file.writeframes(empty_frame)

    def _add_subtitle_to_wav(self, wav_file: wave.Wave_write) -> None:
        """
        Appends the contents of the temporary WAV file to the output WAV file.

        Args:
            - wav_file (wave.Wave_write): The WAV file to append the audio to.
        """
        with wave.open(path.join(self.working_space_temp, "temp.wav"), 'rb') as temp_file:
            data: bytes = temp_file.readframes(temp_file.getnframes())
            wav_file.writeframes(data)

    def srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Balabolka TTS.

        Balabolka runs in a background thread while the subtitles are
        printed to the console as a progress indicator.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        balcon_path: str = self.balabolka_path
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        output_wav_path: str = path.join(
            self.working_space_temp_main_subs, path.splitext(self.filename)[0] + ".wav")
        command: List[str] = self._prepare_balabolka_command(
            balcon_path, file_path, output_wav_path, tts_speed, tts_volume)
        command_thread: Thread = Thread(
            target=call, args=(command,))
        command_thread.start()
        subtitles: pysrt.SubRipFile = pysrt.open(file_path, encoding='ANSI')
        for subtitle in subtitles:
            self.process_subtitle(subtitle)
        command_thread.join()

    def _prepare_balabolka_command(self, balcon_path: str, file_path: str, output_wav_path: str, tts_speed: str, tts_volume: str) -> List[str]:
        """
        Prepares the balcon command line for Balabolka TTS.

        Args:
            - balcon_path (str): The path to the Balabolka executable.
            - file_path (str): The path to the subtitle file.
            - output_wav_path (str): The path to the output WAV file.
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The prepared command.
        """
        return [
            balcon_path,
            "-fr", "48",
            "-f", file_path,
            "-w", output_wav_path,
            "-n", "IVONA 2 Agnieszka",
            "-s", tts_speed,
            "-v", tts_volume
        ]

    def process_subtitle(self, subtitle: pysrt.SubRipItem) -> None:
        """
        Prints a single subtitle to the console as a progress indicator.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to print.
        """
        i: int = subtitle.index
        start_time: str = subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]
        end_time: str = subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]
        text: str = subtitle.text
        print(f"{i}\n{start_time} --> {end_time}\n{text}\n")
        # Brief pause so console output stays readable while Balabolka runs.
        sleep(0.02)

    async def generate_speech(self, subtitle: pysrt.SubRipItem, voice: str, output_file: str, rate: str, volume: str) -> None:
        """
        Generates speech from a single subtitle using the specified Edge TTS voice.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to convert to speech.
            - voice (str): The TTS voice to use.
            - output_file (str): The path to the output audio file.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.
        """
        communicate = Communicate(
            subtitle.text, voice, rate=rate, volume=volume)
        await communicate.save(output_file)

    async def generate_wav_files(self, subtitles: pysrt.SubRipFile, voice: str, rate: str, volume: str) -> List[str]:
        """
        Generates MP3 audio files (one per subtitle) using the Edge TTS voice.

        Requests are issued in batches of 50 with a short pause between
        batches to avoid overloading the online service.

        Args:
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - voice (str): The TTS voice to use.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The paths to the generated MP3 files.
        """
        tasks: List = []
        mp3_files: List[str] = []
        file_name: str = path.splitext(subtitles.path)[0]
        for i, subtitle in enumerate(subtitles, start=1):
            output_file: str = f"{file_name}_{i}.mp3"
            mp3_files.append(output_file)
            tasks.append(create_task(self.generate_speech(
                subtitle, voice, output_file, rate, volume)))
            if i % 50 == 0:
                await gather(*tasks)
                tasks = []
                # Throttle between batches without blocking the event loop
                # (time.sleep here would stall all pending asyncio work).
                await async_sleep(2)
        await gather(*tasks)
        return mp3_files

    def merge_audio_files(self, mp3_files: List[str], subtitles: pysrt.SubRipFile, dir_path: str) -> None:
        """
        Merges the given MP3 audio files into a single WAV file.

        Each MP3 is decoded, positioned at its subtitle's start timestamp
        (padding with silence as needed), appended to the output, and then
        deleted. Missing MP3 files are skipped.

        Args:
            - mp3_files (List[str]): The paths to the MP3 files to merge.
            - subtitles (pysrt.SubRipFile): The subtitles corresponding to the audio files.
            - dir_path (str): The directory where the audio files are located.
        """
        file_name: str = path.splitext(subtitles.path)[0]
        with wave.open(f"{file_name}.wav", 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            # Assumes the Edge TTS MP3s decode to 24 kHz mono 16-bit audio
            # -- TODO confirm against the service's actual output format.
            wav_file.setframerate(24000)
            for i, mp3_file in enumerate(mp3_files, start=1):
                print(
                    f"{i}\n{subtitles[i-1].start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitles[i-1].end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitles[i-1].text}\n")
                mp3_file_path: str = path.join(dir_path, mp3_file)
                if path.isfile(mp3_file_path):
                    start_time: float = subtitles[i-1].start.ordinal / 1000.0
                    sound: AudioSegment = AudioSegment.from_file(
                        mp3_file_path, format="mp3")
                    remove(mp3_file_path)
                    self._add_empty_frame_if_needed(wav_file, start_time)
                    sound_data: bytes = sound.raw_data
                    # Fixed: was wav_file.writefqrames(...), which raised
                    # AttributeError and aborted the merge.
                    wav_file.writeframes(sound_data)

    def srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Edge TTS.

        Args:
            - tts (str): The TTS service label (selects the Zofia or Marek voice).
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        voice = "pl-PL-ZofiaNeural" if tts == "TTS - Zofia - Edge" else "pl-PL-MarekNeural"
        subtitles: pysrt.SubRipFile = pysrt.open(path.join(
            self.working_space_temp_main_subs, self.filename), encoding='ANSI')
        mp3_files: List[str] = run(self.generate_wav_files(
            subtitles, voice, tts_speed, tts_volume))
        self.merge_audio_files(mp3_files, subtitles,
                               self.working_space_temp_main_subs)

    def merge_tts_audio(self) -> None:
        """
        Merges the generated TTS audio with any matching audio in the temp
        directory and writes the result to the output directory as EAC3.

        For each audio file in main_subs: if a file with the same base name
        exists in the temp directory, the two are mixed (the main_subs track
        is boosted by 7 dB); otherwise the file is converted to EAC3 as-is.
        Source files are removed after processing.
        """
        main_subs_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp_main_subs)
        tmp_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp)
        for file_name, main_subs_file in main_subs_files_dict.items():
            main_subs_file_path: str = path.join(
                self.working_space_temp_main_subs, main_subs_file)
            output_file: str = path.join(
                self.working_space_output, file_name + ".eac3")
            if file_name in tmp_files_dict:
                tmp_file: str = tmp_files_dict[file_name]
                tmp_file_path: str = path.join(
                    self.working_space_temp, tmp_file)
                main_subs_file_duration: float = self._get_file_duration(
                    main_subs_file_path)
                tmp_file_duration: float = self._get_file_duration(
                    tmp_file_path)
                # The longer file goes first: amix uses duration=first, so
                # the mixed output keeps the longer track's full length.
                input_file_1: str
                input_file_2: str
                if main_subs_file_duration > tmp_file_duration:
                    input_file_1, input_file_2 = main_subs_file_path, tmp_file_path
                else:
                    input_file_1, input_file_2 = tmp_file_path, main_subs_file_path
                self._merge_files(input_file_1, input_file_2, output_file)
                remove(main_subs_file_path)
                remove(tmp_file_path)
            else:
                self._convert_to_eac3(main_subs_file_path, output_file)
                remove(main_subs_file_path)
            self._remove_same_name_files(
                self.working_space_temp_main_subs, file_name)

    def _get_files_dict(self, directory: str) -> Dict[str, str]:
        """
        Gets a dictionary of the files in the given directory, keyed by base
        name (without extension), excluding subtitle files (.srt/.ass).

        Args:
            - directory (str): The directory to get the files from.

        Returns:
            - Dict[str, str]: Base name -> file name for each audio file.
        """
        excluded_extensions: List[str] = ["srt", "ass"]
        return {path.splitext(f)[0]: f for f in listdir(directory) if path.splitext(f)[1][1:].lower() not in excluded_extensions}

    def _get_file_duration(self, file_path: str) -> float:
        """
        Gets the duration of the media file at the given path.

        Args:
            - file_path (str): The path to the file.

        Returns:
            - float: The duration of the file in seconds.
        """
        return float(mediainfo(file_path)['duration'])

    def _merge_files(self, input_file_1: str, input_file_2: str, output_file: str):
        """
        Mixes two audio files into a single file with FFmpeg.

        The TTS track (the one from main_subs) is boosted by 7 dB before
        mixing; the output length follows the first input (duration=first).

        Args:
            - input_file_1 (str): The path to the first input file.
            - input_file_2 (str): The path to the second input file.
            - output_file (str): The path to the output file.
        """
        if 'main_subs' in input_file_1:
            command: List[str] = [
                self.ffmpeg_path,
                "-i", input_file_1,
                "-i", input_file_2,
                "-filter_complex", "[0:a]volume=7dB[a1];[a1][1:a]amix=inputs=2:duration=first",
                output_file
            ]
        else:
            command: List[str] = [
                self.ffmpeg_path,
                "-i", input_file_1,
                "-i", input_file_2,
                "-filter_complex", "[1:a]volume=7dB[a1];[0:a][a1]amix=inputs=2:duration=first",
                output_file
            ]
        call(command)

    def _convert_to_eac3(self, input_file: str, output_file: str):
        """
        Converts an audio file to EAC3 format with FFmpeg.

        Args:
            - input_file (str): The path to the input file.
            - output_file (str): The path to the output file.
        """
        command: List[str] = [
            self.ffmpeg_path,
            "-i", input_file,
            "-c:a", "eac3",
            output_file
        ]
        call(command)

    def _remove_same_name_files(self, directory: str, file_name: str):
        """
        Removes all files with the given base name from the specified directory.

        Args:
            - directory (str): The directory to remove the files from.
            - file_name (str): The base name (without extension) of the files to remove.
        """
        for file in listdir(directory):
            if path.splitext(file)[0] == file_name:
                remove(path.join(directory, file))

    def generate_audio(self, settings: Settings):
        """
        Generates the audio file from the subtitle file using the specified
        TTS settings, then merges the result into the output directory.

        Args:
            - settings (Settings): The TTS settings to use.
        """
        tts: Optional[str] = settings.tts
        tts_speed: Optional[str] = settings.tts_speed
        tts_volume: Optional[str] = settings.tts_volume
        console.print("Rozpoczynam generowanie pliku audio...",
                      style='green_bold', end=' ')
        console.print(self.filename, style='white_bold')
        if tts == "TTS - Zosia - Harpo":
            self.srt_to_wav_harpo(tts_speed, tts_volume)
        elif tts == "TTS - Agnieszka - Ivona":
            self.srt_to_wav_balabolka(tts_speed, tts_volume)
        elif tts in ["TTS - Zofia - Edge", "TTS - Marek - Edge"]:
            self.srt_to_wav_edge_online(tts, tts_speed, tts_volume)
        console.print(
            "Generowanie pliku audio zakończone.", style='green_bold')
        self.merge_tts_audio()

    def srt_to_eac3_elevenlabs(self) -> None:
        """
        Opens the main_subs folder for the user to add audio files generated
        by ElevenLabs, waits for a keypress, then merges the audio.
        """
        Popen(['explorer', path.realpath(self.working_space_temp_main_subs)])
        console.print("\nWygeneruj pliki audio z plików .srt za pomocą 11Labs_TTS_Colab,\na następnie dodaj je do folderu main_subs.",
                      style='yellow_bold')
        console.print(
            "11Labs_TTS_Colab: https://github.com/MattyMroz/11Labs_TTS_Colab", style='yellow_bold')
        console.print(
            "\n[green_italic]Naciśnij dowolny klawisz, aby kontynuować...", end=' ')
        getch()
        console.print()
        self.merge_tts_audio()