# MM_AVH / mm_avh_working_space / modules / subtitle_to_speech.py
"""
This module defines the 'SubtitleToSpeech' class, which converts subtitle files to speech audio files.
It supports multiple Text-to-Speech (TTS) services including Harpo, Balabolka, Edge, and ElevenLabs.
* Usage:
To use this module, create an instance of the 'SubtitleToSpeech' class and call the 'generate_audio' method.
* Example usage:
from subtitle_to_speech import SubtitleToSpeech
# Create an instance of SubtitleToSpeech
converter = SubtitleToSpeech(filename="example.srt")
# Generate audio
converter.generate_audio(settings)
* Example usage:
if __name__ == '__main__':
converter = SubtitleToSpeech(filename="example.srt")
converter.generate_audio(settings)
* Example usage:
if __name__ == '__main__':
        if 'TTS - *Głos* - ElevenLabs' in settings.tts:
audio_generator = SubtitleToSpeech(filename="")
audio_generator.srt_to_eac3_elevenlabs() # For Alt Subs
"""
import wave
from asyncio import create_task, gather, run, sleep as async_sleep
from dataclasses import dataclass
from msvcrt import getch
from os import listdir, path, remove
from subprocess import Popen, call
from threading import Thread
from time import sleep
from typing import Dict, List, Optional

import pysrt
import pyttsx3
from edge_tts import Communicate
from pydub import AudioSegment
from pydub.utils import mediainfo

from constants import (WORKING_SPACE,
                       WORKING_SPACE_OUTPUT,
                       WORKING_SPACE_TEMP,
                       WORKING_SPACE_TEMP_MAIN_SUBS,
                       WORKING_SPACE_TEMP_ALT_SUBS,
                       BALABOLKA_PATH,
                       FFMPEG_PATH,
                       console)
from data.settings import Settings
@dataclass(slots=True)
class SubtitleToSpeech:
    """
    Converts subtitle (.srt) files to speech audio files.

    Supported Text-to-Speech back-ends:
        - Harpo: local SAPI voice driven through pyttsx3.
        - Balabolka: external balcon command-line tool.
        - Edge: Microsoft online TTS via edge-tts.
        - ElevenLabs: manual flow - the user drops pre-generated audio
          files into the main_subs folder.

    Attributes:
        - filename (str): The name of the subtitle file to convert.
        - working_space (str): The path to the working directory.
        - working_space_output (str): The path to the output directory.
        - working_space_temp (str): The path to the temporary directory.
        - working_space_temp_main_subs (str): The path to the main subtitles directory.
        - working_space_temp_alt_subs (str): The path to the alternative subtitles directory.
        - balabolka_path (str): The path to the Balabolka (balcon) executable.
        - ffmpeg_path (str): The path to the FFmpeg executable.
    """
    filename: str
    working_space: str = WORKING_SPACE
    working_space_output: str = WORKING_SPACE_OUTPUT
    working_space_temp: str = WORKING_SPACE_TEMP
    working_space_temp_main_subs: str = WORKING_SPACE_TEMP_MAIN_SUBS
    working_space_temp_alt_subs: str = WORKING_SPACE_TEMP_ALT_SUBS
    balabolka_path: str = BALABOLKA_PATH
    ffmpeg_path: str = FFMPEG_PATH

    def ansi_srt(self) -> None:
        """
        Re-encodes the subtitle file to ANSI (the Windows code page).

        The file is first read as UTF-8; if that fails it is assumed to
        already be ANSI-encoded. Characters that cannot be represented in
        ANSI are dropped (errors="ignore"). The "ANSI" codec alias is
        Windows-only, consistent with the rest of this module (msvcrt).
        """
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source_file:
                content: str = source_file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 - assume the file is already ANSI-encoded.
            with open(file_path, "r", encoding="ANSI") as source_file:
                content: str = source_file.read()
        with open(file_path, "w", encoding="ANSI", errors="ignore") as target_file:
            target_file.write(content)
        console.print("Zamieniono kodowanie na ANSI:",
                      style='green_bold', end=' ')
        console.print(self.filename)

    def srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using the Harpo TTS voice.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        engine: pyttsx3.Engine = self._init_engine(tts_speed, tts_volume)
        subtitles: pysrt.SubRipFile = pysrt.open(path.join(
            self.working_space_temp_main_subs, self.filename), encoding='ANSI')
        output_file: str = path.splitext(path.join(
            self.working_space_temp_main_subs, self.filename))[0] + '.wav'
        self._generate_wav_file(engine, subtitles, output_file)
        # temp.wav exists only if at least one subtitle was synthesized,
        # so guard the cleanup against an empty subtitle file.
        temp_wav: str = path.join(self.working_space_temp, "temp.wav")
        if path.isfile(temp_wav):
            remove(temp_wav)

    def _init_engine(self, tts_speed: str, tts_volume: str) -> pyttsx3.Engine:
        """
        Initializes the pyttsx3 TTS engine with the specified speed and volume.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.

        Returns:
            - pyttsx3.Engine: The initialized TTS engine.
        """
        engine: pyttsx3.Engine = pyttsx3.init()
        voices: List[pyttsx3.Voice] = engine.getProperty('voices')
        # Select the Harpo voice when installed; otherwise the system
        # default voice remains active.
        for voice in voices:
            if voice.name == 'Vocalizer Expressive Zosia Harpo 22kHz':
                engine.setProperty('voice', voice.id)
        engine.setProperty('rate', int(tts_speed))
        engine.setProperty('volume', float(tts_volume))
        return engine

    def _generate_wav_file(self, engine: pyttsx3.Engine, subtitles: pysrt.SubRipFile, output_file: str) -> None:
        """
        Generates a WAV audio file from the given subtitles using the TTS engine.

        Each subtitle is synthesized into a temporary WAV file, silence is
        inserted so the speech begins at the subtitle's start timestamp,
        and the audio is appended to the output file.

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - output_file (str): The path to the output WAV file.
        """
        with wave.open(output_file, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            # NOTE(review): the voice is nominally 22 kHz (22050 Hz);
            # 22500 is kept unchanged to preserve existing output timing.
            wav_file.setframerate(22500)
            for i, subtitle in enumerate(subtitles, start=1):
                print(
                    f"{i}\n{subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitle.text}\n")
                start_time: float = subtitle.start.ordinal / 1000.0
                self._save_subtitle_to_wav(engine, subtitle.text)
                self._add_empty_frame_if_needed(wav_file, start_time)
                self._add_subtitle_to_wav(wav_file)

    def _save_subtitle_to_wav(self, engine: pyttsx3.Engine, text: str) -> None:
        """
        Synthesizes a single subtitle into the temporary WAV file.

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - text (str): The text of the subtitle to convert to speech.
        """
        engine.save_to_file(text, path.join(
            self.working_space_temp, "temp.wav"))
        engine.runAndWait()

    def _add_empty_frame_if_needed(self, wav_file: wave.Wave_write, start_time: float) -> None:
        """
        Pads the WAV file with silence up to the given start time.

        Args:
            - wav_file (wave.Wave_write): The WAV file to pad.
            - start_time (float): The time (in seconds) at which the next
              audio chunk should start.
        """
        framerate: int = wav_file.getframerate()
        nframes: int = wav_file.getnframes()
        current_time: float = nframes / float(framerate)
        if start_time > current_time:
            empty_frame_duration: int = int(
                (start_time - current_time) * framerate)
            # Two zero bytes per frame: mono, 16-bit samples.
            empty_frame: bytes = b'\x00' * empty_frame_duration * 2
            wav_file.writeframes(empty_frame)

    def _add_subtitle_to_wav(self, wav_file: wave.Wave_write) -> None:
        """
        Appends the contents of the temporary WAV file to the output WAV file.

        Args:
            - wav_file (wave.Wave_write): The WAV file to append the audio to.
        """
        with wave.open(path.join(self.working_space_temp, "temp.wav"), 'rb') as temp_file:
            data: bytes = temp_file.readframes(temp_file.getnframes())
            wav_file.writeframes(data)

    def srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Balabolka TTS.

        Balabolka runs in a background thread while the subtitles are
        printed to the console as a progress indicator.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        balcon_path: str = self.balabolka_path
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        output_wav_path: str = path.join(
            self.working_space_temp_main_subs, path.splitext(self.filename)[0] + ".wav")
        command: List[str] = self._prepare_balabolka_command(
            balcon_path, file_path, output_wav_path, tts_speed, tts_volume)
        command_thread: Thread = Thread(
            target=call, args=(command,))
        command_thread.start()
        subtitles: pysrt.SubRipFile = pysrt.open(file_path, encoding='ANSI')
        for subtitle in subtitles:
            self.process_subtitle(subtitle)
        command_thread.join()

    def _prepare_balabolka_command(self, balcon_path: str, file_path: str, output_wav_path: str, tts_speed: str, tts_volume: str) -> List[str]:
        """
        Prepares the balcon command line for Balabolka TTS.

        Args:
            - balcon_path (str): The path to the Balabolka executable.
            - file_path (str): The path to the subtitle file.
            - output_wav_path (str): The path to the output WAV file.
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The prepared command.
        """
        return [
            balcon_path,
            "-fr", "48",
            "-f", file_path,
            "-w", output_wav_path,
            "-n", "IVONA 2 Agnieszka",
            "-s", tts_speed,
            "-v", tts_volume
        ]

    def process_subtitle(self, subtitle: pysrt.SubRipItem) -> None:
        """
        Prints a single subtitle to the console as a progress indicator.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to print.
        """
        i: int = subtitle.index
        start_time: str = subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]
        end_time: str = subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]
        text: str = subtitle.text
        print(f"{i}\n{start_time} --> {end_time}\n{text}\n")
        # Brief pause so console output stays readable while Balabolka runs.
        sleep(0.02)

    async def generate_speech(self, subtitle: pysrt.SubRipItem, voice: str, output_file: str, rate: str, volume: str) -> None:
        """
        Generates speech from a single subtitle using the specified Edge TTS voice.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to convert to speech.
            - voice (str): The TTS voice to use.
            - output_file (str): The path to the output audio file.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.
        """
        communicate = Communicate(
            subtitle.text, voice, rate=rate, volume=volume)
        await communicate.save(output_file)

    async def generate_wav_files(self, subtitles: pysrt.SubRipFile, voice: str, rate: str, volume: str) -> List[str]:
        """
        Generates MP3 audio files (one per subtitle) using the Edge TTS voice.

        Requests are issued in batches of 50 with a short pause between
        batches to avoid overloading the online service.

        Args:
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - voice (str): The TTS voice to use.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The paths to the generated MP3 files.
        """
        tasks: List = []
        mp3_files: List[str] = []
        file_name: str = path.splitext(subtitles.path)[0]
        for i, subtitle in enumerate(subtitles, start=1):
            output_file: str = f"{file_name}_{i}.mp3"
            mp3_files.append(output_file)
            tasks.append(create_task(self.generate_speech(
                subtitle, voice, output_file, rate, volume)))
            if i % 50 == 0:
                await gather(*tasks)
                tasks = []
                # Throttle between batches without blocking the event loop
                # (time.sleep here would stall all pending asyncio work).
                await async_sleep(2)
        await gather(*tasks)
        return mp3_files

    def merge_audio_files(self, mp3_files: List[str], subtitles: pysrt.SubRipFile, dir_path: str) -> None:
        """
        Merges the given MP3 audio files into a single WAV file.

        Each MP3 is decoded, positioned at its subtitle's start timestamp
        (padding with silence as needed), appended to the output, and then
        deleted. Missing MP3 files are skipped.

        Args:
            - mp3_files (List[str]): The paths to the MP3 files to merge.
            - subtitles (pysrt.SubRipFile): The subtitles corresponding to the audio files.
            - dir_path (str): The directory where the audio files are located.
        """
        file_name: str = path.splitext(subtitles.path)[0]
        with wave.open(f"{file_name}.wav", 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            # Assumes the Edge TTS MP3s decode to 24 kHz mono 16-bit audio
            # -- TODO confirm against the service's actual output format.
            wav_file.setframerate(24000)
            for i, mp3_file in enumerate(mp3_files, start=1):
                print(
                    f"{i}\n{subtitles[i-1].start.to_time().strftime('%H:%M:%S.%f')[:-3]} --> {subtitles[i-1].end.to_time().strftime('%H:%M:%S.%f')[:-3]}\n{subtitles[i-1].text}\n")
                mp3_file_path: str = path.join(dir_path, mp3_file)
                if path.isfile(mp3_file_path):
                    start_time: float = subtitles[i-1].start.ordinal / 1000.0
                    sound: AudioSegment = AudioSegment.from_file(
                        mp3_file_path, format="mp3")
                    remove(mp3_file_path)
                    self._add_empty_frame_if_needed(wav_file, start_time)
                    sound_data: bytes = sound.raw_data
                    # Fixed: was wav_file.writefqrames(...), which raised
                    # AttributeError and aborted the merge.
                    wav_file.writeframes(sound_data)

    def srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Edge TTS.

        Args:
            - tts (str): The TTS service label (selects the Zofia or Marek voice).
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        voice = "pl-PL-ZofiaNeural" if tts == "TTS - Zofia - Edge" else "pl-PL-MarekNeural"
        subtitles: pysrt.SubRipFile = pysrt.open(path.join(
            self.working_space_temp_main_subs, self.filename), encoding='ANSI')
        mp3_files: List[str] = run(self.generate_wav_files(
            subtitles, voice, tts_speed, tts_volume))
        self.merge_audio_files(mp3_files, subtitles,
                               self.working_space_temp_main_subs)

    def merge_tts_audio(self) -> None:
        """
        Merges the generated TTS audio with any matching audio in the temp
        directory and writes the result to the output directory as EAC3.

        For each audio file in main_subs: if a file with the same base name
        exists in the temp directory, the two are mixed (the main_subs track
        is boosted by 7 dB); otherwise the file is converted to EAC3 as-is.
        Source files are removed after processing.
        """
        main_subs_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp_main_subs)
        tmp_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp)
        for file_name, main_subs_file in main_subs_files_dict.items():
            main_subs_file_path: str = path.join(
                self.working_space_temp_main_subs, main_subs_file)
            output_file: str = path.join(
                self.working_space_output, file_name + ".eac3")
            if file_name in tmp_files_dict:
                tmp_file: str = tmp_files_dict[file_name]
                tmp_file_path: str = path.join(
                    self.working_space_temp, tmp_file)
                main_subs_file_duration: float = self._get_file_duration(
                    main_subs_file_path)
                tmp_file_duration: float = self._get_file_duration(
                    tmp_file_path)
                # The longer file goes first: amix uses duration=first, so
                # the mixed output keeps the longer track's full length.
                input_file_1: str
                input_file_2: str
                if main_subs_file_duration > tmp_file_duration:
                    input_file_1, input_file_2 = main_subs_file_path, tmp_file_path
                else:
                    input_file_1, input_file_2 = tmp_file_path, main_subs_file_path
                self._merge_files(input_file_1, input_file_2, output_file)
                remove(main_subs_file_path)
                remove(tmp_file_path)
            else:
                self._convert_to_eac3(main_subs_file_path, output_file)
                remove(main_subs_file_path)
            self._remove_same_name_files(
                self.working_space_temp_main_subs, file_name)

    def _get_files_dict(self, directory: str) -> Dict[str, str]:
        """
        Gets a dictionary of the files in the given directory, keyed by base
        name (without extension), excluding subtitle files (.srt/.ass).

        Args:
            - directory (str): The directory to get the files from.

        Returns:
            - Dict[str, str]: Base name -> file name for each audio file.
        """
        excluded_extensions: List[str] = ["srt", "ass"]
        return {path.splitext(f)[0]: f for f in listdir(directory) if path.splitext(f)[1][1:].lower() not in excluded_extensions}

    def _get_file_duration(self, file_path: str) -> float:
        """
        Gets the duration of the media file at the given path.

        Args:
            - file_path (str): The path to the file.

        Returns:
            - float: The duration of the file in seconds.
        """
        return float(mediainfo(file_path)['duration'])

    def _merge_files(self, input_file_1: str, input_file_2: str, output_file: str):
        """
        Mixes two audio files into a single file with FFmpeg.

        The TTS track (the one from main_subs) is boosted by 7 dB before
        mixing; the output length follows the first input (duration=first).

        Args:
            - input_file_1 (str): The path to the first input file.
            - input_file_2 (str): The path to the second input file.
            - output_file (str): The path to the output file.
        """
        if 'main_subs' in input_file_1:
            command: List[str] = [
                self.ffmpeg_path,
                "-i", input_file_1,
                "-i", input_file_2,
                "-filter_complex", "[0:a]volume=7dB[a1];[a1][1:a]amix=inputs=2:duration=first",
                output_file
            ]
        else:
            command: List[str] = [
                self.ffmpeg_path,
                "-i", input_file_1,
                "-i", input_file_2,
                "-filter_complex", "[1:a]volume=7dB[a1];[0:a][a1]amix=inputs=2:duration=first",
                output_file
            ]
        call(command)

    def _convert_to_eac3(self, input_file: str, output_file: str):
        """
        Converts an audio file to EAC3 format with FFmpeg.

        Args:
            - input_file (str): The path to the input file.
            - output_file (str): The path to the output file.
        """
        command: List[str] = [
            self.ffmpeg_path,
            "-i", input_file,
            "-c:a", "eac3",
            output_file
        ]
        call(command)

    def _remove_same_name_files(self, directory: str, file_name: str):
        """
        Removes all files with the given base name from the specified directory.

        Args:
            - directory (str): The directory to remove the files from.
            - file_name (str): The base name (without extension) of the files to remove.
        """
        for file in listdir(directory):
            if path.splitext(file)[0] == file_name:
                remove(path.join(directory, file))

    def generate_audio(self, settings: Settings):
        """
        Generates the audio file from the subtitle file using the specified
        TTS settings, then merges the result into the output directory.

        Args:
            - settings (Settings): The TTS settings to use.
        """
        tts: Optional[str] = settings.tts
        tts_speed: Optional[str] = settings.tts_speed
        tts_volume: Optional[str] = settings.tts_volume
        console.print("Rozpoczynam generowanie pliku audio...",
                      style='green_bold', end=' ')
        console.print(self.filename, style='white_bold')
        if tts == "TTS - Zosia - Harpo":
            self.srt_to_wav_harpo(tts_speed, tts_volume)
        elif tts == "TTS - Agnieszka - Ivona":
            self.srt_to_wav_balabolka(tts_speed, tts_volume)
        elif tts in ["TTS - Zofia - Edge", "TTS - Marek - Edge"]:
            self.srt_to_wav_edge_online(tts, tts_speed, tts_volume)
        console.print(
            "Generowanie pliku audio zakończone.", style='green_bold')
        self.merge_tts_audio()

    def srt_to_eac3_elevenlabs(self) -> None:
        """
        Opens the main_subs folder for the user to add audio files generated
        by ElevenLabs, waits for a keypress, then merges the audio.
        """
        Popen(['explorer', path.realpath(self.working_space_temp_main_subs)])
        console.print("\nWygeneruj pliki audio z plików .srt za pomocą 11Labs_TTS_Colab,\na następnie dodaj je do folderu main_subs.",
                      style='yellow_bold')
        console.print(
            "11Labs_TTS_Colab: https://github.com/MattyMroz/11Labs_TTS_Colab", style='yellow_bold')
        console.print(
            "\n[green_italic]Naciśnij dowolny klawisz, aby kontynuować...", end=' ')
        getch()
        console.print()
        self.merge_tts_audio()