# MattyMroz's picture
# mm_avh_working_space
# 068ed60
"""
The SubtitleRefactor class in this module provides various utilities for working with subtitle files.
It can be used in different scenarios to perform tasks such as splitting ASS subtitle files, converting subtitles to different formats, and organizing files.
* Example usage for splitting an ASS subtitle file based on styles:
subtitle_tool = SubtitleRefactor("sample_subtitle.ass")
subtitle_tool.split_ass()
* Example usage for converting ASS subtitles to SRT format:
subtitle_tool = SubtitleRefactor("sample_subtitle.ass")
subtitle_tool.ass_to_srt()
* Example usage for moving an SRT subtitle file to a specified directory:
subtitle_tool = SubtitleRefactor("sample_subtitle.srt")
subtitle_tool.move_srt()
* Example usage for converting a text file to SRT format and performing other operations:
subtitle_tool = SubtitleRefactor("sample_text.txt")
subtitle_tool.txt_to_srt(1)
* Example usage for converting numbers in an SRT subtitle file to their word equivalents in Polish:
subtitle_tool = SubtitleRefactor("sample_subtitle.srt")
subtitle_tool.convert_numbers_in_srt()
* Example usage for updating subtitles in an existing ASS file using translated subtitles from an SRT file:
subtitle_tool = SubtitleRefactor("sample_subtitle.srt")
subtitle_tool.srt_to_ass()
"""
import re
from contextlib import suppress
from dataclasses import dataclass
from os import makedirs, path, remove, stat
from shutil import move
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from pyasstosrt import Subtitle
from pysubs2 import load, SSAEvent, SSAFile
from constants import (WORKING_SPACE,
WORKING_SPACE_OUTPUT,
WORKING_SPACE_TEMP,
WORKING_SPACE_TEMP_MAIN_SUBS,
WORKING_SPACE_TEMP_ALT_SUBS,
console)
from utils.number_in_words import NumberInWords
@dataclass(slots=True)
class SubtitleRefactor:
    """
    The SubtitleRefactor class is used for manipulating subtitle files. It provides functionalities such as splitting ASS subtitle files based on styles, converting ASS subtitles to SRT format, and moving SRT files to a specified directory.

    Attributes:
        - filename (str): The name of the subtitle file to be processed.
        - working_space (str, optional): Root working directory. Defaults to WORKING_SPACE.
        - working_space_output (str, optional): The directory where the output files will be saved. Defaults to WORKING_SPACE_OUTPUT.
        - working_space_temp (str, optional): The directory the source subtitle/text files are read from during processing. Defaults to WORKING_SPACE_TEMP.
        - working_space_temp_main_subs (str, optional): The directory for main subtitles during processing. Defaults to WORKING_SPACE_TEMP_MAIN_SUBS.
        - working_space_temp_alt_subs (str, optional): The directory for alternate subtitles during processing. Defaults to WORKING_SPACE_TEMP_ALT_SUBS.

    Methods:
        - split_ass(self) -> None: Splits an ASS subtitle file into two files based on selected styles.
        - ass_to_srt(self) -> None: Converts ASS subtitle files to SRT format.
        - move_srt(self) -> None: Moves an SRT subtitle file to the main_subs directory and removes HTML-like tags.
        - txt_to_srt(self, lines_per_caption: int) -> None: Converts a text file to SRT (SubRip Text) format.
        - convert_numbers_in_srt(self) -> None: Converts numbers in an SRT subtitle file to their word equivalents in Polish.
        - srt_to_ass(self) -> None: Updates subtitles in an existing ASS file using translated subtitles from an SRT file and saves the result to the output directory. If the ASS file does not exist, the SRT file is moved to the output directory and keeps its .srt extension. Afterwards the ASS and SRT files left in the alt_subs directory are deleted.
    """
    filename: str
    working_space: str = WORKING_SPACE
    working_space_output: str = WORKING_SPACE_OUTPUT
    working_space_temp: str = WORKING_SPACE_TEMP
    # These two need annotations to be real dataclass fields: without them they were plain
    # class attributes that the constructor could not set and that, because of slots=True,
    # could not be overridden on an instance either.
    working_space_temp_main_subs: str = WORKING_SPACE_TEMP_MAIN_SUBS
    working_space_temp_alt_subs: str = WORKING_SPACE_TEMP_ALT_SUBS
def split_ass(self) -> None:
    """
    Splits an ASS subtitle file into two files based on styles selected by the user.

    Events whose style was selected are saved to the main_subs directory, all remaining
    events to the alt_subs directory, and the source file is removed afterwards. When no
    style is selected, the file is moved to main_subs unchanged and nothing is split.
    """
    self._create_directories()
    subs: SSAFile = self._load_subs()
    styles: List[str] = self._get_styles(subs)
    self._display_styles(styles)
    selected_styles: List[str] = self._select_styles(styles)

    if not selected_styles:
        # Nothing to split: the whole file becomes the main subtitles.
        self._move_subs_to_main()
        return

    main_subs, alt_subs = self._split_subs(subs, selected_styles)
    self._copy_metadata_and_styles(
        subs, main_subs, alt_subs, selected_styles)
    self._save_subs(main_subs, alt_subs, subs, selected_styles)
    self._remove_source_file()
    # Message text restored from its mis-encoded form.
    console.print("Podział napisów zakończony pomyślnie.",
                  style='green_bold')
def _create_directories(self) -> None:
"""
Creates directories if they do not exist.
"""
if not path.exists(self.working_space_temp_main_subs):
makedirs(self.working_space_temp_main_subs, exist_ok=True)
if not path.exists(self.working_space_temp_alt_subs):
makedirs(self.working_space_temp_alt_subs, exist_ok=True)
def _load_subs(self) -> SSAFile:
    """
    Reads the subtitle file named by self.filename from the temporary directory.

    Returns:
        SSAFile: The subtitles parsed by pysubs2 from the UTF-8 decoded file.
    """
    source_path: str = path.join(self.working_space_temp, self.filename)
    with open(source_path, 'r', encoding='utf-8') as handle:
        parsed: SSAFile = SSAFile.from_file(handle)
    return parsed
def _get_styles(self, subs: SSAFile) -> List[str]:
"""
Returns a list of styles in the subtitle file.
"""
styles: List[str] = []
for event in subs:
if event.style not in styles:
styles.append(event.style)
return styles
def _display_styles(self, styles: List[str]) -> None:
    """
    Prints the file name followed by a 1-based numbered list of the available styles.

    Args:
        styles (List[str]): Style names to list; the printed numbers are what
            _select_styles expects the user to type.
    """
    # User-facing Polish text restored from its mis-encoded form.
    console.print("PODZIAŁ PLIKU:", style='yellow_bold')
    console.print(self.filename, style='white_bold')
    console.print("Dostępne style do TTS:", style='yellow_bold')
    for number, style_name in enumerate(styles, start=1):
        console.print(
            f"[yellow_bold]{number}.[/yellow_bold] {style_name}", style='white_bold')
def _select_styles(self, styles: List[str]) -> List[str]:
    """
    Prompts the user for style numbers until an empty line is entered.

    Args:
        styles (List[str]): The styles on offer; the user refers to them by 1-based number.

    Returns:
        List[str]: The chosen style names in the order they were first picked.
            Non-numeric input, out-of-range numbers and repeated picks are ignored.
    """
    selected_styles: List[str] = []
    while True:
        # Prompt text restored from its mis-encoded form.
        console.print("Wybierz style do zapisu (naciśnij ENTER, aby zakończyć):",
                      style='green_bold', end=" ")
        selection: str = input('')
        if not selection:
            break
        # Anything that is not an integer is skipped and the prompt repeats.
        with suppress(ValueError):
            selected_index: int = int(selection) - 1
            if 0 <= selected_index < len(styles):
                chosen: str = styles[selected_index]
                # Picking the same style twice should not list it twice.
                if chosen not in selected_styles:
                    selected_styles.append(chosen)
    return selected_styles
def _move_subs_to_main(self) -> None:
    """
    Moves the unsplit subtitle file from the temporary directory to the main_subs directory.

    Used when the user selected no styles; a notice is printed first.
    """
    # Message text restored from its mis-encoded form.
    console.print("Nie wybrano żadnych stylów. Przeniesiono napisów do main_subs.",
                  style='red_bold')
    makedirs(self.working_space_temp_main_subs, exist_ok=True)
    source_file_path: str = path.join(
        self.working_space_temp, self.filename)
    main_file_path: str = path.join(
        self.working_space_temp_main_subs, self.filename)
    move(source_file_path, main_file_path)
def _split_subs(self, subs: SSAFile, selected_styles: List[str]) -> Tuple[SSAFile, SSAFile]:
    """
    Distributes the events of a subtitle file over two new files by style.

    Args:
        subs (SSAFile): The subtitles to split.
        selected_styles (List[str]): Styles whose events go to the main file.

    Returns:
        Tuple[SSAFile, SSAFile]: (main, alt) - events with a selected style, and all other
            events, each in their original relative order.
    """
    main_subs: SSAFile = SSAFile()
    alt_subs: SSAFile = SSAFile()
    for event in subs:
        destination: SSAFile = main_subs if event.style in selected_styles else alt_subs
        destination.append(event)
    return main_subs, alt_subs
def _copy_metadata_and_styles(self, subs: SSAFile, main_subs: SSAFile, alt_subs: SSAFile, selected_styles: List[str]) -> None:
"""
Copies metadata and styles to the output files.
"""
main_subs.info = subs.info
alt_subs.info = subs.info
main_style_names: List[str] = [style_name for style_name in subs.styles.keys()
if style_name in selected_styles]
main_subs.styles.clear()
for style_name in main_style_names:
main_subs.styles[style_name] = subs.styles[style_name]
alt_style_names: List[str] = [style_name for style_name in subs.styles.keys()
if style_name not in selected_styles]
alt_subs.styles.clear()
for style_name in alt_style_names:
alt_subs.styles[style_name] = subs.styles[style_name]
def _save_subs(self, main_subs: SSAFile, alt_subs: SSAFile, subs: SSAFile, selected_styles: List[str]) -> None:
    """
    Writes the two split subtitle files in ASS format.

    The main file goes to the main_subs directory and the alternate file to the alt_subs
    directory, both under self.filename and encoded as UTF-8.

    Args:
        main_subs (SSAFile): Subtitles holding the events of the selected styles.
        alt_subs (SSAFile): Subtitles holding all remaining events.
        subs (SSAFile): The source subtitles providing info and style definitions.
        selected_styles (List[str]): Style names that belong to the main file.
    """
    # Metadata and styles are (re)applied through the shared helper rather than by
    # repeating its body here; the result is the same as the former inline copy.
    self._copy_metadata_and_styles(subs, main_subs, alt_subs, selected_styles)

    outputs = (
        (path.join(self.working_space_temp_main_subs, self.filename), main_subs),
        (path.join(self.working_space_temp_alt_subs, self.filename), alt_subs),
    )
    for output_file, ssa_file in outputs:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(ssa_file.to_string(format_='ass'))
def _remove_source_file(self) -> None:
"""
Removes the source subtitle file.
"""
remove(path.join(self.working_space_temp, self.filename))
def ass_to_srt(self) -> None:
"""
Converts the ASS file named by self.filename to SRT format.
The file is looked up in both the main_subs and the alt_subs directory; a directory that does not contain it is skipped.
The conversion itself is delegated to pyasstosrt: a Subtitle is built for the file and its export() method is called.
"""
# The same file name is tried in each of the two directories.
folders = [self.working_space_temp_main_subs,
self.working_space_temp_alt_subs]
for folder in folders:
file_path: str = path.join(folder, self.filename)
# Not every file has both a main and an alternate part.
if not path.exists(file_path):
continue
sub: Subtitle = Subtitle(file_path)
sub.export()
# NOTE(review): indentation is missing in this copy of the file, so it is unclear whether
# this argument-less print runs once per converted file or once after the loop - confirm.
console.print()
def move_srt(self) -> None:
    """
    Moves an SRT subtitle file from the temporary directory to main_subs and removes HTML-like tags.

    The file is read as UTF-8, every '<...>' span is removed, the result is written under the
    same name to the main_subs directory (created if missing), and the source file is deleted.
    """
    source_file_path: str = path.join(
        self.working_space_temp, self.filename)
    target_file_path: str = path.join(
        self.working_space_temp_main_subs, self.filename)

    # exist_ok=True makes this safe to call whether or not the directory is already there.
    makedirs(self.working_space_temp_main_subs, exist_ok=True)

    # Read the whole source file.
    with open(source_file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Remove HTML-like tags; '.' does not match a newline, so a tag must open and close on one line.
    file_content = re.sub(r'<.*?>', '', file_content)

    # Write the cleaned content to the target file.
    with open(target_file_path, 'w', encoding='utf-8') as file:
        file.write(file_content)

    # The cleaned copy is in place, so the source file can be deleted.
    remove(source_file_path)
def txt_to_srt(self, lines_per_caption: int) -> None:
    """
    Converts a text file to SRT (SubRip Text) format.

    This method reads a text file from the temporary directory, tokenizes the text into sentences using the NLTK library, and writes the sentences into a new SRT file. Groups of sentences are combined into a single caption based on the 'lines_per_caption' parameter. Each group becomes a separate subtitle in the SRT file. The original text file is then deleted, and the filename attribute of the class instance is updated to the new SRT file. Finally, the SRT file is moved to the 'main_subs' directory via move_srt().

    Args:
        lines_per_caption (int): The number of sentences to include in each caption. Must be at least 1.

    Raises:
        ValueError: If lines_per_caption is smaller than 1.
    """
    if lines_per_caption < 1:
        raise ValueError("lines_per_caption must be a positive integer")

    txt_file_path: str = path.join(self.working_space_temp, self.filename)
    # Swap only the final extension. A blanket str.replace('.txt', '.srt') would also rewrite
    # '.txt' occurring elsewhere in the directory or file name, and would leave other names
    # unchanged so that the SRT overwrote the source, which was then deleted.
    srt_filename: str = path.splitext(self.filename)[0] + '.srt'
    srt_file_path: str = path.join(self.working_space_temp, srt_filename)

    with open(txt_file_path, 'r', encoding='utf-8') as file:
        text: str = file.read()

    sentences: List[str] = sent_tokenize(text)
    subs: SSAFile = SSAFile()
    for i in range(0, len(sentences), lines_per_caption):
        caption = ' '.join(sentences[i:i + lines_per_caption]).strip()
        # Plain text carries no timing, so every caption gets start=0 and end=0.
        subs.append(SSAEvent(start=0, end=0, text=caption))

    with open(srt_file_path, 'w', encoding='utf-8') as file:
        file.write(subs.to_string(format_='srt'))

    remove(txt_file_path)
    # From here on the instance refers to the generated SRT file.
    self.filename = srt_filename
    self.move_srt()
def convert_numbers_in_srt(self) -> None:
    """
    Converts numbers in an SRT subtitle file to their word equivalents in Polish.

    The file named by self.filename is loaded from the main_subs directory, each caption's
    text is passed through NumberInWords.convert_numbers_in_text, and the file is saved back
    in place. A caption for which the conversion raises IndexError is reported on the console
    and left unchanged.
    """
    srt_file_path: str = path.join(
        self.working_space_temp_main_subs, self.filename)
    subs = load(srt_file_path, encoding='utf-8')
    number_in_words = NumberInWords()

    # Captions are numbered from 1 in the message shown to the user.
    for index, sub in enumerate(subs, start=1):
        try:
            sub.text = number_in_words.convert_numbers_in_text(sub.text)
        except IndexError:
            # Message text restored from its mis-encoded form.
            console.print(
                f"[red_bold]Wystąpił błąd w napisie {index}:[/red_bold] {sub.text}.\n[red_bold]Pomijam ten napis.", style='white_bold')

    subs.save(srt_file_path)
    console.print(
        "\nPrzekonwertowano liczby na słowa:", style='green_bold')
    console.print(srt_file_path, '\n', style='white_bold')
def srt_to_ass(self) -> None:
"""
Merges translated text from an SRT file in the alt_subs directory into the matching ASS file and saves the result to the output directory.
The SRT file is self.filename; the ASS file has the same name with '.srt' replaced by '.ass'.
- If the SRT file is empty (size 0), the method returns without doing anything.
- If the ASS file exists, its Dialogue events are paired in order with the SRT entries and the ASS file is saved to the output directory.
- If the ASS file does not exist, the SRT file itself is moved to the output directory under an '.srt' name.
Afterwards, the SRT and ASS files still present in the alt_subs directory are removed.
NOTE(review): indentation is missing in this copy of the file, so the exact nesting of the statements below should be confirmed against the original.
"""
srt_file_path: str = path.join(
self.working_space_temp_alt_subs, self.filename)
ass_file_path: str = path.join(
self.working_space_temp_alt_subs, self.filename.replace('.srt', '.ass'))
output_file_path: str = path.join(
self.working_space_output, self.filename.replace('.srt', '.ass'))
# An empty SRT file has nothing to merge.
if stat(srt_file_path).st_size == 0:
return
srt_subs = load(srt_file_path, encoding='utf-8')
if path.exists(ass_file_path):
ass_subs = SSAFile.load(ass_file_path)
# srt_index walks the SRT entries in step with the ASS Dialogue events.
srt_index = 0
for event in ass_subs.events:
if event.type == "Dialogue" and srt_index < len(srt_subs):
srt_lines = srt_subs[srt_index].text.split('\n')
# NOTE(review): split('}') removes every '}' from the resulting pieces, so the
# rfind('}') below can only return -1 and the replacement branch looks unreachable;
# event.text would then be rejoined unchanged. Confirm the intended behaviour.
ass_lines = event.text.split('}')
if len(ass_lines) >= len(srt_lines):
for j in range(len(srt_lines)):
last_brace_position = ass_lines[j].rfind('}')
if last_brace_position != -1:
ass_lines[j] = ass_lines[j][:last_brace_position +
1] + srt_lines[j]
event.text = '}'.join(ass_lines)
srt_index += 1
ass_subs.save(output_file_path)
else:
# No ASS counterpart: the SRT is delivered as-is, so the output name goes back to '.srt'.
output_file_path = output_file_path.replace('.ass', '.srt')
move(srt_file_path, output_file_path)
console.print("Przeniesiono alternatywne napisy:", style="green_bold")
console.print(output_file_path, style="white_bold")
# Clean up whatever is still left in alt_subs (the SRT is already gone if it was moved).
if path.exists(srt_file_path):
remove(srt_file_path)
if path.exists(ass_file_path):
remove(ass_file_path)