srt / app /srt_to_ssml.py
badalsahani's picture
Update app/srt_to_ssml.py
9e20a4b
import re
from .translator import translate_text
# Captures HH, MM, SS, mmm of the start and end time in a line such as
# "00:00:20,130 --> 00:00:23,419". Anything after the end time is ignored.
_SRT_TIME_LINE_RE = re.compile(
    r'(\d\d):(\d\d):(\d\d),(\d\d\d) --> (\d\d):(\d\d):(\d\d),(\d\d\d)'
)


def _escape_ssml_text(text):
    """Replace the five XML special characters in *text* with entities."""
    # "&" is replaced first so the ampersands introduced by the other
    # entities are not escaped a second time.
    replacements = (
        ("&", "&amp;"),
        ('"', "&quot;"),
        ("'", "&apos;"),
        ("<", "&lt;"),
        (">", "&gt;"),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text


def _to_milliseconds(hours, minutes, seconds, millis):
    """Convert the four numeric fields of an SRT timestamp to milliseconds."""
    return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis)


def _parse_srt_cues(lines):
    """Parse SRT *lines* into a list of cue dicts, in file order.

    A cue starts at a line holding only digits that is immediately followed
    by a timestamp line. Each returned dict has the integer keys
    ``start_ms``, ``end_ms``, ``duration_ms`` and ``break_until_next`` (gap
    to the following cue, ``0`` for the last cue) plus the string ``text``
    (all text lines of the cue joined with single spaces).
    """
    cues = []
    total = len(lines)
    for index, raw_line in enumerate(lines):
        # A trailing digit-only line has no timestamp line after it.
        if index + 1 >= total or not raw_line.strip().isdigit():
            continue
        match = _SRT_TIME_LINE_RE.match(lines[index + 1])
        if match is None:
            continue

        fields = match.groups()
        start_ms = _to_milliseconds(*fields[:4])
        end_ms = _to_milliseconds(*fields[4:])

        # Gather text lines up to the first blank line or end of file. A cue
        # with no text therefore yields '' instead of swallowing the next cue.
        text_parts = []
        cursor = index + 2
        while cursor < total and lines[cursor].strip():
            text_parts.append(lines[cursor].strip())
            cursor += 1

        # The gap belongs to the cue that precedes this one in the file,
        # regardless of how the cues happen to be numbered.
        if cues:
            cues[-1]['break_until_next'] = start_ms - cues[-1]['end_ms']

        cues.append({
            'start_ms': start_ms,
            'end_ms': end_ms,
            'duration_ms': end_ms - start_ms,
            'text': ' '.join(text_parts),
            'break_until_next': 0,
        })
    return cues


def sub_to_ssml(input, output, lang, voice, gender, from_lang, to_lang):
    """Convert an SRT subtitle file into an SSML document and write it to disk.

    Every cue becomes one ``<prosody duration="...ms">`` element holding the
    cue text after it has been XML-escaped and passed through
    ``translate_text``. A ``<break time="...ms"/>`` follows a cue when there
    is a positive gap before the next cue starts.

    Args:
        input: Path of the SRT file to read (decoded as UTF-8, BOM tolerated).
            The name shadows the builtin but is kept for caller compatibility.
        output: Path of the SSML file to write (UTF-8 with BOM).
        lang: Value written to the ``xml:lang`` attributes.
        voice: Voice name for the ``<voice>`` element. ``None``, ``''`` or
            the string ``'none'`` (any case) omits the ``<voice>`` wrapper.
        gender: Value written to the ``xml:gender`` attribute of ``<voice>``.
        from_lang: Forwarded unchanged to ``translate_text`` as its second
            argument (presumably the source language -- confirm in translator).
        to_lang: Forwarded unchanged to ``translate_text`` as its third
            argument (presumably the target language -- confirm in translator).

    Returns:
        The SSML document as a string, identical to what was written.

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written.
    """
    ssml_version = "1.0"
    duration_attribute = "duration"
    # utf_8_sig writes a BOM; the original author notes Azure requires it.
    output_encoding = "utf_8_sig"

    with open(input, 'r', encoding='utf-8-sig') as f:
        cues = _parse_srt_cues(f.readlines())

    if not voice or voice.lower() == 'none':
        voice_open = ''
        voice_close = ''
    else:
        voice_open = f'<voice xml:lang="{lang}" xml:gender="{gender}" name="{voice}">'
        voice_close = '</voice>'

    parts = [f'<speak version="{ssml_version}" xml:lang="{lang}">{voice_open}\n']
    for cue in cues:
        gap_ms = cue['break_until_next']
        # Overlapping cues give a negative gap; a negative break time is not
        # meaningful, so only positive gaps produce a <break> element.
        break_tag = f'<break time="{gap_ms}ms"/>' if gap_ms > 0 else ''

        # NOTE(review): the text is escaped *before* translation, so the
        # translator receives entities such as "&amp;" and its output is
        # written unescaped. Order kept as-is because translate_text's
        # handling of entities is not visible here -- confirm and revisit.
        escaped_text = _escape_ssml_text(cue['text'])
        translated_text = translate_text(escaped_text, from_lang, to_lang)

        duration_ms = cue['duration_ms']
        parts.append(
            f'\t<prosody {duration_attribute}="{duration_ms}ms">'
            f'{translated_text}</prosody>{break_tag}\n'
        )
    parts.append(f'{voice_close}</speak>')
    output_string = ''.join(parts)

    with open(output, 'w', encoding=output_encoding) as f:
        f.write(output_string)
    return output_string