|
import spaces |
|
import numpy as np |
|
import gradio as gr |
|
import torch |
|
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor |
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
from PyPDF2 import PdfReader |
|
import re |
|
import textwrap |
|
import soundfile as sf |
|
|
|
|
|
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Text-to-speech model and its tokenizer, loaded once at import time.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
tts_model = tts_model.to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

# NOTE(review): the feature extractor comes from the *mini* checkpoint while
# the model and tokenizer come from the *large* one. It is only used below to
# read the sampling rate; presumably both checkpoints share it -- confirm.
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")

# Sample rate reported alongside every generated audio clip.
SAMPLE_RATE = feature_extractor.sampling_rate

# Seed applied before each generation call.
SEED = 42
|
|
|
|
|
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        The page texts concatenated in page order with no separator between
        pages. A page whose ``extract_text()`` returns ``None`` or an empty
        string contributes nothing.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PdfReader(file)
        # Build the result with a single join instead of growing a string
        # page by page. Extraction happens while the file is still open.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
|
|
|
# Sentence terminators; compiled once at import time instead of on every call.
_SENTENCE_ENDINGS = re.compile(r'[.!?]')


def split_text_into_sentences(text):
    """Split *text* into fragments at every ``.``, ``!`` and ``?``.

    The terminator characters are consumed by the split, so the returned
    fragments carry no end punctuation. Every occurrence splits, including
    periods that are not sentence ends (e.g. inside ``3.14``).

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of whitespace-stripped, non-empty fragments in their original
        order.
    """
    if not text:
        return []
    stripped = (fragment.strip() for fragment in _SENTENCE_ENDINGS.split(text))
    return [fragment for fragment in stripped if fragment]
|
|
|
@spaces.GPU(duration=120)
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate *source_text* with a Helsinki-NLP MarianMT checkpoint.

    The text is wrapped into pieces of at most 512 characters
    (``textwrap.wrap``, which also normalizes whitespace), and the pieces are
    translated ``batch_size`` at a time.

    Args:
        source_text: Text to translate. ``None``, empty or whitespace-only
            text returns ``""`` without loading a model.
        source_lang: Source language code used to build the checkpoint name.
        target_lang: Target language code used to build the checkpoint name.
        batch_size: Number of text pieces sent to the model per call.

    Returns:
        The translated pieces in order, each followed by a single space (so a
        non-empty result ends with a trailing space).
    """
    if not source_text or not source_text.strip():
        return ""

    # en -> tr uses the dedicated "tc-big" checkpoint; every other pair uses
    # the generic opus-mt naming scheme.
    if source_lang == 'en' and target_lang == 'tr':
        model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
    else:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []

    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Batches are padded, so hand the attention mask to generate()
        # explicitly instead of relying on it being inferred from pad tokens.
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=512,
        )
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )

    # Each piece keeps its trailing space, matching the previous output format.
    return "".join(f"{chunk} " for chunk in translated_chunks)
|
|
|
|
|
def combine_audio_arrays(audio_list):
    """Concatenate audio clips end to end along axis 0.

    Args:
        audio_list: Sequence of NumPy arrays (or scalars) to join in order.

    Returns:
        A single array holding all clips back to back. An empty
        ``audio_list`` yields an empty ``float32`` array of shape ``(0,)``
        instead of raising.
    """
    # len() rather than truthiness so an ndarray argument is also accepted.
    if len(audio_list) == 0:
        return np.zeros(0, dtype=np.float32)
    # A one-sample clip that went through ``.squeeze()`` is 0-d, which
    # np.concatenate rejects; promote such clips to 1-d first.
    return np.concatenate([np.atleast_1d(clip) for clip in audio_list], axis=0)
|
|
|
@spaces.GPU(duration=35)
def generate_single_wav_from_text(sentence, description):
    """Generate audio for one sentence with the module-level TTS model.

    Args:
        sentence: Text passed to the model as the prompt
            (``prompt_input_ids``).
        description: Voice description passed to the model as ``input_ids``;
            surrounding whitespace is stripped first.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple where ``audio_arr`` is the
        generated output moved to CPU, converted to NumPy and squeezed.
    """
    # Re-seed before every call so sampling starts from the same RNG state.
    torch.manual_seed(SEED)

    description_tokens = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    sentence_tokens = tts_tokenizer(sentence, return_tensors="pt").to(device)

    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": sentence_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": sentence_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = tts_model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
|
|
|
|
|
# Translation targets offered for each source language. Shared by the initial
# dropdown contents and by update_target_lang so the two cannot drift apart.
LANGUAGE_PAIRS = {
    "en": ["de", "fr", "tr"],
    "tr": ["en"],
    "de": ["en", "fr"],
    "fr": ["en", "de"],
}


def update_target_lang(source_lang):
    """Return a dropdown update with the targets available for *source_lang*.

    The first available target becomes the selected value.

    Raises:
        KeyError: If *source_lang* is not a key of ``LANGUAGE_PAIRS``.
    """
    targets = LANGUAGE_PAIRS[source_lang]
    return gr.update(choices=targets, value=targets[0])


def handle_input(input_mode, pdf_input, text_input):
    """Return the text to process for the selected input mode.

    Args:
        input_mode: ``"Upload PDF"`` to read the uploaded file; any other
            value uses the textbox content.
        pdf_input: Value of the file component: a path string, an object
            exposing the path as ``.name``, or ``None`` when nothing was
            uploaded.
        text_input: Value of the textbox.

    Returns:
        The extracted PDF text, or the textbox content (``""`` if it is
        ``None``).

    Raises:
        gr.Error: If PDF mode is selected but no file was uploaded.
    """
    if input_mode == "Upload PDF":
        if pdf_input is None:
            raise gr.Error("Please upload a PDF file first.")
        # Depending on the Gradio version / component `type`, the file value
        # is either a plain path or a file wrapper with a `.name` attribute.
        pdf_path = pdf_input if isinstance(pdf_input, str) else pdf_input.name
        return pdf_to_text(pdf_path)
    return text_input or ""


def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
    """Turn the selected input into speech, one sentence at a time.

    Reads the input text, optionally translates it, splits it into sentences
    and synthesizes each sentence in turn.

    Yields:
        After every sentence, a pair of ``(sample_rate, combined_audio)`` --
        all audio generated so far joined together -- and the Markdown
        listing of the sentences processed so far. Nothing is yielded when
        the text contains no sentences.
    """
    text = handle_input(input_mode, pdf_input, text_input)

    if translate_checkbox:
        text = translate(text, source_lang, target_lang)

    all_audio = []
    all_text = ""
    for sentence in split_text_into_sentences(text):
        sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
        all_audio.append(audio_arr)
        all_text += f"**Sentence**: {sentence}\n\n"
        # Yield the running result so the UI updates as each sentence finishes.
        yield (sample_rate, combine_audio_arrays(all_audio)), all_text


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            # File extensions must carry the leading dot; a bare 'pdf' is
            # neither an extension nor one of Gradio's file categories.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=list(LANGUAGE_PAIRS), label="Source Language", value="en", interactive=True)
            # Start with every target valid for the default source ("en"),
            # keeping "tr" as the preselected value.
            target_lang = gr.Dropdown(choices=LANGUAGE_PAIRS["en"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(
                label="Voice Description",
                lines=2,
                value="Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    # One row per example, in the same order as the pipeline inputs below.
    examples = [
        [
            "Type Text",
            None,
            "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her palace window, which had a carved frame of black wood.",
            False,
            "en",
            "tr",
            "In an inferior recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
        ],
        [
            "Upload PDF",
            "Ethics.pdf",
            None,
            False,
            "en",
            "tr",
            "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
        ],
    ]

    pipeline_inputs = [input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description]
    pipeline_outputs = [audio_output, markdown_output]

    # Show only the input widget that matches the selected mode.
    input_mode.change(
        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )
    gr.Examples(examples=examples, fn=run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs, cache_examples=False)
    source_lang.change(update_target_lang, inputs=source_lang, outputs=target_lang)

    run_button.click(run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

demo.launch(share=True)
|
|