import io
import os
import re
import tempfile

import chardet
import docx
import fitz  # PyMuPDF
import gradio as gr
import markdown2
import nltk
import torch
import whisper
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # required by newer NLTK releases

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model configuration: each direction maps to a LocaleNLP checkpoint and the
# Marian-style target-language tag prepended to every source sentence.
MODELS = {
    "english_wolof": {
        "model_name": "LocaleNLP/localenlp-eng-wol-0.03",
        "target_tag": ">>wol<<"
    },
    "wolof_english": {
        "model_name": "LocaleNLP/localenlp-wol-eng-0.03",
        "target_tag": ">>eng<<"
    },
    "english_hausa": {
        "model_name": "LocaleNLP/localenlp-eng-hau-0.01",
        "target_tag": ">>hau<<"
    },
    "hausa_english": {
        "model_name": "LocaleNLP/localenlp-hau-eng-0.01",
        "target_tag": ">>eng<<"
    }
}

# Globals: models are loaded lazily and cached, so switching language pairs
# only triggers a reload when the pair actually changes.
translator = None
current_model = None
whisper_model = None
HF_TOKEN = os.getenv("HF_TOKEN")


def load_translation_model(input_lang, output_lang):
    """Load (or reuse) the translation pipeline for the requested language pair."""
    global translator, current_model
    model_key = f"{input_lang.lower()}_{output_lang.lower()}"
    if model_key not in MODELS:
        raise ValueError(f"Translation from {input_lang} to {output_lang} is not supported")
    if current_model != model_key or translator is None:
        model_config = MODELS[model_key]
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_config["model_name"], token=HF_TOKEN
        ).to(device)
        tokenizer = MarianTokenizer.from_pretrained(model_config["model_name"], token=HF_TOKEN)
        translator = {
            "pipeline": pipeline(
                "translation",
                model=model,
                tokenizer=tokenizer,
                device=0 if device.type == 'cuda' else -1,
            ),
            "target_tag": model_config["target_tag"],
        }
        current_model = model_key
    return translator


def load_whisper_model():
    """Load the Whisper ASR model once and cache it."""
    global whisper_model
    if whisper_model is None:
        whisper_model = whisper.load_model("base")
    return whisper_model


def transcribe_audio(audio_file):
    """Transcribe an audio file (path or file-like object) with Whisper."""
    model = load_whisper_model()
    if isinstance(audio_file, str):
        audio_path = audio_file
    else:
        # Whisper needs a path on disk, so spool file-like input to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(audio_file.read())
            audio_path = tmp.name
    result = model.transcribe(audio_path)
    if not isinstance(audio_file, str):
        os.remove(audio_path)
    return result["text"]


def extract_text_from_file(uploaded_file):
    """Extract plain text from PDF, DOCX, HTML, Markdown, SRT, or plain-text files."""
    if isinstance(uploaded_file, str):
        file_path = uploaded_file
        file_type = file_path.split('.')[-1].lower()
        with open(file_path, "rb") as f:
            content = f.read()
    else:
        file_type = uploaded_file.name.split('.')[-1].lower()
        content = uploaded_file.read()

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    elif file_type == "docx":
        # Parse from the bytes already read; re-reading the upload object
        # would fail because its cursor is at EOF after the read() above.
        doc = docx.Document(io.BytesIO(content))
        return "\n".join(para.text for para in doc.paragraphs)
    else:
        # Text-based formats: sniff the encoding, falling back to UTF-8.
        encoding = chardet.detect(content)['encoding']
        content = content.decode(encoding or "utf-8", errors='ignore')
        if file_type in ("html", "htm"):
            soup = BeautifulSoup(content, "html.parser")
            return soup.get_text()
        elif file_type == "md":
            html = markdown2.markdown(content)
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        elif file_type == "srt":
            # Strip subtitle indices and timestamp lines, keeping only the dialogue.
            return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", content)
        elif file_type in ("txt", "text"):
            return content
        else:
            raise ValueError("Unsupported file type")


def translate(text, input_lang, output_lang):
    """Translate text paragraph by paragraph, sentence by sentence."""
    translator = load_translation_model(input_lang, output_lang)
    lang_tag = translator["target_tag"]
    translation_pipeline = translator["pipeline"]

    paragraphs = text.split("\n")
    translated_output = []
    with torch.no_grad():
        for para in paragraphs:
            if not para.strip():
                translated_output.append("")
                continue
            # NLTK's sentence tokenizer handles abbreviations and keeps
            # terminal punctuation, unlike a naive split on ". ".
            sentences = [s.strip() for s in sent_tokenize(para) if s.strip()]
            formatted = [f"{lang_tag} {s}" for s in sentences]
            results = translation_pipeline(
                formatted,
                max_length=5000,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=3,
                repetition_penalty=1.5,
                length_penalty=1.2,
            )
            # Uppercase only the first character; str.capitalize() would
            # lowercase the rest of the sentence.
            translated_sentences = [
                r['translation_text'][:1].upper() + r['translation_text'][1:]
                for r in results
            ]
            translated_output.append(' '.join(translated_sentences))
    return "\n".join(translated_output)


def process_input(input_mode, text, audio_file, file_obj, input_lang):
    """Resolve the raw input (typed text, audio, or document) to plain text."""
    input_text = ""
    if input_mode == "Text":
        input_text = text
    elif input_mode == "Audio":
        if audio_file is not None:
            input_text = transcribe_audio(audio_file)
    elif input_mode == "File":
        if file_obj is not None:
            input_text = extract_text_from_file(file_obj)
    return input_text


def translate_and_return(text, input_lang, output_lang):
    if not text.strip():
        return "No input text to translate."
    return translate(text, input_lang, output_lang)


def update_input_lang_dropdown(input_mode):
    # Whisper transcribes to English here, so lock the input language for audio.
    if input_mode == "Audio":
        return gr.update(value="English", interactive=False)
    return gr.update(interactive=True)


# Gradio UI components
with gr.Blocks() as demo:
    gr.Markdown("## LocaleNLP Translator")
    gr.Markdown("Translate between English, Wolof, and Hausa using LocaleNLP models.")

    with gr.Row():
        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")
    with gr.Row():
        input_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa"], label="Input Language", value="English")
        output_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa"], label="Output Language", value="Hausa")

    input_text = gr.Textbox(label="Enter text", lines=10, visible=True)
    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
    file_input = gr.File(
        file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'],
        label="Upload document",
        visible=False,
    )
    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
    translate_button = gr.Button("Translate")
    output_text = gr.Textbox(label="Translated Text", lines=10, interactive=False)

    def update_visibility(mode):
        return {
            input_text: gr.update(visible=(mode == "Text")),
            audio_input: gr.update(visible=(mode == "Audio")),
            file_input: gr.update(visible=(mode == "File")),
            extracted_text: gr.update(value="", visible=True),
            output_text: gr.update(value=""),
        }

    input_mode.change(
        fn=update_visibility,
        inputs=input_mode,
        outputs=[input_text, audio_input, file_input, extracted_text, output_text],
    )
    input_mode.change(fn=update_input_lang_dropdown, inputs=input_mode, outputs=input_lang)

    def handle_process(mode, text, audio, file_obj, in_lang):
        try:
            extracted = process_input(mode, text, audio, file_obj, in_lang)
            return extracted, ""
        except Exception as e:
            return "", f"Error: {str(e)}"

    def handle_translate(text, in_lang, out_lang):
        return translate_and_return(text, in_lang, out_lang)

    # Chain the two steps with .then() so translation runs only after the
    # extracted text has been written back; two independent click listeners
    # on the same button would race and translate the stale textbox value.
    translate_button.click(
        fn=handle_process,
        inputs=[input_mode, input_text, audio_input, file_input, input_lang],
        outputs=[extracted_text, output_text],
    ).then(
        fn=handle_translate,
        inputs=[extracted_text, input_lang, output_lang],
        outputs=output_text,
    )

demo.launch()
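
# --- Example (not executed; demo.launch() blocks above) ----------------------
# A minimal sketch of calling the translation path directly, without the UI.
# It assumes HF_TOKEN is set and grants access to the LocaleNLP checkpoints;
# the sample sentence is purely illustrative.
#
#   result = translate("Good morning. How is your family?", "English", "Hausa")
#   print(result)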