import os
import torch
import torchaudio
import soundfile as sf
import gradio as gr
from transformers import SeamlessM4TProcessor, SeamlessM4TModel

# ✅ Load Model and Processor
HF_MODEL_ID = "facebook/hf-seamless-m4t-medium"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = SeamlessM4TProcessor.from_pretrained(HF_MODEL_ID)
model = SeamlessM4TModel.from_pretrained(HF_MODEL_ID).to(device).eval()


# ✅ Voice-to-Text Translation Function
def voice_translate(audio_file, src_lang, tgt_lang):
    try:
        # 🔥 Load audio (gr.File may pass a file-like object or a plain path string)
        audio_path = audio_file.name if hasattr(audio_file, "name") else audio_file
        waveform, sr = sf.read(audio_path)

        # 🔄 Convert stereo to mono if needed
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=1)

        # 🔧 Ensure float32 format
        waveform = waveform.astype("float32")

        # 🔄 Resample to 16 kHz if needed (SeamlessM4T expects 16 kHz input)
        if sr != 16000:
            waveform_tensor = torch.tensor(waveform).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform_tensor).squeeze(0).numpy()
            sr = 16000

        # ✅ Prepare model input
        inputs = processor(
            audios=waveform,
            sampling_rate=sr,
            return_tensors="pt",
            src_lang=src_lang
        ).to(device)

        # ✅ Run inference
        with torch.no_grad():
            output = model.generate(
                **inputs,
                tgt_lang=tgt_lang,
                generate_speech=False  # ❌ Only text translation
            )

        # ✅ Decode output token IDs into text
        translated_text = processor.batch_decode(
            output.sequences,
            skip_special_tokens=True
        )[0]

        return translated_text  # ⬅️ Single Textbox output takes the string directly

    except Exception as e:
        return f"❌ Error: {str(e)}"


# ✅ Gradio Interface
iface = gr.Interface(
    fn=voice_translate,
    inputs=[
        gr.File(label="🎤 Input Audio"),  # ✅ Accepts file upload
        gr.Textbox(label="Source Language Code (e.g. eng)"),
        gr.Textbox(label="Target Language Code (e.g. fra)")
    ],
    outputs=[
        gr.Textbox(label="🌍 Translated Text")
    ],
    title="Kalpani iVoice (Voice ➜ Translated Text)",
    allow_flagging="never"
).queue()

# ✅ Launch
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        share=True,
        server_port=int(os.environ.get("PORT", 7860))
    )
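
# 🔎 Example client usage — a minimal sketch, not part of this script. It assumes the app is
# already running (e.g. at http://localhost:7860), that gradio_client >= 1.0 is installed, and
# that the endpoint name is Gradio's default "/predict" for an Interface. "sample.wav" is a
# hypothetical local file; the language codes follow the same convention as the UI (e.g. eng, fra).
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       handle_file("sample.wav"),  # 🎤 uploaded audio file
#       "eng",                      # source language code
#       "fra",                      # target language code
#       api_name="/predict"
#   )
#   print(result)  # ➜ translated text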