import torch
import torchaudio
import soundfile as sf
import gradio as gr
from transformers import SeamlessM4TProcessor, SeamlessM4TModel

# Load model and processor
HF_MODEL_ID = "facebook/hf-seamless-m4t-medium"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = SeamlessM4TProcessor.from_pretrained(HF_MODEL_ID)
model = SeamlessM4TModel.from_pretrained(HF_MODEL_ID).to(device).eval()


def translate_backend(audio_path, src_lang, tgt_lang):
    try:
        waveform, sr = sf.read(audio_path)

        # Downmix multi-channel audio to mono and ensure float32 samples
        if waveform.ndim > 1:
            waveform = waveform.mean(axis=1)
        waveform = waveform.astype("float32")

        # SeamlessM4T expects 16 kHz audio; resample if the file uses another rate
        if sr != 16000:
            waveform_tensor = torch.tensor(waveform).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform_tensor).squeeze(0).numpy()
            sr = 16000

        inputs = processor(
            audios=waveform,
            sampling_rate=sr,
            return_tensors="pt",
            src_lang=src_lang,
        ).to(device)

        # generate_speech=False returns text tokens only (speech-to-text translation)
        with torch.no_grad():
            output = model.generate(**inputs, tgt_lang=tgt_lang, generate_speech=False)

        translated_text = processor.batch_decode(output.sequences, skip_special_tokens=True)[0]
        return translated_text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Standard Gradio launch (no API flag)
iface = gr.Interface(
    fn=translate_backend,
    inputs=[
        gr.Audio(type="filepath", label="🎤 Audio"),
        gr.Text(label="Source Language"),
        gr.Text(label="Target Language"),
    ],
    outputs="text",
    title="Kalpani iVoice Translate",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()  # No 'api=True'
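
# --- Usage sketch (commented out, not executed) ---
# A minimal example of calling the backend directly, bypassing the Gradio UI.
# "sample.wav" is a hypothetical local file; SeamlessM4T uses three-letter
# language codes such as "eng" (English) and "fra" (French).
#
#   print(translate_backend("sample.wav", "eng", "fra"))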