# Hugging Face Space: Kalpani iVoice Translate (page header residue removed;
# original scrape showed "Spaces: / Sleeping / Sleeping").
import torch
import torchaudio
import soundfile as sf
import gradio as gr
from transformers import SeamlessM4TProcessor, SeamlessM4TModel

# Load the SeamlessM4T processor and model once at import time so every
# request reuses the same weights. Prefer GPU when available.
HF_MODEL_ID = "facebook/hf-seamless-m4t-medium"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = SeamlessM4TProcessor.from_pretrained(HF_MODEL_ID)
# .eval() disables dropout/batch-norm training behavior for inference.
model = SeamlessM4TModel.from_pretrained(HF_MODEL_ID).to(device).eval()
def translate_backend(audio_path, src_lang, tgt_lang):
    """Translate speech in an audio file to text in another language.

    Args:
        audio_path: Path to an audio file readable by ``soundfile``.
        src_lang: Source language code (SeamlessM4T code, e.g. ``"eng"``).
        tgt_lang: Target language code for the translated text.

    Returns:
        The translated text, or an error string prefixed with "❌ Error:"
        so the failure is shown in the UI instead of crashing the request.
    """
    try:
        waveform, sr = sf.read(audio_path)
        # Downmix multi-channel audio to mono by averaging channels.
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=1)
        waveform = waveform.astype("float32")
        # SeamlessM4T expects 16 kHz input; resample when necessary.
        if sr != 16000:
            waveform_tensor = torch.tensor(waveform).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform_tensor).squeeze(0).numpy()
            sr = 16000
        inputs = processor(
            audios=waveform, sampling_rate=sr, return_tensors="pt", src_lang=src_lang
        ).to(device)
        with torch.no_grad():
            # generate_speech=False -> text-only decoding (speech-to-text translation).
            output = model.generate(**inputs, tgt_lang=tgt_lang, generate_speech=False)
        translated_text = processor.batch_decode(output.sequences, skip_special_tokens=True)[0]
        return translated_text
    except Exception as e:
        # Broad catch is deliberate: this is a UI boundary, and Gradio renders
        # the returned string. "❌" repairs the mojibake "β" in the original.
        return f"❌ Error: {str(e)}"
# Gradio UI: audio upload plus free-text source/target language codes,
# translated text as the single output. Flagging is disabled.
iface = gr.Interface(
    fn=translate_backend,
    inputs=[
        # "🎤" repairs the mojibake "π€" from the original extraction.
        gr.Audio(type="filepath", label="🎤 Audio"),
        gr.Text(label="Source Language"),
        gr.Text(label="Target Language"),
    ],
    outputs="text",
    title="Kalpani iVoice Translate",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Standard Gradio launch (no API flag / 'api=True').
    iface.launch()