Spaces:

kanyekuthi
/

dsn-afrispeech-demo

Sleeping

kanyekuthi committed on Jun 2

Commit

3814abf

verified ·

1 Parent(s): 5cc5855

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import torch
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import torchaudio
-model_id = "kanyekuthi/dsn_afrispeech"  # or your correct model repo ID
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
@@ -11,22 +11,19 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
 def transcribe(audio):
     waveform, sr = torchaudio.load(audio)
     if sr != 16000:
-        resampler = torchaudio.transforms.Resample(sr, 16000)
-        waveform = resampler(waveform)
     inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
     with torch.no_grad():
-        generated_ids = model.generate(**inputs)
-    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     return transcription
 iface = gr.Interface(
     fn=transcribe,
-    inputs=gr.Audio(source="microphone", type="filepath"),
     outputs="text",
-    title="Whisper-based ASR Demo"
 )
 iface.launch()

 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import torchaudio
+model_id = "kanyekuthi/dsn_afrispeech"
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
 def transcribe(audio):
     waveform, sr = torchaudio.load(audio)
     if sr != 16000:
+        waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
     inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
     with torch.no_grad():
+        predicted_ids = model.generate(**inputs)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 iface = gr.Interface(
     fn=transcribe,
+    inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
     outputs="text",
+    title="Whisper ASR: DSN Afrispeech"
 )
 iface.launch()