kanyekuthi committed
Commit 5cc5855 · verified · 1 Parent(s): a5ab88c

Update app.py

Files changed (1): app.py (+11, -15)
app.py CHANGED
@@ -1,36 +1,32 @@
 import gradio as gr
 import torch
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import torchaudio
-from transformers import AutoProcessor, AutoModelForCTC
 
-# Load model and processor
-model_id = "kanyekuthi/dsn_afrispeech"
+model_id = "kanyekuthi/dsn_afrispeech"  # or your correct model repo ID
+
 processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForCTC.from_pretrained(model_id)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
 
 def transcribe(audio):
-    # Load and resample audio to 16kHz if needed
     waveform, sr = torchaudio.load(audio)
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(sr, 16000)
         waveform = resampler(waveform)
-
-    # Run model
+
     inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
+
     with torch.no_grad():
-        logits = model(**inputs).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-
+        generated_ids = model.generate(**inputs)
+    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
     return transcription
 
-# Build Gradio interface
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs="text",
-    title="DSN Afrispeech Transcriber",
-    description="Speak into your mic and this ASR model will transcribe it."
+    title="Whisper-based ASR Demo"
 )
 
-iface.launch()
+iface.launch()
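
Reviewer note, not part of the commit: the new code keeps one caveat from the old version. torchaudio.load returns a (channels, samples) tensor, so waveform.squeeze() only yields the 1-D audio the processor expects when the recording is mono; a stereo file would still reach the processor as a 2-D tensor. A minimal sketch of a downmix-and-resample helper that transcribe() could use instead, with load_mono_16k as a hypothetical name:

import torch
import torchaudio

def load_mono_16k(path: str) -> torch.Tensor:
    # torchaudio.load returns (waveform, sample_rate); waveform is (channels, samples)
    waveform, sr = torchaudio.load(path)
    if waveform.size(0) > 1:
        # Average the channels to mono so the processor sees 1-D audio
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    return waveform.squeeze(0)

Gradio microphone input is typically mono, so in practice this mainly matters if the interface is later extended to accept uploaded audio files.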