hriteshMaikap committed on
Commit 670f5ce · verified · 1 Parent(s): 132669d

Create app.py

Files changed (1)
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
+ import gradio as gr
+ import torch
+ import torchaudio
+ import numpy as np
+ from transformers import Wav2Vec2BertProcessor, Wav2Vec2BertForCTC
+
+ # Load model and processor
+ repo_id = "hriteshMaikap/marathi-asr-model"
+ processor = Wav2Vec2BertProcessor.from_pretrained(repo_id)
+ model = Wav2Vec2BertForCTC.from_pretrained(repo_id)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = model.to(device)
+
+ def transcribe(audio):
+     # Process audio
+     waveform, sample_rate = torchaudio.load(audio)
+
+     # Resample if needed
+     if sample_rate != 16000:
+         resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+         waveform = resampler(waveform)
+
+     # Convert to mono if needed
+     if waveform.shape[0] > 1:
+         waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+     # Convert to numpy
+     speech_array = waveform.squeeze().numpy()
+
+     # Process and run inference
+     with torch.no_grad():
+         inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt").to(device)
+         logits = model(inputs.input_features).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+
+     # Decode the predicted IDs
+     transcription = processor.decode(predicted_ids[0])
+
+     return transcription
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(source="microphone", type="filepath"),
+     outputs="text",
+     title="Marathi Speech Recognition",
+     description="Record your voice in Marathi and get a transcription."
+ )
+
+ iface.launch()
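
Note: gr.Audio(source="microphone", type="filepath") follows the Gradio 3.x API; in Gradio 4.x the parameter is sources (a list of input sources). If the Space pins a newer Gradio release, the interface block would need roughly the sketch below. This is a hedged sketch, not part of the committed file: it assumes the transcribe function and imports defined in app.py above, and sample_marathi.wav is a hypothetical placeholder path used only to illustrate a local check.

    import gradio as gr

    # Gradio 4.x variant: `source=` was replaced by `sources=[...]`.
    iface = gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(sources=["microphone"], type="filepath"),
        outputs="text",
        title="Marathi Speech Recognition",
        description="Record your voice in Marathi and get a transcription.",
    )

    if __name__ == "__main__":
        # Optional local sanity check before launching the UI;
        # "sample_marathi.wav" is a placeholder, not a file in this repo.
        # print(transcribe("sample_marathi.wav"))
        iface.launch()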