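# Gradio demo for Odia speech recognition: uploaded audio is resampled to
# 16 kHz mono with ffmpeg, run through a fine-tuned Wav2Vec2 CTC model,
# and decoded to text.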
import subprocess

import soundfile as sf
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


def read_file_and_process(wav_file):
    # Resample the uploaded file to 16 kHz mono, then build model inputs.
    filename = wav_file.split('.')[0]
    filename_16k = filename + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs


def resampler(input_file_path, output_file_path):
    # Use ffmpeg to convert the input audio to 16 kHz, mono, 16-bit WAV.
    command = (
        f"ffmpeg -hide_banner -loglevel panic -i {input_file_path} -ar 16000 "
        f"-ac 1 -bits_per_raw_sample 16 -vn {output_file_path}"
    )
    subprocess.call(command, shell=True)


def parse_transcription(logits):
    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
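

# Optional alternative: language-model-assisted decoding. This is only a
# sketch and is not wired into the app; it assumes the chosen model repo
# ships a KenLM language model and that pyctcdecode/kenlm are installed.
#
# from transformers import Wav2Vec2ProcessorWithLM
# processor_lm = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
#
# def parse_transcription_with_lm(logits):
#     # batch_decode expects numpy logits and returns beam-search text.
#     return processor_lm.batch_decode(logits.numpy()).text[0]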


def parse(wav_file):
    # Full pipeline: resample, featurize, forward pass, greedy decode.
    if not wav_file:
        return ""
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():
        logits = model(**input_values).logits
    return parse_transcription(logits)
# model_id = "infinitejoy/wav2vec2-large-xls-r-300m-odia"
# working 50%
# model_id = "Harveenchadha/odia_large_wav2vec2"
# It worked when first run but after that getting error
model_id = "anuragshas/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "Ranjit/Whisper_Small_Odia_CV_11.0_5k_steps"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# This is hindi
# model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"

# Load the feature extractor/tokenizer and the fine-tuned CTC model.
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# input_ = gr.Audio(source="microphone", type="filepath")
# input_ = gr.inputs.File(source="upload", type="filepath")
input_ = gr.Audio(source="upload", type="filepath")
txtbox = gr.Textbox(
    label="Output from the model will appear here:",
    lines=5,
)

# chkbox = gr.Checkbox(label="Apply LM", value=False)
# gr.Interface(parse, inputs=[input_, chkbox], outputs=txtbox, ...)
gr.Interface(parse, inputs=[input_], outputs=txtbox,
             streaming=True, interactive=True,
             analytics_enabled=False, show_tips=False,
             enable_queue=True).launch(inline=False)