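# Gradio demo for Odia speech recognition: uploaded audio is resampled to
# 16 kHz mono with ffmpeg, run through a fine-tuned Wav2Vec2 CTC model,
# and decoded to text.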
import subprocess

import soundfile as sf
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


def read_file_and_process(wav_file):
    # Resample the uploaded file to 16 kHz mono, then build model inputs.
    filename = wav_file.split('.')[0]
    filename_16k = filename + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs


def resampler(input_file_path, output_file_path):
    # Use ffmpeg to convert the input audio to 16 kHz, mono, 16-bit WAV.
    command = (
        f"ffmpeg -hide_banner -loglevel panic -i {input_file_path} -ar 16000 "
        f"-ac 1 -bits_per_raw_sample 16 -vn {output_file_path}"
    )
    subprocess.call(command, shell=True)


def parse_transcription(logits):
    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
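

# Optional alternative: language-model-assisted decoding. This is only a
# sketch and is not wired into the app; it assumes the chosen model repo
# ships a KenLM language model and that pyctcdecode/kenlm are installed.
#
# from transformers import Wav2Vec2ProcessorWithLM
# processor_lm = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
#
# def parse_transcription_with_lm(logits):
#     # batch_decode expects numpy logits and returns beam-search text.
#     return processor_lm.batch_decode(logits.numpy()).text[0]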


def parse(wav_file):
    # Full pipeline: resample, featurize, forward pass, greedy decode.
    if not wav_file:
        return ""
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():
        logits = model(**input_values).logits
    return parse_transcription(logits)
# model_id = "infinitejoy/wav2vec2-large-xls-r-300m-odia"
# working 50%
# model_id = "Harveenchadha/odia_large_wav2vec2"
# It worked when first run but after that getting error
model_id = "anuragshas/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "Ranjit/Whisper_Small_Odia_CV_11.0_5k_steps"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# This is hindi
# model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"

# Load the feature extractor/tokenizer and the fine-tuned CTC model.
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# input_ = gr.Audio(source="microphone", type="filepath")
# input_ = gr.inputs.File(source="upload", type="filepath")
input_ = gr.Audio(source="upload", type="filepath")
txtbox = gr.Textbox(
    label="Output from the model will appear here:",
    lines=5,
)

# chkbox = gr.Checkbox(label="Apply LM", value=False)
# gr.Interface(parse, inputs=[input_, chkbox], outputs=txtbox, ...)
gr.Interface(parse, inputs=[input_], outputs=txtbox,
             streaming=True, interactive=True,
             analytics_enabled=False, show_tips=False,
             enable_queue=True).launch(inline=False)