Usage

Use the Python snippet below to run automatic speech recognition (ASR) with this model:

```python
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the fine-tuned model and its processor
processor = WhisperProcessor.from_pretrained("danhtran2mind/Vi-Whisper-Tiny-finetuning")
model = WhisperForConditionalGeneration.from_pretrained("danhtran2mind/Vi-Whisper-Tiny-finetuning")
model.config.forced_decoder_ids = None  # let generate() choose the decoder prompt

# Move the model to GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the audio file and resample it to the 16 kHz rate Whisper expects
audio_path = "<audio_path>"  # replace with the path to your audio file
audio, sr = librosa.load(audio_path, sr=16000)

# Preprocess the waveform into log-mel input features
inputs = processor(audio, sampling_rate=16000, return_tensors="pt").to(device)

# Run inference; max_length caps the number of generated tokens
with torch.no_grad():
    generated_ids = model.generate(
        inputs["input_features"],
        max_length=448,
    )

# Decode the generated token IDs into text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Transcription:\n", transcription)
```
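
Under the hood, the processor pads or truncates the waveform to Whisper's 30-second window and converts it to an 80-bin log-mel spectrogram. A quick way to inspect the resulting tensor shape, using a default-constructed `WhisperFeatureExtractor` (no model download) and one second of silence as a stand-in input:

```python
import numpy as np
from transformers import WhisperFeatureExtractor

# Default Whisper settings: 80 mel bins, 16 kHz sampling, 30 s window
fe = WhisperFeatureExtractor()

audio = np.zeros(16000, dtype=np.float32)  # 1 s of silence as a stand-in
feats = fe(audio, sampling_rate=16000, return_tensors="np")

# Inputs are padded to the full 30 s window: 80 mel bins x 3000 frames
print(feats["input_features"].shape)  # (1, 80, 3000)
```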