Whisper Small ru - slowlydoor (Automatic Speech Recognition)

This model is a fine-tuned version of openai/whisper-small on the Russian subset of the Common Voice 17.0 dataset. It achieves the following results on the evaluation set:

  • Loss: 0.2125
  • WER (word error rate): 16.0405
  • CER (character error rate): 4.2321
  • SER (sentence error rate): 57.5223
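WER and CER are computed with the evaluate library; SER is the share of utterances whose hypothesis is not an exact match of the stripped reference (see compute_metrics in the training code below). A minimal sketch of all three on toy strings, assuming only that evaluate and jiwer are installed:

import evaluate

refs = ["привет мир", "как дела"]
hyps = ["привет мир", "как дела сегодня"]

wer = 100 * evaluate.load("wer").compute(predictions=hyps, references=refs)
cer = 100 * evaluate.load("cer").compute(predictions=hyps, references=refs)
# SER: an utterance counts as wrong unless it matches the reference exactly
ser = 100 * sum(h.strip() != r.strip() for h, r in zip(hyps, refs)) / len(refs)
print(f"WER={wer:.2f} CER={cer:.2f} SER={ser:.2f}")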

Model description

More information needed

Intended uses & limitations

More information needed

Training and evaluation data

The model was fine-tuned on the train split and evaluated on the test split of the Russian portion of Common Voice 17.0 (see the training code below).

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 1e-05
  • train_batch_size: 8
  • eval_batch_size: 4
  • seed: 42
  • optimizer: AdamW (torch fused) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
  • lr_scheduler_type: linear
  • num_epochs: 3
  • mixed_precision_training: Native AMP

Training code

pip install --upgrade transformers datasets
pip install -q evaluate soundfile jiwer tensorboard
import re
import json
from datasets import load_dataset, DatasetDict, Audio
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os, numpy as np, torch, evaluate, jiwer
from huggingface_hub import login
from dataclasses import dataclass
from typing import Any, Dict, List, Union

login("***")  # Hugging Face token (redacted); required to download the gated Common Voice dataset


common_voice = DatasetDict()
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_17_0", "ru", split="train")
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_17_0", "ru", split="test")

# Drop metadata columns and resample the audio to the 16 kHz expected by Whisper
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Russian", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Russian", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None  # let the labels carry the language/task tokens during training
model.config.suppress_tokens = []
model.config.use_cache = False  # incompatible with gradient checkpointing

def prepare_dataset(batch):
    audio = batch["audio"]

    # Log-Mel spectrogram features for the encoder
    batch["input_features"] = feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Tokenized transcript as decoder labels
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# Extract features for every example and drop the raw columns afterwards
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

common_voice  # inspect the processed DatasetDict

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 marks padding in the labels; restore pad tokens so they can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str  = tokenizer.batch_decode(pred_ids,  skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Drop pairs with empty references, which would break WER/CER
    pairs = [(ref.strip(), hyp.strip()) for ref, hyp in zip(label_str, pred_str)]
    pairs = [(ref, hyp) for ref, hyp in pairs if len(ref) > 0]

    label_str, pred_str = zip(*pairs)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    # Sentence error rate: fraction of utterances that are not an exact match
    ser = 100 * (sum(p.strip() != r.strip() for p, r in zip(pred_str, label_str)) / len(pred_str))

    return {
        "wer":  wer,
        "cer":  cer,
        "ser":  ser
    }

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Pad audio features and token labels separately: they have different lengths
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the tokenizer already prepended the decoder start token, cut it off;
        # the model re-adds it during training
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
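A quick sanity check of the collator on two processed examples (a sketch; assumes the map step above has already run):

sample = [common_voice["train"][i] for i in range(2)]
batch = data_collator(sample)
print(batch["input_features"].shape)  # (2, 80, 3000): 80 Mel bins, 30 s of frames
print(batch["labels"].shape)          # (2, longest label length in the pair)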

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/models/whisper_small_ru_model_trainer_3ep",
    logging_dir="/content/drive/MyDrive/models/whisper_small_ru_model_trainer_3ep",
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    num_train_epochs=3,
    generation_max_length=170,
    logging_steps=25,
    eval_steps=500,
    save_steps=500,
    fp16=True,
    optim="adamw_torch_fused",
    torch_compile=True,
    gradient_checkpointing=True,
    learning_rate=1e-5,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,  # the "tokenizer" kwarg is deprecated in Transformers 4.52
)

trainer.train()
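
push_to_hub is disabled in the training arguments above, so publishing the checkpoint is a separate step. A hedged sketch of how the trained model and processor could be saved and uploaded (the repo id is the one this card lives under; the local path is illustrative):

trainer.save_model("whisper_small_ru_model_trainer_3ep")
processor.save_pretrained("whisper_small_ru_model_trainer_3ep")

# Upload to the Hub (requires a write token from the earlier login)
model.push_to_hub("internalhell/whisper_small_ru_model_trainer_3ep")
processor.push_to_hub("internalhell/whisper_small_ru_model_trainer_3ep")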

Test result

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

MODEL_HUG = "internalhell/whisper_small_ru_model_trainer_3ep"

processor = None
model = None
pipe = None

def get_model_pipe():
    global model, processor, pipe
    if model is None or processor is None:
        processor = WhisperProcessor.from_pretrained(MODEL_HUG, language="russian")
        model = WhisperForConditionalGeneration.from_pretrained(MODEL_HUG)

        # Pin the decoder prompt to Russian transcription
        model.generation_config.forced_decoder_ids = None
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ru", task="transcribe")

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=0 if torch.cuda.is_available() else -1,
        )

    return model

def recognize_audio_pipe(audio_path):
    model = get_model_pipe()

    waveform, sr = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0, keepdim=True)  # downmix to mono

    # Whisper expects 16 kHz input
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)
        sr = 16000

    waveform_np = waveform.squeeze(0).numpy()
    return pipe({"array": waveform_np, "sampling_rate": sr})["text"]

print(recognize_audio_pipe("test.wav"))  # .wav input only
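
torchaudio is used above, hence the .wav restriction. As a sketch of one way to accept other formats (mp3, flac, ogg), librosa can decode and resample in a single call; this helper is an assumption, not part of the card's tested path:

import librosa

def recognize_audio_any(audio_path):
    get_model_pipe()
    # librosa decodes most common formats and resamples to 16 kHz mono
    waveform_np, sr = librosa.load(audio_path, sr=16000, mono=True)
    return pipe({"array": waveform_np, "sampling_rate": sr})["text"]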

Training results

Training Loss  Epoch   Step  Cer     Validation Loss  Ser      Wer
0.2206         0.1516  500   5.4963  0.2603           69.4306  21.2669
0.2200         0.3032  1000  5.3823  0.2467           67.3527  20.2971
0.1901         0.4548  1500  5.1160  0.2377           66.1766  19.5642
0.1969         0.6064  2000  5.0754  0.2273           64.3242  19.0509
0.1743         0.7580  2500  4.8523  0.2188           63.1481  18.2286
0.1747         0.9096  3000  4.8867  0.2167           62.4032  18.0985
0.0770         1.0612  3500  4.5272  0.2142           60.5998  17.2007
0.0839         1.2129  4000  4.4628  0.2126           60.8743  17.1601
0.0888         1.3645  4500  4.4864  0.2092           60.3940  17.3529
0.0690         1.5161  5000  4.4667  0.2118           60.1588  17.1578
0.0609         1.6677  5500  4.4298  0.2077           59.3355  16.8546
0.0721         1.8193  6000  4.3442  0.2060           58.6592  16.5527
0.0681         1.9709  6500  4.3284  0.2038           58.1692  16.3575
0.0322         2.1225  7000  4.2709  0.2130           57.7771  16.2809
0.0277         2.2741  7500  4.2543  0.2151           57.4733  16.1067
0.0249         2.4257  8000  4.2513  0.2130           57.4635  16.0741
0.0234         2.5773  8500  4.2832  0.2150           57.6693  16.2600
0.0264         2.7289  9000  4.2645  0.2145           57.6301  16.1160
0.0268         2.8805  9500  4.2321  0.2125           57.5223  16.0405

Framework versions

  • Transformers 4.52.4
  • Pytorch 2.6.0+cu124
  • Datasets 3.6.0
  • Tokenizers 0.21.1
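
To approximate this environment (a sketch; the cu124 PyTorch build comes from the PyTorch wheel index, and evaluate/jiwer are needed for the metrics):

pip install transformers==4.52.4 datasets==3.6.0 tokenizers==0.21.1 evaluate jiwer
pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124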