bond005
/

whisper-podlodka-turbo

@@ -141,7 +141,7 @@ from transformers import pipeline  # for working with Whisper-Podlodka-Turbo
 import wget  # for downloading demo sound from its URL
 from whisper_lid.whisper_lid import detect_language_in_speech  # for spoken language detection
-model_id = "openai/whisper-podlodks-turbo"  # the best Whisper model :-)
 target_sampling_rate = 16_000  # Hz
 asr = pipeline(model=model_id, device_map='auto', torch_dtype='auto')
@@ -149,7 +149,7 @@ asr = pipeline(model=model_id, device_map='auto', torch_dtype='auto')
 # An example of speech recognition in Russian, spoken by a native speaker of this language
 sound_ru_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_ru.wav'
 sound_ru_name = wget.download(sound_ru_url)
-sound_ru = librosa.load(sound_ru_name, sr=TARGET_SR, mono=True)[0]
 print('Duration of sound with Russian speech = {0:.3f} seconds.'.format(
     sound_ru.shape[0] / target_sampling_rate
 ))
@@ -173,7 +173,7 @@ print(recognition_result['text'] + '\n')
 # An example of speech recognition in English, pronounced by a non-native speaker of that language with an accent
 sound_en_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_en.wav'
 sound_en_name = wget.download(sound_en_url)
-sound_en = librosa.load(sound_en_name, sr=TARGET_SR, mono=True)[0]
 print('Duration of sound with English speech = {0:.3f} seconds.'.format(
     sound_en.shape[0] / target_sampling_rate
 ))
@@ -263,7 +263,7 @@ Along with special language tokens, the model can also return the special token
 ```python
 nonspeech_sound_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_nonspeech.wav'
 nonspeech_sound_name = wget.download(nonspeech_sound_url)
-nonspeech_sound = librosa.load(nonspeech_sound_name, sr=TARGET_SR, mono=True)[0]
 print('Duration of sound without speech = {0:.3f} seconds.'.format(
     nonspeech_sound.shape[0] / target_sampling_rate
 ))
@@ -337,7 +337,7 @@ The model was fine-tuned on a composite dataset including:
 - [Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0) (Ru, En)
 - [Podlodka Speech](https://huggingface.co/datasets/bond005/podlodka_speech) (Ru)
 - [Taiga Speech](https://huggingface.co/datasets/bond005/taiga_speech_v2) (Ru, synthetic)
-- [Golos Farfield](https://huggingface.co/datasets/bond005/sberdevices_golos_100h_farfield) and [Golos Crows](https://huggingface.co/datasets/bond005/sberdevices_golos_10h_crowd) (Ru)
 - [Sova Rudevices](https://huggingface.co/datasets/bond005/sova_rudevices) (Ru)
 - [Audioset](https://huggingface.co/datasets/bond005/audioset-nonspeech) (non-speech audio)
@@ -375,7 +375,7 @@ The quality of the Russian speech recognition task was tested on test sub-sets o
 - [Common Voice 11 Ru](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)
 - [Podlodka Speech](https://huggingface.co/datasets/bond005/podlodka_speech)
 - [Golos Farfield](https://huggingface.co/datasets/bond005/sberdevices_golos_100h_farfield)
-- [Golos Crows](https://huggingface.co/datasets/bond005/sberdevices_golos_10h_crowd)
 - [Sova Rudevices](https://huggingface.co/datasets/bond005/sova_rudevices)
 - [Russian Librispeech](https://huggingface.co/datasets/bond005/rulibrispeech)

 import wget  # for downloading demo sound from its URL
 from whisper_lid.whisper_lid import detect_language_in_speech  # for spoken language detection
+model_id = "bond005/whisper-podlodka-turbo"  # the best Whisper model :-)
 target_sampling_rate = 16_000  # Hz
 asr = pipeline(model=model_id, device_map='auto', torch_dtype='auto')
 # An example of speech recognition in Russian, spoken by a native speaker of this language
 sound_ru_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_ru.wav'
 sound_ru_name = wget.download(sound_ru_url)
+sound_ru = librosa.load(sound_ru_name, sr=target_sampling_rate, mono=True)[0]
 print('Duration of sound with Russian speech = {0:.3f} seconds.'.format(
     sound_ru.shape[0] / target_sampling_rate
 ))
 # An example of speech recognition in English, pronounced by a non-native speaker of that language with an accent
 sound_en_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_en.wav'
 sound_en_name = wget.download(sound_en_url)
+sound_en = librosa.load(sound_en_name, sr=target_sampling_rate, mono=True)[0]
 print('Duration of sound with English speech = {0:.3f} seconds.'.format(
     sound_en.shape[0] / target_sampling_rate
 ))
 ```python
 nonspeech_sound_url = 'https://huggingface.co/bond005/whisper-podlodka-turbo/resolve/main/test_sound_nonspeech.wav'
 nonspeech_sound_name = wget.download(nonspeech_sound_url)
+nonspeech_sound = librosa.load(nonspeech_sound_name, sr=target_sampling_rate, mono=True)[0]
 print('Duration of sound without speech = {0:.3f} seconds.'.format(
     nonspeech_sound.shape[0] / target_sampling_rate
 ))
 - [Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0) (Ru, En)
 - [Podlodka Speech](https://huggingface.co/datasets/bond005/podlodka_speech) (Ru)
 - [Taiga Speech](https://huggingface.co/datasets/bond005/taiga_speech_v2) (Ru, synthetic)
+- [Golos Farfield](https://huggingface.co/datasets/bond005/sberdevices_golos_100h_farfield) and [Golos Crowd](https://huggingface.co/datasets/bond005/sberdevices_golos_10h_crowd) (Ru)
 - [Sova Rudevices](https://huggingface.co/datasets/bond005/sova_rudevices) (Ru)
 - [Audioset](https://huggingface.co/datasets/bond005/audioset-nonspeech) (non-speech audio)
 - [Common Voice 11 Ru](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)
 - [Podlodka Speech](https://huggingface.co/datasets/bond005/podlodka_speech)
 - [Golos Farfield](https://huggingface.co/datasets/bond005/sberdevices_golos_100h_farfield)
+- [Golos Crowd](https://huggingface.co/datasets/bond005/sberdevices_golos_10h_crowd)
 - [Sova Rudevices](https://huggingface.co/datasets/bond005/sova_rudevices)
 - [Russian Librispeech](https://huggingface.co/datasets/bond005/rulibrispeech)