VCL3D committed on
Commit
1e6d776
·
verified ·
1 Parent(s): 0715768

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +2 -37
README.md CHANGED
@@ -3,7 +3,7 @@ license: apache-2.0
3
  ---
4
  ONNX format of voxreality/whisper-small-el-adapters model
5
 
6
- Run on CPU
7
 
8
  ```python
9
  from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
@@ -14,48 +14,13 @@ from datasets import Audio, load_dataset
14
  model_id = "voxreality/whisper-small-el-adapters-onnx"
15
  processor = WhisperProcessor.from_pretrained(model_id)
16
  model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id)
17
- # model.save_pretrained("whisper-small-el-finetune_onnx")
18
- forced_decoder_ids = processor.get_decoder_prompt_ids(language="greek", task="transcribe")
19
- # load streaming dataset and read first audio sample
20
- ds = load_dataset("mozilla-foundation/common_voice_11_0", "el", split="test", streaming=True)
21
- ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
22
- input_speech = next(iter(ds))["audio"]
23
- input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
24
-
25
- # Generate token ids (without 'use_adapters', as it is not needed)
26
- predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
27
-
28
- # Decode token ids to text
29
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
30
-
31
- print(transcription)
32
- ```
33
-
34
- Run on GPU
35
-
36
- ```python
37
- from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
38
- from transformers import WhisperProcessor
39
- from datasets import Audio, load_dataset
40
-
41
- model_id = "voxreality/whisper-small-el-adapters-onnx" # ONNX model path
42
-
43
- # Load model with CUDA provider for GPU inference
44
- model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id, provider="CUDAExecutionProvider")
45
-
46
- # Load processor
47
- processor = WhisperProcessor.from_pretrained("voxreality/whisper-small-el-adapters")
48
-
49
- # Get decoder prompt ids
50
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="greek", task="transcribe")
51
 
52
  # Load streaming dataset and read first audio sample
53
  ds = load_dataset("mozilla-foundation/common_voice_11_0", "el", split="test", streaming=True)
54
  ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
55
  input_speech = next(iter(ds))["audio"]
56
-
57
- # Process input features
58
- input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features.to('cuda') # Move inputs to GPU
59
 
60
  # Generate token ids (without 'use_adapters', as it is not needed)
61
  predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
 
3
  ---
4
  ONNX format of voxreality/whisper-small-el-adapters model
5
 
6
+ Model inference example:
7
 
8
  ```python
9
  from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
 
14
  model_id = "voxreality/whisper-small-el-adapters-onnx"
15
  processor = WhisperProcessor.from_pretrained(model_id)
16
  model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="greek", task="transcribe")
18
 
19
  # Load streaming dataset and read first audio sample
20
  ds = load_dataset("mozilla-foundation/common_voice_11_0", "el", split="test", streaming=True)
21
  ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
22
  input_speech = next(iter(ds))["audio"]
23
+ input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
 
 
24
 
25
  # Generate token ids (without 'use_adapters', as it is not needed)
26
  predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)