Speech Tokenizer
Collection
Multilingual discrete speech tokenizer for LLM.
•
6 items
•
Updated
Add a convolution layer with stride 2 to introduce 25 TPS with 32768 VQ embedding size.
This model introduces VQ on top of mesolitica/whisper-conv-large-v3-turbo.
WandB run at https://wandb.ai/huseinzol05/whisperconv?nw=nwuserhuseinzol05
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer
import librosa

# Encode an audio file into discrete VQ speech-token ids with the
# whisper-conv VQ encoder (32k codebook, 25 tokens per second).
model_id = "mesolitica/whisper-conv-VQ-32k-large-v3-turbo"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code = True, torch_dtype = 'auto').cuda()
encoder = model.model.get_encoder()

# Resample the audio to the rate the feature extractor expects.
y, sr = librosa.load('common_voice_ba_26517811.mp3', sr = feature_extractor.sampling_rate)
features = feature_extractor([y], return_tensors = 'pt', return_attention_mask = True)

# Move every prepared input tensor onto the GPU before encoding.
for key in features.keys():
    features[key] = features[key].cuda()

encoded = encoder(**features)
# encoded[1] holds the VQ token ids, encoded[2] the validity mask;
# print only the non-padded token ids of the first (and only) sample.
print(encoded[1][0, encoded[2][0] == 1])
tensor([14135, 7585, 12890, 32383, 15559, 4515, 252, 32713, 252, 16296,
3050, 18175, 15733, 5619, 5619, 1770, 7520, 32041, 26287, 8139,
8453, 28652, 4327, 26837, 20927, 26620, 12310, 12310, 12938, 29755,
29755, 18102, 5597, 8076, 8076, 8076, 9772, 31738, 31738, 1856,
24397, 27124, 5538, 1970, 29984, 8891, 20453, 20453, 1815, 1465,
1465, 26893, 5597, 9531, 11871, 11871, 6484, 21016, 14653, 18417,
9598, 9598, 30138, 27531, 18071, 18071, 30147, 24892, 434, 16557,
30589, 25516, 30876, 30876, 32039, 29394, 27996, 10042, 1939, 16692,
8163, 16665, 16665, 4507, 28100, 31251, 3051, 3051, 12157, 19865,
27147, 27357, 21524, 19750, 20016, 9031, 20016, 13475, 30149, 30149,
21785, 4176, 24032, 19334, 17387, 31375, 2659, 16509, 31672, 7785,
10352, 30063, 8518, 30730, 29357, 28538, 7072], device='cuda:0')
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer
import librosa

# Transcribe an audio file with the whisper-conv VQ model, forcing the
# decoder to transcribe Russian without timestamps.
model_id = "mesolitica/whisper-conv-VQ-32k-large-v3-turbo"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
# Fix: `tokenizer` was used below but never instantiated — only the
# AutoTokenizer class was imported.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code = True, torch_dtype = 'auto').cuda()

# Resample the audio to the rate the feature extractor expects.
y, sr = librosa.load('common_voice_ba_26517811.mp3', sr = feature_extractor.sampling_rate)

# Whisper-style decoder prompt: start of transcript, Russian,
# transcription task, no timestamps.
input_ids = tokenizer(
    '<|startoftranscript|><|ru|><|transcribe|><|notimestamps|>',
    add_special_tokens = False, return_tensors = 'pt')['input_ids']
features = feature_extractor([y], return_tensors = 'pt', return_attention_mask = True)
features['decoder_input_ids'] = input_ids

# Move every prepared input tensor onto the GPU before generation.
for k in features.keys():
    features[k] = features[k].cuda()

generate_kwargs = dict(
    **features,
    max_new_tokens=1024,
)
generation_output = model.generate(**generate_kwargs)
tokenizer.decode(generation_output[0])
Output:
<|startoftranscript|><|ru|><|transcribe|><|notimestamps|> Кубах сирта был холква кешене битарафлыг сирпаса.<|endoftext|>
Evaluated on malaysia-ai/common_voice_17_0/test, with the following decoder prompt condition:
<|startoftranscript|><|{lang}|><|transcribe|><|notimestamps|>
.
Source code at https://github.com/mesolitica/malaya-speech/tree/master/session/whisper-conv