Speech Tokenizer
Collection
Multilingual discrete speech tokenizer for LLM.
•
6 items
•
Updated
Combines the mesolitica/gemma-3n-e4b-it-audio-encoder Encoder + Projection + VQ + Projection Layer Norm with the openai/whisper-large-v3-turbo Decoder.
This model introduces VQ on top of mesolitica/gemma3n-audio-encoder-whisper-decoder.
This is the most compressed speech token model, running at 6.25 TPS with a 65536 embedding size.
WandB logs at https://wandb.ai/huseinzol05/gemma3n-audio-vq-whisper-decoder-65k
# Example: extract discrete speech tokens from an audio file using the
# VQ encoder of the combined gemma3n-audio + whisper-decoder model.
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer
import librosa

model_id = "mesolitica/gemma3n-audio-encoder-VQ-65k-whisper-decoder"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code = True, torch_dtype = 'auto').cuda()
encoder = model.model.get_encoder()

# Load the audio at the sampling rate the feature extractor expects.
y, sr = librosa.load('218757.mp3', sr = feature_extractor.sampling_rate)
features = feature_extractor([y], return_tensors = 'pt')

# Move the prepared inputs onto the GPU before encoding.
for key in ('input_features', 'input_features_mask'):
    features[key] = features[key].cuda()

# The encoder returns a pair; the second element is the discrete token ids.
_, tokens = encoder(**features)
print(tokens)
tensor([ 910, 32677, 16546, 53781, 27314, 15398, 31002, 5847, 28410, 7585,
53632, 47416, 38022, 6021, 48561, 20577, 29356, 16390, 6384, 7237,
53567, 15756, 52487, 12716, 218, 27008, 25496, 27415, 14668, 41049,
25329, 8556, 27467, 56087, 2976, 46430, 46014, 13764, 11178, 34157,
11391, 46160, 46693, 14112, 23788, 2958, 15187, 37938, 60867, 20024,
25597, 36378, 39933, 23820, 2392, 18068, 8010, 17651, 5731, 4433,
40480, 30643, 55976, 46888, 60327, 53716, 30826, 26525, 11720, 7492,
26675, 28757, 10368, 10900, 31609, 32883, 59085, 34244, 215, 43827,
5471, 26865, 26593], device='cuda:0')
# Example: end-to-end transcription, forcing the decoder to transcribe
# Russian without timestamps via a Whisper-style prompt.
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer
import librosa

model_id = "mesolitica/gemma3n-audio-encoder-VQ-65k-whisper-decoder"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code = True, torch_dtype = 'auto').cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the audio at the sampling rate the feature extractor expects.
y, sr = librosa.load('common_voice_ba_26517811.mp3', sr = feature_extractor.sampling_rate)

# Whisper-style decoder prompt: language, task, and timestamp control tokens.
prompt = '<|startoftranscript|><|ru|><|transcribe|><|notimestamps|>'
input_ids = tokenizer(prompt, add_special_tokens = False, return_tensors = 'pt')['input_ids']

features = feature_extractor([y], return_tensors = 'pt')

# Move inputs onto the GPU and wire up the masks/prompt for generation.
for key in ('input_features', 'input_features_mask'):
    features[key] = features[key].cuda()
features['attention_mask'] = features['input_features_mask']
features['decoder_input_ids'] = input_ids.cuda()

generation_output = model.generate(**features, max_new_tokens=1024)
tokenizer.decode(generation_output[0])
Output:
<|startoftranscript|><|ru|><|transcribe|><|notimestamps|> Купыкта был широкое глобка шляпше на битапсы.<|endoftext|>
Evaluated on malaysia-ai/common_voice_17_0/test with the following decoder prompt:
<|startoftranscript|><|{lang}|><|transcribe|><|notimestamps|>
.
Source code at https://github.com/mesolitica/malaya-speech/tree/master/session/gemma3n-audio-whisper-decoder