Audio Language Model
Collection
Open source models including Malaysian context and dataset.
•
23 items
•
Updated
•
2
Audio model on top of mesolitica/Malaysian-Qwen2.5-7B-Instruct.
This model focuses on audio understanding: its purpose is to introduce audio datasets to the LLM.
The dataset totals 6.71B tokens, or 25557.47 audio hours.
Because most of the dataset is about audio understanding, please use mesolitica/Malaysian-Qwen2.5-7B-Speech-Instruct instead for End-to-End Speech-LLM chat instructions.
from transformers import AutoProcessor, AutoModel
from transformers import TextStreamer
import librosa
import torch
# Example: run the audio-instruct model locally with Hugging Face transformers.
# Requires a CUDA device and the audio file referenced in `conversation`.

MODEL_NAME = 'mesolitica/Malaysian-Qwen2.5-7B-Audio-Instruct'

# trust_remote_code=True allows transformers to execute the model/processor
# code that is shipped inside the checkpoint repository.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype='auto',
    trust_remote_code=True,
)
_ = model.cuda()

processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# A single user turn that mixes one audio clip with a text instruction.
conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": 'speech/mallm-2.mp3'},
        {"type": "text", "text": 'translate to french'},
    ]},
]

# Render the chat template to a prompt string; tokenization happens later in
# the processor call together with the audio.
text = processor.apply_chat_template(
    conversation, add_generation_prompt=True, tokenize=False
)

# Collect the waveform of every audio element, in conversation order,
# resampled to the rate the feature extractor is configured with.
audios = []
for message in conversation:
    if isinstance(message["content"], list):
        for ele in message["content"]:
            if ele["type"] == "audio":
                waveform, _sr = librosa.load(
                    ele['audio_url'],
                    sr=processor.feature_extractor.sampling_rate,
                )
                audios.append(waveform)

inputs = processor(
    text=text, audios=audios, return_tensors="pt", padding=True
).to('cuda')
# Cast the audio features to the dtype the model was loaded with
# (torch_dtype='auto' above), so they match the model weights.
inputs['input_features'] = inputs['input_features'].to(model.dtype)

# Inference only: no gradients are needed.
with torch.no_grad():
    generate_kwargs = dict(
        **inputs,
        max_new_tokens=2048,
        top_p=0.95,
        top_k=50,
        temperature=0.01,  # very low temperature with sampling enabled
        do_sample=True,
        repetition_penalty=1.05,
    )
    generation_output = model.generate(**generate_kwargs)

# Drop the prompt tokens so only the newly generated tokens are decoded.
generate_ids = generation_output[:, inputs.input_ids.size(1):]
output = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(output)
Output:
"Les conséquences de l'acte de syirik sont : A - détruire la foi ; B - commettre un grand péché ; C - être murtad ; D - recevoir le châtiment dans l'au-delà"
You can serve the model in vLLM using this fork: https://github.com/mesolitica/vllm-llmaudio
import base64
from openai import OpenAI
# Example: query the model through an OpenAI-compatible endpoint.
# Expects a server listening on localhost:8001 and the audio file below.

# Placeholder key for the client constructor; presumably the local server
# does not validate it (confirm against your deployment).
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8001/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The audio is sent inline, so read the raw bytes and base64-encode them
# into a UTF-8 string.
with open('speech/mallm-2.mp3', 'rb') as fopen:
    audio_base64 = base64.b64encode(fopen.read()).decode('utf-8')

model = 'mesolitica/Malaysian-Qwen2.5-7B-Audio-Instruct'

# One user message carrying the encoded audio followed by a text instruction.
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "input_audio",
                "input_audio": {
                    "data": audio_base64,
                    "format": "mp3"
                },
            },
            {
                "type": "text",
                "text": "explain the audio"
            },
        ],
    }],
    model=model,
    max_completion_tokens=1024,
    temperature=0.6,
    top_p=0.9,
)
Output:
ChatCompletion(id='chatcmpl-4343d53b608249c49cc56257b7fc6c72', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The speaker is listing the consequences of shirk (polytheism) in a straightforward and matter-of-fact manner. Here’s a breakdown of each point:\n\n1. **A. Rosak akidah (Rusak iman)**: This means "ruining one\'s faith." The speaker is emphasizing that shirk directly affects one\'s spiritual well-being and faith.\n\n2. **B. Berdosa besar (Berbuat dosa besar)**: This translates to "committing major sins." The speaker is highlighting that shirk is considered a grave sin in Islam, which can lead to severe spiritual consequences.\n\n3. **C. Menjadi murtad (Menjadi muhaddid)**: This means "becoming an apostate." The speaker is pointing out that shirk can lead to a person being labeled as an apostate, which has serious legal and social implications.\n\n4. **D. Mendapat azab di akhirat (Menerima azab di akhirat)**: This translates to "receiving punishment in the afterlife." The speaker is indicating that the ultimate consequence of shirk is eternal punishment in the hereafter.\n\nThe tone is serious and educational, typical of religious teachings where the gravity of certain actions is emphasized. The speaker is likely addressing an audience that is familiar with Islamic concepts and is providing a clear and concise explanation of the consequences of shirk.', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None)], created=1750396026, model='mesolitica/Malaysian-Qwen2.5-7B-Audio-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=285, prompt_tokens=569, total_tokens=854, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, kv_transfer_params=None)
Source code is available at https://github.com/mesolitica/malaya/tree/master/session/audiollm