This model is intended for debugging only. It is randomly initialized using the configuration of Qwen/Qwen2-Audio-7B-Instruct, scaled down to a much smaller size.

Code:

import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig,
                          Qwen2AudioForConditionalGeneration, pipeline,
                          set_seed)

# Reference checkpoint that supplies the config, generation config and
# processor, and the local directory the shrunken copy is written to.
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

# Start from the reference config and shrink both sub-configs.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
_audio_overrides = {
    "encoder_layers": 2,
    "encoder_attention_heads": 2,
    "encoder_ffn_dim": 32,
    "d_model": 16,
}
for _key, _value in _audio_overrides.items():
    setattr(config.audio_config, _key, _value)
_text_overrides = {
    "num_hidden_layers": 2,
    "intermediate_size": 32,
    "hidden_size": 16,
    "num_attention_heads": 2,
    "num_key_value_heads": 1,
}
for _key, _value in _text_overrides.items():
    setattr(config.text_config, _key, _value)

# Build the model from the shrunken config (no pretrained weights are loaded),
# then cast to bfloat16, move to the GPU and switch to eval mode.
model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16)
model = model.cuda()
model = model.eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)

# Overwrite every parameter with uniform noise in [-0.3, 0.3]. Seeding first
# and visiting parameters in name order keeps the result reproducible.
set_seed(42)
with torch.no_grad():
    for _name, _param in sorted(
            model.named_parameters(), key=lambda item: item[0]):
        torch.nn.init.uniform_(_param, -0.3, 0.3)

# Save the model together with the reference processor, then list the output.
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    """Smoke-test the saved checkpoint on a two-turn audio conversation.

    Reloads the processor and model from ``save_path``, downloads the audio
    clips referenced by the conversation, runs ``generate`` and prints the
    decoded continuation. Requires network access and ``librosa``.
    """
    from io import BytesIO
    from urllib.request import urlopen

    import librosa
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)

    # Collect the waveforms in conversation order, resampled to the rate the
    # feature extractor expects.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for message in conversation:
        if not isinstance(message["content"], list):
            continue  # plain-text turns carry no audio
        for ele in message["content"]:
            if ele["type"] != "audio":
                continue
            # Close the HTTP response as soon as the bytes are read.
            with urlopen(ele["audio_url"]) as response:
                audio_bytes = BytesIO(response.read())
            audios.append(librosa.load(audio_bytes, sr=sampling_rate)[0])

    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    # Move every tensor in the batch to the model's device. Assigning to
    # ``inputs.input_ids`` would only set an attribute on the batch object and
    # leave the mapping that ``**inputs`` unpacks untouched.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].size(1)

    # NOTE(review): ``max_length`` bounds prompt + generated tokens together;
    # confirm 256 leaves room once the audio placeholders are expanded.
    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the echoed prompt so only newly generated tokens are decoded.
    generate_ids = generate_ids[:, prompt_length:]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)


# Run the inference check defined above; it reloads the model from save_path.
try_inference()
Downloads last month
44
Safetensors
Model size
5.03M params
Tensor type
BF16
·
Inference Providers NEW
This model is not currently available via any of the supported third-party Inference Providers, and the model is not deployed on the HF Inference API.

Collection including yujiepan/qwen2-audio-tiny-random