Update README.md
README.md CHANGED

@@ -20,64 +20,81 @@ Corresponding paper: https://arxiv.org/abs/2505.14142
**Removed:** the previous usage example imported `AutoProcessor` and `AutoModelForCausalLM` from `transformers`, loaded the checkpoint with `torch_dtype=torch.bfloat16`, and decoded with a bare `processor.batch_decode(output_ids, skip_special_tokens=True)` followed by `print(response)`. The unchanged context lines (audio loading, the example question and prompt, the user message, and the expected-output comments) appear in the updated section below.
**Added** (new README lines 20-100):

To use `AudSemThinker-QA-GRPO` for audio question answering, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, `soundfile`, and `qwen-omni-utils` (which provides `process_mm_info`) installed.

```python
import soundfile as sf
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
import torchaudio

# default: Load the model on the available device(s)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "gijs/audsemthinker-qa-grpo",
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
#     "gijs/audsemthinker-qa-grpo",
#     torch_dtype="auto",
#     device_map="auto",
#     attn_implementation="flash_attention_2",
#     trust_remote_code=True,
#     low_cpu_mem_usage=True
# )

processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker-qa-grpo", trust_remote_code=True)

# Load and preprocess audio
audio_file = "path/to/your/audio.wav"
audio_input, sampling_rate = torchaudio.load(audio_file)
if sampling_rate != processor.feature_extractor.sampling_rate:
    audio_input = torchaudio.transforms.Resample(
        orig_freq=sampling_rate,
        new_freq=processor.feature_extractor.sampling_rate
    )(audio_input)
audio_input = audio_input.squeeze().numpy()

# Example multiple-choice question
question = "What type of sound is present in the audio? Options: (A) Speech (B) Music (C) Environmental Sound (D) Silence"
user_prompt_text = f"You are given a question and an audio clip. Your task is to answer the question based on the audio clip. First, think about the question and the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then answer the question based on the audio clip, put your answer in <answer> and </answer> tags.\nQuestion: {question}"

# Conversation format
conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": audio_input},
            {"type": "text", "text": user_prompt_text}
        ],
    },
]

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation)
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True
)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: Generation of the output
output_ids = model.generate(**inputs, max_new_tokens=512)
response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(response[0])

# Expected output format for QA:
# <think>...detailed reasoning about the audio scene and question...</think>
# <semantic_elements>...list of identified semantic descriptors...</semantic_elements>
```
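Because the model wraps its output in `<think>`, `<semantic_elements>`, and `<answer>` tags, a small post-processing step can pull out just the final answer. The sketch below is illustrative rather than part of the model card: the `extract_tag` helper and the hard-coded sample string (standing in for `response[0]` from the example above) are assumptions, and the only thing it relies on is the tag format shown in the expected output.

```python
import re

def extract_tag(text: str, tag: str) -> str:
    """Return the content of the first <tag>...</tag> span, or an empty string if absent."""
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text, flags=re.DOTALL)
    return match.group(1).strip() if match else ""

# In practice, pass `response[0]` from the generation example above; this
# made-up string only mimics the expected output format for demonstration.
generated_text = (
    "<think>The clip contains birdsong and wind, with no speech or music.</think>"
    "<semantic_elements>sound-generating entities: birds, wind; setting: outdoors</semantic_elements>"
    "<answer>(C) Environmental Sound</answer>"
)

print(extract_tag(generated_text, "think"))              # reasoning trace
print(extract_tag(generated_text, "semantic_elements"))  # semantic descriptors
print(extract_tag(generated_text, "answer"))             # -> (C) Environmental Sound
```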