gijs committed · verified
Commit 2d3f2ac · 1 Parent(s): b7fa05d

Update README.md

Files changed (1)
  1. README.md +50 -33
README.md CHANGED
@@ -23,64 +23,81 @@ This model is built upon the `Qwen2.5-Omni-7B` multimodal foundation model and i
 To use `AudSemThinker-QA` for audio question answering, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, and `soundfile` installed.
 
 ```python
-from transformers import AutoProcessor, AutoModelForCausalLM
-import torch
-import torchaudio
 import soundfile as sf
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+import torchaudio
 
-# Load processor and model
-processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker-qa", trust_remote_code=True)
-model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-    "gijs/audsemthinker-qa",
-    torch_dtype=torch.bfloat16,
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+    "gijs/audsemthinker-qa",
+    torch_dtype="auto",
     device_map="auto",
     trust_remote_code=True,
-    low_cpu_mem_usage=True,
+    low_cpu_mem_usage=True
 )
 
-# Example audio file (replace with your audio path)
-audio_file = "path/to/your/audio.wav"
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+#     "gijs/audsemthinker-qa",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+#     trust_remote_code=True,
+#     low_cpu_mem_usage=True
+# )
+
+processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker-qa", trust_remote_code=True)
 
+# Load and preprocess audio
+audio_file = "path/to/your/audio.wav"
 audio_input, sampling_rate = torchaudio.load(audio_file)
 if sampling_rate != processor.feature_extractor.sampling_rate:
-    audio_input = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=processor.feature_extractor.sampling_rate)(audio_input)
-audio_input = audio_input.squeeze().numpy()  # Ensure mono and numpy array
+    audio_input = torchaudio.transforms.Resample(
+        orig_freq=sampling_rate,
+        new_freq=processor.feature_extractor.sampling_rate
+    )(audio_input)
+audio_input = audio_input.squeeze().numpy()
 
 # Example question
 question = "What type of sound is present in the audio?"
 user_prompt_text = f"You are given a question and an audio clip. Your task is to answer the question based on the audio clip. First, think about the question and the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then answer the question based on the audio clip, put your answer in <answer> and </answer> tags.\nQuestion: {question}"
 
-# Construct messages in conversation format, similar to training
-messages = [
-    {"role": "system", "content": [{"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
+# Conversation format
+conversation = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
+    },
     {
         "role": "user",
         "content": [
             {"type": "audio", "audio": audio_input},
             {"type": "text", "text": user_prompt_text}
-        ]
-    }
+        ],
+    },
 ]
 
-# Apply chat template
-text_from_chat_template = processor.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-
-# Prepare inputs for the model
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation)
 inputs = processor(
-    text=text_from_chat_template,
-    audio=[audio_input],  # Pass audio as a list of numpy arrays
-    return_tensors="pt"
-).to(model.device)
+    text=text,
+    audio=audios,
+    images=images,
+    videos=videos,
+    return_tensors="pt",
+    padding=True
+)
+inputs = inputs.to(model.device).to(model.dtype)
 
-# Generate response
+# Inference: Generation of the output
 output_ids = model.generate(**inputs, max_new_tokens=512)
-response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(response[0])
 
-print(response)
 # Expected output format for QA:
 # <think>...detailed reasoning about the audio scene and question...</think>
 # <semantic_elements>...list of identified semantic descriptors...</semantic_elements>
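
For reference, the tagged output described in the comments above (`<think>`, `<semantic_elements>`, `<answer>`) can be split apart with a few lines of standard-library Python. This is a minimal sketch, not part of the model card: the `extract_tag` helper is hypothetical, and it assumes `response[0]` is the decoded string produced by the updated snippet (which still contains the prompt, so the last occurrence of each tag is the one the model generated).

```python
import re

def extract_tag(text: str, tag: str) -> str:
    """Return the content of the last <tag>...</tag> span in `text`, or "" if absent."""
    matches = re.findall(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return matches[-1].strip() if matches else ""

# `response[0]` comes from the generation snippet above; the decoded string
# includes the prompt (which also mentions the tags), so we take the last
# match of each tag rather than the first.
decoded = response[0]
thinking = extract_tag(decoded, "think")
semantic_elements = extract_tag(decoded, "semantic_elements")
answer = extract_tag(decoded, "answer")
print(answer)
```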