Commit
·
238b905
1
Parent(s):
7708082
vad module
Browse files
vad/__init__.py
CHANGED
@@ -1,10 +1,3 @@
|
|
1 |
-
|
2 |
-
VAD音频处理工具包
|
3 |
|
4 |
-
|
5 |
-
"""
|
6 |
-
|
7 |
-
from .audio_processor import AudioProcessor, AudioSegment
|
8 |
-
from .audio_transcriber import AudioTranscriber, TranscriptionResult
|
9 |
-
|
10 |
-
__all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
|
|
|
1 |
+
from .vad import AudioVad, AudioSegment
|
|
|
2 |
|
3 |
+
__all__ = ['AudioVad', 'AudioSegment']
|
|
|
|
|
|
|
|
|
|
|
|
vad/vad.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import numpy as np
|
3 |
+
import soundfile as sf
|
4 |
+
from typing import List, Tuple, Optional, Dict, Union
|
5 |
+
import webrtcvad
|
6 |
+
from dataclasses import dataclass, asdict
|
7 |
+
from scipy import signal
|
8 |
+
import json
|
9 |
+
import os
|
10 |
+
from datetime import datetime
|
11 |
+
import logging
|
12 |
+
|
13 |
+
# Logging configuration for the "vad" package.
# NOTE: previously a StreamHandler was added unconditionally at import time,
# so importing this module more than once (package re-import, interactive
# reload, import under two package paths) attached duplicate handlers and
# every log line was emitted multiple times. Guard the handler registration.
logger = logging.getLogger("vad")
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
    logger.addHandler(handler)
logger.setLevel(logging.INFO)  # default level: INFO
20 |
+
|
21 |
+
@dataclass
class AudioSegment:
    """A contiguous slice of audio located within the source stream."""
    start_time: float  # segment start time (seconds)
    end_time: float  # segment end time (seconds)
    audio_data: np.ndarray  # raw audio samples belonging to this segment
    is_speech: bool  # whether the segment was classified as speech
27 |
+
|
28 |
+
class AudioVad:
    """Voice-activity-based audio segmenter built on webrtcvad.

    Scans audio frame by frame, combines an amplitude gate with WebRTC VAD,
    and cuts the stream into speech segments separated by sufficiently long
    silences. Optionally writes each segment to a WAV file and/or a JSON
    metadata file.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 frame_duration_ms: int = 20,
                 vad_level: int = 0,  # low VAD aggressiveness
                 min_silence_duration: float = 0.3,  # silence needed to close a segment
                 min_speech_duration: float = 0.3,  # minimum speech length, to keep whole sentences
                 amplitude_threshold: float = 0.0015,  # amplitude gate
                 save_audio: bool = False,  # whether to save segment audio files
                 save_json: bool = False,  # whether to save JSON metadata
                 output_dir: str = "dataset/audio/segments",  # audio output directory
                 json_dir: str = "dataset/audio/metadata",  # JSON output directory
                 log_level: Union[int, str] = logging.INFO):  # logging level
        """
        Initialize the audio VAD processor.

        Args:
            sample_rate: target sample rate (Hz)
            frame_duration_ms: VAD frame length in milliseconds
            vad_level: VAD aggressiveness (0-3)
            min_silence_duration: minimum silence duration (seconds) that ends a segment
            min_speech_duration: minimum speech segment length (seconds) worth keeping
            amplitude_threshold: peak-amplitude gate below which a frame is treated as silence
            save_audio: whether to save per-segment audio files
            save_json: whether to save JSON metadata
            output_dir: directory for segment audio files
            json_dir: directory for JSON metadata files
            log_level: logging level (int constant or level name string)
        """
        # Set the module logger's level; accept either an int or a name like "DEBUG".
        if isinstance(log_level, str):
            log_level = getattr(logging, log_level.upper())
        logger.setLevel(log_level)

        self.sample_rate = sample_rate
        self.frame_duration_ms = frame_duration_ms
        # Samples per VAD frame.
        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
        self.vad = webrtcvad.Vad(vad_level)
        # Duration thresholds converted into frame counts.
        self.min_silence_frames = int(min_silence_duration * 1000 / frame_duration_ms)
        self.min_speech_frames = int(min_speech_duration * 1000 / frame_duration_ms)
        self.amplitude_threshold = amplitude_threshold

        # Persistence configuration.
        self.save_audio = save_audio
        self.save_json = save_json
        self.output_dir = output_dir
        self.json_dir = json_dir

        # Create output directories up front if persistence is enabled.
        if self.save_audio:
            os.makedirs(self.output_dir, exist_ok=True)
        if self.save_json:
            os.makedirs(self.json_dir, exist_ok=True)

    def _is_speech_frame(self, frame: np.ndarray) -> bool:
        """
        Return True if a single frame is judged to contain speech.
        """
        # Reject frames of the wrong length (webrtcvad requires exact frame sizes).
        if len(frame) != self.frame_size:
            return False

        # Convert float32 samples to int16, clipping into the valid range.
        frame_int16 = np.clip(frame * 32768, -32768, 32767).astype(np.int16)

        # Amplitude gate: a frame quieter than the threshold is silence.
        frame_amplitude = np.max(np.abs(frame))
        if frame_amplitude < self.amplitude_threshold:
            return False

        # Defer to WebRTC VAD for the actual speech decision.
        try:
            return self.vad.is_speech(frame_int16.tobytes(), self.sample_rate)
        except Exception as e:
            logger.error(f"VAD处理出错: {e}")
            # Fallback when VAD fails: amplitude-only decision with a stricter threshold.
            return frame_amplitude >= self.amplitude_threshold * 2

    def process_audio_data(self, audio_data: np.ndarray, sample_rate: Optional[int] = None) -> List[AudioSegment]:
        """
        Segment raw audio data and return the resulting list of segments.

        Args:
            audio_data: audio samples as a numpy array
            sample_rate: sample rate of ``audio_data``; if given and different
                from the configured rate, the audio is resampled first

        Returns:
            List of AudioSegment (speech segments only)
        """
        logger.debug(f"处理音频数据,形状: {audio_data.shape}")

        # Resample when the provided rate differs from the target rate.
        if sample_rate is not None and sample_rate != self.sample_rate:
            logger.debug(f"正在重采样音频从 {sample_rate}Hz 到 {self.sample_rate}Hz")
            # Use scipy.signal.resample for the conversion.
            num_samples = int(len(audio_data) * self.sample_rate / sample_rate)
            audio_data = signal.resample(audio_data, num_samples)
            logger.debug(f"重采样后音频长度: {len(audio_data)} 采样点")

        # Down-mix multi-channel audio to mono by averaging channels.
        if len(audio_data.shape) > 1:
            logger.debug("检测到多声道音频,正在转换为单声道")
            audio_data = audio_data.mean(axis=1)  # convert to mono

        # Result list.
        segments: List[AudioSegment] = []
        logger.debug(f"开始处理音频,总长度: {len(audio_data)} 采样点 ({len(audio_data)/self.sample_rate:.2f}秒)")

        # Scan state: start sample of the current speech run, consecutive
        # silent frames seen, and whether we are currently inside speech.
        current_segment_start = 0
        silence_frame_count = 0
        is_in_speech = False

        # Process the audio frame by frame.
        total_frames = len(audio_data) // self.frame_size
        speech_frames = 0
        for i in range(0, len(audio_data), self.frame_size):
            # Take one frame; i is the frame's start sample index.
            frame = audio_data[i:i + self.frame_size]
            if len(frame) < self.frame_size:
                # Zero-pad the trailing partial frame to a full frame.
                frame = np.pad(frame, (0, self.frame_size - len(frame)), 'constant')

            is_speech = self._is_speech_frame(frame)
            if is_speech:
                speech_frames += 1

            if is_speech and not is_in_speech:
                # Transition silence -> speech: open a new segment.
                current_segment_start = i
                is_in_speech = True
                silence_frame_count = 0
                logger.debug(f"检测到语音开始,位置: {i/self.sample_rate:.2f}秒")
            elif not is_speech and is_in_speech:
                silence_frame_count += 1

                # Close the segment once silence has lasted long enough.
                if silence_frame_count >= self.min_silence_frames:
                    # Rewind past the run of silent frames so the trailing
                    # silence is not included in the saved segment.
                    segment_end = i - (silence_frame_count * self.frame_size)
                    duration_frames = (segment_end - current_segment_start) // self.frame_size

                    # Keep only segments that meet the minimum speech length.
                    if duration_frames >= self.min_speech_frames:
                        start_time = current_segment_start / self.sample_rate
                        end_time = segment_end / self.sample_rate
                        logger.debug(f"保存语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
                        segments.append(AudioSegment(
                            start_time=start_time,
                            end_time=end_time,
                            audio_data=audio_data[current_segment_start:segment_end],
                            is_speech=True
                        ))
                    else:
                        logger.debug(f"丢弃过短的语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")

                    is_in_speech = False

        # Flush the final segment if the audio ends while still in speech.
        if is_in_speech:
            segment_end = len(audio_data)
            duration_frames = (segment_end - current_segment_start) // self.frame_size
            if duration_frames >= self.min_speech_frames:
                start_time = current_segment_start / self.sample_rate
                end_time = segment_end / self.sample_rate
                logger.debug(f"保存最后的语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
                segments.append(AudioSegment(
                    start_time=start_time,
                    end_time=end_time,
                    audio_data=audio_data[current_segment_start:segment_end],
                    is_speech=True
                ))
            else:
                logger.debug(f"丢弃过短的最后语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")

        logger.info(f"音频处理完成: 总帧数: {total_frames}, 语音帧数: {speech_frames}, 检测到的语音片段数: {len(segments)}")

        return segments

    def process_audio_file(self, audio_path: str) -> List[AudioSegment]:
        """
        Segment an audio file and return the resulting list of segments.

        Also writes segment WAV files and/or JSON metadata when the processor
        was configured with ``save_audio`` / ``save_json``.

        Args:
            audio_path: path to the audio file

        Returns:
            List of AudioSegment
        """
        # Read the audio file (soundfile returns data plus its native sample rate).
        logger.info(f"正在读取音频文件: {audio_path}")
        audio_data, sample_rate = sf.read(audio_path)
        logger.debug(f"音频采样率: {sample_rate}Hz, 形状: {audio_data.shape}")

        # Run the segmentation.
        segments = self.process_audio_data(audio_data, sample_rate)

        # Optionally persist each segment as "<name>_segment_<k>.wav".
        if self.save_audio and segments:
            base_name = os.path.splitext(os.path.basename(audio_path))[0]
            for i, segment in enumerate(segments):
                output_path = os.path.join(self.output_dir, f"{base_name}_segment_{i+1}.wav")
                self.save_segment(segment, output_path)
                logger.debug(f"保存音频片段到: {output_path}")

        # Optionally persist JSON metadata describing all segments.
        if self.save_json and segments:
            self.save_segments_metadata(segments, audio_path)

        return segments

    def save_segment(self, segment: AudioSegment, output_path: str):
        """
        Write one audio segment to a file at the configured sample rate.

        Args:
            segment: the audio segment
            output_path: output file path
        """
        sf.write(output_path, segment.audio_data, self.sample_rate)

    def save_segments_metadata(self, segments: List[AudioSegment], audio_path: str):
        """
        Write segment metadata to a timestamped JSON file in ``json_dir``.

        Args:
            segments: list of audio segments
            audio_path: path of the original audio file
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = os.path.splitext(os.path.basename(audio_path))[0]

        # Assemble the metadata payload (times in seconds).
        metadata = {
            "audio_file": audio_path,
            "timestamp": timestamp,
            "total_segments": len(segments),
            "segments": [
                {
                    "index": i,
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "duration": seg.end_time - seg.start_time,
                    "is_speech": seg.is_speech
                }
                for i, seg in enumerate(segments)
            ]
        }

        # Save the JSON file.
        json_path = os.path.join(self.json_dir, f"{base_name}_segments_{timestamp}.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"保存片段元数据到: {json_path}")
281 |
+
|
282 |
+
|
283 |
+
if __name__ == "__main__":
    # Manual smoke test: segment one audio file and report what was found.
    # Verbose logging so every processing step is visible.
    logger.setLevel(logging.DEBUG)

    # Processor configured to persist both segment audio and JSON metadata.
    detector = AudioVad(
        save_json=True,
        save_audio=True,
        json_dir="dataset/audio/metadata",
        output_dir="dataset/audio/segments",
    )

    audio_path = "dataset/audio/test1.wav"  # replace with a real audio file path
    try:
        found = detector.process_audio_file(audio_path)
        logger.info(f"检测到 {len(found)} 个语音片段:")
        for idx, seg in enumerate(found):
            logger.info(f"片段 {idx+1}: {seg.start_time:.2f}s -> {seg.end_time:.2f}s")
    except Exception as e:
        logger.error(f"处理音频时出错: {e}")
|
{vad → vad_transcribe_test}/README.md
RENAMED
File without changes
|
vad_transcribe_test/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
VAD音频处理工具包
|
3 |
+
|
4 |
+
提供音频切割、转录和验证功能。
|
5 |
+
"""
|
6 |
+
|
7 |
+
from .audio_processor import AudioProcessor, AudioSegment
|
8 |
+
from .audio_transcriber import AudioTranscriber, TranscriptionResult
|
9 |
+
|
10 |
+
__all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
|
{vad → vad_transcribe_test}/audio_processor.py
RENAMED
File without changes
|
{vad → vad_transcribe_test}/audio_transcriber.py
RENAMED
File without changes
|
{vad → vad_transcribe_test}/main.py
RENAMED
File without changes
|