tianyaogavin committed on
Commit
238b905
·
1 Parent(s): 7708082

vad module

Browse files
vad/__init__.py CHANGED
@@ -1,10 +1,3 @@
1
- """
2
- VAD音频处理工具包
3
 
4
- 提供音频切割、转录和验证功能。
5
- """
6
-
7
- from .audio_processor import AudioProcessor, AudioSegment
8
- from .audio_transcriber import AudioTranscriber, TranscriptionResult
9
-
10
- __all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
 
1
"""VAD audio processing package.

Exposes the webrtcvad-based segmenter (:class:`AudioVad`) and the
segment container (:class:`AudioSegment`).
"""

from .vad import AudioVad, AudioSegment

__all__ = ['AudioVad', 'AudioSegment']
 
 
 
 
 
 
vad/vad.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from typing import List, Tuple, Optional, Dict, Union
5
+ import webrtcvad
6
+ from dataclasses import dataclass, asdict
7
+ from scipy import signal
8
+ import json
9
+ import os
10
+ from datetime import datetime
11
+ import logging
12
+
13
# Logging setup: attach a stream handler only when none is present yet, so
# re-executing this module-level code (e.g. importlib.reload) does not
# register duplicate handlers and double every log line.
logger = logging.getLogger("vad")
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
logger.setLevel(logging.INFO)  # default level; AudioVad(log_level=...) can override
20
+
21
@dataclass
class AudioSegment:
    """A contiguous slice of audio produced by VAD segmentation."""
    start_time: float       # start time within the source audio (seconds)
    end_time: float         # end time within the source audio (seconds)
    audio_data: np.ndarray  # raw samples for this segment
    is_speech: bool         # whether this segment was classified as speech
27
+
28
class AudioVad:
    """Voice-activity detector that splits audio into speech segments.

    Audio is processed in fixed-length frames; each frame must pass a
    peak-amplitude gate and the WebRTC VAD to count as speech. Consecutive
    speech frames are merged into :class:`AudioSegment` objects, which can
    optionally be written out as WAV files and JSON metadata.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 frame_duration_ms: int = 20,
                 vad_level: int = 0,                 # reduced VAD aggressiveness
                 min_silence_duration: float = 0.3,  # silence needed to close a segment
                 min_speech_duration: float = 0.3,   # minimum kept-segment length, to favor whole sentences
                 amplitude_threshold: float = 0.0015,  # peak-amplitude gate
                 save_audio: bool = False,           # whether to save segment audio files
                 save_json: bool = False,            # whether to save JSON metadata
                 output_dir: str = "dataset/audio/segments",  # audio output directory
                 json_dir: str = "dataset/audio/metadata",    # JSON output directory
                 log_level: Union[int, str] = logging.INFO):  # logging level
        """
        Initialize the audio VAD processor.

        Args:
            sample_rate: target sample rate in Hz (webrtcvad supports 8/16/32/48 kHz)
            frame_duration_ms: VAD frame length in milliseconds (webrtcvad accepts 10/20/30)
            vad_level: VAD aggressiveness, 0 (least) to 3 (most)
            min_silence_duration: minimum silence (seconds) that ends a speech segment
            min_speech_duration: minimum speech segment length (seconds) to keep
            amplitude_threshold: frames whose peak |amplitude| is below this are non-speech
            save_audio: whether to save each segment as a WAV file
            save_json: whether to save JSON metadata for the segments
            output_dir: directory for segment WAV files
            json_dir: directory for JSON metadata files
            log_level: logging level, as an int constant or a level-name string
        """
        # Accept level names like "debug"/"INFO" as well as logging constants.
        if isinstance(log_level, str):
            log_level = getattr(logging, log_level.upper())
        logger.setLevel(log_level)

        self.sample_rate = sample_rate
        self.frame_duration_ms = frame_duration_ms
        self.frame_size = int(sample_rate * frame_duration_ms / 1000)  # samples per frame
        self.vad = webrtcvad.Vad(vad_level)
        # Durations converted to frame counts for the frame-based state machine.
        self.min_silence_frames = int(min_silence_duration * 1000 / frame_duration_ms)
        self.min_speech_frames = int(min_speech_duration * 1000 / frame_duration_ms)
        self.amplitude_threshold = amplitude_threshold

        # Output configuration.
        self.save_audio = save_audio
        self.save_json = save_json
        self.output_dir = output_dir
        self.json_dir = json_dir

        # Create output directories up front if saving is enabled.
        if self.save_audio:
            os.makedirs(self.output_dir, exist_ok=True)
        if self.save_json:
            os.makedirs(self.json_dir, exist_ok=True)

    def _is_speech_frame(self, frame: np.ndarray) -> bool:
        """Return True if a single frame is classified as speech.

        A frame must pass both the peak-amplitude gate and webrtcvad; if
        webrtcvad raises, a stricter amplitude-only fallback decides.
        """
        # Reject frames of the wrong length (webrtcvad requires exact frame sizes).
        if len(frame) != self.frame_size:
            return False

        # Convert float samples (assumed in [-1, 1]) to int16 PCM, clipping to range.
        frame_int16 = np.clip(frame * 32768, -32768, 32767).astype(np.int16)

        # Amplitude gate: very quiet frames are never speech.
        frame_amplitude = np.max(np.abs(frame))
        if frame_amplitude < self.amplitude_threshold:
            return False

        # WebRTC VAD decision on the raw int16 PCM bytes.
        try:
            return self.vad.is_speech(frame_int16.tobytes(), self.sample_rate)
        except Exception as e:
            logger.error(f"VAD处理出错: {e}")
            # Fallback when VAD fails: require twice the amplitude threshold.
            return frame_amplitude >= self.amplitude_threshold * 2

    def process_audio_data(self, audio_data: np.ndarray, sample_rate: Optional[int] = None) -> List[AudioSegment]:
        """
        Process raw audio samples and return the detected speech segments.

        Args:
            audio_data: audio samples as a numpy array (mono or multi-channel)
            sample_rate: sample rate of ``audio_data``; if given and different
                from the configured rate, the audio is resampled first

        Returns:
            List of AudioSegment objects (speech segments only)
        """
        logger.debug(f"处理音频数据,形状: {audio_data.shape}")

        # Resample when the provided rate differs from the configured target rate.
        if sample_rate is not None and sample_rate != self.sample_rate:
            logger.debug(f"正在重采样音频从 {sample_rate}Hz 到 {self.sample_rate}Hz")
            # FFT-based resampling via scipy.signal.resample.
            num_samples = int(len(audio_data) * self.sample_rate / sample_rate)
            audio_data = signal.resample(audio_data, num_samples)
            logger.debug(f"重采样后音频长度: {len(audio_data)} 采样点")

        # Down-mix multi-channel audio to mono by averaging channels.
        if len(audio_data.shape) > 1:
            logger.debug("检测到多声道音频,正在转换为单声道")
            audio_data = audio_data.mean(axis=1)  # convert to mono

        # Collected speech segments.
        segments: List[AudioSegment] = []
        logger.debug(f"开始处理音频,总长度: {len(audio_data)} 采样点 ({len(audio_data)/self.sample_rate:.2f}秒)")

        # State machine: sample index where the current speech run started,
        # consecutive silence frames seen, and whether we are inside speech.
        current_segment_start = 0
        silence_frame_count = 0
        is_in_speech = False

        # Walk the audio frame by frame.
        total_frames = len(audio_data) // self.frame_size
        speech_frames = 0
        for i in range(0, len(audio_data), self.frame_size):
            frame = audio_data[i:i + self.frame_size]
            if len(frame) < self.frame_size:
                # Zero-pad the trailing partial frame up to a full frame.
                frame = np.pad(frame, (0, self.frame_size - len(frame)), 'constant')

            is_speech = self._is_speech_frame(frame)
            if is_speech:
                speech_frames += 1

            if is_speech and not is_in_speech:
                # Transition silence -> speech: open a new segment here.
                current_segment_start = i
                is_in_speech = True
                silence_frame_count = 0
                logger.debug(f"检测到语音开始,位置: {i/self.sample_rate:.2f}秒")
            elif not is_speech and is_in_speech:
                silence_frame_count += 1

                # Enough consecutive silence closes the current speech segment.
                if silence_frame_count >= self.min_silence_frames:
                    # NOTE(review): the first silence frame starts at
                    # i - (silence_frame_count - 1) * frame_size, so trimming
                    # silence_frame_count full frames also drops the last speech
                    # frame — confirm whether this extra one-frame trim is intended.
                    segment_end = i - (silence_frame_count * self.frame_size)
                    duration_frames = (segment_end - current_segment_start) // self.frame_size

                    # Keep only segments meeting the minimum speech duration.
                    if duration_frames >= self.min_speech_frames:
                        start_time = current_segment_start / self.sample_rate
                        end_time = segment_end / self.sample_rate
                        logger.debug(f"保存语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
                        segments.append(AudioSegment(
                            start_time=start_time,
                            end_time=end_time,
                            audio_data=audio_data[current_segment_start:segment_end],
                            is_speech=True
                        ))
                    else:
                        logger.debug(f"丢弃过短的语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")

                    is_in_speech = False

        # Flush a speech segment still open at end-of-audio.
        if is_in_speech:
            segment_end = len(audio_data)
            duration_frames = (segment_end - current_segment_start) // self.frame_size
            if duration_frames >= self.min_speech_frames:
                start_time = current_segment_start / self.sample_rate
                end_time = segment_end / self.sample_rate
                logger.debug(f"保存最后的语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
                segments.append(AudioSegment(
                    start_time=start_time,
                    end_time=end_time,
                    audio_data=audio_data[current_segment_start:segment_end],
                    is_speech=True
                ))
            else:
                logger.debug(f"丢弃过短的最后语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")

        logger.info(f"音频处理完成: 总帧数: {total_frames}, 语音帧数: {speech_frames}, 检测到的语音片段数: {len(segments)}")

        return segments

    def process_audio_file(self, audio_path: str) -> List[AudioSegment]:
        """
        Process an audio file and return the detected speech segments.

        Also writes segment WAV files and/or JSON metadata when the
        corresponding ``save_audio`` / ``save_json`` options are enabled.

        Args:
            audio_path: path to the audio file

        Returns:
            List of AudioSegment objects
        """
        # Decode the file; soundfile returns (samples, native sample rate).
        logger.info(f"正在读取音频文件: {audio_path}")
        audio_data, sample_rate = sf.read(audio_path)
        logger.debug(f"音频采样率: {sample_rate}Hz, 形状: {audio_data.shape}")

        # Run VAD segmentation on the decoded samples.
        segments = self.process_audio_data(audio_data, sample_rate)

        # Optionally save each segment as <basename>_segment_<n>.wav.
        if self.save_audio and segments:
            base_name = os.path.splitext(os.path.basename(audio_path))[0]
            for i, segment in enumerate(segments):
                output_path = os.path.join(self.output_dir, f"{base_name}_segment_{i+1}.wav")
                self.save_segment(segment, output_path)
                logger.debug(f"保存音频片段到: {output_path}")

        # Optionally save JSON metadata describing the segments.
        if self.save_json and segments:
            self.save_segments_metadata(segments, audio_path)

        return segments

    def save_segment(self, segment: AudioSegment, output_path: str):
        """
        Save one audio segment to a file.

        Args:
            segment: the audio segment to write
            output_path: destination file path (format inferred from extension)
        """
        sf.write(output_path, segment.audio_data, self.sample_rate)

    def save_segments_metadata(self, segments: List[AudioSegment], audio_path: str):
        """
        Save segment metadata (timings, counts) to a timestamped JSON file.

        Args:
            segments: list of audio segments
            audio_path: path of the original audio file (recorded in the JSON)
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = os.path.splitext(os.path.basename(audio_path))[0]

        # Metadata payload: source file, run timestamp, and per-segment timings.
        metadata = {
            "audio_file": audio_path,
            "timestamp": timestamp,
            "total_segments": len(segments),
            "segments": [
                {
                    "index": i,
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "duration": seg.end_time - seg.start_time,
                    "is_speech": seg.is_speech
                }
                for i, seg in enumerate(segments)
            ]
        }

        # Write the JSON file into the configured metadata directory.
        json_path = os.path.join(self.json_dir, f"{base_name}_segments_{timestamp}.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"保存片段元数据到: {json_path}")
281
+
282
+
283
if __name__ == "__main__":
    # Demo entry point: run VAD over a sample file with verbose logging.
    logger.setLevel(logging.DEBUG)

    # Detector configured to persist both segment audio and JSON metadata.
    detector = AudioVad(save_audio=True,
                        save_json=True,
                        output_dir="dataset/audio/segments",
                        json_dir="dataset/audio/metadata")

    # Example input; point this at a real recording to try it out.
    sample_path = "dataset/audio/test1.wav"
    try:
        segments = detector.process_audio_file(sample_path)
        logger.info(f"检测到 {len(segments)} 个语音片段:")
        for i, segment in enumerate(segments):
            logger.info(f"片段 {i+1}: {segment.start_time:.2f}s -> {segment.end_time:.2f}s")
    except Exception as e:
        logger.error(f"处理音频时出错: {e}")
{vad → vad_transcribe_test}/README.md RENAMED
File without changes
vad_transcribe_test/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VAD音频处理工具包
3
+
4
+ 提供音频切割、转录和验证功能。
5
+ """
6
+
7
+ from .audio_processor import AudioProcessor, AudioSegment
8
+ from .audio_transcriber import AudioTranscriber, TranscriptionResult
9
+
10
+ __all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
{vad → vad_transcribe_test}/audio_processor.py RENAMED
File without changes
{vad → vad_transcribe_test}/audio_transcriber.py RENAMED
File without changes
{vad → vad_transcribe_test}/main.py RENAMED
File without changes