"""Run VAD models over a validation set in parallel and report frame-level AUC-ROC."""

from concurrent.futures import ProcessPoolExecutor
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from silero_vad import get_speech_timestamps, read_audio, utils_vad
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

# NOTE(review): the value of this expression is discarded, so it has no effect.
# Presumably it was meant to be appended to sys.path -- confirm before changing.
str(Path().resolve() / "silero-vad/src/silero-vad")

# One torch thread per process; parallelism comes from the process pool.
torch.set_num_threads(1)


def load_silero_vad(onnx=False, model_file_path=None):
    """Load a VAD model from ``model_file_path``.

    Args:
        onnx: When true, wrap the file with ``utils_vad.OnnxWrapper`` (forced
            to CPU); otherwise load it with ``utils_vad.init_jit_model``.
        model_file_path: Path to the model file.

    Returns:
        The loaded model object.
    """
    if onnx:
        return utils_vad.OnnxWrapper(model_file_path, force_onnx_cpu=True)
    return utils_vad.init_jit_model(model_file_path)


def init_worker(model_file_path):
    """Initialize the model inside each worker process.

    Stores the loaded model in the module-level global ``model`` so that
    ``get_vad`` can use it without reloading it for every file.
    """
    global model
    model = load_silero_vad(onnx=False, model_file_path=model_file_path)


def get_vad(file, threshold):
    """Return the speech timestamps detected in one audio file.

    Must run in a process where ``init_worker`` has already set the global
    ``model``.

    Args:
        file: Path of the audio file, or a missing value (NaN/None).
        threshold: Threshold forwarded to ``get_speech_timestamps``.

    Returns:
        The result of ``get_speech_timestamps`` (called with
        ``return_seconds=True``), or ``None`` when ``file`` is missing.
    """
    if pd.isna(file):
        return None

    wav = read_audio(file)
    return get_speech_timestamps(
        wav, model, return_seconds=True, threshold=threshold
    )


def process_vad_parallel(df, threshold, column_name, model_file_path, max_workers=8):
    """Run VAD on every entry of ``df["audio_path"]`` and store the results.

    Args:
        df: DataFrame with an ``audio_path`` column.
        threshold: Threshold passed to ``get_vad`` for every file.
        column_name: Name of the column that receives the results.
        model_file_path: Model file loaded once per worker process.
        max_workers: Number of worker processes (defaults to 8).

    Returns:
        The same ``df`` object, modified in place with the new column.
    """
    with ProcessPoolExecutor(
        max_workers=max_workers,
        initializer=partial(init_worker, model_file_path),
    ) as executor:
        futures = [
            executor.submit(get_vad, file, threshold) for file in df["audio_path"]
        ]
        # Collect in submission order so results stay aligned with the rows.
        results = [
            future.result()
            for future in tqdm(
                futures, total=len(df), desc=f"Processing {column_name}"
            )
        ]

    df[column_name] = results
    return df


def _frame_index(seconds, frame_size):
    """Convert a time in seconds to a frame index (floor for non-negative times).

    The quotient is rounded to 9 decimals before truncation so that float
    noise such as ``0.3 / 0.1 == 2.9999999999999996`` does not shift the
    index down by one frame.
    """
    return int(round(seconds / frame_size, 9))


def _is_missing(row):
    """Return True when ``row`` is None or a float NaN instead of segments."""
    return row is None or (isinstance(row, float) and np.isnan(row))


def create_frame_labels(segments, duration, frame_size=0.01):
    """Build a 0/1 label array with one entry per frame.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'`` times,
            in the same unit as ``duration``.
        duration: Total length of the timeline.
        frame_size: Length of one frame.

    Returns:
        A float ``numpy`` array of length ``duration / frame_size`` (floored)
        holding 1 for frames inside a segment and 0 elsewhere.
    """
    frames = np.zeros(_frame_index(duration, frame_size))
    for seg in segments:
        start_idx = _frame_index(seg["start"], frame_size)
        end_idx = _frame_index(seg["end"], frame_size)
        frames[start_idx:end_idx] = 1
    return frames


def compute_auc_roc(df, actual_col, predicted_col, frame_size=0.01):
    """Compute the frame-level AUC-ROC of predicted versus reference segments.

    Each row is turned into its own pair of frame-label arrays, and the
    arrays of all rows are concatenated before scoring. Rows are independent
    recordings, so overlaying their segments on one shared timeline would
    merge speech from unrelated files.

    The length of a row's timeline is the latest segment end found in either
    column, since the true audio duration is not available here. Rows where
    either entry is missing (None/NaN), or where neither column contains a
    segment, are skipped.

    Note that the predictions are hard 0/1 labels rather than scores, so the
    resulting curve has a single operating point.

    Args:
        df: DataFrame holding both columns.
        actual_col: Column of reference segments.
        predicted_col: Column of predicted segments.
        frame_size: Frame length used for discretization.

    Returns:
        The value returned by ``sklearn.metrics.roc_auc_score``.

    Raises:
        ValueError: If no row can be scored. ``roc_auc_score`` itself may
            also raise, e.g. when the reference labels contain one class.
    """
    gt_chunks = []
    pred_chunks = []
    for actual, predicted in zip(df[actual_col], df[predicted_col]):
        if _is_missing(actual) or _is_missing(predicted):
            continue

        # Materialize so array-like rows can be iterated more than once.
        actual = list(actual)
        predicted = list(predicted)
        ends = [seg["end"] for seg in actual]
        ends.extend(seg["end"] for seg in predicted)
        if not ends:
            continue

        duration = max(ends)
        gt_chunks.append(create_frame_labels(actual, duration, frame_size))
        pred_chunks.append(create_frame_labels(predicted, duration, frame_size))

    if not gt_chunks:
        raise ValueError(
            f"no rows with segments to score in {actual_col!r} / {predicted_col!r}"
        )

    gt_labels = np.concatenate(gt_chunks)
    pred_labels = np.concatenate(pred_chunks)
    return roc_auc_score(gt_labels, pred_labels)


# The guard is required for multiprocessing: under the "spawn" start method
# every worker re-imports this module, and unguarded driver code would be
# executed again in each worker.
if __name__ == "__main__":
    df = pd.read_feather("./val.feather")

    model_file_path = "/home/sourabh/Desktop/dev/hum-vad/HumAware-VAD/humaware_vad.jit"
    df = process_vad_parallel(df, 0.5, "unhum_vad_output_0.5", model_file_path=model_file_path)
    df = process_vad_parallel(df, 0.9, "unhum_vad_output_0.9", model_file_path=model_file_path)

    model_file_path = "/home/sourabh/Desktop/dev/hum-vad/.venv/lib/python3.12/site-packages/silero_vad/data/silero_vad.jit"
    df = process_vad_parallel(df, 0.5, "silero_vad_output_0.5", model_file_path=model_file_path)
    df = process_vad_parallel(df, 0.9, "silero_vad_output_0.9", model_file_path=model_file_path)

    # Every score is labelled with its column so the four lines are unambiguous.
    for column in (
        "unhum_vad_output_0.5",
        "unhum_vad_output_0.9",
        "silero_vad_output_0.5",
        "silero_vad_output_0.9",
    ):
        auc_roc_score = compute_auc_roc(df, "speech_ts", column)
        print(f"AUC-ROC Score {column}: {auc_roc_score:.4f}")