Model Upload
Browse files- .gitattributes +1 -0
- .gitignore +1 -0
- README.md +69 -0
- compute_performance.py +99 -0
- humaware_vad.jit +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.jit filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.feather
|
README.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# HumAware-VAD: Humming-Aware Voice Activity Detection
|
2 |
+
|
3 |
+
## 📌 Overview
|
4 |
+
**HumAware-VAD** is a fine-tuned version of the **Silero-VAD** model, trained to distinguish **humming from actual speech**. Standard Voice Activity Detection (VAD) models, including Silero-VAD, often misclassify humming as speech, leading to inaccurate speech segmentation. HumAware-VAD improves upon this by leveraging a custom dataset (**HumSpeechBlend**) to enhance speech detection accuracy in the presence of humming.
|
5 |
+
|
6 |
+
## 🎯 Purpose
|
7 |
+
The primary goal of **HumAware-VAD** is to:
|
8 |
+
- Reduce **false positives** where humming is mistakenly detected as speech.
|
9 |
+
- Enhance **speech segmentation accuracy** in real-world applications.
|
10 |
+
- Improve VAD performance for tasks involving **music, background noise, and vocal sounds**.
|
11 |
+
|
12 |
+
## 🗂️ Model Details
|
13 |
+
- **Base Model**: [Silero-VAD](https://github.com/snakers4/silero-vad/tree/master)
|
14 |
+
- **Fine-tuning Dataset**: [HumSpeechBlend](https://huggingface.co/datasets/CuriousMonkey7/HumSpeechBlend)
|
15 |
+
- **Format**: JIT (TorchScript)
|
16 |
+
- **Framework**: PyTorch
|
17 |
+
- **Inference Speed**: Real-time
|
18 |
+
|
19 |
+
## 📥 Download & Usage
|
20 |
+
### 🔹 Install Dependencies
|
21 |
+
```bash
|
22 |
+
pip install torch torchaudio
|
23 |
+
```
|
24 |
+
|
25 |
+
### 🔹 Load the Model
|
26 |
+
```python
|
27 |
+
import torch
|
28 |
+
|
29 |
+
def load_humaware_vad(model_path="humaware_vad.jit"):
|
30 |
+
model = torch.jit.load(model_path)
|
31 |
+
model.eval()
|
32 |
+
return model
|
33 |
+
|
34 |
+
vad_model = load_humaware_vad()
|
35 |
+
```
|
36 |
+
|
37 |
+
### 🔹 Run Inference
|
38 |
+
```python
|
39 |
+
import torchaudio
|
40 |
+
|
41 |
+
waveform, sample_rate = torchaudio.load("data/0000.wav")
|
42 |
+
out = vad_model(waveform)
|
43 |
+
print("VAD Output:", out)
|
44 |
+
```
|
45 |
+
<!--
|
46 |
+
## 🏆 Performance
|
47 |
+
Compared to the base Silero-VAD model, **HumAware-VAD** demonstrates:
|
48 |
+
✅ **Lower false positives for humming**
|
49 |
+
✅ **Better segmentation of speech in mixed audio**
|
50 |
+
✅ **Maintained real-time inference capabilities**
|
51 |
+
|
52 |
+
## 📊 Applications
|
53 |
+
- **Automatic Speech Recognition (ASR) Preprocessing**
|
54 |
+
- **Noise-Robust VAD Systems**
|
55 |
+
- **Speech Enhancement & Separation**
|
56 |
+
- **Call Center & Voice Communication Filtering** -->
|
57 |
+
|
58 |
+
## 📄 Citation
|
59 |
+
If you use this model, please cite it accordingly.
|
60 |
+
|
61 |
+
```
|
62 |
+
@misc{HumAwareVAD2025,
|
63 |
+
author = {Sourabh Saini},
|
64 |
+
title = {HumAware-VAD: Humming-Aware Voice Activity Detection},
|
65 |
+
year = {2025},
|
66 |
+
publisher = {Hugging Face},
|
67 |
+
url = {https://huggingface.co/CuriousMonkey7/HumAware-VAD}
|
68 |
+
}
|
69 |
+
```
|
compute_performance.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.metrics import roc_auc_score
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
from concurrent.futures import ProcessPoolExecutor
|
7 |
+
from tqdm import tqdm
|
8 |
+
from silero_vad import read_audio, get_speech_timestamps
|
9 |
+
from functools import partial
|
10 |
+
|
11 |
+
from pathlib import Path
|
12 |
+
str(Path().resolve() / "silero-vad/src/silero-vad")
|
13 |
+
from silero_vad import utils_vad
|
14 |
+
# from utils_vad import init_jit_model, OnnxWrapper
|
15 |
+
import torch
|
16 |
+
torch.set_num_threads(1)
|
17 |
+
|
18 |
+
def load_silero_vad(onnx=False, model_file_path=None):
    """Load a VAD model from ``model_file_path`` with the matching loader.

    Args:
        onnx: When true, the file is wrapped with ``utils_vad.OnnxWrapper``
            (called with ``force_onnx_cpu=True``); otherwise it is loaded
            with ``utils_vad.init_jit_model``.
        model_file_path: Path of the model file handed to the chosen loader.

    Returns:
        The object produced by the selected ``utils_vad`` loader.
    """
    if not onnx:
        return utils_vad.init_jit_model(model_file_path)
    return utils_vad.OnnxWrapper(model_file_path, force_onnx_cpu=True)
|
25 |
+
|
26 |
+
def init_worker(model_file_path):
    """Load the JIT model into this process's module-level ``model`` global.

    Intended to run once per worker process (it is used as the pool
    initializer), so that ``get_vad`` can read ``model`` without the model
    having to be sent along with every task.

    Args:
        model_file_path: Path of the model file to load.
    """
    global model
    model = load_silero_vad(model_file_path=model_file_path, onnx=False)
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
def get_vad(file, threshold):
    """Run the process-global ``model`` on one audio file.

    Args:
        file: Path of the audio file; a missing value (anything for which
            ``pd.isna`` is true) short-circuits to ``None``.
        threshold: Speech threshold forwarded to ``get_speech_timestamps``.

    Returns:
        ``None`` for a missing ``file``; otherwise the result of
        ``get_speech_timestamps`` called with ``return_seconds=True``.
    """
    if not pd.isna(file):
        # ``model`` is the module-level global populated by ``init_worker``.
        return get_speech_timestamps(
            read_audio(file),
            model,
            threshold=threshold,
            return_seconds=True,
        )
    return None
|
44 |
+
|
45 |
+
def process_vad_parallel(df, threshold, column_name, model_file_path, max_workers=8):
    """Run ``get_vad`` over ``df["audio_path"]`` in a pool of worker processes.

    Each worker loads the model once through ``init_worker``; one task is
    submitted per entry of ``df["audio_path"]`` and the results are gathered
    in submission order so they line up with the rows of ``df``.

    Args:
        df: DataFrame with an ``audio_path`` column. It is modified in place.
        threshold: Speech threshold forwarded to ``get_vad``.
        column_name: Name of the column that receives the per-row results.
        model_file_path: Path of the model file each worker loads.
        max_workers: Number of worker processes (defaults to 8, the value
            that used to be hard-coded).

    Returns:
        The same ``df`` object, with ``column_name`` added or overwritten.
    """
    with ProcessPoolExecutor(
        max_workers=max_workers,
        initializer=init_worker,
        initargs=(model_file_path,),
    ) as executor:
        futures = [
            executor.submit(get_vad, file, threshold)
            for file in df["audio_path"]
        ]
        # Iterating the futures in submission order (rather than completion
        # order) keeps results[i] aligned with row i of df.
        results = [
            future.result()
            for future in tqdm(futures, total=len(df), desc=f"Processing {column_name}")
        ]
    df[column_name] = results
    return df
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
def create_frame_labels(segments, duration, frame_size=0.01):
    """Rasterize time segments into a per-frame 0/1 label array.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'`` keys,
            expressed in the same time unit as ``duration``. ``None`` is
            treated as "no segments".
        duration: Total length of the timeline to cover.
        frame_size: Length of one frame, in the same unit as ``duration``.

    Returns:
        A 1-D ``numpy`` array holding ``duration / frame_size`` frames
        (rounded down), with 1.0 in every frame covered by a segment
        (``start`` frame inclusive, ``end`` frame exclusive) and 0.0 elsewhere.
    """
    def _frame_index(seconds):
        # Floating-point division can land just below an integer
        # (0.29 / 0.01 == 28.999999999999996); the small epsilon stops
        # int() from truncating that to the previous frame.
        return int(seconds / frame_size + 1e-9)

    frames = np.zeros(_frame_index(duration))
    if segments is None:
        return frames
    for seg in segments:
        frames[_frame_index(seg['start']):_frame_index(seg['end'])] = 1
    return frames
|
63 |
+
|
64 |
+
def compute_auc_roc(df, actual_col, predicted_col, frame_size=0.01):
    """Compute a frame-level ROC AUC between two segment columns of ``df``.

    Every row is rasterized on its own timeline and the per-row frame arrays
    are concatenated before scoring. Painting all rows onto one shared
    timeline would let segments from different rows overwrite each other,
    because each row's timestamps start from zero.

    Args:
        df: DataFrame whose ``actual_col`` and ``predicted_col`` cells hold
            iterables of mappings with ``'start'`` and ``'end'`` keys.
        actual_col: Column with the reference segments.
        predicted_col: Column with the predicted segments.
        frame_size: Frame length forwarded to ``create_frame_labels``.

    Returns:
        The value returned by ``roc_auc_score`` for the concatenated
        reference and predicted frame labels.

    Raises:
        ValueError: If no row contributes any segment to score.
    """
    gt_chunks = []
    pred_chunks = []
    for actual, predicted in zip(df[actual_col], df[predicted_col]):
        # A None cell means the row has nothing to compare (for example the
        # audio path was missing), so it is skipped.
        if actual is None or predicted is None:
            continue
        actual = list(actual)
        predicted = list(predicted)
        ends = [seg['end'] for seg in actual] + [seg['end'] for seg in predicted]
        if not ends:
            # No segment on either side: the row's length is unknown, so it
            # cannot contribute frames.
            continue
        # The real audio length is not available here; the latest segment
        # end in the row is used as the row's duration.
        row_duration = max(ends)
        gt_chunks.append(create_frame_labels(actual, row_duration, frame_size))
        pred_chunks.append(create_frame_labels(predicted, row_duration, frame_size))

    if not gt_chunks:
        raise ValueError(
            f"no segments found in columns {actual_col!r} and {predicted_col!r}"
        )

    gt_labels = np.concatenate(gt_chunks)
    pred_labels = np.concatenate(pred_chunks)
    return roc_auc_score(gt_labels, pred_labels)
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
def main():
    """Run both VAD models over the validation set and print their AUC-ROC.

    Reads ``./val.feather``, stores the output of the HumAware-VAD and the
    stock Silero-VAD models at thresholds 0.5 and 0.9 in four new columns,
    then scores each column against ``speech_ts``.
    """
    df = pd.read_feather("./val.feather")

    # NOTE: both model paths are absolute and machine-specific; adjust them
    # before running this script elsewhere.
    model_file_path = "/home/sourabh/Desktop/dev/hum-vad/HumAware-VAD/humaware_vad.jit"
    df = process_vad_parallel(df, 0.5, "unhum_vad_output_0.5", model_file_path=model_file_path)
    df = process_vad_parallel(df, 0.9, "unhum_vad_output_0.9", model_file_path=model_file_path)

    model_file_path = "/home/sourabh/Desktop/dev/hum-vad/.venv/lib/python3.12/site-packages/silero_vad/data/silero_vad.jit"
    df = process_vad_parallel(df, 0.5, "silero_vad_output_0.5", model_file_path=model_file_path)
    df = process_vad_parallel(df, 0.9, "silero_vad_output_0.9", model_file_path=model_file_path)

    # Every score line carries its column name so the four results can be
    # told apart in the output.
    for column in (
        "unhum_vad_output_0.5",
        "unhum_vad_output_0.9",
        "silero_vad_output_0.5",
        "silero_vad_output_0.9",
    ):
        auc_roc_score = compute_auc_roc(df, "speech_ts", column)
        print(f"AUC-ROC Score {column}: {auc_roc_score:.4f}")


# The guard keeps worker processes that re-import this module (spawn start
# method) from re-running the whole evaluation.
if __name__ == "__main__":
    main()
|
humaware_vad.jit
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3601c85aa142a1cc7ac2fbe31e225b1fe9586b66f84ac0232b1a67fa3cb75ea2
|
3 |
+
size 2271010
|