Model Upload

Browse files

Files changed (5) hide show

.gitattributes +1 -0
.gitignore +1 -0
README.md +69 -0
compute_performance.py +99 -0
humaware_vad.jit +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jit filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.feather

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+# HumAware-VAD: Humming-Aware Voice Activity Detection
+## 📌 Overview
+**HumAware-VAD** is a fine-tuned version of the **Silero-VAD** model, trained to distinguish **humming from actual speech**. Standard Voice Activity Detection (VAD) models, including Silero-VAD, often misclassify humming as speech, leading to inaccurate speech segmentation. HumAware-VAD improves upon this by leveraging a custom dataset (**HumSpeechBlend**) to enhance speech detection accuracy in the presence of humming.
+## 🎯 Purpose
+The primary goal of **HumAware-VAD** is to:
+- Reduce **false positives** where humming is mistakenly detected as speech.
+- Enhance **speech segmentation accuracy** in real-world applications.
+- Improve VAD performance for tasks involving **music, background noise, and vocal sounds**.
+## 🗂️ Model Details
+- **Base Model**: [Silero-VAD](https://github.com/snakers4/silero-vad/tree/master)
+- **Fine-tuning Dataset**: [HumSpeechBlend](https://huggingface.co/datasets/CuriousMonkey7/HumSpeechBlend)
+- **Format**: JIT (TorchScript)
+- **Framework**: PyTorch
+- **Inference Speed**: Real-time
+## 📥 Download & Usage
+### 🔹 Install Dependencies
+```bash
+pip install torch torchaudio
+```
+### 🔹 Load the Model
+```python
+import torch
+def load_humaware_vad(model_path="humaware_vad.jit"):
+    model = torch.jit.load(model_path)
+    model.eval()
+    return model
+vad_model = load_humaware_vad()
+```
+### 🔹 Run Inference
+```python
+import torchaudio
+waveform, sample_rate = torchaudio.load("data/0000.wav")
+out = vad_model(waveform)
+print("VAD Output:", out)
+```
+<!--
+## 🏆 Performance
+Compared to the base Silero-VAD model, **HumAware-VAD** demonstrates:
+✅ **Lower false positives for humming**
+✅ **Better segmentation of speech in mixed audio**
+✅ **Maintained real-time inference capabilities**
+## 📊 Applications
+- **Automatic Speech Recognition (ASR) Preprocessing**
+- **Noise-Robust VAD Systems**
+- **Speech Enhancement & Separation**
+- **Call Center & Voice Communication Filtering** -->
+## 📄 Citation
+If you use this model, please cite it as follows:
+```bibtex
+@misc{HumAwareVAD2025,
+  author = {Sourabh Saini},
+  title = {HumAware-VAD: Humming-Aware Voice Activity Detection},
+  year = {2025},
+  publisher = {Hugging Face},
+  url = {https://huggingface.co/CuriousMonkey7/HumAware-VAD}
+}
+```

compute_performance.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+import pandas as pd
+from concurrent.futures import ProcessPoolExecutor
+from tqdm import tqdm
+from silero_vad import  read_audio, get_speech_timestamps
+from functools import partial
+from pathlib import Path
+str(Path().resolve() / "silero-vad/src/silero-vad")
+from silero_vad import utils_vad
+# from utils_vad import init_jit_model, OnnxWrapper
+import torch
+torch.set_num_threads(1)
+def load_silero_vad(onnx=False, model_file_path=None):
+    if onnx:
+        model = utils_vad.OnnxWrapper(model_file_path, force_onnx_cpu=True)
+    else:
+        model = utils_vad.init_jit_model(model_file_path)
+    return model
+def init_worker(model_file_path):
+    """Initialize the model inside each worker process"""
+    global model
+    model = load_silero_vad(onnx=False, model_file_path=model_file_path)
+def get_vad(file, threshold):
+    if pd.isna(file):
+        return None
+    wav = read_audio(file)
+    speech_timestamps = get_speech_timestamps(
+        wav,
+        model,
+        return_seconds=True,
+        threshold=threshold
+    )
+    return speech_timestamps
+def process_vad_parallel(df, threshold, column_name, model_file_path):
+    results = []
+    with ProcessPoolExecutor(max_workers=8, initializer=partial(init_worker, model_file_path) ) as executor:
+        futures = {executor.submit(get_vad, file, threshold): i for i, file in enumerate(df["audio_path"])}
+        for future in tqdm(futures, total=len(df), desc=f"Processing {column_name}"):
+            results.append(future.result())  # Collect results
+    df[column_name] = results
+    return df
+def create_frame_labels(segments, duration, frame_size=0.01):
+    frames = np.zeros(int(duration / frame_size))
+    for seg in segments:
+        start_idx = int(seg['start'] / frame_size)
+        end_idx = int(seg['end'] / frame_size)
+        frames[start_idx:end_idx] = 1
+    return frames
+def compute_auc_roc(df, actual_col, predicted_col, frame_size=0.01):
+    max_time = max(
+        max(seg['end'] for row in df[actual_col] for seg in row),
+        max(seg['end'] for row in df[predicted_col] for seg in row)
+    )
+    gt_labels = create_frame_labels([seg for row in df[actual_col] for seg in row], max_time, frame_size)
+    pred_labels = create_frame_labels([seg for row in df[predicted_col] for seg in row], max_time, frame_size)
+    auc_roc = roc_auc_score(gt_labels, pred_labels)
+    return auc_roc
+df = pd.read_feather("./val.feather")
+model_file_path = "/home/sourabh/Desktop/dev/hum-vad/HumAware-VAD/humaware_vad.jit"
+df = process_vad_parallel(df, 0.5, "unhum_vad_output_0.5", model_file_path=model_file_path)
+df = process_vad_parallel(df, 0.9, "unhum_vad_output_0.9", model_file_path=model_file_path)
+model_file_path = "/home/sourabh/Desktop/dev/hum-vad/.venv/lib/python3.12/site-packages/silero_vad/data/silero_vad.jit"
+df = process_vad_parallel(df, 0.5, "silero_vad_output_0.5", model_file_path=model_file_path)
+df = process_vad_parallel(df, 0.9, "silero_vad_output_0.9", model_file_path=model_file_path)
+auc_roc_score = compute_auc_roc(df, "speech_ts", "unhum_vad_output_0.5")
+print(f"AUC-ROC Score: {auc_roc_score:.4f}")
+auc_roc_score = compute_auc_roc(df, "speech_ts", "unhum_vad_output_0.9")
+print(f"AUC-ROC Score unhum_vad_output_0.9: {auc_roc_score:.4f}")
+auc_roc_score = compute_auc_roc(df, "speech_ts", "silero_vad_output_0.5")
+print(f"AUC-ROC Score silero_vad_output_0.5: {auc_roc_score:.4f}")
+auc_roc_score = compute_auc_roc(df, "speech_ts", "silero_vad_output_0.9")
+print(f"AUC-ROC Score silero_vad_output_0.9: {auc_roc_score:.4f}")

humaware_vad.jit ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3601c85aa142a1cc7ac2fbe31e225b1fe9586b66f84ac0232b1a67fa3cb75ea2
+size 2271010