# VocRT / faster-whisper.py
# Author: Anurag
# version-2 initial version (commit 5306da4)
import asyncio
import numpy as np
import time
from faster_whisper import WhisperModel
import pyaudio
import wave
import threading
from collections import deque
import os
from silero_vad import load_silero_vad, VADIterator
sampling_rate = 16_000
vad_model = load_silero_vad()
vad_iter = VADIterator(vad_model, sampling_rate=sampling_rate)
frame_size = 512
PRE_CHUNK_LIMIT_BYTES = frame_size * 2 * 20
class FasterWhisperTerminalTest:
    """Microphone -> Silero VAD -> Faster-Whisper transcription demo.

    Captures 16 kHz mono int16 audio via PyAudio, gates it with the
    module-level Silero VAD iterator (``vad_iter``), and transcribes each
    utterance with a CPU int8 Whisper "small" model once ~0.4 s of
    trailing silence follows detected speech.
    """

    def __init__(self):
        # Whisper model sized for a CPU-only machine (e.g. M1 / 8 GB RAM):
        # int8 weights keep memory low; 4 threads match performance cores.
        self.model = WhisperModel(
            "small",
            device="cpu",
            compute_type="int8",
            cpu_threads=4,
            download_root="./models",
        )

        # Capture settings — must match the module-level VAD sampling rate.
        self.sample_rate = 16000            # Hz
        self.chunk_size = 512               # samples per callback (= VAD frame)
        self.channels = 1
        self.format = pyaudio.paInt16

        # Recording / segmentation state.
        self.is_recording = False
        self.audio_buffer = deque()         # int16 samples of current utterance
        self.silence_threshold = 500        # NOTE: unused legacy RMS threshold
        self.silence_duration = 0.4         # seconds of silence ending an utterance
        self.silence_counter = 0.0          # trailing silence seen so far (s)
        # Rolling pre-roll so the first syllable of speech isn't clipped.
        # NOTE(review): maxlen counts int16 *samples*, despite the
        # constant's BYTES name — confirm intended pre-roll length.
        self.pre_chunks = deque(maxlen=PRE_CHUNK_LIMIT_BYTES)
        self.audio_count = 0                # speech chunks in current utterance
        self.in_speech = False              # True between VAD "start" and "end"

        print("Faster-Whisper Terminal Test Initialized")
        print("Model loaded successfully")

    def start_recording(self):
        """Open the default input device and begin streaming audio."""
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            stream_callback=self.audio_callback,
        )
        self.is_recording = True
        self.stream.start_stream()
        print("Recording started. Speak into microphone...")
        print("Press Ctrl+C to stop")

    def audio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: VAD-gate each chunk, trigger transcription.

        Runs on PyAudio's internal thread. Audio is buffered only while
        the VAD reports speech (plus a short pre-roll); once enough
        trailing silence accumulates, a daemon thread transcribes the
        buffered utterance. Always returns (in_data, paContinue) to keep
        the stream alive.
        """
        if self.is_recording:
            audio_data = np.frombuffer(in_data, dtype=np.int16)
            # Maintain the rolling pre-roll window (includes current chunk).
            self.pre_chunks.extend(audio_data)

            float_chunk = audio_data.astype(np.float32) / 32768.0
            vad_result = vad_iter(float_chunk)
            if vad_result:
                if "start" in vad_result:
                    self.in_speech = True
                    # Seed the utterance with the pre-roll; it already
                    # contains the current chunk, so don't add it twice.
                    # (BUGFIX: the original also extended audio_buffer
                    # unconditionally at the top of the callback, which
                    # duplicated every speech chunk and recorded silence.)
                    self.audio_buffer.extend(self.pre_chunks)
                if "end" in vad_result:
                    self.in_speech = False
            elif self.in_speech:
                self.audio_buffer.extend(audio_data)

            if self.in_speech:
                self.silence_counter = 0.0
                self.audio_count += 1
            else:
                # Accumulate trailing silence, in seconds.
                # (BUGFIX: use self.sample_rate rather than a shadowing
                # hard-coded 16000 local.)
                self.silence_counter += len(audio_data) / self.sample_rate

            # Enough silence after speech -> hand the utterance to Whisper.
            if self.silence_counter >= self.silence_duration and len(self.audio_buffer) > 0:
                self.silence_counter = 0.0
                if self.audio_count < 2:
                    # Too little speech to be a real utterance — discard.
                    # (BUGFIX: also clear the buffer so stale audio does
                    # not leak into the next utterance.)
                    self.audio_count = 0
                    self.audio_buffer.clear()
                    return (in_data, pyaudio.paContinue)
                self.audio_count = 0
                print("Silence ")
                threading.Thread(target=self.process_audio,
                                 daemon=True).start()

        return (in_data, pyaudio.paContinue)

    def process_audio(self):
        """Transcribe the buffered utterance and print the result.

        Runs on a worker thread: snapshots and clears the shared buffer,
        skips clips shorter than 0.5 s, then prints the transcription
        with timing and detected-language info.
        """
        if len(self.audio_buffer) == 0:
            return

        # Snapshot then clear. NOTE(review): the PyAudio callback thread
        # may append concurrently; samples arriving between these two
        # statements are dropped — acceptable for this demo.
        audio_array = np.array(self.audio_buffer, dtype=np.float32) / 32768.0
        self.audio_buffer.clear()
        self.silence_counter = 0.0

        if len(audio_array) < self.sample_rate * 0.5:  # skip very short clips
            return

        print("\n🎀 Processing audio...")
        start_time = time.time()
        try:
            segments, info = self.model.transcribe(
                audio_array,
                language="en",
                beam_size=2,        # small beam -> faster decoding
                vad_filter=True,    # second-stage VAD inside Whisper
                vad_parameters=dict(
                    min_speech_duration_ms=500,
                    max_speech_duration_s=60,
                ),
            )
            # segments is a lazy generator; joining consumes it.
            transcription = "".join(segment.text for segment in segments)
            processing_time = time.time() - start_time

            if transcription.strip():
                print(f"πŸ“ Transcription: {transcription.strip()}")
                print(f"⏱️ Processing time: {processing_time:.2f}s")
                print(
                    f"🌍 Language: {info.language} (confidence: {info.language_probability:.2f})")
                print("-" * 50)
            else:
                print("πŸ”‡ No speech detected")
        except Exception as e:
            # Broad catch is deliberate: a failed transcription must not
            # kill the worker thread or the recording loop.
            print(f"❌ Error during transcription: {e}")

    def stop_recording(self):
        """Stop the stream and release all PyAudio resources."""
        self.is_recording = False
        if hasattr(self, 'stream'):
            self.stream.stop_stream()
            self.stream.close()
        if hasattr(self, 'audio'):
            self.audio.terminate()
        print("\nπŸ›‘ Recording stopped")
def test_faster_whisper():
    """Run the live-microphone Faster-Whisper demo until Ctrl+C."""
    app = FasterWhisperTerminalTest()
    try:
        app.start_recording()
        # Idle on the main thread; all audio work happens on PyAudio's
        # callback thread and the transcription worker threads.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\n\nπŸ›‘ Stopping test...")
        app.stop_recording()
        print("βœ… Faster-Whisper test completed")
# Script entry point: run the live microphone demo when executed directly.
if __name__ == "__main__":
    test_faster_whisper()