# VocRT / faster-whisper.py
# Author: Anurag
# version-2 initial version (commit 5306da4)
import asyncio
import numpy as np
import time
from faster_whisper import WhisperModel
import pyaudio
import wave
import threading
from collections import deque
import os
from silero_vad import load_silero_vad, VADIterator
sampling_rate = 16_000
vad_model = load_silero_vad()
vad_iter = VADIterator(vad_model, sampling_rate=sampling_rate)
frame_size = 512
PRE_CHUNK_LIMIT_BYTES = frame_size * 2 * 20
class FasterWhisperTerminalTest:
    """Microphone -> Silero VAD -> Faster-Whisper transcription demo.

    Captures 16 kHz mono int16 audio via PyAudio, gates it with the
    module-level Silero VAD iterator (``vad_iter``), and transcribes each
    utterance with a CPU int8 Whisper "small" model once ~0.4 s of
    trailing silence follows detected speech.
    """

    def __init__(self):
        # Whisper model sized for a CPU-only machine (e.g. M1 / 8 GB RAM):
        # int8 weights keep memory low; 4 threads match performance cores.
        self.model = WhisperModel(
            "small",
            device="cpu",
            compute_type="int8",
            cpu_threads=4,
            download_root="./models",
        )

        # Capture settings — must match the module-level VAD sampling rate.
        self.sample_rate = 16000            # Hz
        self.chunk_size = 512               # samples per callback (= VAD frame)
        self.channels = 1
        self.format = pyaudio.paInt16

        # Recording / segmentation state.
        self.is_recording = False
        self.audio_buffer = deque()         # int16 samples of current utterance
        self.silence_threshold = 500        # NOTE: unused legacy RMS threshold
        self.silence_duration = 0.4         # seconds of silence ending an utterance
        self.silence_counter = 0.0          # trailing silence seen so far (s)
        # Rolling pre-roll so the first syllable of speech isn't clipped.
        # NOTE(review): maxlen counts int16 *samples*, despite the
        # constant's BYTES name — confirm intended pre-roll length.
        self.pre_chunks = deque(maxlen=PRE_CHUNK_LIMIT_BYTES)
        self.audio_count = 0                # speech chunks in current utterance
        self.in_speech = False              # True between VAD "start" and "end"

        print("Faster-Whisper Terminal Test Initialized")
        print("Model loaded successfully")

    def start_recording(self):
        """Open the default input device and begin streaming audio."""
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            stream_callback=self.audio_callback,
        )
        self.is_recording = True
        self.stream.start_stream()
        print("Recording started. Speak into microphone...")
        print("Press Ctrl+C to stop")

    def audio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: VAD-gate each chunk, trigger transcription.

        Runs on PyAudio's internal thread. Audio is buffered only while
        the VAD reports speech (plus a short pre-roll); once enough
        trailing silence accumulates, a daemon thread transcribes the
        buffered utterance. Always returns (in_data, paContinue) to keep
        the stream alive.
        """
        if self.is_recording:
            audio_data = np.frombuffer(in_data, dtype=np.int16)
            # Maintain the rolling pre-roll window (includes current chunk).
            self.pre_chunks.extend(audio_data)

            float_chunk = audio_data.astype(np.float32) / 32768.0
            vad_result = vad_iter(float_chunk)
            if vad_result:
                if "start" in vad_result:
                    self.in_speech = True
                    # Seed the utterance with the pre-roll; it already
                    # contains the current chunk, so don't add it twice.
                    # (BUGFIX: the original also extended audio_buffer
                    # unconditionally at the top of the callback, which
                    # duplicated every speech chunk and recorded silence.)
                    self.audio_buffer.extend(self.pre_chunks)
                if "end" in vad_result:
                    self.in_speech = False
            elif self.in_speech:
                self.audio_buffer.extend(audio_data)

            if self.in_speech:
                self.silence_counter = 0.0
                self.audio_count += 1
            else:
                # Accumulate trailing silence, in seconds.
                # (BUGFIX: use self.sample_rate rather than a shadowing
                # hard-coded 16000 local.)
                self.silence_counter += len(audio_data) / self.sample_rate

            # Enough silence after speech -> hand the utterance to Whisper.
            if self.silence_counter >= self.silence_duration and len(self.audio_buffer) > 0:
                self.silence_counter = 0.0
                if self.audio_count < 2:
                    # Too little speech to be a real utterance — discard.
                    # (BUGFIX: also clear the buffer so stale audio does
                    # not leak into the next utterance.)
                    self.audio_count = 0
                    self.audio_buffer.clear()
                    return (in_data, pyaudio.paContinue)
                self.audio_count = 0
                print("Silence ")
                threading.Thread(target=self.process_audio,
                                 daemon=True).start()

        return (in_data, pyaudio.paContinue)

    def process_audio(self):
        """Transcribe the buffered utterance and print the result.

        Runs on a worker thread: snapshots and clears the shared buffer,
        skips clips shorter than 0.5 s, then prints the transcription
        with timing and detected-language info.
        """
        if len(self.audio_buffer) == 0:
            return

        # Snapshot then clear. NOTE(review): the PyAudio callback thread
        # may append concurrently; samples arriving between these two
        # statements are dropped — acceptable for this demo.
        audio_array = np.array(self.audio_buffer, dtype=np.float32) / 32768.0
        self.audio_buffer.clear()
        self.silence_counter = 0.0

        if len(audio_array) < self.sample_rate * 0.5:  # skip very short clips
            return

        print("\n🎀 Processing audio...")
        start_time = time.time()
        try:
            segments, info = self.model.transcribe(
                audio_array,
                language="en",
                beam_size=2,        # small beam -> faster decoding
                vad_filter=True,    # second-stage VAD inside Whisper
                vad_parameters=dict(
                    min_speech_duration_ms=500,
                    max_speech_duration_s=60,
                ),
            )
            # segments is a lazy generator; joining consumes it.
            transcription = "".join(segment.text for segment in segments)
            processing_time = time.time() - start_time

            if transcription.strip():
                print(f"πŸ“ Transcription: {transcription.strip()}")
                print(f"⏱️ Processing time: {processing_time:.2f}s")
                print(
                    f"🌍 Language: {info.language} (confidence: {info.language_probability:.2f})")
                print("-" * 50)
            else:
                print("πŸ”‡ No speech detected")
        except Exception as e:
            # Broad catch is deliberate: a failed transcription must not
            # kill the worker thread or the recording loop.
            print(f"❌ Error during transcription: {e}")

    def stop_recording(self):
        """Stop the stream and release all PyAudio resources."""
        self.is_recording = False
        if hasattr(self, 'stream'):
            self.stream.stop_stream()
            self.stream.close()
        if hasattr(self, 'audio'):
            self.audio.terminate()
        print("\nπŸ›‘ Recording stopped")
def test_faster_whisper():
    """Run the live-microphone Faster-Whisper demo until Ctrl+C."""
    app = FasterWhisperTerminalTest()
    try:
        app.start_recording()
        # Idle on the main thread; all audio work happens on PyAudio's
        # callback thread and the transcription worker threads.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\n\nπŸ›‘ Stopping test...")
        app.stop_recording()
        print("βœ… Faster-Whisper test completed")
# Script entry point: run the live microphone demo when executed directly.
if __name__ == "__main__":
    test_faster_whisper()