Sofia Casadei committed
Commit 061790e · 1 Parent(s): ddd255d

max continuous speech s

Files changed (3)
  1. .gitignore +2 -1
  2. main.py +7 -5
  3. requirements.txt +1 -2
.gitignore CHANGED
@@ -1,4 +1,5 @@
 __pycache__/
 logs/
 .env
-.venv/
+.venv/
+NOTES.md
main.py CHANGED
@@ -76,7 +76,8 @@ transcribe_pipeline = pipeline(
     torch_dtype=torch_dtype,
     device=device,
 )
-transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
+if device == "cuda":
+    transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
 
 # Warm up the model with empty audio
 logger.info("Warming up Whisper model with dummy input")
@@ -91,7 +92,7 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),
         chunk_length_s=3,
-        batch_size=2,
+        batch_size=1,
         generate_kwargs={
             'task': 'transcribe',
             'language': LANGUAGE,
@@ -105,20 +106,21 @@ stream = Stream(
     handler=ReplyOnPause(
         transcribe,
         algo_options=AlgoOptions(
-            # Duration in seconds of audio chunks (default 0.6)
+            # Duration in seconds of audio chunks passed to the VAD model (default 0.6)
             audio_chunk_duration=0.6,
             # If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
             started_talking_threshold=0.1,
             # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
             speech_threshold=0.1,
+            # Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model. (default -inf)
+            max_continuous_speech_s=6
         ),
         model_options=SileroVadOptions(
             # Threshold for what is considered speech (default 0.5)
             threshold=0.5,
             # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
             min_speech_duration_ms=250,
-            # Max duration of speech chunks, longer will be split at the timestamp of the last silence
-            # that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf'))
+            # Max duration of speech chunks; longer chunks are split at the timestamp of the last silence lasting more than 100 ms (if any) or just before max_speech_duration_s (default float('inf')). Used internally by the VAD algorithm to split the audio passed to it.
             max_speech_duration_s=3,
             # Wait for ms at the end of each speech chunk before separating it (default 2000)
             min_silence_duration_ms=100,
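Taken together, the main.py changes do three things: guard torch.compile behind a CUDA check (max-autotune compilation is not worth attempting on CPU), drop the transcription batch size from 2 to 1, and cap uninterrupted speech at 6 s via the new max_continuous_speech_s option so long monologues still reach the handler. A condensed sketch of the resulting configuration is below; it is not the full main.py, and the model id, LANGUAGE value, Stream keyword arguments, and the AdditionalOutputs yield are assumptions filled in around what the diff shows.

```python
import numpy as np
import torch
from transformers import pipeline
from fastrtc import (
    AdditionalOutputs,
    AlgoOptions,
    ReplyOnPause,
    SileroVadOptions,
    Stream,
    audio_to_bytes,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
LANGUAGE = "en"  # assumption: main.py loads this from its environment

transcribe_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",  # assumption: actual id set elsewhere in main.py
    torch_dtype=torch_dtype,
    device=device,
)

# The commit's guard: only compile on CUDA, where max-autotune pays off.
if device == "cuda":
    transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")


async def transcribe(audio: tuple[int, np.ndarray]):
    outputs = transcribe_pipeline(
        audio_to_bytes(audio),
        chunk_length_s=3,
        batch_size=1,  # the commit lowers this from 2
        generate_kwargs={"task": "transcribe", "language": LANGUAGE},
    )
    # Assumption: the app surfaces the transcript as an additional output.
    yield AdditionalOutputs(outputs["text"])


stream = Stream(
    handler=ReplyOnPause(
        transcribe,
        algo_options=AlgoOptions(
            audio_chunk_duration=0.6,
            started_talking_threshold=0.1,
            speech_threshold=0.1,
            # New in this commit: force a handler call after 6 s of unbroken
            # speech even if the VAD never detects a pause.
            max_continuous_speech_s=6,
        ),
        model_options=SileroVadOptions(
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=3,
            min_silence_duration_ms=100,
        ),
    ),
    modality="audio",    # assumption: standard fastrtc Stream arguments
    mode="send-receive",
)
```

Note the two duration caps operate at different levels: max_speech_duration_s (3 s) is used inside the Silero VAD to split the audio it analyzes, while max_continuous_speech_s (6 s) decides when ReplyOnPause hands audio to transcribe even without a detected pause.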
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 accelerate
-fastrtc==0.0.24
-fastrtc[vad]==0.0.24
+git+https://github.com/sofi444/fastrtc@break-cont-speech#egg=fastrtc[vad]
 python-dotenv
 transformers
 torch==2.6.0
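The requirements change swaps the two released fastrtc pins for a single VCS requirement that installs the author's fork, branch break-cont-speech, with the [vad] extra, since the released 0.0.24 does not know max_continuous_speech_s. After pip install -r requirements.txt, a quick sanity check along these lines (assuming the branch adds the field to AlgoOptions, as the main.py diff indicates) confirms the right build is installed:

```python
# Sanity check (assumption): released fastrtc 0.0.24 rejects this keyword,
# so construction only succeeds once the forked branch is installed.
from fastrtc import AlgoOptions

opts = AlgoOptions(max_continuous_speech_s=6)
print(opts.max_continuous_speech_s)
```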