Sofia Casadei committed
Commit 061790e · 1 Parent(s): ddd255d

max continuous speech s

Files changed (3)
  1. .gitignore +2 -1
  2. main.py +7 -5
  3. requirements.txt +1 -2
.gitignore CHANGED
@@ -1,4 +1,5 @@
 __pycache__/
 logs/
 .env
-.venv/
+.venv/
+NOTES.md
main.py CHANGED
@@ -76,7 +76,8 @@ transcribe_pipeline = pipeline(
     torch_dtype=torch_dtype,
     device=device,
 )
-transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
+if device == "cuda":
+    transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
 
 # Warm up the model with empty audio
 logger.info("Warming up Whisper model with dummy input")
@@ -91,7 +92,7 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),
         chunk_length_s=3,
-        batch_size=2,
+        batch_size=1,
         generate_kwargs={
             'task': 'transcribe',
             'language': LANGUAGE,
@@ -105,20 +106,21 @@ stream = Stream(
     handler=ReplyOnPause(
         transcribe,
         algo_options=AlgoOptions(
-            # Duration in seconds of audio chunks (default 0.6)
+            # Duration in seconds of audio chunks passed to the VAD model (default 0.6)
             audio_chunk_duration=0.6,
             # If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
             started_talking_threshold=0.1,
             # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
             speech_threshold=0.1,
+            # Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model. (default -inf)
+            max_continuous_speech_s=6
         ),
         model_options=SileroVadOptions(
             # Threshold for what is considered speech (default 0.5)
             threshold=0.5,
             # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
             min_speech_duration_ms=250,
-            # Max duration of speech chunks, longer will be split at the timestamp of the last silence
-            # that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf'))
+            # Max duration of speech chunks; longer chunks are split at the timestamp of the last silence lasting more than 100 ms (if any) or just before max_speech_duration_s (default float('inf')). Used internally by the VAD algorithm to split the audio passed to it.
             max_speech_duration_s=3,
             # Wait for ms at the end of each speech chunk before separating it (default 2000)
             min_silence_duration_ms=100,
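Taken together, the main.py changes do three things: guard torch.compile behind a CUDA check (max-autotune compilation is not worth attempting on CPU), drop the transcription batch size from 2 to 1, and cap uninterrupted speech at 6 s via the new max_continuous_speech_s option so long monologues still reach the handler. A condensed sketch of the resulting configuration is below; it is not the full main.py, and the model id, LANGUAGE value, Stream keyword arguments, and the AdditionalOutputs yield are assumptions filled in around what the diff shows.

```python
import numpy as np
import torch
from transformers import pipeline
from fastrtc import (
    AdditionalOutputs,
    AlgoOptions,
    ReplyOnPause,
    SileroVadOptions,
    Stream,
    audio_to_bytes,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
LANGUAGE = "en"  # assumption: main.py loads this from its environment

transcribe_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",  # assumption: actual id set elsewhere in main.py
    torch_dtype=torch_dtype,
    device=device,
)

# The commit's guard: only compile on CUDA, where max-autotune pays off.
if device == "cuda":
    transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")


async def transcribe(audio: tuple[int, np.ndarray]):
    outputs = transcribe_pipeline(
        audio_to_bytes(audio),
        chunk_length_s=3,
        batch_size=1,  # the commit lowers this from 2
        generate_kwargs={"task": "transcribe", "language": LANGUAGE},
    )
    # Assumption: the app surfaces the transcript as an additional output.
    yield AdditionalOutputs(outputs["text"])


stream = Stream(
    handler=ReplyOnPause(
        transcribe,
        algo_options=AlgoOptions(
            audio_chunk_duration=0.6,
            started_talking_threshold=0.1,
            speech_threshold=0.1,
            # New in this commit: force a handler call after 6 s of unbroken
            # speech even if the VAD never detects a pause.
            max_continuous_speech_s=6,
        ),
        model_options=SileroVadOptions(
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=3,
            min_silence_duration_ms=100,
        ),
    ),
    modality="audio",    # assumption: standard fastrtc Stream arguments
    mode="send-receive",
)
```

Note the two duration caps operate at different levels: max_speech_duration_s (3 s) is used inside the Silero VAD to split the audio it analyzes, while max_continuous_speech_s (6 s) decides when ReplyOnPause hands audio to transcribe even without a detected pause.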
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 accelerate
-fastrtc==0.0.24
-fastrtc[vad]==0.0.24
+git+https://github.com/sofi444/fastrtc@break-cont-speech#egg=fastrtc[vad]
 python-dotenv
 transformers
 torch==2.6.0
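The requirements change swaps the two released fastrtc pins for a single VCS requirement that installs the author's fork, branch break-cont-speech, with the [vad] extra, since the released 0.0.24 does not know max_continuous_speech_s. After pip install -r requirements.txt, a quick sanity check along these lines (assuming the branch adds the field to AlgoOptions, as the main.py diff indicates) confirms the right build is installed:

```python
# Sanity check (assumption): released fastrtc 0.0.24 rejects this keyword,
# so construction only succeeds once the forked branch is installed.
from fastrtc import AlgoOptions

opts = AlgoOptions(max_continuous_speech_s=6)
print(opts.max_continuous_speech_s)
```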