Spaces:
Sleeping
Sleeping
Sofia Casadei
committed on
Commit
·
061790e
1
Parent(s):
ddd255d
max continuous speech s
Browse files
- .gitignore +2 -1
- main.py +7 -5
- requirements.txt +1 -2
.gitignore
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
__pycache__/
|
2 |
logs/
|
3 |
.env
|
4 |
-
.venv/
|
|
|
|
1 |
__pycache__/
|
2 |
logs/
|
3 |
.env
|
4 |
+
.venv/
|
5 |
+
NOTES.md
|
main.py
CHANGED
@@ -76,7 +76,8 @@ transcribe_pipeline = pipeline(
|
|
76 |
torch_dtype=torch_dtype,
|
77 |
device=device,
|
78 |
)
|
79 |
-
|
|
|
80 |
|
81 |
# Warm up the model with empty audio
|
82 |
logger.info("Warming up Whisper model with dummy input")
|
@@ -91,7 +92,7 @@ async def transcribe(audio: tuple[int, np.ndarray]):
|
|
91 |
outputs = transcribe_pipeline(
|
92 |
audio_to_bytes(audio),
|
93 |
chunk_length_s=3,
|
94 |
-
batch_size=
|
95 |
generate_kwargs={
|
96 |
'task': 'transcribe',
|
97 |
'language': LANGUAGE,
|
@@ -105,20 +106,21 @@ stream = Stream(
|
|
105 |
handler=ReplyOnPause(
|
106 |
transcribe,
|
107 |
algo_options=AlgoOptions(
|
108 |
-
# Duration in seconds of audio chunks (default 0.6)
|
109 |
audio_chunk_duration=0.6,
|
110 |
# If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
|
111 |
started_talking_threshold=0.1,
|
112 |
# If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
|
113 |
speech_threshold=0.1,
|
|
|
|
|
114 |
),
|
115 |
model_options=SileroVadOptions(
|
116 |
# Threshold for what is considered speech (default 0.5)
|
117 |
threshold=0.5,
|
118 |
# Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
|
119 |
min_speech_duration_ms=250,
|
120 |
-
# Max duration of speech chunks, longer will be split at the timestamp of the last silence
|
121 |
-
# that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf'))
|
122 |
max_speech_duration_s=3,
|
123 |
# Wait for min_silence_duration_ms of silence at the end of each speech chunk before separating it (default 2000)
|
124 |
min_silence_duration_ms=100,
|
|
|
76 |
torch_dtype=torch_dtype,
|
77 |
device=device,
|
78 |
)
|
79 |
+
if device == "cuda":
|
80 |
+
transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
|
81 |
|
82 |
# Warm up the model with empty audio
|
83 |
logger.info("Warming up Whisper model with dummy input")
|
|
|
92 |
outputs = transcribe_pipeline(
|
93 |
audio_to_bytes(audio),
|
94 |
chunk_length_s=3,
|
95 |
+
batch_size=1,
|
96 |
generate_kwargs={
|
97 |
'task': 'transcribe',
|
98 |
'language': LANGUAGE,
|
|
|
106 |
handler=ReplyOnPause(
|
107 |
transcribe,
|
108 |
algo_options=AlgoOptions(
|
109 |
+
# Duration in seconds of audio chunks passed to the VAD model (default 0.6)
|
110 |
audio_chunk_duration=0.6,
|
111 |
# If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
|
112 |
started_talking_threshold=0.1,
|
113 |
# If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
|
114 |
speech_threshold=0.1,
|
115 |
+
# Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model. (default -inf)
|
116 |
+
max_continuous_speech_s=6
|
117 |
),
|
118 |
model_options=SileroVadOptions(
|
119 |
# Threshold for what is considered speech (default 0.5)
|
120 |
threshold=0.5,
|
121 |
# Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
|
122 |
min_speech_duration_ms=250,
|
123 |
+
# Max duration of speech chunks, longer will be split at the timestamp of the last silence that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf')) (used internally in the VAD algorithm to split the audio that's passed to the algorithm)
|
|
|
124 |
max_speech_duration_s=3,
|
125 |
# Wait for min_silence_duration_ms of silence at the end of each speech chunk before separating it (default 2000)
|
126 |
min_silence_duration_ms=100,
|
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
accelerate
|
2 |
-
fastrtc
|
3 |
-
fastrtc[vad]==0.0.24
|
4 |
python-dotenv
|
5 |
transformers
|
6 |
torch==2.6.0
|
|
|
1 |
accelerate
|
2 |
+
git+https://github.com/sofi444/fastrtc@break-cont-speech#egg=fastrtc[vad]
|
|
|
3 |
python-dotenv
|
4 |
transformers
|
5 |
torch==2.6.0
|