Morgan Funtowicz committed

Commit 1b7eead · 1 Parent(s): 8550385

misc(whisper): minor

Browse files:
- Dockerfile  +2 -2
- handler.py  +27 -24
Dockerfile  CHANGED

@@ -7,8 +7,8 @@ RUN --mount=type=bind,from=huggingface/endpoints-sdk:v1.0.0-beta-py312-manylinux
 
 COPY handler.py /opt/endpoints/
 
-ENV
-ENV
+ENV INTERFACE=0.0.0.0
+ENV PORT=80
 
 EXPOSE 80
 ENTRYPOINT ["python3"]
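The two new ENV defaults suggest the handler's entrypoint binds its HTTP server from environment variables. Below is a minimal sketch of how that configuration might be read in handler.py's entrypoint(); the use of os.getenv and the print call are assumptions for illustration, only the INTERFACE/PORT names and their defaults come from the diff.

import os

def entrypoint_sketch():
    # Hypothetical sketch: read the interface/port injected by the Dockerfile.
    # The real entrypoint() may wire these into the endpoints SDK differently.
    interface = os.getenv("INTERFACE", "0.0.0.0")  # default matches the Dockerfile
    port = int(os.getenv("PORT", "80"))            # default matches EXPOSE 80
    print(f"binding transcription handler on {interface}:{port}")
    # ... start the serving loop here (framework-specific) ...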
handler.py  CHANGED
@@ -38,7 +38,7 @@ SUPPORTED_MODEL_ARCHITECTURES = ["WhisperForConditionalGeneration"]
 
 
 def chunk_audio_with_duration(
-
+    audio: np.ndarray, maximum_duration_sec: int, sampling_rate: int
 ) -> Sequence[np.ndarray]:
     """
     Chunk a mono audio timeseries so that each chunk is as long as `maximum_duration_sec`.
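Chunking a mono waveform into fixed-duration windows amounts to slicing it every maximum_duration_sec * sampling_rate samples. A hedged sketch of what chunk_audio_with_duration plausibly does; the slicing strategy is an assumption, the commit only shows the signature and docstring.

import numpy as np
from typing import Sequence

def chunk_audio_with_duration_sketch(
    audio: np.ndarray, maximum_duration_sec: int, sampling_rate: int
) -> Sequence[np.ndarray]:
    # Maximum number of samples each chunk may contain.
    samples_per_chunk = maximum_duration_sec * sampling_rate
    # Slice the waveform; the last chunk may be shorter than the others.
    return [
        audio[start : start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]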
@@ -67,10 +67,10 @@ def compression_ratio(text: str) -> float:
 
 
 def create_prompt(
-
-
-
-
+    audio: np.ndarray,
+    sampling_rate: int,
+    language: int,
+    timestamp_marker: int,
 ):
     """
     Generate the right prompt with the specific parameters to submit for inference over Whisper
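Since the handler drives Whisper through vLLM, create_prompt most likely returns a prompt dict pairing the decoder's special tokens with the raw audio as multi-modal data. A sketch of one plausible shape; the exact token layout and the multi_modal_data convention are assumptions based on how vLLM commonly accepts audio inputs, not code from this commit, and the diff types language/timestamp_marker as int (presumably token ids) where the sketch uses token strings for readability.

import numpy as np

def create_prompt_sketch(
    audio: np.ndarray, sampling_rate: int, language: str, timestamp_marker: str
) -> dict:
    # Whisper conditions decoding on special tokens: start of transcript,
    # language, task, and whether timestamps should be emitted.
    decoder_prefix = f"<|startoftranscript|>{language}<|transcribe|>{timestamp_marker}"
    return {
        "prompt": decoder_prefix,
        # vLLM's audio multi-modal input is commonly a (waveform, sampling_rate) pair.
        "multi_modal_data": {"audio": (audio, sampling_rate)},
    }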
@@ -97,7 +97,7 @@ def create_prompt(
 
 
 def create_params(
-
+    max_tokens: int, temperature: float, is_verbose: bool
 ) -> "SamplingParams":
     """
     Create sampling parameters to submit for inference through vLLM `generate`
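create_params maps the request settings onto vLLM sampling options; segment statistics such as avg_logprob (see get_avg_logprob) only make sense when log-probabilities are requested, which is presumably why the verbose flag is passed in. A minimal sketch under that assumption; tying logprobs to is_verbose is my reading, only the signature comes from the diff.

from vllm import SamplingParams

def create_params_sketch(
    max_tokens: int, temperature: float, is_verbose: bool
) -> SamplingParams:
    return SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        # Verbose responses need per-token log-probabilities to compute
        # segment statistics such as avg_logprob.
        logprobs=1 if is_verbose else None,
    )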
@@ -127,12 +127,12 @@ def get_avg_logprob(logprobs: "SampleLogprobs") -> float:
 
 
 def process_chunk(
-
-
-
-
-
-
+    tokenizer: "PreTrainedTokenizer",
+    ids: np.ndarray,
+    logprobs: "SampleLogprobs",
+    request: TranscriptionRequest,
+    segment_offset: int,
+    timestamp_offset: int,
 ) -> Generator:
     """
     Decode a single transcribed audio chunk and generates all the segments associated
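process_chunk walks the generated token ids and splits them into segments; with Whisper that is typically done at the timestamp tokens (<|0.00|>, <|0.02|>, ...). A hedged sketch of that splitting logic; the timestamp-token heuristic, the 0.02 s step, and the dict-shaped segments are assumptions, the commit only exposes the signature and docstring.

def split_on_timestamps_sketch(tokenizer, ids, timestamp_offset: float):
    # Whisper places timestamp tokens after the regular vocabulary; every id
    # at or beyond <|0.00|> marks a segment boundary.
    first_timestamp_id = tokenizer.convert_tokens_to_ids("<|0.00|>")
    segments, current = [], []
    start = timestamp_offset
    for token_id in ids:
        if token_id >= first_timestamp_id:
            # Timestamp tokens encode time in 0.02 s increments.
            ts = timestamp_offset + 0.02 * (token_id - first_timestamp_id)
            if current:
                segments.append(
                    {"start": start, "end": ts, "text": tokenizer.decode(current)}
                )
                current = []
            start = ts
        else:
            current.append(token_id)
    return segments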
@@ -202,9 +202,9 @@ def process_chunk(
 
 
 def process_chunks(
-
-
-
+    tokenizer: "PreTrainedTokenizer",
+    chunks: List["RequestOutput"],
+    request: TranscriptionRequest,
 ) -> Tuple[List[Segment], str]:
     """
     Iterate over all the audio chunk's outputs and consolidates outputs as segment(s) whether the response is verbose or not
@@ -227,7 +227,7 @@ def process_chunks(
         logprobs = generation.logprobs
 
         for segment, _is_continuation in process_chunk(
-
+            tokenizer, ids, logprobs, request, segment_offset, time_offset
         ):
             materialized_segments.append(segment)
 
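Between chunks, process_chunks has to keep timestamps and segment ids monotonically increasing, which is why process_chunk receives segment_offset and time_offset. A small sketch of how those offsets might be carried across iterations; the update rule, the chunk_duration_sec parameter, and the vLLM output attributes used here are assumptions, only the process_chunk call shape comes from the diff.

def consolidate_sketch(tokenizer, chunks, request, chunk_duration_sec: float):
    materialized_segments, segment_offset, time_offset = [], 0, 0.0
    for chunk in chunks:                       # one vLLM RequestOutput per audio chunk
        generation = chunk.outputs[0]          # assume a single candidate per request
        ids, logprobs = generation.token_ids, generation.logprobs
        for segment, _is_continuation in process_chunk(
            tokenizer, ids, logprobs, request, segment_offset, time_offset
        ):
            materialized_segments.append(segment)
        # Shift the offsets so the next chunk's segments line up after this one.
        segment_offset = len(materialized_segments)
        time_offset += chunk_duration_sec
    return materialized_segments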
@@ -267,12 +267,12 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
         )
 
     async def transcribe(
-
-
-
-
-
-
+        self,
+        ctx: Context,
+        request: TranscriptionRequest,
+        tokenizer: "PreTrainedTokenizer",
+        audio_chunks: Iterable[np.ndarray],
+        params: "SamplingParams",
     ) -> (List[Segment], str):
         async def __agenerate__(request_id: str, prompt, params):
             """
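transcribe receives the already-chunked audio, so a natural implementation fans each chunk out to vLLM's async engine, gathers the final RequestOutput per chunk, and hands everything to process_chunks. A hedged sketch of that fan-out; the self._engine attribute, the AsyncLLMEngine-style generate() call, the request.language / request.timestamp_marker field names, and the 16 kHz sampling rate are all assumptions.

import asyncio

async def transcribe_sketch(self, ctx, request, tokenizer, audio_chunks, params):
    async def __agenerate__(request_id, prompt, params):
        # Drain the async generator and keep only the final RequestOutput.
        output = None
        async for output in self._engine.generate(prompt, params, request_id):
            pass
        return output

    # Build one prompt per chunk (16000 Hz assumed, as Whisper expects).
    prompts = [
        create_prompt(chunk, 16000, request.language, request.timestamp_marker)
        for chunk in audio_chunks
    ]
    outputs = await asyncio.gather(*[
        __agenerate__(f"{ctx.request_id}-{i}", prompt, params)
        for i, prompt in enumerate(prompts)
    ])
    return process_chunks(tokenizer, outputs, request)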
@@ -323,14 +323,14 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
         return segments, text
 
     async def __call__(
-
+        self, request: TranscriptionRequest, ctx: Context
     ) -> TranscriptionResponse:
         with logger.contextualize(request_id=ctx.request_id):
             with memoryview(request) as audio:
 
                 # Check if we need to enable the verbose path
                 is_verbose = (
-
+                    request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
                 )
 
                 # Retrieve the tokenizer and model config asynchronously while we decode audio
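__call__ works on the raw request buffer, so before anything reaches vLLM the audio bytes have to be decoded into the mono 16 kHz float waveform Whisper expects. A sketch of that decode step; the use of soundfile and the naive resampling are assumptions, the commit only indicates that audio decoding happens alongside the tokenizer/config fetch.

import io
import numpy as np
import soundfile as sf

def decode_audio_sketch(buffer: memoryview, target_rate: int = 16000) -> np.ndarray:
    # Read whatever container the client sent (wav/flac/ogg, ...) into float32 PCM.
    waveform, rate = sf.read(io.BytesIO(bytes(buffer)), dtype="float32")
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)       # downmix to mono
    if rate != target_rate:
        # Crude linear-interpolation resample; a real handler would use a proper resampler.
        target_len = int(len(waveform) / rate * target_rate)
        waveform = np.interp(
            np.linspace(0.0, len(waveform) - 1, target_len),
            np.arange(len(waveform)),
            waveform,
        )
    return waveform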
@@ -377,6 +377,9 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
             case TranscriptionResponseKind.TEXT:
                 return TranscriptionResponse.text(text)
 
+        # I don't forsee any case this would happen but at least we are safe
+        raise ValueError(f"Invalid response_kind ({request.response_kind})")
+
 
 def entrypoint():
     # Retrieve endpoint configuration
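The new raise makes the response-kind dispatch total: every enum member either returns a response or falls through to an explicit error instead of an implicit None. A tiny sketch of the resulting pattern; the VERBOSE_JSON and JSON branches and their constructors are assumptions inferred from the rest of the diff, only the TEXT case and the fallback are visible in this hunk.

def dispatch_sketch(request, segments, text):
    match request.response_kind:
        case TranscriptionResponseKind.VERBOSE_JSON:
            return TranscriptionResponse.verbose(segments, text)  # assumed constructor
        case TranscriptionResponseKind.JSON:
            return TranscriptionResponse.json(text)                # assumed constructor
        case TranscriptionResponseKind.TEXT:
            return TranscriptionResponse.text(text)
    # Unreachable with today's enum, but protects against future members.
    raise ValueError(f"Invalid response_kind ({request.response_kind})")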