Morgan Funtowicz committed
Commit 1b7eead · Parent: 8550385

misc(whisper): minor

Files changed (2)
  1. Dockerfile +2 -2
  2. handler.py +27 -24
Dockerfile CHANGED
@@ -7,8 +7,8 @@ RUN --mount=type=bind,from=huggingface/endpoints-sdk:v1.0.0-beta-py312-manylinux

 COPY handler.py /opt/endpoints/

-ENV HFENDPOINT_INTERFACE=0.0.0.0
-ENV HFENDPOINT_PORT=80
+ENV INTERFACE=0.0.0.0
+ENV PORT=80

 EXPOSE 80
 ENTRYPOINT ["python3"]
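
The two renamed variables are presumably the ones the endpoint runtime reads to know which interface and port to bind. Below is a minimal sketch of how they could be consumed with plain os.environ lookups; the hfendpoints SDK may well read them through its own configuration layer, and the helper name here is hypothetical.

import os


def read_endpoint_binding() -> tuple[str, int]:
    # Hypothetical helper (not part of handler.py): read the renamed variables,
    # falling back to the defaults baked into the Dockerfile.
    interface = os.environ.get("INTERFACE", "0.0.0.0")
    port = int(os.environ.get("PORT", "80"))
    return interface, port
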
handler.py CHANGED
@@ -38,7 +38,7 @@ SUPPORTED_MODEL_ARCHITECTURES = ["WhisperForConditionalGeneration"]


 def chunk_audio_with_duration(
-    audio: np.ndarray, maximum_duration_sec: int, sampling_rate: int
+    audio: np.ndarray, maximum_duration_sec: int, sampling_rate: int
 ) -> Sequence[np.ndarray]:
     """
     Chunk a mono audio timeseries so that each chunk is as long as `maximum_duration_sec`.
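
The body of chunk_audio_with_duration is not part of this diff. A minimal sketch of duration-based chunking consistent with the signature above, assuming plain fixed-size slicing of the mono waveform:

from typing import Sequence

import numpy as np


def chunk_audio_with_duration_sketch(
    audio: np.ndarray, maximum_duration_sec: int, sampling_rate: int
) -> Sequence[np.ndarray]:
    # Hypothetical body: cut the waveform into windows of at most
    # maximum_duration_sec * sampling_rate samples; the last chunk may be shorter.
    samples_per_chunk = maximum_duration_sec * sampling_rate
    return [
        audio[start : start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]
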
@@ -67,10 +67,10 @@ def compression_ratio(text: str) -> float:


 def create_prompt(
-    audio: np.ndarray,
-    sampling_rate: int,
-    language: int,
-    timestamp_marker: int,
+    audio: np.ndarray,
+    sampling_rate: int,
+    language: int,
+    timestamp_marker: int,
 ):
     """
     Generate the right prompt with the specific parameters to submit for inference over Whisper
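
create_prompt's body is likewise outside this hunk. As a rough illustration, Whisper decoder prompts are conventionally primed with special tokens for start-of-transcript, language, task and timestamp behaviour; the helper below is a hypothetical sketch of that prefix only and ignores how the audio itself is attached to the vLLM request.

def build_decoder_prefix_sketch(language: str, with_timestamps: bool) -> str:
    # e.g. "<|startoftranscript|><|en|><|transcribe|><|0.00|>" for timestamped English.
    timestamp_marker = "<|0.00|>" if with_timestamps else "<|notimestamps|>"
    return f"<|startoftranscript|><|{language}|><|transcribe|>{timestamp_marker}"
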
@@ -97,7 +97,7 @@ def create_prompt(


 def create_params(
-    max_tokens: int, temperature: float, is_verbose: bool
+    max_tokens: int, temperature: float, is_verbose: bool
 ) -> "SamplingParams":
     """
     Create sampling parameters to submit for inference through vLLM `generate`
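
Assuming the quoted "SamplingParams" annotation refers to vLLM's class of the same name, a hedged sketch of what such a factory could return; requesting per-token logprobs only on the verbose path is an assumption, not something this diff shows.

from vllm import SamplingParams


def create_params_sketch(
    max_tokens: int, temperature: float, is_verbose: bool
) -> SamplingParams:
    # Hypothetical sketch: cap the token budget, keep the requested temperature,
    # and only ask for per-token logprobs when the verbose response needs them.
    return SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        logprobs=1 if is_verbose else None,
    )
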
@@ -127,12 +127,12 @@ def get_avg_logprob(logprobs: "SampleLogprobs") -> float:


 def process_chunk(
-    tokenizer: "PreTrainedTokenizer",
-    ids: np.ndarray,
-    logprobs: "SampleLogprobs",
-    request: TranscriptionRequest,
-    segment_offset: int,
-    timestamp_offset: int,
+    tokenizer: "PreTrainedTokenizer",
+    ids: np.ndarray,
+    logprobs: "SampleLogprobs",
+    request: TranscriptionRequest,
+    segment_offset: int,
+    timestamp_offset: int,
 ) -> Generator:
     """
     Decode a single transcribed audio chunk and generates all the segments associated
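
The hunk header references get_avg_logprob, the usual per-segment confidence signal for Whisper. A sketch of the averaging, assuming vLLM's SampleLogprobs layout (one dict of candidate Logprob objects per generated token); the real function may pick the sampled token rather than the best candidate.

def get_avg_logprob_sketch(logprobs) -> float:
    # Hypothetical sketch: average the best candidate's logprob at each decoding
    # step as a rough confidence score for the chunk.
    per_step = [max(step.values(), key=lambda lp: lp.logprob).logprob for step in logprobs]
    return sum(per_step) / max(len(per_step), 1)
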
@@ -202,9 +202,9 @@ def process_chunk(


 def process_chunks(
-    tokenizer: "PreTrainedTokenizer",
-    chunks: List["RequestOutput"],
-    request: TranscriptionRequest,
+    tokenizer: "PreTrainedTokenizer",
+    chunks: List["RequestOutput"],
+    request: TranscriptionRequest,
 ) -> Tuple[List[Segment], str]:
     """
     Iterate over all the audio chunk's outputs and consolidates outputs as segment(s) whether the response is verbose or not
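
process_chunks stitches the per-chunk outputs back together; the next hunk shows the loop feeding a running segment_offset and time_offset into process_chunk. A hedged sketch of that bookkeeping, assuming OpenAI-style segment fields (id, start, end) and a hypothetical fixed per-chunk duration:

def consolidate_segments_sketch(chunk_segment_lists, chunk_duration_sec: float):
    # Hypothetical sketch: renumber segments globally and shift their timestamps
    # by the cumulative duration of the chunks that precede them.
    segments, segment_offset, time_offset = [], 0, 0.0
    for chunk_segments in chunk_segment_lists:
        for segment in chunk_segments:
            segment.id += segment_offset
            segment.start += time_offset
            segment.end += time_offset
            segments.append(segment)
        segment_offset += len(chunk_segments)
        time_offset += chunk_duration_sec
    return segments
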
@@ -227,7 +227,7 @@ def process_chunks(
         logprobs = generation.logprobs

         for segment, _is_continuation in process_chunk(
-            tokenizer, ids, logprobs, request, segment_offset, time_offset
+            tokenizer, ids, logprobs, request, segment_offset, time_offset
         ):
             materialized_segments.append(segment)
@@ -267,12 +267,12 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
         )

     async def transcribe(
-        self,
-        ctx: Context,
-        request: TranscriptionRequest,
-        tokenizer: "PreTrainedTokenizer",
-        audio_chunks: Iterable[np.ndarray],
-        params: "SamplingParams",
+        self,
+        ctx: Context,
+        request: TranscriptionRequest,
+        tokenizer: "PreTrainedTokenizer",
+        audio_chunks: Iterable[np.ndarray],
+        params: "SamplingParams",
     ) -> (List[Segment], str):
         async def __agenerate__(request_id: str, prompt, params):
             """
@@ -323,14 +323,14 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
         return segments, text

     async def __call__(
-        self, request: TranscriptionRequest, ctx: Context
+        self, request: TranscriptionRequest, ctx: Context
     ) -> TranscriptionResponse:
         with logger.contextualize(request_id=ctx.request_id):
             with memoryview(request) as audio:

                 # Check if we need to enable the verbose path
                 is_verbose = (
-                    request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
+                    request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
                 )

                 # Retrieve the tokenizer and model config asynchronously while we decode audio
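
In the transcribe hunk above, __agenerate__ wraps vLLM's streaming generate call and the handler fans out one request per audio chunk before __call__ dispatches on the response kind. A hedged sketch of that fan-out with asyncio.gather, using a hypothetical engine handle and prompt list:

import asyncio


async def transcribe_sketch(engine, prompts, params):
    # Hypothetical sketch: run one streaming vLLM generation per audio chunk
    # concurrently and keep only the final RequestOutput of each stream.
    async def generate_one(request_id: str, prompt):
        final = None
        async for output in engine.generate(prompt, params, request_id):
            final = output
        return final

    return await asyncio.gather(
        *(generate_one(f"chunk-{index}", prompt) for index, prompt in enumerate(prompts))
    )
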
@@ -377,6 +377,9 @@ class WhisperHandler(Handler[TranscriptionRequest, TranscriptionResponse]):
                     case TranscriptionResponseKind.TEXT:
                         return TranscriptionResponse.text(text)

+                # I don't foresee any case where this would happen, but at least we are safe
+                raise ValueError(f"Invalid response_kind ({request.response_kind})")
+

 def entrypoint():
     # Retrieve endpoint configuration
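
The raise above is the only functional change in handler.py: every TranscriptionResponseKind is expected to be handled by a case arm that returns, so the fall-through should be unreachable, but it turns a silent None return into an explicit error if a new kind is ever added. A condensed sketch of the resulting dispatch, with the VERBOSE_JSON and JSON arms assumed from the surrounding code rather than shown in this diff:

def dispatch_response_sketch(request, segments, text):
    match request.response_kind:
        case TranscriptionResponseKind.VERBOSE_JSON:
            return TranscriptionResponse.verbose_json(segments, text)  # assumed arm
        case TranscriptionResponseKind.JSON:
            return TranscriptionResponse.json(text)  # assumed arm
        case TranscriptionResponseKind.TEXT:
            return TranscriptionResponse.text(text)

    # Defensive fall-through added by this commit: only reachable if a new
    # response kind is introduced without a matching case arm above.
    raise ValueError(f"Invalid response_kind ({request.response_kind})")
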
 