Model crashes when QPS >= 2
import torch
from PIL import Image
from vllm import LLM
from vllm.config import PoolerConfig
from vllm.inputs.data import TextPrompt
# Initialize the model. ALL pooling + normalize=False exposes the raw
# per-token embeddings, which get_embeddings() pools manually below.
model = LLM(
    model="jinaai/jina-embeddings-v4-vllm-retrieval",
    task="embed",
    override_pooler_config=PoolerConfig(pooling_type="ALL", normalize=False),
    dtype="float16",
)

# Create text prompts. The retrieval model expects the "Query: " /
# "Passage: " prefixes on text inputs.
query = "Overview of climate change impacts on coastal cities"
query_prompt = TextPrompt(
    prompt=f"Query: {query}"
)

passage = "The impacts of climate change on coastal cities are significant.."
passage_prompt = TextPrompt(
    prompt=f"Passage: {passage}"
)

# Create image prompt.
# NOTE(review): the image path is empty — supply a real file path before
# running this reproduction.
image = Image.open("")
image_prompt = TextPrompt(
    prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n",
    multi_modal_data={"image": image},
)

# Encode all prompts in a single batch.
prompts = [query_prompt, passage_prompt, image_prompt]
outputs = model.encode(prompts)
def get_embeddings(outputs):
    """Pool per-token model outputs into one normalized vector per prompt.

    For prompts containing vision tokens, only the span from the
    vision-start marker through the vision-end marker (inclusive) is
    pooled; text-only prompts pool every token. Returns a list of
    L2-normalized float32 1-D tensors.
    """
    VISION_START_TOKEN_ID, VISION_END_TOKEN_ID = 151652, 151653

    pooled_embeddings = []
    for output in outputs:
        token_ids = list(output.prompt_token_ids)
        hidden = output.outputs.data.detach().clone()

        if VISION_START_TOKEN_ID in token_ids:
            # Multimodal prompt: keep only the vision-token span, using
            # the first occurrence of each marker (inclusive of both).
            start = token_ids.index(VISION_START_TOKEN_ID)
            end = token_ids.index(VISION_END_TOKEN_ID)
            hidden = hidden[start : end + 1]
        # else: text-only prompt — pool over all tokens.

        # Mean-pool in float32 for numerical stability, then L2-normalize.
        mean_vector = hidden.sum(dim=0, dtype=torch.float32) / hidden.shape[0]
        pooled_embeddings.append(
            torch.nn.functional.normalize(mean_vector, dim=-1)
        )
    return pooled_embeddings
# Pool the raw per-token outputs into one normalized vector per prompt.
embeddings = get_embeddings(outputs)
When I wrap this code in a FastAPI service, it works fine at QPS = 1. But when the service receives requests at QPS >= 2, it raises the following error:
File "/home/bml/.storage/mnt/v-ehn3x2i8d9bd43e3/org/gqy/mmrag/embedding_model.py", line 138, in embedding
res = self.get_emb(prompt_tensor)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 1292, in inner
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1056, in encode
outputs = self._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1570, in _run_engine
step_outputs = self.llm_engine.step()
^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 1356, in step
outputs = self.model_executor.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/executor_base.py", line 141, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 2736, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 420, in execute_model
output = self.model_runner.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/pooling_model_runner.py", line 119, in execute_model
hidden_or_intermediate_states = model_executable(
^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2_5_vl.py", line 1145, in forward
hidden_states = self.language_model.model(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/compilation/decorators.py", line 173, in call
return self.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 355, in forward
hidden_states, residual = layer(
^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 254, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 184, in forward
attn_output = self.attn(q, k, v)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/layer.py", line 256, in forward
return torch.ops.vllm.unified_attention(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/_ops.py", line 1158, in call
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/layer.py", line 412, in unified_attention
output = self.impl.forward(self, query, key, value, kv_cache,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/backends/xformers.py", line 563, in forward
assert query.shape[0] == num_prefill_query_tokens
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
The following error also occurs under the same QPS >= 2 conditions:
File "/home/bml/.storage/mnt/v-ehn3x2i8d9bd43e3/org/gqy/mmrag/embedding_model.py", line 138, in embedding
res = self.get_emb(prompt_tensor)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 1292, in inner
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1056, in encode
outputs = self._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1570, in _run_engine
step_outputs = self.llm_engine.step()
^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 1356, in step
outputs = self.model_executor.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/executor_base.py", line 141, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 2736, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 420, in execute_model
output = self.model_runner.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/pooling_model_runner.py", line 119, in execute_model
hidden_or_intermediate_states = model_executable(
^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2_5_vl.py", line 1145, in forward
hidden_states = self.language_model.model(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/compilation/decorators.py", line 173, in call
return self.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 355, in forward
hidden_states, residual = layer(
^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 254, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 184, in forward
attn_output = self.attn(q, k, v)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/layer.py", line 256, in forward
return torch.ops.vllm.unified_attention(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/torch/_ops.py", line 1158, in call
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/layer.py", line 412, in unified_attention
output = self.impl.forward(self, query, key, value, kv_cache,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/attention/backends/xformers.py", line 564, in forward
assert decode_query.shape[0] == num_decode_query_tokens
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
File "/home/bml/.storage/mnt/v-ehn3x2i8d9bd43e3/org/gqy/mmrag/embedding_model.py", line 138, in embedding
res = self.get_emb(prompt_tensor)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 1292, in inner
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1056, in encode
outputs = self._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 1570, in _run_engine
step_outputs = self.llm_engine.step()
^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 1356, in step
outputs = self.model_executor.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/executor_base.py", line 141, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/utils/init.py", line 2736, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 394, in execute_model
inputs = self.prepare_input(execute_model_req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 379, in prepare_input
return self._get_driver_input_and_broadcast(execute_model_req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 341, in _get_driver_input_and_broadcast
self.model_runner.prepare_model_input(
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/pooling_model_runner.py", line 179, in prepare_model_input
model_input = self._prepare_model_input_tensors(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 1285, in _prepare_model_input_tensors
return self.builder.build() # type: ignore
^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/jina-vllm/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 913, in build
_seq_mrope_input_positions[idx])
~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^
TypeError: 'NoneType' object is not subscriptable