RuntimeError: "weight lm_head.weight does not exist" when deploying to a Hugging Face Inference Endpoint.
When I try to deploy this model on an L4 GPU (AWS) in Hugging Face Inference Endpoints, it fails every time with a "lm_head.weight does not exist" error.
Container: Text Generation Inference
Task: Image-Text-to-Text
Error Message:
Error when initializing model
Traceback (most recent call last):
File "/usr/src/.venv/bin/text-generation-server", line 10, in
sys.exit(app())
File "/usr/src/.venv/lib/python3.11/site-packages/typer/main.py", line 323, in call
return get_command(self)(*args, **kwargs)
File "/usr/src/.venv/lib/python3.11/site-packages/click/core.py", line 1161, in call
return self.main(*args, **kwargs)
File "/usr/src/.venv/lib/python3.11/site-packages/typer/core.py", line 740, in main
return _main(
File "/usr/src/.venv/lib/python3.11/site-packages/typer/core.py", line 195, in _main
rv = self.invoke(ctx)
File "/usr/src/.venv/lib/python3.11/site-packages/click/core.py", line 1697, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/src/.venv/lib/python3.11/site-packages/click/core.py", line 1443, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/src/.venv/lib/python3.11/site-packages/click/core.py", line 788, in invoke
return __callback(*args, **kwargs)
File "/usr/src/.venv/lib/python3.11/site-packages/typer/main.py", line 698, in wrapper
return callback(**use_params)
File "/usr/src/server/text_generation_server/cli.py", line 119, in serve
server.serve(
File "/usr/src/server/text_generation_server/server.py", line 313, in serve
asyncio.run(
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/base_events.py", line 641, in run_until_complete
self.run_forever()
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
self._run_once()
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
handle._run()
File "/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/python3.11/asyncio/events.py", line 84, in _run
self._context.run(self._callback, *self._args)
File "/usr/src/server/text_generation_server/server.py", line 266, in serve_inner
model = get_model_with_lora_adapters(
File "/usr/src/server/text_generation_server/models/init.py", line 1816, in get_model_with_lora_adapters
model = get_model(
File "/usr/src/server/text_generation_server/models/init.py", line 1545, in get_model
return VlmCausalLM(
File "/usr/src/server/text_generation_server/models/vlm_causal_lm.py", line 720, in init
super().init(
File "/usr/src/server/text_generation_server/models/flash_causal_lm.py", line 1269, in init
model = model_class(prefix, config, weights)
File "/usr/src/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py", line 830, in init
self.lm_head = SpeculativeHead.load(
File "/usr/src/server/text_generation_server/layers/speculative.py", line 40, in load
lm_head = TensorParallelHead.load(config, prefix, weights)
File "/usr/src/server/text_generation_server/layers/tensor_parallel.py", line 66, in load
weight = weights.get_tensor(f"{prefix}.weight")
File "/usr/src/server/text_generation_server/utils/weights.py", line 213, in get_tensor
filename, tensor_name = self.get_filename(tensor_name)
File "/usr/src/server/text_generation_server/utils/weights.py", line 192, in get_filename
raise RuntimeError(f"weight {tensor_name} does not exist")
RuntimeError: weight lm_head.weight does not exist