What parameters should I use with vLLM?
#30 · opened by daiwk
Do I need to call apply_chat_template? I'm using the GGUF from https://hf-mirror.com/lmstudio-community/QwQ-32B-GGUF, and it seems there's no way to extract a tokenizer from it. Here is what I'm running:
```python
from vllm import LLM, SamplingParams

prompt_final = [{"role": "user", "content": "xxx"}]
tensor_parallel_size = 1
pipeline_parallel_size = 1
ckpt_path = "./QwQ-32B-Q4_K_M.gguf"

sampling_params = SamplingParams(temperature=0.6, max_tokens=1000)
batch_prompts = [prompt_final]

llm = LLM(
    model=ckpt_path,
    tensor_parallel_size=tensor_parallel_size,
    pipeline_parallel_size=pipeline_parallel_size,
    distributed_executor_backend="mp",
)
preds = llm.chat(batch_prompts, sampling_params)

for output in preds:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}\n")
```