The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache (130304)

#14
by DrNicefellow - opened

vLLM 0.7.3, as suggested on GitHub


from orpheus_tts import OrpheusModel
import wave
import time

model = OrpheusModel(model_name="canopylabs/orpheus-tts-0.1-finetune-prod")
prompt = '''Man, the way social media has, um, completely changed how we interact is just wild, right? Like, we're all connected 24/7 but somehow people feel more alone than ever. And don't even get me started on how it's messing with kids' self-esteem and mental health and whatnot.'''

start_time = time.monotonic()
syn_tokens = model.generate_speech(
    prompt=prompt,
    voice="tara",
)

with wave.open("output.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)

    total_frames = 0
    chunk_counter = 0
    for audio_chunk in syn_tokens:  # output streaming
        chunk_counter += 1
        frame_count = len(audio_chunk) // (wf.getsampwidth() * wf.getnchannels())
        total_frames += frame_count
        wf.writeframes(audio_chunk)
    duration = total_frames / wf.getframerate()

end_time = time.monotonic()
print(f"It took {end_time - start_time} seconds to generate {duration:.2f} seconds of audio")


ValueError Traceback (most recent call last)
Cell In[1], line 5
2 import wave
3 import time
----> 5 model = OrpheusModel(model_name ="canopylabs/orpheus-tts-0.1-finetune-prod")
6 prompt = '''Man, the way social media has, um, completely changed how we interact is just wild, right? Like, we're all connected 24/7 but somehow people feel more alone than ever. And don't even get me started on how it's messing with kids' self-esteem and mental health and whatnot.'''
8 start_time = time.monotonic()

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/orpheus_tts/engine_class.py:13, in OrpheusModel.__init__(self, model_name, dtype)
11 self.model_name = self._map_model_params(model_name)
12 self.dtype = dtype
---> 13 self.engine = self._setup_engine()
14 self.available_voices = ["zoe", "zac","jess", "leo", "mia", "julia", "leah"]
15 self.tokeniser = AutoTokenizer.from_pretrained(model_name)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/orpheus_tts/engine_class.py:46, in OrpheusModel._setup_engine(self)
41 def _setup_engine(self):
42 engine_args = AsyncEngineArgs(
43 model=self.model_name,
44 dtype=self.dtype,
45 )
---> 46 return AsyncLLMEngine.from_engine_args(engine_args)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py:644, in AsyncLLMEngine.from_engine_args(cls, engine_args, engine_config, start_engine_loop, usage_context, stat_loggers)
641 executor_class = cls._get_executor_cls(engine_config)
643 # Create the async LLM engine.
--> 644 engine = cls(
645 vllm_config=engine_config,
646 executor_class=executor_class,
647 log_requests=not engine_args.disable_log_requests,
648 log_stats=not engine_args.disable_log_stats,
649 start_engine_loop=start_engine_loop,
650 usage_context=usage_context,
651 stat_loggers=stat_loggers,
652 )
653 return engine

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py:594, in AsyncLLMEngine.__init__(self, log_requests, start_engine_loop, *args, **kwargs)
588 def __init__(self,
589 *args,
590 log_requests: bool = True,
591 start_engine_loop: bool = True,
592 **kwargs) -> None:
593 self.log_requests = log_requests
--> 594 self.engine = self._engine_class(*args, **kwargs)
596 # This ensures quick processing of request outputs
597 # so the append to asyncio queues is not delayed,
598 # especially for multi-step.
599 self.use_process_request_outputs_callback = (
600 self.engine.model_config.use_async_output_proc)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py:267, in _AsyncLLMEngine.__init__(self, *args, **kwargs)
266 def __init__(self, *args, **kwargs):
--> 267 super().__init__(*args, **kwargs)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/engine/llm_engine.py:276, in LLMEngine.__init__(self, vllm_config, executor_class, log_stats, usage_context, stat_loggers, input_registry, mm_registry, use_cached_outputs)
273 self.model_executor = executor_class(vllm_config=vllm_config, )
275 if self.model_config.runner_type != "pooling":
--> 276 self._initialize_kv_caches()
278 # If usage stat is enabled, collect relevant info.
279 if is_usage_stats_enabled():

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/engine/llm_engine.py:434, in LLMEngine._initialize_kv_caches(self)
431 self.cache_config.num_gpu_blocks = num_gpu_blocks
432 self.cache_config.num_cpu_blocks = num_cpu_blocks
--> 434 self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
435 elapsed = time.time() - start
436 logger.info(("init engine (profile, create kv cache, "
437 "warmup model) took %.2f seconds"), elapsed)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/executor/executor_base.py:122, in ExecutorBase.initialize_cache(self, num_gpu_blocks, num_cpu_blocks)
119 self.cache_config.num_gpu_blocks = num_gpu_blocks
120 self.cache_config.num_cpu_blocks = num_cpu_blocks
--> 122 self.collective_rpc("initialize_cache",
123 args=(num_gpu_blocks, num_cpu_blocks))

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py:56, in UniProcExecutor.collective_rpc(self, method, timeout, args, kwargs)
54 if kwargs is None:
55 kwargs = {}
---> 56 answer = run_method(self.driver_worker, method, args, kwargs)
57 return [answer]

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/utils.py:2196, in run_method(obj, method, args, kwargs)
2194 else:
2195 func = partial(method, obj) # type: ignore
-> 2196 return func(*args, **kwargs)

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/worker/worker.py:291, in Worker.initialize_cache(self, num_gpu_blocks, num_cpu_blocks)
285 def initialize_cache(self, num_gpu_blocks: int,
286 num_cpu_blocks: int) -> None:
287 """Allocate GPU and CPU KV cache with the specified number of blocks.
288
289 This also warms up the model, which may record CUDA graphs.
290 """
--> 291 raise_if_cache_size_invalid(num_gpu_blocks,
292 self.cache_config.block_size,
293 self.cache_config.is_attention_free,
294 self.model_config.max_model_len)
296 self.cache_config.num_gpu_blocks = num_gpu_blocks
297 self.cache_config.num_cpu_blocks = num_cpu_blocks

File ~/anaconda3/envs/orpheus/lib/python3.11/site-packages/vllm/worker/worker.py:544, in raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, max_model_len)
542 max_seq_len = block_size * num_gpu_blocks
543 if not is_attention_free and max_model_len > max_seq_len:
--> 544 raise ValueError(
545 f"The model's max seq len ({max_model_len}) "
546 "is larger than the maximum number of tokens that can be "
547 f"stored in KV cache ({max_seq_len}). Try increasing "
548 "gpu_memory_utilization or decreasing max_model_len when "
549 "initializing the engine.")

ValueError: The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache (130304). Try increasing gpu_memory_utilization or decreasing max_model_len when initializing the engine.
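
The error comes from vLLM's KV-cache sizing check (raise_if_cache_size_invalid in the traceback above): with the default block size of 16 tokens, the profiled GPU memory only yields 8144 cache blocks (130304 tokens), just short of the 8192 blocks (131072 tokens) that the model's default max_model_len would require. As the message says, the two knobs are gpu_memory_utilization and max_model_len. Since OrpheusModel only exposes model_name and dtype, a minimal sketch of one possible workaround is to override _setup_engine in a subclass; this assumes the orpheus_tts version shown in the traceback, and the class name PatchedOrpheusModel plus the values 8192 and 0.95 are illustrative, not part of the library:

from orpheus_tts import OrpheusModel
from vllm import AsyncEngineArgs, AsyncLLMEngine

class PatchedOrpheusModel(OrpheusModel):  # hypothetical subclass, not part of orpheus_tts
    def _setup_engine(self):
        # Same engine setup as the library, but with the two knobs the
        # error message mentions exposed explicitly.
        engine_args = AsyncEngineArgs(
            model=self.model_name,
            dtype=self.dtype,
            max_model_len=8192,           # a TTS prompt never needs 131072 tokens
            gpu_memory_utilization=0.95,  # or raise this above the 0.9 default instead
        )
        return AsyncLLMEngine.from_engine_args(engine_args)

model = PatchedOrpheusModel(model_name="canopylabs/orpheus-tts-0.1-finetune-prod")

Of the two, lowering max_model_len is usually the safer choice, since pushing gpu_memory_utilization higher leaves less headroom for activations and CUDA graph capture.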

Did you find any solution for this?
