Model card code for local inference doesn't work
Hi, I'm trying to run this locally.
There are some obvious bugs in the code given in the model card: import torch is missing, model is never defined, and part of the model ends up on the GPU while part stays on the CPU.
After fixing them, I end up with:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from snac import SNAC
device = torch.device('cuda')
base_model = AutoModelForCausalLM.from_pretrained("canopylabs/3b-es_it-pretrain-research_release", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("canopylabs/3b-es_it-pretrain-research_release")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
prompt = "alloy (intense_fear_dread_apprehension_and_horror): Estoy atrapado, por favor ayúdame."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
output = base_model.generate(input_ids)
# Postprocess generated tokens (simplified):
# trim to a multiple of 7, subtract the offset, and decode
audio_tokens = output[0].tolist()
trimmed = [t - 128266 for t in audio_tokens if t >= 128266]

layer_1, layer_2, layer_3 = [], [], []
for i in range(len(trimmed) // 7):
    layer_1.append(trimmed[7*i])
    layer_2.append(trimmed[7*i+1])
    layer_3.extend(trimmed[7*i+2:7*i+4])
    layer_2.append(trimmed[7*i+4])
    layer_3.extend(trimmed[7*i+5:7*i+7])

layers = [
    torch.tensor(layer_1).unsqueeze(0).to(device),
    torch.tensor(layer_2).unsqueeze(0).to(device),
    torch.tensor(layer_3).unsqueeze(0).to(device),
]
audio = snac_model.decode(layers).squeeze().cpu().numpy()
which now gives the error:
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1500: indexSelectSmallIndex: block: [0,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
[the same assertion repeats for threads [1,0,0] through [7,0,0]]
Traceback (most recent call last):
File "/home/da/git/w2l/./tts.py", line 130, in <module>
audio = snac_model.decode(layers).squeeze().cpu().numpy()
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/snac/snac.py", line 89, in decode
z_q = self.quantizer.from_codes(codes)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/snac/vq.py", line 95, in from_codes
z_q_i = self.quantizers[i].out_proj(z_p_i)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 375, in forward
return self._conv_forward(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/da/py312/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 370, in _conv_forward
return F.conv1d(
^^^^^^^^^
RuntimeError: GET was unable to find an engine to execute this computation
Hello!
Thank you for your comment. You're right, the example was incorrect. I've corrected it below: the two key fixes are wrapping the prompt in the special start/end tokens the model expects, and subtracting a per-position offset (n * 4096 for the n-th code within each 7-token frame) from the audio codes before SNAC decoding. If you copy this code, it will run and generate the audio.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC
# --- Minimal config ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BASE = "canopylabs/3b-es_it-pretrain-research_release"
LORA = "sirekist98/orpheustts_spanish_finetuned"
SNAC_ID = "hubertsiuzdak/snac_24khz"
VOICE = "alloy"
EMOTION_ID = "intense_fear_dread_apprehension_horror_terror_panic"
TEXT = "Estoy atrapado, por favor ayúdame."
prompt = f"{VOICE} ({EMOTION_ID}): {TEXT}"
# --- Load models ---
tokenizer = AutoTokenizer.from_pretrained(BASE)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
)
model = PeftModel.from_pretrained(base_model, LORA).to(device).eval()
snac_model = SNAC.from_pretrained(SNAC_ID).to(device)
# --- Prepare input ---
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
start_tok = torch.tensor([[128259]], dtype=torch.long).to(device)
end_toks = torch.tensor([[128009, 128260]], dtype=torch.long).to(device)
input_ids = torch.cat([start_tok, input_ids, end_toks], dim=1)
MAX_LEN = 4260
pad_len = MAX_LEN - input_ids.shape[1]
pad = torch.full((1, pad_len), 128263, dtype=torch.long).to(device)
input_ids = torch.cat([pad, input_ids], dim=1)
attention_mask = torch.cat(
    [torch.zeros((1, pad_len), dtype=torch.long),
     torch.ones((1, input_ids.shape[1] - pad_len), dtype=torch.long)],
    dim=1
).to(device)
# --- Generate ---
generated = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=1200,
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.1,
    num_return_sequences=1,
    eos_token_id=128258,
    use_cache=True
)
# --- Post-process ---
AUDIO_TOKEN_OFFSET = 128266
token_to_find = 128257
token_to_remove = 128258
idxs = (generated == token_to_find).nonzero(as_tuple=True)
cropped = generated[:, idxs[1][-1].item() + 1:] if len(idxs[1]) > 0 else generated
cleaned = cropped[cropped != token_to_remove]
codes = cleaned[: (len(cleaned) // 7) * 7].tolist()
codes = [int(t) - AUDIO_TOKEN_OFFSET for t in codes]
# --- SNAC decode ---
layer_1, layer_2, layer_3 = [], [], []
for i in range((len(codes) + 1) // 7):
    b = 7 * i
    if b + 6 >= len(codes):
        break
    layer_1.append(codes[b + 0])
    layer_2.append(codes[b + 1] - 4096)
    layer_3.append(codes[b + 2] - 2 * 4096)
    layer_3.append(codes[b + 3] - 3 * 4096)
    layer_2.append(codes[b + 4] - 4 * 4096)
    layer_3.append(codes[b + 5] - 5 * 4096)
    layer_3.append(codes[b + 6] - 6 * 4096)
dev_snac = snac_model.quantizer.quantizers[0].codebook.weight.device
layers = [
    torch.tensor(layer_1).unsqueeze(0).to(dev_snac),
    torch.tensor(layer_2).unsqueeze(0).to(dev_snac),
    torch.tensor(layer_3).unsqueeze(0).to(dev_snac),
]
with torch.no_grad():
    audio = snac_model.decode(layers).squeeze().cpu().numpy()
# 'audio' is the 24kHz waveform.
# Optional:
# from scipy.io.wavfile import write as write_wav
# write_wav("output.wav", 24000, audio)
Thank you, it's working now.
The output audio seems to be consistently truncated at around 14.5 seconds.
Is there any way to make it longer?
Hello again,
Yes, you can adjust the maximum number of tokens in the output by changing this parameter:
max_new_tokens = 1200
The higher the value, the longer the audio the model can generate (and the longer generation will take).
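As a rough rule of thumb (back-of-the-envelope, assuming the snac_24khz coarse layer runs at about 12 frames per second and each frame is encoded as 7 tokens, as in the decode loop above): 1200 tokens is about 171 frames, or roughly 14.5 s of audio, which matches the truncation you observed. Inverting that gives a starting token budget for a target duration:

# Rough token budget for a target clip length (sketch; assumes ~12
# coarse frames per second and 7 tokens per frame for snac_24khz).
TOKENS_PER_FRAME = 7
FRAMES_PER_SECOND = 12  # approximate coarse-layer rate

def tokens_for(duration_s: float) -> int:
    return int(duration_s * FRAMES_PER_SECOND * TOKENS_PER_FRAME)

print(tokens_for(14.5))  # ~1218, close to the default of 1200
print(tokens_for(30.0))  # ~2520 for a 30-second clip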
That worked too, thanks.
The original Orpheus model (the one you call with orpheus_tts.OrpheusModel()) also seems to truncate output at around 14.5 seconds.
Any idea whether increasing max_new_tokens is likely to introduce artifacts in the generated audio, and how high you can set it before it does?
I don't think Orpheus has a hard-coded maximum duration in the model itself, but in practice it often stops around ~14–15 s because of two factors:
- The default max_new_tokens value during inference.
- The model emitting its audio eos_token near the typical clip length it was trained on.
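If you want to tell which of the two you're hitting, check whether the generated sequence actually ends with the audio EOS token (a small sketch using the eos_token_id=128258 passed to generate above):

# Did generation stop on the audio EOS token or on the token budget?
# (sketch; 128258 is the eos_token_id from the generate call above)
if generated[0, -1].item() == 128258:
    print("Stopped at audio EOS; raising max_new_tokens won't lengthen this clip.")
else:
    print("Hit the max_new_tokens budget; raising it should yield longer audio.")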