sirekist98 committed on
Commit
7a19353
·
verified ·
1 Parent(s): 2b2b1c5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +88 -23
README.md CHANGED
@@ -57,35 +57,100 @@ You can run inference using the demo space: [Orpheus TTS Spanish Fine-Tuned](htt
57
  To run inference locally with full control:
58
 
59
  ```python
 
60
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
61
  from snac import SNAC
62
 
63
- base_model = AutoModelForCausalLM.from_pretrained("canopylabs/3b-es_it-pretrain-research_release", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
64
- tokenizer = AutoTokenizer.from_pretrained("canopylabs/3b-es_it-pretrain-research_release")
65
- snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
66
-
67
- prompt = "alloy (intense_fear_dread_apprehension_and_horror): Estoy atrapado, por favor ayúdame."
68
- input_ids = tokenizer(prompt, return_tensors="pt").input_ids
69
- output = model.generate(input_ids)
70
- # Postprocess generated tokens (simplified)
71
- audio_tokens = output[0].tolist()
72
- # Trim to multiple of 7, subtract offset, and decode
73
- trimmed = [t - 128266 for t in audio_tokens if t >= 128266]
74
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  layer_1, layer_2, layer_3 = [], [], []
76
- for i in range(len(trimmed) // 7):
77
- layer_1.append(trimmed[7*i])
78
- layer_2.append(trimmed[7*i+1])
79
- layer_3.extend(trimmed[7*i+2:7*i+4])
80
- layer_2.append(trimmed[7*i+4])
81
- layer_3.extend(trimmed[7*i+5:7*i+7])
82
-
 
 
 
 
 
 
83
  layers = [
84
- torch.tensor(layer_1).unsqueeze(0).to(snac_model.device),
85
- torch.tensor(layer_2).unsqueeze(0).to(snac_model.device),
86
- torch.tensor(layer_3).unsqueeze(0).to(snac_model.device),
87
  ]
88
- audio = snac_model.decode(layers).squeeze().cpu().numpy()
 
 
 
 
 
 
 
89
  ```
90
 
91
  ---
 
57
  To run inference locally with full control:
58
 
59
  ```python
60
+ import torch
61
  from transformers import AutoTokenizer, AutoModelForCausalLM
62
+ from peft import PeftModel
63
  from snac import SNAC
64
 
65
+ # --- Minimal config ---
66
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
67
+ BASE = "canopylabs/3b-es_it-pretrain-research_release"
68
+ LORA = "sirekist98/orpheustts_spanish_finetuned"
69
+ SNAC_ID = "hubertsiuzdak/snac_24khz"
70
+
71
+ VOICE = "alloy"
72
+ EMOTION_ID = "intense_fear_dread_apprehension_horror_terror_panic"
73
+ TEXT = "Estoy atrapado, por favor ayúdame."
74
+ prompt = f"{VOICE} ({EMOTION_ID}): {TEXT}"
75
+
76
+ # --- Load models ---
77
+ tokenizer = AutoTokenizer.from_pretrained(BASE)
78
+ base_model = AutoModelForCausalLM.from_pretrained(
79
+ BASE,
80
+ torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
81
+ )
82
+ model = PeftModel.from_pretrained(base_model, LORA).to(device).eval()
83
+ snac_model = SNAC.from_pretrained(SNAC_ID).to(device)
84
+
85
+ # --- Prepare input (same format as the demo Space) ---
86
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
87
+ start_tok = torch.tensor([[128259]], dtype=torch.long).to(device)
88
+ end_toks = torch.tensor([[128009, 128260]], dtype=torch.long).to(device)
89
+
90
+ input_ids = torch.cat([start_tok, input_ids, end_toks], dim=1)
91
+ MAX_LEN = 4260
92
+ pad_len = MAX_LEN - input_ids.shape[1]
93
+ pad = torch.full((1, pad_len), 128263, dtype=torch.long).to(device)
94
+ input_ids = torch.cat([pad, input_ids], dim=1)
95
+ attention_mask = torch.cat(
96
+ [torch.zeros((1, pad_len), dtype=torch.long),
97
+ torch.ones((1, input_ids.shape[1] - pad_len), dtype=torch.long)],
98
+ dim=1
99
+ ).to(device)
100
+
101
+ # --- Generate ---
102
+ generated = model.generate(
103
+ input_ids=input_ids,
104
+ attention_mask=attention_mask,
105
+ max_new_tokens=1200,
106
+ do_sample=True,
107
+ temperature=0.6,
108
+ top_p=0.95,
109
+ repetition_penalty=1.1,
110
+ num_return_sequences=1,
111
+ eos_token_id=128258,
112
+ use_cache=True
113
+ )
114
+
115
+ # --- Post-process (find 128257, remove 128258, multiple of 7, subtract 128266) ---
116
+ AUDIO_TOKEN_OFFSET = 128266
117
+ token_to_find = 128257
118
+ token_to_remove = 128258
119
+
120
+ idxs = (generated == token_to_find).nonzero(as_tuple=True)
121
+ cropped = generated[:, idxs[1][-1].item() + 1:] if len(idxs[1]) > 0 else generated
122
+ cleaned = cropped[cropped != token_to_remove]
123
+ codes = cleaned[: (len(cleaned) // 7) * 7].tolist()
124
+ codes = [int(t) - AUDIO_TOKEN_OFFSET for t in codes]
125
+
126
+ # --- SNAC decode (same layout as the demo Space) ---
127
  layer_1, layer_2, layer_3 = [], [], []
128
+ for i in range((len(codes) + 1) // 7):
129
+ b = 7 * i
130
+ if b + 6 >= len(codes):
131
+ break
132
+ layer_1.append(codes[b + 0])
133
+ layer_2.append(codes[b + 1] - 4096)
134
+ layer_3.append(codes[b + 2] - 2 * 4096)
135
+ layer_3.append(codes[b + 3] - 3 * 4096)
136
+ layer_2.append(codes[b + 4] - 4 * 4096)
137
+ layer_3.append(codes[b + 5] - 5 * 4096)
138
+ layer_3.append(codes[b + 6] - 6 * 4096)
139
+
140
+ dev_snac = snac_model.quantizer.quantizers[0].codebook.weight.device
141
  layers = [
142
+ torch.tensor(layer_1).unsqueeze(0).to(dev_snac),
143
+ torch.tensor(layer_2).unsqueeze(0).to(dev_snac),
144
+ torch.tensor(layer_3).unsqueeze(0).to(dev_snac),
145
  ]
146
+
147
+ with torch.no_grad():
148
+ audio = snac_model.decode(layers).squeeze().cpu().numpy()
149
+
150
+ # 'audio' is the 24kHz waveform.
151
+ # Optional:
152
+ # from scipy.io.wavfile import write as write_wav
153
+ # write_wav("output.wav", 24000, audio)
154
  ```
155
 
156
  ---