Update README.md
README.md
CHANGED
This model is the LLaDA-8B-Base model quantized to 4-bit with [GPTQModel](https://github.com/ModelCloud/GPTQModel).

## Example:
```python
# Copyright 2024-2025 ModelCloud.ai
# Copyright 2024-2025 [email protected]
# Contact: [email protected], x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.models.base import BaseGPTQModel
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from gptqmodel.models.auto import MODEL_MAP
import torch.nn.functional as F
import numpy as np

pretrained_model_id = '/home/chentianqi/model/GSAI-ML/LLaDA-8B-Instruct'  # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "FunAGI/LLaDA-8B-Instruct-gptqmodel-4bit"


class LladaGPTQ(BaseGPTQModel):
    # Non-repeating layers at the root level: same level as `layers_node`,
    # excluding `layers_node` itself.
    base_modules = ["model.transformer.wte", "model.transformer.ln_f"]
    pre_lm_head_norm_module = "model.transformer.ln_f"
    lm_head = "model.transformer.ff_out"

    # Below describes all the repeating layers in this transformer model.
    # `model.transformer.blocks` is the node/module that holds all the repeating layers:
    # the parent node for all n layers.
    layers_node = "model.transformer.blocks"
    # Each repeating layer in `model.transformer.blocks` is of type `LLaDALlamaBlock`.
    layer_type = "LLaDALlamaBlock"
    # Inside each `LLaDALlamaBlock` are many internal modules.
    # List them in the order they are executed in the model's forward() code:
    # attention (q_k_v) projection, attention (output) projection, mlp (n) projections.
    layer_modules = [
        ["attn_out", "k_proj", "v_proj", "q_proj"],
        ["ff_proj", "up_proj"],
        ["ff_out"],
    ]


# Register the LLaDA definition so GPTQModel can recognize this architecture.
MODEL_MAP["llada"] = LladaGPTQ


# os.makedirs(quantized_model_dir, exist_ok=True)
def get_wikitext2(tokenizer, nsamples, seqlen):
    traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train").filter(
        lambda x: len(x["text"]) >= seqlen)

    return [tokenizer(example["text"]) for example in traindata.select(range(nsamples))]


@torch.no_grad()
def calculate_avg_ppl(model, tokenizer):
    from gptqmodel.utils import Perplexity

    ppl = Perplexity(
        model=model,
        tokenizer=tokenizer,
        dataset_path="wikitext",
        dataset_name="wikitext-2-raw-v1",
        split="train",
        text_column="text",
    )

    all_ppl = ppl.calculate(n_ctx=512, n_batch=512)

    # average ppl
    avg = sum(all_ppl) / len(all_ppl)

    return avg


# No per-module overrides for the quantization config.
dynamic = {}


def add_gumbel_noise(logits, temperature):
    '''
    The Gumbel max is a method for sampling categorical distributions.
    ... (rest of the docstring and body unchanged in this commit; omitted from the diff)
    '''


def get_num_transfer_tokens(mask_index, steps):
    # ... unchanged in this commit; omitted from the diff ...
    return num_transfer_tokens


def forward_process(batch, prompt_index, mask_id):
    b, l = batch.shape

    target_len = (l - prompt_index.sum()).item()
    k = torch.randint(1, target_len + 1, (), device=batch.device)

    x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long()
    x = ((x - 1) % target_len) + 1
    assert x.min() >= 1 and x.max() <= target_len

    indices = torch.arange(target_len, device=batch.device).repeat(b, 1)
    is_mask = indices < x.unsqueeze(1)
    for i in range(b):
        is_mask[i] = is_mask[i][torch.randperm(target_len)]

    is_mask = torch.cat((torch.zeros(b, prompt_index.sum(), dtype=torch.bool, device=batch.device), is_mask), dim=1)
    noisy_batch = torch.where(is_mask, mask_id, batch)

    # Return the masked batch and the mask ratio
    return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l)


def get_logits(model, batch, prompt_index, cfg_scale, mask_id):
    if cfg_scale > 0.:
        assert len(prompt_index) == batch.shape[1]
        prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
        un_batch = batch.clone()
        un_batch[prompt_index] = mask_id
        batch = torch.cat([batch, un_batch])

    input = batch
    logits = model(input).logits

    if cfg_scale > 0.:
        logits, un_logits = torch.chunk(logits, 2, dim=0)
        logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
    return logits


@torch.no_grad()
def get_log_likelihood(model, prompt, answer, mc_num=128, batch_size=32, cfg_scale=0., mask_id=126336):
    '''
    Args:
        model: Mask predictor.
        prompt: A tensor of shape (l1).
        answer: A tensor of shape (l2).
        mc_num: Monte Carlo estimation times.
                As detailed in Appendix B.5. Since MMLU, CMMLU, and C-EVAL only require the likelihood of a single token, a
                single Monte Carlo estimate is sufficient for these benchmarks. For all other benchmarks, we find that 128
                Monte Carlo samples are adequate to produce stable results.
        batch_size: Mini batch size.
        cfg_scale: Unsupervised classifier-free guidance scale.
        mask_id: The token id of [MASK] is 126336.
    '''
    seq = torch.concatenate([prompt, answer])[None, :]
    seq = seq.repeat((batch_size, 1)).to(model.device)
    prompt_index = torch.arange(seq.shape[1], device=model.device) < len(prompt)

    loss_ = []
    for _ in range(mc_num // batch_size):
        perturbed_seq, p_mask = forward_process(seq, prompt_index, mask_id)
        mask_index = perturbed_seq == mask_id

        logits = get_logits(model, perturbed_seq, prompt_index, cfg_scale, mask_id)

        loss = F.cross_entropy(logits[mask_index], seq[mask_index], reduction='none') / p_mask[mask_index]
        loss = loss.sum() / batch_size

        loss_.append(loss.item())

    return -sum(loss_) / len(loss_)


def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0., cfg_scale=0., remasking='low_confidence'):
    # ... unchanged in this commit; omitted from the diff ...
    return x


def main():
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=False)

    traindataset = get_wikitext2(tokenizer, nsamples=128, seqlen=1024)

    quantize_config = QuantizeConfig(
        dynamic=dynamic,
        bits=4,          # quantize the model to 4-bit
        group_size=128,  # it is recommended to set this value to 128
        desc_act=True,
        sym=False,
    )
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    prompt = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"

    # Add special tokens for the Instruct model. The Base model does not require the following two lines.
    m = [{"role": "user", "content": prompt}, ]
    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)

    input_ids = tokenizer(prompt)['input_ids']
    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)

    model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)

    steps = 128
    out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
    print("*" * 30 + f"GPTQ-4bit Steps {steps}" + "*" * 30)
    print(input_ids.shape)
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
    del model

    model = AutoModel.from_pretrained(pretrained_model_id, trust_remote_code=True).cuda()

    out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
    print("*" * 30 + f"FP16 Steps {steps}" + "*" * 30)
    print(input_ids.shape)
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])

    # ... unchanged in this commit; remainder of main() omitted from the diff ...


if __name__ == "__main__":
    main()
```
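
The helper `calculate_avg_ppl` above is defined but never called in the portion of `main()` shown in this diff. A minimal sketch of how it could be wired up, reusing the ids, the `LladaGPTQ` registration, and the imports from the example above (the call itself is illustrative and not part of the original script):

```python
# Illustrative only: relies on names defined in the example above
# (pretrained_model_id, quantized_model_id, calculate_avg_ppl, MODEL_MAP["llada"]).
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=False)
model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)

# Wikitext-2 perplexity of the 4-bit checkpoint via gptqmodel's Perplexity utility.
print(f"{quantized_model_id} avg PPL: {calculate_avg_ppl(model, tokenizer)}")
```

`get_log_likelihood` is likewise unused in the shown `main()`; it expects 1-D token-id tensors for `prompt` and `answer` and can be exercised the same way.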