oliveiracwb committed
Commit 95bb656 · 1 Parent(s): 050fc7d

Upload 3 files
Files changed (3)
  1. LICENSE +21 -0
  2. ckpt.pt +3 -0
  3. model.py +324 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Andrej Karpathy
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
ckpt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09248381f062fe8bcb33d62800e2ef2210ed52a87c52d8f4d27a8a24df5386ee
+ size 96881431
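The checkpoint is stored through Git LFS and pairs with the GPT class defined in model.py below. A minimal loading sketch, assuming the file follows the usual nanoGPT training-script layout (a dict with 'model_args' and 'model' keys); that layout is an assumption, not verified against this file:

import torch
from model import GPT, GPTConfig

ckpt = torch.load('ckpt.pt', map_location='cpu')
config = GPTConfig(**ckpt['model_args'])   # 'model_args' key is assumed
model = GPT(config)
state_dict = ckpt['model']                 # 'model' key is assumed
# checkpoints saved from a torch.compile()'d model may carry this prefix on parameter names
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
model.eval()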
model.py ADDED
@@ -0,0 +1,324 @@
+ """
+ Full definition of a GPT Language Model, all of it in this single file.
+ References:
+ 1) the official GPT-2 TensorFlow implementation released by OpenAI:
+ https://github.com/openai/gpt-2/blob/master/src/model.py
+ 2) huggingface/transformers PyTorch implementation:
+ https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+ """
+
+ import math
+ from dataclasses import dataclass
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ # @torch.jit.script # good to enable when not using torch.compile, disable when using (our default)
+ def new_gelu(x):
+     """
+     Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
+     Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+     """
+     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
+
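new_gelu above is the tanh approximation of GELU. PyTorch 1.12+ exposes the same approximation directly, so a quick equivalence check is possible (a sketch; the tolerance is illustrative):

import torch
from torch.nn import functional as F
from model import new_gelu

x = torch.randn(4, 8)
# F.gelu with approximate='tanh' computes the same tanh-based approximation
assert torch.allclose(new_gelu(x), F.gelu(x, approximate='tanh'), atol=1e-6)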
+ class LayerNorm(nn.Module):
+     """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+
+     def __init__(self, ndim, bias):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(ndim))
+         self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+     def forward(self, input):
+         return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
+ class CausalSelfAttention(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         # key, query, value projections for all heads, but in a batch
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         # output projection
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         # regularization
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+         # causal mask to ensure that attention is only applied to the left in the input sequence
+         self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                                     .view(1, 1, config.block_size, config.block_size))
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+
+     def forward(self, x):
+         B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+         # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+         att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+         att = F.softmax(att, dim=-1)
+         att = self.attn_dropout(att)
+         y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+         y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+         # output projection
+         y = self.resid_dropout(self.c_proj(y))
+         return y
+
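The forward pass above spells out scaled dot-product attention with a causal mask: scale by 1/sqrt(head_size), mask out future positions, softmax, dropout, then a weighted sum over the values. With dropout disabled, this is the same computation as PyTorch 2.0's fused kernel; a small equivalence sketch (the fused call is an alternative, not what the code above uses):

import math
import torch
from torch.nn import functional as F

B, nh, T, hs = 2, 4, 8, 16
q, k, v = (torch.randn(B, nh, T, hs) for _ in range(3))

# manual path, as in CausalSelfAttention.forward (dropout omitted)
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T)
att = att.masked_fill(mask == 0, float('-inf'))
y_manual = F.softmax(att, dim=-1) @ v

# fused path (requires PyTorch >= 2.0)
y_fused = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(y_manual, y_fused, atol=1e-5)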
+ class MLP(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = new_gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+ class Block(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50257
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+     dropout: float = 0.0
+     bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+
+ class GPT(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             drop = nn.Dropout(config.dropout),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = LayerNorm(config.n_embd, bias=config.bias),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         # with weight tying when using torch.compile() some warnings get generated:
+         # "UserWarning: functional_call was passed multiple values for tied weights.
+         # This behavior is deprecated and will be an error in future versions"
+         # not 100% sure what this is, so far seems to be harmless. TODO investigate
+         self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
+
+         # init all weights
+         self.apply(self._init_weights)
+         # apply special scaled init to the residual projections, per GPT-2 paper
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+
+         # report number of parameters
+         n_params = sum(p.numel() for p in self.parameters())
+         print("number of parameters: %.2fM" % (n_params/1e6,))
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+         elif isinstance(module, (LayerNorm, nn.LayerNorm)):
+             torch.nn.init.ones_(module.weight)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+
+     def forward(self, idx, targets=None):
+         device = idx.device
+         b, t = idx.size()
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+
+         # forward the GPT model itself
+         tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+         pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+
+         if targets is not None:
+             # if we are given some desired targets also calculate the loss
+             logits = self.lm_head(x)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+         else:
+             # inference-time mini-optimization: only forward the lm_head on the very last position
+             logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+             loss = None
+
+         return logits, loss
+
+     def crop_block_size(self, block_size):
+         # model surgery to decrease the block size if necessary
+         # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
+         # but want to use a smaller block size for some smaller, simpler model
+         assert block_size <= self.config.block_size
+         self.config.block_size = block_size
+         self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
+         for block in self.transformer.h:
+             block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
+
+     @classmethod
+     def from_pretrained(cls, model_type, override_args=None):
+         assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+         override_args = override_args or {} # default to empty dict
+         # only dropout can be overridden see more notes below
+         assert all(k == 'dropout' for k in override_args)
+         from transformers import GPT2LMHeadModel
+         print("loading weights from pretrained gpt: %s" % model_type)
+
+         # n_layer, n_head and n_embd are determined from model_type
+         config_args = {
+             'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+             'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+             'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+             'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+         }[model_type]
+         # we can override the dropout rate
+         if 'dropout' in override_args:
+             config_args['dropout'] = override_args['dropout']
+         # block_size is always 1024 for GPT model checkpoints
+         # if one wants a lower block_size it has to be done through model surgery
+         # later, by calling crop_block_size()
+
+         # create a from-scratch initialized minGPT model
+         config = GPTConfig(block_size=1024, bias=True, **config_args) # note: force bias=True, as in gpt2 models
+         model = GPT(config)
+         sd = model.state_dict()
+
+         # init a huggingface/transformers model
+         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+         sd_hf = model_hf.state_dict()
+
+         # copy while ensuring all of the parameters are aligned and match in names and shapes
+         keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
+         transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+         # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+         # this means that we have to transpose these weights when we import them
+         assert len(keys) == len(sd)
+         for k in keys:
+             if any(k.endswith(w) for w in transposed):
+                 # special treatment for the Conv1D weights we need to transpose
+                 assert sd_hf[k].shape[::-1] == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k].t())
+             else:
+                 # vanilla copy over the other parameters
+                 assert sd_hf[k].shape == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k])
+
+         return model
+
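A short usage sketch for from_pretrained; it needs the transformers package installed, and dropout is the only override the method accepts:

from model import GPT

# 'gpt2' is the smallest of the four supported checkpoints (124M parameters)
model = GPT.from_pretrained('gpt2', override_args=dict(dropout=0.0))
model.eval()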
+     def configure_optimizers(self, weight_decay, learning_rate, betas):
+         """
+         This long function is unfortunately doing something very simple and is being very defensive:
+         We are separating out all parameters of the model into two buckets: those that will experience
+         weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+         We are then returning the PyTorch optimizer object.
+         """
+
+         # separate out all parameters to those that will and won't experience regularizing weight decay
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear, )
+         blacklist_weight_modules = (torch.nn.LayerNorm, LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                 # random note: because named_modules and named_parameters are recursive
+                 # we will see the same tensors p many many times. but doing it this way
+                 # allows us to know which parent module any tensor p belongs to...
+                 if pn.endswith('bias'):
+                     # all biases will not be decayed
+                     no_decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                     # weights of whitelist modules will be weight decayed
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     # weights of blacklist modules will NOT be weight decayed
+                     no_decay.add(fpn)
+
+         # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
+         # will appear in the no_decay and decay sets respectively after the above.
+         # In addition, because named_parameters() doesn't return duplicates, it
+         # will only return the first occurrence, keyed by 'transformer.wte.weight', below.
+         # so let's manually remove 'lm_head.weight' from decay set. This will include
+         # this tensor into optimization via transformer.wte.weight only, and not decayed.
+         decay.remove('lm_head.weight')
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+         assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+             % (str(param_dict.keys() - union_params), )
+
+         # create the pytorch optimizer object
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+         ]
+         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
+         return optimizer
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+         """
+         Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+         the sequence max_new_tokens times, feeding the predictions back into the model each time.
+         Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+         """
+         for _ in range(max_new_tokens):
+             # if the sequence context is growing too long we must crop it at block_size
+             idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+             # forward the model to get the logits for the index in the sequence
+             logits, _ = self(idx_cond)
+             # pluck the logits at the final step and scale by desired temperature
+             logits = logits[:, -1, :] / temperature
+             # optionally crop the logits to only the top k options
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float('Inf')
+             # apply softmax to convert logits to (normalized) probabilities
+             probs = F.softmax(logits, dim=-1)
+             # sample from the distribution
+             idx_next = torch.multinomial(probs, num_samples=1)
+             # append sampled index to the running sequence and continue
+             idx = torch.cat((idx, idx_next), dim=1)
+
+         return idx
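Putting the pieces together, a minimal sampling sketch. The prompt below is a placeholder tensor of token ids; the tokenizer that matches ckpt.pt is not specified in this commit, so decoding is left out:

import torch
from model import GPT

model = GPT.from_pretrained('gpt2')   # or a model restored from ckpt.pt as sketched earlier
model.eval()

idx = torch.zeros((1, 1), dtype=torch.long)   # placeholder start token id
out = model.generate(idx, max_new_tokens=50, temperature=0.8, top_k=200)
print(out[0].tolist())   # raw token ids; decode with the matching tokenizer

# for training, the decay/no-decay optimizer comes from configure_optimizers, e.g.
# optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, betas=(0.9, 0.95))
# (illustrative hyperparameters, not taken from this repository)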