codebyzeb committed
Commit 2aa2526 · verified · 1 Parent(s): 0718b56

Delete llm/fw57M-multi-tied
llm/fw57M-multi-tied/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
llm/fw57M-multi-tied/README.md DELETED
@@ -1,92 +0,0 @@
- ---
- {}
- ---
- ## Experiment Configuration
- ```yaml
- callbacks:
-   grad_accum:
-     _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
-     scheduling:
-       0: 2
-   grad_norm:
-     _target_: src.callbacks.grad_norm.GradNorm
-     check_clipping: false
-     group_separator: /
-     histogram_freq: null
-     log_weight_distribution: false
-     norm_type: 2
-     only_total: true
-   lr_monitor:
-     _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
-   model_checkpoint:
-     _target_: src.callbacks.model_checkpoint.ModelCheckpoint
-     dirpath: .checkpoints
-     enable_version_counter: false
-     every_n_train_steps: 1000
-     filename: '{step}'
-     save_initial_checkpoint: true
-     save_last: link
-     save_top_k: -1
-     verbose: true
-   speed_monitor:
-     _target_: src.callbacks.speed_monitor.SpeedMonitor
- data:
-   batch_size: 64
-   drop_last: false
-   eval_batch_size: 64
-   multiprocessing_context: null
-   num_workers: 12
-   persistent_workers: false
-   pin_memory: true
-   prefetch_factor: 2
-   shuffle: true
- dataset: common-corpus
- evaluation:
-   blimp: false
- loggers:
-   tensorboard:
-     _target_: src.trainer.TensorBoardLogger
-     name: ''
-     save_dir: ./
-     version: null
- model: fw57M-tied
- optim:
-   lr: 0.0006
-   num_warmup_steps: 2000
-   optim_kwargs:
-     betas:
-     - 0.9
-     - 0.95
-     eps: 1.0e-08
-     fused: true
-   optim_name: adamw
-   scheduler_kwargs:
-     min_lr_ratio: 0.01
-     num_decay_steps: 4000
-     num_stable_steps: 44000
-   scheduler_name: warmup_stable_decay
-   weight_decay: 0.01
- out_parent_folder: model_train
- pwd: /home/zg258/rds/hpc-work/infotokenization
- resume_from_checkpoint: .checkpoints/last.ckpt
- run_folder: .
- save_initial_checkpoint: true
- seed: 42
- tok_name: bytelevel
- torch_compile: true
- train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/train
- trainer:
-   accelerator: gpu
-   deterministic: false
-   devices: 1
-   enable_progress_bar: true
-   fast_dev_run: false
-   gradient_clip_algorithm: norm
-   gradient_clip_val: 1.0
-   limit_val_batches: 500
-   log_every_n_steps: 1
-   max_steps: 50000
-   precision: bf16-true
-   val_check_interval: 1000
- val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/validation
- ```
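For readers skimming the deleted config: the optimizer uses a warmup-stable-decay (WSD) schedule whose three phases (2,000 warmup + 44,000 stable + 4,000 decay steps) sum exactly to the 50,000 `max_steps`. Below is a minimal sketch of the schedule these hyperparameters describe, assuming linear warmup and linear decay shapes; the actual `src` scheduler implementation is not part of this commit.

```python
def warmup_stable_decay(
    step: int,
    peak_lr: float = 6e-4,       # optim.lr
    warmup: int = 2_000,         # optim.num_warmup_steps
    stable: int = 44_000,        # scheduler_kwargs.num_stable_steps
    decay: int = 4_000,          # scheduler_kwargs.num_decay_steps
    min_lr_ratio: float = 0.01,  # scheduler_kwargs.min_lr_ratio
) -> float:
    """Learning rate at `step` (assumed shapes: linear warmup, linear decay)."""
    if step < warmup:  # ramp 0 -> peak_lr
        return peak_lr * step / warmup
    if step < warmup + stable:  # hold at peak_lr
        return peak_lr
    # anneal peak_lr -> min_lr_ratio * peak_lr, then stay at the floor
    progress = min(1.0, (step - warmup - stable) / decay)
    return peak_lr * (1.0 - (1.0 - min_lr_ratio) * progress)
```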
llm/fw57M-multi-tied/config.json DELETED
@@ -1,30 +0,0 @@
- {
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": null,
-   "eos_token_id": 1,
-   "head_dim": 32,
-   "hidden_act": "silu",
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "max_position_embeddings": 2048,
-   "mlp_bias": false,
-   "model_type": "llama",
-   "num_attention_heads": 24,
-   "num_hidden_layers": 6,
-   "num_key_value_heads": 24,
-   "pad_token_id": 0,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-05,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": true,
-   "torch_dtype": "bfloat16",
-   "transformers_version": "4.51.3",
-   "use_cache": true,
-   "vocab_size": 258
- }
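A back-of-the-envelope parameter count for this config, assuming the standard Llama layout the file specifies, explains both the "57M" in the repo name and the checkpoint size further down:

```python
# Rough parameter count for the config above (standard Llama layout, tied embeddings).
hidden, layers, inter, vocab = 768, 6, 3072, 258

embed = vocab * hidden      # 198,144; shared with lm_head (tie_word_embeddings: true)
attn = 4 * hidden * hidden  # q/k/v/o projections (24 heads * head_dim 32 = 768, no GQA)
mlp = 3 * hidden * inter    # gate/up/down projections
norms = 2 * hidden          # two RMSNorms per layer
total = embed + layers * (attn + mlp + norms) + hidden  # + final RMSNorm

print(f"{total:,} params")     # 56,831,232 -> "fw57M"
print(f"{2 * total:,} bytes")  # 113,662,464 in bf16; the safetensors file below is
                               # 113,668,624 bytes, the remainder being its header
```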
llm/fw57M-multi-tied/generation_config.json DELETED
@@ -1,6 +0,0 @@
- {
-   "_from_model_config": true,
-   "eos_token_id": 1,
-   "pad_token_id": 0,
-   "transformers_version": "4.51.3"
- }
llm/fw57M-multi-tied/hparams.yaml DELETED
@@ -1,86 +0,0 @@
- loggers:
-   tensorboard:
-     _target_: src.trainer.TensorBoardLogger
-     save_dir: ./
-     name: ''
-     version: null
- callbacks:
-   lr_monitor:
-     _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
-   grad_norm:
-     _target_: src.callbacks.grad_norm.GradNorm
-     norm_type: 2
-     group_separator: /
-     histogram_freq: null
-     check_clipping: false
-     log_weight_distribution: false
-     only_total: true
-   speed_monitor:
-     _target_: src.callbacks.speed_monitor.SpeedMonitor
-   grad_accum:
-     _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
-     scheduling:
-       0: 2
-   model_checkpoint:
-     _target_: src.callbacks.model_checkpoint.ModelCheckpoint
-     dirpath: .checkpoints
-     filename: '{step}'
-     enable_version_counter: false
-     every_n_train_steps: 1000
-     save_top_k: -1
-     save_last: link
-     verbose: true
-     save_initial_checkpoint: true
- out_parent_folder: model_train
- tok_name: bytelevel
- run_folder: .
- dataset: common-corpus
- pwd: /home/zg258/rds/hpc-work/infotokenization
- train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/train
- val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/validation
- model: fw57M-tied
- resume_from_checkpoint: .checkpoints/last.ckpt
- save_initial_checkpoint: true
- seed: 42
- torch_compile: true
- data:
-   batch_size: 64
-   eval_batch_size: 64
-   shuffle: true
-   drop_last: false
-   num_workers: 12
-   pin_memory: true
-   persistent_workers: false
-   prefetch_factor: 2
-   multiprocessing_context: null
- optim:
-   optim_name: adamw
-   lr: 0.0006
-   weight_decay: 0.01
-   optim_kwargs:
-     fused: true
-     eps: 1.0e-08
-     betas:
-     - 0.9
-     - 0.95
-   scheduler_name: warmup_stable_decay
-   num_warmup_steps: 2000
-   scheduler_kwargs:
-     num_stable_steps: 44000
-     num_decay_steps: 4000
-     min_lr_ratio: 0.01
- trainer:
-   accelerator: gpu
-   devices: 1
-   precision: bf16-true
-   deterministic: false
-   log_every_n_steps: 1
-   enable_progress_bar: true
-   fast_dev_run: false
-   gradient_clip_val: 1.0
-   gradient_clip_algorithm: norm
-   val_check_interval: 1000
-   max_steps: 50000
-   limit_val_batches: 500
- evaluation:
-   blimp: false
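Combining the data, gradient-accumulation, and trainer settings above gives the run's approximate token budget, assuming every batch row is packed to the model's 2,048-token context (the packing itself is an assumption; it is not stated in these files):

```python
batch_size = 64     # data.batch_size
grad_accum = 2      # callbacks.grad_accum.scheduling[0]
seq_len = 2_048     # max_position_embeddings from config.json (assumed packed)
max_steps = 50_000  # trainer.max_steps

tokens_per_step = batch_size * grad_accum * seq_len  # 262,144
print(f"{tokens_per_step * max_steps:,} tokens")     # 13,107,200,000 (~13.1B)
```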
llm/fw57M-multi-tied/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8981d6075d05c747af49d372cce107b439671ea9fca62c4cd8e872907bb842ba
- size 113668624
llm/fw57M-multi-tied/special_tokens_map.json DELETED
@@ -1,16 +0,0 @@
- {
-   "eos_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<|padding|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
llm/fw57M-multi-tied/tb_logs.parquet DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b65af7245c45561bc3dab4259ca8db5b67f1834b53f280572c63a85cda230cdf
- size 2083150
llm/fw57M-multi-tied/tokenizer.json DELETED
@@ -1,322 +0,0 @@
- {
-   "version": "1.0",
-   "truncation": null,
-   "padding": null,
-   "added_tokens": [
-     {
-       "id": 0,
-       "content": "<|padding|>",
-       "single_word": false,
-       "lstrip": false,
-       "rstrip": false,
-       "normalized": false,
-       "special": true
-     },
-     {
-       "id": 1,
-       "content": "<|endoftext|>",
-       "single_word": false,
-       "lstrip": false,
-       "rstrip": false,
-       "normalized": false,
-       "special": true
-     }
-   ],
-   "normalizer": {
-     "type": "Sequence",
-     "normalizers": [
-       {
-         "type": "NFD"
-       }
-     ]
-   },
-   "pre_tokenizer": {
-     "type": "ByteLevel",
-     "add_prefix_space": true,
-     "trim_offsets": true,
-     "use_regex": true
-   },
-   "post_processor": {
-     "type": "ByteLevel",
-     "add_prefix_space": true,
-     "trim_offsets": true,
-     "use_regex": true
-   },
-   "decoder": {
-     "type": "ByteLevel",
-     "add_prefix_space": true,
-     "trim_offsets": true,
-     "use_regex": true
-   },
-   "model": {
-     "type": "BPE",
-     "dropout": null,
-     "unk_token": null,
-     "continuing_subword_prefix": null,
-     "end_of_word_suffix": null,
-     "fuse_unk": false,
-     "byte_fallback": false,
-     "ignore_merges": false,
-     "vocab": {
-       "<|padding|>": 0,
-       "<|endoftext|>": 1,
-       "!": 2,
-       "\"": 3,
-       "#": 4,
-       "$": 5,
-       "%": 6,
-       "&": 7,
-       "'": 8,
-       "(": 9,
-       ")": 10,
-       "*": 11,
-       "+": 12,
-       ",": 13,
-       "-": 14,
-       ".": 15,
-       "/": 16,
-       "0": 17,
-       "1": 18,
-       "2": 19,
-       "3": 20,
-       "4": 21,
-       "5": 22,
-       "6": 23,
-       "7": 24,
-       "8": 25,
-       "9": 26,
-       ":": 27,
-       ";": 28,
-       "<": 29,
-       "=": 30,
-       ">": 31,
-       "?": 32,
-       "@": 33,
-       "A": 34,
-       "B": 35,
-       "C": 36,
-       "D": 37,
-       "E": 38,
-       "F": 39,
-       "G": 40,
-       "H": 41,
-       "I": 42,
-       "J": 43,
-       "K": 44,
-       "L": 45,
-       "M": 46,
-       "N": 47,
-       "O": 48,
-       "P": 49,
-       "Q": 50,
-       "R": 51,
-       "S": 52,
-       "T": 53,
-       "U": 54,
-       "V": 55,
-       "W": 56,
-       "X": 57,
-       "Y": 58,
-       "Z": 59,
-       "[": 60,
-       "\\": 61,
-       "]": 62,
-       "^": 63,
-       "_": 64,
-       "`": 65,
-       "a": 66,
-       "b": 67,
-       "c": 68,
-       "d": 69,
-       "e": 70,
-       "f": 71,
-       "g": 72,
-       "h": 73,
-       "i": 74,
-       "j": 75,
-       "k": 76,
-       "l": 77,
-       "m": 78,
-       "n": 79,
-       "o": 80,
-       "p": 81,
-       "q": 82,
-       "r": 83,
-       "s": 84,
-       "t": 85,
-       "u": 86,
-       "v": 87,
-       "w": 88,
-       "x": 89,
-       "y": 90,
-       "z": 91,
-       "{": 92,
-       "|": 93,
-       "}": 94,
-       "~": 95,
-       "¡": 96,
-       "¢": 97,
-       "£": 98,
-       "¤": 99,
-       "¥": 100,
-       "¦": 101,
-       "§": 102,
-       "¨": 103,
-       "©": 104,
-       "ª": 105,
-       "«": 106,
-       "¬": 107,
-       "®": 108,
-       "¯": 109,
-       "°": 110,
-       "±": 111,
-       "²": 112,
-       "³": 113,
-       "´": 114,
-       "µ": 115,
-       "¶": 116,
-       "·": 117,
-       "¸": 118,
-       "¹": 119,
-       "º": 120,
-       "»": 121,
-       "¼": 122,
-       "½": 123,
-       "¾": 124,
-       "¿": 125,
-       "À": 126,
-       "Á": 127,
-       "Â": 128,
-       "Ã": 129,
-       "Ä": 130,
-       "Å": 131,
-       "Æ": 132,
-       "Ç": 133,
-       "È": 134,
-       "É": 135,
-       "Ê": 136,
-       "Ë": 137,
-       "Ì": 138,
-       "Í": 139,
-       "Î": 140,
-       "Ï": 141,
-       "Ð": 142,
-       "Ñ": 143,
-       "Ò": 144,
-       "Ó": 145,
-       "Ô": 146,
-       "Õ": 147,
-       "Ö": 148,
-       "×": 149,
-       "Ø": 150,
-       "Ù": 151,
-       "Ú": 152,
-       "Û": 153,
-       "Ü": 154,
-       "Ý": 155,
-       "Þ": 156,
-       "ß": 157,
-       "à": 158,
-       "á": 159,
-       "â": 160,
-       "ã": 161,
-       "ä": 162,
-       "å": 163,
-       "æ": 164,
-       "ç": 165,
-       "è": 166,
-       "é": 167,
-       "ê": 168,
-       "ë": 169,
-       "ì": 170,
-       "í": 171,
-       "î": 172,
-       "ï": 173,
-       "ð": 174,
-       "ñ": 175,
-       "ò": 176,
-       "ó": 177,
-       "ô": 178,
-       "õ": 179,
-       "ö": 180,
-       "÷": 181,
-       "ø": 182,
-       "ù": 183,
-       "ú": 184,
-       "û": 185,
-       "ü": 186,
-       "ý": 187,
-       "þ": 188,
-       "ÿ": 189,
-       "Ā": 190,
-       "ā": 191,
-       "Ă": 192,
-       "ă": 193,
-       "Ą": 194,
-       "ą": 195,
-       "Ć": 196,
-       "ć": 197,
-       "Ĉ": 198,
-       "ĉ": 199,
-       "Ċ": 200,
-       "ċ": 201,
-       "Č": 202,
-       "č": 203,
-       "Ď": 204,
-       "ď": 205,
-       "Đ": 206,
-       "đ": 207,
-       "Ē": 208,
-       "ē": 209,
-       "Ĕ": 210,
-       "ĕ": 211,
-       "Ė": 212,
-       "ė": 213,
-       "Ę": 214,
-       "ę": 215,
-       "Ě": 216,
-       "ě": 217,
-       "Ĝ": 218,
-       "ĝ": 219,
-       "Ğ": 220,
-       "ğ": 221,
-       "Ġ": 222,
-       "ġ": 223,
-       "Ģ": 224,
-       "ģ": 225,
-       "Ĥ": 226,
-       "ĥ": 227,
-       "Ħ": 228,
-       "ħ": 229,
-       "Ĩ": 230,
-       "ĩ": 231,
-       "Ī": 232,
-       "ī": 233,
-       "Ĭ": 234,
-       "ĭ": 235,
-       "Į": 236,
-       "į": 237,
-       "İ": 238,
-       "ı": 239,
-       "IJ": 240,
-       "ij": 241,
-       "Ĵ": 242,
-       "ĵ": 243,
-       "Ķ": 244,
-       "ķ": 245,
-       "ĸ": 246,
-       "Ĺ": 247,
-       "ĺ": 248,
-       "Ļ": 249,
-       "ļ": 250,
-       "Ľ": 251,
-       "ľ": 252,
-       "Ŀ": 253,
-       "ŀ": 254,
-       "Ł": 255,
-       "ł": 256,
-       "Ń": 257
-     },
-     "merges": []
-   }
- }
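With 258 vocabulary entries (the two specials plus 256 byte symbols) and an empty `merges` list, this BPE model never merges anything: it is effectively a pure byte-level tokenizer, one token per UTF-8 byte. A sketch of loading it with the `tokenizers` library (the path is hypothetical, since this commit deletes the file):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")  # hypothetical local copy

enc = tok.encode("hello")
print(enc.ids)              # [222, 73, 70, 77, 77, 80]: a leading 'Ġ' (the byte-level
                            # space from add_prefix_space), then one id per byte
print(tok.decode(enc.ids))  # round-trips to "hello" (modulo the prefix space)
```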
llm/fw57M-multi-tied/tokenizer_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "add_prefix_space": true,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<|padding|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<|endoftext|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": null,
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "<|endoftext|>",
-   "extra_special_tokens": {},
-   "model_max_length": 1000000000000000019884624838656,
-   "pad_token": "<|padding|>",
-   "tokenizer_class": "PreTrainedTokenizer",
-   "unk_token": null
- }
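For completeness, the deleted tokenizer could be reconstructed in transformers roughly as follows; a sketch assuming a local copy of the files above, with the special tokens mirroring special_tokens_map.json:

```python
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # hypothetical local copy
    eos_token="<|endoftext|>",
    pad_token="<|padding|>",
)
print(tok("hello").input_ids)
```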