m4lw4r3exe commited on
Commit
6498a09
·
1 Parent(s): e7c3284

Training in progress, step 24576

Browse files
checkpoint-114688/config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "activation_function": "gelu_new",
3
- "architectures": [
4
- "GPT2LMHeadModel"
5
- ],
6
- "attn_pdrop": 0.1,
7
- "bos_token_id": 50256,
8
- "embd_pdrop": 0.1,
9
- "eos_token_id": 50256,
10
- "initializer_range": 0.02,
11
- "layer_norm_epsilon": 1e-05,
12
- "model_type": "gpt2",
13
- "n_embd": 512,
14
- "n_head": 8,
15
- "n_inner": null,
16
- "n_layer": 8,
17
- "n_positions": 2048,
18
- "pad_token_id": 1,
19
- "reorder_and_upcast_attn": false,
20
- "resid_pdrop": 0.1,
21
- "scale_attn_by_inverse_layer_idx": false,
22
- "scale_attn_weights": true,
23
- "summary_activation": null,
24
- "summary_first_dropout": 0.1,
25
- "summary_proj_to_labels": true,
26
- "summary_type": "cls_index",
27
- "summary_use_proj": true,
28
- "torch_dtype": "float32",
29
- "transformers_version": "4.26.0.dev0",
30
- "use_cache": true,
31
- "vocab_size": 299
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-114688/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b68c6edb60414cfe637e2ec074a5f4acec75f5337e31730978b158998228eac5
3
- size 211432837
 
 
 
 
checkpoint-114688/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6849dc98fcc9e4181c102fa0bb6dd1d70c54edae9d1a8440b604c6f98ba117b4
3
- size 139279005
 
 
 
 
checkpoint-114688/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc866ce242256f98027b6016930d35e9d45293516285c23fdbadea7b58591d8a
3
- size 15597
 
 
 
 
checkpoint-114688/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9778e4932801d6fcefc7836eddd9fd5635b262a07b6eb7a75700d5c35643bb19
3
- size 557
 
 
 
 
checkpoint-114688/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fa9e591dbfa043fe5b9503f81bacab889f557e7055edb15e3594fae6f28701b
3
- size 627
 
 
 
 
checkpoint-114688/trainer_state.json DELETED
@@ -1,800 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 7.735599622285175,
5
- "global_step": 114688,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.14,
12
- "learning_rate": 0.00020480000000000002,
13
- "loss": 2.1719,
14
- "step": 2048
15
- },
16
- {
17
- "epoch": 0.14,
18
- "eval_loss": 1.0833024978637695,
19
- "eval_runtime": 15.7887,
20
- "eval_samples_per_second": 132.5,
21
- "eval_steps_per_second": 8.297,
22
- "step": 2048
23
- },
24
- {
25
- "epoch": 0.28,
26
- "learning_rate": 0.00040960000000000004,
27
- "loss": 0.9468,
28
- "step": 4096
29
- },
30
- {
31
- "epoch": 0.28,
32
- "eval_loss": 0.8000818490982056,
33
- "eval_runtime": 17.2632,
34
- "eval_samples_per_second": 121.183,
35
- "eval_steps_per_second": 7.588,
36
- "step": 4096
37
- },
38
- {
39
- "epoch": 0.41,
40
- "learning_rate": 0.0004998749142723946,
41
- "loss": 0.7542,
42
- "step": 6144
43
- },
44
- {
45
- "epoch": 0.41,
46
- "eval_loss": 0.7439925074577332,
47
- "eval_runtime": 17.4134,
48
- "eval_samples_per_second": 120.138,
49
- "eval_steps_per_second": 7.523,
50
- "step": 6144
51
- },
52
- {
53
- "epoch": 0.55,
54
- "learning_rate": 0.0004990273340312486,
55
- "loss": 0.6756,
56
- "step": 8192
57
- },
58
- {
59
- "epoch": 0.55,
60
- "eval_loss": 0.7018134593963623,
61
- "eval_runtime": 15.8601,
62
- "eval_samples_per_second": 131.904,
63
- "eval_steps_per_second": 8.26,
64
- "step": 8192
65
- },
66
- {
67
- "epoch": 0.69,
68
- "learning_rate": 0.0004973820371438889,
69
- "loss": 0.631,
70
- "step": 10240
71
- },
72
- {
73
- "epoch": 0.69,
74
- "eval_loss": 0.684688925743103,
75
- "eval_runtime": 18.0314,
76
- "eval_samples_per_second": 116.02,
77
- "eval_steps_per_second": 7.265,
78
- "step": 10240
79
- },
80
- {
81
- "epoch": 0.83,
82
- "learning_rate": 0.0004949442940386407,
83
- "loss": 0.5989,
84
- "step": 12288
85
- },
86
- {
87
- "epoch": 0.83,
88
- "eval_loss": 0.6822534203529358,
89
- "eval_runtime": 17.4182,
90
- "eval_samples_per_second": 120.104,
91
- "eval_steps_per_second": 7.521,
92
- "step": 12288
93
- },
94
- {
95
- "epoch": 0.97,
96
- "learning_rate": 0.0004917201492451735,
97
- "loss": 0.5776,
98
- "step": 14336
99
- },
100
- {
101
- "epoch": 0.97,
102
- "eval_loss": 0.6674544215202332,
103
- "eval_runtime": 17.7487,
104
- "eval_samples_per_second": 117.868,
105
- "eval_steps_per_second": 7.381,
106
- "step": 14336
107
- },
108
- {
109
- "epoch": 1.11,
110
- "learning_rate": 0.0004877230785006913,
111
- "loss": 0.5525,
112
- "step": 16384
113
- },
114
- {
115
- "epoch": 1.11,
116
- "eval_loss": 0.6640163660049438,
117
- "eval_runtime": 15.8951,
118
- "eval_samples_per_second": 131.613,
119
- "eval_steps_per_second": 8.242,
120
- "step": 16384
121
- },
122
- {
123
- "epoch": 1.24,
124
- "learning_rate": 0.0004829670105577666,
125
- "loss": 0.5373,
126
- "step": 18432
127
- },
128
- {
129
- "epoch": 1.24,
130
- "eval_loss": 0.6663170456886292,
131
- "eval_runtime": 17.5,
132
- "eval_samples_per_second": 119.543,
133
- "eval_steps_per_second": 7.486,
134
- "step": 18432
135
- },
136
- {
137
- "epoch": 1.38,
138
- "learning_rate": 0.0004774596641323791,
139
- "loss": 0.5261,
140
- "step": 20480
141
- },
142
- {
143
- "epoch": 1.38,
144
- "eval_loss": 0.6518951654434204,
145
- "eval_runtime": 17.1326,
146
- "eval_samples_per_second": 122.107,
147
- "eval_steps_per_second": 7.646,
148
- "step": 20480
149
- },
150
- {
151
- "epoch": 1.52,
152
- "learning_rate": 0.0004712261976082475,
153
- "loss": 0.5115,
154
- "step": 22528
155
- },
156
- {
157
- "epoch": 1.52,
158
- "eval_loss": 0.6658991575241089,
159
- "eval_runtime": 17.6874,
160
- "eval_samples_per_second": 118.277,
161
- "eval_steps_per_second": 7.406,
162
- "step": 22528
163
- },
164
- {
165
- "epoch": 1.66,
166
- "learning_rate": 0.00046428407064289515,
167
- "loss": 0.5033,
168
- "step": 24576
169
- },
170
- {
171
- "epoch": 1.66,
172
- "eval_loss": 0.6602604389190674,
173
- "eval_runtime": 15.8845,
174
- "eval_samples_per_second": 131.701,
175
- "eval_steps_per_second": 8.247,
176
- "step": 24576
177
- },
178
- {
179
- "epoch": 1.8,
180
- "learning_rate": 0.00045665163060732317,
181
- "loss": 0.4925,
182
- "step": 26624
183
- },
184
- {
185
- "epoch": 1.8,
186
- "eval_loss": 0.6551440954208374,
187
- "eval_runtime": 17.7902,
188
- "eval_samples_per_second": 117.593,
189
- "eval_steps_per_second": 7.364,
190
- "step": 26624
191
- },
192
- {
193
- "epoch": 1.93,
194
- "learning_rate": 0.000448360778288479,
195
- "loss": 0.4868,
196
- "step": 28672
197
- },
198
- {
199
- "epoch": 1.93,
200
- "eval_loss": 0.6580803394317627,
201
- "eval_runtime": 15.7841,
202
- "eval_samples_per_second": 132.538,
203
- "eval_steps_per_second": 8.299,
204
- "step": 28672
205
- },
206
- {
207
- "epoch": 2.07,
208
- "learning_rate": 0.00043942999964293453,
209
- "loss": 0.4741,
210
- "step": 30720
211
- },
212
- {
213
- "epoch": 2.07,
214
- "eval_loss": 0.661230742931366,
215
- "eval_runtime": 18.015,
216
- "eval_samples_per_second": 116.126,
217
- "eval_steps_per_second": 7.272,
218
- "step": 30720
219
- },
220
- {
221
- "epoch": 2.21,
222
- "learning_rate": 0.0004298966220346151,
223
- "loss": 0.4632,
224
- "step": 32768
225
- },
226
- {
227
- "epoch": 2.21,
228
- "eval_loss": 0.6618030071258545,
229
- "eval_runtime": 17.9073,
230
- "eval_samples_per_second": 116.824,
231
- "eval_steps_per_second": 7.315,
232
- "step": 32768
233
- },
234
- {
235
- "epoch": 2.35,
236
- "learning_rate": 0.00041978697624050446,
237
- "loss": 0.4584,
238
- "step": 34816
239
- },
240
- {
241
- "epoch": 2.35,
242
- "eval_loss": 0.6579039096832275,
243
- "eval_runtime": 17.8241,
244
- "eval_samples_per_second": 117.369,
245
- "eval_steps_per_second": 7.35,
246
- "step": 34816
247
- },
248
- {
249
- "epoch": 2.49,
250
- "learning_rate": 0.0004091334467888659,
251
- "loss": 0.451,
252
- "step": 36864
253
- },
254
- {
255
- "epoch": 2.49,
256
- "eval_loss": 0.6627541780471802,
257
- "eval_runtime": 17.6822,
258
- "eval_samples_per_second": 118.311,
259
- "eval_steps_per_second": 7.409,
260
- "step": 36864
261
- },
262
- {
263
- "epoch": 2.62,
264
- "learning_rate": 0.00039796458815002033,
265
- "loss": 0.445,
266
- "step": 38912
267
- },
268
- {
269
- "epoch": 2.62,
270
- "eval_loss": 0.656867265701294,
271
- "eval_runtime": 15.8659,
272
- "eval_samples_per_second": 131.855,
273
- "eval_steps_per_second": 8.257,
274
- "step": 38912
275
- },
276
- {
277
- "epoch": 2.76,
278
- "learning_rate": 0.0003863212870708643,
279
- "loss": 0.4404,
280
- "step": 40960
281
- },
282
- {
283
- "epoch": 2.76,
284
- "eval_loss": 0.6653844118118286,
285
- "eval_runtime": 17.9192,
286
- "eval_samples_per_second": 116.747,
287
- "eval_steps_per_second": 7.311,
288
- "step": 40960
289
- },
290
- {
291
- "epoch": 2.9,
292
- "learning_rate": 0.00037424687637876156,
293
- "loss": 0.4334,
294
- "step": 43008
295
- },
296
- {
297
- "epoch": 2.9,
298
- "eval_loss": 0.6553301811218262,
299
- "eval_runtime": 15.9304,
300
- "eval_samples_per_second": 131.321,
301
- "eval_steps_per_second": 8.223,
302
- "step": 43008
303
- },
304
- {
305
- "epoch": 3.04,
306
- "learning_rate": 0.00036178064571954134,
307
- "loss": 0.4278,
308
- "step": 45056
309
- },
310
- {
311
- "epoch": 3.04,
312
- "eval_loss": 0.6766741871833801,
313
- "eval_runtime": 16.9834,
314
- "eval_samples_per_second": 123.179,
315
- "eval_steps_per_second": 7.713,
316
- "step": 45056
317
- },
318
- {
319
- "epoch": 3.18,
320
- "learning_rate": 0.0003489439971353363,
321
- "loss": 0.4152,
322
- "step": 47104
323
- },
324
- {
325
- "epoch": 3.18,
326
- "eval_loss": 0.6645193696022034,
327
- "eval_runtime": 15.816,
328
- "eval_samples_per_second": 132.271,
329
- "eval_steps_per_second": 8.283,
330
- "step": 47104
331
- },
332
- {
333
- "epoch": 3.32,
334
- "learning_rate": 0.0003357965820476752,
335
- "loss": 0.411,
336
- "step": 49152
337
- },
338
- {
339
- "epoch": 3.32,
340
- "eval_loss": 0.6748972535133362,
341
- "eval_runtime": 17.7507,
342
- "eval_samples_per_second": 117.855,
343
- "eval_steps_per_second": 7.38,
344
- "step": 49152
345
- },
346
- {
347
- "epoch": 3.45,
348
- "learning_rate": 0.0003223743322236833,
349
- "loss": 0.4085,
350
- "step": 51200
351
- },
352
- {
353
- "epoch": 3.45,
354
- "eval_loss": 0.6703893542289734,
355
- "eval_runtime": 17.341,
356
- "eval_samples_per_second": 120.639,
357
- "eval_steps_per_second": 7.554,
358
- "step": 51200
359
- },
360
- {
361
- "epoch": 3.59,
362
- "learning_rate": 0.0003087269633577651,
363
- "loss": 0.4037,
364
- "step": 53248
365
- },
366
- {
367
- "epoch": 3.59,
368
- "eval_loss": 0.6561577916145325,
369
- "eval_runtime": 15.9295,
370
- "eval_samples_per_second": 131.329,
371
- "eval_steps_per_second": 8.224,
372
- "step": 53248
373
- },
374
- {
375
- "epoch": 3.73,
376
- "learning_rate": 0.0002948780545870247,
377
- "loss": 0.3975,
378
- "step": 55296
379
- },
380
- {
381
- "epoch": 3.73,
382
- "eval_loss": 0.6698916554450989,
383
- "eval_runtime": 17.9008,
384
- "eval_samples_per_second": 116.866,
385
- "eval_steps_per_second": 7.318,
386
- "step": 55296
387
- },
388
- {
389
- "epoch": 3.87,
390
- "learning_rate": 0.0002808921064161573,
391
- "loss": 0.3917,
392
- "step": 57344
393
- },
394
- {
395
- "epoch": 3.87,
396
- "eval_loss": 0.7029281854629517,
397
- "eval_runtime": 17.8146,
398
- "eval_samples_per_second": 117.432,
399
- "eval_steps_per_second": 7.354,
400
- "step": 57344
401
- },
402
- {
403
- "epoch": 4.01,
404
- "learning_rate": 0.00026680720064442787,
405
- "loss": 0.3876,
406
- "step": 59392
407
- },
408
- {
409
- "epoch": 4.01,
410
- "eval_loss": 0.6837398409843445,
411
- "eval_runtime": 17.4354,
412
- "eval_samples_per_second": 119.986,
413
- "eval_steps_per_second": 7.513,
414
- "step": 59392
415
- },
416
- {
417
- "epoch": 4.14,
418
- "learning_rate": 0.00025266845586830784,
419
- "loss": 0.3751,
420
- "step": 61440
421
- },
422
- {
423
- "epoch": 4.14,
424
- "eval_loss": 0.690500795841217,
425
- "eval_runtime": 17.8964,
426
- "eval_samples_per_second": 116.895,
427
- "eval_steps_per_second": 7.32,
428
- "step": 61440
429
- },
430
- {
431
- "epoch": 4.28,
432
- "learning_rate": 0.00023851425721450398,
433
- "loss": 0.3719,
434
- "step": 63488
435
- },
436
- {
437
- "epoch": 4.28,
438
- "eval_loss": 0.6938452124595642,
439
- "eval_runtime": 15.7962,
440
- "eval_samples_per_second": 132.437,
441
- "eval_steps_per_second": 8.293,
442
- "step": 63488
443
- },
444
- {
445
- "epoch": 4.42,
446
- "learning_rate": 0.0002244106409269819,
447
- "loss": 0.3644,
448
- "step": 65536
449
- },
450
- {
451
- "epoch": 4.42,
452
- "eval_loss": 0.6987010836601257,
453
- "eval_runtime": 15.9556,
454
- "eval_samples_per_second": 131.114,
455
- "eval_steps_per_second": 8.21,
456
- "step": 65536
457
- },
458
- {
459
- "epoch": 4.56,
460
- "learning_rate": 0.00021037526400320187,
461
- "loss": 0.357,
462
- "step": 67584
463
- },
464
- {
465
- "epoch": 4.56,
466
- "eval_loss": 0.6973423957824707,
467
- "eval_runtime": 15.8273,
468
- "eval_samples_per_second": 132.177,
469
- "eval_steps_per_second": 8.277,
470
- "step": 67584
471
- },
472
- {
473
- "epoch": 4.7,
474
- "learning_rate": 0.0001964736950807942,
475
- "loss": 0.357,
476
- "step": 69632
477
- },
478
- {
479
- "epoch": 4.7,
480
- "eval_loss": 0.6949540972709656,
481
- "eval_runtime": 16.1065,
482
- "eval_samples_per_second": 129.885,
483
- "eval_steps_per_second": 8.133,
484
- "step": 69632
485
- },
486
- {
487
- "epoch": 4.83,
488
- "learning_rate": 0.00018274358855873096,
489
- "loss": 0.3502,
490
- "step": 71680
491
- },
492
- {
493
- "epoch": 4.83,
494
- "eval_loss": 0.6959769129753113,
495
- "eval_runtime": 16.2001,
496
- "eval_samples_per_second": 129.135,
497
- "eval_steps_per_second": 8.086,
498
- "step": 71680
499
- },
500
- {
501
- "epoch": 4.97,
502
- "learning_rate": 0.00016922892649452222,
503
- "loss": 0.3458,
504
- "step": 73728
505
- },
506
- {
507
- "epoch": 4.97,
508
- "eval_loss": 0.7017838358879089,
509
- "eval_runtime": 18.0467,
510
- "eval_samples_per_second": 115.922,
511
- "eval_steps_per_second": 7.259,
512
- "step": 73728
513
- },
514
- {
515
- "epoch": 5.11,
516
- "learning_rate": 0.00015597300080605504,
517
- "loss": 0.3302,
518
- "step": 75776
519
- },
520
- {
521
- "epoch": 5.11,
522
- "eval_loss": 0.7075567841529846,
523
- "eval_runtime": 16.7136,
524
- "eval_samples_per_second": 125.168,
525
- "eval_steps_per_second": 7.838,
526
- "step": 75776
527
- },
528
- {
529
- "epoch": 5.25,
530
- "learning_rate": 0.0001430182745933093,
531
- "loss": 0.3278,
532
- "step": 77824
533
- },
534
- {
535
- "epoch": 5.25,
536
- "eval_loss": 0.7178707718849182,
537
- "eval_runtime": 15.7714,
538
- "eval_samples_per_second": 132.645,
539
- "eval_steps_per_second": 8.306,
540
- "step": 77824
541
- },
542
- {
543
- "epoch": 5.39,
544
- "learning_rate": 0.00013040017526934073,
545
- "loss": 0.3224,
546
- "step": 79872
547
- },
548
- {
549
- "epoch": 5.39,
550
- "eval_loss": 0.725006639957428,
551
- "eval_runtime": 15.8618,
552
- "eval_samples_per_second": 131.889,
553
- "eval_steps_per_second": 8.259,
554
- "step": 79872
555
- },
556
- {
557
- "epoch": 5.53,
558
- "learning_rate": 0.00011817144183980649,
559
- "loss": 0.3181,
560
- "step": 81920
561
- },
562
- {
563
- "epoch": 5.53,
564
- "eval_loss": 0.7247176766395569,
565
- "eval_runtime": 17.8123,
566
- "eval_samples_per_second": 117.447,
567
- "eval_steps_per_second": 7.354,
568
- "step": 81920
569
- },
570
- {
571
- "epoch": 5.66,
572
- "learning_rate": 0.00010636499874117036,
573
- "loss": 0.3154,
574
- "step": 83968
575
- },
576
- {
577
- "epoch": 5.66,
578
- "eval_loss": 0.7248900532722473,
579
- "eval_runtime": 17.751,
580
- "eval_samples_per_second": 117.852,
581
- "eval_steps_per_second": 7.38,
582
- "step": 83968
583
- },
584
- {
585
- "epoch": 5.8,
586
- "learning_rate": 9.501866590283475e-05,
587
- "loss": 0.3105,
588
- "step": 86016
589
- },
590
- {
591
- "epoch": 5.8,
592
- "eval_loss": 0.7352888584136963,
593
- "eval_runtime": 18.0338,
594
- "eval_samples_per_second": 116.004,
595
- "eval_steps_per_second": 7.264,
596
- "step": 86016
597
- },
598
- {
599
- "epoch": 5.94,
600
- "learning_rate": 8.416361604489855e-05,
601
- "loss": 0.3035,
602
- "step": 88064
603
- },
604
- {
605
- "epoch": 5.94,
606
- "eval_loss": 0.7256708741188049,
607
- "eval_runtime": 15.94,
608
- "eval_samples_per_second": 131.242,
609
- "eval_steps_per_second": 8.218,
610
- "step": 88064
611
- },
612
- {
613
- "epoch": 6.08,
614
- "learning_rate": 7.384521927589935e-05,
615
- "loss": 0.294,
616
- "step": 90112
617
- },
618
- {
619
- "epoch": 6.08,
620
- "eval_loss": 0.7387065291404724,
621
- "eval_runtime": 16.8562,
622
- "eval_samples_per_second": 124.109,
623
- "eval_steps_per_second": 7.772,
624
- "step": 90112
625
- },
626
- {
627
- "epoch": 6.22,
628
- "learning_rate": 6.409110434142392e-05,
629
- "loss": 0.2866,
630
- "step": 92160
631
- },
632
- {
633
- "epoch": 6.22,
634
- "eval_loss": 0.7492235898971558,
635
- "eval_runtime": 17.2872,
636
- "eval_samples_per_second": 121.015,
637
- "eval_steps_per_second": 7.578,
638
- "step": 92160
639
- },
640
- {
641
- "epoch": 6.35,
642
- "learning_rate": 5.492819313147518e-05,
643
- "loss": 0.2836,
644
- "step": 94208
645
- },
646
- {
647
- "epoch": 6.35,
648
- "eval_loss": 0.7440558671951294,
649
- "eval_runtime": 17.5257,
650
- "eval_samples_per_second": 119.367,
651
- "eval_steps_per_second": 7.475,
652
- "step": 94208
653
- },
654
- {
655
- "epoch": 6.49,
656
- "learning_rate": 4.6394783238561305e-05,
657
- "loss": 0.2811,
658
- "step": 96256
659
- },
660
- {
661
- "epoch": 6.49,
662
- "eval_loss": 0.7577848434448242,
663
- "eval_runtime": 15.8536,
664
- "eval_samples_per_second": 131.957,
665
- "eval_steps_per_second": 8.263,
666
- "step": 96256
667
- },
668
- {
669
- "epoch": 6.63,
670
- "learning_rate": 3.851358797621554e-05,
671
- "loss": 0.2774,
672
- "step": 98304
673
- },
674
- {
675
- "epoch": 6.63,
676
- "eval_loss": 0.7483424544334412,
677
- "eval_runtime": 15.8485,
678
- "eval_samples_per_second": 132.0,
679
- "eval_steps_per_second": 8.266,
680
- "step": 98304
681
- },
682
- {
683
- "epoch": 6.77,
684
- "learning_rate": 3.130985341100834e-05,
685
- "loss": 0.2736,
686
- "step": 100352
687
- },
688
- {
689
- "epoch": 6.77,
690
- "eval_loss": 0.7481484413146973,
691
- "eval_runtime": 17.9168,
692
- "eval_samples_per_second": 116.762,
693
- "eval_steps_per_second": 7.312,
694
- "step": 100352
695
- },
696
- {
697
- "epoch": 6.91,
698
- "learning_rate": 2.4803653223119228e-05,
699
- "loss": 0.27,
700
- "step": 102400
701
- },
702
- {
703
- "epoch": 6.91,
704
- "eval_loss": 0.7659121751785278,
705
- "eval_runtime": 15.8798,
706
- "eval_samples_per_second": 131.739,
707
- "eval_steps_per_second": 8.249,
708
- "step": 102400
709
- },
710
- {
711
- "epoch": 7.04,
712
- "learning_rate": 1.902218093192909e-05,
713
- "loss": 0.2636,
714
- "step": 104448
715
- },
716
- {
717
- "epoch": 7.04,
718
- "eval_loss": 0.7734333276748657,
719
- "eval_runtime": 17.8497,
720
- "eval_samples_per_second": 117.201,
721
- "eval_steps_per_second": 7.339,
722
- "step": 104448
723
- },
724
- {
725
- "epoch": 7.18,
726
- "learning_rate": 1.398060674025281e-05,
727
- "loss": 0.2583,
728
- "step": 106496
729
- },
730
- {
731
- "epoch": 7.18,
732
- "eval_loss": 0.7774596810340881,
733
- "eval_runtime": 17.9006,
734
- "eval_samples_per_second": 116.868,
735
- "eval_steps_per_second": 7.318,
736
- "step": 106496
737
- },
738
- {
739
- "epoch": 7.32,
740
- "learning_rate": 9.695080472251094e-06,
741
- "loss": 0.2577,
742
- "step": 108544
743
- },
744
- {
745
- "epoch": 7.32,
746
- "eval_loss": 0.7828282117843628,
747
- "eval_runtime": 15.9506,
748
- "eval_samples_per_second": 131.155,
749
- "eval_steps_per_second": 8.213,
750
- "step": 108544
751
- },
752
- {
753
- "epoch": 7.46,
754
- "learning_rate": 6.177802621565725e-06,
755
- "loss": 0.2553,
756
- "step": 110592
757
- },
758
- {
759
- "epoch": 7.46,
760
- "eval_loss": 0.7818995714187622,
761
- "eval_runtime": 15.975,
762
- "eval_samples_per_second": 130.955,
763
- "eval_steps_per_second": 8.2,
764
- "step": 110592
765
- },
766
- {
767
- "epoch": 7.6,
768
- "learning_rate": 3.4434741135661028e-06,
769
- "loss": 0.2545,
770
- "step": 112640
771
- },
772
- {
773
- "epoch": 7.6,
774
- "eval_loss": 0.7817807197570801,
775
- "eval_runtime": 15.9873,
776
- "eval_samples_per_second": 130.854,
777
- "eval_steps_per_second": 8.194,
778
- "step": 112640
779
- },
780
- {
781
- "epoch": 7.74,
782
- "learning_rate": 1.498947438756143e-06,
783
- "loss": 0.2536,
784
- "step": 114688
785
- },
786
- {
787
- "epoch": 7.74,
788
- "eval_loss": 0.7830276489257812,
789
- "eval_runtime": 17.6639,
790
- "eval_samples_per_second": 118.434,
791
- "eval_steps_per_second": 7.416,
792
- "step": 114688
793
- }
794
- ],
795
- "max_steps": 118608,
796
- "num_train_epochs": 8,
797
- "total_flos": 4.702934009537618e+17,
798
- "trial_name": null,
799
- "trial_params": null
800
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-114688/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0200139492e444a9d322a4f90a96e6dde09c7a882f05b816c2345dade5ea0f98
3
- size 3515
 
 
 
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7098b0183890d9e5a06af29dac1f65359434ef6ca388e453e5b64e173f3e5ccd
3
  size 139279005
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:319da55f7bdc14e455deb61ce68f9663cffd4b765a97952f7c04b2708e957075
3
  size 139279005