fatheroffire commited on
Commit
ed85df8
·
verified ·
1 Parent(s): 31892f1

Upload checkpoint-63/trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. checkpoint-63/trainer_state.json +601 -0
checkpoint-63/trainer_state.json ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 63,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016,
14
+ "grad_norm": 5.178493499755859,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9778,
17
+ "mean_token_accuracy": 0.5731211602687836,
18
+ "num_tokens": 18095.0,
19
+ "step": 1
20
+ },
21
+ {
22
+ "epoch": 0.032,
23
+ "grad_norm": 4.289798736572266,
24
+ "learning_rate": 0.0001,
25
+ "loss": 2.0501,
26
+ "mean_token_accuracy": 0.5611206591129303,
27
+ "num_tokens": 33560.0,
28
+ "step": 2
29
+ },
30
+ {
31
+ "epoch": 0.048,
32
+ "grad_norm": 2.592556953430176,
33
+ "learning_rate": 0.0002,
34
+ "loss": 1.8085,
35
+ "mean_token_accuracy": 0.5969144999980927,
36
+ "num_tokens": 51416.0,
37
+ "step": 3
38
+ },
39
+ {
40
+ "epoch": 0.064,
41
+ "grad_norm": 1.5414831638336182,
42
+ "learning_rate": 0.00019988066808963474,
43
+ "loss": 1.825,
44
+ "mean_token_accuracy": 0.5979213863611221,
45
+ "num_tokens": 66606.0,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.08,
50
+ "grad_norm": 1.532880425453186,
51
+ "learning_rate": 0.000199522988805313,
52
+ "loss": 1.6023,
53
+ "mean_token_accuracy": 0.6211032420396805,
54
+ "num_tokens": 81922.0,
55
+ "step": 5
56
+ },
57
+ {
58
+ "epoch": 0.096,
59
+ "grad_norm": 1.2121299505233765,
60
+ "learning_rate": 0.00019892791064819693,
61
+ "loss": 1.5619,
62
+ "mean_token_accuracy": 0.6331981718540192,
63
+ "num_tokens": 98973.0,
64
+ "step": 6
65
+ },
66
+ {
67
+ "epoch": 0.112,
68
+ "grad_norm": 0.9985714554786682,
69
+ "learning_rate": 0.00019809701165858222,
70
+ "loss": 1.401,
71
+ "mean_token_accuracy": 0.6640438586473465,
72
+ "num_tokens": 116786.0,
73
+ "step": 7
74
+ },
75
+ {
76
+ "epoch": 0.128,
77
+ "grad_norm": 1.235257863998413,
78
+ "learning_rate": 0.00019703249523121886,
79
+ "loss": 1.4001,
80
+ "mean_token_accuracy": 0.6453036367893219,
81
+ "num_tokens": 133738.0,
82
+ "step": 8
83
+ },
84
+ {
85
+ "epoch": 0.144,
86
+ "grad_norm": 1.1400103569030762,
87
+ "learning_rate": 0.00019573718427230442,
88
+ "loss": 1.2082,
89
+ "mean_token_accuracy": 0.6753191202878952,
90
+ "num_tokens": 155466.0,
91
+ "step": 9
92
+ },
93
+ {
94
+ "epoch": 0.16,
95
+ "grad_norm": 1.1436022520065308,
96
+ "learning_rate": 0.00019421451371364444,
97
+ "loss": 1.3161,
98
+ "mean_token_accuracy": 0.6673162132501602,
99
+ "num_tokens": 171140.0,
100
+ "step": 10
101
+ },
102
+ {
103
+ "epoch": 0.176,
104
+ "grad_norm": 0.9463217258453369,
105
+ "learning_rate": 0.00019246852140383043,
106
+ "loss": 1.3072,
107
+ "mean_token_accuracy": 0.662985697388649,
108
+ "num_tokens": 190013.0,
109
+ "step": 11
110
+ },
111
+ {
112
+ "epoch": 0.192,
113
+ "grad_norm": 0.718537449836731,
114
+ "learning_rate": 0.00019050383740059162,
115
+ "loss": 1.2311,
116
+ "mean_token_accuracy": 0.6756905913352966,
117
+ "num_tokens": 204990.0,
118
+ "step": 12
119
+ },
120
+ {
121
+ "epoch": 0.208,
122
+ "grad_norm": 0.5802047252655029,
123
+ "learning_rate": 0.000188325671692714,
124
+ "loss": 1.1599,
125
+ "mean_token_accuracy": 0.6960069835186005,
126
+ "num_tokens": 221669.0,
127
+ "step": 13
128
+ },
129
+ {
130
+ "epoch": 0.224,
131
+ "grad_norm": 0.6142511367797852,
132
+ "learning_rate": 0.0001859398003840867,
133
+ "loss": 1.1385,
134
+ "mean_token_accuracy": 0.6940146088600159,
135
+ "num_tokens": 238093.0,
136
+ "step": 14
137
+ },
138
+ {
139
+ "epoch": 0.24,
140
+ "grad_norm": 0.6557261347770691,
141
+ "learning_rate": 0.00018335255037651302,
142
+ "loss": 1.2242,
143
+ "mean_token_accuracy": 0.676676332950592,
144
+ "num_tokens": 255351.0,
145
+ "step": 15
146
+ },
147
+ {
148
+ "epoch": 0.256,
149
+ "grad_norm": 0.6320156455039978,
150
+ "learning_rate": 0.00018057078259190397,
151
+ "loss": 1.1493,
152
+ "mean_token_accuracy": 0.6945444643497467,
153
+ "num_tokens": 272261.0,
154
+ "step": 16
155
+ },
156
+ {
157
+ "epoch": 0.272,
158
+ "grad_norm": 0.58851158618927,
159
+ "learning_rate": 0.0001776018737783468,
160
+ "loss": 1.2255,
161
+ "mean_token_accuracy": 0.681264117360115,
162
+ "num_tokens": 288517.0,
163
+ "step": 17
164
+ },
165
+ {
166
+ "epoch": 0.288,
167
+ "grad_norm": 0.5777610540390015,
168
+ "learning_rate": 0.0001744536969482954,
169
+ "loss": 1.1277,
170
+ "mean_token_accuracy": 0.7005439400672913,
171
+ "num_tokens": 303397.0,
172
+ "step": 18
173
+ },
174
+ {
175
+ "epoch": 0.304,
176
+ "grad_norm": 0.4685754179954529,
177
+ "learning_rate": 0.00017113460050075638,
178
+ "loss": 1.0902,
179
+ "mean_token_accuracy": 0.7063727080821991,
180
+ "num_tokens": 320794.0,
181
+ "step": 19
182
+ },
183
+ {
184
+ "epoch": 0.32,
185
+ "grad_norm": 0.4389488101005554,
186
+ "learning_rate": 0.0001676533860828358,
187
+ "loss": 1.1166,
188
+ "mean_token_accuracy": 0.7016228437423706,
189
+ "num_tokens": 335928.0,
190
+ "step": 20
191
+ },
192
+ {
193
+ "epoch": 0.336,
194
+ "grad_norm": 0.47058454155921936,
195
+ "learning_rate": 0.00016401928524935314,
196
+ "loss": 1.0497,
197
+ "mean_token_accuracy": 0.7144980132579803,
198
+ "num_tokens": 353143.0,
199
+ "step": 21
200
+ },
201
+ {
202
+ "epoch": 0.352,
203
+ "grad_norm": 0.444322794675827,
204
+ "learning_rate": 0.0001602419349824178,
205
+ "loss": 1.0521,
206
+ "mean_token_accuracy": 0.7101306021213531,
207
+ "num_tokens": 371702.0,
208
+ "step": 22
209
+ },
210
+ {
211
+ "epoch": 0.368,
212
+ "grad_norm": 0.3345094621181488,
213
+ "learning_rate": 0.0001563313521358848,
214
+ "loss": 1.0394,
215
+ "mean_token_accuracy": 0.7164563238620758,
216
+ "num_tokens": 390548.0,
217
+ "step": 23
218
+ },
219
+ {
220
+ "epoch": 0.384,
221
+ "grad_norm": 0.318149209022522,
222
+ "learning_rate": 0.00015229790687245882,
223
+ "loss": 1.0316,
224
+ "mean_token_accuracy": 0.7225096970796585,
225
+ "num_tokens": 408577.0,
226
+ "step": 24
227
+ },
228
+ {
229
+ "epoch": 0.4,
230
+ "grad_norm": 0.36164650321006775,
231
+ "learning_rate": 0.0001481522951638875,
232
+ "loss": 1.0928,
233
+ "mean_token_accuracy": 0.7029632329940796,
234
+ "num_tokens": 426544.0,
235
+ "step": 25
236
+ },
237
+ {
238
+ "epoch": 0.416,
239
+ "grad_norm": 0.3186567723751068,
240
+ "learning_rate": 0.00014390551042716668,
241
+ "loss": 1.0932,
242
+ "mean_token_accuracy": 0.7054416686296463,
243
+ "num_tokens": 444668.0,
244
+ "step": 26
245
+ },
246
+ {
247
+ "epoch": 0.432,
248
+ "grad_norm": 0.3198276460170746,
249
+ "learning_rate": 0.00013956881437197514,
250
+ "loss": 0.8921,
251
+ "mean_token_accuracy": 0.7491990774869919,
252
+ "num_tokens": 460783.0,
253
+ "step": 27
254
+ },
255
+ {
256
+ "epoch": 0.448,
257
+ "grad_norm": 0.3033614158630371,
258
+ "learning_rate": 0.00013515370713664487,
259
+ "loss": 1.0343,
260
+ "mean_token_accuracy": 0.7144427299499512,
261
+ "num_tokens": 479754.0,
262
+ "step": 28
263
+ },
264
+ {
265
+ "epoch": 0.464,
266
+ "grad_norm": 0.3438175618648529,
267
+ "learning_rate": 0.00013067189679186162,
268
+ "loss": 1.0337,
269
+ "mean_token_accuracy": 0.7172342389822006,
270
+ "num_tokens": 496210.0,
271
+ "step": 29
272
+ },
273
+ {
274
+ "epoch": 0.48,
275
+ "grad_norm": 0.31151092052459717,
276
+ "learning_rate": 0.00012613526829296622,
277
+ "loss": 1.0293,
278
+ "mean_token_accuracy": 0.7132327109575272,
279
+ "num_tokens": 513602.0,
280
+ "step": 30
281
+ },
282
+ {
283
+ "epoch": 0.496,
284
+ "grad_norm": 0.37670907378196716,
285
+ "learning_rate": 0.0001215558519631896,
286
+ "loss": 1.0391,
287
+ "mean_token_accuracy": 0.7135330736637115,
288
+ "num_tokens": 528716.0,
289
+ "step": 31
290
+ },
291
+ {
292
+ "epoch": 0.512,
293
+ "grad_norm": 0.332743376493454,
294
+ "learning_rate": 0.0001169457915913982,
295
+ "loss": 1.0581,
296
+ "mean_token_accuracy": 0.7063487768173218,
297
+ "num_tokens": 545676.0,
298
+ "step": 32
299
+ },
300
+ {
301
+ "epoch": 0.528,
302
+ "grad_norm": 0.33181434869766235,
303
+ "learning_rate": 0.000112317312228949,
304
+ "loss": 1.0115,
305
+ "mean_token_accuracy": 0.7126458883285522,
306
+ "num_tokens": 558919.0,
307
+ "step": 33
308
+ },
309
+ {
310
+ "epoch": 0.544,
311
+ "grad_norm": 0.29894986748695374,
312
+ "learning_rate": 0.00010768268777105104,
313
+ "loss": 1.1125,
314
+ "mean_token_accuracy": 0.7048945128917694,
315
+ "num_tokens": 575928.0,
316
+ "step": 34
317
+ },
318
+ {
319
+ "epoch": 0.56,
320
+ "grad_norm": 0.3128003776073456,
321
+ "learning_rate": 0.00010305420840860182,
322
+ "loss": 1.0834,
323
+ "mean_token_accuracy": 0.7064289450645447,
324
+ "num_tokens": 591942.0,
325
+ "step": 35
326
+ },
327
+ {
328
+ "epoch": 0.576,
329
+ "grad_norm": 0.29789265990257263,
330
+ "learning_rate": 9.844414803681041e-05,
331
+ "loss": 0.942,
332
+ "mean_token_accuracy": 0.7331829965114594,
333
+ "num_tokens": 607626.0,
334
+ "step": 36
335
+ },
336
+ {
337
+ "epoch": 0.592,
338
+ "grad_norm": 0.2763383686542511,
339
+ "learning_rate": 9.386473170703382e-05,
340
+ "loss": 1.0408,
341
+ "mean_token_accuracy": 0.705119863152504,
342
+ "num_tokens": 626687.0,
343
+ "step": 37
344
+ },
345
+ {
346
+ "epoch": 0.608,
347
+ "grad_norm": 0.27845677733421326,
348
+ "learning_rate": 8.932810320813843e-05,
349
+ "loss": 0.963,
350
+ "mean_token_accuracy": 0.7295580208301544,
351
+ "num_tokens": 643989.0,
352
+ "step": 38
353
+ },
354
+ {
355
+ "epoch": 0.624,
356
+ "grad_norm": 0.2680862247943878,
357
+ "learning_rate": 8.484629286335517e-05,
358
+ "loss": 1.0114,
359
+ "mean_token_accuracy": 0.7323517799377441,
360
+ "num_tokens": 663165.0,
361
+ "step": 39
362
+ },
363
+ {
364
+ "epoch": 0.64,
365
+ "grad_norm": 0.2760467231273651,
366
+ "learning_rate": 8.043118562802488e-05,
367
+ "loss": 0.9966,
368
+ "mean_token_accuracy": 0.7186093181371689,
369
+ "num_tokens": 681802.0,
370
+ "step": 40
371
+ },
372
+ {
373
+ "epoch": 0.656,
374
+ "grad_norm": 0.26565995812416077,
375
+ "learning_rate": 7.609448957283334e-05,
376
+ "loss": 0.9775,
377
+ "mean_token_accuracy": 0.7272415161132812,
378
+ "num_tokens": 699772.0,
379
+ "step": 41
380
+ },
381
+ {
382
+ "epoch": 0.672,
383
+ "grad_norm": 0.29315948486328125,
384
+ "learning_rate": 7.184770483611256e-05,
385
+ "loss": 0.8769,
386
+ "mean_token_accuracy": 0.7540310174226761,
387
+ "num_tokens": 715209.0,
388
+ "step": 42
389
+ },
390
+ {
391
+ "epoch": 0.688,
392
+ "grad_norm": 0.24506399035453796,
393
+ "learning_rate": 6.770209312754124e-05,
394
+ "loss": 0.9194,
395
+ "mean_token_accuracy": 0.74180668592453,
396
+ "num_tokens": 735810.0,
397
+ "step": 43
398
+ },
399
+ {
400
+ "epoch": 0.704,
401
+ "grad_norm": 0.3123375177383423,
402
+ "learning_rate": 6.366864786411526e-05,
403
+ "loss": 1.0908,
404
+ "mean_token_accuracy": 0.7076454162597656,
405
+ "num_tokens": 752738.0,
406
+ "step": 44
407
+ },
408
+ {
409
+ "epoch": 0.72,
410
+ "grad_norm": 0.2974242866039276,
411
+ "learning_rate": 5.9758065017582185e-05,
412
+ "loss": 1.044,
413
+ "mean_token_accuracy": 0.712090015411377,
414
+ "num_tokens": 766970.0,
415
+ "step": 45
416
+ },
417
+ {
418
+ "epoch": 0.736,
419
+ "grad_norm": 0.3142162561416626,
420
+ "learning_rate": 5.598071475064688e-05,
421
+ "loss": 1.1099,
422
+ "mean_token_accuracy": 0.7006032019853592,
423
+ "num_tokens": 781115.0,
424
+ "step": 46
425
+ },
426
+ {
427
+ "epoch": 0.752,
428
+ "grad_norm": 0.2571069598197937,
429
+ "learning_rate": 5.2346613917164246e-05,
430
+ "loss": 0.9873,
431
+ "mean_token_accuracy": 0.7231301814317703,
432
+ "num_tokens": 801039.0,
433
+ "step": 47
434
+ },
435
+ {
436
+ "epoch": 0.768,
437
+ "grad_norm": 0.2709988057613373,
438
+ "learning_rate": 4.886539949924362e-05,
439
+ "loss": 0.8684,
440
+ "mean_token_accuracy": 0.7532013207674026,
441
+ "num_tokens": 818783.0,
442
+ "step": 48
443
+ },
444
+ {
445
+ "epoch": 0.784,
446
+ "grad_norm": 0.2727779746055603,
447
+ "learning_rate": 4.554630305170462e-05,
448
+ "loss": 1.0595,
449
+ "mean_token_accuracy": 0.712963730096817,
450
+ "num_tokens": 836166.0,
451
+ "step": 49
452
+ },
453
+ {
454
+ "epoch": 0.8,
455
+ "grad_norm": 0.2671469449996948,
456
+ "learning_rate": 4.2398126221653236e-05,
457
+ "loss": 0.9982,
458
+ "mean_token_accuracy": 0.7242924720048904,
459
+ "num_tokens": 856465.0,
460
+ "step": 50
461
+ },
462
+ {
463
+ "epoch": 0.816,
464
+ "grad_norm": 0.29176750779151917,
465
+ "learning_rate": 3.9429217408096075e-05,
466
+ "loss": 0.9357,
467
+ "mean_token_accuracy": 0.7428720593452454,
468
+ "num_tokens": 874214.0,
469
+ "step": 51
470
+ },
471
+ {
472
+ "epoch": 0.832,
473
+ "grad_norm": 0.2942723035812378,
474
+ "learning_rate": 3.664744962348699e-05,
475
+ "loss": 1.0035,
476
+ "mean_token_accuracy": 0.7168596237897873,
477
+ "num_tokens": 890584.0,
478
+ "step": 52
479
+ },
480
+ {
481
+ "epoch": 0.848,
482
+ "grad_norm": 0.29683244228363037,
483
+ "learning_rate": 3.40601996159133e-05,
484
+ "loss": 1.0647,
485
+ "mean_token_accuracy": 0.7118570357561111,
486
+ "num_tokens": 906031.0,
487
+ "step": 53
488
+ },
489
+ {
490
+ "epoch": 0.864,
491
+ "grad_norm": 0.2509916126728058,
492
+ "learning_rate": 3.167432830728603e-05,
493
+ "loss": 0.9915,
494
+ "mean_token_accuracy": 0.7244430631399155,
495
+ "num_tokens": 926256.0,
496
+ "step": 54
497
+ },
498
+ {
499
+ "epoch": 0.88,
500
+ "grad_norm": 0.29129576683044434,
501
+ "learning_rate": 2.949616259940842e-05,
502
+ "loss": 1.0276,
503
+ "mean_token_accuracy": 0.712443009018898,
504
+ "num_tokens": 942399.0,
505
+ "step": 55
506
+ },
507
+ {
508
+ "epoch": 0.896,
509
+ "grad_norm": 0.27668559551239014,
510
+ "learning_rate": 2.7531478596169587e-05,
511
+ "loss": 1.0886,
512
+ "mean_token_accuracy": 0.7021819353103638,
513
+ "num_tokens": 959208.0,
514
+ "step": 56
515
+ },
516
+ {
517
+ "epoch": 0.912,
518
+ "grad_norm": 0.27250081300735474,
519
+ "learning_rate": 2.5785486286355586e-05,
520
+ "loss": 1.1176,
521
+ "mean_token_accuracy": 0.6994952410459518,
522
+ "num_tokens": 978283.0,
523
+ "step": 57
524
+ },
525
+ {
526
+ "epoch": 0.928,
527
+ "grad_norm": 0.47203993797302246,
528
+ "learning_rate": 2.4262815727695575e-05,
529
+ "loss": 1.0219,
530
+ "mean_token_accuracy": 0.7143173664808273,
531
+ "num_tokens": 991015.0,
532
+ "step": 58
533
+ },
534
+ {
535
+ "epoch": 0.944,
536
+ "grad_norm": 0.27971458435058594,
537
+ "learning_rate": 2.2967504768781168e-05,
538
+ "loss": 1.0169,
539
+ "mean_token_accuracy": 0.7231282144784927,
540
+ "num_tokens": 1008413.0,
541
+ "step": 59
542
+ },
543
+ {
544
+ "epoch": 0.96,
545
+ "grad_norm": 0.2779776155948639,
546
+ "learning_rate": 2.19029883414178e-05,
547
+ "loss": 0.9886,
548
+ "mean_token_accuracy": 0.721703976392746,
549
+ "num_tokens": 1025326.0,
550
+ "step": 60
551
+ },
552
+ {
553
+ "epoch": 0.976,
554
+ "grad_norm": 0.2798386812210083,
555
+ "learning_rate": 2.1072089351803067e-05,
556
+ "loss": 0.9339,
557
+ "mean_token_accuracy": 0.7390532195568085,
558
+ "num_tokens": 1041871.0,
559
+ "step": 61
560
+ },
561
+ {
562
+ "epoch": 0.992,
563
+ "grad_norm": 0.27825894951820374,
564
+ "learning_rate": 2.0477011194686986e-05,
565
+ "loss": 0.9924,
566
+ "mean_token_accuracy": 0.7242806553840637,
567
+ "num_tokens": 1058238.0,
568
+ "step": 62
569
+ },
570
+ {
571
+ "epoch": 1.0,
572
+ "grad_norm": 0.4403225779533386,
573
+ "learning_rate": 2.0119331910365256e-05,
574
+ "loss": 1.0934,
575
+ "mean_token_accuracy": 0.6979949474334717,
576
+ "num_tokens": 1065193.0,
577
+ "step": 63
578
+ }
579
+ ],
580
+ "logging_steps": 1,
581
+ "max_steps": 63,
582
+ "num_input_tokens_seen": 0,
583
+ "num_train_epochs": 1,
584
+ "save_steps": 500,
585
+ "stateful_callbacks": {
586
+ "TrainerControl": {
587
+ "args": {
588
+ "should_epoch_stop": false,
589
+ "should_evaluate": false,
590
+ "should_log": false,
591
+ "should_save": true,
592
+ "should_training_stop": true
593
+ },
594
+ "attributes": {}
595
+ }
596
+ },
597
+ "total_flos": 1.995467289064105e+17,
598
+ "train_batch_size": 4,
599
+ "trial_name": null,
600
+ "trial_params": null
601
+ }