hssawhney commited on
Commit
39da989
·
verified ·
1 Parent(s): df9225e

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +0 -3
  2. rng_state.pth +3 -0
  3. scheduler.pt +3 -0
  4. tokenizer.json +2 -2
  5. trainer_state.json +819 -1527
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a03b2ce91ceab3d848e78ff1b76b74dd35ce5147b452c9292df7245c1aa088fb
3
- size 1192135096
 
 
 
 
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:badda73606807bdb5725e7e75013fdd1ead0f2a3f2f29bd131bb02292e40d7f2
3
+ size 1064
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c760c4e715b8fa39254607191eca619d9b0612d5b29d3002ac512a5b6cad7d55
3
- size 11422934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
trainer_state.json CHANGED
@@ -1,2141 +1,1433 @@
1
  {
2
- "best_global_step": 1500,
3
- "best_metric": 0.8288407325744629,
4
- "best_model_checkpoint": "output/reasoning-model_v2/checkpoint-1500",
5
- "epoch": 0.13859373556315255,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.0004619791185438418,
14
- "grad_norm": 10.1875,
15
- "learning_rate": 2.4615384615384616e-07,
16
- "loss": 0.9733,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.0009239582370876836,
21
- "grad_norm": 7.46875,
22
- "learning_rate": 5.53846153846154e-07,
23
- "loss": 1.0462,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.0013859373556315254,
28
- "grad_norm": 10.0625,
29
- "learning_rate": 8.615384615384616e-07,
30
- "loss": 0.91,
31
  "step": 15
32
  },
33
  {
34
- "epoch": 0.0018479164741753672,
35
- "grad_norm": 9.5,
36
- "learning_rate": 1.1692307692307693e-06,
37
- "loss": 0.9938,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.002309895592719209,
42
- "grad_norm": 8.1875,
43
- "learning_rate": 1.476923076923077e-06,
44
- "loss": 0.922,
45
  "step": 25
46
  },
47
  {
48
- "epoch": 0.0027718747112630508,
49
- "grad_norm": 5.78125,
50
- "learning_rate": 1.7846153846153846e-06,
51
- "loss": 0.9497,
52
  "step": 30
53
  },
54
  {
55
- "epoch": 0.0032338538298068925,
56
- "grad_norm": 5.78125,
57
- "learning_rate": 2.0923076923076926e-06,
58
- "loss": 0.9382,
59
  "step": 35
60
  },
61
  {
62
- "epoch": 0.0036958329483507343,
63
- "grad_norm": 5.75,
64
- "learning_rate": 2.4000000000000003e-06,
65
- "loss": 0.9658,
66
  "step": 40
67
  },
68
  {
69
- "epoch": 0.004157812066894576,
70
- "grad_norm": 6.65625,
71
- "learning_rate": 2.7076923076923076e-06,
72
- "loss": 0.9687,
73
  "step": 45
74
  },
75
  {
76
- "epoch": 0.004619791185438418,
77
- "grad_norm": 7.03125,
78
- "learning_rate": 3.0153846153846154e-06,
79
- "loss": 0.9775,
80
  "step": 50
81
  },
82
  {
83
- "epoch": 0.00508177030398226,
84
- "grad_norm": 5.59375,
85
- "learning_rate": 3.323076923076923e-06,
86
- "loss": 0.9061,
87
  "step": 55
88
  },
89
  {
90
- "epoch": 0.0055437494225261015,
91
- "grad_norm": 5.3125,
92
- "learning_rate": 3.630769230769231e-06,
93
- "loss": 0.9917,
94
  "step": 60
95
  },
96
  {
97
- "epoch": 0.006005728541069943,
98
- "grad_norm": 4.09375,
99
- "learning_rate": 3.938461538461539e-06,
100
- "loss": 0.7981,
101
  "step": 65
102
  },
103
  {
104
- "epoch": 0.006467707659613785,
105
- "grad_norm": 5.65625,
106
- "learning_rate": 4.246153846153846e-06,
107
- "loss": 0.9878,
108
  "step": 70
109
  },
110
  {
111
- "epoch": 0.006929686778157627,
112
- "grad_norm": 5.15625,
113
- "learning_rate": 4.553846153846154e-06,
114
- "loss": 0.9152,
115
  "step": 75
116
  },
117
  {
118
- "epoch": 0.007391665896701469,
119
- "grad_norm": 7.4375,
120
- "learning_rate": 4.861538461538462e-06,
121
- "loss": 1.0513,
122
  "step": 80
123
  },
124
  {
125
- "epoch": 0.007853645015245311,
126
- "grad_norm": 6.40625,
127
- "learning_rate": 5.16923076923077e-06,
128
- "loss": 0.9603,
129
  "step": 85
130
  },
131
  {
132
- "epoch": 0.008315624133789152,
133
- "grad_norm": 5.625,
134
- "learning_rate": 5.476923076923077e-06,
135
- "loss": 0.9008,
136
  "step": 90
137
  },
138
  {
139
- "epoch": 0.008777603252332995,
140
- "grad_norm": 6.0,
141
- "learning_rate": 5.784615384615385e-06,
142
- "loss": 0.9536,
143
  "step": 95
144
  },
145
  {
146
- "epoch": 0.009239582370876836,
147
- "grad_norm": 5.21875,
148
- "learning_rate": 6.092307692307693e-06,
149
- "loss": 0.9726,
150
  "step": 100
151
  },
152
  {
153
- "epoch": 0.009701561489420679,
154
- "grad_norm": 4.84375,
155
- "learning_rate": 6.4000000000000006e-06,
156
- "loss": 1.0662,
157
  "step": 105
158
  },
159
  {
160
- "epoch": 0.01016354060796452,
161
- "grad_norm": 4.28125,
162
- "learning_rate": 6.707692307692308e-06,
163
- "loss": 0.8922,
164
  "step": 110
165
  },
166
  {
167
- "epoch": 0.010625519726508362,
168
- "grad_norm": 5.0625,
169
- "learning_rate": 7.015384615384616e-06,
170
- "loss": 0.8976,
171
  "step": 115
172
  },
173
  {
174
- "epoch": 0.011087498845052203,
175
- "grad_norm": 4.71875,
176
- "learning_rate": 7.323076923076924e-06,
177
- "loss": 0.7963,
178
  "step": 120
179
  },
180
  {
181
- "epoch": 0.011549477963596046,
182
- "grad_norm": 5.53125,
183
- "learning_rate": 7.630769230769232e-06,
184
- "loss": 0.8124,
185
  "step": 125
186
  },
187
  {
188
- "epoch": 0.012011457082139887,
189
- "grad_norm": 8.3125,
190
- "learning_rate": 7.93846153846154e-06,
191
- "loss": 0.8409,
192
  "step": 130
193
  },
194
  {
195
- "epoch": 0.01247343620068373,
196
- "grad_norm": 5.375,
197
- "learning_rate": 8.246153846153848e-06,
198
- "loss": 0.9,
199
  "step": 135
200
  },
201
  {
202
- "epoch": 0.01293541531922757,
203
- "grad_norm": 5.625,
204
- "learning_rate": 8.553846153846156e-06,
205
- "loss": 0.8444,
206
  "step": 140
207
  },
208
  {
209
- "epoch": 0.013397394437771413,
210
- "grad_norm": 5.25,
211
- "learning_rate": 8.861538461538463e-06,
212
- "loss": 0.8523,
213
  "step": 145
214
  },
215
  {
216
- "epoch": 0.013859373556315254,
217
- "grad_norm": 6.3125,
218
- "learning_rate": 9.169230769230771e-06,
219
- "loss": 0.9402,
220
  "step": 150
221
  },
222
  {
223
- "epoch": 0.014321352674859096,
224
- "grad_norm": 4.28125,
225
- "learning_rate": 9.476923076923079e-06,
226
- "loss": 0.7411,
227
  "step": 155
228
  },
229
  {
230
- "epoch": 0.014783331793402937,
231
- "grad_norm": 4.28125,
232
- "learning_rate": 9.784615384615387e-06,
233
- "loss": 0.8616,
234
  "step": 160
235
  },
236
  {
237
- "epoch": 0.01524531091194678,
238
- "grad_norm": 4.8125,
239
- "learning_rate": 1.0092307692307693e-05,
240
- "loss": 0.8655,
241
  "step": 165
242
  },
243
  {
244
- "epoch": 0.015707290030490623,
245
- "grad_norm": 4.5625,
246
- "learning_rate": 1.04e-05,
247
- "loss": 0.8105,
248
  "step": 170
249
  },
250
  {
251
- "epoch": 0.016169269149034465,
252
- "grad_norm": 6.96875,
253
- "learning_rate": 1.0707692307692308e-05,
254
- "loss": 0.8538,
255
  "step": 175
256
  },
257
  {
258
- "epoch": 0.016631248267578305,
259
- "grad_norm": 5.9375,
260
- "learning_rate": 1.1015384615384616e-05,
261
- "loss": 0.8744,
262
  "step": 180
263
  },
264
  {
265
- "epoch": 0.017093227386122147,
266
- "grad_norm": 4.5625,
267
- "learning_rate": 1.1323076923076924e-05,
268
- "loss": 0.8101,
269
  "step": 185
270
  },
271
  {
272
- "epoch": 0.01755520650466599,
273
- "grad_norm": 4.28125,
274
- "learning_rate": 1.1630769230769231e-05,
275
- "loss": 0.831,
276
  "step": 190
277
  },
278
  {
279
- "epoch": 0.018017185623209832,
280
- "grad_norm": 5.9375,
281
- "learning_rate": 1.1938461538461539e-05,
282
- "loss": 0.835,
283
  "step": 195
284
  },
285
  {
286
- "epoch": 0.01847916474175367,
287
- "grad_norm": 5.1875,
288
- "learning_rate": 1.2246153846153847e-05,
289
- "loss": 0.7559,
290
  "step": 200
291
  },
292
  {
293
- "epoch": 0.018941143860297514,
294
- "grad_norm": 4.46875,
295
- "learning_rate": 1.2553846153846155e-05,
296
- "loss": 0.7849,
297
  "step": 205
298
  },
299
  {
300
- "epoch": 0.019403122978841357,
301
- "grad_norm": 3.625,
302
- "learning_rate": 1.2861538461538462e-05,
303
- "loss": 0.9112,
304
  "step": 210
305
  },
306
  {
307
- "epoch": 0.0198651020973852,
308
- "grad_norm": 4.125,
309
- "learning_rate": 1.316923076923077e-05,
310
- "loss": 1.0893,
311
  "step": 215
312
  },
313
  {
314
- "epoch": 0.02032708121592904,
315
- "grad_norm": 5.96875,
316
- "learning_rate": 1.3476923076923078e-05,
317
- "loss": 0.7646,
318
  "step": 220
319
  },
320
  {
321
- "epoch": 0.02078906033447288,
322
- "grad_norm": 4.1875,
323
- "learning_rate": 1.3784615384615386e-05,
324
- "loss": 0.8373,
325
  "step": 225
326
  },
327
  {
328
- "epoch": 0.021251039453016724,
329
- "grad_norm": 5.6875,
330
- "learning_rate": 1.4092307692307693e-05,
331
- "loss": 0.8159,
332
  "step": 230
333
  },
334
  {
335
- "epoch": 0.021713018571560567,
336
- "grad_norm": 4.875,
337
- "learning_rate": 1.4400000000000001e-05,
338
- "loss": 0.8434,
339
  "step": 235
340
  },
341
  {
342
- "epoch": 0.022174997690104406,
343
- "grad_norm": 5.09375,
344
- "learning_rate": 1.4707692307692309e-05,
345
- "loss": 0.8048,
346
  "step": 240
347
  },
348
  {
349
- "epoch": 0.02263697680864825,
350
- "grad_norm": 5.375,
351
- "learning_rate": 1.5015384615384617e-05,
352
- "loss": 0.8276,
353
  "step": 245
354
  },
355
  {
356
- "epoch": 0.02309895592719209,
357
- "grad_norm": 6.0625,
358
- "learning_rate": 1.5323076923076926e-05,
359
- "loss": 0.9429,
360
  "step": 250
361
  },
362
  {
363
- "epoch": 0.023560935045735934,
364
- "grad_norm": 6.15625,
365
- "learning_rate": 1.5630769230769232e-05,
366
- "loss": 0.8995,
367
  "step": 255
368
  },
369
  {
370
- "epoch": 0.024022914164279773,
371
- "grad_norm": 4.5,
372
- "learning_rate": 1.593846153846154e-05,
373
- "loss": 0.8191,
374
  "step": 260
375
  },
376
  {
377
- "epoch": 0.024484893282823616,
378
- "grad_norm": 5.09375,
379
- "learning_rate": 1.6246153846153848e-05,
380
- "loss": 0.9168,
381
  "step": 265
382
  },
383
  {
384
- "epoch": 0.02494687240136746,
385
- "grad_norm": 4.21875,
386
- "learning_rate": 1.6553846153846157e-05,
387
- "loss": 0.8813,
388
  "step": 270
389
  },
390
  {
391
- "epoch": 0.0254088515199113,
392
- "grad_norm": 4.3125,
393
- "learning_rate": 1.6861538461538463e-05,
394
- "loss": 0.8089,
395
  "step": 275
396
  },
397
  {
398
- "epoch": 0.02587083063845514,
399
- "grad_norm": 7.625,
400
- "learning_rate": 1.7169230769230772e-05,
401
- "loss": 0.7353,
402
  "step": 280
403
  },
404
  {
405
- "epoch": 0.026332809756998983,
406
- "grad_norm": 5.5625,
407
- "learning_rate": 1.747692307692308e-05,
408
- "loss": 0.7367,
409
  "step": 285
410
  },
411
  {
412
- "epoch": 0.026794788875542826,
413
- "grad_norm": 5.0625,
414
- "learning_rate": 1.7784615384615388e-05,
415
- "loss": 0.7641,
416
  "step": 290
417
  },
418
  {
419
- "epoch": 0.02725676799408667,
420
- "grad_norm": 6.4375,
421
- "learning_rate": 1.8092307692307694e-05,
422
- "loss": 0.7952,
423
  "step": 295
424
  },
425
  {
426
- "epoch": 0.027718747112630508,
427
- "grad_norm": 4.75,
428
- "learning_rate": 1.8400000000000003e-05,
429
- "loss": 0.8539,
430
  "step": 300
431
  },
432
  {
433
- "epoch": 0.02818072623117435,
434
- "grad_norm": 4.34375,
435
- "learning_rate": 1.870769230769231e-05,
436
- "loss": 0.8364,
437
  "step": 305
438
  },
439
  {
440
- "epoch": 0.028642705349718193,
441
- "grad_norm": 5.03125,
442
- "learning_rate": 1.901538461538462e-05,
443
- "loss": 0.9327,
444
  "step": 310
445
  },
446
  {
447
- "epoch": 0.029104684468262036,
448
- "grad_norm": 4.6875,
449
- "learning_rate": 1.9323076923076925e-05,
450
- "loss": 0.7912,
451
  "step": 315
452
  },
453
  {
454
- "epoch": 0.029566663586805875,
455
- "grad_norm": 4.4375,
456
- "learning_rate": 1.9630769230769234e-05,
457
- "loss": 1.0159,
458
  "step": 320
459
  },
460
  {
461
- "epoch": 0.030028642705349717,
462
- "grad_norm": 4.90625,
463
- "learning_rate": 1.993846153846154e-05,
464
- "loss": 0.8499,
465
  "step": 325
466
  },
467
  {
468
- "epoch": 0.03049062182389356,
469
- "grad_norm": 6.96875,
470
- "learning_rate": 1.9999992835654137e-05,
471
- "loss": 0.8496,
472
  "step": 330
473
  },
474
  {
475
- "epoch": 0.030952600942437403,
476
- "grad_norm": 3.90625,
477
- "learning_rate": 1.999996373051665e-05,
478
- "loss": 0.8142,
479
  "step": 335
480
  },
481
  {
482
- "epoch": 0.031414580060981245,
483
- "grad_norm": 4.0,
484
- "learning_rate": 1.9999912236881042e-05,
485
- "loss": 0.8672,
486
  "step": 340
487
  },
488
  {
489
- "epoch": 0.03187655917952509,
490
- "grad_norm": 5.15625,
491
- "learning_rate": 1.99998383548626e-05,
492
- "loss": 0.9359,
493
  "step": 345
494
  },
495
  {
496
- "epoch": 0.03233853829806893,
497
- "grad_norm": 4.625,
498
- "learning_rate": 1.9999742084626726e-05,
499
- "loss": 0.9701,
500
  "step": 350
501
  },
502
  {
503
- "epoch": 0.032800517416612766,
504
- "grad_norm": 3.703125,
505
- "learning_rate": 1.9999623426388963e-05,
506
- "loss": 0.7881,
507
  "step": 355
508
  },
509
  {
510
- "epoch": 0.03326249653515661,
511
- "grad_norm": 3.671875,
512
- "learning_rate": 1.9999482380414973e-05,
513
- "loss": 0.7965,
514
  "step": 360
515
  },
516
  {
517
- "epoch": 0.03372447565370045,
518
- "grad_norm": 4.6875,
519
- "learning_rate": 1.9999318947020527e-05,
520
- "loss": 0.7915,
521
  "step": 365
522
  },
523
  {
524
- "epoch": 0.034186454772244294,
525
- "grad_norm": 6.03125,
526
- "learning_rate": 1.999913312657154e-05,
527
- "loss": 0.9149,
528
  "step": 370
529
  },
530
  {
531
- "epoch": 0.03464843389078814,
532
- "grad_norm": 5.53125,
533
- "learning_rate": 1.9998924919484034e-05,
534
- "loss": 0.7844,
535
  "step": 375
536
  },
537
  {
538
- "epoch": 0.03511041300933198,
539
- "grad_norm": 4.8125,
540
- "learning_rate": 1.9998694326224154e-05,
541
- "loss": 0.7716,
542
  "step": 380
543
  },
544
  {
545
- "epoch": 0.03557239212787582,
546
- "grad_norm": 4.28125,
547
- "learning_rate": 1.9998441347308164e-05,
548
- "loss": 0.7758,
549
  "step": 385
550
  },
551
  {
552
- "epoch": 0.036034371246419665,
553
- "grad_norm": 4.90625,
554
- "learning_rate": 1.999816598330245e-05,
555
- "loss": 0.8478,
556
  "step": 390
557
  },
558
  {
559
- "epoch": 0.0364963503649635,
560
- "grad_norm": 5.0,
561
- "learning_rate": 1.9997868234823513e-05,
562
- "loss": 0.7166,
563
  "step": 395
564
  },
565
  {
566
- "epoch": 0.03695832948350734,
567
- "grad_norm": 5.21875,
568
- "learning_rate": 1.999754810253797e-05,
569
- "loss": 0.8396,
570
  "step": 400
571
  },
572
  {
573
- "epoch": 0.037420308602051186,
574
- "grad_norm": 5.21875,
575
- "learning_rate": 1.999720558716255e-05,
576
- "loss": 0.9007,
577
  "step": 405
578
  },
579
  {
580
- "epoch": 0.03788228772059503,
581
- "grad_norm": 4.78125,
582
- "learning_rate": 1.99968406894641e-05,
583
- "loss": 0.8649,
584
  "step": 410
585
  },
586
  {
587
- "epoch": 0.03834426683913887,
588
- "grad_norm": 5.1875,
589
- "learning_rate": 1.999645341025957e-05,
590
- "loss": 0.8454,
591
  "step": 415
592
  },
593
  {
594
- "epoch": 0.038806245957682714,
595
- "grad_norm": 5.0625,
596
- "learning_rate": 1.9996043750416026e-05,
597
- "loss": 0.8146,
598
  "step": 420
599
  },
600
  {
601
- "epoch": 0.03926822507622656,
602
- "grad_norm": 4.25,
603
- "learning_rate": 1.9995611710850634e-05,
604
- "loss": 0.7879,
605
  "step": 425
606
  },
607
  {
608
- "epoch": 0.0397302041947704,
609
- "grad_norm": 3.5625,
610
- "learning_rate": 1.9995157292530672e-05,
611
- "loss": 0.8767,
612
  "step": 430
613
  },
614
  {
615
- "epoch": 0.040192183313314235,
616
- "grad_norm": 4.875,
617
- "learning_rate": 1.999468049647352e-05,
618
- "loss": 0.7827,
619
  "step": 435
620
  },
621
  {
622
- "epoch": 0.04065416243185808,
623
- "grad_norm": 5.34375,
624
- "learning_rate": 1.9994181323746652e-05,
625
- "loss": 0.9704,
626
  "step": 440
627
  },
628
  {
629
- "epoch": 0.04111614155040192,
630
- "grad_norm": 5.0625,
631
- "learning_rate": 1.9993659775467648e-05,
632
- "loss": 0.7169,
633
  "step": 445
634
  },
635
  {
636
- "epoch": 0.04157812066894576,
637
- "grad_norm": 7.40625,
638
- "learning_rate": 1.999311585280418e-05,
639
- "loss": 1.0719,
640
  "step": 450
641
  },
642
  {
643
- "epoch": 0.042040099787489606,
644
- "grad_norm": 4.34375,
645
- "learning_rate": 1.9992549556974015e-05,
646
- "loss": 0.8138,
647
  "step": 455
648
  },
649
  {
650
- "epoch": 0.04250207890603345,
651
- "grad_norm": 3.890625,
652
- "learning_rate": 1.9991960889245005e-05,
653
- "loss": 0.724,
654
  "step": 460
655
  },
656
  {
657
- "epoch": 0.04296405802457729,
658
- "grad_norm": 4.90625,
659
- "learning_rate": 1.99913498509351e-05,
660
- "loss": 0.7338,
661
  "step": 465
662
  },
663
  {
664
- "epoch": 0.043426037143121134,
665
- "grad_norm": 5.78125,
666
- "learning_rate": 1.999071644341232e-05,
667
- "loss": 0.7862,
668
  "step": 470
669
  },
670
  {
671
- "epoch": 0.04388801626166497,
672
- "grad_norm": 6.15625,
673
- "learning_rate": 1.9990060668094778e-05,
674
- "loss": 1.0113,
675
  "step": 475
676
  },
677
  {
678
- "epoch": 0.04434999538020881,
679
- "grad_norm": 5.9375,
680
- "learning_rate": 1.998938252645066e-05,
681
- "loss": 0.7534,
682
  "step": 480
683
  },
684
  {
685
- "epoch": 0.044811974498752655,
686
- "grad_norm": 5.5,
687
- "learning_rate": 1.9988682019998236e-05,
688
- "loss": 0.8548,
689
  "step": 485
690
  },
691
  {
692
- "epoch": 0.0452739536172965,
693
- "grad_norm": 4.9375,
694
- "learning_rate": 1.9987959150305834e-05,
695
- "loss": 0.9163,
696
  "step": 490
697
  },
698
  {
699
- "epoch": 0.04573593273584034,
700
- "grad_norm": 4.34375,
701
- "learning_rate": 1.9987213918991855e-05,
702
- "loss": 0.7051,
703
  "step": 495
704
  },
705
  {
706
- "epoch": 0.04619791185438418,
707
- "grad_norm": 7.71875,
708
- "learning_rate": 1.998644632772477e-05,
709
- "loss": 0.7463,
710
  "step": 500
711
  },
712
  {
713
- "epoch": 0.04619791185438418,
714
- "eval_loss": 0.8494759798049927,
715
- "eval_runtime": 442.9436,
716
- "eval_samples_per_second": 20.578,
717
- "eval_steps_per_second": 2.574,
718
  "step": 500
719
  },
720
  {
721
- "epoch": 0.046659890972928025,
722
- "grad_norm": 4.625,
723
- "learning_rate": 1.998565637822311e-05,
724
- "loss": 0.8329,
725
  "step": 505
726
  },
727
  {
728
- "epoch": 0.04712187009147187,
729
- "grad_norm": 5.5625,
730
- "learning_rate": 1.998484407225545e-05,
731
- "loss": 0.7516,
732
  "step": 510
733
  },
734
  {
735
- "epoch": 0.04758384921001571,
736
- "grad_norm": 4.6875,
737
- "learning_rate": 1.9984009411640433e-05,
738
- "loss": 0.6933,
739
  "step": 515
740
  },
741
  {
742
- "epoch": 0.048045828328559546,
743
- "grad_norm": 4.125,
744
- "learning_rate": 1.9983152398246747e-05,
745
- "loss": 0.6367,
746
  "step": 520
747
  },
748
  {
749
- "epoch": 0.04850780744710339,
750
- "grad_norm": 4.25,
751
- "learning_rate": 1.998227303399312e-05,
752
- "loss": 0.8222,
753
  "step": 525
754
  },
755
  {
756
- "epoch": 0.04896978656564723,
757
- "grad_norm": 5.5625,
758
- "learning_rate": 1.9981371320848327e-05,
759
- "loss": 0.7939,
760
  "step": 530
761
  },
762
  {
763
- "epoch": 0.049431765684191074,
764
- "grad_norm": 4.84375,
765
- "learning_rate": 1.9980447260831177e-05,
766
- "loss": 0.8247,
767
  "step": 535
768
  },
769
  {
770
- "epoch": 0.04989374480273492,
771
- "grad_norm": 4.46875,
772
- "learning_rate": 1.99795008560105e-05,
773
- "loss": 0.7723,
774
  "step": 540
775
  },
776
  {
777
- "epoch": 0.05035572392127876,
778
- "grad_norm": 3.828125,
779
- "learning_rate": 1.997853210850517e-05,
780
- "loss": 0.7777,
781
  "step": 545
782
  },
783
  {
784
- "epoch": 0.0508177030398226,
785
- "grad_norm": 5.6875,
786
- "learning_rate": 1.9977541020484078e-05,
787
- "loss": 0.9256,
788
  "step": 550
789
  },
790
  {
791
- "epoch": 0.051279682158366445,
792
- "grad_norm": 4.46875,
793
- "learning_rate": 1.9976527594166116e-05,
794
- "loss": 0.841,
795
  "step": 555
796
  },
797
  {
798
- "epoch": 0.05174166127691028,
799
- "grad_norm": 4.84375,
800
- "learning_rate": 1.9975491831820216e-05,
801
- "loss": 0.8144,
802
  "step": 560
803
  },
804
  {
805
- "epoch": 0.05220364039545412,
806
- "grad_norm": 6.125,
807
- "learning_rate": 1.9974433735765297e-05,
808
- "loss": 0.8988,
809
  "step": 565
810
  },
811
  {
812
- "epoch": 0.052665619513997966,
813
- "grad_norm": 4.53125,
814
- "learning_rate": 1.9973353308370282e-05,
815
- "loss": 0.9735,
816
  "step": 570
817
  },
818
  {
819
- "epoch": 0.05312759863254181,
820
- "grad_norm": 4.84375,
821
- "learning_rate": 1.99722505520541e-05,
822
- "loss": 0.7992,
823
  "step": 575
824
  },
825
  {
826
- "epoch": 0.05358957775108565,
827
- "grad_norm": 4.84375,
828
- "learning_rate": 1.9971125469285663e-05,
829
- "loss": 0.7968,
830
  "step": 580
831
  },
832
  {
833
- "epoch": 0.054051556869629494,
834
- "grad_norm": 4.625,
835
- "learning_rate": 1.996997806258387e-05,
836
- "loss": 0.7315,
837
  "step": 585
838
  },
839
  {
840
- "epoch": 0.05451353598817334,
841
- "grad_norm": 6.59375,
842
- "learning_rate": 1.9968808334517607e-05,
843
- "loss": 0.8133,
844
  "step": 590
845
  },
846
  {
847
- "epoch": 0.05497551510671718,
848
- "grad_norm": 3.828125,
849
- "learning_rate": 1.9967616287705724e-05,
850
- "loss": 0.8487,
851
  "step": 595
852
  },
853
  {
854
- "epoch": 0.055437494225261015,
855
- "grad_norm": 6.59375,
856
- "learning_rate": 1.9966401924817042e-05,
857
- "loss": 0.8386,
858
  "step": 600
859
  },
860
  {
861
- "epoch": 0.05589947334380486,
862
- "grad_norm": 4.625,
863
- "learning_rate": 1.9965165248570357e-05,
864
- "loss": 0.911,
865
  "step": 605
866
  },
867
  {
868
- "epoch": 0.0563614524623487,
869
- "grad_norm": 5.4375,
870
- "learning_rate": 1.9963906261734404e-05,
871
- "loss": 0.8836,
872
  "step": 610
873
  },
874
  {
875
- "epoch": 0.05682343158089254,
876
- "grad_norm": 4.84375,
877
- "learning_rate": 1.9962624967127877e-05,
878
- "loss": 0.7977,
879
  "step": 615
880
  },
881
  {
882
- "epoch": 0.057285410699436386,
883
- "grad_norm": 5.9375,
884
- "learning_rate": 1.9961321367619413e-05,
885
- "loss": 0.8145,
886
  "step": 620
887
  },
888
  {
889
- "epoch": 0.05774738981798023,
890
- "grad_norm": 3.84375,
891
- "learning_rate": 1.9959995466127582e-05,
892
- "loss": 0.7705,
893
  "step": 625
894
  },
895
  {
896
- "epoch": 0.05820936893652407,
897
- "grad_norm": 4.78125,
898
- "learning_rate": 1.99586472656209e-05,
899
- "loss": 0.7896,
900
  "step": 630
901
  },
902
  {
903
- "epoch": 0.058671348055067914,
904
- "grad_norm": 6.625,
905
- "learning_rate": 1.9957276769117785e-05,
906
- "loss": 0.8064,
907
  "step": 635
908
  },
909
  {
910
- "epoch": 0.05913332717361175,
911
- "grad_norm": 4.21875,
912
- "learning_rate": 1.9955883979686587e-05,
913
- "loss": 0.7744,
914
  "step": 640
915
  },
916
  {
917
- "epoch": 0.05959530629215559,
918
- "grad_norm": 5.59375,
919
- "learning_rate": 1.9954468900445567e-05,
920
- "loss": 0.8338,
921
  "step": 645
922
  },
923
  {
924
- "epoch": 0.060057285410699435,
925
- "grad_norm": 3.96875,
926
- "learning_rate": 1.9953031534562884e-05,
927
- "loss": 0.7888,
928
  "step": 650
929
  },
930
  {
931
- "epoch": 0.06051926452924328,
932
- "grad_norm": 5.0625,
933
- "learning_rate": 1.9951571885256594e-05,
934
- "loss": 0.9061,
935
  "step": 655
936
  },
937
  {
938
- "epoch": 0.06098124364778712,
939
- "grad_norm": 5.4375,
940
- "learning_rate": 1.995008995579465e-05,
941
- "loss": 0.8619,
942
  "step": 660
943
  },
944
  {
945
- "epoch": 0.06144322276633096,
946
- "grad_norm": 4.6875,
947
- "learning_rate": 1.9948585749494877e-05,
948
- "loss": 0.7711,
949
  "step": 665
950
  },
951
  {
952
- "epoch": 0.061905201884874805,
953
- "grad_norm": 4.5,
954
- "learning_rate": 1.9947059269724983e-05,
955
- "loss": 0.7418,
956
  "step": 670
957
  },
958
  {
959
- "epoch": 0.06236718100341865,
960
- "grad_norm": 6.34375,
961
- "learning_rate": 1.9945510519902533e-05,
962
- "loss": 0.9288,
963
  "step": 675
964
  },
965
  {
966
- "epoch": 0.06282916012196249,
967
- "grad_norm": 5.0625,
968
- "learning_rate": 1.994393950349496e-05,
969
- "loss": 0.8096,
970
  "step": 680
971
  },
972
  {
973
- "epoch": 0.06329113924050633,
974
- "grad_norm": 4.875,
975
- "learning_rate": 1.9942346224019557e-05,
976
- "loss": 0.852,
977
  "step": 685
978
  },
979
  {
980
- "epoch": 0.06375311835905018,
981
- "grad_norm": 5.71875,
982
- "learning_rate": 1.9940730685043435e-05,
983
- "loss": 0.942,
984
  "step": 690
985
  },
986
  {
987
- "epoch": 0.06421509747759402,
988
- "grad_norm": 4.09375,
989
- "learning_rate": 1.9939092890183562e-05,
990
- "loss": 0.8851,
991
  "step": 695
992
  },
993
  {
994
- "epoch": 0.06467707659613786,
995
- "grad_norm": 4.875,
996
- "learning_rate": 1.9937432843106733e-05,
997
- "loss": 0.7966,
998
  "step": 700
999
  },
1000
  {
1001
- "epoch": 0.06513905571468169,
1002
- "grad_norm": 4.9375,
1003
- "learning_rate": 1.9935750547529547e-05,
1004
- "loss": 0.9728,
1005
  "step": 705
1006
  },
1007
  {
1008
- "epoch": 0.06560103483322553,
1009
- "grad_norm": 4.625,
1010
- "learning_rate": 1.9934046007218437e-05,
1011
- "loss": 0.8777,
1012
  "step": 710
1013
  },
1014
  {
1015
- "epoch": 0.06606301395176938,
1016
- "grad_norm": 3.546875,
1017
- "learning_rate": 1.993231922598962e-05,
1018
- "loss": 0.8269,
1019
  "step": 715
1020
  },
1021
  {
1022
- "epoch": 0.06652499307031322,
1023
- "grad_norm": 4.25,
1024
- "learning_rate": 1.993057020770911e-05,
1025
- "loss": 0.7792,
1026
  "step": 720
1027
  },
1028
  {
1029
- "epoch": 0.06698697218885706,
1030
- "grad_norm": 4.6875,
1031
- "learning_rate": 1.9928798956292722e-05,
1032
- "loss": 0.8521,
1033
  "step": 725
1034
  },
1035
  {
1036
- "epoch": 0.0674489513074009,
1037
- "grad_norm": 4.4375,
1038
- "learning_rate": 1.9927005475706024e-05,
1039
- "loss": 0.7401,
1040
  "step": 730
1041
  },
1042
  {
1043
- "epoch": 0.06791093042594475,
1044
- "grad_norm": 5.21875,
1045
- "learning_rate": 1.9925189769964374e-05,
1046
- "loss": 0.7713,
1047
  "step": 735
1048
  },
1049
  {
1050
- "epoch": 0.06837290954448859,
1051
- "grad_norm": 8.375,
1052
- "learning_rate": 1.992335184313287e-05,
1053
- "loss": 0.8873,
1054
  "step": 740
1055
  },
1056
  {
1057
- "epoch": 0.06883488866303243,
1058
- "grad_norm": 4.96875,
1059
- "learning_rate": 1.992149169932638e-05,
1060
- "loss": 0.7554,
1061
  "step": 745
1062
  },
1063
  {
1064
- "epoch": 0.06929686778157627,
1065
- "grad_norm": 6.1875,
1066
- "learning_rate": 1.9919609342709493e-05,
1067
- "loss": 0.8749,
1068
  "step": 750
1069
  },
1070
  {
1071
- "epoch": 0.06975884690012012,
1072
- "grad_norm": 4.75,
1073
- "learning_rate": 1.991770477749654e-05,
1074
- "loss": 0.8489,
1075
  "step": 755
1076
  },
1077
  {
1078
- "epoch": 0.07022082601866396,
1079
- "grad_norm": 4.5,
1080
- "learning_rate": 1.9915778007951572e-05,
1081
- "loss": 0.7187,
1082
  "step": 760
1083
  },
1084
  {
1085
- "epoch": 0.0706828051372078,
1086
- "grad_norm": 4.9375,
1087
- "learning_rate": 1.9913829038388355e-05,
1088
- "loss": 0.8325,
1089
  "step": 765
1090
  },
1091
  {
1092
- "epoch": 0.07114478425575164,
1093
- "grad_norm": 4.84375,
1094
- "learning_rate": 1.9911857873170352e-05,
1095
- "loss": 0.8896,
1096
  "step": 770
1097
  },
1098
  {
1099
- "epoch": 0.07160676337429549,
1100
- "grad_norm": 7.1875,
1101
- "learning_rate": 1.9909864516710724e-05,
1102
- "loss": 0.7469,
1103
  "step": 775
1104
  },
1105
  {
1106
- "epoch": 0.07206874249283933,
1107
- "grad_norm": 4.21875,
1108
- "learning_rate": 1.9907848973472307e-05,
1109
- "loss": 0.9805,
1110
  "step": 780
1111
  },
1112
  {
1113
- "epoch": 0.07253072161138316,
1114
- "grad_norm": 3.703125,
1115
- "learning_rate": 1.9905811247967623e-05,
1116
- "loss": 0.806,
1117
  "step": 785
1118
  },
1119
  {
1120
- "epoch": 0.072992700729927,
1121
- "grad_norm": 4.84375,
1122
- "learning_rate": 1.990375134475885e-05,
1123
- "loss": 0.8426,
1124
  "step": 790
1125
  },
1126
  {
1127
- "epoch": 0.07345467984847084,
1128
- "grad_norm": 4.0625,
1129
- "learning_rate": 1.9901669268457814e-05,
1130
- "loss": 0.7435,
1131
  "step": 795
1132
  },
1133
  {
1134
- "epoch": 0.07391665896701469,
1135
- "grad_norm": 4.78125,
1136
- "learning_rate": 1.9899565023725992e-05,
1137
- "loss": 0.8943,
1138
  "step": 800
1139
  },
1140
  {
1141
- "epoch": 0.07437863808555853,
1142
- "grad_norm": 4.15625,
1143
- "learning_rate": 1.989743861527448e-05,
1144
- "loss": 0.9149,
1145
  "step": 805
1146
  },
1147
  {
1148
- "epoch": 0.07484061720410237,
1149
- "grad_norm": 5.375,
1150
- "learning_rate": 1.989529004786402e-05,
1151
- "loss": 0.8323,
1152
  "step": 810
1153
  },
1154
  {
1155
- "epoch": 0.07530259632264621,
1156
- "grad_norm": 3.421875,
1157
- "learning_rate": 1.9893119326304938e-05,
1158
- "loss": 0.7721,
1159
  "step": 815
1160
  },
1161
  {
1162
- "epoch": 0.07576457544119006,
1163
- "grad_norm": 4.25,
1164
- "learning_rate": 1.9890926455457172e-05,
1165
- "loss": 0.7716,
1166
  "step": 820
1167
  },
1168
  {
1169
- "epoch": 0.0762265545597339,
1170
- "grad_norm": 5.96875,
1171
- "learning_rate": 1.9888711440230258e-05,
1172
- "loss": 0.7043,
1173
  "step": 825
1174
  },
1175
  {
1176
- "epoch": 0.07668853367827774,
1177
- "grad_norm": 4.5625,
1178
- "learning_rate": 1.9886474285583283e-05,
1179
- "loss": 0.777,
1180
  "step": 830
1181
  },
1182
  {
1183
- "epoch": 0.07715051279682159,
1184
- "grad_norm": 4.65625,
1185
- "learning_rate": 1.9884214996524935e-05,
1186
- "loss": 0.9524,
1187
  "step": 835
1188
  },
1189
  {
1190
- "epoch": 0.07761249191536543,
1191
- "grad_norm": 4.78125,
1192
- "learning_rate": 1.988193357811343e-05,
1193
- "loss": 0.7875,
1194
  "step": 840
1195
  },
1196
  {
1197
- "epoch": 0.07807447103390927,
1198
- "grad_norm": 5.34375,
1199
- "learning_rate": 1.987963003545655e-05,
1200
- "loss": 0.8051,
1201
  "step": 845
1202
  },
1203
  {
1204
- "epoch": 0.07853645015245311,
1205
- "grad_norm": 4.4375,
1206
- "learning_rate": 1.9877304373711588e-05,
1207
- "loss": 0.8002,
1208
  "step": 850
1209
  },
1210
  {
1211
- "epoch": 0.07899842927099696,
1212
- "grad_norm": 4.84375,
1213
- "learning_rate": 1.9874956598085378e-05,
1214
- "loss": 0.7691,
1215
  "step": 855
1216
  },
1217
  {
1218
- "epoch": 0.0794604083895408,
1219
- "grad_norm": 5.625,
1220
- "learning_rate": 1.9872586713834253e-05,
1221
- "loss": 0.8483,
1222
  "step": 860
1223
  },
1224
  {
1225
- "epoch": 0.07992238750808464,
1226
- "grad_norm": 4.71875,
1227
- "learning_rate": 1.987019472626405e-05,
1228
- "loss": 0.8258,
1229
  "step": 865
1230
  },
1231
  {
1232
- "epoch": 0.08038436662662847,
1233
- "grad_norm": 6.75,
1234
- "learning_rate": 1.986778064073009e-05,
1235
- "loss": 0.9104,
1236
  "step": 870
1237
  },
1238
  {
1239
- "epoch": 0.08084634574517231,
1240
- "grad_norm": 5.28125,
1241
- "learning_rate": 1.9865344462637163e-05,
1242
- "loss": 0.765,
1243
  "step": 875
1244
  },
1245
  {
1246
- "epoch": 0.08130832486371616,
1247
- "grad_norm": 3.984375,
1248
- "learning_rate": 1.9862886197439525e-05,
1249
- "loss": 0.8766,
1250
  "step": 880
1251
  },
1252
  {
1253
- "epoch": 0.08177030398226,
1254
- "grad_norm": 5.46875,
1255
- "learning_rate": 1.9860405850640888e-05,
1256
- "loss": 0.7801,
1257
  "step": 885
1258
  },
1259
  {
1260
- "epoch": 0.08223228310080384,
1261
- "grad_norm": 4.5625,
1262
- "learning_rate": 1.9857903427794393e-05,
1263
- "loss": 0.8898,
1264
  "step": 890
1265
  },
1266
  {
1267
- "epoch": 0.08269426221934768,
1268
- "grad_norm": 3.859375,
1269
- "learning_rate": 1.985537893450261e-05,
1270
- "loss": 0.7725,
1271
  "step": 895
1272
  },
1273
  {
1274
- "epoch": 0.08315624133789153,
1275
- "grad_norm": 4.71875,
1276
- "learning_rate": 1.985283237641752e-05,
1277
- "loss": 0.8464,
1278
  "step": 900
1279
  },
1280
  {
1281
- "epoch": 0.08361822045643537,
1282
- "grad_norm": 4.5,
1283
- "learning_rate": 1.9850263759240507e-05,
1284
- "loss": 0.7433,
1285
  "step": 905
1286
  },
1287
  {
1288
- "epoch": 0.08408019957497921,
1289
- "grad_norm": 3.71875,
1290
- "learning_rate": 1.9847673088722337e-05,
1291
- "loss": 0.7768,
1292
  "step": 910
1293
  },
1294
  {
1295
- "epoch": 0.08454217869352305,
1296
- "grad_norm": 4.4375,
1297
- "learning_rate": 1.9845060370663157e-05,
1298
- "loss": 0.9128,
1299
  "step": 915
1300
  },
1301
  {
1302
- "epoch": 0.0850041578120669,
1303
- "grad_norm": 6.46875,
1304
- "learning_rate": 1.9842425610912467e-05,
1305
- "loss": 0.7274,
1306
  "step": 920
1307
  },
1308
  {
1309
- "epoch": 0.08546613693061074,
1310
- "grad_norm": 5.9375,
1311
- "learning_rate": 1.983976881536912e-05,
1312
- "loss": 0.7639,
1313
  "step": 925
1314
  },
1315
  {
1316
- "epoch": 0.08592811604915458,
1317
- "grad_norm": 3.921875,
1318
- "learning_rate": 1.9837089989981307e-05,
1319
- "loss": 0.7655,
1320
  "step": 930
1321
  },
1322
  {
1323
- "epoch": 0.08639009516769842,
1324
- "grad_norm": 3.796875,
1325
- "learning_rate": 1.983438914074654e-05,
1326
- "loss": 0.7869,
1327
  "step": 935
1328
  },
1329
  {
1330
- "epoch": 0.08685207428624227,
1331
- "grad_norm": 4.1875,
1332
- "learning_rate": 1.9831666273711628e-05,
1333
- "loss": 0.9739,
1334
  "step": 940
1335
  },
1336
  {
1337
- "epoch": 0.08731405340478611,
1338
- "grad_norm": 7.21875,
1339
- "learning_rate": 1.982892139497269e-05,
1340
- "loss": 1.0108,
1341
  "step": 945
1342
  },
1343
  {
1344
- "epoch": 0.08777603252332994,
1345
- "grad_norm": 5.1875,
1346
- "learning_rate": 1.9826154510675118e-05,
1347
- "loss": 0.8337,
1348
  "step": 950
1349
  },
1350
  {
1351
- "epoch": 0.08823801164187378,
1352
- "grad_norm": 4.1875,
1353
- "learning_rate": 1.9823365627013573e-05,
1354
- "loss": 0.8139,
1355
  "step": 955
1356
  },
1357
  {
1358
- "epoch": 0.08869999076041762,
1359
- "grad_norm": 5.09375,
1360
- "learning_rate": 1.9820554750231968e-05,
1361
- "loss": 0.7524,
1362
  "step": 960
1363
  },
1364
  {
1365
- "epoch": 0.08916196987896147,
1366
- "grad_norm": 5.5625,
1367
- "learning_rate": 1.981772188662346e-05,
1368
- "loss": 0.8152,
1369
  "step": 965
1370
  },
1371
  {
1372
- "epoch": 0.08962394899750531,
1373
- "grad_norm": 4.25,
1374
- "learning_rate": 1.981486704253042e-05,
1375
- "loss": 0.8497,
1376
  "step": 970
1377
  },
1378
  {
1379
- "epoch": 0.09008592811604915,
1380
- "grad_norm": 7.1875,
1381
- "learning_rate": 1.981199022434445e-05,
1382
- "loss": 0.8954,
1383
  "step": 975
1384
  },
1385
  {
1386
- "epoch": 0.090547907234593,
1387
- "grad_norm": 5.25,
1388
- "learning_rate": 1.9809091438506333e-05,
1389
- "loss": 0.8133,
1390
  "step": 980
1391
  },
1392
  {
1393
- "epoch": 0.09100988635313684,
1394
- "grad_norm": 4.4375,
1395
- "learning_rate": 1.980617069150603e-05,
1396
- "loss": 0.7652,
1397
  "step": 985
1398
  },
1399
  {
1400
- "epoch": 0.09147186547168068,
1401
- "grad_norm": 4.28125,
1402
- "learning_rate": 1.9803227989882693e-05,
1403
- "loss": 0.7811,
1404
  "step": 990
1405
  },
1406
  {
1407
- "epoch": 0.09193384459022452,
1408
- "grad_norm": 4.625,
1409
- "learning_rate": 1.9800263340224603e-05,
1410
- "loss": 0.8253,
1411
  "step": 995
1412
  },
1413
  {
1414
- "epoch": 0.09239582370876837,
1415
- "grad_norm": 4.625,
1416
- "learning_rate": 1.9797276749169192e-05,
1417
- "loss": 0.8832,
1418
  "step": 1000
1419
  },
1420
  {
1421
- "epoch": 0.09239582370876837,
1422
- "eval_loss": 0.8367779850959778,
1423
- "eval_runtime": 442.761,
1424
- "eval_samples_per_second": 20.587,
1425
- "eval_steps_per_second": 2.575,
1426
  "step": 1000
1427
- },
1428
- {
1429
- "epoch": 0.09285780282731221,
1430
- "grad_norm": 4.375,
1431
- "learning_rate": 1.9794268223403012e-05,
1432
- "loss": 1.0233,
1433
- "step": 1005
1434
- },
1435
- {
1436
- "epoch": 0.09331978194585605,
1437
- "grad_norm": 3.84375,
1438
- "learning_rate": 1.9791237769661728e-05,
1439
- "loss": 0.8371,
1440
- "step": 1010
1441
- },
1442
- {
1443
- "epoch": 0.0937817610643999,
1444
- "grad_norm": 4.40625,
1445
- "learning_rate": 1.978818539473009e-05,
1446
- "loss": 1.0483,
1447
- "step": 1015
1448
- },
1449
- {
1450
- "epoch": 0.09424374018294374,
1451
- "grad_norm": 4.625,
1452
- "learning_rate": 1.9785111105441942e-05,
1453
- "loss": 0.7854,
1454
- "step": 1020
1455
- },
1456
- {
1457
- "epoch": 0.09470571930148758,
1458
- "grad_norm": 4.40625,
1459
- "learning_rate": 1.9782014908680167e-05,
1460
- "loss": 0.6771,
1461
- "step": 1025
1462
- },
1463
- {
1464
- "epoch": 0.09516769842003142,
1465
- "grad_norm": 6.4375,
1466
- "learning_rate": 1.977889681137672e-05,
1467
- "loss": 0.7874,
1468
- "step": 1030
1469
- },
1470
- {
1471
- "epoch": 0.09562967753857525,
1472
- "grad_norm": 6.4375,
1473
- "learning_rate": 1.9775756820512574e-05,
1474
- "loss": 0.9448,
1475
- "step": 1035
1476
- },
1477
- {
1478
- "epoch": 0.09609165665711909,
1479
- "grad_norm": 5.78125,
1480
- "learning_rate": 1.9772594943117723e-05,
1481
- "loss": 0.8937,
1482
- "step": 1040
1483
- },
1484
- {
1485
- "epoch": 0.09655363577566294,
1486
- "grad_norm": 6.03125,
1487
- "learning_rate": 1.9769411186271162e-05,
1488
- "loss": 0.9173,
1489
- "step": 1045
1490
- },
1491
- {
1492
- "epoch": 0.09701561489420678,
1493
- "grad_norm": 7.84375,
1494
- "learning_rate": 1.976620555710087e-05,
1495
- "loss": 0.8327,
1496
- "step": 1050
1497
- },
1498
- {
1499
- "epoch": 0.09747759401275062,
1500
- "grad_norm": 4.28125,
1501
- "learning_rate": 1.9762978062783793e-05,
1502
- "loss": 0.7447,
1503
- "step": 1055
1504
- },
1505
- {
1506
- "epoch": 0.09793957313129446,
1507
- "grad_norm": 4.53125,
1508
- "learning_rate": 1.9759728710545836e-05,
1509
- "loss": 0.7932,
1510
- "step": 1060
1511
- },
1512
- {
1513
- "epoch": 0.0984015522498383,
1514
- "grad_norm": 5.21875,
1515
- "learning_rate": 1.9756457507661833e-05,
1516
- "loss": 0.8749,
1517
- "step": 1065
1518
- },
1519
- {
1520
- "epoch": 0.09886353136838215,
1521
- "grad_norm": 5.6875,
1522
- "learning_rate": 1.9753164461455548e-05,
1523
- "loss": 0.7447,
1524
- "step": 1070
1525
- },
1526
- {
1527
- "epoch": 0.09932551048692599,
1528
- "grad_norm": 5.375,
1529
- "learning_rate": 1.974984957929964e-05,
1530
- "loss": 0.7091,
1531
- "step": 1075
1532
- },
1533
- {
1534
- "epoch": 0.09978748960546983,
1535
- "grad_norm": 4.65625,
1536
- "learning_rate": 1.9746512868615656e-05,
1537
- "loss": 0.9072,
1538
- "step": 1080
1539
- },
1540
- {
1541
- "epoch": 0.10024946872401368,
1542
- "grad_norm": 4.5,
1543
- "learning_rate": 1.9743154336874024e-05,
1544
- "loss": 0.7947,
1545
- "step": 1085
1546
- },
1547
- {
1548
- "epoch": 0.10071144784255752,
1549
- "grad_norm": 4.75,
1550
- "learning_rate": 1.9739773991594017e-05,
1551
- "loss": 0.9017,
1552
- "step": 1090
1553
- },
1554
- {
1555
- "epoch": 0.10117342696110136,
1556
- "grad_norm": 5.21875,
1557
- "learning_rate": 1.9736371840343745e-05,
1558
- "loss": 0.7268,
1559
- "step": 1095
1560
- },
1561
- {
1562
- "epoch": 0.1016354060796452,
1563
- "grad_norm": 5.34375,
1564
- "learning_rate": 1.9732947890740143e-05,
1565
- "loss": 0.7386,
1566
- "step": 1100
1567
- },
1568
- {
1569
- "epoch": 0.10209738519818905,
1570
- "grad_norm": 5.4375,
1571
- "learning_rate": 1.972950215044895e-05,
1572
- "loss": 0.7445,
1573
- "step": 1105
1574
- },
1575
- {
1576
- "epoch": 0.10255936431673289,
1577
- "grad_norm": 4.28125,
1578
- "learning_rate": 1.9726034627184685e-05,
1579
- "loss": 0.8159,
1580
- "step": 1110
1581
- },
1582
- {
1583
- "epoch": 0.10302134343527672,
1584
- "grad_norm": 4.15625,
1585
- "learning_rate": 1.9722545328710643e-05,
1586
- "loss": 0.7683,
1587
- "step": 1115
1588
- },
1589
- {
1590
- "epoch": 0.10348332255382056,
1591
- "grad_norm": 5.9375,
1592
- "learning_rate": 1.971903426283887e-05,
1593
- "loss": 0.9848,
1594
- "step": 1120
1595
- },
1596
- {
1597
- "epoch": 0.1039453016723644,
1598
- "grad_norm": 4.9375,
1599
- "learning_rate": 1.971550143743014e-05,
1600
- "loss": 0.7591,
1601
- "step": 1125
1602
- },
1603
- {
1604
- "epoch": 0.10440728079090825,
1605
- "grad_norm": 5.15625,
1606
- "learning_rate": 1.971194686039394e-05,
1607
- "loss": 0.7353,
1608
- "step": 1130
1609
- },
1610
- {
1611
- "epoch": 0.10486925990945209,
1612
- "grad_norm": 5.25,
1613
- "learning_rate": 1.9708370539688476e-05,
1614
- "loss": 0.8047,
1615
- "step": 1135
1616
- },
1617
- {
1618
- "epoch": 0.10533123902799593,
1619
- "grad_norm": 4.625,
1620
- "learning_rate": 1.9704772483320616e-05,
1621
- "loss": 0.8734,
1622
- "step": 1140
1623
- },
1624
- {
1625
- "epoch": 0.10579321814653977,
1626
- "grad_norm": 4.59375,
1627
- "learning_rate": 1.9701152699345898e-05,
1628
- "loss": 1.0071,
1629
- "step": 1145
1630
- },
1631
- {
1632
- "epoch": 0.10625519726508362,
1633
- "grad_norm": 6.4375,
1634
- "learning_rate": 1.9697511195868504e-05,
1635
- "loss": 0.9231,
1636
- "step": 1150
1637
- },
1638
- {
1639
- "epoch": 0.10671717638362746,
1640
- "grad_norm": 4.375,
1641
- "learning_rate": 1.969384798104124e-05,
1642
- "loss": 0.7658,
1643
- "step": 1155
1644
- },
1645
- {
1646
- "epoch": 0.1071791555021713,
1647
- "grad_norm": 4.65625,
1648
- "learning_rate": 1.9690163063065532e-05,
1649
- "loss": 0.8602,
1650
- "step": 1160
1651
- },
1652
- {
1653
- "epoch": 0.10764113462071515,
1654
- "grad_norm": 6.03125,
1655
- "learning_rate": 1.9686456450191372e-05,
1656
- "loss": 0.8779,
1657
- "step": 1165
1658
- },
1659
- {
1660
- "epoch": 0.10810311373925899,
1661
- "grad_norm": 5.40625,
1662
- "learning_rate": 1.968272815071736e-05,
1663
- "loss": 0.8708,
1664
- "step": 1170
1665
- },
1666
- {
1667
- "epoch": 0.10856509285780283,
1668
- "grad_norm": 5.46875,
1669
- "learning_rate": 1.9678978172990612e-05,
1670
- "loss": 0.8517,
1671
- "step": 1175
1672
- },
1673
- {
1674
- "epoch": 0.10902707197634667,
1675
- "grad_norm": 6.59375,
1676
- "learning_rate": 1.9675206525406803e-05,
1677
- "loss": 0.8727,
1678
- "step": 1180
1679
- },
1680
- {
1681
- "epoch": 0.10948905109489052,
1682
- "grad_norm": 4.625,
1683
- "learning_rate": 1.9671413216410116e-05,
1684
- "loss": 0.7547,
1685
- "step": 1185
1686
- },
1687
- {
1688
- "epoch": 0.10995103021343436,
1689
- "grad_norm": 4.03125,
1690
- "learning_rate": 1.966759825449323e-05,
1691
- "loss": 0.7722,
1692
- "step": 1190
1693
- },
1694
- {
1695
- "epoch": 0.1104130093319782,
1696
- "grad_norm": 3.78125,
1697
- "learning_rate": 1.9663761648197302e-05,
1698
- "loss": 0.9483,
1699
- "step": 1195
1700
- },
1701
- {
1702
- "epoch": 0.11087498845052203,
1703
- "grad_norm": 3.84375,
1704
- "learning_rate": 1.965990340611195e-05,
1705
- "loss": 0.846,
1706
- "step": 1200
1707
- },
1708
- {
1709
- "epoch": 0.11133696756906587,
1710
- "grad_norm": 4.9375,
1711
- "learning_rate": 1.9656023536875227e-05,
1712
- "loss": 0.8802,
1713
- "step": 1205
1714
- },
1715
- {
1716
- "epoch": 0.11179894668760972,
1717
- "grad_norm": 3.921875,
1718
- "learning_rate": 1.965212204917361e-05,
1719
- "loss": 0.8081,
1720
- "step": 1210
1721
- },
1722
- {
1723
- "epoch": 0.11226092580615356,
1724
- "grad_norm": 5.0,
1725
- "learning_rate": 1.964819895174198e-05,
1726
- "loss": 0.8051,
1727
- "step": 1215
1728
- },
1729
- {
1730
- "epoch": 0.1127229049246974,
1731
- "grad_norm": 4.15625,
1732
- "learning_rate": 1.964425425336359e-05,
1733
- "loss": 0.8133,
1734
- "step": 1220
1735
- },
1736
- {
1737
- "epoch": 0.11318488404324124,
1738
- "grad_norm": 4.40625,
1739
- "learning_rate": 1.9640287962870063e-05,
1740
- "loss": 0.8061,
1741
- "step": 1225
1742
- },
1743
- {
1744
- "epoch": 0.11364686316178509,
1745
- "grad_norm": 3.984375,
1746
- "learning_rate": 1.9636300089141355e-05,
1747
- "loss": 0.7362,
1748
- "step": 1230
1749
- },
1750
- {
1751
- "epoch": 0.11410884228032893,
1752
- "grad_norm": 7.625,
1753
- "learning_rate": 1.9632290641105754e-05,
1754
- "loss": 1.0105,
1755
- "step": 1235
1756
- },
1757
- {
1758
- "epoch": 0.11457082139887277,
1759
- "grad_norm": 4.75,
1760
- "learning_rate": 1.962825962773984e-05,
1761
- "loss": 0.6403,
1762
- "step": 1240
1763
- },
1764
- {
1765
- "epoch": 0.11503280051741661,
1766
- "grad_norm": 5.75,
1767
- "learning_rate": 1.962420705806848e-05,
1768
- "loss": 0.8769,
1769
- "step": 1245
1770
- },
1771
- {
1772
- "epoch": 0.11549477963596046,
1773
- "grad_norm": 4.21875,
1774
- "learning_rate": 1.9620132941164806e-05,
1775
- "loss": 0.8211,
1776
- "step": 1250
1777
- },
1778
- {
1779
- "epoch": 0.1159567587545043,
1780
- "grad_norm": 4.8125,
1781
- "learning_rate": 1.961603728615018e-05,
1782
- "loss": 0.8711,
1783
- "step": 1255
1784
- },
1785
- {
1786
- "epoch": 0.11641873787304814,
1787
- "grad_norm": 6.375,
1788
- "learning_rate": 1.96119201021942e-05,
1789
- "loss": 0.8252,
1790
- "step": 1260
1791
- },
1792
- {
1793
- "epoch": 0.11688071699159198,
1794
- "grad_norm": 4.375,
1795
- "learning_rate": 1.9607781398514646e-05,
1796
- "loss": 0.8147,
1797
- "step": 1265
1798
- },
1799
- {
1800
- "epoch": 0.11734269611013583,
1801
- "grad_norm": 4.21875,
1802
- "learning_rate": 1.9603621184377498e-05,
1803
- "loss": 0.7746,
1804
- "step": 1270
1805
- },
1806
- {
1807
- "epoch": 0.11780467522867967,
1808
- "grad_norm": 4.5625,
1809
- "learning_rate": 1.9599439469096876e-05,
1810
- "loss": 0.965,
1811
- "step": 1275
1812
- },
1813
- {
1814
- "epoch": 0.1182666543472235,
1815
- "grad_norm": 4.34375,
1816
- "learning_rate": 1.9595236262035057e-05,
1817
- "loss": 0.711,
1818
- "step": 1280
1819
- },
1820
- {
1821
- "epoch": 0.11872863346576734,
1822
- "grad_norm": 4.1875,
1823
- "learning_rate": 1.959101157260241e-05,
1824
- "loss": 0.7464,
1825
- "step": 1285
1826
- },
1827
- {
1828
- "epoch": 0.11919061258431118,
1829
- "grad_norm": 5.125,
1830
- "learning_rate": 1.9586765410257424e-05,
1831
- "loss": 0.7712,
1832
- "step": 1290
1833
- },
1834
- {
1835
- "epoch": 0.11965259170285503,
1836
- "grad_norm": 6.0625,
1837
- "learning_rate": 1.958249778450665e-05,
1838
- "loss": 0.8344,
1839
- "step": 1295
1840
- },
1841
- {
1842
- "epoch": 0.12011457082139887,
1843
- "grad_norm": 5.84375,
1844
- "learning_rate": 1.95782087049047e-05,
1845
- "loss": 0.7442,
1846
- "step": 1300
1847
- },
1848
- {
1849
- "epoch": 0.12057654993994271,
1850
- "grad_norm": 7.8125,
1851
- "learning_rate": 1.957389818105421e-05,
1852
- "loss": 0.818,
1853
- "step": 1305
1854
- },
1855
- {
1856
- "epoch": 0.12103852905848655,
1857
- "grad_norm": 4.125,
1858
- "learning_rate": 1.9569566222605832e-05,
1859
- "loss": 0.6755,
1860
- "step": 1310
1861
- },
1862
- {
1863
- "epoch": 0.1215005081770304,
1864
- "grad_norm": 5.0625,
1865
- "learning_rate": 1.9565212839258204e-05,
1866
- "loss": 0.8302,
1867
- "step": 1315
1868
- },
1869
- {
1870
- "epoch": 0.12196248729557424,
1871
- "grad_norm": 4.3125,
1872
- "learning_rate": 1.9560838040757933e-05,
1873
- "loss": 0.7821,
1874
- "step": 1320
1875
- },
1876
- {
1877
- "epoch": 0.12242446641411808,
1878
- "grad_norm": 5.75,
1879
- "learning_rate": 1.955644183689957e-05,
1880
- "loss": 0.8591,
1881
- "step": 1325
1882
- },
1883
- {
1884
- "epoch": 0.12288644553266193,
1885
- "grad_norm": 4.0,
1886
- "learning_rate": 1.9552024237525597e-05,
1887
- "loss": 0.7422,
1888
- "step": 1330
1889
- },
1890
- {
1891
- "epoch": 0.12334842465120577,
1892
- "grad_norm": 5.71875,
1893
- "learning_rate": 1.9547585252526388e-05,
1894
- "loss": 0.7762,
1895
- "step": 1335
1896
- },
1897
- {
1898
- "epoch": 0.12381040376974961,
1899
- "grad_norm": 4.5625,
1900
- "learning_rate": 1.9543124891840196e-05,
1901
- "loss": 0.8118,
1902
- "step": 1340
1903
- },
1904
- {
1905
- "epoch": 0.12427238288829345,
1906
- "grad_norm": 5.0625,
1907
- "learning_rate": 1.9538643165453138e-05,
1908
- "loss": 0.7443,
1909
- "step": 1345
1910
- },
1911
- {
1912
- "epoch": 0.1247343620068373,
1913
- "grad_norm": 5.375,
1914
- "learning_rate": 1.9534140083399165e-05,
1915
- "loss": 0.864,
1916
- "step": 1350
1917
- },
1918
- {
1919
- "epoch": 0.12519634112538114,
1920
- "grad_norm": 4.1875,
1921
- "learning_rate": 1.9529615655760034e-05,
1922
- "loss": 0.7972,
1923
- "step": 1355
1924
- },
1925
- {
1926
- "epoch": 0.12565832024392498,
1927
- "grad_norm": 5.40625,
1928
- "learning_rate": 1.9525069892665295e-05,
1929
- "loss": 0.8191,
1930
- "step": 1360
1931
- },
1932
- {
1933
- "epoch": 0.12612029936246882,
1934
- "grad_norm": 4.9375,
1935
- "learning_rate": 1.952050280429227e-05,
1936
- "loss": 0.7575,
1937
- "step": 1365
1938
- },
1939
- {
1940
- "epoch": 0.12658227848101267,
1941
- "grad_norm": 5.3125,
1942
- "learning_rate": 1.9515914400866022e-05,
1943
- "loss": 0.817,
1944
- "step": 1370
1945
- },
1946
- {
1947
- "epoch": 0.1270442575995565,
1948
- "grad_norm": 5.5625,
1949
- "learning_rate": 1.951130469265933e-05,
1950
- "loss": 0.9624,
1951
- "step": 1375
1952
- },
1953
- {
1954
- "epoch": 0.12750623671810035,
1955
- "grad_norm": 4.84375,
1956
- "learning_rate": 1.9506673689992673e-05,
1957
- "loss": 0.8555,
1958
- "step": 1380
1959
- },
1960
- {
1961
- "epoch": 0.1279682158366442,
1962
- "grad_norm": 4.03125,
1963
- "learning_rate": 1.950202140323422e-05,
1964
- "loss": 0.803,
1965
- "step": 1385
1966
- },
1967
- {
1968
- "epoch": 0.12843019495518804,
1969
- "grad_norm": 4.5,
1970
- "learning_rate": 1.9497347842799767e-05,
1971
- "loss": 0.8625,
1972
- "step": 1390
1973
- },
1974
- {
1975
- "epoch": 0.12889217407373188,
1976
- "grad_norm": 5.90625,
1977
- "learning_rate": 1.9492653019152762e-05,
1978
- "loss": 0.7429,
1979
- "step": 1395
1980
- },
1981
- {
1982
- "epoch": 0.12935415319227572,
1983
- "grad_norm": 6.71875,
1984
- "learning_rate": 1.9487936942804237e-05,
1985
- "loss": 0.8089,
1986
- "step": 1400
1987
- },
1988
- {
1989
- "epoch": 0.12981613231081954,
1990
- "grad_norm": 5.84375,
1991
- "learning_rate": 1.948319962431283e-05,
1992
- "loss": 0.8077,
1993
- "step": 1405
1994
- },
1995
- {
1996
- "epoch": 0.13027811142936338,
1997
- "grad_norm": 4.0625,
1998
- "learning_rate": 1.9478441074284713e-05,
1999
- "loss": 0.8612,
2000
- "step": 1410
2001
- },
2002
- {
2003
- "epoch": 0.13074009054790722,
2004
- "grad_norm": 4.96875,
2005
- "learning_rate": 1.947366130337361e-05,
2006
- "loss": 0.7953,
2007
- "step": 1415
2008
- },
2009
- {
2010
- "epoch": 0.13120206966645107,
2011
- "grad_norm": 4.5625,
2012
- "learning_rate": 1.9468860322280746e-05,
2013
- "loss": 0.8029,
2014
- "step": 1420
2015
- },
2016
- {
2017
- "epoch": 0.1316640487849949,
2018
- "grad_norm": 6.0,
2019
- "learning_rate": 1.946403814175484e-05,
2020
- "loss": 0.9268,
2021
- "step": 1425
2022
- },
2023
- {
2024
- "epoch": 0.13212602790353875,
2025
- "grad_norm": 3.921875,
2026
- "learning_rate": 1.9459194772592062e-05,
2027
- "loss": 0.8112,
2028
- "step": 1430
2029
- },
2030
- {
2031
- "epoch": 0.1325880070220826,
2032
- "grad_norm": 4.96875,
2033
- "learning_rate": 1.9454330225636035e-05,
2034
- "loss": 0.6757,
2035
- "step": 1435
2036
- },
2037
- {
2038
- "epoch": 0.13304998614062644,
2039
- "grad_norm": 5.25,
2040
- "learning_rate": 1.944944451177778e-05,
2041
- "loss": 0.784,
2042
- "step": 1440
2043
- },
2044
- {
2045
- "epoch": 0.13351196525917028,
2046
- "grad_norm": 4.46875,
2047
- "learning_rate": 1.9444537641955725e-05,
2048
- "loss": 0.9202,
2049
- "step": 1445
2050
- },
2051
- {
2052
- "epoch": 0.13397394437771412,
2053
- "grad_norm": 4.90625,
2054
- "learning_rate": 1.943960962715565e-05,
2055
- "loss": 0.8239,
2056
- "step": 1450
2057
- },
2058
- {
2059
- "epoch": 0.13443592349625796,
2060
- "grad_norm": 5.125,
2061
- "learning_rate": 1.9434660478410676e-05,
2062
- "loss": 0.8025,
2063
- "step": 1455
2064
- },
2065
- {
2066
- "epoch": 0.1348979026148018,
2067
- "grad_norm": 5.03125,
2068
- "learning_rate": 1.9429690206801255e-05,
2069
- "loss": 0.8056,
2070
- "step": 1460
2071
- },
2072
- {
2073
- "epoch": 0.13535988173334565,
2074
- "grad_norm": 5.78125,
2075
- "learning_rate": 1.942469882345511e-05,
2076
- "loss": 0.9176,
2077
- "step": 1465
2078
- },
2079
- {
2080
- "epoch": 0.1358218608518895,
2081
- "grad_norm": 4.8125,
2082
- "learning_rate": 1.941968633954724e-05,
2083
- "loss": 0.8589,
2084
- "step": 1470
2085
- },
2086
- {
2087
- "epoch": 0.13628383997043333,
2088
- "grad_norm": 4.375,
2089
- "learning_rate": 1.9414652766299887e-05,
2090
- "loss": 0.9654,
2091
- "step": 1475
2092
- },
2093
- {
2094
- "epoch": 0.13674581908897718,
2095
- "grad_norm": 4.5,
2096
- "learning_rate": 1.9409598114982503e-05,
2097
- "loss": 0.7178,
2098
- "step": 1480
2099
- },
2100
- {
2101
- "epoch": 0.13720779820752102,
2102
- "grad_norm": 5.875,
2103
- "learning_rate": 1.9404522396911742e-05,
2104
- "loss": 0.8832,
2105
- "step": 1485
2106
- },
2107
- {
2108
- "epoch": 0.13766977732606486,
2109
- "grad_norm": 4.46875,
2110
- "learning_rate": 1.9399425623451405e-05,
2111
- "loss": 0.8601,
2112
- "step": 1490
2113
- },
2114
- {
2115
- "epoch": 0.1381317564446087,
2116
- "grad_norm": 4.0,
2117
- "learning_rate": 1.9394307806012454e-05,
2118
- "loss": 0.6866,
2119
- "step": 1495
2120
- },
2121
- {
2122
- "epoch": 0.13859373556315255,
2123
- "grad_norm": 5.125,
2124
- "learning_rate": 1.9389168956052945e-05,
2125
- "loss": 0.7372,
2126
- "step": 1500
2127
- },
2128
- {
2129
- "epoch": 0.13859373556315255,
2130
- "eval_loss": 0.8288407325744629,
2131
- "eval_runtime": 443.0014,
2132
- "eval_samples_per_second": 20.576,
2133
- "eval_steps_per_second": 2.573,
2134
- "step": 1500
2135
  }
2136
  ],
2137
  "logging_steps": 5,
2138
- "max_steps": 10823,
2139
  "num_input_tokens_seen": 0,
2140
  "num_train_epochs": 1,
2141
  "save_steps": 500,
@@ -2143,10 +1435,10 @@
2143
  "EarlyStoppingCallback": {
2144
  "args": {
2145
  "early_stopping_patience": 3,
2146
- "early_stopping_threshold": 0.01
2147
  },
2148
  "attributes": {
2149
- "early_stopping_patience_counter": 1
2150
  }
2151
  },
2152
  "TrainerControl": {
@@ -2160,7 +1452,7 @@
2160
  "attributes": {}
2161
  }
2162
  },
2163
- "total_flos": 6.4949569191936e+16,
2164
  "train_batch_size": 4,
2165
  "trial_name": null,
2166
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1000,
3
+ "best_metric": 1.0913933515548706,
4
+ "best_model_checkpoint": "output/reasoning-model_v11/checkpoint-1000",
5
+ "epoch": 0.37512895057676077,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0018756447528838038,
14
+ "grad_norm": 17.375,
15
+ "learning_rate": 1.0000000000000002e-06,
16
+ "loss": 1.5042,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.0037512895057676076,
21
+ "grad_norm": 14.875,
22
+ "learning_rate": 2.25e-06,
23
+ "loss": 1.413,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.005626934258651411,
28
+ "grad_norm": 13.375,
29
+ "learning_rate": 3.5e-06,
30
+ "loss": 1.5803,
31
  "step": 15
32
  },
33
  {
34
+ "epoch": 0.007502579011535215,
35
+ "grad_norm": 9.1875,
36
+ "learning_rate": 4.75e-06,
37
+ "loss": 1.5081,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.00937822376441902,
42
+ "grad_norm": 9.5,
43
+ "learning_rate": 6e-06,
44
+ "loss": 1.3924,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.011253868517302822,
49
+ "grad_norm": 7.96875,
50
+ "learning_rate": 7.25e-06,
51
+ "loss": 1.4063,
52
  "step": 30
53
  },
54
  {
55
+ "epoch": 0.013129513270186627,
56
+ "grad_norm": 7.78125,
57
+ "learning_rate": 8.5e-06,
58
+ "loss": 1.2718,
59
  "step": 35
60
  },
61
  {
62
+ "epoch": 0.01500515802307043,
63
+ "grad_norm": 10.4375,
64
+ "learning_rate": 9.75e-06,
65
+ "loss": 1.2445,
66
  "step": 40
67
  },
68
  {
69
+ "epoch": 0.016880802775954235,
70
+ "grad_norm": 7.59375,
71
+ "learning_rate": 1.1000000000000001e-05,
72
+ "loss": 1.1666,
73
  "step": 45
74
  },
75
  {
76
+ "epoch": 0.01875644752883804,
77
+ "grad_norm": 7.0,
78
+ "learning_rate": 1.2250000000000001e-05,
79
+ "loss": 1.2257,
80
  "step": 50
81
  },
82
  {
83
+ "epoch": 0.02063209228172184,
84
+ "grad_norm": 8.5625,
85
+ "learning_rate": 1.3500000000000001e-05,
86
+ "loss": 1.1071,
87
  "step": 55
88
  },
89
  {
90
+ "epoch": 0.022507737034605645,
91
+ "grad_norm": 7.5625,
92
+ "learning_rate": 1.4750000000000003e-05,
93
+ "loss": 1.2369,
94
  "step": 60
95
  },
96
  {
97
+ "epoch": 0.02438338178748945,
98
+ "grad_norm": 8.5625,
99
+ "learning_rate": 1.6000000000000003e-05,
100
+ "loss": 1.2355,
101
  "step": 65
102
  },
103
  {
104
+ "epoch": 0.026259026540373254,
105
+ "grad_norm": 9.8125,
106
+ "learning_rate": 1.7250000000000003e-05,
107
+ "loss": 1.0905,
108
  "step": 70
109
  },
110
  {
111
+ "epoch": 0.028134671293257058,
112
+ "grad_norm": 6.6875,
113
+ "learning_rate": 1.8500000000000002e-05,
114
+ "loss": 1.1362,
115
  "step": 75
116
  },
117
  {
118
+ "epoch": 0.03001031604614086,
119
+ "grad_norm": 12.875,
120
+ "learning_rate": 1.9750000000000002e-05,
121
+ "loss": 1.1355,
122
  "step": 80
123
  },
124
  {
125
+ "epoch": 0.031885960799024664,
126
+ "grad_norm": 8.0625,
127
+ "learning_rate": 1.999988193210057e-05,
128
+ "loss": 1.2262,
129
  "step": 85
130
  },
131
  {
132
+ "epoch": 0.03376160555190847,
133
+ "grad_norm": 6.40625,
134
+ "learning_rate": 1.9999402286037404e-05,
135
+ "loss": 1.0978,
136
  "step": 90
137
  },
138
  {
139
+ "epoch": 0.03563725030479227,
140
+ "grad_norm": 9.375,
141
+ "learning_rate": 1.9998553700250286e-05,
142
+ "loss": 1.098,
143
  "step": 95
144
  },
145
  {
146
+ "epoch": 0.03751289505767608,
147
+ "grad_norm": 7.21875,
148
+ "learning_rate": 1.9997336206048778e-05,
149
+ "loss": 1.2503,
150
  "step": 100
151
  },
152
  {
153
+ "epoch": 0.03938853981055988,
154
+ "grad_norm": 8.1875,
155
+ "learning_rate": 1.999574984835377e-05,
156
+ "loss": 1.0663,
157
  "step": 105
158
  },
159
  {
160
+ "epoch": 0.04126418456344368,
161
+ "grad_norm": 7.46875,
162
+ "learning_rate": 1.9993794685695792e-05,
163
+ "loss": 1.3083,
164
  "step": 110
165
  },
166
  {
167
+ "epoch": 0.04313982931632749,
168
+ "grad_norm": 6.9375,
169
+ "learning_rate": 1.9991470790212877e-05,
170
+ "loss": 1.1906,
171
  "step": 115
172
  },
173
  {
174
+ "epoch": 0.04501547406921129,
175
+ "grad_norm": 9.8125,
176
+ "learning_rate": 1.9988778247647887e-05,
177
+ "loss": 1.1358,
178
  "step": 120
179
  },
180
  {
181
+ "epoch": 0.046891118822095096,
182
+ "grad_norm": 9.375,
183
+ "learning_rate": 1.9985717157345346e-05,
184
+ "loss": 1.1421,
185
  "step": 125
186
  },
187
  {
188
+ "epoch": 0.0487667635749789,
189
+ "grad_norm": 7.125,
190
+ "learning_rate": 1.998228763224779e-05,
191
+ "loss": 1.1236,
192
  "step": 130
193
  },
194
  {
195
+ "epoch": 0.0506424083278627,
196
+ "grad_norm": 6.1875,
197
+ "learning_rate": 1.9978489798891584e-05,
198
+ "loss": 1.2223,
199
  "step": 135
200
  },
201
  {
202
+ "epoch": 0.05251805308074651,
203
+ "grad_norm": 6.875,
204
+ "learning_rate": 1.9974323797402264e-05,
205
+ "loss": 1.1818,
206
  "step": 140
207
  },
208
  {
209
+ "epoch": 0.05439369783363031,
210
+ "grad_norm": 5.78125,
211
+ "learning_rate": 1.996978978148936e-05,
212
+ "loss": 1.0341,
213
  "step": 145
214
  },
215
  {
216
+ "epoch": 0.056269342586514115,
217
+ "grad_norm": 7.40625,
218
+ "learning_rate": 1.9964887918440735e-05,
219
+ "loss": 1.118,
220
  "step": 150
221
  },
222
  {
223
+ "epoch": 0.058144987339397915,
224
+ "grad_norm": 7.71875,
225
+ "learning_rate": 1.995961838911639e-05,
226
+ "loss": 1.2993,
227
  "step": 155
228
  },
229
  {
230
+ "epoch": 0.06002063209228172,
231
+ "grad_norm": 7.03125,
232
+ "learning_rate": 1.995398138794182e-05,
233
+ "loss": 1.1533,
234
  "step": 160
235
  },
236
  {
237
+ "epoch": 0.06189627684516553,
238
+ "grad_norm": 7.40625,
239
+ "learning_rate": 1.9947977122900825e-05,
240
+ "loss": 1.0802,
241
  "step": 165
242
  },
243
  {
244
+ "epoch": 0.06377192159804933,
245
+ "grad_norm": 7.15625,
246
+ "learning_rate": 1.9941605815527827e-05,
247
+ "loss": 1.0135,
248
  "step": 170
249
  },
250
  {
251
+ "epoch": 0.06564756635093313,
252
+ "grad_norm": 6.25,
253
+ "learning_rate": 1.9934867700899724e-05,
254
+ "loss": 1.074,
255
  "step": 175
256
  },
257
  {
258
+ "epoch": 0.06752321110381694,
259
+ "grad_norm": 6.9375,
260
+ "learning_rate": 1.9927763027627184e-05,
261
+ "loss": 1.0574,
262
  "step": 180
263
  },
264
  {
265
+ "epoch": 0.06939885585670075,
266
+ "grad_norm": 8.4375,
267
+ "learning_rate": 1.99202920578455e-05,
268
+ "loss": 1.0316,
269
  "step": 185
270
  },
271
  {
272
+ "epoch": 0.07127450060958454,
273
+ "grad_norm": 7.28125,
274
+ "learning_rate": 1.9912455067204898e-05,
275
+ "loss": 1.0653,
276
  "step": 190
277
  },
278
  {
279
+ "epoch": 0.07315014536246835,
280
+ "grad_norm": 10.4375,
281
+ "learning_rate": 1.990425234486038e-05,
282
+ "loss": 1.1119,
283
  "step": 195
284
  },
285
  {
286
+ "epoch": 0.07502579011535215,
287
+ "grad_norm": 7.71875,
288
+ "learning_rate": 1.9895684193461047e-05,
289
+ "loss": 1.28,
290
  "step": 200
291
  },
292
  {
293
+ "epoch": 0.07690143486823596,
294
+ "grad_norm": 7.6875,
295
+ "learning_rate": 1.9886750929138935e-05,
296
+ "loss": 1.1392,
297
  "step": 205
298
  },
299
  {
300
+ "epoch": 0.07877707962111977,
301
+ "grad_norm": 5.90625,
302
+ "learning_rate": 1.987745288149735e-05,
303
+ "loss": 1.1955,
304
  "step": 210
305
  },
306
  {
307
+ "epoch": 0.08065272437400356,
308
+ "grad_norm": 6.625,
309
+ "learning_rate": 1.986779039359871e-05,
310
+ "loss": 1.1513,
311
  "step": 215
312
  },
313
  {
314
+ "epoch": 0.08252836912688737,
315
+ "grad_norm": 7.75,
316
+ "learning_rate": 1.985776382195189e-05,
317
+ "loss": 1.0088,
318
  "step": 220
319
  },
320
  {
321
+ "epoch": 0.08440401387977117,
322
+ "grad_norm": 7.5625,
323
+ "learning_rate": 1.984737353649906e-05,
324
+ "loss": 0.9908,
325
  "step": 225
326
  },
327
  {
328
+ "epoch": 0.08627965863265498,
329
+ "grad_norm": 8.5,
330
+ "learning_rate": 1.9836619920602032e-05,
331
+ "loss": 1.1958,
332
  "step": 230
333
  },
334
  {
335
+ "epoch": 0.08815530338553879,
336
+ "grad_norm": 11.3125,
337
+ "learning_rate": 1.9825503371028136e-05,
338
+ "loss": 1.1107,
339
  "step": 235
340
  },
341
  {
342
+ "epoch": 0.09003094813842258,
343
+ "grad_norm": 6.78125,
344
+ "learning_rate": 1.981402429793556e-05,
345
+ "loss": 1.062,
346
  "step": 240
347
  },
348
  {
349
+ "epoch": 0.09190659289130639,
350
+ "grad_norm": 7.75,
351
+ "learning_rate": 1.980218312485822e-05,
352
+ "loss": 1.1393,
353
  "step": 245
354
  },
355
  {
356
+ "epoch": 0.09378223764419019,
357
+ "grad_norm": 9.5625,
358
+ "learning_rate": 1.978998028869015e-05,
359
+ "loss": 1.0907,
360
  "step": 250
361
  },
362
  {
363
+ "epoch": 0.095657882397074,
364
+ "grad_norm": 9.4375,
365
+ "learning_rate": 1.977741623966936e-05,
366
+ "loss": 1.0887,
367
  "step": 255
368
  },
369
  {
370
+ "epoch": 0.0975335271499578,
371
+ "grad_norm": 8.125,
372
+ "learning_rate": 1.9764491441361227e-05,
373
+ "loss": 1.1234,
374
  "step": 260
375
  },
376
  {
377
+ "epoch": 0.0994091719028416,
378
+ "grad_norm": 7.03125,
379
+ "learning_rate": 1.975120637064142e-05,
380
+ "loss": 1.0825,
381
  "step": 265
382
  },
383
  {
384
+ "epoch": 0.1012848166557254,
385
+ "grad_norm": 7.5,
386
+ "learning_rate": 1.973756151767826e-05,
387
+ "loss": 1.0728,
388
  "step": 270
389
  },
390
  {
391
+ "epoch": 0.10316046140860921,
392
+ "grad_norm": 7.65625,
393
+ "learning_rate": 1.972355738591467e-05,
394
+ "loss": 1.2317,
395
  "step": 275
396
  },
397
  {
398
+ "epoch": 0.10503610616149302,
399
+ "grad_norm": 8.5625,
400
+ "learning_rate": 1.9709194492049585e-05,
401
+ "loss": 1.1281,
402
  "step": 280
403
  },
404
  {
405
+ "epoch": 0.10691175091437681,
406
+ "grad_norm": 5.84375,
407
+ "learning_rate": 1.9694473366018887e-05,
408
+ "loss": 0.9848,
409
  "step": 285
410
  },
411
  {
412
+ "epoch": 0.10878739566726062,
413
+ "grad_norm": 8.4375,
414
+ "learning_rate": 1.9679394550975864e-05,
415
+ "loss": 1.0793,
416
  "step": 290
417
  },
418
  {
419
+ "epoch": 0.11066304042014442,
420
+ "grad_norm": 8.875,
421
+ "learning_rate": 1.9663958603271148e-05,
422
+ "loss": 1.1765,
423
  "step": 295
424
  },
425
  {
426
+ "epoch": 0.11253868517302823,
427
+ "grad_norm": 6.4375,
428
+ "learning_rate": 1.9648166092432216e-05,
429
+ "loss": 1.0936,
430
  "step": 300
431
  },
432
  {
433
+ "epoch": 0.11441432992591204,
434
+ "grad_norm": 7.21875,
435
+ "learning_rate": 1.9632017601142353e-05,
436
+ "loss": 1.0343,
437
  "step": 305
438
  },
439
  {
440
+ "epoch": 0.11628997467879583,
441
+ "grad_norm": 8.75,
442
+ "learning_rate": 1.961551372521916e-05,
443
+ "loss": 1.2004,
444
  "step": 310
445
  },
446
  {
447
+ "epoch": 0.11816561943167964,
448
+ "grad_norm": 8.5,
449
+ "learning_rate": 1.9598655073592583e-05,
450
+ "loss": 1.1076,
451
  "step": 315
452
  },
453
  {
454
+ "epoch": 0.12004126418456344,
455
+ "grad_norm": 7.0625,
456
+ "learning_rate": 1.9581442268282426e-05,
457
+ "loss": 1.1502,
458
  "step": 320
459
  },
460
  {
461
+ "epoch": 0.12191690893744725,
462
+ "grad_norm": 7.65625,
463
+ "learning_rate": 1.956387594437541e-05,
464
+ "loss": 1.0411,
465
  "step": 325
466
  },
467
  {
468
+ "epoch": 0.12379255369033106,
469
+ "grad_norm": 7.59375,
470
+ "learning_rate": 1.9545956750001744e-05,
471
+ "loss": 0.9873,
472
  "step": 330
473
  },
474
  {
475
+ "epoch": 0.12566819844321486,
476
+ "grad_norm": 6.1875,
477
+ "learning_rate": 1.952768534631121e-05,
478
+ "loss": 1.1338,
479
  "step": 335
480
  },
481
  {
482
+ "epoch": 0.12754384319609866,
483
+ "grad_norm": 9.125,
484
+ "learning_rate": 1.950906240744877e-05,
485
+ "loss": 1.0202,
486
  "step": 340
487
  },
488
  {
489
+ "epoch": 0.12941948794898248,
490
+ "grad_norm": 8.5625,
491
+ "learning_rate": 1.9490088620529678e-05,
492
+ "loss": 1.1292,
493
  "step": 345
494
  },
495
  {
496
+ "epoch": 0.13129513270186627,
497
+ "grad_norm": 5.8125,
498
+ "learning_rate": 1.9470764685614158e-05,
499
+ "loss": 1.0699,
500
  "step": 350
501
  },
502
  {
503
+ "epoch": 0.13317077745475006,
504
+ "grad_norm": 6.15625,
505
+ "learning_rate": 1.945109131568154e-05,
506
+ "loss": 1.1161,
507
  "step": 355
508
  },
509
  {
510
+ "epoch": 0.13504642220763388,
511
+ "grad_norm": 6.125,
512
+ "learning_rate": 1.943106923660398e-05,
513
+ "loss": 1.099,
514
  "step": 360
515
  },
516
  {
517
+ "epoch": 0.13692206696051767,
518
+ "grad_norm": 6.53125,
519
+ "learning_rate": 1.9410699187119662e-05,
520
+ "loss": 1.0477,
521
  "step": 365
522
  },
523
  {
524
+ "epoch": 0.1387977117134015,
525
+ "grad_norm": 8.9375,
526
+ "learning_rate": 1.938998191880556e-05,
527
+ "loss": 1.0746,
528
  "step": 370
529
  },
530
  {
531
+ "epoch": 0.1406733564662853,
532
+ "grad_norm": 7.09375,
533
+ "learning_rate": 1.936891819604968e-05,
534
+ "loss": 1.0619,
535
  "step": 375
536
  },
537
  {
538
+ "epoch": 0.14254900121916908,
539
+ "grad_norm": 6.6875,
540
+ "learning_rate": 1.9347508796022888e-05,
541
+ "loss": 1.1807,
542
  "step": 380
543
  },
544
  {
545
+ "epoch": 0.1444246459720529,
546
+ "grad_norm": 6.75,
547
+ "learning_rate": 1.9325754508650208e-05,
548
+ "loss": 1.0707,
549
  "step": 385
550
  },
551
  {
552
+ "epoch": 0.1463002907249367,
553
+ "grad_norm": 7.1875,
554
+ "learning_rate": 1.9303656136581694e-05,
555
+ "loss": 1.0464,
556
  "step": 390
557
  },
558
  {
559
+ "epoch": 0.14817593547782051,
560
+ "grad_norm": 9.0,
561
+ "learning_rate": 1.928121449516281e-05,
562
+ "loss": 1.0882,
563
  "step": 395
564
  },
565
  {
566
+ "epoch": 0.1500515802307043,
567
+ "grad_norm": 7.71875,
568
+ "learning_rate": 1.9258430412404344e-05,
569
+ "loss": 1.125,
570
  "step": 400
571
  },
572
  {
573
+ "epoch": 0.1519272249835881,
574
+ "grad_norm": 6.6875,
575
+ "learning_rate": 1.9235304728951868e-05,
576
+ "loss": 1.1356,
577
  "step": 405
578
  },
579
  {
580
+ "epoch": 0.15380286973647192,
581
+ "grad_norm": 9.9375,
582
+ "learning_rate": 1.9211838298054704e-05,
583
+ "loss": 1.1532,
584
  "step": 410
585
  },
586
  {
587
+ "epoch": 0.1556785144893557,
588
+ "grad_norm": 8.5,
589
+ "learning_rate": 1.918803198553446e-05,
590
+ "loss": 1.1386,
591
  "step": 415
592
  },
593
  {
594
+ "epoch": 0.15755415924223953,
595
+ "grad_norm": 7.96875,
596
+ "learning_rate": 1.916388666975307e-05,
597
+ "loss": 1.064,
598
  "step": 420
599
  },
600
  {
601
+ "epoch": 0.15942980399512333,
602
+ "grad_norm": 9.0,
603
+ "learning_rate": 1.9139403241580403e-05,
604
+ "loss": 1.2345,
605
  "step": 425
606
  },
607
  {
608
+ "epoch": 0.16130544874800712,
609
+ "grad_norm": 7.25,
610
+ "learning_rate": 1.9114582604361368e-05,
611
+ "loss": 1.1021,
612
  "step": 430
613
  },
614
  {
615
+ "epoch": 0.16318109350089094,
616
+ "grad_norm": 9.4375,
617
+ "learning_rate": 1.9089425673882617e-05,
618
+ "loss": 1.1105,
619
  "step": 435
620
  },
621
  {
622
+ "epoch": 0.16505673825377473,
623
+ "grad_norm": 8.375,
624
+ "learning_rate": 1.906393337833872e-05,
625
+ "loss": 1.0213,
626
  "step": 440
627
  },
628
  {
629
+ "epoch": 0.16693238300665855,
630
+ "grad_norm": 6.5625,
631
+ "learning_rate": 1.9038106658297946e-05,
632
+ "loss": 1.2062,
633
  "step": 445
634
  },
635
  {
636
+ "epoch": 0.16880802775954235,
637
+ "grad_norm": 7.6875,
638
+ "learning_rate": 1.9011946466667553e-05,
639
+ "loss": 1.2408,
640
  "step": 450
641
  },
642
  {
643
+ "epoch": 0.17068367251242614,
644
+ "grad_norm": 6.3125,
645
+ "learning_rate": 1.8985453768658613e-05,
646
+ "loss": 1.0678,
647
  "step": 455
648
  },
649
  {
650
+ "epoch": 0.17255931726530996,
651
+ "grad_norm": 7.5,
652
+ "learning_rate": 1.8958629541750422e-05,
653
+ "loss": 1.071,
654
  "step": 460
655
  },
656
  {
657
+ "epoch": 0.17443496201819375,
658
+ "grad_norm": 8.4375,
659
+ "learning_rate": 1.893147477565443e-05,
660
+ "loss": 1.2118,
661
  "step": 465
662
  },
663
  {
664
+ "epoch": 0.17631060677107757,
665
+ "grad_norm": 7.90625,
666
+ "learning_rate": 1.8903990472277707e-05,
667
+ "loss": 1.0279,
668
  "step": 470
669
  },
670
  {
671
+ "epoch": 0.17818625152396136,
672
+ "grad_norm": 8.9375,
673
+ "learning_rate": 1.8876177645685997e-05,
674
+ "loss": 1.1233,
675
  "step": 475
676
  },
677
  {
678
+ "epoch": 0.18006189627684516,
679
+ "grad_norm": 6.78125,
680
+ "learning_rate": 1.8848037322066295e-05,
681
+ "loss": 1.0398,
682
  "step": 480
683
  },
684
  {
685
+ "epoch": 0.18193754102972898,
686
+ "grad_norm": 7.4375,
687
+ "learning_rate": 1.881957053968898e-05,
688
+ "loss": 1.1796,
689
  "step": 485
690
  },
691
  {
692
+ "epoch": 0.18381318578261277,
693
+ "grad_norm": 7.15625,
694
+ "learning_rate": 1.8790778348869516e-05,
695
+ "loss": 1.0767,
696
  "step": 490
697
  },
698
  {
699
+ "epoch": 0.1856888305354966,
700
+ "grad_norm": 8.375,
701
+ "learning_rate": 1.8761661811929686e-05,
702
+ "loss": 1.1285,
703
  "step": 495
704
  },
705
  {
706
+ "epoch": 0.18756447528838038,
707
+ "grad_norm": 8.875,
708
+ "learning_rate": 1.8732222003158423e-05,
709
+ "loss": 1.163,
710
  "step": 500
711
  },
712
  {
713
+ "epoch": 0.18756447528838038,
714
+ "eval_loss": 1.1088175773620605,
715
+ "eval_runtime": 108.1214,
716
+ "eval_samples_per_second": 20.486,
717
+ "eval_steps_per_second": 2.562,
718
  "step": 500
719
  },
720
  {
721
+ "epoch": 0.18944012004126418,
722
+ "grad_norm": 8.0625,
723
+ "learning_rate": 1.870246000877214e-05,
724
+ "loss": 1.031,
725
  "step": 505
726
  },
727
  {
728
+ "epoch": 0.191315764794148,
729
+ "grad_norm": 6.84375,
730
+ "learning_rate": 1.8672376926874668e-05,
731
+ "loss": 1.1465,
732
  "step": 510
733
  },
734
  {
735
+ "epoch": 0.1931914095470318,
736
+ "grad_norm": 6.90625,
737
+ "learning_rate": 1.8641973867416742e-05,
738
+ "loss": 1.1154,
739
  "step": 515
740
  },
741
  {
742
+ "epoch": 0.1950670542999156,
743
+ "grad_norm": 7.375,
744
+ "learning_rate": 1.8611251952155057e-05,
745
+ "loss": 1.0743,
746
  "step": 520
747
  },
748
  {
749
+ "epoch": 0.1969426990527994,
750
+ "grad_norm": 7.40625,
751
+ "learning_rate": 1.8580212314610847e-05,
752
+ "loss": 1.0347,
753
  "step": 525
754
  },
755
  {
756
+ "epoch": 0.1988183438056832,
757
+ "grad_norm": 6.28125,
758
+ "learning_rate": 1.85488561000281e-05,
759
+ "loss": 1.0763,
760
  "step": 530
761
  },
762
  {
763
+ "epoch": 0.20069398855856702,
764
+ "grad_norm": 5.84375,
765
+ "learning_rate": 1.8517184465331288e-05,
766
+ "loss": 1.078,
767
  "step": 535
768
  },
769
  {
770
+ "epoch": 0.2025696333114508,
771
+ "grad_norm": 8.1875,
772
+ "learning_rate": 1.848519857908267e-05,
773
+ "loss": 1.178,
774
  "step": 540
775
  },
776
  {
777
+ "epoch": 0.2044452780643346,
778
+ "grad_norm": 7.375,
779
+ "learning_rate": 1.845289962143918e-05,
780
+ "loss": 1.0804,
781
  "step": 545
782
  },
783
  {
784
+ "epoch": 0.20632092281721842,
785
+ "grad_norm": 7.96875,
786
+ "learning_rate": 1.8420288784108917e-05,
787
+ "loss": 1.1589,
788
  "step": 550
789
  },
790
  {
791
+ "epoch": 0.20819656757010221,
792
+ "grad_norm": 7.375,
793
+ "learning_rate": 1.8387367270307122e-05,
794
+ "loss": 1.1116,
795
  "step": 555
796
  },
797
  {
798
+ "epoch": 0.21007221232298603,
799
+ "grad_norm": 8.1875,
800
+ "learning_rate": 1.835413629471182e-05,
801
+ "loss": 0.9735,
802
  "step": 560
803
  },
804
  {
805
+ "epoch": 0.21194785707586983,
806
+ "grad_norm": 7.625,
807
+ "learning_rate": 1.832059708341899e-05,
808
+ "loss": 1.0991,
809
  "step": 565
810
  },
811
  {
812
+ "epoch": 0.21382350182875362,
813
+ "grad_norm": 9.125,
814
+ "learning_rate": 1.8286750873897338e-05,
815
+ "loss": 1.1227,
816
  "step": 570
817
  },
818
  {
819
+ "epoch": 0.21569914658163744,
820
+ "grad_norm": 10.0,
821
+ "learning_rate": 1.8252598914942624e-05,
822
+ "loss": 0.9956,
823
  "step": 575
824
  },
825
  {
826
+ "epoch": 0.21757479133452123,
827
+ "grad_norm": 7.8125,
828
+ "learning_rate": 1.8218142466631595e-05,
829
+ "loss": 1.0511,
830
  "step": 580
831
  },
832
  {
833
+ "epoch": 0.21945043608740505,
834
+ "grad_norm": 7.0,
835
+ "learning_rate": 1.8183382800275492e-05,
836
+ "loss": 1.1195,
837
  "step": 585
838
  },
839
  {
840
+ "epoch": 0.22132608084028885,
841
+ "grad_norm": 8.125,
842
+ "learning_rate": 1.8148321198373146e-05,
843
+ "loss": 1.0831,
844
  "step": 590
845
  },
846
  {
847
+ "epoch": 0.22320172559317264,
848
+ "grad_norm": 9.0625,
849
+ "learning_rate": 1.8112958954563647e-05,
850
+ "loss": 1.2181,
851
  "step": 595
852
  },
853
  {
854
+ "epoch": 0.22507737034605646,
855
+ "grad_norm": 5.6875,
856
+ "learning_rate": 1.8077297373578625e-05,
857
+ "loss": 1.0144,
858
  "step": 600
859
  },
860
  {
861
+ "epoch": 0.22695301509894025,
862
+ "grad_norm": 7.0,
863
+ "learning_rate": 1.8041337771194124e-05,
864
+ "loss": 0.8084,
865
  "step": 605
866
  },
867
  {
868
+ "epoch": 0.22882865985182407,
869
+ "grad_norm": 6.46875,
870
+ "learning_rate": 1.800508147418201e-05,
871
+ "loss": 1.0292,
872
  "step": 610
873
  },
874
  {
875
+ "epoch": 0.23070430460470787,
876
+ "grad_norm": 7.25,
877
+ "learning_rate": 1.796852982026107e-05,
878
+ "loss": 1.0705,
879
  "step": 615
880
  },
881
  {
882
+ "epoch": 0.23257994935759166,
883
+ "grad_norm": 9.4375,
884
+ "learning_rate": 1.7931684158047623e-05,
885
+ "loss": 1.0863,
886
  "step": 620
887
  },
888
  {
889
+ "epoch": 0.23445559411047548,
890
+ "grad_norm": 6.15625,
891
+ "learning_rate": 1.7894545847005764e-05,
892
+ "loss": 1.1707,
893
  "step": 625
894
  },
895
  {
896
+ "epoch": 0.23633123886335927,
897
+ "grad_norm": 7.71875,
898
+ "learning_rate": 1.7857116257397225e-05,
899
+ "loss": 0.9151,
900
  "step": 630
901
  },
902
  {
903
+ "epoch": 0.2382068836162431,
904
+ "grad_norm": 7.5625,
905
+ "learning_rate": 1.7819396770230796e-05,
906
+ "loss": 1.0119,
907
  "step": 635
908
  },
909
  {
910
+ "epoch": 0.24008252836912689,
911
+ "grad_norm": 7.21875,
912
+ "learning_rate": 1.7781388777211374e-05,
913
+ "loss": 1.1517,
914
  "step": 640
915
  },
916
  {
917
+ "epoch": 0.24195817312201068,
918
+ "grad_norm": 7.96875,
919
+ "learning_rate": 1.7743093680688626e-05,
920
+ "loss": 1.0162,
921
  "step": 645
922
  },
923
  {
924
+ "epoch": 0.2438338178748945,
925
+ "grad_norm": 7.78125,
926
+ "learning_rate": 1.7704512893605247e-05,
927
+ "loss": 1.208,
928
  "step": 650
929
  },
930
  {
931
+ "epoch": 0.2457094626277783,
932
+ "grad_norm": 6.34375,
933
+ "learning_rate": 1.7665647839444807e-05,
934
+ "loss": 1.0252,
935
  "step": 655
936
  },
937
  {
938
+ "epoch": 0.2475851073806621,
939
+ "grad_norm": 7.90625,
940
+ "learning_rate": 1.7626499952179255e-05,
941
+ "loss": 1.1601,
942
  "step": 660
943
  },
944
  {
945
+ "epoch": 0.2494607521335459,
946
+ "grad_norm": 8.625,
947
+ "learning_rate": 1.7587070676215995e-05,
948
+ "loss": 1.1795,
949
  "step": 665
950
  },
951
  {
952
+ "epoch": 0.2513363968864297,
953
+ "grad_norm": 6.59375,
954
+ "learning_rate": 1.75473614663446e-05,
955
+ "loss": 0.9092,
956
  "step": 670
957
  },
958
  {
959
+ "epoch": 0.2532120416393135,
960
+ "grad_norm": 12.5625,
961
+ "learning_rate": 1.750737378768314e-05,
962
+ "loss": 1.1697,
963
  "step": 675
964
  },
965
  {
966
+ "epoch": 0.2550876863921973,
967
+ "grad_norm": 7.59375,
968
+ "learning_rate": 1.7467109115624113e-05,
969
+ "loss": 1.1992,
970
  "step": 680
971
  },
972
  {
973
+ "epoch": 0.25696333114508113,
974
+ "grad_norm": 7.4375,
975
+ "learning_rate": 1.7426568935780007e-05,
976
+ "loss": 1.0365,
977
  "step": 685
978
  },
979
  {
980
+ "epoch": 0.25883897589796495,
981
+ "grad_norm": 7.84375,
982
+ "learning_rate": 1.7385754743928512e-05,
983
+ "loss": 1.2158,
984
  "step": 690
985
  },
986
  {
987
+ "epoch": 0.2607146206508487,
988
+ "grad_norm": 7.5,
989
+ "learning_rate": 1.7344668045957303e-05,
990
+ "loss": 1.1652,
991
  "step": 695
992
  },
993
  {
994
+ "epoch": 0.26259026540373254,
995
+ "grad_norm": 7.1875,
996
+ "learning_rate": 1.730331035780849e-05,
997
+ "loss": 1.1178,
998
  "step": 700
999
  },
1000
  {
1001
+ "epoch": 0.26446591015661636,
1002
+ "grad_norm": 10.375,
1003
+ "learning_rate": 1.726168320542269e-05,
1004
+ "loss": 1.0897,
1005
  "step": 705
1006
  },
1007
  {
1008
+ "epoch": 0.2663415549095001,
1009
+ "grad_norm": 7.625,
1010
+ "learning_rate": 1.7219788124682702e-05,
1011
+ "loss": 1.2076,
1012
  "step": 710
1013
  },
1014
  {
1015
+ "epoch": 0.26821719966238394,
1016
+ "grad_norm": 6.15625,
1017
+ "learning_rate": 1.7177626661356885e-05,
1018
+ "loss": 1.0381,
1019
  "step": 715
1020
  },
1021
  {
1022
+ "epoch": 0.27009284441526776,
1023
+ "grad_norm": 8.25,
1024
+ "learning_rate": 1.713520037104208e-05,
1025
+ "loss": 1.0232,
1026
  "step": 720
1027
  },
1028
  {
1029
+ "epoch": 0.27196848916815153,
1030
+ "grad_norm": 7.4375,
1031
+ "learning_rate": 1.709251081910623e-05,
1032
+ "loss": 1.0218,
1033
  "step": 725
1034
  },
1035
  {
1036
+ "epoch": 0.27384413392103535,
1037
+ "grad_norm": 9.625,
1038
+ "learning_rate": 1.704955958063063e-05,
1039
+ "loss": 1.0518,
1040
  "step": 730
1041
  },
1042
  {
1043
+ "epoch": 0.27571977867391917,
1044
+ "grad_norm": 8.3125,
1045
+ "learning_rate": 1.700634824035182e-05,
1046
+ "loss": 1.043,
1047
  "step": 735
1048
  },
1049
  {
1050
+ "epoch": 0.277595423426803,
1051
+ "grad_norm": 7.0,
1052
+ "learning_rate": 1.696287839260308e-05,
1053
+ "loss": 1.0724,
1054
  "step": 740
1055
  },
1056
  {
1057
+ "epoch": 0.27947106817968675,
1058
+ "grad_norm": 5.84375,
1059
+ "learning_rate": 1.6919151641255642e-05,
1060
+ "loss": 0.9938,
1061
  "step": 745
1062
  },
1063
  {
1064
+ "epoch": 0.2813467129325706,
1065
+ "grad_norm": 9.4375,
1066
+ "learning_rate": 1.6875169599659495e-05,
1067
+ "loss": 1.1153,
1068
  "step": 750
1069
  },
1070
  {
1071
+ "epoch": 0.2832223576854544,
1072
+ "grad_norm": 7.375,
1073
+ "learning_rate": 1.6830933890583863e-05,
1074
+ "loss": 1.1801,
1075
  "step": 755
1076
  },
1077
  {
1078
+ "epoch": 0.28509800243833816,
1079
+ "grad_norm": 7.4375,
1080
+ "learning_rate": 1.6786446146157332e-05,
1081
+ "loss": 1.1175,
1082
  "step": 760
1083
  },
1084
  {
1085
+ "epoch": 0.286973647191222,
1086
+ "grad_norm": 8.0,
1087
+ "learning_rate": 1.6741708007807626e-05,
1088
+ "loss": 1.1015,
1089
  "step": 765
1090
  },
1091
  {
1092
+ "epoch": 0.2888492919441058,
1093
+ "grad_norm": 8.1875,
1094
+ "learning_rate": 1.6696721126201048e-05,
1095
+ "loss": 1.2244,
1096
  "step": 770
1097
  },
1098
  {
1099
+ "epoch": 0.29072493669698957,
1100
+ "grad_norm": 9.5625,
1101
+ "learning_rate": 1.6651487161181577e-05,
1102
+ "loss": 1.0448,
1103
  "step": 775
1104
  },
1105
  {
1106
+ "epoch": 0.2926005814498734,
1107
+ "grad_norm": 8.6875,
1108
+ "learning_rate": 1.6606007781709626e-05,
1109
+ "loss": 1.0295,
1110
  "step": 780
1111
  },
1112
  {
1113
+ "epoch": 0.2944762262027572,
1114
+ "grad_norm": 5.625,
1115
+ "learning_rate": 1.6560284665800464e-05,
1116
+ "loss": 0.9881,
1117
  "step": 785
1118
  },
1119
  {
1120
+ "epoch": 0.29635187095564103,
1121
+ "grad_norm": 10.0625,
1122
+ "learning_rate": 1.6514319500462303e-05,
1123
+ "loss": 1.0681,
1124
  "step": 790
1125
  },
1126
  {
1127
+ "epoch": 0.2982275157085248,
1128
+ "grad_norm": 6.71875,
1129
+ "learning_rate": 1.646811398163405e-05,
1130
+ "loss": 1.0571,
1131
  "step": 795
1132
  },
1133
  {
1134
+ "epoch": 0.3001031604614086,
1135
+ "grad_norm": 5.6875,
1136
+ "learning_rate": 1.642166981412274e-05,
1137
+ "loss": 0.9326,
1138
  "step": 800
1139
  },
1140
  {
1141
+ "epoch": 0.30197880521429243,
1142
+ "grad_norm": 6.5625,
1143
+ "learning_rate": 1.6374988711540634e-05,
1144
+ "loss": 1.2067,
1145
  "step": 805
1146
  },
1147
  {
1148
+ "epoch": 0.3038544499671762,
1149
+ "grad_norm": 7.75,
1150
+ "learning_rate": 1.6328072396241993e-05,
1151
+ "loss": 1.1231,
1152
  "step": 810
1153
  },
1154
  {
1155
+ "epoch": 0.30573009472006,
1156
+ "grad_norm": 8.1875,
1157
+ "learning_rate": 1.6280922599259515e-05,
1158
+ "loss": 1.2164,
1159
  "step": 815
1160
  },
1161
  {
1162
+ "epoch": 0.30760573947294384,
1163
+ "grad_norm": 9.5625,
1164
+ "learning_rate": 1.62335410602405e-05,
1165
+ "loss": 1.0431,
1166
  "step": 820
1167
  },
1168
  {
1169
+ "epoch": 0.3094813842258276,
1170
+ "grad_norm": 9.3125,
1171
+ "learning_rate": 1.6185929527382628e-05,
1172
+ "loss": 1.2427,
1173
  "step": 825
1174
  },
1175
  {
1176
+ "epoch": 0.3113570289787114,
1177
+ "grad_norm": 7.125,
1178
+ "learning_rate": 1.6138089757369475e-05,
1179
+ "loss": 1.1003,
1180
  "step": 830
1181
  },
1182
  {
1183
+ "epoch": 0.31323267373159525,
1184
+ "grad_norm": 5.78125,
1185
+ "learning_rate": 1.6090023515305703e-05,
1186
+ "loss": 1.1261,
1187
  "step": 835
1188
  },
1189
  {
1190
+ "epoch": 0.31510831848447907,
1191
+ "grad_norm": 5.96875,
1192
+ "learning_rate": 1.604173257465192e-05,
1193
+ "loss": 0.9375,
1194
  "step": 840
1195
  },
1196
  {
1197
+ "epoch": 0.31698396323736283,
1198
+ "grad_norm": 7.875,
1199
+ "learning_rate": 1.5993218717159253e-05,
1200
+ "loss": 1.0321,
1201
  "step": 845
1202
  },
1203
  {
1204
+ "epoch": 0.31885960799024665,
1205
+ "grad_norm": 8.0625,
1206
+ "learning_rate": 1.5944483732803612e-05,
1207
+ "loss": 1.1149,
1208
  "step": 850
1209
  },
1210
  {
1211
+ "epoch": 0.3207352527431305,
1212
+ "grad_norm": 10.125,
1213
+ "learning_rate": 1.5895529419719645e-05,
1214
+ "loss": 0.8962,
1215
  "step": 855
1216
  },
1217
  {
1218
+ "epoch": 0.32261089749601424,
1219
+ "grad_norm": 7.5,
1220
+ "learning_rate": 1.5846357584134385e-05,
1221
+ "loss": 0.9511,
1222
  "step": 860
1223
  },
1224
  {
1225
+ "epoch": 0.32448654224889806,
1226
+ "grad_norm": 6.375,
1227
+ "learning_rate": 1.579697004030061e-05,
1228
+ "loss": 1.0865,
1229
  "step": 865
1230
  },
1231
  {
1232
+ "epoch": 0.3263621870017819,
1233
+ "grad_norm": 8.0625,
1234
+ "learning_rate": 1.5747368610429933e-05,
1235
+ "loss": 0.9981,
1236
  "step": 870
1237
  },
1238
  {
1239
+ "epoch": 0.32823783175466564,
1240
+ "grad_norm": 6.90625,
1241
+ "learning_rate": 1.569755512462551e-05,
1242
+ "loss": 1.1338,
1243
  "step": 875
1244
  },
1245
  {
1246
+ "epoch": 0.33011347650754946,
1247
+ "grad_norm": 6.6875,
1248
+ "learning_rate": 1.5647531420814574e-05,
1249
+ "loss": 0.9559,
1250
  "step": 880
1251
  },
1252
  {
1253
+ "epoch": 0.3319891212604333,
1254
+ "grad_norm": 7.59375,
1255
+ "learning_rate": 1.559729934468059e-05,
1256
+ "loss": 0.949,
1257
  "step": 885
1258
  },
1259
  {
1260
+ "epoch": 0.3338647660133171,
1261
+ "grad_norm": 8.75,
1262
+ "learning_rate": 1.5546860749595165e-05,
1263
+ "loss": 1.0738,
1264
  "step": 890
1265
  },
1266
  {
1267
+ "epoch": 0.33574041076620087,
1268
+ "grad_norm": 10.6875,
1269
+ "learning_rate": 1.5496217496549673e-05,
1270
+ "loss": 1.2675,
1271
  "step": 895
1272
  },
1273
  {
1274
+ "epoch": 0.3376160555190847,
1275
+ "grad_norm": 8.125,
1276
+ "learning_rate": 1.5445371454086574e-05,
1277
+ "loss": 1.1757,
1278
  "step": 900
1279
  },
1280
  {
1281
+ "epoch": 0.3394917002719685,
1282
+ "grad_norm": 7.59375,
1283
+ "learning_rate": 1.5394324498230487e-05,
1284
+ "loss": 1.1336,
1285
  "step": 905
1286
  },
1287
  {
1288
+ "epoch": 0.3413673450248523,
1289
+ "grad_norm": 7.65625,
1290
+ "learning_rate": 1.5343078512418977e-05,
1291
+ "loss": 1.008,
1292
  "step": 910
1293
  },
1294
  {
1295
+ "epoch": 0.3432429897777361,
1296
+ "grad_norm": 7.40625,
1297
+ "learning_rate": 1.529163538743303e-05,
1298
+ "loss": 1.1445,
1299
  "step": 915
1300
  },
1301
  {
1302
+ "epoch": 0.3451186345306199,
1303
+ "grad_norm": 6.5,
1304
+ "learning_rate": 1.5239997021327343e-05,
1305
+ "loss": 1.0209,
1306
  "step": 920
1307
  },
1308
  {
1309
+ "epoch": 0.3469942792835037,
1310
+ "grad_norm": 6.59375,
1311
+ "learning_rate": 1.518816531936024e-05,
1312
+ "loss": 1.0617,
1313
  "step": 925
1314
  },
1315
  {
1316
+ "epoch": 0.3488699240363875,
1317
+ "grad_norm": 7.90625,
1318
+ "learning_rate": 1.5136142193923413e-05,
1319
+ "loss": 1.0885,
1320
  "step": 930
1321
  },
1322
  {
1323
+ "epoch": 0.3507455687892713,
1324
+ "grad_norm": 7.875,
1325
+ "learning_rate": 1.5083929564471344e-05,
1326
+ "loss": 0.9534,
1327
  "step": 935
1328
  },
1329
  {
1330
+ "epoch": 0.35262121354215514,
1331
+ "grad_norm": 6.8125,
1332
+ "learning_rate": 1.5031529357450487e-05,
1333
+ "loss": 0.9802,
1334
  "step": 940
1335
  },
1336
  {
1337
+ "epoch": 0.3544968582950389,
1338
+ "grad_norm": 11.375,
1339
+ "learning_rate": 1.4978943506228198e-05,
1340
+ "loss": 1.149,
1341
  "step": 945
1342
  },
1343
  {
1344
+ "epoch": 0.35637250304792273,
1345
+ "grad_norm": 6.25,
1346
+ "learning_rate": 1.4926173951021384e-05,
1347
+ "loss": 1.179,
1348
  "step": 950
1349
  },
1350
  {
1351
+ "epoch": 0.35824814780080655,
1352
+ "grad_norm": 9.0625,
1353
+ "learning_rate": 1.4873222638824938e-05,
1354
+ "loss": 0.9677,
1355
  "step": 955
1356
  },
1357
  {
1358
+ "epoch": 0.3601237925536903,
1359
+ "grad_norm": 7.75,
1360
+ "learning_rate": 1.4820091523339883e-05,
1361
+ "loss": 1.0542,
1362
  "step": 960
1363
  },
1364
  {
1365
+ "epoch": 0.36199943730657413,
1366
+ "grad_norm": 7.0,
1367
+ "learning_rate": 1.4766782564901299e-05,
1368
+ "loss": 0.9912,
1369
  "step": 965
1370
  },
1371
  {
1372
+ "epoch": 0.36387508205945795,
1373
+ "grad_norm": 8.5,
1374
+ "learning_rate": 1.471329773040599e-05,
1375
+ "loss": 1.1461,
1376
  "step": 970
1377
  },
1378
  {
1379
+ "epoch": 0.3657507268123417,
1380
+ "grad_norm": 9.25,
1381
+ "learning_rate": 1.465963899323992e-05,
1382
+ "loss": 1.1274,
1383
  "step": 975
1384
  },
1385
  {
1386
+ "epoch": 0.36762637156522554,
1387
+ "grad_norm": 7.03125,
1388
+ "learning_rate": 1.4605808333205387e-05,
1389
+ "loss": 1.0867,
1390
  "step": 980
1391
  },
1392
  {
1393
+ "epoch": 0.36950201631810936,
1394
+ "grad_norm": 8.875,
1395
+ "learning_rate": 1.4551807736447996e-05,
1396
+ "loss": 1.0081,
1397
  "step": 985
1398
  },
1399
  {
1400
+ "epoch": 0.3713776610709932,
1401
+ "grad_norm": 6.96875,
1402
+ "learning_rate": 1.4497639195383362e-05,
1403
+ "loss": 0.9978,
1404
  "step": 990
1405
  },
1406
  {
1407
+ "epoch": 0.37325330582387695,
1408
+ "grad_norm": 7.875,
1409
+ "learning_rate": 1.4443304708623598e-05,
1410
+ "loss": 0.9439,
1411
  "step": 995
1412
  },
1413
  {
1414
+ "epoch": 0.37512895057676077,
1415
+ "grad_norm": 7.1875,
1416
+ "learning_rate": 1.4388806280903591e-05,
1417
+ "loss": 0.9749,
1418
  "step": 1000
1419
  },
1420
  {
1421
+ "epoch": 0.37512895057676077,
1422
+ "eval_loss": 1.0913933515548706,
1423
+ "eval_runtime": 107.8156,
1424
+ "eval_samples_per_second": 20.544,
1425
+ "eval_steps_per_second": 2.569,
1426
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1427
  }
1428
  ],
1429
  "logging_steps": 5,
1430
+ "max_steps": 2666,
1431
  "num_input_tokens_seen": 0,
1432
  "num_train_epochs": 1,
1433
  "save_steps": 500,
 
1435
  "EarlyStoppingCallback": {
1436
  "args": {
1437
  "early_stopping_patience": 3,
1438
+ "early_stopping_threshold": 0.001
1439
  },
1440
  "attributes": {
1441
+ "early_stopping_patience_counter": 0
1442
  }
1443
  },
1444
  "TrainerControl": {
 
1452
  "attributes": {}
1453
  }
1454
  },
1455
+ "total_flos": 4.3299712794624e+16,
1456
  "train_batch_size": 4,
1457
  "trial_name": null,
1458
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00d95a87267b3633f3a8999a8cb011935857ff9662d046856ee9b22cb9bf3e40
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66aac7b0763919a1f3f8b3a5c5a28e87094299c3e382fe15d07c8e8d2e5c782c
3
  size 5304