{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0421052631578946,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 1.224082589149475,
      "kl": 0.0,
      "learning_rate": 5e-06,
      "logits/chosen": 270973525.3333333,
      "logits/rejected": 308584228.5714286,
      "logps/chosen": -743.7762586805555,
      "logps/rejected": -327.42550223214283,
      "loss": 0.5,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 1.7011622190475464,
      "kl": 0.08848989009857178,
      "learning_rate": 4.999405067699773e-06,
      "logits/chosen": 283945024.0,
      "logits/rejected": 300706848.0,
      "logps/chosen": -670.29150390625,
      "logps/rejected": -340.8790283203125,
      "loss": 0.5072,
      "rewards/chosen": -0.04161600396037102,
      "rewards/margins": -0.051780181005597115,
      "rewards/rejected": 0.010164177045226097,
      "step": 2
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 1.2693746089935303,
      "kl": 0.17015418410301208,
      "learning_rate": 4.997620553954645e-06,
      "logits/chosen": 268529444.5714286,
      "logits/rejected": 305261397.3333333,
      "logps/chosen": -813.0807756696429,
      "logps/rejected": -319.21929253472223,
      "loss": 0.496,
      "rewards/chosen": 0.018497141344206675,
      "rewards/margins": 0.028634450974918547,
      "rewards/rejected": -0.010137309630711874,
      "step": 3
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 2.279048442840576,
      "kl": 0.06164896488189697,
      "learning_rate": 4.994647308096509e-06,
      "logits/chosen": 252110563.55555555,
      "logits/rejected": 306722450.28571427,
      "logps/chosen": -748.6314019097222,
      "logps/rejected": -325.29725864955356,
      "loss": 0.4976,
      "rewards/chosen": 0.011831367181407081,
      "rewards/margins": 0.011405953797437841,
      "rewards/rejected": 0.0004254133839692388,
      "step": 4
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 1.5578685998916626,
      "kl": 0.06295323371887207,
      "learning_rate": 4.990486745229364e-06,
      "logits/chosen": 292113728.0,
      "logits/rejected": 316883904.0,
      "logps/chosen": -815.0404663085938,
      "logps/rejected": -337.57080078125,
      "loss": 0.499,
      "rewards/chosen": 0.006664060987532139,
      "rewards/margins": 0.011198383755981922,
      "rewards/rejected": -0.004534322768449783,
      "step": 5
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 1.006475567817688,
      "kl": 0.049591064453125,
      "learning_rate": 4.985140845555799e-06,
      "logits/chosen": 306853741.71428573,
      "logits/rejected": 325227320.8888889,
      "logps/chosen": -479.3779994419643,
      "logps/rejected": -286.5568576388889,
      "loss": 0.4958,
      "rewards/chosen": 0.026198712842805044,
      "rewards/margins": 0.030564389649837737,
      "rewards/rejected": -0.004365676807032691,
      "step": 6
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 1.2592947483062744,
      "kl": 0.15429818630218506,
      "learning_rate": 4.978612153434527e-06,
      "logits/chosen": 305409763.5555556,
      "logits/rejected": 309714139.4285714,
      "logps/chosen": -609.1155056423611,
      "logps/rejected": -277.2419956752232,
      "loss": 0.4977,
      "rewards/chosen": 0.01816416945722368,
      "rewards/margins": 0.028014377705634586,
      "rewards/rejected": -0.009850208248410906,
      "step": 7
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 1.420854926109314,
      "kl": 0.20131784677505493,
      "learning_rate": 4.970903776169403e-06,
      "logits/chosen": 300502112.0,
      "logits/rejected": 337999712.0,
      "logps/chosen": -599.28125,
      "logps/rejected": -321.2285461425781,
      "loss": 0.5003,
      "rewards/chosen": 0.006110990885645151,
      "rewards/margins": 0.016330440063029528,
      "rewards/rejected": -0.010219449177384377,
      "step": 8
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 1.0294359922409058,
      "kl": 0.15416035056114197,
      "learning_rate": 4.962019382530521e-06,
      "logits/chosen": 280579858.28571427,
      "logits/rejected": 310000554.6666667,
      "logps/chosen": -656.5274135044643,
      "logps/rejected": -314.7470703125,
      "loss": 0.4943,
      "rewards/chosen": 0.05247082880565098,
      "rewards/margins": 0.061781181111222216,
      "rewards/rejected": -0.009310352305571238,
      "step": 9
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 1.8129311800003052,
      "kl": 0.1927608847618103,
      "learning_rate": 4.9519632010080765e-06,
      "logits/chosen": 238834005.33333334,
      "logits/rejected": 301916818.28571427,
      "logps/chosen": -759.1360134548611,
      "logps/rejected": -292.9808872767857,
      "loss": 0.5004,
      "rewards/chosen": 0.014881134033203125,
      "rewards/margins": 0.019827809184789658,
      "rewards/rejected": -0.004946675151586533,
      "step": 10
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 1.232249140739441,
      "kl": 0.23234114050865173,
      "learning_rate": 4.9407400177998335e-06,
      "logits/chosen": 286699296.0,
      "logits/rejected": 300436576.0,
      "logps/chosen": -627.421875,
      "logps/rejected": -282.529296875,
      "loss": 0.4981,
      "rewards/chosen": 0.03665875270962715,
      "rewards/margins": 0.0362735278904438,
      "rewards/rejected": 0.0003852248191833496,
      "step": 11
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 1.2517226934432983,
      "kl": 0.2269219160079956,
      "learning_rate": 4.928355174533153e-06,
      "logits/chosen": 283396937.14285713,
      "logits/rejected": 298269838.2222222,
      "logps/chosen": -700.8916015625,
      "logps/rejected": -346.30750868055554,
      "loss": 0.4953,
      "rewards/chosen": 0.052171528339385986,
      "rewards/margins": 0.05235843691560957,
      "rewards/rejected": -0.00018690857622358535,
      "step": 12
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 1.2330269813537598,
      "kl": 0.2519031763076782,
      "learning_rate": 4.914814565722671e-06,
      "logits/chosen": 302669795.5555556,
      "logits/rejected": 313485238.85714287,
      "logps/chosen": -654.2307942708334,
      "logps/rejected": -330.16427176339283,
      "loss": 0.4958,
      "rewards/chosen": 0.0363319648636712,
      "rewards/margins": 0.07227063652068849,
      "rewards/rejected": -0.0359386716570173,
      "step": 13
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 1.4144365787506104,
      "kl": 0.130226731300354,
      "learning_rate": 4.900124635964823e-06,
      "logits/chosen": 276869248.0,
      "logits/rejected": 289425024.0,
      "logps/chosen": -519.8916015625,
      "logps/rejected": -265.8385314941406,
      "loss": 0.496,
      "rewards/chosen": 0.03466583415865898,
      "rewards/margins": 0.0368356395047158,
      "rewards/rejected": -0.002169805346056819,
      "step": 14
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 1.305309772491455,
      "kl": 0.12101024389266968,
      "learning_rate": 4.884292376870567e-06,
      "logits/chosen": 293158875.4285714,
      "logits/rejected": 304345201.7777778,
      "logps/chosen": -500.9711216517857,
      "logps/rejected": -344.5473361545139,
      "loss": 0.4944,
      "rewards/chosen": 0.06058854716164725,
      "rewards/margins": 0.07498784883627815,
      "rewards/rejected": -0.014399301674630906,
      "step": 15
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 1.447916865348816,
      "kl": 0.1458943486213684,
      "learning_rate": 4.867325323737765e-06,
      "logits/chosen": 290241507.5555556,
      "logits/rejected": 316717494.85714287,
      "logps/chosen": -713.4215494791666,
      "logps/rejected": -295.7858189174107,
      "loss": 0.4897,
      "rewards/chosen": 0.08352628681394789,
      "rewards/margins": 0.09608901836096294,
      "rewards/rejected": -0.012562731547015054,
      "step": 16
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 1.3701616525650024,
      "kl": 0.3826329708099365,
      "learning_rate": 4.849231551964771e-06,
      "logits/chosen": 279923008.0,
      "logits/rejected": 295235712.0,
      "logps/chosen": -549.1046752929688,
      "logps/rejected": -335.35260009765625,
      "loss": 0.4927,
      "rewards/chosen": 0.06883127987384796,
      "rewards/margins": 0.08687522634863853,
      "rewards/rejected": -0.018043946474790573,
      "step": 17
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 1.467745065689087,
      "kl": 0.5873703956604004,
      "learning_rate": 4.830019673206997e-06,
      "logits/chosen": 278496658.28571427,
      "logits/rejected": 300650268.4444444,
      "logps/chosen": -629.6449497767857,
      "logps/rejected": -360.1111653645833,
      "loss": 0.4923,
      "rewards/chosen": 0.06821728178433009,
      "rewards/margins": 0.0920389105403234,
      "rewards/rejected": -0.023821628755993314,
      "step": 18
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.9833446741104126,
      "kl": 0.7327308654785156,
      "learning_rate": 4.809698831278217e-06,
      "logits/chosen": 277803520.0,
      "logits/rejected": 308114651.4285714,
      "logps/chosen": -737.9443901909722,
      "logps/rejected": -338.15098353794644,
      "loss": 0.4907,
      "rewards/chosen": 0.10881086852815416,
      "rewards/margins": 0.15786849695538718,
      "rewards/rejected": -0.04905762842723301,
      "step": 19
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 1.154601812362671,
      "kl": 0.1896182894706726,
      "learning_rate": 4.788278697798619e-06,
      "logits/chosen": 283599232.0,
      "logits/rejected": 313346368.0,
      "logps/chosen": -707.3074951171875,
      "logps/rejected": -316.3603515625,
      "loss": 0.4943,
      "rewards/chosen": 0.04999881610274315,
      "rewards/margins": 0.0727224051952362,
      "rewards/rejected": -0.022723589092493057,
      "step": 20
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 1.574962854385376,
      "kl": 0.4762837886810303,
      "learning_rate": 4.765769467591626e-06,
      "logits/chosen": 287853293.71428573,
      "logits/rejected": 302228480.0,
      "logps/chosen": -572.1301618303571,
      "logps/rejected": -282.95513237847223,
      "loss": 0.488,
      "rewards/chosen": 0.12458467483520508,
      "rewards/margins": 0.15258528788884482,
      "rewards/rejected": -0.02800061305363973,
      "step": 21
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 1.5850838422775269,
      "kl": 0.5586809515953064,
      "learning_rate": 4.742181853831721e-06,
      "logits/chosen": 290416867.5555556,
      "logits/rejected": 309413156.5714286,
      "logps/chosen": -668.2986653645834,
      "logps/rejected": -324.6029575892857,
      "loss": 0.4884,
      "rewards/chosen": 0.09367326895395915,
      "rewards/margins": 0.1388323534102667,
      "rewards/rejected": -0.04515908445630755,
      "step": 22
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 1.4497779607772827,
      "kl": 0.5230355262756348,
      "learning_rate": 4.717527082945555e-06,
      "logits/chosen": 280489216.0,
      "logits/rejected": 309412736.0,
      "logps/chosen": -682.4186401367188,
      "logps/rejected": -332.3192138671875,
      "loss": 0.488,
      "rewards/chosen": 0.1085066944360733,
      "rewards/margins": 0.14131877198815346,
      "rewards/rejected": -0.032812077552080154,
      "step": 23
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 1.3657130002975464,
      "kl": 0.5021036863327026,
      "learning_rate": 4.69181688926877e-06,
      "logits/chosen": 239562038.85714287,
      "logits/rejected": 313432405.3333333,
      "logps/chosen": -761.9135044642857,
      "logps/rejected": -289.5192057291667,
      "loss": 0.4848,
      "rewards/chosen": 0.1538386004311698,
      "rewards/margins": 0.17894491955401406,
      "rewards/rejected": -0.02510631912284427,
      "step": 24
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 1.446842074394226,
      "kl": 0.6509883999824524,
      "learning_rate": 4.665063509461098e-06,
      "logits/chosen": 293611036.4444444,
      "logits/rejected": 316255341.71428573,
      "logps/chosen": -721.4010416666666,
      "logps/rejected": -285.1228724888393,
      "loss": 0.4881,
      "rewards/chosen": 0.12438484032948811,
      "rewards/margins": 0.1453892659573328,
      "rewards/rejected": -0.021004425627844676,
      "step": 25
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 1.2756037712097168,
      "kl": 0.4022580087184906,
      "learning_rate": 4.637279676682367e-06,
      "logits/chosen": 292368704.0,
      "logits/rejected": 317329792.0,
      "logps/chosen": -469.4183044433594,
      "logps/rejected": -302.54144287109375,
      "loss": 0.4966,
      "rewards/chosen": 0.07243937253952026,
      "rewards/margins": 0.10708872973918915,
      "rewards/rejected": -0.034649357199668884,
      "step": 26
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 1.2571344375610352,
      "kl": 1.006415605545044,
      "learning_rate": 4.608478614532215e-06,
      "logits/chosen": 252147291.42857143,
      "logits/rejected": 306243811.5555556,
      "logps/chosen": -751.1729910714286,
      "logps/rejected": -298.89708116319446,
      "loss": 0.4868,
      "rewards/chosen": 0.15643044880458287,
      "rewards/margins": 0.2008941164092412,
      "rewards/rejected": -0.044463667604658336,
      "step": 27
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 1.2348135709762573,
      "kl": 0.844231903553009,
      "learning_rate": 4.578674030756364e-06,
      "logits/chosen": 328972714.6666667,
      "logits/rejected": 334514980.5714286,
      "logps/chosen": -661.2038302951389,
      "logps/rejected": -348.25048828125,
      "loss": 0.4897,
      "rewards/chosen": 0.1231810384326511,
      "rewards/margins": 0.1448404531157206,
      "rewards/rejected": -0.021659414683069502,
      "step": 28
    },
    {
      "epoch": 0.6105263157894737,
      "grad_norm": 1.038001298904419,
      "kl": 0.5513710379600525,
      "learning_rate": 4.54788011072248e-06,
      "logits/chosen": 276993920.0,
      "logits/rejected": 302539648.0,
      "logps/chosen": -632.3786010742188,
      "logps/rejected": -318.97100830078125,
      "loss": 0.489,
      "rewards/chosen": 0.10169073939323425,
      "rewards/margins": 0.1353834606707096,
      "rewards/rejected": -0.03369272127747536,
      "step": 29
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 1.5136151313781738,
      "kl": 0.7220326066017151,
      "learning_rate": 4.516111510668707e-06,
      "logits/chosen": 276634130.28571427,
      "logits/rejected": 300450048.0,
      "logps/chosen": -667.6506696428571,
      "logps/rejected": -341.21375868055554,
      "loss": 0.4831,
      "rewards/chosen": 0.2004882778440203,
      "rewards/margins": 0.24258499533411054,
      "rewards/rejected": -0.042096717490090266,
      "step": 30
    },
    {
      "epoch": 0.6526315789473685,
      "grad_norm": 1.0843844413757324,
      "kl": 0.6163355112075806,
      "learning_rate": 4.4833833507280884e-06,
      "logits/chosen": 307501454.2222222,
      "logits/rejected": 303213787.4285714,
      "logps/chosen": -486.1819661458333,
      "logps/rejected": -337.0064174107143,
      "loss": 0.4833,
      "rewards/chosen": 0.12439311875237359,
      "rewards/margins": 0.1898724100892506,
      "rewards/rejected": -0.06547929133687701,
      "step": 31
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 1.1725709438323975,
      "kl": 0.6453120708465576,
      "learning_rate": 4.4497112077322045e-06,
      "logits/chosen": 314310528.0,
      "logits/rejected": 323297440.0,
      "logps/chosen": -570.1083374023438,
      "logps/rejected": -277.752197265625,
      "loss": 0.4858,
      "rewards/chosen": 0.13651692867279053,
      "rewards/margins": 0.190122302621603,
      "rewards/rejected": -0.053605373948812485,
      "step": 32
    },
    {
      "epoch": 0.6947368421052632,
      "grad_norm": 1.653479814529419,
      "kl": 1.0490498542785645,
      "learning_rate": 4.415111107797445e-06,
      "logits/chosen": 311787922.28571427,
      "logits/rejected": 350929635.5555556,
      "logps/chosen": -676.2388392857143,
      "logps/rejected": -374.4116482204861,
      "loss": 0.4785,
      "rewards/chosen": 0.2800070898873465,
      "rewards/margins": 0.29183324911291636,
      "rewards/rejected": -0.011826159225569831,
      "step": 33
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 1.5196908712387085,
      "kl": 0.8070676922798157,
      "learning_rate": 4.379599518697444e-06,
      "logits/chosen": 278536049.7777778,
      "logits/rejected": 313191350.85714287,
      "logps/chosen": -602.8457573784722,
      "logps/rejected": -309.17257254464283,
      "loss": 0.475,
      "rewards/chosen": 0.19871669345431858,
      "rewards/margins": 0.28476871952177985,
      "rewards/rejected": -0.08605202606746129,
      "step": 34
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 1.0484706163406372,
      "kl": 0.807353138923645,
      "learning_rate": 4.34319334202531e-06,
      "logits/chosen": 299660160.0,
      "logits/rejected": 306065632.0,
      "logps/chosen": -465.7568359375,
      "logps/rejected": -229.71621704101562,
      "loss": 0.4919,
      "rewards/chosen": 0.11409495025873184,
      "rewards/margins": 0.16953209787607193,
      "rewards/rejected": -0.05543714761734009,
      "step": 35
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 1.3274704217910767,
      "kl": 0.8778089880943298,
      "learning_rate": 4.305909905149389e-06,
      "logits/chosen": 270111561.14285713,
      "logits/rejected": 306747079.1111111,
      "logps/chosen": -576.6547502790179,
      "logps/rejected": -307.5883517795139,
      "loss": 0.4756,
      "rewards/chosen": 0.23983100482395717,
      "rewards/margins": 0.28282778887521653,
      "rewards/rejected": -0.04299678405125936,
      "step": 36
    },
    {
      "epoch": 0.7789473684210526,
      "grad_norm": 1.6128290891647339,
      "kl": 1.20198392868042,
      "learning_rate": 4.267766952966369e-06,
      "logits/chosen": 279583146.6666667,
      "logits/rejected": 321376694.85714287,
      "logps/chosen": -716.5959201388889,
      "logps/rejected": -284.7782505580357,
      "loss": 0.4762,
      "rewards/chosen": 0.2411472267574734,
      "rewards/margins": 0.34859214321015375,
      "rewards/rejected": -0.10744491645268031,
      "step": 37
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.4675477743148804,
      "kl": 0.7991436719894409,
      "learning_rate": 4.228782639455674e-06,
      "logits/chosen": 289123392.0,
      "logits/rejected": 312878304.0,
      "logps/chosen": -548.8831176757812,
      "logps/rejected": -298.0077819824219,
      "loss": 0.4854,
      "rewards/chosen": 0.1557559221982956,
      "rewards/margins": 0.20764141902327538,
      "rewards/rejected": -0.05188549682497978,
      "step": 38
    },
    {
      "epoch": 0.8210526315789474,
      "grad_norm": 1.5274215936660767,
      "kl": 1.3201950788497925,
      "learning_rate": 4.188975519039151e-06,
      "logits/chosen": 306054198.85714287,
      "logits/rejected": 318293873.7777778,
      "logps/chosen": -641.423828125,
      "logps/rejected": -323.07017686631946,
      "loss": 0.4675,
      "rewards/chosen": 0.2957319532121931,
      "rewards/margins": 0.35744103268971517,
      "rewards/rejected": -0.061709079477522105,
      "step": 39
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 1.8003302812576294,
      "kl": 1.431631326675415,
      "learning_rate": 4.1483645377501726e-06,
      "logits/chosen": 286558065.7777778,
      "logits/rejected": 281730944.0,
      "logps/chosen": -683.1265190972222,
      "logps/rejected": -394.296875,
      "loss": 0.4595,
      "rewards/chosen": 0.2565513981713189,
      "rewards/margins": 0.4815535848102872,
      "rewards/rejected": -0.22500218663896834,
      "step": 40
    },
    {
      "epoch": 0.8631578947368421,
      "grad_norm": 1.385386347770691,
      "kl": 1.2051702737808228,
      "learning_rate": 4.106969024216348e-06,
      "logits/chosen": 297838432.0,
      "logits/rejected": 317846336.0,
      "logps/chosen": -683.0381469726562,
      "logps/rejected": -333.70623779296875,
      "loss": 0.4735,
      "rewards/chosen": 0.30369484424591064,
      "rewards/margins": 0.38917434215545654,
      "rewards/rejected": -0.0854794979095459,
      "step": 41
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 2.4110379219055176,
      "kl": 1.4937351942062378,
      "learning_rate": 4.064808680460149e-06,
      "logits/chosen": 275613819.5862069,
      "logits/rejected": 329032821.0285714,
      "logps/chosen": -821.4407327586207,
      "logps/rejected": -363.7448660714286,
      "loss": 0.4399,
      "rewards/chosen": 0.4741830825805664,
      "rewards/margins": 0.6639985765729631,
      "rewards/rejected": -0.18981549399239675,
      "step": 42
    },
    {
      "epoch": 0.9052631578947369,
      "grad_norm": 1.4786880016326904,
      "kl": 1.3580402135849,
      "learning_rate": 4.021903572521802e-06,
      "logits/chosen": 292805532.9032258,
      "logits/rejected": 320448667.1515151,
      "logps/chosen": -615.5443548387096,
      "logps/rejected": -311.78767163825756,
      "loss": 0.4669,
      "rewards/chosen": 0.3574201829971806,
      "rewards/margins": 0.4636997006622344,
      "rewards/rejected": -0.10627951766505386,
      "step": 43
    },
    {
      "epoch": 0.9263157894736842,
      "grad_norm": 1.0773484706878662,
      "kl": 1.1364717483520508,
      "learning_rate": 3.978274120908957e-06,
      "logits/chosen": 298700241.45454544,
      "logits/rejected": 323973846.7096774,
      "logps/chosen": -521.9517045454545,
      "logps/rejected": -351.1326864919355,
      "loss": 0.4819,
      "rewards/chosen": 0.21399710395119406,
      "rewards/margins": 0.30352772575669273,
      "rewards/rejected": -0.08953062180549867,
      "step": 44
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 1.8579978942871094,
      "kl": 1.5150351524353027,
      "learning_rate": 3.933941090877615e-06,
      "logits/chosen": 250053461.33333334,
      "logits/rejected": 297370359.7419355,
      "logps/chosen": -674.4289180871212,
      "logps/rejected": -328.01861769153226,
      "loss": 0.4562,
      "rewards/chosen": 0.3749615929343484,
      "rewards/margins": 0.5518351999545726,
      "rewards/rejected": -0.17687360702022428,
      "step": 45
    },
    {
      "epoch": 0.968421052631579,
      "grad_norm": 1.442551612854004,
      "kl": 1.4570834636688232,
      "learning_rate": 3.888925582549006e-06,
      "logits/chosen": 296643956.3636364,
      "logits/rejected": 300621658.83870965,
      "logps/chosen": -726.8507339015151,
      "logps/rejected": -310.6574470766129,
      "loss": 0.4584,
      "rewards/chosen": 0.3162169022993608,
      "rewards/margins": 0.4777500119027504,
      "rewards/rejected": -0.16153310960338962,
      "step": 46
    },
    {
      "epoch": 0.9894736842105263,
      "grad_norm": 1.5954558849334717,
      "kl": 1.3480093479156494,
      "learning_rate": 3.8432490208670605e-06,
      "logits/chosen": 266226656.96969697,
      "logits/rejected": 327847011.0967742,
      "logps/chosen": -818.7182765151515,
      "logps/rejected": -304.5207283266129,
      "loss": 0.4383,
      "rewards/chosen": 0.4229244463371508,
      "rewards/margins": 0.5965663433541067,
      "rewards/rejected": -0.17364189701695595,
      "step": 47
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6833202838897705,
      "kl": 0.490053653717041,
      "learning_rate": 3.796933145401304e-06,
      "logits/chosen": 347558865.45454544,
      "logits/rejected": 323800656.84210527,
      "logps/chosen": -481.98979048295456,
      "logps/rejected": -379.3189761513158,
      "loss": 0.231,
      "rewards/chosen": 0.20139399441805753,
      "rewards/margins": 0.35350871200196476,
      "rewards/rejected": -0.15211471758390727,
      "step": 48
    },
    {
      "epoch": 1.0210526315789474,
      "grad_norm": 1.429196834564209,
      "kl": 2.1841373443603516,
      "learning_rate": 3.7500000000000005e-06,
      "logits/chosen": 271206684.4444444,
      "logits/rejected": 308082724.5714286,
      "logps/chosen": -739.5323893229166,
      "logps/rejected": -329.6499720982143,
      "loss": 0.452,
      "rewards/chosen": 0.4243852562374539,
      "rewards/margins": 0.6468354274356176,
      "rewards/rejected": -0.2224501711981637,
      "step": 49
    },
    {
      "epoch": 1.0421052631578946,
      "grad_norm": 1.9531340599060059,
      "kl": 2.2712414264678955,
      "learning_rate": 3.7024719222984696e-06,
      "logits/chosen": 283599008.0,
      "logits/rejected": 300947008.0,
      "logps/chosen": -664.508056640625,
      "logps/rejected": -342.87835693359375,
      "loss": 0.4543,
      "rewards/chosen": 0.5367215871810913,
      "rewards/margins": 0.7264900505542755,
      "rewards/rejected": -0.1897684633731842,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 144,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}