{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6323589954713874,
  "eval_steps": 500,
  "global_step": 48,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013174145738987238,
      "grad_norm": 0.5102696418762207,
      "learning_rate": 1.25e-07,
      "logits/chosen": 10.088521957397461,
      "logits/rejected": 10.263787269592285,
      "logps/chosen": -0.9118157029151917,
      "logps/rejected": -0.9621729850769043,
      "loss": 1.3897,
      "rewards/accuracies": 0.5234375,
      "rewards/chosen": -1.8236314058303833,
      "rewards/margins": 0.10071463882923126,
      "rewards/rejected": -1.9243459701538086,
      "step": 1
    },
    {
      "epoch": 0.026348291477974475,
      "grad_norm": 0.9815747141838074,
      "learning_rate": 2.5e-07,
      "logits/chosen": 10.592972755432129,
      "logits/rejected": 10.720216751098633,
      "logps/chosen": -0.945902943611145,
      "logps/rejected": -1.0317902565002441,
      "loss": 1.3077,
      "rewards/accuracies": 0.609375,
      "rewards/chosen": -1.89180588722229,
      "rewards/margins": 0.1717745065689087,
      "rewards/rejected": -2.0635805130004883,
      "step": 2
    },
    {
      "epoch": 0.03952243721696171,
      "grad_norm": 0.9049758315086365,
      "learning_rate": 3.75e-07,
      "logits/chosen": 10.041976928710938,
      "logits/rejected": 10.399316787719727,
      "logps/chosen": -1.0869810581207275,
      "logps/rejected": -1.1895216703414917,
      "loss": 1.346,
      "rewards/accuracies": 0.578125,
      "rewards/chosen": -2.173962116241455,
      "rewards/margins": 0.20508113503456116,
      "rewards/rejected": -2.3790433406829834,
      "step": 3
    },
    {
      "epoch": 0.05269658295594895,
      "grad_norm": 1.8911848068237305,
      "learning_rate": 5e-07,
      "logits/chosen": 10.243470191955566,
      "logits/rejected": 10.443375587463379,
      "logps/chosen": -0.966098427772522,
      "logps/rejected": -1.0040662288665771,
      "loss": 1.4032,
      "rewards/accuracies": 0.546875,
      "rewards/chosen": -1.932196855545044,
      "rewards/margins": 0.07593552023172379,
      "rewards/rejected": -2.0081324577331543,
      "step": 4
    },
    {
      "epoch": 0.06587072869493618,
      "grad_norm": 0.6135074496269226,
      "learning_rate": 6.249999999999999e-07,
      "logits/chosen": 10.439040184020996,
      "logits/rejected": 10.739177703857422,
      "logps/chosen": -0.9262609481811523,
      "logps/rejected": -0.9657196998596191,
      "loss": 1.3727,
      "rewards/accuracies": 0.5546875,
      "rewards/chosen": -1.8525218963623047,
      "rewards/margins": 0.07891744375228882,
      "rewards/rejected": -1.9314393997192383,
      "step": 5
    },
    {
      "epoch": 0.07904487443392343,
      "grad_norm": 0.5990542769432068,
      "learning_rate": 7.5e-07,
      "logits/chosen": 10.910269737243652,
      "logits/rejected": 11.204473495483398,
      "logps/chosen": -0.9439595341682434,
      "logps/rejected": -1.0420396327972412,
      "loss": 1.3491,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": -1.8879190683364868,
      "rewards/margins": 0.196160227060318,
      "rewards/rejected": -2.0840792655944824,
      "step": 6
    },
    {
      "epoch": 0.09221902017291066,
      "grad_norm": 1.3676807880401611,
      "learning_rate": 8.75e-07,
      "logits/chosen": 9.873465538024902,
      "logits/rejected": 10.022269248962402,
      "logps/chosen": -0.8941428661346436,
      "logps/rejected": -1.0010743141174316,
      "loss": 1.3507,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -1.788285732269287,
      "rewards/margins": 0.21386288106441498,
      "rewards/rejected": -2.0021486282348633,
      "step": 7
    },
    {
      "epoch": 0.1053931659118979,
      "grad_norm": 1.7690002918243408,
      "learning_rate": 1e-06,
      "logits/chosen": 10.597719192504883,
      "logits/rejected": 10.780376434326172,
      "logps/chosen": -0.9080270528793335,
      "logps/rejected": -0.9909782409667969,
      "loss": 1.3305,
      "rewards/accuracies": 0.640625,
      "rewards/chosen": -1.816054105758667,
      "rewards/margins": 0.16590236127376556,
      "rewards/rejected": -1.9819564819335938,
      "step": 8
    },
    {
      "epoch": 0.11856731165088513,
      "grad_norm": 1.056247353553772,
      "learning_rate": 9.994504457428556e-07,
      "logits/chosen": 10.446786880493164,
      "logits/rejected": 10.839168548583984,
      "logps/chosen": -1.1091859340667725,
      "logps/rejected": -1.0694739818572998,
      "loss": 1.5127,
      "rewards/accuracies": 0.5390625,
      "rewards/chosen": -2.218371868133545,
      "rewards/margins": -0.07942387461662292,
      "rewards/rejected": -2.1389479637145996,
      "step": 9
    },
    {
      "epoch": 0.13174145738987236,
      "grad_norm": 2.076240062713623,
      "learning_rate": 9.97802991010949e-07,
      "logits/chosen": 10.343971252441406,
      "logits/rejected": 10.492179870605469,
      "logps/chosen": -0.9705042839050293,
      "logps/rejected": -0.9916192889213562,
      "loss": 1.4611,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -1.9410085678100586,
      "rewards/margins": 0.04223020374774933,
      "rewards/rejected": -1.9832385778427124,
      "step": 10
    },
    {
      "epoch": 0.14491560312885962,
      "grad_norm": 1.13358736038208,
      "learning_rate": 9.950612572673255e-07,
      "logits/chosen": 10.49313735961914,
      "logits/rejected": 10.680143356323242,
      "logps/chosen": -1.1081148386001587,
      "logps/rejected": -1.223841667175293,
      "loss": 1.3449,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": -2.2162296772003174,
      "rewards/margins": 0.23145350813865662,
      "rewards/rejected": -2.447683334350586,
      "step": 11
    },
    {
      "epoch": 0.15808974886784685,
      "grad_norm": 1.1638388633728027,
      "learning_rate": 9.912312714377879e-07,
      "logits/chosen": 10.328557014465332,
      "logits/rejected": 10.365793228149414,
      "logps/chosen": -0.9249637722969055,
      "logps/rejected": -0.9842618703842163,
      "loss": 1.351,
      "rewards/accuracies": 0.6015625,
      "rewards/chosen": -1.849927544593811,
      "rewards/margins": 0.11859625577926636,
      "rewards/rejected": -1.9685237407684326,
      "step": 12
    },
    {
      "epoch": 0.17126389460683408,
      "grad_norm": 0.9941473007202148,
      "learning_rate": 9.863214526624063e-07,
      "logits/chosen": 9.909621238708496,
      "logits/rejected": 10.248769760131836,
      "logps/chosen": -0.9913480877876282,
      "logps/rejected": -1.1752512454986572,
      "loss": 1.2767,
      "rewards/accuracies": 0.5703125,
      "rewards/chosen": -1.9826961755752563,
      "rewards/margins": 0.36780619621276855,
      "rewards/rejected": -2.3505024909973145,
      "step": 13
    },
    {
      "epoch": 0.1844380403458213,
      "grad_norm": 1.3600269556045532,
      "learning_rate": 9.8034259378842e-07,
      "logits/chosen": 10.472145080566406,
      "logits/rejected": 10.987956047058105,
      "logps/chosen": -0.9751205444335938,
      "logps/rejected": -1.0532664060592651,
      "loss": 1.3626,
      "rewards/accuracies": 0.578125,
      "rewards/chosen": -1.9502410888671875,
      "rewards/margins": 0.15629185736179352,
      "rewards/rejected": -2.1065328121185303,
      "step": 14
    },
    {
      "epoch": 0.19761218608480857,
      "grad_norm": 0.3477364182472229,
      "learning_rate": 9.73307837645217e-07,
      "logits/chosen": 10.209980010986328,
      "logits/rejected": 10.457592964172363,
      "logps/chosen": -0.9716652035713196,
      "logps/rejected": -1.0775285959243774,
      "loss": 1.3132,
      "rewards/accuracies": 0.5859375,
      "rewards/chosen": -1.9433304071426392,
      "rewards/margins": 0.2117268592119217,
      "rewards/rejected": -2.155057191848755,
      "step": 15
    },
    {
      "epoch": 0.2107863318237958,
      "grad_norm": 0.975040853023529,
      "learning_rate": 9.652326481535433e-07,
      "logits/chosen": 10.770889282226562,
      "logits/rejected": 11.057292938232422,
      "logps/chosen": -0.9405269026756287,
      "logps/rejected": -0.9816387891769409,
      "loss": 1.4142,
      "rewards/accuracies": 0.5078125,
      "rewards/chosen": -1.8810538053512573,
      "rewards/margins": 0.08222392201423645,
      "rewards/rejected": -1.9632775783538818,
      "step": 16
    },
    {
      "epoch": 0.22396047756278303,
      "grad_norm": 0.47477808594703674,
      "learning_rate": 9.561347763324483e-07,
      "logits/chosen": 10.384443283081055,
      "logits/rejected": 10.546278953552246,
      "logps/chosen": -0.9655594229698181,
      "logps/rejected": -0.9963297247886658,
      "loss": 1.4058,
      "rewards/accuracies": 0.578125,
      "rewards/chosen": -1.9311188459396362,
      "rewards/margins": 0.061540693044662476,
      "rewards/rejected": -1.9926594495773315,
      "step": 17
    },
    {
      "epoch": 0.23713462330177026,
      "grad_norm": 0.9369856119155884,
      "learning_rate": 9.460342212786932e-07,
      "logits/chosen": 10.428518295288086,
      "logits/rejected": 10.742942810058594,
      "logps/chosen": -1.0061042308807373,
      "logps/rejected": -0.9558196067810059,
      "loss": 1.5279,
      "rewards/accuracies": 0.5390625,
      "rewards/chosen": -2.0122084617614746,
      "rewards/margins": -0.10056903213262558,
      "rewards/rejected": -1.9116392135620117,
      "step": 18
    },
    {
      "epoch": 0.2503087690407575,
      "grad_norm": 0.6867318153381348,
      "learning_rate": 9.349531862043951e-07,
      "logits/chosen": 10.536978721618652,
      "logits/rejected": 10.496305465698242,
      "logps/chosen": -1.0390187501907349,
      "logps/rejected": -1.1175179481506348,
      "loss": 1.3199,
      "rewards/accuracies": 0.6953125,
      "rewards/chosen": -2.0780375003814697,
      "rewards/margins": 0.1569983810186386,
      "rewards/rejected": -2.2350358963012695,
      "step": 19
    },
    {
      "epoch": 0.2634829147797447,
      "grad_norm": 1.6277413368225098,
      "learning_rate": 9.229160296295487e-07,
      "logits/chosen": 10.487991333007812,
      "logits/rejected": 10.849261283874512,
      "logps/chosen": -1.005416989326477,
      "logps/rejected": -1.0715974569320679,
      "loss": 1.3772,
      "rewards/accuracies": 0.6328125,
      "rewards/chosen": -2.010833978652954,
      "rewards/margins": 0.1323607861995697,
      "rewards/rejected": -2.1431949138641357,
      "step": 20
    },
    {
      "epoch": 0.276657060518732,
      "grad_norm": 1.1200292110443115,
      "learning_rate": 9.099492118367122e-07,
      "logits/chosen": 10.419047355651855,
      "logits/rejected": 10.756099700927734,
      "logps/chosen": -0.9289014935493469,
      "logps/rejected": -1.020794153213501,
      "loss": 1.3128,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -1.8578029870986938,
      "rewards/margins": 0.18378500640392303,
      "rewards/rejected": -2.041588306427002,
      "step": 21
    },
    {
      "epoch": 0.28983120625771924,
      "grad_norm": 1.3312275409698486,
      "learning_rate": 8.960812367055646e-07,
      "logits/chosen": 10.375129699707031,
      "logits/rejected": 10.722561836242676,
      "logps/chosen": -1.0812478065490723,
      "logps/rejected": -1.104426383972168,
      "loss": 1.438,
      "rewards/accuracies": 0.5703125,
      "rewards/chosen": -2.1624956130981445,
      "rewards/margins": 0.04635699465870857,
      "rewards/rejected": -2.208852767944336,
      "step": 22
    },
    {
      "epoch": 0.3030053519967065,
      "grad_norm": 0.36184069514274597,
      "learning_rate": 8.813425890551909e-07,
      "logits/chosen": 10.423131942749023,
      "logits/rejected": 10.69787311553955,
      "logps/chosen": -1.0428340435028076,
      "logps/rejected": -1.0485754013061523,
      "loss": 1.4543,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -2.0856680870056152,
      "rewards/margins": 0.01148274727165699,
      "rewards/rejected": -2.0971508026123047,
      "step": 23
    },
    {
      "epoch": 0.3161794977356937,
      "grad_norm": 0.5770995020866394,
      "learning_rate": 8.657656676318345e-07,
      "logits/chosen": 10.316368103027344,
      "logits/rejected": 10.479074478149414,
      "logps/chosen": -0.9681622982025146,
      "logps/rejected": -1.050377607345581,
      "loss": 1.3743,
      "rewards/accuracies": 0.515625,
      "rewards/chosen": -1.9363245964050293,
      "rewards/margins": 0.16443049907684326,
      "rewards/rejected": -2.100755214691162,
      "step": 24
    },
    {
      "epoch": 0.32935364347468093,
      "grad_norm": 2.933119297027588,
      "learning_rate": 8.493847138894208e-07,
      "logits/chosen": 10.537033081054688,
      "logits/rejected": 10.929938316345215,
      "logps/chosen": -0.8849888443946838,
      "logps/rejected": -0.9310915470123291,
      "loss": 1.3801,
      "rewards/accuracies": 0.5703125,
      "rewards/chosen": -1.7699776887893677,
      "rewards/margins": 0.09220531582832336,
      "rewards/rejected": -1.8621830940246582,
      "step": 25
    },
    {
      "epoch": 0.34252778921366817,
      "grad_norm": 0.6123780012130737,
      "learning_rate": 8.322357367194108e-07,
      "logits/chosen": 10.728922843933105,
      "logits/rejected": 10.86151123046875,
      "logps/chosen": -0.9060953855514526,
      "logps/rejected": -0.9559903144836426,
      "loss": 1.3675,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": -1.8121907711029053,
      "rewards/margins": 0.09978975355625153,
      "rewards/rejected": -1.9119806289672852,
      "step": 26
    },
    {
      "epoch": 0.3557019349526554,
      "grad_norm": 1.8228118419647217,
      "learning_rate": 8.143564332954425e-07,
      "logits/chosen": 10.698800086975098,
      "logits/rejected": 11.009087562561035,
      "logps/chosen": -1.0113650560379028,
      "logps/rejected": -1.0222567319869995,
      "loss": 1.4265,
      "rewards/accuracies": 0.5234375,
      "rewards/chosen": -2.0227301120758057,
      "rewards/margins": 0.021783310920000076,
      "rewards/rejected": -2.044513463973999,
      "step": 27
    },
    {
      "epoch": 0.3688760806916426,
      "grad_norm": 0.5645075440406799,
      "learning_rate": 7.957861062067612e-07,
      "logits/chosen": 10.079174995422363,
      "logits/rejected": 10.544219970703125,
      "logps/chosen": -0.905964195728302,
      "logps/rejected": -0.9862813949584961,
      "loss": 1.3617,
      "rewards/accuracies": 0.578125,
      "rewards/chosen": -1.811928391456604,
      "rewards/margins": 0.16063442826271057,
      "rewards/rejected": -1.9725627899169922,
      "step": 28
    },
    {
      "epoch": 0.3820502264306299,
      "grad_norm": 2.0421998500823975,
      "learning_rate": 7.765655770625996e-07,
      "logits/chosen": 10.456260681152344,
      "logits/rejected": 10.634986877441406,
      "logps/chosen": -0.8751351237297058,
      "logps/rejected": -0.9531751871109009,
      "loss": 1.3385,
      "rewards/accuracies": 0.671875,
      "rewards/chosen": -1.7502702474594116,
      "rewards/margins": 0.15608005225658417,
      "rewards/rejected": -1.9063503742218018,
      "step": 29
    },
    {
      "epoch": 0.39522437216961714,
      "grad_norm": 0.5317772030830383,
      "learning_rate": 7.567370967574209e-07,
      "logits/chosen": 10.539588928222656,
      "logits/rejected": 11.065520286560059,
      "logps/chosen": -0.9008954763412476,
      "logps/rejected": -0.9472513794898987,
      "loss": 1.3915,
      "rewards/accuracies": 0.5703125,
      "rewards/chosen": -1.8017909526824951,
      "rewards/margins": 0.0927119180560112,
      "rewards/rejected": -1.8945027589797974,
      "step": 30
    },
    {
      "epoch": 0.4083985179086044,
      "grad_norm": 0.929427981376648,
      "learning_rate": 7.363442525942826e-07,
      "logits/chosen": 10.132299423217773,
      "logits/rejected": 10.620383262634277,
      "logps/chosen": -0.8826152086257935,
      "logps/rejected": -0.9457932114601135,
      "loss": 1.3575,
      "rewards/accuracies": 0.5859375,
      "rewards/chosen": -1.765230417251587,
      "rewards/margins": 0.12635602056980133,
      "rewards/rejected": -1.891586422920227,
      "step": 31
    },
    {
      "epoch": 0.4215726636475916,
      "grad_norm": 0.287424236536026,
      "learning_rate": 7.154318724704851e-07,
      "logits/chosen": 10.419651985168457,
      "logits/rejected": 10.7128267288208,
      "logps/chosen": -0.9339243173599243,
      "logps/rejected": -0.95993971824646,
      "loss": 1.4233,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -1.8678486347198486,
      "rewards/margins": 0.052030690014362335,
      "rewards/rejected": -1.91987943649292,
      "step": 32
    },
    {
      "epoch": 0.43474680938657884,
      "grad_norm": 0.8097081184387207,
      "learning_rate": 6.940459263361248e-07,
      "logits/chosen": 10.180652618408203,
      "logits/rejected": 10.812442779541016,
      "logps/chosen": -0.9092710614204407,
      "logps/rejected": -0.8953740000724792,
      "loss": 1.4729,
      "rewards/accuracies": 0.515625,
      "rewards/chosen": -1.8185421228408813,
      "rewards/margins": -0.027794085443019867,
      "rewards/rejected": -1.7907480001449585,
      "step": 33
    },
    {
      "epoch": 0.44792095512556607,
      "grad_norm": 0.6889541745185852,
      "learning_rate": 6.722334251421664e-07,
      "logits/chosen": 10.093196868896484,
      "logits/rejected": 10.485943794250488,
      "logps/chosen": -1.0066155195236206,
      "logps/rejected": -1.033683180809021,
      "loss": 1.46,
      "rewards/accuracies": 0.5546875,
      "rewards/chosen": -2.013231039047241,
      "rewards/margins": 0.054135359823703766,
      "rewards/rejected": -2.067366361618042,
      "step": 34
    },
    {
      "epoch": 0.4610951008645533,
      "grad_norm": 1.549689531326294,
      "learning_rate": 6.500423175001703e-07,
      "logits/chosen": 10.556034088134766,
      "logits/rejected": 10.797924995422363,
      "logps/chosen": -0.9926838278770447,
      "logps/rejected": -1.0250440835952759,
      "loss": 1.45,
      "rewards/accuracies": 0.5703125,
      "rewards/chosen": -1.9853676557540894,
      "rewards/margins": 0.06472064554691315,
      "rewards/rejected": -2.0500881671905518,
      "step": 35
    },
    {
      "epoch": 0.47426924660354053,
      "grad_norm": 0.8994730710983276,
      "learning_rate": 6.275213842808382e-07,
      "logits/chosen": 10.474414825439453,
      "logits/rejected": 10.848793983459473,
      "logps/chosen": -0.875037431716919,
      "logps/rejected": -0.9078534841537476,
      "loss": 1.3809,
      "rewards/accuracies": 0.5390625,
      "rewards/chosen": -1.750074863433838,
      "rewards/margins": 0.0656321793794632,
      "rewards/rejected": -1.8157069683074951,
      "step": 36
    },
    {
      "epoch": 0.4874433923425278,
      "grad_norm": 0.47796007990837097,
      "learning_rate": 6.047201313830723e-07,
      "logits/chosen": 10.430042266845703,
      "logits/rejected": 10.628593444824219,
      "logps/chosen": -0.9276302456855774,
      "logps/rejected": -1.0414646863937378,
      "loss": 1.2843,
      "rewards/accuracies": 0.6796875,
      "rewards/chosen": -1.8552604913711548,
      "rewards/margins": 0.22766906023025513,
      "rewards/rejected": -2.0829293727874756,
      "step": 37
    },
    {
      "epoch": 0.500617538081515,
      "grad_norm": 1.8155546188354492,
      "learning_rate": 5.816886809092651e-07,
      "logits/chosen": 10.451735496520996,
      "logits/rejected": 10.694074630737305,
      "logps/chosen": -0.9971815943717957,
      "logps/rejected": -1.0258358716964722,
      "loss": 1.4073,
      "rewards/accuracies": 0.640625,
      "rewards/chosen": -1.9943631887435913,
      "rewards/margins": 0.057308606803417206,
      "rewards/rejected": -2.0516717433929443,
      "step": 38
    },
    {
      "epoch": 0.5137916838205022,
      "grad_norm": 1.4760124683380127,
      "learning_rate": 5.584776609860413e-07,
      "logits/chosen": 10.413124084472656,
      "logits/rejected": 10.561273574829102,
      "logps/chosen": -0.9041131734848022,
      "logps/rejected": -0.8810122609138489,
      "loss": 1.4648,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -1.8082263469696045,
      "rewards/margins": -0.046201735734939575,
      "rewards/rejected": -1.7620245218276978,
      "step": 39
    },
    {
      "epoch": 0.5269658295594895,
      "grad_norm": 0.5761233568191528,
      "learning_rate": 5.351380944726465e-07,
      "logits/chosen": 10.481398582458496,
      "logits/rejected": 10.893855094909668,
      "logps/chosen": -0.8643166422843933,
      "logps/rejected": -0.9709457159042358,
      "loss": 1.283,
      "rewards/accuracies": 0.671875,
      "rewards/chosen": -1.7286332845687866,
      "rewards/margins": 0.21325840055942535,
      "rewards/rejected": -1.9418914318084717,
      "step": 40
    },
    {
      "epoch": 0.5401399752984768,
      "grad_norm": 2.2331480979919434,
      "learning_rate": 5.117212868016303e-07,
      "logits/chosen": 10.834035873413086,
      "logits/rejected": 10.830018997192383,
      "logps/chosen": -0.8488631248474121,
      "logps/rejected": -0.896608293056488,
      "loss": 1.3632,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -1.6977262496948242,
      "rewards/margins": 0.09549038112163544,
      "rewards/rejected": -1.793216586112976,
      "step": 41
    },
    {
      "epoch": 0.553314121037464,
      "grad_norm": 0.8566221594810486,
      "learning_rate": 4.882787131983697e-07,
      "logits/chosen": 10.066476821899414,
      "logits/rejected": 10.382861137390137,
      "logps/chosen": -0.7725206613540649,
      "logps/rejected": -0.8358003497123718,
      "loss": 1.3445,
      "rewards/accuracies": 0.5390625,
      "rewards/chosen": -1.5450413227081299,
      "rewards/margins": 0.12655934691429138,
      "rewards/rejected": -1.6716006994247437,
      "step": 42
    },
    {
      "epoch": 0.5664882667764513,
      "grad_norm": 1.208364725112915,
      "learning_rate": 4.648619055273537e-07,
      "logits/chosen": 9.969656944274902,
      "logits/rejected": 10.292926788330078,
      "logps/chosen": -0.8092156052589417,
      "logps/rejected": -0.9089646339416504,
      "loss": 1.302,
      "rewards/accuracies": 0.6796875,
      "rewards/chosen": -1.6184312105178833,
      "rewards/margins": 0.19949808716773987,
      "rewards/rejected": -1.8179292678833008,
      "step": 43
    },
    {
      "epoch": 0.5796624125154385,
      "grad_norm": 0.6581346988677979,
      "learning_rate": 4.4152233901395875e-07,
      "logits/chosen": 10.372085571289062,
      "logits/rejected": 10.586714744567871,
      "logps/chosen": -0.8714113235473633,
      "logps/rejected": -0.8994148969650269,
      "loss": 1.384,
      "rewards/accuracies": 0.609375,
      "rewards/chosen": -1.7428226470947266,
      "rewards/margins": 0.05600719153881073,
      "rewards/rejected": -1.7988297939300537,
      "step": 44
    },
    {
      "epoch": 0.5928365582544257,
      "grad_norm": 0.4405794143676758,
      "learning_rate": 4.183113190907348e-07,
      "logits/chosen": 10.185098648071289,
      "logits/rejected": 10.43220329284668,
      "logps/chosen": -0.8584359288215637,
      "logps/rejected": -0.9151718616485596,
      "loss": 1.3552,
      "rewards/accuracies": 0.640625,
      "rewards/chosen": -1.7168718576431274,
      "rewards/margins": 0.11347203701734543,
      "rewards/rejected": -1.8303437232971191,
      "step": 45
    },
    {
      "epoch": 0.606010703993413,
      "grad_norm": 3.3551409244537354,
      "learning_rate": 3.9527986861692785e-07,
      "logits/chosen": 10.364571571350098,
      "logits/rejected": 10.566858291625977,
      "logps/chosen": -0.8840798735618591,
      "logps/rejected": -0.9646981954574585,
      "loss": 1.343,
      "rewards/accuracies": 0.5859375,
      "rewards/chosen": -1.7681597471237183,
      "rewards/margins": 0.16123665869235992,
      "rewards/rejected": -1.929396390914917,
      "step": 46
    },
    {
      "epoch": 0.6191848497324002,
      "grad_norm": 0.4494335949420929,
      "learning_rate": 3.724786157191618e-07,
      "logits/chosen": 10.511517524719238,
      "logits/rejected": 10.93988037109375,
      "logps/chosen": -0.9318357706069946,
      "logps/rejected": -0.9812297224998474,
      "loss": 1.3768,
      "rewards/accuracies": 0.578125,
      "rewards/chosen": -1.8636715412139893,
      "rewards/margins": 0.09878775477409363,
      "rewards/rejected": -1.9624594449996948,
      "step": 47
    },
    {
      "epoch": 0.6323589954713874,
      "grad_norm": 0.5882486701011658,
      "learning_rate": 3.499576824998297e-07,
      "logits/chosen": 10.835572242736816,
      "logits/rejected": 10.978731155395508,
      "logps/chosen": -0.8741555213928223,
      "logps/rejected": -0.974146842956543,
      "loss": 1.3111,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -1.7483110427856445,
      "rewards/margins": 0.1999826580286026,
      "rewards/rejected": -1.948293685913086,
      "step": 48
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 12,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}