diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11956 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 7642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -2.795367956161499, + "logits/rejected": -2.783267021179199, + "logps/chosen": -1426.8870849609375, + "logps/rejected": -1156.8780517578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.834068775177002, + "logits/rejected": -2.826040267944336, + "logps/chosen": -1849.96728515625, + "logps/rejected": -1517.9075927734375, + "loss": 0.6933, + "rewards/accuracies": 0.2222222238779068, + "rewards/chosen": 9.649489948060364e-05, + "rewards/margins": -6.48392378934659e-05, + "rewards/rejected": 0.0001613341155461967, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.8028738498687744, + "logits/rejected": -2.802607297897339, + "logps/chosen": -1647.7279052734375, + "logps/rejected": -1586.53955078125, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0003318481321912259, + "rewards/margins": -0.0006864747265353799, + "rewards/rejected": 0.0003546266525518149, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.796626567840576, + "logits/rejected": -2.784531593322754, + "logps/chosen": -1275.7952880859375, + "logps/rejected": -957.0416259765625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00011474229540908709, + "rewards/margins": 5.542132930713706e-05, + "rewards/rejected": 5.932092244620435e-05, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.806631326675415, + "logits/rejected": -2.8096060752868652, + "logps/chosen": -1816.1331787109375, + "logps/rejected": -1482.34375, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0008866718853823841, + "rewards/margins": 0.0015330163296312094, + "rewards/rejected": -0.0006463441532105207, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.792966365814209, + "logits/rejected": -2.794290542602539, + "logps/chosen": -1384.16552734375, + "logps/rejected": -1125.865966796875, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0008044252172112465, + "rewards/margins": -0.00011883594561368227, + "rewards/rejected": -0.0006855892715975642, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.7673702239990234, + "logits/rejected": -2.7500195503234863, + "logps/chosen": -1217.739501953125, + "logps/rejected": -1133.6571044921875, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0003805880551226437, + "rewards/margins": -0.0006572251440957189, + "rewards/rejected": 0.0002766371180769056, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.7791049480438232, + "logits/rejected": -2.78680419921875, + "logps/chosen": -995.9110107421875, + "logps/rejected": -1084.221923828125, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0002094264345942065, + "rewards/margins": 0.0012642501387745142, + "rewards/rejected": -0.001054823980666697, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.808701753616333, + "logits/rejected": -2.799790859222412, + "logps/chosen": -1342.0267333984375, + "logps/rejected": -1569.693603515625, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0006827354663982987, + "rewards/margins": -0.00042857881635427475, + "rewards/rejected": -0.0002541565918363631, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.8107619285583496, + "logits/rejected": -2.781428575515747, + "logps/chosen": -1558.470458984375, + "logps/rejected": -1330.8447265625, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0006004419410601258, + "rewards/margins": 0.00014814567111898214, + "rewards/rejected": 0.00045229625538922846, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.791748285293579, + "logits/rejected": -2.796267032623291, + "logps/chosen": -1339.14013671875, + "logps/rejected": -1190.2244873046875, + "loss": 0.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00010777993884403259, + "rewards/margins": 0.00042422435944899917, + "rewards/rejected": -0.00031644434784539044, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.798377513885498, + "eval_logits/rejected": -2.790869951248169, + "eval_logps/chosen": -1562.5352783203125, + "eval_logps/rejected": -1351.64404296875, + "eval_loss": 0.6931096911430359, + "eval_rewards/accuracies": 0.494047611951828, + "eval_rewards/chosen": 0.00013916695024818182, + "eval_rewards/margins": -6.539197056554258e-05, + "eval_rewards/rejected": 0.00020455890626180917, + "eval_runtime": 222.1187, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 0.284, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.8092565536499023, + "logits/rejected": -2.8138070106506348, + "logps/chosen": -1324.2572021484375, + "logps/rejected": -977.5646362304688, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0004745650221593678, + "rewards/margins": 0.0005655864952132106, + "rewards/rejected": -9.102150943363085e-05, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.7817893028259277, + "logits/rejected": -2.7743403911590576, + "logps/chosen": -1438.9947509765625, + "logps/rejected": -1309.9793701171875, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -9.187074465444311e-05, + "rewards/margins": -0.00023352287826128304, + "rewards/rejected": 0.00014165208267513663, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.798006772994995, + "logits/rejected": -2.7805659770965576, + "logps/chosen": -1641.3851318359375, + "logps/rejected": -1417.8428955078125, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0013305357424542308, + "rewards/margins": 0.0011499880347400904, + "rewards/rejected": 0.00018054773681797087, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.837071180343628, + "logits/rejected": -2.8371713161468506, + "logps/chosen": -1430.638671875, + "logps/rejected": -1356.1748046875, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -7.816695870133117e-05, + "rewards/margins": -0.0002333349548280239, + "rewards/rejected": 0.0001551680325064808, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.828702449798584, + "logits/rejected": -2.8166086673736572, + "logps/chosen": -1465.7197265625, + "logps/rejected": -1289.9031982421875, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0003954515850637108, + "rewards/margins": -0.00013864324137102813, + "rewards/rejected": 0.0005340948118828237, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.8205227851867676, + "logits/rejected": -2.8256192207336426, + "logps/chosen": -1326.330322265625, + "logps/rejected": -1407.6767578125, + "loss": 0.6928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0017105670413002372, + "rewards/margins": 0.001616004854440689, + "rewards/rejected": 9.456199040869251e-05, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.7866125106811523, + "logits/rejected": -2.7868504524230957, + "logps/chosen": -1298.046630859375, + "logps/rejected": -1401.292236328125, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0013844614150002599, + "rewards/margins": 0.00066300731850788, + "rewards/rejected": 0.0007214541547000408, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.7920944690704346, + "logits/rejected": -2.785459280014038, + "logps/chosen": -1213.7120361328125, + "logps/rejected": -1024.6798095703125, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0017878736834973097, + "rewards/margins": 0.00030651717679575086, + "rewards/rejected": 0.001481356448493898, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.7856953144073486, + "logits/rejected": -2.788532257080078, + "logps/chosen": -1345.113525390625, + "logps/rejected": -1323.069091796875, + "loss": 0.6923, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0006524743512272835, + "rewards/margins": 0.00080809963401407, + "rewards/rejected": -0.00015562539920210838, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.819009304046631, + "logits/rejected": -2.8260154724121094, + "logps/chosen": -1667.3140869140625, + "logps/rejected": -1410.6632080078125, + "loss": 0.6923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0064869350753724575, + "rewards/margins": 0.005160582717508078, + "rewards/rejected": 0.0013263521250337362, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.797353744506836, + "eval_logits/rejected": -2.78989839553833, + "eval_logps/chosen": -1562.09912109375, + "eval_logps/rejected": -1351.3734130859375, + "eval_loss": 0.6924985647201538, + "eval_rewards/accuracies": 0.511904776096344, + "eval_rewards/chosen": 0.00449884170666337, + "eval_rewards/margins": 0.0015855859965085983, + "eval_rewards/rejected": 0.0029132559429854155, + "eval_runtime": 221.9883, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.767564058303833, + "logits/rejected": -2.7736399173736572, + "logps/chosen": -1639.3736572265625, + "logps/rejected": -1532.915771484375, + "loss": 0.6926, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.004849494434893131, + "rewards/margins": 0.0012285599950700998, + "rewards/rejected": 0.0036209344398230314, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.792647361755371, + "logits/rejected": -2.795532703399658, + "logps/chosen": -1155.707763671875, + "logps/rejected": -1254.2430419921875, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00366068072617054, + "rewards/margins": 0.0027366329450160265, + "rewards/rejected": 0.0009240478393621743, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.823070764541626, + "logits/rejected": -2.8033041954040527, + "logps/chosen": -1567.356689453125, + "logps/rejected": -1238.0029296875, + "loss": 0.6909, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0061655775643885136, + "rewards/margins": 0.006119781639426947, + "rewards/rejected": 4.579539017868228e-05, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.8057596683502197, + "logits/rejected": -2.79594349861145, + "logps/chosen": -1635.199951171875, + "logps/rejected": -1150.596435546875, + "loss": 0.692, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005689248908311129, + "rewards/margins": 0.0024390851613134146, + "rewards/rejected": 0.0032501642126590014, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.831570863723755, + "logits/rejected": -2.8272597789764404, + "logps/chosen": -1722.236328125, + "logps/rejected": -1391.227294921875, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006255643907934427, + "rewards/margins": 0.000186113640666008, + "rewards/rejected": 0.006069529801607132, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.7985095977783203, + "logits/rejected": -2.804990768432617, + "logps/chosen": -1589.3802490234375, + "logps/rejected": -1299.8837890625, + "loss": 0.6928, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0035744800698012114, + "rewards/margins": -0.003007827326655388, + "rewards/rejected": 0.006582307629287243, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.798835277557373, + "logits/rejected": -2.7780652046203613, + "logps/chosen": -1849.673583984375, + "logps/rejected": -1373.615478515625, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.012151990085840225, + "rewards/margins": 0.010117733851075172, + "rewards/rejected": 0.0020342557691037655, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.7835683822631836, + "logits/rejected": -2.7650108337402344, + "logps/chosen": -1215.5987548828125, + "logps/rejected": -1194.396240234375, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012225615791976452, + "rewards/margins": 0.0057212356477975845, + "rewards/rejected": 0.006504380609840155, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.7740135192871094, + "logits/rejected": -2.766909122467041, + "logps/chosen": -1559.8885498046875, + "logps/rejected": -1529.93994140625, + "loss": 0.6913, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.004790668375790119, + "rewards/margins": 0.0015975991263985634, + "rewards/rejected": 0.003193069249391556, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.8152096271514893, + "logits/rejected": -2.797621965408325, + "logps/chosen": -1500.9361572265625, + "logps/rejected": -1118.470947265625, + "loss": 0.6937, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003361351788043976, + "rewards/margins": -0.00021784492128062993, + "rewards/rejected": 0.0035791967529803514, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.794529438018799, + "eval_logits/rejected": -2.7872204780578613, + "eval_logps/chosen": -1561.58154296875, + "eval_logps/rejected": -1351.146240234375, + "eval_loss": 0.6909098625183105, + "eval_rewards/accuracies": 0.5376983880996704, + "eval_rewards/chosen": 0.009676768444478512, + "eval_rewards/margins": 0.004492546897381544, + "eval_rewards/rejected": 0.00518422294408083, + "eval_runtime": 222.0362, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.746243953704834, + "logits/rejected": -2.7564194202423096, + "logps/chosen": -1628.013427734375, + "logps/rejected": -1673.1204833984375, + "loss": 0.6906, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.004660730250179768, + "rewards/margins": -0.0001595167414052412, + "rewards/rejected": 0.004820247646421194, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.7773218154907227, + "logits/rejected": -2.7734968662261963, + "logps/chosen": -1595.344482421875, + "logps/rejected": -1299.114013671875, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011113145388662815, + "rewards/margins": 0.008930252864956856, + "rewards/rejected": 0.002182893455028534, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.7626965045928955, + "logits/rejected": -2.78471040725708, + "logps/chosen": -1494.483154296875, + "logps/rejected": -1481.1407470703125, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006823359522968531, + "rewards/margins": -0.0029601803980767727, + "rewards/rejected": 0.009783540852367878, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.7922797203063965, + "logits/rejected": -2.797437906265259, + "logps/chosen": -1879.9134521484375, + "logps/rejected": -1432.9617919921875, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024633025750517845, + "rewards/margins": 0.02049541473388672, + "rewards/rejected": 0.004137612413614988, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.7698655128479004, + "logits/rejected": -2.767660140991211, + "logps/chosen": -1492.0418701171875, + "logps/rejected": -1487.5615234375, + "loss": 0.6915, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.012676766142249107, + "rewards/margins": -0.007590385619550943, + "rewards/rejected": 0.020267153158783913, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.8128461837768555, + "logits/rejected": -2.8049893379211426, + "logps/chosen": -1627.913818359375, + "logps/rejected": -1413.0662841796875, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009338868781924248, + "rewards/margins": 0.01759433187544346, + "rewards/rejected": -0.00825546495616436, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.766038179397583, + "logits/rejected": -2.777884006500244, + "logps/chosen": -1519.193359375, + "logps/rejected": -1519.871337890625, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009928613901138306, + "rewards/margins": -0.002825252478942275, + "rewards/rejected": 0.012753868475556374, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.766292095184326, + "logits/rejected": -2.7770702838897705, + "logps/chosen": -1192.345703125, + "logps/rejected": -1111.7589111328125, + "loss": 0.6884, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015065248124301434, + "rewards/margins": 0.004281006287783384, + "rewards/rejected": 0.010784241370856762, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.7778379917144775, + "logits/rejected": -2.7809524536132812, + "logps/chosen": -1430.665283203125, + "logps/rejected": -1653.656005859375, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.022431906312704086, + "rewards/margins": 0.010017070919275284, + "rewards/rejected": 0.012414836324751377, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.756230115890503, + "logits/rejected": -2.7588517665863037, + "logps/chosen": -1730.5394287109375, + "logps/rejected": -1502.6300048828125, + "loss": 0.6867, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01862536557018757, + "rewards/margins": 0.007056856993585825, + "rewards/rejected": 0.011568508110940456, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.792309522628784, + "eval_logits/rejected": -2.7853169441223145, + "eval_logps/chosen": -1561.1024169921875, + "eval_logps/rejected": -1351.063232421875, + "eval_loss": 0.6892846822738647, + "eval_rewards/accuracies": 0.5595238208770752, + "eval_rewards/chosen": 0.014464985579252243, + "eval_rewards/margins": 0.008450279943645, + "eval_rewards/rejected": 0.006014703772962093, + "eval_runtime": 222.1309, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 0.284, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.8415043354034424, + "logits/rejected": -2.7838778495788574, + "logps/chosen": -1729.2962646484375, + "logps/rejected": -1401.625732421875, + "loss": 0.6906, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.011543015018105507, + "rewards/margins": -0.007434485945850611, + "rewards/rejected": 0.01897750422358513, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.779257297515869, + "logits/rejected": -2.76314640045166, + "logps/chosen": -1407.1253662109375, + "logps/rejected": -882.9191284179688, + "loss": 0.6872, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008474389091134071, + "rewards/margins": 0.0015680404612794518, + "rewards/rejected": 0.006906348280608654, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.761399745941162, + "logits/rejected": -2.765044927597046, + "logps/chosen": -1092.9505615234375, + "logps/rejected": -1225.6053466796875, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015711713582277298, + "rewards/margins": 0.01204732246696949, + "rewards/rejected": 0.0036643915809690952, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.7783565521240234, + "logits/rejected": -2.770191192626953, + "logps/chosen": -1729.1246337890625, + "logps/rejected": -1355.4561767578125, + "loss": 0.6848, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.030698176473379135, + "rewards/margins": 0.025134120136499405, + "rewards/rejected": 0.005564060527831316, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.7931742668151855, + "logits/rejected": -2.782345771789551, + "logps/chosen": -1868.706787109375, + "logps/rejected": -1616.277099609375, + "loss": 0.6943, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.013443054631352425, + "rewards/margins": -0.013905840925872326, + "rewards/rejected": 0.0004627843445632607, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.793461322784424, + "logits/rejected": -2.778320550918579, + "logps/chosen": -1475.04541015625, + "logps/rejected": -1274.9873046875, + "loss": 0.6861, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010366128757596016, + "rewards/margins": 0.004415568895637989, + "rewards/rejected": -0.01478169672191143, + "step": 460 + }, + { + "epoch": 0.12, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.7877297401428223, + "logits/rejected": -2.7807395458221436, + "logps/chosen": -1361.2353515625, + "logps/rejected": -1279.2861328125, + "loss": 0.6818, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012626223266124725, + "rewards/margins": 0.018246522173285484, + "rewards/rejected": -0.005620299372822046, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.812326192855835, + "logits/rejected": -2.7740156650543213, + "logps/chosen": -1300.698486328125, + "logps/rejected": -972.8448486328125, + "loss": 0.6845, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.001179039478302002, + "rewards/margins": 0.026014486327767372, + "rewards/rejected": -0.027193522080779076, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.7627980709075928, + "logits/rejected": -2.761810541152954, + "logps/chosen": -1492.3681640625, + "logps/rejected": -1233.1475830078125, + "loss": 0.6853, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02171669341623783, + "rewards/margins": 0.015219648368656635, + "rewards/rejected": 0.006497042719274759, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.781360149383545, + "logits/rejected": -2.7785849571228027, + "logps/chosen": -1810.2314453125, + "logps/rejected": -1465.8914794921875, + "loss": 0.6921, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0031786567997187376, + "rewards/margins": -0.0039799753576517105, + "rewards/rejected": 0.00715863099321723, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.789332389831543, + "eval_logits/rejected": -2.782862663269043, + "eval_logps/chosen": -1562.4755859375, + "eval_logps/rejected": -1352.8848876953125, + "eval_loss": 0.686707615852356, + "eval_rewards/accuracies": 0.5734127163887024, + "eval_rewards/chosen": 0.00073534733382985, + "eval_rewards/margins": 0.01293771993368864, + "eval_rewards/rejected": -0.012202374637126923, + "eval_runtime": 222.0162, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.7887978553771973, + "logits/rejected": -2.782435655593872, + "logps/chosen": -1512.6820068359375, + "logps/rejected": -1341.0167236328125, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008763214573264122, + "rewards/margins": 0.010548645630478859, + "rewards/rejected": -0.001785430358722806, + "step": 510 + }, + { + "epoch": 0.14, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.7754645347595215, + "logits/rejected": -2.7587666511535645, + "logps/chosen": -1542.706298828125, + "logps/rejected": -1310.3466796875, + "loss": 0.6867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0118903499096632, + "rewards/margins": 0.02525492012500763, + "rewards/rejected": -0.01336456835269928, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.809363842010498, + "logits/rejected": -2.8009610176086426, + "logps/chosen": -1153.473388671875, + "logps/rejected": -1087.641357421875, + "loss": 0.6878, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005253266543149948, + "rewards/margins": 0.028111198917031288, + "rewards/rejected": -0.02285792864859104, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.827617883682251, + "logits/rejected": -2.8249759674072266, + "logps/chosen": -1407.7529296875, + "logps/rejected": -1284.2222900390625, + "loss": 0.6853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006767097860574722, + "rewards/margins": 0.021953441202640533, + "rewards/rejected": -0.015186344273388386, + "step": 540 + }, + { + "epoch": 0.14, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -2.7767438888549805, + "logits/rejected": -2.7729332447052, + "logps/chosen": -1286.873291015625, + "logps/rejected": -1123.163818359375, + "loss": 0.68, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.008051171898841858, + "rewards/margins": 0.0015671855071559548, + "rewards/rejected": 0.006483986973762512, + "step": 550 + }, + { + "epoch": 0.15, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -2.774984359741211, + "logits/rejected": -2.764968156814575, + "logps/chosen": -1587.454833984375, + "logps/rejected": -1288.8699951171875, + "loss": 0.6843, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0070315636694431305, + "rewards/margins": 0.024187782779335976, + "rewards/rejected": -0.017156217247247696, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.7530901432037354, + "logits/rejected": -2.7850308418273926, + "logps/chosen": -1379.649658203125, + "logps/rejected": -1344.728515625, + "loss": 0.6877, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.021333372220396996, + "rewards/margins": -0.03623160347342491, + "rewards/rejected": 0.014898233115673065, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -2.7495524883270264, + "logits/rejected": -2.741302251815796, + "logps/chosen": -1309.9893798828125, + "logps/rejected": -1294.010498046875, + "loss": 0.694, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.002793100429698825, + "rewards/margins": 0.03543297201395035, + "rewards/rejected": -0.03822607547044754, + "step": 580 + }, + { + "epoch": 0.15, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -2.803638458251953, + "logits/rejected": -2.794506549835205, + "logps/chosen": -1869.0966796875, + "logps/rejected": -1490.4566650390625, + "loss": 0.6817, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0023705377243459225, + "rewards/margins": 0.050473470240831375, + "rewards/rejected": -0.048102933913469315, + "step": 590 + }, + { + "epoch": 0.16, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -2.742553472518921, + "logits/rejected": -2.748706102371216, + "logps/chosen": -1474.915771484375, + "logps/rejected": -1218.5316162109375, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0037573534063994884, + "rewards/margins": 0.022884074598550797, + "rewards/rejected": -0.026641424745321274, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.780553102493286, + "eval_logits/rejected": -2.7740375995635986, + "eval_logps/chosen": -1562.0875244140625, + "eval_logps/rejected": -1353.28662109375, + "eval_loss": 0.6838135123252869, + "eval_rewards/accuracies": 0.591269850730896, + "eval_rewards/chosen": 0.004615093115717173, + "eval_rewards/margins": 0.02083371952176094, + "eval_rewards/rejected": -0.016218625009059906, + "eval_runtime": 221.9789, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -2.796996593475342, + "logits/rejected": -2.7961318492889404, + "logps/chosen": -1551.861328125, + "logps/rejected": -1349.908203125, + "loss": 0.6787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00037096330197528005, + "rewards/margins": 0.019331419840455055, + "rewards/rejected": -0.01896045356988907, + "step": 610 + }, + { + "epoch": 0.16, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -2.764712333679199, + "logits/rejected": -2.7671523094177246, + "logps/chosen": -1459.205078125, + "logps/rejected": -1273.398681640625, + "loss": 0.6829, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004301647190004587, + "rewards/margins": 0.02450401708483696, + "rewards/rejected": -0.02020237222313881, + "step": 620 + }, + { + "epoch": 0.16, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -2.781093120574951, + "logits/rejected": -2.7770438194274902, + "logps/chosen": -1245.2060546875, + "logps/rejected": -1376.152099609375, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.001100041321478784, + "rewards/margins": 0.004019447136670351, + "rewards/rejected": -0.002919405000284314, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -2.7561535835266113, + "logits/rejected": -2.755764961242676, + "logps/chosen": -1589.4029541015625, + "logps/rejected": -1410.949951171875, + "loss": 0.6752, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03143695369362831, + "rewards/margins": 0.008360566571354866, + "rewards/rejected": -0.03979751467704773, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -2.775606393814087, + "logits/rejected": -2.7552859783172607, + "logps/chosen": -1316.2952880859375, + "logps/rejected": -1200.760986328125, + "loss": 0.6912, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.008813906461000443, + "rewards/margins": 0.0012561812764033675, + "rewards/rejected": 0.007557724602520466, + "step": 650 + }, + { + "epoch": 0.17, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -2.7824950218200684, + "logits/rejected": -2.780513048171997, + "logps/chosen": -1619.6287841796875, + "logps/rejected": -1157.877685546875, + "loss": 0.6842, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0035666965413838625, + "rewards/margins": 0.019938554614782333, + "rewards/rejected": -0.023505253717303276, + "step": 660 + }, + { + "epoch": 0.18, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -2.773609161376953, + "logits/rejected": -2.773920774459839, + "logps/chosen": -1857.786376953125, + "logps/rejected": -1201.499755859375, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022376973181962967, + "rewards/margins": 0.019164763391017914, + "rewards/rejected": -0.04154173657298088, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -2.773653745651245, + "logits/rejected": -2.783552885055542, + "logps/chosen": -1578.38525390625, + "logps/rejected": -1208.565673828125, + "loss": 0.6913, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.01020563580095768, + "rewards/margins": 0.015890780836343765, + "rewards/rejected": -0.026096414774656296, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -2.759702205657959, + "logits/rejected": -2.7624175548553467, + "logps/chosen": -1402.7972412109375, + "logps/rejected": -1246.9429931640625, + "loss": 0.6752, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0054585887119174, + "rewards/margins": 0.015420796349644661, + "rewards/rejected": -0.020879384130239487, + "step": 690 + }, + { + "epoch": 0.18, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -2.7728593349456787, + "logits/rejected": -2.776383876800537, + "logps/chosen": -1766.4078369140625, + "logps/rejected": -1340.6552734375, + "loss": 0.6792, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01140767615288496, + "rewards/margins": 0.010001585818827152, + "rewards/rejected": -0.021409258246421814, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.765734910964966, + "eval_logits/rejected": -2.7592179775238037, + "eval_logps/chosen": -1564.490966796875, + "eval_logps/rejected": -1356.0621337890625, + "eval_loss": 0.6818892359733582, + "eval_rewards/accuracies": 0.5992063283920288, + "eval_rewards/chosen": -0.019420143216848373, + "eval_rewards/margins": 0.024553872644901276, + "eval_rewards/rejected": -0.04397401958703995, + "eval_runtime": 221.9951, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 700 + }, + { + "epoch": 0.19, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -2.780527353286743, + "logits/rejected": -2.7740638256073, + "logps/chosen": -1199.1182861328125, + "logps/rejected": -1168.7066650390625, + "loss": 0.6812, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0392436683177948, + "rewards/margins": 0.012266580015420914, + "rewards/rejected": -0.05151023715734482, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -2.743067979812622, + "logits/rejected": -2.721900224685669, + "logps/chosen": -1631.0428466796875, + "logps/rejected": -1451.7489013671875, + "loss": 0.689, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.027515646070241928, + "rewards/margins": 0.017311880365014076, + "rewards/rejected": -0.044827524572610855, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -2.7711644172668457, + "logits/rejected": -2.7652924060821533, + "logps/chosen": -1598.7469482421875, + "logps/rejected": -1494.751708984375, + "loss": 0.6858, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.016739655286073685, + "rewards/margins": 0.026004815474152565, + "rewards/rejected": -0.0427444651722908, + "step": 730 + }, + { + "epoch": 0.19, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -2.7872672080993652, + "logits/rejected": -2.7747020721435547, + "logps/chosen": -1697.3082275390625, + "logps/rejected": -1134.6143798828125, + "loss": 0.6815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005160794127732515, + "rewards/margins": 0.048423707485198975, + "rewards/rejected": -0.0535845048725605, + "step": 740 + }, + { + "epoch": 0.2, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.790926694869995, + "logits/rejected": -2.770566940307617, + "logps/chosen": -1459.629150390625, + "logps/rejected": -1280.535888671875, + "loss": 0.6791, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.036373939365148544, + "rewards/margins": 0.00627900892868638, + "rewards/rejected": -0.04265294969081879, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -2.7549121379852295, + "logits/rejected": -2.7587850093841553, + "logps/chosen": -1506.0882568359375, + "logps/rejected": -1087.849853515625, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.062040358781814575, + "rewards/margins": 0.02471708320081234, + "rewards/rejected": -0.08675744384527206, + "step": 760 + }, + { + "epoch": 0.2, + "learning_rate": 4.99999347843947e-06, + "logits/chosen": -2.738149642944336, + "logits/rejected": -2.7390432357788086, + "logps/chosen": -1282.12353515625, + "logps/rejected": -1458.2039794921875, + "loss": 0.6781, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029346242547035217, + "rewards/margins": 0.007969383150339127, + "rewards/rejected": -0.037315625697374344, + "step": 770 + }, + { + "epoch": 0.2, + "learning_rate": 4.999941306159375e-06, + "logits/chosen": -2.7938692569732666, + "logits/rejected": -2.78912615776062, + "logps/chosen": -1432.209716796875, + "logps/rejected": -1223.073486328125, + "loss": 0.6693, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023907741531729698, + "rewards/margins": 0.02465171553194523, + "rewards/rejected": -0.04855945706367493, + "step": 780 + }, + { + "epoch": 0.21, + "learning_rate": 4.999836962687967e-06, + "logits/chosen": -2.7668633460998535, + "logits/rejected": -2.784122943878174, + "logps/chosen": -1346.76953125, + "logps/rejected": -1334.90185546875, + "loss": 0.6859, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08885478973388672, + "rewards/margins": 0.02054060809314251, + "rewards/rejected": -0.10939540714025497, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.999680450202786e-06, + "logits/chosen": -2.7835853099823, + "logits/rejected": -2.7726187705993652, + "logps/chosen": -1530.6263427734375, + "logps/rejected": -1177.0797119140625, + "loss": 0.6802, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06294900178909302, + "rewards/margins": 0.0048830811865627766, + "rewards/rejected": -0.0678320825099945, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.7611026763916016, + "eval_logits/rejected": -2.755068302154541, + "eval_logps/chosen": -1567.8170166015625, + "eval_logps/rejected": -1359.8597412109375, + "eval_loss": 0.6791194081306458, + "eval_rewards/accuracies": 0.5813491940498352, + "eval_rewards/chosen": -0.05267925187945366, + "eval_rewards/margins": 0.02927049808204174, + "eval_rewards/rejected": -0.08194974809885025, + "eval_runtime": 222.0481, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 0.284, + "step": 800 + }, + { + "epoch": 0.21, + "learning_rate": 4.999471771970087e-06, + "logits/chosen": -2.7775638103485107, + "logits/rejected": -2.7644567489624023, + "logps/chosen": -1436.937255859375, + "logps/rejected": -1275.5181884765625, + "loss": 0.6842, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07279221713542938, + "rewards/margins": 0.000938097364269197, + "rewards/rejected": -0.07373031228780746, + "step": 810 + }, + { + "epoch": 0.21, + "learning_rate": 4.999210932344767e-06, + "logits/chosen": -2.761268377304077, + "logits/rejected": -2.7647528648376465, + "logps/chosen": -1623.6156005859375, + "logps/rejected": -1342.4853515625, + "loss": 0.6801, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.033265478909015656, + "rewards/margins": 0.06735644489526749, + "rewards/rejected": -0.10062190145254135, + "step": 820 + }, + { + "epoch": 0.22, + "learning_rate": 4.998897936770281e-06, + "logits/chosen": -2.695216655731201, + "logits/rejected": -2.7059473991394043, + "logps/chosen": -1521.4244384765625, + "logps/rejected": -1069.4229736328125, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05603489279747009, + "rewards/margins": 0.02252401039004326, + "rewards/rejected": -0.07855890691280365, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.998532791778521e-06, + "logits/chosen": -2.764151096343994, + "logits/rejected": -2.7464096546173096, + "logps/chosen": -1710.6044921875, + "logps/rejected": -1356.273681640625, + "loss": 0.6778, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053830236196517944, + "rewards/margins": 0.028797442093491554, + "rewards/rejected": -0.08262769132852554, + "step": 840 + }, + { + "epoch": 0.22, + "learning_rate": 4.9981155049896885e-06, + "logits/chosen": -2.762856960296631, + "logits/rejected": -2.757084846496582, + "logps/chosen": -1510.125244140625, + "logps/rejected": -1310.7681884765625, + "loss": 0.6705, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02246144786477089, + "rewards/margins": 0.04654809087514877, + "rewards/rejected": -0.06900953501462936, + "step": 850 + }, + { + "epoch": 0.23, + "learning_rate": 4.997646085112126e-06, + "logits/chosen": -2.7250123023986816, + "logits/rejected": -2.733142137527466, + "logps/chosen": -1746.1890869140625, + "logps/rejected": -1475.557373046875, + "loss": 0.6779, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03953806310892105, + "rewards/margins": 0.04774565249681473, + "rewards/rejected": -0.08728370070457458, + "step": 860 + }, + { + "epoch": 0.23, + "learning_rate": 4.997124541942141e-06, + "logits/chosen": -2.733189105987549, + "logits/rejected": -2.755445957183838, + "logps/chosen": -1365.820556640625, + "logps/rejected": -1418.225830078125, + "loss": 0.6864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.041343413293361664, + "rewards/margins": 0.023359699174761772, + "rewards/rejected": -0.06470310688018799, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.996550886363801e-06, + "logits/chosen": -2.7360308170318604, + "logits/rejected": -2.7451579570770264, + "logps/chosen": -1361.407958984375, + "logps/rejected": -1424.7965087890625, + "loss": 0.6817, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03000304475426674, + "rewards/margins": 0.032740574330091476, + "rewards/rejected": -0.06274361908435822, + "step": 880 + }, + { + "epoch": 0.23, + "learning_rate": 4.995925130348706e-06, + "logits/chosen": -2.7535760402679443, + "logits/rejected": -2.753044366836548, + "logps/chosen": -1625.3104248046875, + "logps/rejected": -1407.3009033203125, + "loss": 0.6788, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.03740937262773514, + "rewards/margins": 0.04210934415459633, + "rewards/rejected": -0.07951872050762177, + "step": 890 + }, + { + "epoch": 0.24, + "learning_rate": 4.995247286955734e-06, + "logits/chosen": -2.752267837524414, + "logits/rejected": -2.7573249340057373, + "logps/chosen": -1691.3726806640625, + "logps/rejected": -1371.1131591796875, + "loss": 0.6812, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03671065717935562, + "rewards/margins": 0.020868580788373947, + "rewards/rejected": -0.05757923796772957, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.7654597759246826, + "eval_logits/rejected": -2.758779525756836, + "eval_logps/chosen": -1566.5771484375, + "eval_logps/rejected": -1359.92431640625, + "eval_loss": 0.6772189140319824, + "eval_rewards/accuracies": 0.5714285969734192, + "eval_rewards/chosen": -0.04028034210205078, + "eval_rewards/margins": 0.04231574013829231, + "eval_rewards/rejected": -0.08259608596563339, + "eval_runtime": 222.1282, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 0.284, + "step": 900 + }, + { + "epoch": 0.24, + "learning_rate": 4.994517370330779e-06, + "logits/chosen": -2.726576566696167, + "logits/rejected": -2.7278590202331543, + "logps/chosen": -1630.087646484375, + "logps/rejected": -1243.8406982421875, + "loss": 0.6606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008724043145775795, + "rewards/margins": 0.08514805883169174, + "rewards/rejected": -0.09387209266424179, + "step": 910 + }, + { + "epoch": 0.24, + "learning_rate": 4.993735395706446e-06, + "logits/chosen": -2.746229648590088, + "logits/rejected": -2.7480199337005615, + "logps/chosen": -1577.779052734375, + "logps/rejected": -1403.44970703125, + "loss": 0.6856, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.06129397824406624, + "rewards/margins": 0.010011469945311546, + "rewards/rejected": -0.07130544632673264, + "step": 920 + }, + { + "epoch": 0.24, + "learning_rate": 4.992901379401737e-06, + "logits/chosen": -2.7393672466278076, + "logits/rejected": -2.749816417694092, + "logps/chosen": -1264.5728759765625, + "logps/rejected": -1142.581787109375, + "loss": 0.6765, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.052227932959795, + "rewards/margins": 0.07175948470830917, + "rewards/rejected": -0.12398741394281387, + "step": 930 + }, + { + "epoch": 0.25, + "learning_rate": 4.992015338821711e-06, + "logits/chosen": -2.7358882427215576, + "logits/rejected": -2.728848934173584, + "logps/chosen": -1451.732666015625, + "logps/rejected": -1161.045654296875, + "loss": 0.6771, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06770779937505722, + "rewards/margins": 0.08663908392190933, + "rewards/rejected": -0.15434686839580536, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.991077292457117e-06, + "logits/chosen": -2.7200193405151367, + "logits/rejected": -2.7250800132751465, + "logps/chosen": -1698.7503662109375, + "logps/rejected": -1226.952880859375, + "loss": 0.6747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09897660464048386, + "rewards/margins": 0.03143421933054924, + "rewards/rejected": -0.1304108202457428, + "step": 950 + }, + { + "epoch": 0.25, + "learning_rate": 4.990087259884016e-06, + "logits/chosen": -2.7504963874816895, + "logits/rejected": -2.7447590827941895, + "logps/chosen": -1224.8485107421875, + "logps/rejected": -1163.18408203125, + "loss": 0.6793, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09718232601881027, + "rewards/margins": 0.026497045531868935, + "rewards/rejected": -0.12367937713861465, + "step": 960 + }, + { + "epoch": 0.25, + "learning_rate": 4.989045261763362e-06, + "logits/chosen": -2.722668409347534, + "logits/rejected": -2.7087206840515137, + "logps/chosen": -1633.8773193359375, + "logps/rejected": -1421.2122802734375, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10039496421813965, + "rewards/margins": 0.020698342472314835, + "rewards/rejected": -0.12109329551458359, + "step": 970 + }, + { + "epoch": 0.26, + "learning_rate": 4.98795131984058e-06, + "logits/chosen": -2.7724738121032715, + "logits/rejected": -2.7552850246429443, + "logps/chosen": -1841.216796875, + "logps/rejected": -1583.920654296875, + "loss": 0.6713, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006210956256836653, + "rewards/margins": 0.09263283014297485, + "rewards/rejected": -0.08642186224460602, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.986805456945107e-06, + "logits/chosen": -2.721271514892578, + "logits/rejected": -2.737414836883545, + "logps/chosen": -1505.755859375, + "logps/rejected": -1432.326416015625, + "loss": 0.6811, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07596326619386673, + "rewards/margins": 0.058528609573841095, + "rewards/rejected": -0.13449189066886902, + "step": 990 + }, + { + "epoch": 0.26, + "learning_rate": 4.985607696989919e-06, + "logits/chosen": -2.7340989112854004, + "logits/rejected": -2.7340407371520996, + "logps/chosen": -1303.5263671875, + "logps/rejected": -1050.8271484375, + "loss": 0.6714, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06350848078727722, + "rewards/margins": 0.08284474909305573, + "rewards/rejected": -0.14635322988033295, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.7476096153259277, + "eval_logits/rejected": -2.741795539855957, + "eval_logps/chosen": -1571.4063720703125, + "eval_logps/rejected": -1365.27587890625, + "eval_loss": 0.6746096014976501, + "eval_rewards/accuracies": 0.5714285969734192, + "eval_rewards/chosen": -0.08857344090938568, + "eval_rewards/margins": 0.04753944277763367, + "eval_rewards/rejected": -0.13611288368701935, + "eval_runtime": 222.1714, + "eval_samples_per_second": 9.002, + "eval_steps_per_second": 0.284, + "step": 1000 + }, + { + "epoch": 0.26, + "learning_rate": 4.984358064971026e-06, + "logits/chosen": -2.7566912174224854, + "logits/rejected": -2.7654871940612793, + "logps/chosen": -1185.7261962890625, + "logps/rejected": -1369.632080078125, + "loss": 0.6657, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.050084006041288376, + "rewards/margins": 0.06669258326292038, + "rewards/rejected": -0.11677658557891846, + "step": 1010 + }, + { + "epoch": 0.27, + "learning_rate": 4.983056586966958e-06, + "logits/chosen": -2.7586092948913574, + "logits/rejected": -2.7438788414001465, + "logps/chosen": -1385.3212890625, + "logps/rejected": -1269.158935546875, + "loss": 0.6855, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.10389542579650879, + "rewards/margins": 0.043193086981773376, + "rewards/rejected": -0.14708851277828217, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.981703290138215e-06, + "logits/chosen": -2.7322840690612793, + "logits/rejected": -2.7069272994995117, + "logps/chosen": -1495.0472412109375, + "logps/rejected": -1233.1636962890625, + "loss": 0.6619, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0797240361571312, + "rewards/margins": 0.03941266983747482, + "rewards/rejected": -0.11913671344518661, + "step": 1030 + }, + { + "epoch": 0.27, + "learning_rate": 4.980298202726706e-06, + "logits/chosen": -2.7719852924346924, + "logits/rejected": -2.77016544342041, + "logps/chosen": -1452.3310546875, + "logps/rejected": -1250.4979248046875, + "loss": 0.6874, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.09233850240707397, + "rewards/margins": 0.043667055666446686, + "rewards/rejected": -0.13600555062294006, + "step": 1040 + }, + { + "epoch": 0.27, + "learning_rate": 4.978841354055148e-06, + "logits/chosen": -2.734018325805664, + "logits/rejected": -2.733374834060669, + "logps/chosen": -1476.0380859375, + "logps/rejected": -1346.449951171875, + "loss": 0.6578, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.022154351696372032, + "rewards/margins": 0.14931711554527283, + "rewards/rejected": -0.17147144675254822, + "step": 1050 + }, + { + "epoch": 0.28, + "learning_rate": 4.977332774526471e-06, + "logits/chosen": -2.7099456787109375, + "logits/rejected": -2.7068095207214355, + "logps/chosen": -1252.1337890625, + "logps/rejected": -1303.8621826171875, + "loss": 0.6841, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1030571460723877, + "rewards/margins": 0.04800555855035782, + "rewards/rejected": -0.1510627120733261, + "step": 1060 + }, + { + "epoch": 0.28, + "learning_rate": 4.97577249562317e-06, + "logits/chosen": -2.7273364067077637, + "logits/rejected": -2.71481990814209, + "logps/chosen": -1782.357666015625, + "logps/rejected": -1272.0413818359375, + "loss": 0.6679, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04302068054676056, + "rewards/margins": 0.08528807759284973, + "rewards/rejected": -0.12830877304077148, + "step": 1070 + }, + { + "epoch": 0.28, + "learning_rate": 4.974160549906652e-06, + "logits/chosen": -2.7501015663146973, + "logits/rejected": -2.725672960281372, + "logps/chosen": -1465.035888671875, + "logps/rejected": -1474.959228515625, + "loss": 0.6817, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11869201809167862, + "rewards/margins": 0.05353887006640434, + "rewards/rejected": -0.17223089933395386, + "step": 1080 + }, + { + "epoch": 0.29, + "learning_rate": 4.972496971016559e-06, + "logits/chosen": -2.7162396907806396, + "logits/rejected": -2.726938486099243, + "logps/chosen": -1366.2523193359375, + "logps/rejected": -1355.443603515625, + "loss": 0.6651, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.052674632519483566, + "rewards/margins": 0.05954190343618393, + "rewards/rejected": -0.11221654713153839, + "step": 1090 + }, + { + "epoch": 0.29, + "learning_rate": 4.9707817936700635e-06, + "logits/chosen": -2.7875306606292725, + "logits/rejected": -2.7665858268737793, + "logps/chosen": -1312.6688232421875, + "logps/rejected": -1187.6529541015625, + "loss": 0.676, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.14929260313510895, + "rewards/margins": -0.008267087861895561, + "rewards/rejected": -0.14102551341056824, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.7504940032958984, + "eval_logits/rejected": -2.7433109283447266, + "eval_logps/chosen": -1573.961669921875, + "eval_logps/rejected": -1368.9942626953125, + "eval_loss": 0.6743620038032532, + "eval_rewards/accuracies": 0.5892857313156128, + "eval_rewards/chosen": -0.11412478238344193, + "eval_rewards/margins": 0.05917017161846161, + "eval_rewards/rejected": -0.17329494655132294, + "eval_runtime": 221.9771, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 1100 + }, + { + "epoch": 0.29, + "learning_rate": 4.969015053661142e-06, + "logits/chosen": -2.769805669784546, + "logits/rejected": -2.75061297416687, + "logps/chosen": -1684.818603515625, + "logps/rejected": -1462.84033203125, + "loss": 0.668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07076840102672577, + "rewards/margins": 0.1183507889509201, + "rewards/rejected": -0.18911918997764587, + "step": 1110 + }, + { + "epoch": 0.29, + "learning_rate": 4.967196787859835e-06, + "logits/chosen": -2.7415719032287598, + "logits/rejected": -2.7441704273223877, + "logps/chosen": -1657.051025390625, + "logps/rejected": -1375.3583984375, + "loss": 0.6699, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1343439519405365, + "rewards/margins": 0.05747341364622116, + "rewards/rejected": -0.19181737303733826, + "step": 1120 + }, + { + "epoch": 0.3, + "learning_rate": 4.965327034211469e-06, + "logits/chosen": -2.735103130340576, + "logits/rejected": -2.7535860538482666, + "logps/chosen": -1481.1834716796875, + "logps/rejected": -1282.894775390625, + "loss": 0.6657, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08777041733264923, + "rewards/margins": 0.08276239037513733, + "rewards/rejected": -0.17053279280662537, + "step": 1130 + }, + { + "epoch": 0.3, + "learning_rate": 4.96340583173587e-06, + "logits/chosen": -2.752781867980957, + "logits/rejected": -2.717355728149414, + "logps/chosen": -1356.7645263671875, + "logps/rejected": -978.78759765625, + "loss": 0.6581, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08919491618871689, + "rewards/margins": 0.07804575562477112, + "rewards/rejected": -0.1672406792640686, + "step": 1140 + }, + { + "epoch": 0.3, + "learning_rate": 4.96143322052655e-06, + "logits/chosen": -2.7316513061523438, + "logits/rejected": -2.7153308391571045, + "logps/chosen": -1602.01513671875, + "logps/rejected": -1217.658935546875, + "loss": 0.6696, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08242569863796234, + "rewards/margins": 0.12858238816261292, + "rewards/rejected": -0.21100810170173645, + "step": 1150 + }, + { + "epoch": 0.3, + "learning_rate": 4.959409241749864e-06, + "logits/chosen": -2.7121639251708984, + "logits/rejected": -2.709986448287964, + "logps/chosen": -1387.0302734375, + "logps/rejected": -1208.765380859375, + "loss": 0.665, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15488040447235107, + "rewards/margins": 0.045979466289281845, + "rewards/rejected": -0.20085985958576202, + "step": 1160 + }, + { + "epoch": 0.31, + "learning_rate": 4.957333937644159e-06, + "logits/chosen": -2.7244646549224854, + "logits/rejected": -2.733808994293213, + "logps/chosen": -1502.994873046875, + "logps/rejected": -1184.745849609375, + "loss": 0.6854, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.18733811378479004, + "rewards/margins": 0.021027732640504837, + "rewards/rejected": -0.20836582779884338, + "step": 1170 + }, + { + "epoch": 0.31, + "learning_rate": 4.955207351518885e-06, + "logits/chosen": -2.7539525032043457, + "logits/rejected": -2.7505831718444824, + "logps/chosen": -1319.450927734375, + "logps/rejected": -1050.664794921875, + "loss": 0.6786, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13988730311393738, + "rewards/margins": 0.050179190933704376, + "rewards/rejected": -0.19006650149822235, + "step": 1180 + }, + { + "epoch": 0.31, + "learning_rate": 4.953029527753699e-06, + "logits/chosen": -2.720780611038208, + "logits/rejected": -2.7177436351776123, + "logps/chosen": -1440.0672607421875, + "logps/rejected": -1173.736083984375, + "loss": 0.6724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10402397066354752, + "rewards/margins": 0.10216061025857925, + "rewards/rejected": -0.20618458092212677, + "step": 1190 + }, + { + "epoch": 0.31, + "learning_rate": 4.95080051179753e-06, + "logits/chosen": -2.7565178871154785, + "logits/rejected": -2.74467396736145, + "logps/chosen": -1439.5458984375, + "logps/rejected": -1342.901611328125, + "loss": 0.6779, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.11544078588485718, + "rewards/margins": 0.02371135726571083, + "rewards/rejected": -0.1391521394252777, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.7510786056518555, + "eval_logits/rejected": -2.743079423904419, + "eval_logps/chosen": -1573.1090087890625, + "eval_logps/rejected": -1368.6934814453125, + "eval_loss": 0.6702868938446045, + "eval_rewards/accuracies": 0.5932539701461792, + "eval_rewards/chosen": -0.10559960454702377, + "eval_rewards/margins": 0.06468784809112549, + "eval_rewards/rejected": -0.17028746008872986, + "eval_runtime": 221.8656, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 0.284, + "step": 1200 + }, + { + "epoch": 0.32, + "learning_rate": 4.948520350167637e-06, + "logits/chosen": -2.7409512996673584, + "logits/rejected": -2.7256882190704346, + "logps/chosen": -1389.34033203125, + "logps/rejected": -1342.9583740234375, + "loss": 0.6709, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11397924274206161, + "rewards/margins": 0.09994085133075714, + "rewards/rejected": -0.21392011642456055, + "step": 1210 + }, + { + "epoch": 0.32, + "learning_rate": 4.946189090448639e-06, + "logits/chosen": -2.728647470474243, + "logits/rejected": -2.724257707595825, + "logps/chosen": -1304.4896240234375, + "logps/rejected": -1695.031494140625, + "loss": 0.675, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12990212440490723, + "rewards/margins": 0.12769030034542084, + "rewards/rejected": -0.25759243965148926, + "step": 1220 + }, + { + "epoch": 0.32, + "learning_rate": 4.943806781291515e-06, + "logits/chosen": -2.717841386795044, + "logits/rejected": -2.705570697784424, + "logps/chosen": -1433.613525390625, + "logps/rejected": -1162.338134765625, + "loss": 0.648, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.10121190547943115, + "rewards/margins": 0.07192480564117432, + "rewards/rejected": -0.17313668131828308, + "step": 1230 + }, + { + "epoch": 0.32, + "learning_rate": 4.941373472412595e-06, + "logits/chosen": -2.719426155090332, + "logits/rejected": -2.707892894744873, + "logps/chosen": -1468.894775390625, + "logps/rejected": -1463.6505126953125, + "loss": 0.6706, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09925106167793274, + "rewards/margins": 0.07948430627584457, + "rewards/rejected": -0.17873536050319672, + "step": 1240 + }, + { + "epoch": 0.33, + "learning_rate": 4.938889214592521e-06, + "logits/chosen": -2.7018826007843018, + "logits/rejected": -2.7066359519958496, + "logps/chosen": -1199.9141845703125, + "logps/rejected": -1288.7686767578125, + "loss": 0.6584, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.14441849291324615, + "rewards/margins": 0.06486638635396957, + "rewards/rejected": -0.20928487181663513, + "step": 1250 + }, + { + "epoch": 0.33, + "learning_rate": 4.936354059675186e-06, + "logits/chosen": -2.7446064949035645, + "logits/rejected": -2.7585690021514893, + "logps/chosen": -1435.941162109375, + "logps/rejected": -1301.364501953125, + "loss": 0.6616, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1579599678516388, + "rewards/margins": 0.09013622999191284, + "rewards/rejected": -0.24809618294239044, + "step": 1260 + }, + { + "epoch": 0.33, + "learning_rate": 4.933768060566654e-06, + "logits/chosen": -2.7097179889678955, + "logits/rejected": -2.712663412094116, + "logps/chosen": -1599.5322265625, + "logps/rejected": -1429.913330078125, + "loss": 0.6781, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.10585727542638779, + "rewards/margins": 0.09449507296085358, + "rewards/rejected": -0.20035234093666077, + "step": 1270 + }, + { + "epoch": 0.33, + "learning_rate": 4.931131271234052e-06, + "logits/chosen": -2.702580451965332, + "logits/rejected": -2.705609083175659, + "logps/chosen": -2137.19091796875, + "logps/rejected": -1472.0059814453125, + "loss": 0.6608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15121182799339294, + "rewards/margins": 0.07554563879966736, + "rewards/rejected": -0.2267574816942215, + "step": 1280 + }, + { + "epoch": 0.34, + "learning_rate": 4.928443746704448e-06, + "logits/chosen": -2.7274062633514404, + "logits/rejected": -2.7447800636291504, + "logps/chosen": -1391.8414306640625, + "logps/rejected": -1164.763916015625, + "loss": 0.6842, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1704031229019165, + "rewards/margins": 0.01789279095828533, + "rewards/rejected": -0.1882958859205246, + "step": 1290 + }, + { + "epoch": 0.34, + "learning_rate": 4.925705543063703e-06, + "logits/chosen": -2.7310848236083984, + "logits/rejected": -2.7340548038482666, + "logps/chosen": -1622.049560546875, + "logps/rejected": -1287.524658203125, + "loss": 0.6888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12351367622613907, + "rewards/margins": 0.08872373402118683, + "rewards/rejected": -0.2122374027967453, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.745222568511963, + "eval_logits/rejected": -2.7375075817108154, + "eval_logps/chosen": -1573.912109375, + "eval_logps/rejected": -1370.159912109375, + "eval_loss": 0.6676135659217834, + "eval_rewards/accuracies": 0.5972222089767456, + "eval_rewards/chosen": -0.1136305034160614, + "eval_rewards/margins": 0.07132188230752945, + "eval_rewards/rejected": -0.18495237827301025, + "eval_runtime": 221.931, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 1300 + }, + { + "epoch": 0.34, + "learning_rate": 4.922916717455297e-06, + "logits/chosen": -2.7452569007873535, + "logits/rejected": -2.7377490997314453, + "logps/chosen": -1249.689697265625, + "logps/rejected": -1024.716064453125, + "loss": 0.6612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11122635751962662, + "rewards/margins": 0.10638532787561417, + "rewards/rejected": -0.2176116704940796, + "step": 1310 + }, + { + "epoch": 0.35, + "learning_rate": 4.920077328079136e-06, + "logits/chosen": -2.7663183212280273, + "logits/rejected": -2.7657904624938965, + "logps/chosen": -1489.3701171875, + "logps/rejected": -1184.8206787109375, + "loss": 0.6627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09591363370418549, + "rewards/margins": 0.10784796625375748, + "rewards/rejected": -0.20376160740852356, + "step": 1320 + }, + { + "epoch": 0.35, + "learning_rate": 4.9171874341903445e-06, + "logits/chosen": -2.7501184940338135, + "logits/rejected": -2.7545647621154785, + "logps/chosen": -1703.434814453125, + "logps/rejected": -1121.3775634765625, + "loss": 0.6766, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11108261346817017, + "rewards/margins": 0.11215372383594513, + "rewards/rejected": -0.2232363522052765, + "step": 1330 + }, + { + "epoch": 0.35, + "learning_rate": 4.914247096098019e-06, + "logits/chosen": -2.755143642425537, + "logits/rejected": -2.736085891723633, + "logps/chosen": -1771.996826171875, + "logps/rejected": -1301.4976806640625, + "loss": 0.672, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.15492898225784302, + "rewards/margins": 0.040635328739881516, + "rewards/rejected": -0.19556431472301483, + "step": 1340 + }, + { + "epoch": 0.35, + "learning_rate": 4.911256375163977e-06, + "logits/chosen": -2.7278788089752197, + "logits/rejected": -2.7153897285461426, + "logps/chosen": -1302.1484375, + "logps/rejected": -1428.9267578125, + "loss": 0.6697, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1328674554824829, + "rewards/margins": 0.05615769699215889, + "rewards/rejected": -0.1890251487493515, + "step": 1350 + }, + { + "epoch": 0.36, + "learning_rate": 4.908215333801474e-06, + "logits/chosen": -2.714822292327881, + "logits/rejected": -2.7064924240112305, + "logps/chosen": -1233.5889892578125, + "logps/rejected": -1167.036376953125, + "loss": 0.6868, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.16341665387153625, + "rewards/margins": 0.04497160390019417, + "rewards/rejected": -0.20838825404644012, + "step": 1360 + }, + { + "epoch": 0.36, + "learning_rate": 4.9051240354739004e-06, + "logits/chosen": -2.747804880142212, + "logits/rejected": -2.756502866744995, + "logps/chosen": -1667.056640625, + "logps/rejected": -1420.6734619140625, + "loss": 0.6721, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14670798182487488, + "rewards/margins": 0.09335624426603317, + "rewards/rejected": -0.24006421864032745, + "step": 1370 + }, + { + "epoch": 0.36, + "learning_rate": 4.901982544693457e-06, + "logits/chosen": -2.784402370452881, + "logits/rejected": -2.7820801734924316, + "logps/chosen": -1775.5872802734375, + "logps/rejected": -1598.1461181640625, + "loss": 0.6679, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08600615710020065, + "rewards/margins": 0.07457789778709412, + "rewards/rejected": -0.16058406233787537, + "step": 1380 + }, + { + "epoch": 0.36, + "learning_rate": 4.898790927019809e-06, + "logits/chosen": -2.728027820587158, + "logits/rejected": -2.7303080558776855, + "logps/chosen": -1424.24755859375, + "logps/rejected": -1356.0853271484375, + "loss": 0.672, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1718912571668625, + "rewards/margins": 0.11604329198598862, + "rewards/rejected": -0.2879345417022705, + "step": 1390 + }, + { + "epoch": 0.37, + "learning_rate": 4.895549249058718e-06, + "logits/chosen": -2.6968743801116943, + "logits/rejected": -2.695655584335327, + "logps/chosen": -1699.045166015625, + "logps/rejected": -1486.0692138671875, + "loss": 0.6664, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12043057382106781, + "rewards/margins": 0.08103757351636887, + "rewards/rejected": -0.20146813988685608, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.737523317337036, + "eval_logits/rejected": -2.730177879333496, + "eval_logps/chosen": -1576.802734375, + "eval_logps/rejected": -1373.31103515625, + "eval_loss": 0.6669156551361084, + "eval_rewards/accuracies": 0.6071428656578064, + "eval_rewards/chosen": -0.1425366997718811, + "eval_rewards/margins": 0.07392816990613937, + "eval_rewards/rejected": -0.21646487712860107, + "eval_runtime": 221.9953, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 1400 + }, + { + "epoch": 0.37, + "learning_rate": 4.892257578460656e-06, + "logits/chosen": -2.736672878265381, + "logits/rejected": -2.731189250946045, + "logps/chosen": -1334.677001953125, + "logps/rejected": -1220.300048828125, + "loss": 0.6567, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19311638176441193, + "rewards/margins": 0.0334378182888031, + "rewards/rejected": -0.22655422985553741, + "step": 1410 + }, + { + "epoch": 0.37, + "learning_rate": 4.888915983919383e-06, + "logits/chosen": -2.751110553741455, + "logits/rejected": -2.7499165534973145, + "logps/chosen": -1545.2374267578125, + "logps/rejected": -1497.2535400390625, + "loss": 0.6665, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17348986864089966, + "rewards/margins": 0.014758164063096046, + "rewards/rejected": -0.18824802339076996, + "step": 1420 + }, + { + "epoch": 0.37, + "learning_rate": 4.885524535170525e-06, + "logits/chosen": -2.716099500656128, + "logits/rejected": -2.7249863147735596, + "logps/chosen": -1443.5123291015625, + "logps/rejected": -1279.0321044921875, + "loss": 0.6687, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19876326620578766, + "rewards/margins": 0.085506372153759, + "rewards/rejected": -0.28426963090896606, + "step": 1430 + }, + { + "epoch": 0.38, + "learning_rate": 4.882083302990113e-06, + "logits/chosen": -2.7442541122436523, + "logits/rejected": -2.7311859130859375, + "logps/chosen": -1410.497802734375, + "logps/rejected": -1370.2880859375, + "loss": 0.6697, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1760236769914627, + "rewards/margins": 0.07774855941534042, + "rewards/rejected": -0.25377222895622253, + "step": 1440 + }, + { + "epoch": 0.38, + "learning_rate": 4.878592359193104e-06, + "logits/chosen": -2.7766852378845215, + "logits/rejected": -2.7610340118408203, + "logps/chosen": -1116.6553955078125, + "logps/rejected": -992.8492431640625, + "loss": 0.6491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10915567725896835, + "rewards/margins": 0.18461118638515472, + "rewards/rejected": -0.29376688599586487, + "step": 1450 + }, + { + "epoch": 0.38, + "learning_rate": 4.875051776631888e-06, + "logits/chosen": -2.7420220375061035, + "logits/rejected": -2.7181191444396973, + "logps/chosen": -2061.544921875, + "logps/rejected": -1855.9967041015625, + "loss": 0.6683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14548955857753754, + "rewards/margins": 0.05113809555768967, + "rewards/rejected": -0.1966276466846466, + "step": 1460 + }, + { + "epoch": 0.38, + "learning_rate": 4.871461629194764e-06, + "logits/chosen": -2.7094175815582275, + "logits/rejected": -2.7036585807800293, + "logps/chosen": -1583.8709716796875, + "logps/rejected": -1170.502685546875, + "loss": 0.67, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24922314286231995, + "rewards/margins": 0.04640321061015129, + "rewards/rejected": -0.29562637209892273, + "step": 1470 + }, + { + "epoch": 0.39, + "learning_rate": 4.8678219918043984e-06, + "logits/chosen": -2.713094711303711, + "logits/rejected": -2.7234363555908203, + "logps/chosen": -1121.4586181640625, + "logps/rejected": -1276.220458984375, + "loss": 0.6404, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18884606659412384, + "rewards/margins": 0.07648530602455139, + "rewards/rejected": -0.26533135771751404, + "step": 1480 + }, + { + "epoch": 0.39, + "learning_rate": 4.864132940416262e-06, + "logits/chosen": -2.7564730644226074, + "logits/rejected": -2.7708687782287598, + "logps/chosen": -1380.2691650390625, + "logps/rejected": -1315.7677001953125, + "loss": 0.6967, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.24362365901470184, + "rewards/margins": 0.0016543098026886582, + "rewards/rejected": -0.24527797102928162, + "step": 1490 + }, + { + "epoch": 0.39, + "learning_rate": 4.860394552017044e-06, + "logits/chosen": -2.774509906768799, + "logits/rejected": -2.7636191844940186, + "logps/chosen": -1612.9970703125, + "logps/rejected": -1335.4024658203125, + "loss": 0.6705, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1847611963748932, + "rewards/margins": 0.07922552525997162, + "rewards/rejected": -0.263986736536026, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.7545900344848633, + "eval_logits/rejected": -2.748063802719116, + "eval_logps/chosen": -1580.59130859375, + "eval_logps/rejected": -1378.6722412109375, + "eval_loss": 0.666502058506012, + "eval_rewards/accuracies": 0.6071428656578064, + "eval_rewards/chosen": -0.18042320013046265, + "eval_rewards/margins": 0.0896516814827919, + "eval_rewards/rejected": -0.27007487416267395, + "eval_runtime": 222.0107, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 1500 + }, + { + "epoch": 0.4, + "learning_rate": 4.856606904623047e-06, + "logits/chosen": -2.731475591659546, + "logits/rejected": -2.735136032104492, + "logps/chosen": -1508.71142578125, + "logps/rejected": -1404.9801025390625, + "loss": 0.6814, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19713035225868225, + "rewards/margins": 0.09451910108327866, + "rewards/rejected": -0.2916494607925415, + "step": 1510 + }, + { + "epoch": 0.4, + "learning_rate": 4.852770077278557e-06, + "logits/chosen": -2.726473808288574, + "logits/rejected": -2.717942714691162, + "logps/chosen": -1575.031005859375, + "logps/rejected": -1413.8336181640625, + "loss": 0.6629, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18052199482917786, + "rewards/margins": 0.1278577744960785, + "rewards/rejected": -0.30837976932525635, + "step": 1520 + }, + { + "epoch": 0.4, + "learning_rate": 4.848884150054196e-06, + "logits/chosen": -2.718505382537842, + "logits/rejected": -2.7180285453796387, + "logps/chosen": -1285.03125, + "logps/rejected": -1022.1572265625, + "loss": 0.6622, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2162628471851349, + "rewards/margins": 0.07336204499006271, + "rewards/rejected": -0.2896248698234558, + "step": 1530 + }, + { + "epoch": 0.4, + "learning_rate": 4.8449492040452495e-06, + "logits/chosen": -2.719595432281494, + "logits/rejected": -2.7203640937805176, + "logps/chosen": -1711.162109375, + "logps/rejected": -1532.151611328125, + "loss": 0.6709, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2285035401582718, + "rewards/margins": 0.13888053596019745, + "rewards/rejected": -0.36738404631614685, + "step": 1540 + }, + { + "epoch": 0.41, + "learning_rate": 4.840965321369973e-06, + "logits/chosen": -2.737882137298584, + "logits/rejected": -2.7262706756591797, + "logps/chosen": -1420.25439453125, + "logps/rejected": -1141.2373046875, + "loss": 0.689, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19759336113929749, + "rewards/margins": 0.03492305055260658, + "rewards/rejected": -0.23251643776893616, + "step": 1550 + }, + { + "epoch": 0.41, + "learning_rate": 4.8369325851678795e-06, + "logits/chosen": -2.737092971801758, + "logits/rejected": -2.7480967044830322, + "logps/chosen": -1542.2943115234375, + "logps/rejected": -1442.6156005859375, + "loss": 0.6566, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14410072565078735, + "rewards/margins": 0.07033728063106537, + "rewards/rejected": -0.21443800628185272, + "step": 1560 + }, + { + "epoch": 0.41, + "learning_rate": 4.832851079598007e-06, + "logits/chosen": -2.722095012664795, + "logits/rejected": -2.7370333671569824, + "logps/chosen": -1584.534912109375, + "logps/rejected": -1324.1781005859375, + "loss": 0.673, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2010543793439865, + "rewards/margins": 0.03377969563007355, + "rewards/rejected": -0.23483404517173767, + "step": 1570 + }, + { + "epoch": 0.41, + "learning_rate": 4.828720889837158e-06, + "logits/chosen": -2.731977701187134, + "logits/rejected": -2.7154393196105957, + "logps/chosen": -1614.5052490234375, + "logps/rejected": -1165.898193359375, + "loss": 0.6693, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21724538505077362, + "rewards/margins": 0.11410681903362274, + "rewards/rejected": -0.33135223388671875, + "step": 1580 + }, + { + "epoch": 0.42, + "learning_rate": 4.824542102078125e-06, + "logits/chosen": -2.726020336151123, + "logits/rejected": -2.737281322479248, + "logps/chosen": -1595.6234130859375, + "logps/rejected": -1460.412841796875, + "loss": 0.6655, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13410621881484985, + "rewards/margins": 0.13845457136631012, + "rewards/rejected": -0.2725607752799988, + "step": 1590 + }, + { + "epoch": 0.42, + "learning_rate": 4.820314803527888e-06, + "logits/chosen": -2.759023904800415, + "logits/rejected": -2.7574801445007324, + "logps/chosen": -1377.1361083984375, + "logps/rejected": -1252.229736328125, + "loss": 0.6411, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13430440425872803, + "rewards/margins": 0.14161941409111023, + "rewards/rejected": -0.27592384815216064, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.731712818145752, + "eval_logits/rejected": -2.7248520851135254, + "eval_logps/chosen": -1581.7911376953125, + "eval_logps/rejected": -1378.941650390625, + "eval_loss": 0.6652618646621704, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.1924203485250473, + "eval_rewards/margins": 0.08035055547952652, + "eval_rewards/rejected": -0.2727709412574768, + "eval_runtime": 222.0254, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 1600 + }, + { + "epoch": 0.42, + "learning_rate": 4.816039082405799e-06, + "logits/chosen": -2.7569971084594727, + "logits/rejected": -2.735839366912842, + "logps/chosen": -1658.815673828125, + "logps/rejected": -1198.2847900390625, + "loss": 0.665, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21991725265979767, + "rewards/margins": 0.11259231716394424, + "rewards/rejected": -0.3325095772743225, + "step": 1610 + }, + { + "epoch": 0.42, + "learning_rate": 4.81171502794174e-06, + "logits/chosen": -2.7295005321502686, + "logits/rejected": -2.718212604522705, + "logps/chosen": -1451.7779541015625, + "logps/rejected": -1099.2412109375, + "loss": 0.6443, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11921729892492294, + "rewards/margins": 0.14378681778907776, + "rewards/rejected": -0.2630041241645813, + "step": 1620 + }, + { + "epoch": 0.43, + "learning_rate": 4.8073427303742584e-06, + "logits/chosen": -2.743295907974243, + "logits/rejected": -2.7398316860198975, + "logps/chosen": -1610.1199951171875, + "logps/rejected": -1429.06591796875, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18854033946990967, + "rewards/margins": 0.029284056276082993, + "rewards/rejected": -0.21782438457012177, + "step": 1630 + }, + { + "epoch": 0.43, + "learning_rate": 4.802922280948685e-06, + "logits/chosen": -2.7415010929107666, + "logits/rejected": -2.735572338104248, + "logps/chosen": -1472.3653564453125, + "logps/rejected": -1607.3350830078125, + "loss": 0.6738, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19889327883720398, + "rewards/margins": -0.004034703131765127, + "rewards/rejected": -0.1948585957288742, + "step": 1640 + }, + { + "epoch": 0.43, + "learning_rate": 4.798453771915231e-06, + "logits/chosen": -2.673447370529175, + "logits/rejected": -2.6908771991729736, + "logps/chosen": -1097.9010009765625, + "logps/rejected": -1113.3797607421875, + "loss": 0.6662, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20751328766345978, + "rewards/margins": 0.0491621233522892, + "rewards/rejected": -0.2566754221916199, + "step": 1650 + }, + { + "epoch": 0.43, + "learning_rate": 4.793937296527062e-06, + "logits/chosen": -2.7060468196868896, + "logits/rejected": -2.696061849594116, + "logps/chosen": -1091.1229248046875, + "logps/rejected": -950.5789184570312, + "loss": 0.6589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2101879119873047, + "rewards/margins": 0.07423336803913116, + "rewards/rejected": -0.28442126512527466, + "step": 1660 + }, + { + "epoch": 0.44, + "learning_rate": 4.78937294903835e-06, + "logits/chosen": -2.7246451377868652, + "logits/rejected": -2.7175862789154053, + "logps/chosen": -1581.0328369140625, + "logps/rejected": -1174.4051513671875, + "loss": 0.6562, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.11526259034872055, + "rewards/margins": 0.12269117683172226, + "rewards/rejected": -0.23795375227928162, + "step": 1670 + }, + { + "epoch": 0.44, + "learning_rate": 4.78476082470231e-06, + "logits/chosen": -2.7286760807037354, + "logits/rejected": -2.7100577354431152, + "logps/chosen": -1294.1153564453125, + "logps/rejected": -1143.6041259765625, + "loss": 0.6792, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22615864872932434, + "rewards/margins": 0.050124991685152054, + "rewards/rejected": -0.2762836217880249, + "step": 1680 + }, + { + "epoch": 0.44, + "learning_rate": 4.780101019769212e-06, + "logits/chosen": -2.736978054046631, + "logits/rejected": -2.736088275909424, + "logps/chosen": -1240.342041015625, + "logps/rejected": -1338.625732421875, + "loss": 0.6746, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20585688948631287, + "rewards/margins": 0.09276419878005981, + "rewards/rejected": -0.29862111806869507, + "step": 1690 + }, + { + "epoch": 0.44, + "learning_rate": 4.775393631484368e-06, + "logits/chosen": -2.7205305099487305, + "logits/rejected": -2.7118542194366455, + "logps/chosen": -1580.404541015625, + "logps/rejected": -1334.212646484375, + "loss": 0.665, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1873815357685089, + "rewards/margins": 0.06995360553264618, + "rewards/rejected": -0.2573351263999939, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.742210626602173, + "eval_logits/rejected": -2.7354586124420166, + "eval_logps/chosen": -1582.2147216796875, + "eval_logps/rejected": -1379.5565185546875, + "eval_loss": 0.6644400358200073, + "eval_rewards/accuracies": 0.613095223903656, + "eval_rewards/chosen": -0.19665634632110596, + "eval_rewards/margins": 0.08226174861192703, + "eval_rewards/rejected": -0.2789180874824524, + "eval_runtime": 221.9276, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 1700 + }, + { + "epoch": 0.45, + "learning_rate": 4.770638758086105e-06, + "logits/chosen": -2.7356081008911133, + "logits/rejected": -2.731968641281128, + "logps/chosen": -1441.8275146484375, + "logps/rejected": -1499.3004150390625, + "loss": 0.6697, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2256285697221756, + "rewards/margins": 0.053405504673719406, + "rewards/rejected": -0.2790340781211853, + "step": 1710 + }, + { + "epoch": 0.45, + "learning_rate": 4.7658364988037184e-06, + "logits/chosen": -2.7387094497680664, + "logits/rejected": -2.7272191047668457, + "logps/chosen": -1506.8668212890625, + "logps/rejected": -1364.9649658203125, + "loss": 0.6646, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1541571319103241, + "rewards/margins": 0.14272871613502502, + "rewards/rejected": -0.2968858480453491, + "step": 1720 + }, + { + "epoch": 0.45, + "learning_rate": 4.760986953855395e-06, + "logits/chosen": -2.7447781562805176, + "logits/rejected": -2.731154203414917, + "logps/chosen": -1475.6947021484375, + "logps/rejected": -1100.525146484375, + "loss": 0.6701, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2217240035533905, + "rewards/margins": 0.06821813434362411, + "rewards/rejected": -0.2899421155452728, + "step": 1730 + }, + { + "epoch": 0.46, + "learning_rate": 4.756090224446127e-06, + "logits/chosen": -2.780522108078003, + "logits/rejected": -2.782181978225708, + "logps/chosen": -1490.47412109375, + "logps/rejected": -1478.94775390625, + "loss": 0.676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25417935848236084, + "rewards/margins": 0.084381103515625, + "rewards/rejected": -0.3385604918003082, + "step": 1740 + }, + { + "epoch": 0.46, + "learning_rate": 4.7511464127655945e-06, + "logits/chosen": -2.737290143966675, + "logits/rejected": -2.7433362007141113, + "logps/chosen": -1266.33056640625, + "logps/rejected": -1185.47802734375, + "loss": 0.6604, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19823206961154938, + "rewards/margins": 0.0937265157699585, + "rewards/rejected": -0.29195863008499146, + "step": 1750 + }, + { + "epoch": 0.46, + "learning_rate": 4.74615562198604e-06, + "logits/chosen": -2.7513489723205566, + "logits/rejected": -2.7504875659942627, + "logps/chosen": -1600.952880859375, + "logps/rejected": -1202.5400390625, + "loss": 0.6775, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19948622584342957, + "rewards/margins": 0.025036226958036423, + "rewards/rejected": -0.2245224416255951, + "step": 1760 + }, + { + "epoch": 0.46, + "learning_rate": 4.741117956260107e-06, + "logits/chosen": -2.7026009559631348, + "logits/rejected": -2.7085928916931152, + "logps/chosen": -1406.4818115234375, + "logps/rejected": -1320.4462890625, + "loss": 0.6576, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22521424293518066, + "rewards/margins": 0.03839210420846939, + "rewards/rejected": -0.26360636949539185, + "step": 1770 + }, + { + "epoch": 0.47, + "learning_rate": 4.736033520718672e-06, + "logits/chosen": -2.690847158432007, + "logits/rejected": -2.710789918899536, + "logps/chosen": -1289.645263671875, + "logps/rejected": -1037.477294921875, + "loss": 0.6612, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21732059121131897, + "rewards/margins": 0.03371669352054596, + "rewards/rejected": -0.25103726983070374, + "step": 1780 + }, + { + "epoch": 0.47, + "learning_rate": 4.730902421468652e-06, + "logits/chosen": -2.6835672855377197, + "logits/rejected": -2.672365665435791, + "logps/chosen": -1312.6485595703125, + "logps/rejected": -1355.291748046875, + "loss": 0.6555, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1490660011768341, + "rewards/margins": 0.14994415640830994, + "rewards/rejected": -0.29901012778282166, + "step": 1790 + }, + { + "epoch": 0.47, + "learning_rate": 4.7257247655907854e-06, + "logits/chosen": -2.7463784217834473, + "logits/rejected": -2.750366449356079, + "logps/chosen": -1203.7630615234375, + "logps/rejected": -1130.248291015625, + "loss": 0.6563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2655632495880127, + "rewards/margins": 0.054849814623594284, + "rewards/rejected": -0.3204130530357361, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.73252534866333, + "eval_logits/rejected": -2.725741147994995, + "eval_logps/chosen": -1583.275146484375, + "eval_logps/rejected": -1381.0634765625, + "eval_loss": 0.6638898849487305, + "eval_rewards/accuracies": 0.6210317611694336, + "eval_rewards/chosen": -0.20726004242897034, + "eval_rewards/margins": 0.08672784268856049, + "eval_rewards/rejected": -0.29398787021636963, + "eval_runtime": 221.9932, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 1800 + }, + { + "epoch": 0.47, + "learning_rate": 4.720500661137397e-06, + "logits/chosen": -2.744943380355835, + "logits/rejected": -2.7473301887512207, + "logps/chosen": -1646.609619140625, + "logps/rejected": -1309.5911865234375, + "loss": 0.6391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18750329315662384, + "rewards/margins": 0.09809277206659317, + "rewards/rejected": -0.2855960726737976, + "step": 1810 + }, + { + "epoch": 0.48, + "learning_rate": 4.71523021713015e-06, + "logits/chosen": -2.7237682342529297, + "logits/rejected": -2.7082934379577637, + "logps/chosen": -1719.270751953125, + "logps/rejected": -1236.6881103515625, + "loss": 0.6651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20721980929374695, + "rewards/margins": 0.06654822826385498, + "rewards/rejected": -0.27376803755760193, + "step": 1820 + }, + { + "epoch": 0.48, + "learning_rate": 4.709913543557761e-06, + "logits/chosen": -2.7400131225585938, + "logits/rejected": -2.740361452102661, + "logps/chosen": -1691.213623046875, + "logps/rejected": -1475.6851806640625, + "loss": 0.6585, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18694502115249634, + "rewards/margins": 0.13363699615001678, + "rewards/rejected": -0.3205820322036743, + "step": 1830 + }, + { + "epoch": 0.48, + "learning_rate": 4.704550751373715e-06, + "logits/chosen": -2.7532076835632324, + "logits/rejected": -2.75807523727417, + "logps/chosen": -1608.8062744140625, + "logps/rejected": -1378.0643310546875, + "loss": 0.6639, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2479935884475708, + "rewards/margins": 0.10084398090839386, + "rewards/rejected": -0.34883755445480347, + "step": 1840 + }, + { + "epoch": 0.48, + "learning_rate": 4.699141952493941e-06, + "logits/chosen": -2.7423033714294434, + "logits/rejected": -2.73795223236084, + "logps/chosen": -1499.7720947265625, + "logps/rejected": -1187.2939453125, + "loss": 0.649, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.23238825798034668, + "rewards/margins": 0.04651142284274101, + "rewards/rejected": -0.2788996994495392, + "step": 1850 + }, + { + "epoch": 0.49, + "learning_rate": 4.6936872597944814e-06, + "logits/chosen": -2.7192182540893555, + "logits/rejected": -2.7208571434020996, + "logps/chosen": -1266.417236328125, + "logps/rejected": -1494.069091796875, + "loss": 0.6431, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2008957415819168, + "rewards/margins": 0.16491912305355072, + "rewards/rejected": -0.3658148944377899, + "step": 1860 + }, + { + "epoch": 0.49, + "learning_rate": 4.688186787109136e-06, + "logits/chosen": -2.7188289165496826, + "logits/rejected": -2.701460361480713, + "logps/chosen": -1499.3271484375, + "logps/rejected": -1498.5401611328125, + "loss": 0.6625, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2466460019350052, + "rewards/margins": 0.04578220099210739, + "rewards/rejected": -0.29242822527885437, + "step": 1870 + }, + { + "epoch": 0.49, + "learning_rate": 4.682640649227085e-06, + "logits/chosen": -2.7496445178985596, + "logits/rejected": -2.7551424503326416, + "logps/chosen": -1668.005126953125, + "logps/rejected": -1245.0250244140625, + "loss": 0.6558, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24418357014656067, + "rewards/margins": 0.12311581522226334, + "rewards/rejected": -0.3672993779182434, + "step": 1880 + }, + { + "epoch": 0.49, + "learning_rate": 4.677048961890492e-06, + "logits/chosen": -2.733494997024536, + "logits/rejected": -2.7331326007843018, + "logps/chosen": -1348.33203125, + "logps/rejected": -1327.357177734375, + "loss": 0.6894, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.31300076842308044, + "rewards/margins": -0.028551051393151283, + "rewards/rejected": -0.2844497561454773, + "step": 1890 + }, + { + "epoch": 0.5, + "learning_rate": 4.671411841792096e-06, + "logits/chosen": -2.7191219329833984, + "logits/rejected": -2.7150659561157227, + "logps/chosen": -1335.385986328125, + "logps/rejected": -1472.392333984375, + "loss": 0.6668, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2669922113418579, + "rewards/margins": 0.07704712450504303, + "rewards/rejected": -0.34403929114341736, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.742595672607422, + "eval_logits/rejected": -2.7350475788116455, + "eval_logps/chosen": -1585.14697265625, + "eval_logps/rejected": -1384.1845703125, + "eval_loss": 0.6620241403579712, + "eval_rewards/accuracies": 0.6170634627342224, + "eval_rewards/chosen": -0.225979283452034, + "eval_rewards/margins": 0.09921804070472717, + "eval_rewards/rejected": -0.3251972794532776, + "eval_runtime": 222.0637, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 1900 + }, + { + "epoch": 0.5, + "learning_rate": 4.665729406572764e-06, + "logits/chosen": -2.7427189350128174, + "logits/rejected": -2.753286838531494, + "logps/chosen": -1317.99560546875, + "logps/rejected": -1335.033203125, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32792624831199646, + "rewards/margins": 0.041652776300907135, + "rewards/rejected": -0.3695790767669678, + "step": 1910 + }, + { + "epoch": 0.5, + "learning_rate": 4.660001774819048e-06, + "logits/chosen": -2.7054402828216553, + "logits/rejected": -2.7036356925964355, + "logps/chosen": -1334.2403564453125, + "logps/rejected": -1241.926513671875, + "loss": 0.6782, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.3110745847225189, + "rewards/margins": 0.018700579181313515, + "rewards/rejected": -0.3297751545906067, + "step": 1920 + }, + { + "epoch": 0.51, + "learning_rate": 4.654229066060702e-06, + "logits/chosen": -2.7333807945251465, + "logits/rejected": -2.742948055267334, + "logps/chosen": -1359.8046875, + "logps/rejected": -1150.7369384765625, + "loss": 0.6598, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23911187052726746, + "rewards/margins": 0.02121734246611595, + "rewards/rejected": -0.2603291869163513, + "step": 1930 + }, + { + "epoch": 0.51, + "learning_rate": 4.648411400768193e-06, + "logits/chosen": -2.73518705368042, + "logits/rejected": -2.725837230682373, + "logps/chosen": -1309.609130859375, + "logps/rejected": -1176.9456787109375, + "loss": 0.6567, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.264873206615448, + "rewards/margins": 0.09074047952890396, + "rewards/rejected": -0.355613648891449, + "step": 1940 + }, + { + "epoch": 0.51, + "learning_rate": 4.642548900350182e-06, + "logits/chosen": -2.7353572845458984, + "logits/rejected": -2.7142481803894043, + "logps/chosen": -1753.3665771484375, + "logps/rejected": -1410.8369140625, + "loss": 0.6604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19214151799678802, + "rewards/margins": 0.10666815936565399, + "rewards/rejected": -0.298809677362442, + "step": 1950 + }, + { + "epoch": 0.51, + "learning_rate": 4.636641687150994e-06, + "logits/chosen": -2.7391467094421387, + "logits/rejected": -2.717474937438965, + "logps/chosen": -1374.9207763671875, + "logps/rejected": -1112.2822265625, + "loss": 0.6682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11961638927459717, + "rewards/margins": 0.14005030691623688, + "rewards/rejected": -0.25966668128967285, + "step": 1960 + }, + { + "epoch": 0.52, + "learning_rate": 4.6306898844480615e-06, + "logits/chosen": -2.7787628173828125, + "logits/rejected": -2.751187324523926, + "logps/chosen": -1711.725830078125, + "logps/rejected": -1327.750732421875, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2270718514919281, + "rewards/margins": 0.06859883666038513, + "rewards/rejected": -0.29567068815231323, + "step": 1970 + }, + { + "epoch": 0.52, + "learning_rate": 4.624693616449358e-06, + "logits/chosen": -2.7466940879821777, + "logits/rejected": -2.7169535160064697, + "logps/chosen": -1387.667724609375, + "logps/rejected": -1170.1529541015625, + "loss": 0.6672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2690119445323944, + "rewards/margins": 0.009896782226860523, + "rewards/rejected": -0.27890869975090027, + "step": 1980 + }, + { + "epoch": 0.52, + "learning_rate": 4.6186530082908e-06, + "logits/chosen": -2.7376532554626465, + "logits/rejected": -2.7437498569488525, + "logps/chosen": -1513.381103515625, + "logps/rejected": -1244.107421875, + "loss": 0.6808, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20415648818016052, + "rewards/margins": 0.024326255545020103, + "rewards/rejected": -0.22848275303840637, + "step": 1990 + }, + { + "epoch": 0.52, + "learning_rate": 4.612568186033633e-06, + "logits/chosen": -2.779754638671875, + "logits/rejected": -2.765529155731201, + "logps/chosen": -1780.3160400390625, + "logps/rejected": -1493.633056640625, + "loss": 0.6632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18064382672309875, + "rewards/margins": 0.1619485467672348, + "rewards/rejected": -0.34259235858917236, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.744863986968994, + "eval_logits/rejected": -2.737123966217041, + "eval_logps/chosen": -1581.7919921875, + "eval_logps/rejected": -1379.9453125, + "eval_loss": 0.660542905330658, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.19242867827415466, + "eval_rewards/margins": 0.09037821739912033, + "eval_rewards/rejected": -0.2828068733215332, + "eval_runtime": 222.0239, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 2000 + }, + { + "epoch": 0.53, + "learning_rate": 4.6064392766618125e-06, + "logits/chosen": -2.7288384437561035, + "logits/rejected": -2.7342894077301025, + "logps/chosen": -1477.8890380859375, + "logps/rejected": -1176.8448486328125, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17168466746807098, + "rewards/margins": 0.12829741835594177, + "rewards/rejected": -0.29998213052749634, + "step": 2010 + }, + { + "epoch": 0.53, + "learning_rate": 4.60026640807934e-06, + "logits/chosen": -2.7314982414245605, + "logits/rejected": -2.721731424331665, + "logps/chosen": -1486.63818359375, + "logps/rejected": -1138.0081787109375, + "loss": 0.6408, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19623301923274994, + "rewards/margins": 0.20260627567768097, + "rewards/rejected": -0.3988392949104309, + "step": 2020 + }, + { + "epoch": 0.53, + "learning_rate": 4.594049709107604e-06, + "logits/chosen": -2.7123606204986572, + "logits/rejected": -2.7007033824920654, + "logps/chosen": -1565.0352783203125, + "logps/rejected": -1272.5074462890625, + "loss": 0.6605, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25973600149154663, + "rewards/margins": 0.08567461371421814, + "rewards/rejected": -0.3454105854034424, + "step": 2030 + }, + { + "epoch": 0.53, + "learning_rate": 4.587789309482687e-06, + "logits/chosen": -2.734696626663208, + "logits/rejected": -2.7014527320861816, + "logps/chosen": -1527.3094482421875, + "logps/rejected": -1281.611083984375, + "loss": 0.6817, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3249433636665344, + "rewards/margins": 0.0034400448203086853, + "rewards/rejected": -0.3283833861351013, + "step": 2040 + }, + { + "epoch": 0.54, + "learning_rate": 4.581485339852659e-06, + "logits/chosen": -2.7503104209899902, + "logits/rejected": -2.744598865509033, + "logps/chosen": -1337.9700927734375, + "logps/rejected": -1396.9603271484375, + "loss": 0.6815, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.26085740327835083, + "rewards/margins": 0.07063094526529312, + "rewards/rejected": -0.33148834109306335, + "step": 2050 + }, + { + "epoch": 0.54, + "learning_rate": 4.5751379317748514e-06, + "logits/chosen": -2.682860851287842, + "logits/rejected": -2.688174247741699, + "logps/chosen": -1799.8050537109375, + "logps/rejected": -1376.067626953125, + "loss": 0.6568, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16199791431427002, + "rewards/margins": 0.1496579945087433, + "rewards/rejected": -0.3116559088230133, + "step": 2060 + }, + { + "epoch": 0.54, + "learning_rate": 4.56874721771311e-06, + "logits/chosen": -2.7304446697235107, + "logits/rejected": -2.731867551803589, + "logps/chosen": -1426.6251220703125, + "logps/rejected": -1212.8525390625, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22788569331169128, + "rewards/margins": 0.08013808727264404, + "rewards/rejected": -0.3080237805843353, + "step": 2070 + }, + { + "epoch": 0.54, + "learning_rate": 4.562313331035032e-06, + "logits/chosen": -2.7180185317993164, + "logits/rejected": -2.710294246673584, + "logps/chosen": -1632.967041015625, + "logps/rejected": -1406.680419921875, + "loss": 0.6626, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2711654007434845, + "rewards/margins": 0.04249387979507446, + "rewards/rejected": -0.31365928053855896, + "step": 2080 + }, + { + "epoch": 0.55, + "learning_rate": 4.555836406009183e-06, + "logits/chosen": -2.7600905895233154, + "logits/rejected": -2.750833749771118, + "logps/chosen": -1816.625244140625, + "logps/rejected": -1559.4515380859375, + "loss": 0.6585, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.25869041681289673, + "rewards/margins": 0.05229368805885315, + "rewards/rejected": -0.3109840750694275, + "step": 2090 + }, + { + "epoch": 0.55, + "learning_rate": 4.5493165778022945e-06, + "logits/chosen": -2.732236385345459, + "logits/rejected": -2.7228591442108154, + "logps/chosen": -1642.137451171875, + "logps/rejected": -1425.29296875, + "loss": 0.6427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20737795531749725, + "rewards/margins": 0.16400590538978577, + "rewards/rejected": -0.3713838756084442, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.7333147525787354, + "eval_logits/rejected": -2.726013660430908, + "eval_logps/chosen": -1583.61376953125, + "eval_logps/rejected": -1382.8006591796875, + "eval_loss": 0.6596797108650208, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -0.21064533293247223, + "eval_rewards/margins": 0.10071565955877304, + "eval_rewards/rejected": -0.31136101484298706, + "eval_runtime": 221.9246, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 2100 + }, + { + "epoch": 0.55, + "learning_rate": 4.542753982476443e-06, + "logits/chosen": -2.731502056121826, + "logits/rejected": -2.7018508911132812, + "logps/chosen": -1598.5562744140625, + "logps/rejected": -1559.6695556640625, + "loss": 0.6689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1821993887424469, + "rewards/margins": 0.11268649995326996, + "rewards/rejected": -0.29488590359687805, + "step": 2110 + }, + { + "epoch": 0.55, + "learning_rate": 4.53614875698621e-06, + "logits/chosen": -2.695430278778076, + "logits/rejected": -2.695596933364868, + "logps/chosen": -1473.1234130859375, + "logps/rejected": -1316.5185546875, + "loss": 0.6676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21487128734588623, + "rewards/margins": 0.08438606560230255, + "rewards/rejected": -0.2992573380470276, + "step": 2120 + }, + { + "epoch": 0.56, + "learning_rate": 4.529501039175824e-06, + "logits/chosen": -2.7299904823303223, + "logits/rejected": -2.7255425453186035, + "logps/chosen": -1602.1400146484375, + "logps/rejected": -1485.03759765625, + "loss": 0.6502, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18949568271636963, + "rewards/margins": 0.15136829018592834, + "rewards/rejected": -0.34086400270462036, + "step": 2130 + }, + { + "epoch": 0.56, + "learning_rate": 4.522810967776287e-06, + "logits/chosen": -2.7604377269744873, + "logits/rejected": -2.750189781188965, + "logps/chosen": -1660.505615234375, + "logps/rejected": -1412.712890625, + "loss": 0.6334, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18246378004550934, + "rewards/margins": 0.1688774675130844, + "rewards/rejected": -0.35134127736091614, + "step": 2140 + }, + { + "epoch": 0.56, + "learning_rate": 4.516078682402473e-06, + "logits/chosen": -2.7063987255096436, + "logits/rejected": -2.71260666847229, + "logps/chosen": -1543.899169921875, + "logps/rejected": -1228.962158203125, + "loss": 0.672, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23417046666145325, + "rewards/margins": 0.06204131245613098, + "rewards/rejected": -0.29621177911758423, + "step": 2150 + }, + { + "epoch": 0.57, + "learning_rate": 4.509304323550221e-06, + "logits/chosen": -2.761087417602539, + "logits/rejected": -2.753976821899414, + "logps/chosen": -1480.669921875, + "logps/rejected": -1370.41796875, + "loss": 0.6594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1856047511100769, + "rewards/margins": 0.15950414538383484, + "rewards/rejected": -0.34510886669158936, + "step": 2160 + }, + { + "epoch": 0.57, + "learning_rate": 4.502488032593398e-06, + "logits/chosen": -2.7342491149902344, + "logits/rejected": -2.7242026329040527, + "logps/chosen": -1605.33447265625, + "logps/rejected": -1437.894287109375, + "loss": 0.672, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.30404919385910034, + "rewards/margins": 0.0675443634390831, + "rewards/rejected": -0.37159356474876404, + "step": 2170 + }, + { + "epoch": 0.57, + "learning_rate": 4.495629951780951e-06, + "logits/chosen": -2.753080129623413, + "logits/rejected": -2.7350218296051025, + "logps/chosen": -1460.925048828125, + "logps/rejected": -1252.1539306640625, + "loss": 0.6588, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.24314546585083008, + "rewards/margins": 0.0389077290892601, + "rewards/rejected": -0.2820531725883484, + "step": 2180 + }, + { + "epoch": 0.57, + "learning_rate": 4.488730224233941e-06, + "logits/chosen": -2.7286741733551025, + "logits/rejected": -2.719348430633545, + "logps/chosen": -1473.2852783203125, + "logps/rejected": -1500.265380859375, + "loss": 0.6556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2850351929664612, + "rewards/margins": 0.12426628917455673, + "rewards/rejected": -0.4093014597892761, + "step": 2190 + }, + { + "epoch": 0.58, + "learning_rate": 4.481788993942547e-06, + "logits/chosen": -2.7154481410980225, + "logits/rejected": -2.722774028778076, + "logps/chosen": -1465.152587890625, + "logps/rejected": -1138.082763671875, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.25783294439315796, + "rewards/margins": 0.05801212787628174, + "rewards/rejected": -0.3158450722694397, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.7242798805236816, + "eval_logits/rejected": -2.717473030090332, + "eval_logps/chosen": -1583.8399658203125, + "eval_logps/rejected": -1383.4486083984375, + "eval_loss": 0.659185528755188, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -0.2129082977771759, + "eval_rewards/margins": 0.10493296384811401, + "eval_rewards/rejected": -0.3178412616252899, + "eval_runtime": 221.9944, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 2200 + }, + { + "epoch": 0.58, + "learning_rate": 4.474806405763076e-06, + "logits/chosen": -2.7109556198120117, + "logits/rejected": -2.6972789764404297, + "logps/chosen": -1306.2476806640625, + "logps/rejected": -899.9601440429688, + "loss": 0.6719, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.30284592509269714, + "rewards/margins": 0.037451691925525665, + "rewards/rejected": -0.3402976095676422, + "step": 2210 + }, + { + "epoch": 0.58, + "learning_rate": 4.4677826054149235e-06, + "logits/chosen": -2.637697696685791, + "logits/rejected": -2.6475868225097656, + "logps/chosen": -1411.45458984375, + "logps/rejected": -1260.859130859375, + "loss": 0.669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23996862769126892, + "rewards/margins": 0.13470852375030518, + "rewards/rejected": -0.3746771514415741, + "step": 2220 + }, + { + "epoch": 0.58, + "learning_rate": 4.460717739477543e-06, + "logits/chosen": -2.740626096725464, + "logits/rejected": -2.7197346687316895, + "logps/chosen": -1452.495361328125, + "logps/rejected": -1311.1324462890625, + "loss": 0.6558, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1936483234167099, + "rewards/margins": 0.14396671950817108, + "rewards/rejected": -0.337615042924881, + "step": 2230 + }, + { + "epoch": 0.59, + "learning_rate": 4.4536119553873866e-06, + "logits/chosen": -2.691063642501831, + "logits/rejected": -2.698746681213379, + "logps/chosen": -1228.3648681640625, + "logps/rejected": -996.9989013671875, + "loss": 0.6491, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21332640945911407, + "rewards/margins": 0.14366553723812103, + "rewards/rejected": -0.3569919466972351, + "step": 2240 + }, + { + "epoch": 0.59, + "learning_rate": 4.446465401434824e-06, + "logits/chosen": -2.701746940612793, + "logits/rejected": -2.6930394172668457, + "logps/chosen": -1635.9879150390625, + "logps/rejected": -1463.801025390625, + "loss": 0.6599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17793354392051697, + "rewards/margins": 0.11402539908885956, + "rewards/rejected": -0.2919589579105377, + "step": 2250 + }, + { + "epoch": 0.59, + "learning_rate": 4.43927822676105e-06, + "logits/chosen": -2.720634937286377, + "logits/rejected": -2.705869436264038, + "logps/chosen": -1196.5458984375, + "logps/rejected": -1118.008544921875, + "loss": 0.662, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20704011619091034, + "rewards/margins": 0.11822967231273651, + "rewards/rejected": -0.32526981830596924, + "step": 2260 + }, + { + "epoch": 0.59, + "learning_rate": 4.432050581354972e-06, + "logits/chosen": -2.7368626594543457, + "logits/rejected": -2.7253921031951904, + "logps/chosen": -1714.7447509765625, + "logps/rejected": -1365.946044921875, + "loss": 0.6638, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.208714097738266, + "rewards/margins": 0.05404313653707504, + "rewards/rejected": -0.26275724172592163, + "step": 2270 + }, + { + "epoch": 0.6, + "learning_rate": 4.424782616050078e-06, + "logits/chosen": -2.7176461219787598, + "logits/rejected": -2.700206756591797, + "logps/chosen": -1514.9925537109375, + "logps/rejected": -1475.500732421875, + "loss": 0.6488, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21896132826805115, + "rewards/margins": 0.11795832961797714, + "rewards/rejected": -0.33691972494125366, + "step": 2280 + }, + { + "epoch": 0.6, + "learning_rate": 4.4174744825212954e-06, + "logits/chosen": -2.7620787620544434, + "logits/rejected": -2.7197585105895996, + "logps/chosen": -1456.243408203125, + "logps/rejected": -1030.5003662109375, + "loss": 0.6494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2278098315000534, + "rewards/margins": 0.1794498711824417, + "rewards/rejected": -0.4072597026824951, + "step": 2290 + }, + { + "epoch": 0.6, + "learning_rate": 4.410126333281815e-06, + "logits/chosen": -2.676535129547119, + "logits/rejected": -2.6870205402374268, + "logps/chosen": -1891.846435546875, + "logps/rejected": -1446.581298828125, + "loss": 0.6496, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19136743247509003, + "rewards/margins": 0.2605608105659485, + "rewards/rejected": -0.4519282281398773, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.7234508991241455, + "eval_logits/rejected": -2.715916633605957, + "eval_logps/chosen": -1586.070556640625, + "eval_logps/rejected": -1386.091552734375, + "eval_loss": 0.6581032872200012, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.2352151870727539, + "eval_rewards/margins": 0.10905227065086365, + "eval_rewards/rejected": -0.3442673981189728, + "eval_runtime": 222.0195, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 2300 + }, + { + "epoch": 0.6, + "learning_rate": 4.402738321679918e-06, + "logits/chosen": -2.74485182762146, + "logits/rejected": -2.7311596870422363, + "logps/chosen": -1372.6346435546875, + "logps/rejected": -1097.726806640625, + "loss": 0.6901, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.35119324922561646, + "rewards/margins": -0.023585880175232887, + "rewards/rejected": -0.3276073634624481, + "step": 2310 + }, + { + "epoch": 0.61, + "learning_rate": 4.395310601895772e-06, + "logits/chosen": -2.713000774383545, + "logits/rejected": -2.712918281555176, + "logps/chosen": -1417.783935546875, + "logps/rejected": -1562.4224853515625, + "loss": 0.6689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.28538185358047485, + "rewards/margins": 0.11929192394018173, + "rewards/rejected": -0.4046737551689148, + "step": 2320 + }, + { + "epoch": 0.61, + "learning_rate": 4.38784332893821e-06, + "logits/chosen": -2.646317481994629, + "logits/rejected": -2.679353713989258, + "logps/chosen": -1165.4190673828125, + "logps/rejected": -1229.4197998046875, + "loss": 0.6631, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3009629249572754, + "rewards/margins": 0.004149970598518848, + "rewards/rejected": -0.30511292815208435, + "step": 2330 + }, + { + "epoch": 0.61, + "learning_rate": 4.380336658641503e-06, + "logits/chosen": -2.692584276199341, + "logits/rejected": -2.7181296348571777, + "logps/chosen": -1560.274169921875, + "logps/rejected": -1453.550537109375, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21228572726249695, + "rewards/margins": 0.10219845920801163, + "rewards/rejected": -0.31448420882225037, + "step": 2340 + }, + { + "epoch": 0.62, + "learning_rate": 4.372790747662101e-06, + "logits/chosen": -2.6889491081237793, + "logits/rejected": -2.6906516551971436, + "logps/chosen": -1620.80517578125, + "logps/rejected": -1415.034423828125, + "loss": 0.6674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3593459725379944, + "rewards/margins": 0.0033436850644648075, + "rewards/rejected": -0.36268967390060425, + "step": 2350 + }, + { + "epoch": 0.62, + "learning_rate": 4.365205753475367e-06, + "logits/chosen": -2.675593852996826, + "logits/rejected": -2.6628692150115967, + "logps/chosen": -1526.362060546875, + "logps/rejected": -1447.8023681640625, + "loss": 0.6473, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22609886527061462, + "rewards/margins": 0.1290428638458252, + "rewards/rejected": -0.3551417291164398, + "step": 2360 + }, + { + "epoch": 0.62, + "learning_rate": 4.35758183437229e-06, + "logits/chosen": -2.7212536334991455, + "logits/rejected": -2.7167136669158936, + "logps/chosen": -1591.127197265625, + "logps/rejected": -1531.5855712890625, + "loss": 0.6667, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22146666049957275, + "rewards/margins": 0.12169722467660904, + "rewards/rejected": -0.3431639075279236, + "step": 2370 + }, + { + "epoch": 0.62, + "learning_rate": 4.3499191494561835e-06, + "logits/chosen": -2.760378360748291, + "logits/rejected": -2.7481420040130615, + "logps/chosen": -1869.5081787109375, + "logps/rejected": -1497.3836669921875, + "loss": 0.6601, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21052077412605286, + "rewards/margins": 0.1314498633146286, + "rewards/rejected": -0.34197065234184265, + "step": 2380 + }, + { + "epoch": 0.63, + "learning_rate": 4.3422178586393615e-06, + "logits/chosen": -2.730377197265625, + "logits/rejected": -2.743986129760742, + "logps/chosen": -1398.206787109375, + "logps/rejected": -1236.90673828125, + "loss": 0.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18501418828964233, + "rewards/margins": 0.15820163488388062, + "rewards/rejected": -0.34321585297584534, + "step": 2390 + }, + { + "epoch": 0.63, + "learning_rate": 4.334478122639804e-06, + "logits/chosen": -2.745204448699951, + "logits/rejected": -2.724517345428467, + "logps/chosen": -1631.51318359375, + "logps/rejected": -1574.7022705078125, + "loss": 0.6668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23339959979057312, + "rewards/margins": 0.09116321802139282, + "rewards/rejected": -0.32456284761428833, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.740978479385376, + "eval_logits/rejected": -2.7320845127105713, + "eval_logps/chosen": -1587.576904296875, + "eval_logps/rejected": -1387.298095703125, + "eval_loss": 0.6576688885688782, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.25027817487716675, + "eval_rewards/margins": 0.10605475306510925, + "eval_rewards/rejected": -0.356332927942276, + "eval_runtime": 222.0763, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 2400 + }, + { + "epoch": 0.63, + "learning_rate": 4.3267001029778015e-06, + "logits/chosen": -2.757209300994873, + "logits/rejected": -2.7432403564453125, + "logps/chosen": -1993.5084228515625, + "logps/rejected": -1199.38720703125, + "loss": 0.651, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.23706431686878204, + "rewards/margins": 0.13261531293392181, + "rewards/rejected": -0.36967962980270386, + "step": 2410 + }, + { + "epoch": 0.63, + "learning_rate": 4.318883961972585e-06, + "logits/chosen": -2.734022378921509, + "logits/rejected": -2.7461204528808594, + "logps/chosen": -1391.851806640625, + "logps/rejected": -1220.397216796875, + "loss": 0.65, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1621847301721573, + "rewards/margins": 0.11607532203197479, + "rewards/rejected": -0.2782600522041321, + "step": 2420 + }, + { + "epoch": 0.64, + "learning_rate": 4.311029862738942e-06, + "logits/chosen": -2.7122702598571777, + "logits/rejected": -2.7062220573425293, + "logps/chosen": -1442.494873046875, + "logps/rejected": -1381.0362548828125, + "loss": 0.6813, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2430756539106369, + "rewards/margins": 0.1406748741865158, + "rewards/rejected": -0.3837505280971527, + "step": 2430 + }, + { + "epoch": 0.64, + "learning_rate": 4.303137969183804e-06, + "logits/chosen": -2.718060255050659, + "logits/rejected": -2.7174715995788574, + "logps/chosen": -1650.6988525390625, + "logps/rejected": -1207.530517578125, + "loss": 0.631, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1915169060230255, + "rewards/margins": 0.1620643436908722, + "rewards/rejected": -0.3535812199115753, + "step": 2440 + }, + { + "epoch": 0.64, + "learning_rate": 4.295208446002832e-06, + "logits/chosen": -2.736496925354004, + "logits/rejected": -2.727999210357666, + "logps/chosen": -1543.1021728515625, + "logps/rejected": -1205.076416015625, + "loss": 0.6544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.28645798563957214, + "rewards/margins": 0.09804262965917587, + "rewards/rejected": -0.38450056314468384, + "step": 2450 + }, + { + "epoch": 0.64, + "learning_rate": 4.287241458676981e-06, + "logits/chosen": -2.69891619682312, + "logits/rejected": -2.700446367263794, + "logps/chosen": -1317.9898681640625, + "logps/rejected": -1057.5445556640625, + "loss": 0.6642, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3027532994747162, + "rewards/margins": 0.06918701529502869, + "rewards/rejected": -0.3719402551651001, + "step": 2460 + }, + { + "epoch": 0.65, + "learning_rate": 4.279237173469043e-06, + "logits/chosen": -2.7122912406921387, + "logits/rejected": -2.704481840133667, + "logps/chosen": -1588.9444580078125, + "logps/rejected": -1400.419921875, + "loss": 0.6461, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16931328177452087, + "rewards/margins": 0.1601758748292923, + "rewards/rejected": -0.32948917150497437, + "step": 2470 + }, + { + "epoch": 0.65, + "learning_rate": 4.271195757420177e-06, + "logits/chosen": -2.7225699424743652, + "logits/rejected": -2.688791275024414, + "logps/chosen": -1543.356201171875, + "logps/rejected": -1652.3414306640625, + "loss": 0.6565, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22034041583538055, + "rewards/margins": 0.047962792217731476, + "rewards/rejected": -0.26830318570137024, + "step": 2480 + }, + { + "epoch": 0.65, + "learning_rate": 4.263117378346425e-06, + "logits/chosen": -2.7116904258728027, + "logits/rejected": -2.720613956451416, + "logps/chosen": -1549.628173828125, + "logps/rejected": -1367.11181640625, + "loss": 0.6472, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.28746548295021057, + "rewards/margins": 0.13900980353355408, + "rewards/rejected": -0.4264752268791199, + "step": 2490 + }, + { + "epoch": 0.65, + "learning_rate": 4.255002204835208e-06, + "logits/chosen": -2.7465109825134277, + "logits/rejected": -2.7485060691833496, + "logps/chosen": -1488.91259765625, + "logps/rejected": -1472.1722412109375, + "loss": 0.6477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2156025469303131, + "rewards/margins": 0.14002607762813568, + "rewards/rejected": -0.3556286692619324, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.7370338439941406, + "eval_logits/rejected": -2.728743076324463, + "eval_logps/chosen": -1589.1619873046875, + "eval_logps/rejected": -1390.239990234375, + "eval_loss": 0.6559935808181763, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.2661284804344177, + "eval_rewards/margins": 0.11962475627660751, + "eval_rewards/rejected": -0.38575324416160583, + "eval_runtime": 221.9192, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 2500 + }, + { + "epoch": 0.66, + "learning_rate": 4.246850406241812e-06, + "logits/chosen": -2.7424569129943848, + "logits/rejected": -2.721717357635498, + "logps/chosen": -1479.636962890625, + "logps/rejected": -1123.767822265625, + "loss": 0.6845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3048413395881653, + "rewards/margins": 0.08976776897907257, + "rewards/rejected": -0.39460912346839905, + "step": 2510 + }, + { + "epoch": 0.66, + "learning_rate": 4.2386621526858465e-06, + "logits/chosen": -2.7249627113342285, + "logits/rejected": -2.708723306655884, + "logps/chosen": -1231.6832275390625, + "logps/rejected": -1143.8583984375, + "loss": 0.6696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21194536983966827, + "rewards/margins": 0.18466496467590332, + "rewards/rejected": -0.3966103196144104, + "step": 2520 + }, + { + "epoch": 0.66, + "learning_rate": 4.2304376150477015e-06, + "logits/chosen": -2.738765239715576, + "logits/rejected": -2.727811098098755, + "logps/chosen": -1535.489990234375, + "logps/rejected": -1317.7042236328125, + "loss": 0.6717, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19936420023441315, + "rewards/margins": 0.14922715723514557, + "rewards/rejected": -0.3485913574695587, + "step": 2530 + }, + { + "epoch": 0.66, + "learning_rate": 4.222176964964977e-06, + "logits/chosen": -2.722945213317871, + "logits/rejected": -2.7192633152008057, + "logps/chosen": -1187.5484619140625, + "logps/rejected": -1205.326416015625, + "loss": 0.6474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24626651406288147, + "rewards/margins": 0.13393202424049377, + "rewards/rejected": -0.38019853830337524, + "step": 2540 + }, + { + "epoch": 0.67, + "learning_rate": 4.213880374828903e-06, + "logits/chosen": -2.7221715450286865, + "logits/rejected": -2.716590404510498, + "logps/chosen": -1716.3052978515625, + "logps/rejected": -1616.6943359375, + "loss": 0.6642, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20805975794792175, + "rewards/margins": 0.14398939907550812, + "rewards/rejected": -0.35204917192459106, + "step": 2550 + }, + { + "epoch": 0.67, + "learning_rate": 4.2055480177807406e-06, + "logits/chosen": -2.6929428577423096, + "logits/rejected": -2.6970746517181396, + "logps/chosen": -1225.0126953125, + "logps/rejected": -1057.3948974609375, + "loss": 0.6528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30539470911026, + "rewards/margins": 0.0778200551867485, + "rewards/rejected": -0.3832147717475891, + "step": 2560 + }, + { + "epoch": 0.67, + "learning_rate": 4.1971800677081696e-06, + "logits/chosen": -2.745729446411133, + "logits/rejected": -2.7569854259490967, + "logps/chosen": -1561.8831787109375, + "logps/rejected": -1445.382568359375, + "loss": 0.6349, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34451374411582947, + "rewards/margins": 0.07771845161914825, + "rewards/rejected": -0.4222322106361389, + "step": 2570 + }, + { + "epoch": 0.68, + "learning_rate": 4.188776699241661e-06, + "logits/chosen": -2.6861016750335693, + "logits/rejected": -2.659060001373291, + "logps/chosen": -1728.2652587890625, + "logps/rejected": -1673.9222412109375, + "loss": 0.6365, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23023895919322968, + "rewards/margins": 0.20911893248558044, + "rewards/rejected": -0.4393579065799713, + "step": 2580 + }, + { + "epoch": 0.68, + "learning_rate": 4.180338087750827e-06, + "logits/chosen": -2.753697156906128, + "logits/rejected": -2.727743148803711, + "logps/chosen": -1934.5179443359375, + "logps/rejected": -1417.443115234375, + "loss": 0.6394, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27421361207962036, + "rewards/margins": 0.25228267908096313, + "rewards/rejected": -0.5264962315559387, + "step": 2590 + }, + { + "epoch": 0.68, + "learning_rate": 4.1718644093407704e-06, + "logits/chosen": -2.7282567024230957, + "logits/rejected": -2.704284906387329, + "logps/chosen": -1521.0718994140625, + "logps/rejected": -1392.9830322265625, + "loss": 0.6444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2527593672275543, + "rewards/margins": 0.17600814998149872, + "rewards/rejected": -0.42876753211021423, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.7330162525177, + "eval_logits/rejected": -2.7239952087402344, + "eval_logps/chosen": -1590.8504638671875, + "eval_logps/rejected": -1391.5975341796875, + "eval_loss": 0.6549809575080872, + "eval_rewards/accuracies": 0.6269841194152832, + "eval_rewards/chosen": -0.283012717962265, + "eval_rewards/margins": 0.11631587892770767, + "eval_rewards/rejected": -0.39932864904403687, + "eval_runtime": 221.9289, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 2600 + }, + { + "epoch": 0.68, + "learning_rate": 4.163355840848401e-06, + "logits/chosen": -2.73819899559021, + "logits/rejected": -2.7327933311462402, + "logps/chosen": -1506.4268798828125, + "logps/rejected": -1232.0145263671875, + "loss": 0.6505, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2977539896965027, + "rewards/margins": 0.10095224529504776, + "rewards/rejected": -0.39870625734329224, + "step": 2610 + }, + { + "epoch": 0.69, + "learning_rate": 4.154812559838748e-06, + "logits/chosen": -2.7333149909973145, + "logits/rejected": -2.719111919403076, + "logps/chosen": -1742.7320556640625, + "logps/rejected": -1420.9490966796875, + "loss": 0.6734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29098597168922424, + "rewards/margins": 0.03577999770641327, + "rewards/rejected": -0.3267659544944763, + "step": 2620 + }, + { + "epoch": 0.69, + "learning_rate": 4.146234744601259e-06, + "logits/chosen": -2.731231451034546, + "logits/rejected": -2.7138350009918213, + "logps/chosen": -1467.53662109375, + "logps/rejected": -1297.028076171875, + "loss": 0.6314, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2917880415916443, + "rewards/margins": 0.1302862912416458, + "rewards/rejected": -0.4220743775367737, + "step": 2630 + }, + { + "epoch": 0.69, + "learning_rate": 4.137622574146071e-06, + "logits/chosen": -2.7165801525115967, + "logits/rejected": -2.716395616531372, + "logps/chosen": -1543.955322265625, + "logps/rejected": -1588.169921875, + "loss": 0.6301, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2739286720752716, + "rewards/margins": 0.16008147597312927, + "rewards/rejected": -0.4340101182460785, + "step": 2640 + }, + { + "epoch": 0.69, + "learning_rate": 4.12897622820028e-06, + "logits/chosen": -2.6904263496398926, + "logits/rejected": -2.6798410415649414, + "logps/chosen": -1184.8038330078125, + "logps/rejected": -1180.9886474609375, + "loss": 0.6495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3079219460487366, + "rewards/margins": 0.14843253791332245, + "rewards/rejected": -0.4563544690608978, + "step": 2650 + }, + { + "epoch": 0.7, + "learning_rate": 4.120295887204191e-06, + "logits/chosen": -2.707143545150757, + "logits/rejected": -2.6978511810302734, + "logps/chosen": -1218.4644775390625, + "logps/rejected": -1025.177490234375, + "loss": 0.6622, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3178409934043884, + "rewards/margins": 0.09260964393615723, + "rewards/rejected": -0.41045063734054565, + "step": 2660 + }, + { + "epoch": 0.7, + "learning_rate": 4.111581732307548e-06, + "logits/chosen": -2.7155771255493164, + "logits/rejected": -2.7257542610168457, + "logps/chosen": -1921.700439453125, + "logps/rejected": -1344.607421875, + "loss": 0.6604, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24331799149513245, + "rewards/margins": 0.26474809646606445, + "rewards/rejected": -0.5080660581588745, + "step": 2670 + }, + { + "epoch": 0.7, + "learning_rate": 4.1028339453657595e-06, + "logits/chosen": -2.734602212905884, + "logits/rejected": -2.6983163356781006, + "logps/chosen": -1651.4810791015625, + "logps/rejected": -1358.7333984375, + "loss": 0.657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2954995036125183, + "rewards/margins": 0.22179126739501953, + "rewards/rejected": -0.5172907710075378, + "step": 2680 + }, + { + "epoch": 0.7, + "learning_rate": 4.094052708936096e-06, + "logits/chosen": -2.6809024810791016, + "logits/rejected": -2.6818182468414307, + "logps/chosen": -1520.92626953125, + "logps/rejected": -1463.026611328125, + "loss": 0.6325, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.37137606739997864, + "rewards/margins": 0.08747304975986481, + "rewards/rejected": -0.45884910225868225, + "step": 2690 + }, + { + "epoch": 0.71, + "learning_rate": 4.0852382062738874e-06, + "logits/chosen": -2.6768171787261963, + "logits/rejected": -2.684788227081299, + "logps/chosen": -1481.054931640625, + "logps/rejected": -1389.98193359375, + "loss": 0.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30067363381385803, + "rewards/margins": 0.04770331829786301, + "rewards/rejected": -0.34837692975997925, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.6817612648010254, + "eval_logits/rejected": -2.6747777462005615, + "eval_logps/chosen": -1598.0084228515625, + "eval_logps/rejected": -1400.2867431640625, + "eval_loss": 0.6565902829170227, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -0.3545936048030853, + "eval_rewards/margins": 0.1316264271736145, + "eval_rewards/rejected": -0.4862200617790222, + "eval_runtime": 221.9819, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 2700 + }, + { + "epoch": 0.71, + "learning_rate": 4.076390621328693e-06, + "logits/chosen": -2.6784491539001465, + "logits/rejected": -2.6636245250701904, + "logps/chosen": -1459.29052734375, + "logps/rejected": -1261.677001953125, + "loss": 0.6739, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3422325849533081, + "rewards/margins": 0.013906337320804596, + "rewards/rejected": -0.3561389446258545, + "step": 2710 + }, + { + "epoch": 0.71, + "learning_rate": 4.067510138740467e-06, + "logits/chosen": -2.71091890335083, + "logits/rejected": -2.700723171234131, + "logps/chosen": -1594.1536865234375, + "logps/rejected": -1248.9345703125, + "loss": 0.6468, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3426397144794464, + "rewards/margins": 0.05318804457783699, + "rewards/rejected": -0.3958277404308319, + "step": 2720 + }, + { + "epoch": 0.71, + "learning_rate": 4.058596943835703e-06, + "logits/chosen": -2.6955184936523438, + "logits/rejected": -2.6890547275543213, + "logps/chosen": -1438.625732421875, + "logps/rejected": -1318.1397705078125, + "loss": 0.6567, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.26787054538726807, + "rewards/margins": 0.04538039490580559, + "rewards/rejected": -0.31325095891952515, + "step": 2730 + }, + { + "epoch": 0.72, + "learning_rate": 4.049651222623568e-06, + "logits/chosen": -2.7120392322540283, + "logits/rejected": -2.705559730529785, + "logps/chosen": -1674.390380859375, + "logps/rejected": -1225.760009765625, + "loss": 0.6136, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19621030986309052, + "rewards/margins": 0.2308935821056366, + "rewards/rejected": -0.4271039068698883, + "step": 2740 + }, + { + "epoch": 0.72, + "learning_rate": 4.040673161792014e-06, + "logits/chosen": -2.700012683868408, + "logits/rejected": -2.6899571418762207, + "logps/chosen": -972.44140625, + "logps/rejected": -753.1502685546875, + "loss": 0.6859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.30196860432624817, + "rewards/margins": 0.11960021406412125, + "rewards/rejected": -0.42156878113746643, + "step": 2750 + }, + { + "epoch": 0.72, + "learning_rate": 4.031662948703896e-06, + "logits/chosen": -2.650031328201294, + "logits/rejected": -2.641418933868408, + "logps/chosen": -1580.4583740234375, + "logps/rejected": -1196.228759765625, + "loss": 0.6657, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29348674416542053, + "rewards/margins": 0.051687635481357574, + "rewards/rejected": -0.3451744019985199, + "step": 2760 + }, + { + "epoch": 0.72, + "learning_rate": 4.022620771393047e-06, + "logits/chosen": -2.7186498641967773, + "logits/rejected": -2.690887928009033, + "logps/chosen": -1672.505859375, + "logps/rejected": -1422.82080078125, + "loss": 0.6726, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30704042315483093, + "rewards/margins": 0.3519892692565918, + "rewards/rejected": -0.6590296030044556, + "step": 2770 + }, + { + "epoch": 0.73, + "learning_rate": 4.013546818560362e-06, + "logits/chosen": -2.704613447189331, + "logits/rejected": -2.6747727394104004, + "logps/chosen": -1655.041748046875, + "logps/rejected": -1089.197265625, + "loss": 0.665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.34044402837753296, + "rewards/margins": 0.08583029359579086, + "rewards/rejected": -0.4262743592262268, + "step": 2780 + }, + { + "epoch": 0.73, + "learning_rate": 4.00444127956986e-06, + "logits/chosen": -2.7101197242736816, + "logits/rejected": -2.6946587562561035, + "logps/chosen": -1646.9859619140625, + "logps/rejected": -1348.275390625, + "loss": 0.6428, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.21547237038612366, + "rewards/margins": 0.07543666660785675, + "rewards/rejected": -0.2909089922904968, + "step": 2790 + }, + { + "epoch": 0.73, + "learning_rate": 3.9953043444447255e-06, + "logits/chosen": -2.6812894344329834, + "logits/rejected": -2.666337251663208, + "logps/chosen": -1310.0408935546875, + "logps/rejected": -1114.087158203125, + "loss": 0.6329, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2714948058128357, + "rewards/margins": 0.21535822749137878, + "rewards/rejected": -0.48685306310653687, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.706270694732666, + "eval_logits/rejected": -2.698537588119507, + "eval_logps/chosen": -1590.024658203125, + "eval_logps/rejected": -1391.0291748046875, + "eval_loss": 0.6544455289840698, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.2747553884983063, + "eval_rewards/margins": 0.11888986825942993, + "eval_rewards/rejected": -0.3936452269554138, + "eval_runtime": 221.9495, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 2800 + }, + { + "epoch": 0.74, + "learning_rate": 3.986136203863355e-06, + "logits/chosen": -2.709186553955078, + "logits/rejected": -2.710569381713867, + "logps/chosen": -1612.9268798828125, + "logps/rejected": -1281.6539306640625, + "loss": 0.6538, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1813502460718155, + "rewards/margins": 0.16553938388824463, + "rewards/rejected": -0.3468896448612213, + "step": 2810 + }, + { + "epoch": 0.74, + "learning_rate": 3.976937049155365e-06, + "logits/chosen": -2.707552909851074, + "logits/rejected": -2.7168171405792236, + "logps/chosen": -1346.50634765625, + "logps/rejected": -1408.813720703125, + "loss": 0.6427, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3057866394519806, + "rewards/margins": 0.14527741074562073, + "rewards/rejected": -0.45106402039527893, + "step": 2820 + }, + { + "epoch": 0.74, + "learning_rate": 3.967707072297608e-06, + "logits/chosen": -2.720702886581421, + "logits/rejected": -2.704594135284424, + "logps/chosen": -1848.3275146484375, + "logps/rejected": -1304.785888671875, + "loss": 0.6654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23301962018013, + "rewards/margins": 0.12841656804084778, + "rewards/rejected": -0.36143621802330017, + "step": 2830 + }, + { + "epoch": 0.74, + "learning_rate": 3.958446465910159e-06, + "logits/chosen": -2.7212958335876465, + "logits/rejected": -2.711705207824707, + "logps/chosen": -1518.9945068359375, + "logps/rejected": -1312.5887451171875, + "loss": 0.6328, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24658803641796112, + "rewards/margins": 0.18430814146995544, + "rewards/rejected": -0.43089619278907776, + "step": 2840 + }, + { + "epoch": 0.75, + "learning_rate": 3.9491554232523066e-06, + "logits/chosen": -2.6986780166625977, + "logits/rejected": -2.7011802196502686, + "logps/chosen": -1586.0, + "logps/rejected": -1219.230712890625, + "loss": 0.6506, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3162277638912201, + "rewards/margins": 0.15653644502162933, + "rewards/rejected": -0.472764253616333, + "step": 2850 + }, + { + "epoch": 0.75, + "learning_rate": 3.939834138218505e-06, + "logits/chosen": -2.7183499336242676, + "logits/rejected": -2.7136991024017334, + "logps/chosen": -1092.1768798828125, + "logps/rejected": -1095.692138671875, + "loss": 0.6804, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3108363747596741, + "rewards/margins": 0.07240144908428192, + "rewards/rejected": -0.3832378387451172, + "step": 2860 + }, + { + "epoch": 0.75, + "learning_rate": 3.930482805334339e-06, + "logits/chosen": -2.716367721557617, + "logits/rejected": -2.7113070487976074, + "logps/chosen": -1651.617431640625, + "logps/rejected": -1553.928955078125, + "loss": 0.6551, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23448964953422546, + "rewards/margins": 0.1761372834444046, + "rewards/rejected": -0.41062694787979126, + "step": 2870 + }, + { + "epoch": 0.75, + "learning_rate": 3.921101619752464e-06, + "logits/chosen": -2.7226855754852295, + "logits/rejected": -2.7159783840179443, + "logps/chosen": -1491.2879638671875, + "logps/rejected": -1291.7816162109375, + "loss": 0.6551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3070013225078583, + "rewards/margins": 0.17114727199077606, + "rewards/rejected": -0.47814860939979553, + "step": 2880 + }, + { + "epoch": 0.76, + "learning_rate": 3.911690777248525e-06, + "logits/chosen": -2.703758478164673, + "logits/rejected": -2.702188491821289, + "logps/chosen": -1669.4017333984375, + "logps/rejected": -1359.0408935546875, + "loss": 0.6651, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.32670092582702637, + "rewards/margins": 0.0898340493440628, + "rewards/rejected": -0.41653499007225037, + "step": 2890 + }, + { + "epoch": 0.76, + "learning_rate": 3.902250474217079e-06, + "logits/chosen": -2.722134590148926, + "logits/rejected": -2.7083685398101807, + "logps/chosen": -1339.631591796875, + "logps/rejected": -1110.2490234375, + "loss": 0.6351, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.299724817276001, + "rewards/margins": 0.12701234221458435, + "rewards/rejected": -0.4267371594905853, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.713578701019287, + "eval_logits/rejected": -2.7050318717956543, + "eval_logps/chosen": -1591.8255615234375, + "eval_logps/rejected": -1393.1846923828125, + "eval_loss": 0.6545432209968567, + "eval_rewards/accuracies": 0.6269841194152832, + "eval_rewards/chosen": -0.292764276266098, + "eval_rewards/margins": 0.1224350854754448, + "eval_rewards/rejected": -0.4151993691921234, + "eval_runtime": 222.0124, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 2900 + }, + { + "epoch": 0.76, + "learning_rate": 3.892780907667495e-06, + "logits/chosen": -2.6744871139526367, + "logits/rejected": -2.677670478820801, + "logps/chosen": -1297.406005859375, + "logps/rejected": -1041.806640625, + "loss": 0.6526, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3169351816177368, + "rewards/margins": 0.08249001950025558, + "rewards/rejected": -0.399425208568573, + "step": 2910 + }, + { + "epoch": 0.76, + "learning_rate": 3.883282275219837e-06, + "logits/chosen": -2.680647611618042, + "logits/rejected": -2.674455165863037, + "logps/chosen": -1549.8154296875, + "logps/rejected": -1308.754638671875, + "loss": 0.6657, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.27796778082847595, + "rewards/margins": 0.0887480154633522, + "rewards/rejected": -0.36671575903892517, + "step": 2920 + }, + { + "epoch": 0.77, + "learning_rate": 3.873754775100751e-06, + "logits/chosen": -2.7278220653533936, + "logits/rejected": -2.7124266624450684, + "logps/chosen": -1649.2249755859375, + "logps/rejected": -1332.158203125, + "loss": 0.6359, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22023046016693115, + "rewards/margins": 0.2440110146999359, + "rewards/rejected": -0.46424150466918945, + "step": 2930 + }, + { + "epoch": 0.77, + "learning_rate": 3.8641986061393145e-06, + "logits/chosen": -2.695570468902588, + "logits/rejected": -2.6847481727600098, + "logps/chosen": -1939.2047119140625, + "logps/rejected": -1484.7591552734375, + "loss": 0.6457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2145908772945404, + "rewards/margins": 0.16744297742843628, + "rewards/rejected": -0.3820338845252991, + "step": 2940 + }, + { + "epoch": 0.77, + "learning_rate": 3.854613967762898e-06, + "logits/chosen": -2.713531017303467, + "logits/rejected": -2.7223057746887207, + "logps/chosen": -1435.392333984375, + "logps/rejected": -1526.7779541015625, + "loss": 0.6368, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.32843753695487976, + "rewards/margins": 0.14957351982593536, + "rewards/rejected": -0.4780110716819763, + "step": 2950 + }, + { + "epoch": 0.77, + "learning_rate": 3.845001059992999e-06, + "logits/chosen": -2.70269513130188, + "logits/rejected": -2.6946027278900146, + "logps/chosen": -1615.549072265625, + "logps/rejected": -1480.8746337890625, + "loss": 0.6529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.287524551153183, + "rewards/margins": 0.06467927992343903, + "rewards/rejected": -0.3522038459777832, + "step": 2960 + }, + { + "epoch": 0.78, + "learning_rate": 3.835360083441067e-06, + "logits/chosen": -2.7028274536132812, + "logits/rejected": -2.6871161460876465, + "logps/chosen": -1583.056396484375, + "logps/rejected": -1453.07568359375, + "loss": 0.6492, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.33619558811187744, + "rewards/margins": 0.10127142816781998, + "rewards/rejected": -0.4374670088291168, + "step": 2970 + }, + { + "epoch": 0.78, + "learning_rate": 3.825691239304318e-06, + "logits/chosen": -2.68107008934021, + "logits/rejected": -2.671574592590332, + "logps/chosen": -1545.6512451171875, + "logps/rejected": -1043.787109375, + "loss": 0.6621, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3671846091747284, + "rewards/margins": 0.12449660152196884, + "rewards/rejected": -0.4916812479496002, + "step": 2980 + }, + { + "epoch": 0.78, + "learning_rate": 3.8159947293615385e-06, + "logits/chosen": -2.6950478553771973, + "logits/rejected": -2.6778013706207275, + "logps/chosen": -1619.7373046875, + "logps/rejected": -1588.569091796875, + "loss": 0.6453, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2479819506406784, + "rewards/margins": 0.2056024968624115, + "rewards/rejected": -0.4535844922065735, + "step": 2990 + }, + { + "epoch": 0.79, + "learning_rate": 3.806270755968866e-06, + "logits/chosen": -2.706171989440918, + "logits/rejected": -2.684222936630249, + "logps/chosen": -1458.9422607421875, + "logps/rejected": -1434.80078125, + "loss": 0.6724, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.38695085048675537, + "rewards/margins": 0.0020756437443196774, + "rewards/rejected": -0.3890264630317688, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.70687198638916, + "eval_logits/rejected": -2.698601484298706, + "eval_logps/chosen": -1593.22021484375, + "eval_logps/rejected": -1395.8458251953125, + "eval_loss": 0.6528115272521973, + "eval_rewards/accuracies": 0.6448412537574768, + "eval_rewards/chosen": -0.30671125650405884, + "eval_rewards/margins": 0.13510096073150635, + "eval_rewards/rejected": -0.4418122470378876, + "eval_runtime": 222.0026, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 3000 + }, + { + "epoch": 0.79, + "learning_rate": 3.7965195220555784e-06, + "logits/chosen": -2.7062435150146484, + "logits/rejected": -2.6944046020507812, + "logps/chosen": -1621.0589599609375, + "logps/rejected": -1340.968994140625, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27050885558128357, + "rewards/margins": 0.11764784157276154, + "rewards/rejected": -0.3881567120552063, + "step": 3010 + }, + { + "epoch": 0.79, + "learning_rate": 3.786741231119847e-06, + "logits/chosen": -2.723007917404175, + "logits/rejected": -2.71048903465271, + "logps/chosen": -1695.354248046875, + "logps/rejected": -1328.361083984375, + "loss": 0.6515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.27845701575279236, + "rewards/margins": 0.2164488285779953, + "rewards/rejected": -0.4949057996273041, + "step": 3020 + }, + { + "epoch": 0.79, + "learning_rate": 3.7769360872244992e-06, + "logits/chosen": -2.736701488494873, + "logits/rejected": -2.733898878097534, + "logps/chosen": -1670.4505615234375, + "logps/rejected": -1539.7659912109375, + "loss": 0.6553, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20875000953674316, + "rewards/margins": 0.13187198340892792, + "rewards/rejected": -0.3406220078468323, + "step": 3030 + }, + { + "epoch": 0.8, + "learning_rate": 3.767104294992754e-06, + "logits/chosen": -2.680217742919922, + "logits/rejected": -2.673816442489624, + "logps/chosen": -1585.65380859375, + "logps/rejected": -1541.0491943359375, + "loss": 0.638, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28602826595306396, + "rewards/margins": 0.14265409111976624, + "rewards/rejected": -0.428682416677475, + "step": 3040 + }, + { + "epoch": 0.8, + "learning_rate": 3.7572460596039524e-06, + "logits/chosen": -2.6832826137542725, + "logits/rejected": -2.6885409355163574, + "logps/chosen": -1698.9482421875, + "logps/rejected": -1336.911865234375, + "loss": 0.6208, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.281388521194458, + "rewards/margins": 0.2917434573173523, + "rewards/rejected": -0.5731319785118103, + "step": 3050 + }, + { + "epoch": 0.8, + "learning_rate": 3.74736158678928e-06, + "logits/chosen": -2.7269351482391357, + "logits/rejected": -2.7052764892578125, + "logps/chosen": -1629.1763916015625, + "logps/rejected": -1432.0966796875, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3689160943031311, + "rewards/margins": 0.16314366459846497, + "rewards/rejected": -0.5320597290992737, + "step": 3060 + }, + { + "epoch": 0.8, + "learning_rate": 3.7374510828274673e-06, + "logits/chosen": -2.672687530517578, + "logits/rejected": -2.6832921504974365, + "logps/chosen": -1510.4393310546875, + "logps/rejected": -1656.064208984375, + "loss": 0.6482, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4276893734931946, + "rewards/margins": 0.14145776629447937, + "rewards/rejected": -0.5691471099853516, + "step": 3070 + }, + { + "epoch": 0.81, + "learning_rate": 3.72751475454049e-06, + "logits/chosen": -2.6742148399353027, + "logits/rejected": -2.6767220497131348, + "logps/chosen": -1322.834716796875, + "logps/rejected": -1213.99951171875, + "loss": 0.6702, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.31878334283828735, + "rewards/margins": 0.08807969093322754, + "rewards/rejected": -0.4068630337715149, + "step": 3080 + }, + { + "epoch": 0.81, + "learning_rate": 3.7175528092892503e-06, + "logits/chosen": -2.6825454235076904, + "logits/rejected": -2.672701597213745, + "logps/chosen": -1248.2161865234375, + "logps/rejected": -943.9337768554688, + "loss": 0.6487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3250463306903839, + "rewards/margins": 0.2221144735813141, + "rewards/rejected": -0.5471608638763428, + "step": 3090 + }, + { + "epoch": 0.81, + "learning_rate": 3.7075654549692498e-06, + "logits/chosen": -2.683488368988037, + "logits/rejected": -2.6680476665496826, + "logps/chosen": -1318.1368408203125, + "logps/rejected": -1133.6009521484375, + "loss": 0.6413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.33140167593955994, + "rewards/margins": 0.13203348219394684, + "rewards/rejected": -0.46343517303466797, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.6984853744506836, + "eval_logits/rejected": -2.689216375350952, + "eval_logps/chosen": -1594.0811767578125, + "eval_logps/rejected": -1397.078125, + "eval_loss": 0.6514426469802856, + "eval_rewards/accuracies": 0.6547619104385376, + "eval_rewards/chosen": -0.31532174348831177, + "eval_rewards/margins": 0.13881219923496246, + "eval_rewards/rejected": -0.4541339576244354, + "eval_runtime": 222.0759, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 3100 + }, + { + "epoch": 0.81, + "learning_rate": 3.697552900006249e-06, + "logits/chosen": -2.7257251739501953, + "logits/rejected": -2.721527099609375, + "logps/chosen": -1413.564697265625, + "logps/rejected": -1260.1339111328125, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3458861708641052, + "rewards/margins": 0.1819792091846466, + "rewards/rejected": -0.5278654098510742, + "step": 3110 + }, + { + "epoch": 0.82, + "learning_rate": 3.6875153533519244e-06, + "logits/chosen": -2.700303554534912, + "logits/rejected": -2.700843095779419, + "logps/chosen": -1726.0238037109375, + "logps/rejected": -1657.860107421875, + "loss": 0.6642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2840934693813324, + "rewards/margins": 0.05369790643453598, + "rewards/rejected": -0.337791383266449, + "step": 3120 + }, + { + "epoch": 0.82, + "learning_rate": 3.6774530244794992e-06, + "logits/chosen": -2.7197420597076416, + "logits/rejected": -2.7181406021118164, + "logps/chosen": -1546.3883056640625, + "logps/rejected": -1361.9136962890625, + "loss": 0.6529, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3362138867378235, + "rewards/margins": 0.11709228903055191, + "rewards/rejected": -0.4533061385154724, + "step": 3130 + }, + { + "epoch": 0.82, + "learning_rate": 3.667366123379378e-06, + "logits/chosen": -2.69582200050354, + "logits/rejected": -2.7219948768615723, + "logps/chosen": -1652.6771240234375, + "logps/rejected": -1682.2486572265625, + "loss": 0.6973, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.39420560002326965, + "rewards/margins": -0.011835318990051746, + "rewards/rejected": -0.382370263338089, + "step": 3140 + }, + { + "epoch": 0.82, + "learning_rate": 3.6572548605547607e-06, + "logits/chosen": -2.697044849395752, + "logits/rejected": -2.711665153503418, + "logps/chosen": -1529.5, + "logps/rejected": -1450.67333984375, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39427393674850464, + "rewards/margins": 0.12939773499965668, + "rewards/rejected": -0.5236716866493225, + "step": 3150 + }, + { + "epoch": 0.83, + "learning_rate": 3.6471194470172538e-06, + "logits/chosen": -2.699864625930786, + "logits/rejected": -2.7058236598968506, + "logps/chosen": -1312.325439453125, + "logps/rejected": -1181.29296875, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2711292803287506, + "rewards/margins": 0.16613033413887024, + "rewards/rejected": -0.43725961446762085, + "step": 3160 + }, + { + "epoch": 0.83, + "learning_rate": 3.636960094282461e-06, + "logits/chosen": -2.713305950164795, + "logits/rejected": -2.7030246257781982, + "logps/chosen": -1462.4920654296875, + "logps/rejected": -1178.053466796875, + "loss": 0.6655, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.3815879225730896, + "rewards/margins": 0.024677610024809837, + "rewards/rejected": -0.4062655568122864, + "step": 3170 + }, + { + "epoch": 0.83, + "learning_rate": 3.6267770143655743e-06, + "logits/chosen": -2.726931095123291, + "logits/rejected": -2.735182285308838, + "logps/chosen": -1840.1080322265625, + "logps/rejected": -1748.771240234375, + "loss": 0.6537, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2249617874622345, + "rewards/margins": 0.2421763390302658, + "rewards/rejected": -0.4671381413936615, + "step": 3180 + }, + { + "epoch": 0.83, + "learning_rate": 3.6165704197769484e-06, + "logits/chosen": -2.7426035404205322, + "logits/rejected": -2.739197254180908, + "logps/chosen": -1690.318603515625, + "logps/rejected": -1485.94384765625, + "loss": 0.6508, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24536451697349548, + "rewards/margins": 0.17550361156463623, + "rewards/rejected": -0.4208681583404541, + "step": 3190 + }, + { + "epoch": 0.84, + "learning_rate": 3.606340523517663e-06, + "logits/chosen": -2.6690893173217773, + "logits/rejected": -2.6679508686065674, + "logps/chosen": -1618.514404296875, + "logps/rejected": -1375.9447021484375, + "loss": 0.6242, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2654856741428375, + "rewards/margins": 0.12073127925395966, + "rewards/rejected": -0.386216938495636, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.720609188079834, + "eval_logits/rejected": -2.7122809886932373, + "eval_logps/chosen": -1594.5162353515625, + "eval_logps/rejected": -1397.845947265625, + "eval_loss": 0.652283787727356, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.3196706473827362, + "eval_rewards/margins": 0.14214123785495758, + "eval_rewards/rejected": -0.461811900138855, + "eval_runtime": 222.0708, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 3200 + }, + { + "epoch": 0.84, + "learning_rate": 3.5960875390750793e-06, + "logits/chosen": -2.683267831802368, + "logits/rejected": -2.6658530235290527, + "logps/chosen": -1604.8580322265625, + "logps/rejected": -1078.8558349609375, + "loss": 0.6726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3753311038017273, + "rewards/margins": 0.055336564779281616, + "rewards/rejected": -0.4306676983833313, + "step": 3210 + }, + { + "epoch": 0.84, + "learning_rate": 3.585811680418386e-06, + "logits/chosen": -2.718259334564209, + "logits/rejected": -2.700469732284546, + "logps/chosen": -1195.2596435546875, + "logps/rejected": -1219.2757568359375, + "loss": 0.6469, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.32439124584198, + "rewards/margins": 0.08395363390445709, + "rewards/rejected": -0.40834489464759827, + "step": 3220 + }, + { + "epoch": 0.85, + "learning_rate": 3.5755131619941347e-06, + "logits/chosen": -2.730173349380493, + "logits/rejected": -2.741326332092285, + "logps/chosen": -1548.9466552734375, + "logps/rejected": -1413.3919677734375, + "loss": 0.6699, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.28420156240463257, + "rewards/margins": 0.03134525939822197, + "rewards/rejected": -0.31554684042930603, + "step": 3230 + }, + { + "epoch": 0.85, + "learning_rate": 3.565192198721759e-06, + "logits/chosen": -2.7104811668395996, + "logits/rejected": -2.698362350463867, + "logps/chosen": -1477.9722900390625, + "logps/rejected": -1029.33251953125, + "loss": 0.6688, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.40409666299819946, + "rewards/margins": 0.008787902072072029, + "rewards/rejected": -0.41288453340530396, + "step": 3240 + }, + { + "epoch": 0.85, + "learning_rate": 3.5548490059890965e-06, + "logits/chosen": -2.718234062194824, + "logits/rejected": -2.7058792114257812, + "logps/chosen": -1958.7900390625, + "logps/rejected": -1565.6016845703125, + "loss": 0.6694, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2763881981372833, + "rewards/margins": 0.1657334268093109, + "rewards/rejected": -0.44212159514427185, + "step": 3250 + }, + { + "epoch": 0.85, + "learning_rate": 3.5444837996478903e-06, + "logits/chosen": -2.7599310874938965, + "logits/rejected": -2.730556011199951, + "logps/chosen": -1544.931640625, + "logps/rejected": -1118.84912109375, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36453551054000854, + "rewards/margins": 0.16067495942115784, + "rewards/rejected": -0.5252104997634888, + "step": 3260 + }, + { + "epoch": 0.86, + "learning_rate": 3.534096796009282e-06, + "logits/chosen": -2.7185637950897217, + "logits/rejected": -2.7207720279693604, + "logps/chosen": -1313.7662353515625, + "logps/rejected": -1141.20947265625, + "loss": 0.6594, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.26091524958610535, + "rewards/margins": 0.19384625554084778, + "rewards/rejected": -0.4547615647315979, + "step": 3270 + }, + { + "epoch": 0.86, + "learning_rate": 3.5236882118393046e-06, + "logits/chosen": -2.723361015319824, + "logits/rejected": -2.715217113494873, + "logps/chosen": -1645.950439453125, + "logps/rejected": -1256.7423095703125, + "loss": 0.6612, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3042986989021301, + "rewards/margins": 0.16423553228378296, + "rewards/rejected": -0.4685342311859131, + "step": 3280 + }, + { + "epoch": 0.86, + "learning_rate": 3.5132582643543513e-06, + "logits/chosen": -2.6780922412872314, + "logits/rejected": -2.669283390045166, + "logps/chosen": -1180.63427734375, + "logps/rejected": -1273.1810302734375, + "loss": 0.6426, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3003751337528229, + "rewards/margins": 0.14592352509498596, + "rewards/rejected": -0.44629865884780884, + "step": 3290 + }, + { + "epoch": 0.86, + "learning_rate": 3.5028071712166456e-06, + "logits/chosen": -2.7153820991516113, + "logits/rejected": -2.699248790740967, + "logps/chosen": -1570.4791259765625, + "logps/rejected": -1684.700439453125, + "loss": 0.6773, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4282234311103821, + "rewards/margins": -0.009249694645404816, + "rewards/rejected": -0.41897374391555786, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.713590621948242, + "eval_logits/rejected": -2.704197883605957, + "eval_logps/chosen": -1592.927978515625, + "eval_logps/rejected": -1395.993896484375, + "eval_loss": 0.6505909562110901, + "eval_rewards/accuracies": 0.6507936716079712, + "eval_rewards/chosen": -0.30378803610801697, + "eval_rewards/margins": 0.1395045667886734, + "eval_rewards/rejected": -0.44329264760017395, + "eval_runtime": 222.0133, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 3300 + }, + { + "epoch": 0.87, + "learning_rate": 3.4923351505297008e-06, + "logits/chosen": -2.7215983867645264, + "logits/rejected": -2.7041127681732178, + "logps/chosen": -1594.6470947265625, + "logps/rejected": -1549.53369140625, + "loss": 0.6382, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33198311924934387, + "rewards/margins": 0.22456741333007812, + "rewards/rejected": -0.5565505623817444, + "step": 3310 + }, + { + "epoch": 0.87, + "learning_rate": 3.481842420833766e-06, + "logits/chosen": -2.6897029876708984, + "logits/rejected": -2.705749273300171, + "logps/chosen": -1553.8692626953125, + "logps/rejected": -1572.8138427734375, + "loss": 0.6448, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2950561046600342, + "rewards/margins": 0.21469798684120178, + "rewards/rejected": -0.5097540616989136, + "step": 3320 + }, + { + "epoch": 0.87, + "learning_rate": 3.4713292011012645e-06, + "logits/chosen": -2.738208055496216, + "logits/rejected": -2.7477622032165527, + "logps/chosen": -1472.1783447265625, + "logps/rejected": -1493.874267578125, + "loss": 0.6701, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3312566876411438, + "rewards/margins": 0.14993061125278473, + "rewards/rejected": -0.48118728399276733, + "step": 3330 + }, + { + "epoch": 0.87, + "learning_rate": 3.4607957107322277e-06, + "logits/chosen": -2.6952614784240723, + "logits/rejected": -2.699683666229248, + "logps/chosen": -1731.744140625, + "logps/rejected": -1487.7821044921875, + "loss": 0.6656, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3338416814804077, + "rewards/margins": 0.04807500168681145, + "rewards/rejected": -0.38191670179367065, + "step": 3340 + }, + { + "epoch": 0.88, + "learning_rate": 3.4502421695497112e-06, + "logits/chosen": -2.6966915130615234, + "logits/rejected": -2.6960909366607666, + "logps/chosen": -1478.6884765625, + "logps/rejected": -1387.267578125, + "loss": 0.6344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3261907696723938, + "rewards/margins": 0.17302435636520386, + "rewards/rejected": -0.49921512603759766, + "step": 3350 + }, + { + "epoch": 0.88, + "learning_rate": 3.4396687977952137e-06, + "logits/chosen": -2.667715072631836, + "logits/rejected": -2.67319655418396, + "logps/chosen": -1495.124755859375, + "logps/rejected": -1264.6702880859375, + "loss": 0.6615, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3378712236881256, + "rewards/margins": 0.07796554267406464, + "rewards/rejected": -0.41583672165870667, + "step": 3360 + }, + { + "epoch": 0.88, + "learning_rate": 3.429075816124075e-06, + "logits/chosen": -2.6785030364990234, + "logits/rejected": -2.679802179336548, + "logps/chosen": -1791.1708984375, + "logps/rejected": -1383.6787109375, + "loss": 0.6622, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.377760112285614, + "rewards/margins": 0.13859833776950836, + "rewards/rejected": -0.516358494758606, + "step": 3370 + }, + { + "epoch": 0.88, + "learning_rate": 3.418463445600874e-06, + "logits/chosen": -2.671396493911743, + "logits/rejected": -2.685154676437378, + "logps/chosen": -1854.9521484375, + "logps/rejected": -1450.7947998046875, + "loss": 0.6512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25897151231765747, + "rewards/margins": 0.2364932745695114, + "rewards/rejected": -0.4954647421836853, + "step": 3380 + }, + { + "epoch": 0.89, + "learning_rate": 3.4078319076948173e-06, + "logits/chosen": -2.6842753887176514, + "logits/rejected": -2.6870522499084473, + "logps/chosen": -1303.2357177734375, + "logps/rejected": -1232.04638671875, + "loss": 0.6741, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.35404831171035767, + "rewards/margins": -0.003727942705154419, + "rewards/rejected": -0.35032039880752563, + "step": 3390 + }, + { + "epoch": 0.89, + "learning_rate": 3.3971814242751123e-06, + "logits/chosen": -2.6884913444519043, + "logits/rejected": -2.6751790046691895, + "logps/chosen": -1439.244140625, + "logps/rejected": -1143.121826171875, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22703692317008972, + "rewards/margins": 0.15256431698799133, + "rewards/rejected": -0.37960129976272583, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.6711924076080322, + "eval_logits/rejected": -2.6620049476623535, + "eval_logps/chosen": -1592.909912109375, + "eval_logps/rejected": -1395.920654296875, + "eval_loss": 0.6505374908447266, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.3036077916622162, + "eval_rewards/margins": 0.13895148038864136, + "eval_rewards/rejected": -0.44255930185317993, + "eval_runtime": 222.0417, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 0.284, + "step": 3400 + }, + { + "epoch": 0.89, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": -2.6748526096343994, + "logits/rejected": -2.6637487411499023, + "logps/chosen": -1303.639892578125, + "logps/rejected": -1194.2991943359375, + "loss": 0.6513, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37782102823257446, + "rewards/margins": 0.053777169436216354, + "rewards/rejected": -0.4315981864929199, + "step": 3410 + }, + { + "epoch": 0.9, + "learning_rate": 3.375824510343816e-06, + "logits/chosen": -2.66794753074646, + "logits/rejected": -2.668147563934326, + "logps/chosen": -1223.28271484375, + "logps/rejected": -1204.585205078125, + "loss": 0.6627, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4264448285102844, + "rewards/margins": 0.11545290797948837, + "rewards/rejected": -0.5418976545333862, + "step": 3420 + }, + { + "epoch": 0.9, + "learning_rate": 3.3651185255289466e-06, + "logits/chosen": -2.698690176010132, + "logits/rejected": -2.7189040184020996, + "logps/chosen": -1692.838134765625, + "logps/rejected": -1683.9605712890625, + "loss": 0.6314, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2621743977069855, + "rewards/margins": 0.2654629349708557, + "rewards/rejected": -0.5276373624801636, + "step": 3430 + }, + { + "epoch": 0.9, + "learning_rate": 3.354394486584568e-06, + "logits/chosen": -2.6886143684387207, + "logits/rejected": -2.6708929538726807, + "logps/chosen": -1850.3284912109375, + "logps/rejected": -1404.5838623046875, + "loss": 0.6484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2724509835243225, + "rewards/margins": 0.15381048619747162, + "rewards/rejected": -0.42626142501831055, + "step": 3440 + }, + { + "epoch": 0.9, + "learning_rate": 3.3436526173102913e-06, + "logits/chosen": -2.6439101696014404, + "logits/rejected": -2.632524013519287, + "logps/chosen": -1454.0938720703125, + "logps/rejected": -1376.771728515625, + "loss": 0.6482, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.32575541734695435, + "rewards/margins": 0.10104187577962875, + "rewards/rejected": -0.4267973005771637, + "step": 3450 + }, + { + "epoch": 0.91, + "learning_rate": 3.3328931418778254e-06, + "logits/chosen": -2.676669120788574, + "logits/rejected": -2.6826610565185547, + "logps/chosen": -1268.5804443359375, + "logps/rejected": -1398.8740234375, + "loss": 0.6505, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.38719624280929565, + "rewards/margins": 0.012085462920367718, + "rewards/rejected": -0.39928168058395386, + "step": 3460 + }, + { + "epoch": 0.91, + "learning_rate": 3.3221162848263028e-06, + "logits/chosen": -2.6567957401275635, + "logits/rejected": -2.6414246559143066, + "logps/chosen": -1245.016357421875, + "logps/rejected": -1106.606201171875, + "loss": 0.6277, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2696836590766907, + "rewards/margins": 0.2176477015018463, + "rewards/rejected": -0.4873313903808594, + "step": 3470 + }, + { + "epoch": 0.91, + "learning_rate": 3.3113222710575914e-06, + "logits/chosen": -2.642732858657837, + "logits/rejected": -2.637019395828247, + "logps/chosen": -1382.54296875, + "logps/rejected": -1240.7161865234375, + "loss": 0.6648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34380394220352173, + "rewards/margins": 0.14207597076892853, + "rewards/rejected": -0.48587995767593384, + "step": 3480 + }, + { + "epoch": 0.91, + "learning_rate": 3.300511325831603e-06, + "logits/chosen": -2.6817688941955566, + "logits/rejected": -2.6702680587768555, + "logps/chosen": -1511.686767578125, + "logps/rejected": -1441.002197265625, + "loss": 0.651, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4115941524505615, + "rewards/margins": 0.09103700518608093, + "rewards/rejected": -0.5026311278343201, + "step": 3490 + }, + { + "epoch": 0.92, + "learning_rate": 3.289683674761592e-06, + "logits/chosen": -2.6555826663970947, + "logits/rejected": -2.642547607421875, + "logps/chosen": -1455.566162109375, + "logps/rejected": -1279.9700927734375, + "loss": 0.6499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2933773994445801, + "rewards/margins": 0.18951551616191864, + "rewards/rejected": -0.4828929007053375, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.6700594425201416, + "eval_logits/rejected": -2.661106824874878, + "eval_logps/chosen": -1597.6368408203125, + "eval_logps/rejected": -1401.417724609375, + "eval_loss": 0.6504107713699341, + "eval_rewards/accuracies": 0.6448412537574768, + "eval_rewards/chosen": -0.3508760929107666, + "eval_rewards/margins": 0.14665423333644867, + "eval_rewards/rejected": -0.49753034114837646, + "eval_runtime": 222.1254, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 0.284, + "step": 3500 + }, + { + "epoch": 0.92, + "learning_rate": 3.2788395438094444e-06, + "logits/chosen": -2.6743717193603516, + "logits/rejected": -2.65002179145813, + "logps/chosen": -1744.8306884765625, + "logps/rejected": -1472.7691650390625, + "loss": 0.6635, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3447830080986023, + "rewards/margins": 0.14540955424308777, + "rewards/rejected": -0.4901925027370453, + "step": 3510 + }, + { + "epoch": 0.92, + "learning_rate": 3.2679791592809653e-06, + "logits/chosen": -2.6821513175964355, + "logits/rejected": -2.6835196018218994, + "logps/chosen": -1547.2332763671875, + "logps/rejected": -1329.255615234375, + "loss": 0.6465, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.288739413022995, + "rewards/margins": 0.187089741230011, + "rewards/rejected": -0.47582918405532837, + "step": 3520 + }, + { + "epoch": 0.92, + "learning_rate": 3.257102747821157e-06, + "logits/chosen": -2.6712074279785156, + "logits/rejected": -2.665398120880127, + "logps/chosen": -1841.3148193359375, + "logps/rejected": -1583.763427734375, + "loss": 0.6481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3188974857330322, + "rewards/margins": 0.1695767343044281, + "rewards/rejected": -0.4884742200374603, + "step": 3530 + }, + { + "epoch": 0.93, + "learning_rate": 3.246210536409484e-06, + "logits/chosen": -2.649895668029785, + "logits/rejected": -2.667114019393921, + "logps/chosen": -1591.7431640625, + "logps/rejected": -1639.8531494140625, + "loss": 0.6405, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34883302450180054, + "rewards/margins": 0.20935270190238953, + "rewards/rejected": -0.5581857562065125, + "step": 3540 + }, + { + "epoch": 0.93, + "learning_rate": 3.235302752355142e-06, + "logits/chosen": -2.6782820224761963, + "logits/rejected": -2.6591145992279053, + "logps/chosen": -1445.5625, + "logps/rejected": -1209.8031005859375, + "loss": 0.6472, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.33042627573013306, + "rewards/margins": 0.14861580729484558, + "rewards/rejected": -0.47904205322265625, + "step": 3550 + }, + { + "epoch": 0.93, + "learning_rate": 3.2243796232923097e-06, + "logits/chosen": -2.6708455085754395, + "logits/rejected": -2.6628527641296387, + "logps/chosen": -1832.881591796875, + "logps/rejected": -1392.3641357421875, + "loss": 0.6558, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3827039301395416, + "rewards/margins": 0.20069828629493713, + "rewards/rejected": -0.583402156829834, + "step": 3560 + }, + { + "epoch": 0.93, + "learning_rate": 3.2134413771754037e-06, + "logits/chosen": -2.646705150604248, + "logits/rejected": -2.6478943824768066, + "logps/chosen": -1485.1029052734375, + "logps/rejected": -1430.926025390625, + "loss": 0.6465, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3916592597961426, + "rewards/margins": 0.14465120434761047, + "rewards/rejected": -0.5363104939460754, + "step": 3570 + }, + { + "epoch": 0.94, + "learning_rate": 3.2024882422743118e-06, + "logits/chosen": -2.6661245822906494, + "logits/rejected": -2.630502223968506, + "logps/chosen": -1536.616943359375, + "logps/rejected": -995.7938232421875, + "loss": 0.6477, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39404481649398804, + "rewards/margins": 0.10233994573354721, + "rewards/rejected": -0.49638479948043823, + "step": 3580 + }, + { + "epoch": 0.94, + "learning_rate": 3.1915204471696425e-06, + "logits/chosen": -2.7028446197509766, + "logits/rejected": -2.6725103855133057, + "logps/chosen": -1654.890380859375, + "logps/rejected": -1495.13916015625, + "loss": 0.6671, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2148229330778122, + "rewards/margins": 0.2208261936903, + "rewards/rejected": -0.4356490969657898, + "step": 3590 + }, + { + "epoch": 0.94, + "learning_rate": 3.180538220747943e-06, + "logits/chosen": -2.67252779006958, + "logits/rejected": -2.651607036590576, + "logps/chosen": -1619.154541015625, + "logps/rejected": -1425.920166015625, + "loss": 0.6439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3832196593284607, + "rewards/margins": 0.1527005434036255, + "rewards/rejected": -0.535920262336731, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.684110164642334, + "eval_logits/rejected": -2.6758134365081787, + "eval_logps/chosen": -1597.77294921875, + "eval_logps/rejected": -1401.4176025390625, + "eval_loss": 0.6509248614311218, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.3522396683692932, + "eval_rewards/margins": 0.1452905237674713, + "eval_rewards/rejected": -0.4975302219390869, + "eval_runtime": 221.9749, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 3600 + }, + { + "epoch": 0.94, + "learning_rate": 3.1695417921969287e-06, + "logits/chosen": -2.679503917694092, + "logits/rejected": -2.6619620323181152, + "logps/chosen": -1550.0435791015625, + "logps/rejected": -1238.9837646484375, + "loss": 0.6431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37688523530960083, + "rewards/margins": 0.07274709641933441, + "rewards/rejected": -0.44963231682777405, + "step": 3610 + }, + { + "epoch": 0.95, + "learning_rate": 3.158531391000697e-06, + "logits/chosen": -2.697350025177002, + "logits/rejected": -2.6872169971466064, + "logps/chosen": -1467.5635986328125, + "logps/rejected": -1384.6875, + "loss": 0.6375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3753679394721985, + "rewards/margins": 0.08869564533233643, + "rewards/rejected": -0.4640636444091797, + "step": 3620 + }, + { + "epoch": 0.95, + "learning_rate": 3.147507246934943e-06, + "logits/chosen": -2.6543407440185547, + "logits/rejected": -2.6734654903411865, + "logps/chosen": -1608.889892578125, + "logps/rejected": -1336.0, + "loss": 0.6437, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3899288773536682, + "rewards/margins": 0.1723022758960724, + "rewards/rejected": -0.562231183052063, + "step": 3630 + }, + { + "epoch": 0.95, + "learning_rate": 3.136469590062158e-06, + "logits/chosen": -2.702857494354248, + "logits/rejected": -2.6768500804901123, + "logps/chosen": -1747.8372802734375, + "logps/rejected": -1472.735107421875, + "loss": 0.6332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21806052327156067, + "rewards/margins": 0.20363029837608337, + "rewards/rejected": -0.42169085144996643, + "step": 3640 + }, + { + "epoch": 0.96, + "learning_rate": 3.1254186507268354e-06, + "logits/chosen": -2.7096176147460938, + "logits/rejected": -2.6751861572265625, + "logps/chosen": -1822.2640380859375, + "logps/rejected": -1292.9925537109375, + "loss": 0.6645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3251684606075287, + "rewards/margins": 0.17529870569705963, + "rewards/rejected": -0.5004671812057495, + "step": 3650 + }, + { + "epoch": 0.96, + "learning_rate": 3.114354659550656e-06, + "logits/chosen": -2.6707568168640137, + "logits/rejected": -2.656738758087158, + "logps/chosen": -1582.6048583984375, + "logps/rejected": -1370.0179443359375, + "loss": 0.6368, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4510306417942047, + "rewards/margins": 0.1725241243839264, + "rewards/rejected": -0.6235548257827759, + "step": 3660 + }, + { + "epoch": 0.96, + "learning_rate": 3.1032778474276816e-06, + "logits/chosen": -2.6743695735931396, + "logits/rejected": -2.6786069869995117, + "logps/chosen": -1812.0972900390625, + "logps/rejected": -1499.954833984375, + "loss": 0.6378, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42497795820236206, + "rewards/margins": 0.13020361959934235, + "rewards/rejected": -0.5551815629005432, + "step": 3670 + }, + { + "epoch": 0.96, + "learning_rate": 3.092188445519532e-06, + "logits/chosen": -2.6756932735443115, + "logits/rejected": -2.6719775199890137, + "logps/chosen": -1629.5001220703125, + "logps/rejected": -1444.050048828125, + "loss": 0.6518, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39225250482559204, + "rewards/margins": 0.19171519577503204, + "rewards/rejected": -0.5839677453041077, + "step": 3680 + }, + { + "epoch": 0.97, + "learning_rate": 3.081086685250565e-06, + "logits/chosen": -2.71132755279541, + "logits/rejected": -2.7033679485321045, + "logps/chosen": -1543.3043212890625, + "logps/rejected": -1258.254638671875, + "loss": 0.6459, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3387759327888489, + "rewards/margins": 0.23766681551933289, + "rewards/rejected": -0.5764427781105042, + "step": 3690 + }, + { + "epoch": 0.97, + "learning_rate": 3.0699727983030434e-06, + "logits/chosen": -2.6973066329956055, + "logits/rejected": -2.660385847091675, + "logps/chosen": -1589.33544921875, + "logps/rejected": -1132.1702880859375, + "loss": 0.6279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3620646297931671, + "rewards/margins": 0.19742536544799805, + "rewards/rejected": -0.5594899654388428, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.7011826038360596, + "eval_logits/rejected": -2.691807746887207, + "eval_logps/chosen": -1602.89501953125, + "eval_logps/rejected": -1406.66748046875, + "eval_loss": 0.6505332589149475, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.4034595787525177, + "eval_rewards/margins": 0.14656895399093628, + "eval_rewards/rejected": -0.5500285625457764, + "eval_runtime": 222.0396, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 0.284, + "step": 3700 + }, + { + "epoch": 0.97, + "learning_rate": 3.058847016612301e-06, + "logits/chosen": -2.7201943397521973, + "logits/rejected": -2.7136852741241455, + "logps/chosen": -1556.614501953125, + "logps/rejected": -1279.77587890625, + "loss": 0.6211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3983418047428131, + "rewards/margins": 0.14020150899887085, + "rewards/rejected": -0.5385433435440063, + "step": 3710 + }, + { + "epoch": 0.97, + "learning_rate": 3.0477095723619034e-06, + "logits/chosen": -2.7106566429138184, + "logits/rejected": -2.7031445503234863, + "logps/chosen": -1471.138916015625, + "logps/rejected": -1194.182373046875, + "loss": 0.636, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.31719970703125, + "rewards/margins": 0.25403183698654175, + "rewards/rejected": -0.571231484413147, + "step": 3720 + }, + { + "epoch": 0.98, + "learning_rate": 3.0365606979788003e-06, + "logits/chosen": -2.7038567066192627, + "logits/rejected": -2.717050552368164, + "logps/chosen": -1422.19091796875, + "logps/rejected": -1356.302001953125, + "loss": 0.6515, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3847157061100006, + "rewards/margins": 0.13767775893211365, + "rewards/rejected": -0.5223934650421143, + "step": 3730 + }, + { + "epoch": 0.98, + "learning_rate": 3.0254006261284786e-06, + "logits/chosen": -2.7265188694000244, + "logits/rejected": -2.696443557739258, + "logps/chosen": -1579.3212890625, + "logps/rejected": -1215.633056640625, + "loss": 0.6335, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3812587261199951, + "rewards/margins": 0.18493662774562836, + "rewards/rejected": -0.5661953687667847, + "step": 3740 + }, + { + "epoch": 0.98, + "learning_rate": 3.0142295897101032e-06, + "logits/chosen": -2.677203416824341, + "logits/rejected": -2.663817882537842, + "logps/chosen": -1657.607421875, + "logps/rejected": -1373.084716796875, + "loss": 0.6332, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44950050115585327, + "rewards/margins": 0.10747332870960236, + "rewards/rejected": -0.5569738149642944, + "step": 3750 + }, + { + "epoch": 0.98, + "learning_rate": 3.0030478218516578e-06, + "logits/chosen": -2.752816677093506, + "logits/rejected": -2.7207674980163574, + "logps/chosen": -1824.684326171875, + "logps/rejected": -1472.426513671875, + "loss": 0.6291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31192246079444885, + "rewards/margins": 0.19837240874767303, + "rewards/rejected": -0.5102948546409607, + "step": 3760 + }, + { + "epoch": 0.99, + "learning_rate": 2.9918555559050826e-06, + "logits/chosen": -2.711655378341675, + "logits/rejected": -2.718435287475586, + "logps/chosen": -1729.4273681640625, + "logps/rejected": -1415.324951171875, + "loss": 0.6432, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34275177121162415, + "rewards/margins": 0.1975889950990677, + "rewards/rejected": -0.540340781211853, + "step": 3770 + }, + { + "epoch": 0.99, + "learning_rate": 2.980653025441399e-06, + "logits/chosen": -2.6924350261688232, + "logits/rejected": -2.675720691680908, + "logps/chosen": -1520.789794921875, + "logps/rejected": -1284.784423828125, + "loss": 0.6459, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.5108143091201782, + "rewards/margins": 0.06858311593532562, + "rewards/rejected": -0.5793974995613098, + "step": 3780 + }, + { + "epoch": 0.99, + "learning_rate": 2.969440464245841e-06, + "logits/chosen": -2.6855132579803467, + "logits/rejected": -2.6786370277404785, + "logps/chosen": -1267.9501953125, + "logps/rejected": -1170.1279296875, + "loss": 0.6634, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5093327760696411, + "rewards/margins": -0.006477591581642628, + "rewards/rejected": -0.5028551816940308, + "step": 3790 + }, + { + "epoch": 0.99, + "learning_rate": 2.95821810631297e-06, + "logits/chosen": -2.69722580909729, + "logits/rejected": -2.6795449256896973, + "logps/chosen": -1494.9287109375, + "logps/rejected": -1520.502685546875, + "loss": 0.6443, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5035138726234436, + "rewards/margins": 0.12939448654651642, + "rewards/rejected": -0.6329083442687988, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.696512460708618, + "eval_logits/rejected": -2.6875741481781006, + "eval_logps/chosen": -1602.2508544921875, + "eval_logps/rejected": -1406.07275390625, + "eval_loss": 0.6497198343276978, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.3970177173614502, + "eval_rewards/margins": 0.14706376194953918, + "eval_rewards/rejected": -0.544081449508667, + "eval_runtime": 221.9365, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 3800 + }, + { + "epoch": 1.0, + "learning_rate": 2.946986185841801e-06, + "logits/chosen": -2.7158286571502686, + "logits/rejected": -2.702641248703003, + "logps/chosen": -1648.909423828125, + "logps/rejected": -1226.8458251953125, + "loss": 0.6522, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.35285401344299316, + "rewards/margins": 0.07958800345659256, + "rewards/rejected": -0.4324420392513275, + "step": 3810 + }, + { + "epoch": 1.0, + "learning_rate": 2.935744937230903e-06, + "logits/chosen": -2.6734156608581543, + "logits/rejected": -2.685497760772705, + "logps/chosen": -1302.2877197265625, + "logps/rejected": -1202.2618408203125, + "loss": 0.6363, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5028305649757385, + "rewards/margins": 0.08032174408435822, + "rewards/rejected": -0.5831522941589355, + "step": 3820 + }, + { + "epoch": 1.0, + "learning_rate": 2.924494595073517e-06, + "logits/chosen": -2.6925551891326904, + "logits/rejected": -2.7131991386413574, + "logps/chosen": -1666.137939453125, + "logps/rejected": -1432.2186279296875, + "loss": 0.6262, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37226518988609314, + "rewards/margins": 0.2526193857192993, + "rewards/rejected": -0.6248846054077148, + "step": 3830 + }, + { + "epoch": 1.0, + "learning_rate": 2.9132353941526575e-06, + "logits/chosen": -2.679866313934326, + "logits/rejected": -2.6491472721099854, + "logps/chosen": -1389.10986328125, + "logps/rejected": -1554.234130859375, + "loss": 0.6387, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35313764214515686, + "rewards/margins": 0.24915650486946106, + "rewards/rejected": -0.6022941470146179, + "step": 3840 + }, + { + "epoch": 1.01, + "learning_rate": 2.901967569436209e-06, + "logits/chosen": -2.6779770851135254, + "logits/rejected": -2.6733503341674805, + "logps/chosen": -1629.6099853515625, + "logps/rejected": -1241.42333984375, + "loss": 0.623, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.405801922082901, + "rewards/margins": 0.10763572156429291, + "rewards/rejected": -0.5134376287460327, + "step": 3850 + }, + { + "epoch": 1.01, + "learning_rate": 2.89069135607203e-06, + "logits/chosen": -2.718233823776245, + "logits/rejected": -2.6924235820770264, + "logps/chosen": -1733.679443359375, + "logps/rejected": -1611.1083984375, + "loss": 0.6483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25229984521865845, + "rewards/margins": 0.31755656003952026, + "rewards/rejected": -0.5698564648628235, + "step": 3860 + }, + { + "epoch": 1.01, + "learning_rate": 2.8794069893830386e-06, + "logits/chosen": -2.715327024459839, + "logits/rejected": -2.715574264526367, + "logps/chosen": -1536.84765625, + "logps/rejected": -1138.0296630859375, + "loss": 0.6242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4053262770175934, + "rewards/margins": 0.10040046274662018, + "rewards/rejected": -0.50572669506073, + "step": 3870 + }, + { + "epoch": 1.02, + "learning_rate": 2.8681147048623038e-06, + "logits/chosen": -2.6821646690368652, + "logits/rejected": -2.6729283332824707, + "logps/chosen": -1832.4056396484375, + "logps/rejected": -1504.1866455078125, + "loss": 0.6356, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3265860378742218, + "rewards/margins": 0.22155019640922546, + "rewards/rejected": -0.5481362342834473, + "step": 3880 + }, + { + "epoch": 1.02, + "learning_rate": 2.8568147381681333e-06, + "logits/chosen": -2.717892646789551, + "logits/rejected": -2.7001843452453613, + "logps/chosen": -1530.6292724609375, + "logps/rejected": -1179.5780029296875, + "loss": 0.6367, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31728649139404297, + "rewards/margins": 0.2593171000480652, + "rewards/rejected": -0.5766035318374634, + "step": 3890 + }, + { + "epoch": 1.02, + "learning_rate": 2.8455073251191533e-06, + "logits/chosen": -2.6777820587158203, + "logits/rejected": -2.677088737487793, + "logps/chosen": -1128.9188232421875, + "logps/rejected": -1106.5223388671875, + "loss": 0.6355, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4586946368217468, + "rewards/margins": 0.09128829836845398, + "rewards/rejected": -0.5499829649925232, + "step": 3900 + }, + { + "epoch": 1.02, + "eval_logits/chosen": -2.7038865089416504, + "eval_logits/rejected": -2.6949610710144043, + "eval_logps/chosen": -1597.9246826171875, + "eval_logps/rejected": -1401.5294189453125, + "eval_loss": 0.6484230756759644, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.3537573218345642, + "eval_rewards/margins": 0.14489062130451202, + "eval_rewards/rejected": -0.49864792823791504, + "eval_runtime": 222.0835, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 3900 + }, + { + "epoch": 1.02, + "learning_rate": 2.8341927016893887e-06, + "logits/chosen": -2.691997528076172, + "logits/rejected": -2.696678400039673, + "logps/chosen": -1353.6199951171875, + "logps/rejected": -1345.487548828125, + "loss": 0.6653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.397295743227005, + "rewards/margins": 0.09288805723190308, + "rewards/rejected": -0.4901837706565857, + "step": 3910 + }, + { + "epoch": 1.03, + "learning_rate": 2.822871104003335e-06, + "logits/chosen": -2.702699661254883, + "logits/rejected": -2.679598331451416, + "logps/chosen": -1657.781982421875, + "logps/rejected": -1315.1046142578125, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.319889098405838, + "rewards/margins": 0.31032368540763855, + "rewards/rejected": -0.6302127838134766, + "step": 3920 + }, + { + "epoch": 1.03, + "learning_rate": 2.8115427683310355e-06, + "logits/chosen": -2.693666696548462, + "logits/rejected": -2.680574893951416, + "logps/chosen": -1606.5465087890625, + "logps/rejected": -1350.802734375, + "loss": 0.6385, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.41127365827560425, + "rewards/margins": 0.09982170164585114, + "rewards/rejected": -0.511095404624939, + "step": 3930 + }, + { + "epoch": 1.03, + "learning_rate": 2.8002079310831477e-06, + "logits/chosen": -2.656543016433716, + "logits/rejected": -2.647818088531494, + "logps/chosen": -1718.5107421875, + "logps/rejected": -1301.51416015625, + "loss": 0.658, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3816748261451721, + "rewards/margins": 0.1260094940662384, + "rewards/rejected": -0.5076843500137329, + "step": 3940 + }, + { + "epoch": 1.03, + "learning_rate": 2.7888668288060095e-06, + "logits/chosen": -2.704258680343628, + "logits/rejected": -2.706493377685547, + "logps/chosen": -1798.8916015625, + "logps/rejected": -1319.958251953125, + "loss": 0.633, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3321087956428528, + "rewards/margins": 0.25544315576553345, + "rewards/rejected": -0.5875519514083862, + "step": 3950 + }, + { + "epoch": 1.04, + "learning_rate": 2.7775196981767044e-06, + "logits/chosen": -2.705000400543213, + "logits/rejected": -2.6998260021209717, + "logps/chosen": -1702.5657958984375, + "logps/rejected": -1557.700927734375, + "loss": 0.6578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4326794147491455, + "rewards/margins": 0.19303780794143677, + "rewards/rejected": -0.6257172226905823, + "step": 3960 + }, + { + "epoch": 1.04, + "learning_rate": 2.7661667759981213e-06, + "logits/chosen": -2.7508487701416016, + "logits/rejected": -2.741401433944702, + "logps/chosen": -1811.778564453125, + "logps/rejected": -1451.1256103515625, + "loss": 0.6158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2792471945285797, + "rewards/margins": 0.21910421550273895, + "rewards/rejected": -0.49835139513015747, + "step": 3970 + }, + { + "epoch": 1.04, + "learning_rate": 2.7548082991940137e-06, + "logits/chosen": -2.695704936981201, + "logits/rejected": -2.6940112113952637, + "logps/chosen": -1396.2000732421875, + "logps/rejected": -1184.29638671875, + "loss": 0.6579, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.515252947807312, + "rewards/margins": 0.2268890142440796, + "rewards/rejected": -0.7421420216560364, + "step": 3980 + }, + { + "epoch": 1.04, + "learning_rate": 2.743444504804051e-06, + "logits/chosen": -2.6837940216064453, + "logits/rejected": -2.6780648231506348, + "logps/chosen": -1294.56884765625, + "logps/rejected": -1170.3057861328125, + "loss": 0.6325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42278486490249634, + "rewards/margins": 0.1340743750333786, + "rewards/rejected": -0.5568591952323914, + "step": 3990 + }, + { + "epoch": 1.05, + "learning_rate": 2.7320756299788788e-06, + "logits/chosen": -2.723829746246338, + "logits/rejected": -2.7320613861083984, + "logps/chosen": -1578.4306640625, + "logps/rejected": -1222.0570068359375, + "loss": 0.6683, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.5014371275901794, + "rewards/margins": -0.04650117829442024, + "rewards/rejected": -0.4549359679222107, + "step": 4000 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.7079918384552, + "eval_logits/rejected": -2.699162244796753, + "eval_logps/chosen": -1598.626220703125, + "eval_logps/rejected": -1402.8544921875, + "eval_loss": 0.6481702923774719, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.3607728183269501, + "eval_rewards/margins": 0.1511262059211731, + "eval_rewards/rejected": -0.5118989944458008, + "eval_runtime": 221.9957, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 4000 + }, + { + "epoch": 1.05, + "learning_rate": 2.7207019119751644e-06, + "logits/chosen": -2.692671298980713, + "logits/rejected": -2.6763689517974854, + "logps/chosen": -1635.9403076171875, + "logps/rejected": -1283.6893310546875, + "loss": 0.6155, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24907469749450684, + "rewards/margins": 0.21768009662628174, + "rewards/rejected": -0.46675482392311096, + "step": 4010 + }, + { + "epoch": 1.05, + "learning_rate": 2.7093235881506474e-06, + "logits/chosen": -2.6583075523376465, + "logits/rejected": -2.6455483436584473, + "logps/chosen": -1569.704833984375, + "logps/rejected": -1339.1483154296875, + "loss": 0.648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4154641628265381, + "rewards/margins": 0.11932597309350967, + "rewards/rejected": -0.5347901582717896, + "step": 4020 + }, + { + "epoch": 1.05, + "learning_rate": 2.6979408959591863e-06, + "logits/chosen": -2.676906108856201, + "logits/rejected": -2.654895305633545, + "logps/chosen": -1454.9947509765625, + "logps/rejected": -1063.083740234375, + "loss": 0.6484, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3592832684516907, + "rewards/margins": 0.1434701383113861, + "rewards/rejected": -0.5027534365653992, + "step": 4030 + }, + { + "epoch": 1.06, + "learning_rate": 2.6865540729458034e-06, + "logits/chosen": -2.722224235534668, + "logits/rejected": -2.6849489212036133, + "logps/chosen": -1687.6962890625, + "logps/rejected": -1196.1148681640625, + "loss": 0.6302, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3025377690792084, + "rewards/margins": 0.15057072043418884, + "rewards/rejected": -0.453108549118042, + "step": 4040 + }, + { + "epoch": 1.06, + "learning_rate": 2.675163356741726e-06, + "logits/chosen": -2.6830029487609863, + "logits/rejected": -2.683077812194824, + "logps/chosen": -1607.4324951171875, + "logps/rejected": -1472.721435546875, + "loss": 0.6434, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3175775408744812, + "rewards/margins": 0.12574470043182373, + "rewards/rejected": -0.44332224130630493, + "step": 4050 + }, + { + "epoch": 1.06, + "learning_rate": 2.6637689850594285e-06, + "logits/chosen": -2.6846535205841064, + "logits/rejected": -2.6847877502441406, + "logps/chosen": -1888.1953125, + "logps/rejected": -1392.2398681640625, + "loss": 0.6614, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.35660284757614136, + "rewards/margins": 0.07980741560459137, + "rewards/rejected": -0.4364103376865387, + "step": 4060 + }, + { + "epoch": 1.07, + "learning_rate": 2.652371195687671e-06, + "logits/chosen": -2.6865925788879395, + "logits/rejected": -2.6678271293640137, + "logps/chosen": -1839.5250244140625, + "logps/rejected": -1401.562744140625, + "loss": 0.6581, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3077114224433899, + "rewards/margins": 0.15223054587841034, + "rewards/rejected": -0.45994195342063904, + "step": 4070 + }, + { + "epoch": 1.07, + "learning_rate": 2.64097022648654e-06, + "logits/chosen": -2.7195897102355957, + "logits/rejected": -2.709167957305908, + "logps/chosen": -1437.06103515625, + "logps/rejected": -1188.230712890625, + "loss": 0.6318, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2933391332626343, + "rewards/margins": 0.14199650287628174, + "rewards/rejected": -0.435335636138916, + "step": 4080 + }, + { + "epoch": 1.07, + "learning_rate": 2.6295663153824774e-06, + "logits/chosen": -2.676091194152832, + "logits/rejected": -2.6590778827667236, + "logps/chosen": -1539.490478515625, + "logps/rejected": -1512.3614501953125, + "loss": 0.6538, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3669421076774597, + "rewards/margins": 0.07918572425842285, + "rewards/rejected": -0.44612783193588257, + "step": 4090 + }, + { + "epoch": 1.07, + "learning_rate": 2.6181597003633218e-06, + "logits/chosen": -2.722808599472046, + "logits/rejected": -2.7022972106933594, + "logps/chosen": -1721.559326171875, + "logps/rejected": -1385.15625, + "loss": 0.6459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2962713837623596, + "rewards/margins": 0.18515335023403168, + "rewards/rejected": -0.4814247190952301, + "step": 4100 + }, + { + "epoch": 1.07, + "eval_logits/chosen": -2.694403886795044, + "eval_logits/rejected": -2.6851539611816406, + "eval_logps/chosen": -1595.5987548828125, + "eval_logps/rejected": -1399.263427734375, + "eval_loss": 0.6475256085395813, + "eval_rewards/accuracies": 0.6448412537574768, + "eval_rewards/chosen": -0.33049485087394714, + "eval_rewards/margins": 0.14549362659454346, + "eval_rewards/rejected": -0.4759885370731354, + "eval_runtime": 222.0313, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 4100 + }, + { + "epoch": 1.08, + "learning_rate": 2.606750619473342e-06, + "logits/chosen": -2.6989099979400635, + "logits/rejected": -2.6851353645324707, + "logps/chosen": -1384.610595703125, + "logps/rejected": -1350.0355224609375, + "loss": 0.6369, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3065218925476074, + "rewards/margins": 0.11955137550830841, + "rewards/rejected": -0.42607325315475464, + "step": 4110 + }, + { + "epoch": 1.08, + "learning_rate": 2.595339310808262e-06, + "logits/chosen": -2.6756765842437744, + "logits/rejected": -2.6787614822387695, + "logps/chosen": -1460.1578369140625, + "logps/rejected": -1376.394775390625, + "loss": 0.6227, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3293229937553406, + "rewards/margins": 0.15691342949867249, + "rewards/rejected": -0.48623642325401306, + "step": 4120 + }, + { + "epoch": 1.08, + "learning_rate": 2.5839260125103004e-06, + "logits/chosen": -2.656978130340576, + "logits/rejected": -2.6542904376983643, + "logps/chosen": -1446.76171875, + "logps/rejected": -1587.48828125, + "loss": 0.6449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29374203085899353, + "rewards/margins": 0.17694918811321259, + "rewards/rejected": -0.4706912636756897, + "step": 4130 + }, + { + "epoch": 1.08, + "learning_rate": 2.5725109627631984e-06, + "logits/chosen": -2.7323098182678223, + "logits/rejected": -2.71343731880188, + "logps/chosen": -1746.254150390625, + "logps/rejected": -1433.3355712890625, + "loss": 0.6762, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3805330693721771, + "rewards/margins": 0.14356563985347748, + "rewards/rejected": -0.5240987539291382, + "step": 4140 + }, + { + "epoch": 1.09, + "learning_rate": 2.5610943997872443e-06, + "logits/chosen": -2.714146852493286, + "logits/rejected": -2.6969714164733887, + "logps/chosen": -1565.2064208984375, + "logps/rejected": -1335.5848388671875, + "loss": 0.6345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2877258360385895, + "rewards/margins": 0.21380770206451416, + "rewards/rejected": -0.501533567905426, + "step": 4150 + }, + { + "epoch": 1.09, + "learning_rate": 2.5496765618343096e-06, + "logits/chosen": -2.703857898712158, + "logits/rejected": -2.6917083263397217, + "logps/chosen": -1705.0570068359375, + "logps/rejected": -1651.3447265625, + "loss": 0.6253, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.31827279925346375, + "rewards/margins": 0.18771009147167206, + "rewards/rejected": -0.5059828758239746, + "step": 4160 + }, + { + "epoch": 1.09, + "learning_rate": 2.538257687182871e-06, + "logits/chosen": -2.7111072540283203, + "logits/rejected": -2.7166075706481934, + "logps/chosen": -1474.952880859375, + "logps/rejected": -1338.7607421875, + "loss": 0.6475, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3775080144405365, + "rewards/margins": 0.05340402573347092, + "rewards/rejected": -0.4309120178222656, + "step": 4170 + }, + { + "epoch": 1.09, + "learning_rate": 2.526838014133041e-06, + "logits/chosen": -2.7166781425476074, + "logits/rejected": -2.681563377380371, + "logps/chosen": -1745.790283203125, + "logps/rejected": -1185.437744140625, + "loss": 0.6368, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34735316038131714, + "rewards/margins": 0.14297917485237122, + "rewards/rejected": -0.4903322756290436, + "step": 4180 + }, + { + "epoch": 1.1, + "learning_rate": 2.515417781001594e-06, + "logits/chosen": -2.6650118827819824, + "logits/rejected": -2.6872620582580566, + "logps/chosen": -1447.572021484375, + "logps/rejected": -1491.62548828125, + "loss": 0.6357, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3270224332809448, + "rewards/margins": 0.08632199466228485, + "rewards/rejected": -0.41334444284439087, + "step": 4190 + }, + { + "epoch": 1.1, + "learning_rate": 2.503997226116992e-06, + "logits/chosen": -2.685615062713623, + "logits/rejected": -2.6830313205718994, + "logps/chosen": -1447.239990234375, + "logps/rejected": -1051.3704833984375, + "loss": 0.6451, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.313309907913208, + "rewards/margins": 0.2056921422481537, + "rewards/rejected": -0.5190020799636841, + "step": 4200 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -2.704213857650757, + "eval_logits/rejected": -2.6953837871551514, + "eval_logps/chosen": -1597.2633056640625, + "eval_logps/rejected": -1401.5712890625, + "eval_loss": 0.6471446752548218, + "eval_rewards/accuracies": 0.636904776096344, + "eval_rewards/chosen": -0.34714046120643616, + "eval_rewards/margins": 0.1519256830215454, + "eval_rewards/rejected": -0.49906620383262634, + "eval_runtime": 222.0001, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 4200 + }, + { + "epoch": 1.1, + "learning_rate": 2.4925765878144115e-06, + "logits/chosen": -2.709895372390747, + "logits/rejected": -2.6850790977478027, + "logps/chosen": -1786.897216796875, + "logps/rejected": -1272.6116943359375, + "loss": 0.6101, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2910698354244232, + "rewards/margins": 0.18431541323661804, + "rewards/rejected": -0.47538524866104126, + "step": 4210 + }, + { + "epoch": 1.1, + "learning_rate": 2.4811561044307727e-06, + "logits/chosen": -2.7279367446899414, + "logits/rejected": -2.703131675720215, + "logps/chosen": -1588.3284912109375, + "logps/rejected": -1520.607421875, + "loss": 0.6235, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.33380937576293945, + "rewards/margins": 0.14665281772613525, + "rewards/rejected": -0.4804622232913971, + "step": 4220 + }, + { + "epoch": 1.11, + "learning_rate": 2.469736014299758e-06, + "logits/chosen": -2.682407855987549, + "logits/rejected": -2.672849416732788, + "logps/chosen": -1425.3299560546875, + "logps/rejected": -1198.2889404296875, + "loss": 0.664, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.38793474435806274, + "rewards/margins": 0.05479978397488594, + "rewards/rejected": -0.4427345395088196, + "step": 4230 + }, + { + "epoch": 1.11, + "learning_rate": 2.458316555746846e-06, + "logits/chosen": -2.693711757659912, + "logits/rejected": -2.674509048461914, + "logps/chosen": -1880.191650390625, + "logps/rejected": -1725.5045166015625, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3580625057220459, + "rewards/margins": 0.14049111306667328, + "rewards/rejected": -0.498553603887558, + "step": 4240 + }, + { + "epoch": 1.11, + "learning_rate": 2.446897967084334e-06, + "logits/chosen": -2.7132728099823, + "logits/rejected": -2.7284951210021973, + "logps/chosen": -1449.487548828125, + "logps/rejected": -1492.966064453125, + "loss": 0.6573, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4193345606327057, + "rewards/margins": 0.08319560438394547, + "rewards/rejected": -0.5025301575660706, + "step": 4250 + }, + { + "epoch": 1.11, + "learning_rate": 2.4354804866063684e-06, + "logits/chosen": -2.6998226642608643, + "logits/rejected": -2.7050068378448486, + "logps/chosen": -1740.021240234375, + "logps/rejected": -1726.16796875, + "loss": 0.6485, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3352826237678528, + "rewards/margins": 0.14481016993522644, + "rewards/rejected": -0.480092853307724, + "step": 4260 + }, + { + "epoch": 1.12, + "learning_rate": 2.424064352583964e-06, + "logits/chosen": -2.661332607269287, + "logits/rejected": -2.6545464992523193, + "logps/chosen": -1448.39794921875, + "logps/rejected": -1272.430908203125, + "loss": 0.649, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.486703097820282, + "rewards/margins": 0.09392253309488297, + "rewards/rejected": -0.5806256532669067, + "step": 4270 + }, + { + "epoch": 1.12, + "learning_rate": 2.4126498032600403e-06, + "logits/chosen": -2.678957223892212, + "logits/rejected": -2.666158437728882, + "logps/chosen": -1762.4287109375, + "logps/rejected": -1390.131103515625, + "loss": 0.6115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3684222102165222, + "rewards/margins": 0.2715838551521301, + "rewards/rejected": -0.6400061249732971, + "step": 4280 + }, + { + "epoch": 1.12, + "learning_rate": 2.401237076844445e-06, + "logits/chosen": -2.700265407562256, + "logits/rejected": -2.6984000205993652, + "logps/chosen": -1472.801025390625, + "logps/rejected": -1357.495849609375, + "loss": 0.653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3983010947704315, + "rewards/margins": 0.22106428444385529, + "rewards/rejected": -0.619365394115448, + "step": 4290 + }, + { + "epoch": 1.13, + "learning_rate": 2.38982641150898e-06, + "logits/chosen": -2.73397159576416, + "logits/rejected": -2.705148696899414, + "logps/chosen": -1355.8466796875, + "logps/rejected": -1200.568603515625, + "loss": 0.6744, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34279078245162964, + "rewards/margins": 0.10427943617105484, + "rewards/rejected": -0.4470701813697815, + "step": 4300 + }, + { + "epoch": 1.13, + "eval_logits/chosen": -2.7094671726226807, + "eval_logits/rejected": -2.7008376121520996, + "eval_logps/chosen": -1598.7427978515625, + "eval_logps/rejected": -1402.7869873046875, + "eval_loss": 0.6482938528060913, + "eval_rewards/accuracies": 0.6428571343421936, + "eval_rewards/chosen": -0.361937552690506, + "eval_rewards/margins": 0.14928516745567322, + "eval_rewards/rejected": -0.5112226605415344, + "eval_runtime": 221.8751, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 0.284, + "step": 4300 + }, + { + "epoch": 1.13, + "learning_rate": 2.3784180453824414e-06, + "logits/chosen": -2.6598381996154785, + "logits/rejected": -2.6440939903259277, + "logps/chosen": -1149.7069091796875, + "logps/rejected": -1251.6026611328125, + "loss": 0.6159, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3814155161380768, + "rewards/margins": 0.27214717864990234, + "rewards/rejected": -0.6535626649856567, + "step": 4310 + }, + { + "epoch": 1.13, + "learning_rate": 2.367012216545638e-06, + "logits/chosen": -2.7360281944274902, + "logits/rejected": -2.704324722290039, + "logps/chosen": -1364.4183349609375, + "logps/rejected": -1276.6588134765625, + "loss": 0.6425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3955642879009247, + "rewards/margins": 0.0812029018998146, + "rewards/rejected": -0.4767672121524811, + "step": 4320 + }, + { + "epoch": 1.13, + "learning_rate": 2.3556091630264294e-06, + "logits/chosen": -2.708026885986328, + "logits/rejected": -2.715130567550659, + "logps/chosen": -1570.659912109375, + "logps/rejected": -1440.263427734375, + "loss": 0.6338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4111559987068176, + "rewards/margins": 0.19120827317237854, + "rewards/rejected": -0.6023643016815186, + "step": 4330 + }, + { + "epoch": 1.14, + "learning_rate": 2.344209122794757e-06, + "logits/chosen": -2.731186866760254, + "logits/rejected": -2.7232601642608643, + "logps/chosen": -1690.5205078125, + "logps/rejected": -1551.1507568359375, + "loss": 0.6155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.395660936832428, + "rewards/margins": 0.2794150114059448, + "rewards/rejected": -0.675075888633728, + "step": 4340 + }, + { + "epoch": 1.14, + "learning_rate": 2.3328123337576787e-06, + "logits/chosen": -2.6726715564727783, + "logits/rejected": -2.6711506843566895, + "logps/chosen": -1203.287109375, + "logps/rejected": -1226.397705078125, + "loss": 0.6488, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4064127504825592, + "rewards/margins": 0.12786249816417694, + "rewards/rejected": -0.534275233745575, + "step": 4350 + }, + { + "epoch": 1.14, + "learning_rate": 2.3214190337544017e-06, + "logits/chosen": -2.7237210273742676, + "logits/rejected": -2.7012486457824707, + "logps/chosen": -1373.9417724609375, + "logps/rejected": -1111.353271484375, + "loss": 0.6324, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3576543927192688, + "rewards/margins": 0.24221619963645935, + "rewards/rejected": -0.5998705625534058, + "step": 4360 + }, + { + "epoch": 1.14, + "learning_rate": 2.310029460551323e-06, + "logits/chosen": -2.670630693435669, + "logits/rejected": -2.6639411449432373, + "logps/chosen": -1617.3814697265625, + "logps/rejected": -1109.9146728515625, + "loss": 0.6155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3107014298439026, + "rewards/margins": 0.20053979754447937, + "rewards/rejected": -0.5112412571907043, + "step": 4370 + }, + { + "epoch": 1.15, + "learning_rate": 2.2986438518370645e-06, + "logits/chosen": -2.6928963661193848, + "logits/rejected": -2.675933361053467, + "logps/chosen": -1699.2435302734375, + "logps/rejected": -1564.343017578125, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40069809556007385, + "rewards/margins": 0.17530310153961182, + "rewards/rejected": -0.5760011672973633, + "step": 4380 + }, + { + "epoch": 1.15, + "learning_rate": 2.2872624452175123e-06, + "logits/chosen": -2.687253475189209, + "logits/rejected": -2.6814191341400146, + "logps/chosen": -1764.602783203125, + "logps/rejected": -1355.3795166015625, + "loss": 0.6642, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4385541081428528, + "rewards/margins": 0.19212810695171356, + "rewards/rejected": -0.6306821703910828, + "step": 4390 + }, + { + "epoch": 1.15, + "learning_rate": 2.2758854782108584e-06, + "logits/chosen": -2.6809728145599365, + "logits/rejected": -2.6859257221221924, + "logps/chosen": -1223.7222900390625, + "logps/rejected": -1274.3193359375, + "loss": 0.6355, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5250788927078247, + "rewards/margins": 0.08925594389438629, + "rewards/rejected": -0.6143348217010498, + "step": 4400 + }, + { + "epoch": 1.15, + "eval_logits/chosen": -2.7000913619995117, + "eval_logits/rejected": -2.691587209701538, + "eval_logps/chosen": -1602.953125, + "eval_logps/rejected": -1407.248046875, + "eval_loss": 0.6476736068725586, + "eval_rewards/accuracies": 0.6269841194152832, + "eval_rewards/chosen": -0.4040408134460449, + "eval_rewards/margins": 0.15179233253002167, + "eval_rewards/rejected": -0.5558331608772278, + "eval_runtime": 221.9412, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 4400 + }, + { + "epoch": 1.15, + "learning_rate": 2.2645131882426458e-06, + "logits/chosen": -2.6515867710113525, + "logits/rejected": -2.6461291313171387, + "logps/chosen": -1675.511962890625, + "logps/rejected": -1306.486083984375, + "loss": 0.6526, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4507780969142914, + "rewards/margins": 0.0885946974158287, + "rewards/rejected": -0.5393728017807007, + "step": 4410 + }, + { + "epoch": 1.16, + "learning_rate": 2.2531458126408154e-06, + "logits/chosen": -2.696350574493408, + "logits/rejected": -2.671020984649658, + "logps/chosen": -1504.6510009765625, + "logps/rejected": -1417.209228515625, + "loss": 0.6406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44704046845436096, + "rewards/margins": 0.16284236311912537, + "rewards/rejected": -0.6098828315734863, + "step": 4420 + }, + { + "epoch": 1.16, + "learning_rate": 2.2417835886307452e-06, + "logits/chosen": -2.6964614391326904, + "logits/rejected": -2.693498134613037, + "logps/chosen": -1594.204345703125, + "logps/rejected": -1482.2950439453125, + "loss": 0.6424, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.412257581949234, + "rewards/margins": 0.11970730125904083, + "rewards/rejected": -0.5319648385047913, + "step": 4430 + }, + { + "epoch": 1.16, + "learning_rate": 2.2304267533303075e-06, + "logits/chosen": -2.7227253913879395, + "logits/rejected": -2.717461109161377, + "logps/chosen": -1820.9801025390625, + "logps/rejected": -1700.9222412109375, + "loss": 0.6213, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4056100845336914, + "rewards/margins": 0.19048570096492767, + "rewards/rejected": -0.5960958003997803, + "step": 4440 + }, + { + "epoch": 1.16, + "learning_rate": 2.219075543744918e-06, + "logits/chosen": -2.6950507164001465, + "logits/rejected": -2.686513662338257, + "logps/chosen": -1661.6273193359375, + "logps/rejected": -1578.125, + "loss": 0.6407, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3556327819824219, + "rewards/margins": 0.23430833220481873, + "rewards/rejected": -0.589941143989563, + "step": 4450 + }, + { + "epoch": 1.17, + "learning_rate": 2.207730196762589e-06, + "logits/chosen": -2.685410976409912, + "logits/rejected": -2.6853833198547363, + "logps/chosen": -1636.17236328125, + "logps/rejected": -1485.104736328125, + "loss": 0.6347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3346371650695801, + "rewards/margins": 0.1579541265964508, + "rewards/rejected": -0.49259132146835327, + "step": 4460 + }, + { + "epoch": 1.17, + "learning_rate": 2.1963909491489846e-06, + "logits/chosen": -2.629254102706909, + "logits/rejected": -2.631855010986328, + "logps/chosen": -1437.640625, + "logps/rejected": -1336.6815185546875, + "loss": 0.6243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.540132462978363, + "rewards/margins": 0.1326352059841156, + "rewards/rejected": -0.672767698764801, + "step": 4470 + }, + { + "epoch": 1.17, + "learning_rate": 2.185058037542486e-06, + "logits/chosen": -2.6665568351745605, + "logits/rejected": -2.672071933746338, + "logps/chosen": -1345.5667724609375, + "logps/rejected": -1157.662109375, + "loss": 0.6238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3904740512371063, + "rewards/margins": 0.15949508547782898, + "rewards/rejected": -0.5499691367149353, + "step": 4480 + }, + { + "epoch": 1.18, + "learning_rate": 2.173731698449244e-06, + "logits/chosen": -2.7047767639160156, + "logits/rejected": -2.7059977054595947, + "logps/chosen": -1669.259765625, + "logps/rejected": -1611.8948974609375, + "loss": 0.6342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39055395126342773, + "rewards/margins": 0.24455150961875916, + "rewards/rejected": -0.6351054310798645, + "step": 4490 + }, + { + "epoch": 1.18, + "learning_rate": 2.1624121682382495e-06, + "logits/chosen": -2.6913721561431885, + "logits/rejected": -2.680323839187622, + "logps/chosen": -1435.3963623046875, + "logps/rejected": -1044.6759033203125, + "loss": 0.6187, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.42885956168174744, + "rewards/margins": 0.17149262130260468, + "rewards/rejected": -0.6003522276878357, + "step": 4500 + }, + { + "epoch": 1.18, + "eval_logits/chosen": -2.6962525844573975, + "eval_logits/rejected": -2.6882517337799072, + "eval_logps/chosen": -1603.0440673828125, + "eval_logps/rejected": -1407.0084228515625, + "eval_loss": 0.6472293138504028, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.40495049953460693, + "eval_rewards/margins": 0.14848746359348297, + "eval_rewards/rejected": -0.5534379482269287, + "eval_runtime": 222.0323, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 4500 + }, + { + "epoch": 1.18, + "learning_rate": 2.1510996831363993e-06, + "logits/chosen": -2.654839277267456, + "logits/rejected": -2.654435157775879, + "logps/chosen": -1602.6588134765625, + "logps/rejected": -1460.040771484375, + "loss": 0.6251, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31933337450027466, + "rewards/margins": 0.2544369697570801, + "rewards/rejected": -0.5737703442573547, + "step": 4510 + }, + { + "epoch": 1.18, + "learning_rate": 2.139794479223565e-06, + "logits/chosen": -2.6878533363342285, + "logits/rejected": -2.700411319732666, + "logps/chosen": -1505.231201171875, + "logps/rejected": -1499.4599609375, + "loss": 0.6241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3987555503845215, + "rewards/margins": 0.07872674614191055, + "rewards/rejected": -0.47748225927352905, + "step": 4520 + }, + { + "epoch": 1.19, + "learning_rate": 2.128496792427669e-06, + "logits/chosen": -2.702573299407959, + "logits/rejected": -2.6977837085723877, + "logps/chosen": -1317.4459228515625, + "logps/rejected": -1278.5416259765625, + "loss": 0.6258, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5213514566421509, + "rewards/margins": 0.08871433138847351, + "rewards/rejected": -0.610065758228302, + "step": 4530 + }, + { + "epoch": 1.19, + "learning_rate": 2.117206858519758e-06, + "logits/chosen": -2.70503568649292, + "logits/rejected": -2.6909327507019043, + "logps/chosen": -2104.00927734375, + "logps/rejected": -1710.705078125, + "loss": 0.627, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29126936197280884, + "rewards/margins": 0.23307795822620392, + "rewards/rejected": -0.5243473052978516, + "step": 4540 + }, + { + "epoch": 1.19, + "learning_rate": 2.1059249131090844e-06, + "logits/chosen": -2.7203164100646973, + "logits/rejected": -2.7143332958221436, + "logps/chosen": -1671.8541259765625, + "logps/rejected": -1462.7237548828125, + "loss": 0.6644, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.35251596570014954, + "rewards/margins": 0.12573233246803284, + "rewards/rejected": -0.4782482981681824, + "step": 4550 + }, + { + "epoch": 1.19, + "learning_rate": 2.094651191638189e-06, + "logits/chosen": -2.7122576236724854, + "logits/rejected": -2.712846517562866, + "logps/chosen": -1451.910888671875, + "logps/rejected": -1314.6732177734375, + "loss": 0.6382, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3421178162097931, + "rewards/margins": 0.19824132323265076, + "rewards/rejected": -0.5403591990470886, + "step": 4560 + }, + { + "epoch": 1.2, + "learning_rate": 2.0833859293779867e-06, + "logits/chosen": -2.73149037361145, + "logits/rejected": -2.7085766792297363, + "logps/chosen": -1928.4527587890625, + "logps/rejected": -1434.140380859375, + "loss": 0.6435, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3224185109138489, + "rewards/margins": 0.23348590731620789, + "rewards/rejected": -0.5559044480323792, + "step": 4570 + }, + { + "epoch": 1.2, + "learning_rate": 2.0721293614228568e-06, + "logits/chosen": -2.691683292388916, + "logits/rejected": -2.675884246826172, + "logps/chosen": -1427.173828125, + "logps/rejected": -1134.955078125, + "loss": 0.6332, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2849566340446472, + "rewards/margins": 0.2647608816623688, + "rewards/rejected": -0.5497175455093384, + "step": 4580 + }, + { + "epoch": 1.2, + "learning_rate": 2.060881722685742e-06, + "logits/chosen": -2.730750560760498, + "logits/rejected": -2.7235236167907715, + "logps/chosen": -1560.0443115234375, + "logps/rejected": -1230.85693359375, + "loss": 0.6558, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.38310301303863525, + "rewards/margins": 0.09956183284521103, + "rewards/rejected": -0.48266488313674927, + "step": 4590 + }, + { + "epoch": 1.2, + "learning_rate": 2.049643247893235e-06, + "logits/chosen": -2.7023532390594482, + "logits/rejected": -2.7045681476593018, + "logps/chosen": -1575.8134765625, + "logps/rejected": -1411.7012939453125, + "loss": 0.6555, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5016661882400513, + "rewards/margins": 0.09470875561237335, + "rewards/rejected": -0.5963749289512634, + "step": 4600 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -2.716783285140991, + "eval_logits/rejected": -2.707549810409546, + "eval_logps/chosen": -1601.382568359375, + "eval_logps/rejected": -1405.2078857421875, + "eval_loss": 0.6472097635269165, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.38833513855934143, + "eval_rewards/margins": 0.14709699153900146, + "eval_rewards/rejected": -0.5354321002960205, + "eval_runtime": 221.9388, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 4600 + }, + { + "epoch": 1.21, + "learning_rate": 2.0384141715806903e-06, + "logits/chosen": -2.6752729415893555, + "logits/rejected": -2.6681485176086426, + "logps/chosen": -1369.5797119140625, + "logps/rejected": -1186.574462890625, + "loss": 0.6345, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3633851408958435, + "rewards/margins": 0.12907521426677704, + "rewards/rejected": -0.49246034026145935, + "step": 4610 + }, + { + "epoch": 1.21, + "learning_rate": 2.0271947280873255e-06, + "logits/chosen": -2.70173978805542, + "logits/rejected": -2.6857197284698486, + "logps/chosen": -1928.5341796875, + "logps/rejected": -1503.6873779296875, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3070146143436432, + "rewards/margins": 0.21328814327716827, + "rewards/rejected": -0.5203026533126831, + "step": 4620 + }, + { + "epoch": 1.21, + "learning_rate": 2.0159851515513302e-06, + "logits/chosen": -2.7439606189727783, + "logits/rejected": -2.725101947784424, + "logps/chosen": -1525.56787109375, + "logps/rejected": -1400.937255859375, + "loss": 0.65, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3897179365158081, + "rewards/margins": 0.20567400753498077, + "rewards/rejected": -0.5953919291496277, + "step": 4630 + }, + { + "epoch": 1.21, + "learning_rate": 2.004785675904982e-06, + "logits/chosen": -2.7022523880004883, + "logits/rejected": -2.7149405479431152, + "logps/chosen": -1141.4708251953125, + "logps/rejected": -1319.975341796875, + "loss": 0.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40314921736717224, + "rewards/margins": 0.05639972165226936, + "rewards/rejected": -0.4595489501953125, + "step": 4640 + }, + { + "epoch": 1.22, + "learning_rate": 1.9935965348697624e-06, + "logits/chosen": -2.7120418548583984, + "logits/rejected": -2.6966214179992676, + "logps/chosen": -1568.3426513671875, + "logps/rejected": -1232.408447265625, + "loss": 0.6226, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.382880836725235, + "rewards/margins": 0.1441594809293747, + "rewards/rejected": -0.5270403027534485, + "step": 4650 + }, + { + "epoch": 1.22, + "learning_rate": 1.9824179619514807e-06, + "logits/chosen": -2.712540864944458, + "logits/rejected": -2.70318341255188, + "logps/chosen": -1457.201904296875, + "logps/rejected": -1480.9552001953125, + "loss": 0.6325, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4275636076927185, + "rewards/margins": 0.03508736938238144, + "rewards/rejected": -0.46265095472335815, + "step": 4660 + }, + { + "epoch": 1.22, + "learning_rate": 1.9712501904354004e-06, + "logits/chosen": -2.711000919342041, + "logits/rejected": -2.7133917808532715, + "logps/chosen": -1551.674072265625, + "logps/rejected": -1135.0037841796875, + "loss": 0.6527, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4668704867362976, + "rewards/margins": 0.1215348094701767, + "rewards/rejected": -0.5884053111076355, + "step": 4670 + }, + { + "epoch": 1.22, + "learning_rate": 1.960093453381369e-06, + "logits/chosen": -2.6892762184143066, + "logits/rejected": -2.684814214706421, + "logps/chosen": -1495.062255859375, + "logps/rejected": -1371.4951171875, + "loss": 0.6442, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5431568026542664, + "rewards/margins": 0.0739276260137558, + "rewards/rejected": -0.6170844435691833, + "step": 4680 + }, + { + "epoch": 1.23, + "learning_rate": 1.948947983618962e-06, + "logits/chosen": -2.6771562099456787, + "logits/rejected": -2.682755708694458, + "logps/chosen": -1703.9365234375, + "logps/rejected": -1336.865234375, + "loss": 0.6401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4650822579860687, + "rewards/margins": 0.1413726508617401, + "rewards/rejected": -0.6064549088478088, + "step": 4690 + }, + { + "epoch": 1.23, + "learning_rate": 1.937814013742611e-06, + "logits/chosen": -2.6763648986816406, + "logits/rejected": -2.669142246246338, + "logps/chosen": -1396.1729736328125, + "logps/rejected": -1169.147705078125, + "loss": 0.6178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.45917263627052307, + "rewards/margins": 0.19413240253925323, + "rewards/rejected": -0.6533050537109375, + "step": 4700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -2.700589179992676, + "eval_logits/rejected": -2.6911513805389404, + "eval_logps/chosen": -1602.476318359375, + "eval_logps/rejected": -1405.8092041015625, + "eval_loss": 0.6476128101348877, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -0.39927202463150024, + "eval_rewards/margins": 0.1421724110841751, + "eval_rewards/rejected": -0.5414443612098694, + "eval_runtime": 222.0165, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.284, + "step": 4700 + }, + { + "epoch": 1.23, + "learning_rate": 1.9266917761067617e-06, + "logits/chosen": -2.6928534507751465, + "logits/rejected": -2.695483922958374, + "logps/chosen": -1367.698974609375, + "logps/rejected": -1266.385986328125, + "loss": 0.6361, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3876931071281433, + "rewards/margins": 0.1019570380449295, + "rewards/rejected": -0.489650160074234, + "step": 4710 + }, + { + "epoch": 1.24, + "learning_rate": 1.915581502821017e-06, + "logits/chosen": -2.7133240699768066, + "logits/rejected": -2.7143406867980957, + "logps/chosen": -1590.260009765625, + "logps/rejected": -1481.0340576171875, + "loss": 0.6478, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43118634819984436, + "rewards/margins": 0.1839209347963333, + "rewards/rejected": -0.6151072978973389, + "step": 4720 + }, + { + "epoch": 1.24, + "learning_rate": 1.9044834257452997e-06, + "logits/chosen": -2.6916699409484863, + "logits/rejected": -2.6897857189178467, + "logps/chosen": -1501.4925537109375, + "logps/rejected": -1478.397216796875, + "loss": 0.6044, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3613813817501068, + "rewards/margins": 0.3248251974582672, + "rewards/rejected": -0.6862064599990845, + "step": 4730 + }, + { + "epoch": 1.24, + "learning_rate": 1.893397776485006e-06, + "logits/chosen": -2.7368080615997314, + "logits/rejected": -2.731428861618042, + "logps/chosen": -1889.243408203125, + "logps/rejected": -1275.2314453125, + "loss": 0.6557, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35338732600212097, + "rewards/margins": 0.1933208405971527, + "rewards/rejected": -0.5467082262039185, + "step": 4740 + }, + { + "epoch": 1.24, + "learning_rate": 1.8823247863861804e-06, + "logits/chosen": -2.7085061073303223, + "logits/rejected": -2.6927361488342285, + "logps/chosen": -1784.7685546875, + "logps/rejected": -1247.2198486328125, + "loss": 0.6375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.37581080198287964, + "rewards/margins": 0.2545829117298126, + "rewards/rejected": -0.6303936839103699, + "step": 4750 + }, + { + "epoch": 1.25, + "learning_rate": 1.8712646865306822e-06, + "logits/chosen": -2.6886403560638428, + "logits/rejected": -2.6807615756988525, + "logps/chosen": -1332.3145751953125, + "logps/rejected": -1414.520751953125, + "loss": 0.6221, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.40824103355407715, + "rewards/margins": 0.08367902040481567, + "rewards/rejected": -0.4919200539588928, + "step": 4760 + }, + { + "epoch": 1.25, + "learning_rate": 1.8602177077313631e-06, + "logits/chosen": -2.7041351795196533, + "logits/rejected": -2.6894426345825195, + "logps/chosen": -1375.96435546875, + "logps/rejected": -1346.1580810546875, + "loss": 0.636, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24864885210990906, + "rewards/margins": 0.2993265390396118, + "rewards/rejected": -0.5479754209518433, + "step": 4770 + }, + { + "epoch": 1.25, + "learning_rate": 1.8491840805272546e-06, + "logits/chosen": -2.7112724781036377, + "logits/rejected": -2.718292474746704, + "logps/chosen": -1560.8367919921875, + "logps/rejected": -1538.97802734375, + "loss": 0.6279, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.43749088048934937, + "rewards/margins": 0.12982013821601868, + "rewards/rejected": -0.5673110485076904, + "step": 4780 + }, + { + "epoch": 1.25, + "learning_rate": 1.8381640351787516e-06, + "logits/chosen": -2.7069056034088135, + "logits/rejected": -2.6937079429626465, + "logps/chosen": -1586.96630859375, + "logps/rejected": -1298.62890625, + "loss": 0.6174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38084354996681213, + "rewards/margins": 0.25583982467651367, + "rewards/rejected": -0.6366834044456482, + "step": 4790 + }, + { + "epoch": 1.26, + "learning_rate": 1.8271578016628122e-06, + "logits/chosen": -2.700934886932373, + "logits/rejected": -2.695272207260132, + "logps/chosen": -1409.424560546875, + "logps/rejected": -1458.7320556640625, + "loss": 0.6242, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39179927110671997, + "rewards/margins": 0.16118505597114563, + "rewards/rejected": -0.552984356880188, + "step": 4800 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -2.701568841934204, + "eval_logits/rejected": -2.6917405128479004, + "eval_logps/chosen": -1605.5714111328125, + "eval_logps/rejected": -1409.126708984375, + "eval_loss": 0.6477026343345642, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.4302244782447815, + "eval_rewards/margins": 0.14439751207828522, + "eval_rewards/rejected": -0.5746219158172607, + "eval_runtime": 222.0078, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 4800 + }, + { + "epoch": 1.26, + "learning_rate": 1.8161656096681546e-06, + "logits/chosen": -2.6858129501342773, + "logits/rejected": -2.65826416015625, + "logps/chosen": -1292.4794921875, + "logps/rejected": -886.6090087890625, + "loss": 0.635, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5286234021186829, + "rewards/margins": 0.1071949228644371, + "rewards/rejected": -0.6358182430267334, + "step": 4810 + }, + { + "epoch": 1.26, + "learning_rate": 1.8051876885904645e-06, + "logits/chosen": -2.6940348148345947, + "logits/rejected": -2.668518304824829, + "logps/chosen": -1209.8846435546875, + "logps/rejected": -1209.081298828125, + "loss": 0.6463, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.4978295862674713, + "rewards/margins": 0.07090970128774643, + "rewards/rejected": -0.5687392950057983, + "step": 4820 + }, + { + "epoch": 1.26, + "learning_rate": 1.7942242675276098e-06, + "logits/chosen": -2.691037893295288, + "logits/rejected": -2.698622941970825, + "logps/chosen": -1303.959228515625, + "logps/rejected": -1136.2342529296875, + "loss": 0.6618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4843706488609314, + "rewards/margins": 0.10510985553264618, + "rewards/rejected": -0.5894805788993835, + "step": 4830 + }, + { + "epoch": 1.27, + "learning_rate": 1.783275575274856e-06, + "logits/chosen": -2.7178115844726562, + "logits/rejected": -2.704846143722534, + "logps/chosen": -1262.106689453125, + "logps/rejected": -1242.8951416015625, + "loss": 0.645, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4795067310333252, + "rewards/margins": 0.03653149679303169, + "rewards/rejected": -0.5160382390022278, + "step": 4840 + }, + { + "epoch": 1.27, + "learning_rate": 1.7723418403200943e-06, + "logits/chosen": -2.6975011825561523, + "logits/rejected": -2.6939289569854736, + "logps/chosen": -1763.947265625, + "logps/rejected": -1696.672607421875, + "loss": 0.6684, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.46184077858924866, + "rewards/margins": 0.04751107469201088, + "rewards/rejected": -0.509351909160614, + "step": 4850 + }, + { + "epoch": 1.27, + "learning_rate": 1.7614232908390748e-06, + "logits/chosen": -2.7259678840637207, + "logits/rejected": -2.714102268218994, + "logps/chosen": -1708.26953125, + "logps/rejected": -1277.268798828125, + "loss": 0.6668, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.339743971824646, + "rewards/margins": 0.13007903099060059, + "rewards/rejected": -0.4698229730129242, + "step": 4860 + }, + { + "epoch": 1.27, + "learning_rate": 1.7505201546906398e-06, + "logits/chosen": -2.728283405303955, + "logits/rejected": -2.717923641204834, + "logps/chosen": -1363.5814208984375, + "logps/rejected": -1250.927734375, + "loss": 0.6323, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3779984414577484, + "rewards/margins": 0.18015049397945404, + "rewards/rejected": -0.5581489205360413, + "step": 4870 + }, + { + "epoch": 1.28, + "learning_rate": 1.7396326594119717e-06, + "logits/chosen": -2.637516975402832, + "logits/rejected": -2.659529209136963, + "logps/chosen": -1411.4888916015625, + "logps/rejected": -1366.10205078125, + "loss": 0.6638, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.36083534359931946, + "rewards/margins": 0.07913025468587875, + "rewards/rejected": -0.4399656355381012, + "step": 4880 + }, + { + "epoch": 1.28, + "learning_rate": 1.7287610322138449e-06, + "logits/chosen": -2.7112040519714355, + "logits/rejected": -2.6832072734832764, + "logps/chosen": -1514.82177734375, + "logps/rejected": -1104.4573974609375, + "loss": 0.6283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35094964504241943, + "rewards/margins": 0.20894651114940643, + "rewards/rejected": -0.5598961710929871, + "step": 4890 + }, + { + "epoch": 1.28, + "learning_rate": 1.7179054999758817e-06, + "logits/chosen": -2.6823577880859375, + "logits/rejected": -2.677356243133545, + "logps/chosen": -1602.299072265625, + "logps/rejected": -1288.768310546875, + "loss": 0.6221, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35732200741767883, + "rewards/margins": 0.25401774048805237, + "rewards/rejected": -0.6113396883010864, + "step": 4900 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -2.716733694076538, + "eval_logits/rejected": -2.707334041595459, + "eval_logps/chosen": -1601.0272216796875, + "eval_logps/rejected": -1404.6871337890625, + "eval_loss": 0.6463930606842041, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.38478225469589233, + "eval_rewards/margins": 0.14544257521629333, + "eval_rewards/rejected": -0.5302248001098633, + "eval_runtime": 221.9549, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 4900 + }, + { + "epoch": 1.29, + "learning_rate": 1.7070662892418225e-06, + "logits/chosen": -2.7210943698883057, + "logits/rejected": -2.6983580589294434, + "logps/chosen": -1570.259521484375, + "logps/rejected": -1662.6558837890625, + "loss": 0.6416, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4329853951931, + "rewards/margins": 0.20024752616882324, + "rewards/rejected": -0.6332329511642456, + "step": 4910 + }, + { + "epoch": 1.29, + "learning_rate": 1.6962436262147913e-06, + "logits/chosen": -2.7212061882019043, + "logits/rejected": -2.7054669857025146, + "logps/chosen": -1921.957763671875, + "logps/rejected": -1697.683349609375, + "loss": 0.6421, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2484970986843109, + "rewards/margins": 0.2698536217212677, + "rewards/rejected": -0.5183507204055786, + "step": 4920 + }, + { + "epoch": 1.29, + "learning_rate": 1.6854377367525814e-06, + "logits/chosen": -2.6766185760498047, + "logits/rejected": -2.678335428237915, + "logps/chosen": -1425.6380615234375, + "logps/rejected": -1139.2139892578125, + "loss": 0.6755, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5290840268135071, + "rewards/margins": 0.013314229436218739, + "rewards/rejected": -0.54239821434021, + "step": 4930 + }, + { + "epoch": 1.29, + "learning_rate": 1.6746488463629362e-06, + "logits/chosen": -2.6874043941497803, + "logits/rejected": -2.697096824645996, + "logps/chosen": -1432.6015625, + "logps/rejected": -1319.1143798828125, + "loss": 0.6488, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4031291902065277, + "rewards/margins": 0.18114542961120605, + "rewards/rejected": -0.5842746496200562, + "step": 4940 + }, + { + "epoch": 1.3, + "learning_rate": 1.6638771801988483e-06, + "logits/chosen": -2.7338335514068604, + "logits/rejected": -2.7252652645111084, + "logps/chosen": -1705.234375, + "logps/rejected": -1308.44384765625, + "loss": 0.6515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38323846459388733, + "rewards/margins": 0.1837080419063568, + "rewards/rejected": -0.5669465065002441, + "step": 4950 + }, + { + "epoch": 1.3, + "learning_rate": 1.653122963053857e-06, + "logits/chosen": -2.679515838623047, + "logits/rejected": -2.6913959980010986, + "logps/chosen": -1297.17138671875, + "logps/rejected": -1433.731201171875, + "loss": 0.663, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.36984187364578247, + "rewards/margins": 0.11996430158615112, + "rewards/rejected": -0.48980623483657837, + "step": 4960 + }, + { + "epoch": 1.3, + "learning_rate": 1.6423864193573606e-06, + "logits/chosen": -2.732271909713745, + "logits/rejected": -2.7238974571228027, + "logps/chosen": -1768.8033447265625, + "logps/rejected": -1411.5208740234375, + "loss": 0.6216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5462476015090942, + "rewards/margins": 0.15834124386310577, + "rewards/rejected": -0.7045888304710388, + "step": 4970 + }, + { + "epoch": 1.3, + "learning_rate": 1.6316677731699286e-06, + "logits/chosen": -2.7058351039886475, + "logits/rejected": -2.678009271621704, + "logps/chosen": -1341.9267578125, + "logps/rejected": -1204.485595703125, + "loss": 0.614, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4298137128353119, + "rewards/margins": 0.18093213438987732, + "rewards/rejected": -0.6107458472251892, + "step": 4980 + }, + { + "epoch": 1.31, + "learning_rate": 1.6209672481786302e-06, + "logits/chosen": -2.707846164703369, + "logits/rejected": -2.6903176307678223, + "logps/chosen": -1449.9866943359375, + "logps/rejected": -1365.045166015625, + "loss": 0.6434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43810850381851196, + "rewards/margins": 0.18651129305362701, + "rewards/rejected": -0.6246197819709778, + "step": 4990 + }, + { + "epoch": 1.31, + "learning_rate": 1.6102850676923616e-06, + "logits/chosen": -2.7575032711029053, + "logits/rejected": -2.7509520053863525, + "logps/chosen": -1555.2666015625, + "logps/rejected": -1557.8531494140625, + "loss": 0.6582, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.43144112825393677, + "rewards/margins": 0.06042899936437607, + "rewards/rejected": -0.49187007546424866, + "step": 5000 + }, + { + "epoch": 1.31, + "eval_logits/chosen": -2.7267844676971436, + "eval_logits/rejected": -2.717449903488159, + "eval_logps/chosen": -1602.501220703125, + "eval_logps/rejected": -1406.292724609375, + "eval_loss": 0.6459673643112183, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.39952224493026733, + "eval_rewards/margins": 0.14675946533679962, + "eval_rewards/rejected": -0.5462816953659058, + "eval_runtime": 222.1193, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 0.284, + "step": 5000 + }, + { + "epoch": 1.31, + "learning_rate": 1.5996214546371888e-06, + "logits/chosen": -2.741490602493286, + "logits/rejected": -2.730355978012085, + "logps/chosen": -1695.7152099609375, + "logps/rejected": -1342.337158203125, + "loss": 0.6385, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.36179646849632263, + "rewards/margins": 0.20229323208332062, + "rewards/rejected": -0.5640896558761597, + "step": 5010 + }, + { + "epoch": 1.31, + "learning_rate": 1.588976631551697e-06, + "logits/chosen": -2.6824703216552734, + "logits/rejected": -2.6855571269989014, + "logps/chosen": -1382.963623046875, + "logps/rejected": -1372.467041015625, + "loss": 0.6429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4259725511074066, + "rewards/margins": 0.19692833721637726, + "rewards/rejected": -0.6229008436203003, + "step": 5020 + }, + { + "epoch": 1.32, + "learning_rate": 1.5783508205823412e-06, + "logits/chosen": -2.7435383796691895, + "logits/rejected": -2.740779399871826, + "logps/chosen": -1508.678466796875, + "logps/rejected": -1334.25, + "loss": 0.6218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.382813036441803, + "rewards/margins": 0.24260124564170837, + "rewards/rejected": -0.625414252281189, + "step": 5030 + }, + { + "epoch": 1.32, + "learning_rate": 1.5677442434788143e-06, + "logits/chosen": -2.7308874130249023, + "logits/rejected": -2.7374587059020996, + "logps/chosen": -1877.0191650390625, + "logps/rejected": -1829.716796875, + "loss": 0.6812, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4555627703666687, + "rewards/margins": 0.05523302033543587, + "rewards/rejected": -0.5107957720756531, + "step": 5040 + }, + { + "epoch": 1.32, + "learning_rate": 1.5571571215894181e-06, + "logits/chosen": -2.740858316421509, + "logits/rejected": -2.7242140769958496, + "logps/chosen": -1411.8358154296875, + "logps/rejected": -1487.0919189453125, + "loss": 0.6256, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.36944401264190674, + "rewards/margins": 0.280577152967453, + "rewards/rejected": -0.6500211954116821, + "step": 5050 + }, + { + "epoch": 1.32, + "learning_rate": 1.5465896758564452e-06, + "logits/chosen": -2.7520554065704346, + "logits/rejected": -2.75130295753479, + "logps/chosen": -1459.7698974609375, + "logps/rejected": -1445.945556640625, + "loss": 0.6289, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2901294529438019, + "rewards/margins": 0.2617509663105011, + "rewards/rejected": -0.551880419254303, + "step": 5060 + }, + { + "epoch": 1.33, + "learning_rate": 1.5360421268115653e-06, + "logits/chosen": -2.719130277633667, + "logits/rejected": -2.718916177749634, + "logps/chosen": -1367.1160888671875, + "logps/rejected": -1231.0194091796875, + "loss": 0.6296, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.273499459028244, + "rewards/margins": 0.24340489506721497, + "rewards/rejected": -0.516904354095459, + "step": 5070 + }, + { + "epoch": 1.33, + "learning_rate": 1.5255146945712267e-06, + "logits/chosen": -2.7351174354553223, + "logits/rejected": -2.7053210735321045, + "logps/chosen": -1379.465576171875, + "logps/rejected": -1388.5853271484375, + "loss": 0.6134, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.40679627656936646, + "rewards/margins": 0.19303181767463684, + "rewards/rejected": -0.5998281240463257, + "step": 5080 + }, + { + "epoch": 1.33, + "learning_rate": 1.5150075988320594e-06, + "logits/chosen": -2.7461514472961426, + "logits/rejected": -2.732804536819458, + "logps/chosen": -1556.8673095703125, + "logps/rejected": -1557.4312744140625, + "loss": 0.6263, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39684033393859863, + "rewards/margins": 0.14313337206840515, + "rewards/rejected": -0.539973795413971, + "step": 5090 + }, + { + "epoch": 1.33, + "learning_rate": 1.5045210588662929e-06, + "logits/chosen": -2.723940849304199, + "logits/rejected": -2.7165207862854004, + "logps/chosen": -1319.94970703125, + "logps/rejected": -1179.8677978515625, + "loss": 0.6276, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3562172055244446, + "rewards/margins": 0.20867054164409637, + "rewards/rejected": -0.5648878216743469, + "step": 5100 + }, + { + "epoch": 1.33, + "eval_logits/chosen": -2.728083372116089, + "eval_logits/rejected": -2.7191505432128906, + "eval_logps/chosen": -1603.0245361328125, + "eval_logps/rejected": -1407.0914306640625, + "eval_loss": 0.6458316445350647, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.4047529995441437, + "eval_rewards/margins": 0.1495141237974167, + "eval_rewards/rejected": -0.5542671084403992, + "eval_runtime": 221.9733, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 5100 + }, + { + "epoch": 1.34, + "learning_rate": 1.4940552935171781e-06, + "logits/chosen": -2.7510228157043457, + "logits/rejected": -2.733030319213867, + "logps/chosen": -1656.7183837890625, + "logps/rejected": -1355.869140625, + "loss": 0.6528, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3592923879623413, + "rewards/margins": 0.11819101870059967, + "rewards/rejected": -0.4774834215641022, + "step": 5110 + }, + { + "epoch": 1.34, + "learning_rate": 1.483610521194419e-06, + "logits/chosen": -2.7261557579040527, + "logits/rejected": -2.69697642326355, + "logps/chosen": -1745.705322265625, + "logps/rejected": -1580.485595703125, + "loss": 0.6212, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.409809410572052, + "rewards/margins": 0.17661479115486145, + "rewards/rejected": -0.5864241719245911, + "step": 5120 + }, + { + "epoch": 1.34, + "learning_rate": 1.4731869598696226e-06, + "logits/chosen": -2.73225998878479, + "logits/rejected": -2.716576099395752, + "logps/chosen": -1597.790771484375, + "logps/rejected": -1318.602294921875, + "loss": 0.6156, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3756243586540222, + "rewards/margins": 0.22246117889881134, + "rewards/rejected": -0.5980855226516724, + "step": 5130 + }, + { + "epoch": 1.35, + "learning_rate": 1.4627848270717387e-06, + "logits/chosen": -2.7219340801239014, + "logits/rejected": -2.7200512886047363, + "logps/chosen": -1253.752197265625, + "logps/rejected": -1163.3043212890625, + "loss": 0.6312, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45713797211647034, + "rewards/margins": 0.22213175892829895, + "rewards/rejected": -0.6792697310447693, + "step": 5140 + }, + { + "epoch": 1.35, + "learning_rate": 1.4524043398825277e-06, + "logits/chosen": -2.761448621749878, + "logits/rejected": -2.7348995208740234, + "logps/chosen": -1840.7236328125, + "logps/rejected": -1707.0986328125, + "loss": 0.6369, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2975284457206726, + "rewards/margins": 0.1861899346113205, + "rewards/rejected": -0.4837183952331543, + "step": 5150 + }, + { + "epoch": 1.35, + "learning_rate": 1.4420457149320299e-06, + "logits/chosen": -2.703780174255371, + "logits/rejected": -2.7139551639556885, + "logps/chosen": -1718.882080078125, + "logps/rejected": -1578.334716796875, + "loss": 0.6357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3426080644130707, + "rewards/margins": 0.20360854268074036, + "rewards/rejected": -0.5462166666984558, + "step": 5160 + }, + { + "epoch": 1.35, + "learning_rate": 1.431709168394042e-06, + "logits/chosen": -2.7347803115844727, + "logits/rejected": -2.735466480255127, + "logps/chosen": -1229.0181884765625, + "logps/rejected": -1180.7528076171875, + "loss": 0.6334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.385633647441864, + "rewards/margins": 0.1824651062488556, + "rewards/rejected": -0.5680987238883972, + "step": 5170 + }, + { + "epoch": 1.36, + "learning_rate": 1.4213949159816059e-06, + "logits/chosen": -2.721846580505371, + "logits/rejected": -2.7203266620635986, + "logps/chosen": -1505.1744384765625, + "logps/rejected": -1412.5858154296875, + "loss": 0.6487, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3777450621128082, + "rewards/margins": 0.15947818756103516, + "rewards/rejected": -0.537223219871521, + "step": 5180 + }, + { + "epoch": 1.36, + "learning_rate": 1.4111031729425103e-06, + "logits/chosen": -2.702693462371826, + "logits/rejected": -2.708313226699829, + "logps/chosen": -1602.787109375, + "logps/rejected": -1448.283447265625, + "loss": 0.6541, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5254753232002258, + "rewards/margins": 0.0998118668794632, + "rewards/rejected": -0.6252871751785278, + "step": 5190 + }, + { + "epoch": 1.36, + "learning_rate": 1.4008341540547965e-06, + "logits/chosen": -2.7064757347106934, + "logits/rejected": -2.6883506774902344, + "logps/chosen": -1634.73828125, + "logps/rejected": -1369.5660400390625, + "loss": 0.6573, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.37241652607917786, + "rewards/margins": 0.14691108465194702, + "rewards/rejected": -0.5193276405334473, + "step": 5200 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -2.7229835987091064, + "eval_logits/rejected": -2.7142302989959717, + "eval_logps/chosen": -1603.234375, + "eval_logps/rejected": -1407.468017578125, + "eval_loss": 0.6451988816261292, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.40685272216796875, + "eval_rewards/margins": 0.15118181705474854, + "eval_rewards/rejected": -0.5580345392227173, + "eval_runtime": 221.9736, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 5200 + }, + { + "epoch": 1.36, + "learning_rate": 1.3905880736222737e-06, + "logits/chosen": -2.7218070030212402, + "logits/rejected": -2.7174899578094482, + "logps/chosen": -1455.731689453125, + "logps/rejected": -1164.8289794921875, + "loss": 0.6396, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.40525612235069275, + "rewards/margins": 0.162841796875, + "rewards/rejected": -0.5680979490280151, + "step": 5210 + }, + { + "epoch": 1.37, + "learning_rate": 1.3803651454700531e-06, + "logits/chosen": -2.705242872238159, + "logits/rejected": -2.692960023880005, + "logps/chosen": -1358.7576904296875, + "logps/rejected": -1273.019287109375, + "loss": 0.6561, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.42546719312667847, + "rewards/margins": 0.03913971036672592, + "rewards/rejected": -0.4646069407463074, + "step": 5220 + }, + { + "epoch": 1.37, + "learning_rate": 1.3701655829400773e-06, + "logits/chosen": -2.7341837882995605, + "logits/rejected": -2.7173619270324707, + "logps/chosen": -1405.4193115234375, + "logps/rejected": -1325.1807861328125, + "loss": 0.6365, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.5209168195724487, + "rewards/margins": 0.011190772987902164, + "rewards/rejected": -0.5321077108383179, + "step": 5230 + }, + { + "epoch": 1.37, + "learning_rate": 1.3599895988866756e-06, + "logits/chosen": -2.714791774749756, + "logits/rejected": -2.696171283721924, + "logps/chosen": -1703.659912109375, + "logps/rejected": -1553.957275390625, + "loss": 0.6615, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.39020878076553345, + "rewards/margins": 0.12071947753429413, + "rewards/rejected": -0.5109282732009888, + "step": 5240 + }, + { + "epoch": 1.37, + "learning_rate": 1.3498374056721198e-06, + "logits/chosen": -2.6696650981903076, + "logits/rejected": -2.650538682937622, + "logps/chosen": -1428.1317138671875, + "logps/rejected": -1197.024658203125, + "loss": 0.6233, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4180733561515808, + "rewards/margins": 0.23636317253112793, + "rewards/rejected": -0.6544365286827087, + "step": 5250 + }, + { + "epoch": 1.38, + "learning_rate": 1.3397092151621883e-06, + "logits/chosen": -2.7149243354797363, + "logits/rejected": -2.6980550289154053, + "logps/chosen": -1731.099609375, + "logps/rejected": -1579.0975341796875, + "loss": 0.6072, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.41332024335861206, + "rewards/margins": 0.19287510216236115, + "rewards/rejected": -0.606195330619812, + "step": 5260 + }, + { + "epoch": 1.38, + "learning_rate": 1.3296052387217484e-06, + "logits/chosen": -2.7171783447265625, + "logits/rejected": -2.72312331199646, + "logps/chosen": -1473.4296875, + "logps/rejected": -1383.3031005859375, + "loss": 0.6632, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37961164116859436, + "rewards/margins": 0.08137451112270355, + "rewards/rejected": -0.4609861373901367, + "step": 5270 + }, + { + "epoch": 1.38, + "learning_rate": 1.3195256872103476e-06, + "logits/chosen": -2.7354798316955566, + "logits/rejected": -2.7498269081115723, + "logps/chosen": -1588.6763916015625, + "logps/rejected": -1422.8865966796875, + "loss": 0.6343, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42312923073768616, + "rewards/margins": 0.16869792342185974, + "rewards/rejected": -0.5918271541595459, + "step": 5280 + }, + { + "epoch": 1.38, + "learning_rate": 1.3094707709778068e-06, + "logits/chosen": -2.6907153129577637, + "logits/rejected": -2.6785695552825928, + "logps/chosen": -1365.05224609375, + "logps/rejected": -1313.2744140625, + "loss": 0.6147, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.47066640853881836, + "rewards/margins": 0.15152886509895325, + "rewards/rejected": -0.622195303440094, + "step": 5290 + }, + { + "epoch": 1.39, + "learning_rate": 1.2994406998598364e-06, + "logits/chosen": -2.7071175575256348, + "logits/rejected": -2.7032291889190674, + "logps/chosen": -1262.134765625, + "logps/rejected": -1245.59765625, + "loss": 0.6672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5115293860435486, + "rewards/margins": 0.11049805581569672, + "rewards/rejected": -0.6220273971557617, + "step": 5300 + }, + { + "epoch": 1.39, + "eval_logits/chosen": -2.707981586456299, + "eval_logits/rejected": -2.6997311115264893, + "eval_logps/chosen": -1602.744140625, + "eval_logps/rejected": -1406.7059326171875, + "eval_loss": 0.6457715034484863, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.4019514322280884, + "eval_rewards/margins": 0.14846062660217285, + "eval_rewards/rejected": -0.5504120588302612, + "eval_runtime": 222.0084, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 5300 + }, + { + "epoch": 1.39, + "learning_rate": 1.2894356831736558e-06, + "logits/chosen": -2.7404913902282715, + "logits/rejected": -2.7101473808288574, + "logps/chosen": -1680.30859375, + "logps/rejected": -1509.7115478515625, + "loss": 0.6541, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5058525800704956, + "rewards/margins": 0.10094550997018814, + "rewards/rejected": -0.6067981123924255, + "step": 5310 + }, + { + "epoch": 1.39, + "learning_rate": 1.2794559297136203e-06, + "logits/chosen": -2.7266457080841064, + "logits/rejected": -2.7247345447540283, + "logps/chosen": -1590.14111328125, + "logps/rejected": -1474.99755859375, + "loss": 0.6407, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41412800550460815, + "rewards/margins": 0.21516411006450653, + "rewards/rejected": -0.6292921304702759, + "step": 5320 + }, + { + "epoch": 1.39, + "learning_rate": 1.2695016477468724e-06, + "logits/chosen": -2.690953493118286, + "logits/rejected": -2.6827735900878906, + "logps/chosen": -1386.6346435546875, + "logps/rejected": -1365.217041015625, + "loss": 0.6586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48004859685897827, + "rewards/margins": 0.21122178435325623, + "rewards/rejected": -0.6912704110145569, + "step": 5330 + }, + { + "epoch": 1.4, + "learning_rate": 1.2595730450089874e-06, + "logits/chosen": -2.700653076171875, + "logits/rejected": -2.708189010620117, + "logps/chosen": -1437.6949462890625, + "logps/rejected": -1425.35546875, + "loss": 0.6465, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.37449318170547485, + "rewards/margins": 0.10607191175222397, + "rewards/rejected": -0.4805651307106018, + "step": 5340 + }, + { + "epoch": 1.4, + "learning_rate": 1.2496703286996433e-06, + "logits/chosen": -2.662972927093506, + "logits/rejected": -2.651554584503174, + "logps/chosen": -1539.100830078125, + "logps/rejected": -1469.132080078125, + "loss": 0.6183, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42751431465148926, + "rewards/margins": 0.19100254774093628, + "rewards/rejected": -0.6185168623924255, + "step": 5350 + }, + { + "epoch": 1.4, + "learning_rate": 1.2397937054782961e-06, + "logits/chosen": -2.6878347396850586, + "logits/rejected": -2.7087759971618652, + "logps/chosen": -1573.5694580078125, + "logps/rejected": -1411.982666015625, + "loss": 0.6867, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4747236669063568, + "rewards/margins": 0.060199446976184845, + "rewards/rejected": -0.5349230766296387, + "step": 5360 + }, + { + "epoch": 1.41, + "learning_rate": 1.2299433814598635e-06, + "logits/chosen": -2.719141960144043, + "logits/rejected": -2.701843738555908, + "logps/chosen": -1555.9644775390625, + "logps/rejected": -1262.3304443359375, + "loss": 0.6165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47410351037979126, + "rewards/margins": 0.08914776146411896, + "rewards/rejected": -0.5632511973381042, + "step": 5370 + }, + { + "epoch": 1.41, + "learning_rate": 1.2201195622104265e-06, + "logits/chosen": -2.7186007499694824, + "logits/rejected": -2.7116668224334717, + "logps/chosen": -1402.317138671875, + "logps/rejected": -1303.7572021484375, + "loss": 0.6388, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3963562846183777, + "rewards/margins": 0.08322183787822723, + "rewards/rejected": -0.4795781672000885, + "step": 5380 + }, + { + "epoch": 1.41, + "learning_rate": 1.2103224527429417e-06, + "logits/chosen": -2.724838972091675, + "logits/rejected": -2.7124266624450684, + "logps/chosen": -1474.955322265625, + "logps/rejected": -1274.307861328125, + "loss": 0.6386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4477899670600891, + "rewards/margins": 0.18380063772201538, + "rewards/rejected": -0.6315906047821045, + "step": 5390 + }, + { + "epoch": 1.41, + "learning_rate": 1.2005522575129559e-06, + "logits/chosen": -2.6977944374084473, + "logits/rejected": -2.684950113296509, + "logps/chosen": -1461.167724609375, + "logps/rejected": -1330.77490234375, + "loss": 0.6112, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3553297817707062, + "rewards/margins": 0.2838284373283386, + "rewards/rejected": -0.6391581892967224, + "step": 5400 + }, + { + "epoch": 1.41, + "eval_logits/chosen": -2.70361590385437, + "eval_logits/rejected": -2.695265054702759, + "eval_logps/chosen": -1602.899658203125, + "eval_logps/rejected": -1406.76318359375, + "eval_loss": 0.6459503173828125, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.4035067558288574, + "eval_rewards/margins": 0.14747834205627441, + "eval_rewards/rejected": -0.5509850978851318, + "eval_runtime": 221.9833, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.284, + "step": 5400 + }, + { + "epoch": 1.42, + "learning_rate": 1.1908091804143469e-06, + "logits/chosen": -2.6938157081604004, + "logits/rejected": -2.686267375946045, + "logps/chosen": -1382.980224609375, + "logps/rejected": -1162.074951171875, + "loss": 0.6195, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3829803168773651, + "rewards/margins": 0.22588439285755157, + "rewards/rejected": -0.6088647246360779, + "step": 5410 + }, + { + "epoch": 1.42, + "learning_rate": 1.1810934247750649e-06, + "logits/chosen": -2.6722521781921387, + "logits/rejected": -2.6724560260772705, + "logps/chosen": -1091.403076171875, + "logps/rejected": -973.5833740234375, + "loss": 0.6671, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.44891077280044556, + "rewards/margins": 0.09966419637203217, + "rewards/rejected": -0.5485749244689941, + "step": 5420 + }, + { + "epoch": 1.42, + "learning_rate": 1.1714051933528881e-06, + "logits/chosen": -2.7134673595428467, + "logits/rejected": -2.686100482940674, + "logps/chosen": -1661.9583740234375, + "logps/rejected": -1192.046875, + "loss": 0.6279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36968302726745605, + "rewards/margins": 0.10744525492191315, + "rewards/rejected": -0.477128267288208, + "step": 5430 + }, + { + "epoch": 1.42, + "learning_rate": 1.161744688331192e-06, + "logits/chosen": -2.691920518875122, + "logits/rejected": -2.6858606338500977, + "logps/chosen": -1821.1956787109375, + "logps/rejected": -1636.1937255859375, + "loss": 0.6193, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44910621643066406, + "rewards/margins": 0.11931423842906952, + "rewards/rejected": -0.56842041015625, + "step": 5440 + }, + { + "epoch": 1.43, + "learning_rate": 1.152112111314733e-06, + "logits/chosen": -2.6686320304870605, + "logits/rejected": -2.6621997356414795, + "logps/chosen": -1850.287841796875, + "logps/rejected": -1289.6005859375, + "loss": 0.6289, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37908318638801575, + "rewards/margins": 0.15685948729515076, + "rewards/rejected": -0.5359426736831665, + "step": 5450 + }, + { + "epoch": 1.43, + "learning_rate": 1.142507663325439e-06, + "logits/chosen": -2.682161808013916, + "logits/rejected": -2.6786532402038574, + "logps/chosen": -1570.059814453125, + "logps/rejected": -1518.0584716796875, + "loss": 0.6434, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3491072356700897, + "rewards/margins": 0.21276946365833282, + "rewards/rejected": -0.5618767142295837, + "step": 5460 + }, + { + "epoch": 1.43, + "learning_rate": 1.132931544798211e-06, + "logits/chosen": -2.7180349826812744, + "logits/rejected": -2.691465139389038, + "logps/chosen": -1448.842041015625, + "logps/rejected": -1093.453369140625, + "loss": 0.635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3692038655281067, + "rewards/margins": 0.17854595184326172, + "rewards/rejected": -0.5477498173713684, + "step": 5470 + }, + { + "epoch": 1.43, + "learning_rate": 1.1233839555767482e-06, + "logits/chosen": -2.695664405822754, + "logits/rejected": -2.6881296634674072, + "logps/chosen": -1042.5235595703125, + "logps/rejected": -1147.806884765625, + "loss": 0.6391, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.34199172258377075, + "rewards/margins": 0.10589548200368881, + "rewards/rejected": -0.44788724184036255, + "step": 5480 + }, + { + "epoch": 1.44, + "learning_rate": 1.1138650949093668e-06, + "logits/chosen": -2.66930890083313, + "logits/rejected": -2.6495189666748047, + "logps/chosen": -1223.517333984375, + "logps/rejected": -1291.786376953125, + "loss": 0.6465, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.35710740089416504, + "rewards/margins": 0.2592705488204956, + "rewards/rejected": -0.6163779497146606, + "step": 5490 + }, + { + "epoch": 1.44, + "learning_rate": 1.1043751614448543e-06, + "logits/chosen": -2.736130475997925, + "logits/rejected": -2.7363991737365723, + "logps/chosen": -1665.0625, + "logps/rejected": -1605.57861328125, + "loss": 0.6421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3382241725921631, + "rewards/margins": 0.17074397206306458, + "rewards/rejected": -0.5089680552482605, + "step": 5500 + }, + { + "epoch": 1.44, + "eval_logits/chosen": -2.708101987838745, + "eval_logits/rejected": -2.6991231441497803, + "eval_logps/chosen": -1601.6962890625, + "eval_logps/rejected": -1405.801025390625, + "eval_loss": 0.6449205875396729, + "eval_rewards/accuracies": 0.6408730149269104, + "eval_rewards/chosen": -0.3914722204208374, + "eval_rewards/margins": 0.14989058673381805, + "eval_rewards/rejected": -0.5413628220558167, + "eval_runtime": 222.0008, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 5500 + }, + { + "epoch": 1.44, + "learning_rate": 1.0949143532283107e-06, + "logits/chosen": -2.72534441947937, + "logits/rejected": -2.692267656326294, + "logps/chosen": -1723.418701171875, + "logps/rejected": -1653.270751953125, + "loss": 0.6283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.31096896529197693, + "rewards/margins": 0.22809436917304993, + "rewards/rejected": -0.5390633344650269, + "step": 5510 + }, + { + "epoch": 1.44, + "learning_rate": 1.0854828676970275e-06, + "logits/chosen": -2.719973087310791, + "logits/rejected": -2.6971845626831055, + "logps/chosen": -1367.62109375, + "logps/rejected": -1190.7325439453125, + "loss": 0.6439, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4206356108188629, + "rewards/margins": 0.029691871255636215, + "rewards/rejected": -0.45032748579978943, + "step": 5520 + }, + { + "epoch": 1.45, + "learning_rate": 1.076080901676361e-06, + "logits/chosen": -2.710170269012451, + "logits/rejected": -2.699993848800659, + "logps/chosen": -1609.293212890625, + "logps/rejected": -1397.969970703125, + "loss": 0.6515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4713289141654968, + "rewards/margins": 0.12276891618967056, + "rewards/rejected": -0.5940978527069092, + "step": 5530 + }, + { + "epoch": 1.45, + "learning_rate": 1.0667086513756234e-06, + "logits/chosen": -2.6901488304138184, + "logits/rejected": -2.6973462104797363, + "logps/chosen": -1350.40283203125, + "logps/rejected": -1136.89599609375, + "loss": 0.6265, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3778039813041687, + "rewards/margins": 0.18645079433918, + "rewards/rejected": -0.5642547607421875, + "step": 5540 + }, + { + "epoch": 1.45, + "learning_rate": 1.0573663123839912e-06, + "logits/chosen": -2.696373224258423, + "logits/rejected": -2.69925856590271, + "logps/chosen": -1330.9554443359375, + "logps/rejected": -1100.8270263671875, + "loss": 0.6099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4058258533477783, + "rewards/margins": 0.26610302925109863, + "rewards/rejected": -0.671928882598877, + "step": 5550 + }, + { + "epoch": 1.46, + "learning_rate": 1.0480540796664251e-06, + "logits/chosen": -2.6961722373962402, + "logits/rejected": -2.693441152572632, + "logps/chosen": -1421.529052734375, + "logps/rejected": -1489.1346435546875, + "loss": 0.6441, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4659119248390198, + "rewards/margins": 0.11813143640756607, + "rewards/rejected": -0.5840433239936829, + "step": 5560 + }, + { + "epoch": 1.46, + "learning_rate": 1.0387721475595978e-06, + "logits/chosen": -2.7045772075653076, + "logits/rejected": -2.6836910247802734, + "logps/chosen": -1542.04248046875, + "logps/rejected": -1230.510986328125, + "loss": 0.6117, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41788873076438904, + "rewards/margins": 0.1756511926651001, + "rewards/rejected": -0.5935398936271667, + "step": 5570 + }, + { + "epoch": 1.46, + "learning_rate": 1.0295207097678378e-06, + "logits/chosen": -2.71760892868042, + "logits/rejected": -2.69905424118042, + "logps/chosen": -1756.976318359375, + "logps/rejected": -1334.25146484375, + "loss": 0.6428, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4227485656738281, + "rewards/margins": 0.22662191092967987, + "rewards/rejected": -0.6493704915046692, + "step": 5580 + }, + { + "epoch": 1.46, + "learning_rate": 1.0202999593590924e-06, + "logits/chosen": -2.721705675125122, + "logits/rejected": -2.6900992393493652, + "logps/chosen": -1567.1680908203125, + "logps/rejected": -1261.202880859375, + "loss": 0.628, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42403268814086914, + "rewards/margins": 0.19053298234939575, + "rewards/rejected": -0.6145657300949097, + "step": 5590 + }, + { + "epoch": 1.47, + "learning_rate": 1.011110088760891e-06, + "logits/chosen": -2.667708396911621, + "logits/rejected": -2.6748549938201904, + "logps/chosen": -1374.74658203125, + "logps/rejected": -1232.95849609375, + "loss": 0.658, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.28120842576026917, + "rewards/margins": 0.15552183985710144, + "rewards/rejected": -0.436730295419693, + "step": 5600 + }, + { + "epoch": 1.47, + "eval_logits/chosen": -2.7027201652526855, + "eval_logits/rejected": -2.693756103515625, + "eval_logps/chosen": -1602.7802734375, + "eval_logps/rejected": -1407.1986083984375, + "eval_loss": 0.6451008319854736, + "eval_rewards/accuracies": 0.6428571343421936, + "eval_rewards/chosen": -0.40230992436408997, + "eval_rewards/margins": 0.15302817523479462, + "eval_rewards/rejected": -0.5553380846977234, + "eval_runtime": 221.8522, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 0.284, + "step": 5600 + }, + { + "epoch": 1.47, + "learning_rate": 1.0019512897563347e-06, + "logits/chosen": -2.687178134918213, + "logits/rejected": -2.696901798248291, + "logps/chosen": -1677.1754150390625, + "logps/rejected": -1356.8736572265625, + "loss": 0.6386, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41005244851112366, + "rewards/margins": 0.22635486721992493, + "rewards/rejected": -0.6364073753356934, + "step": 5610 + }, + { + "epoch": 1.47, + "learning_rate": 9.928237534800935e-07, + "logits/chosen": -2.709001064300537, + "logits/rejected": -2.7000620365142822, + "logps/chosen": -1754.002197265625, + "logps/rejected": -1632.352294921875, + "loss": 0.5925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.41502103209495544, + "rewards/margins": 0.1642555445432663, + "rewards/rejected": -0.5792765617370605, + "step": 5620 + }, + { + "epoch": 1.47, + "learning_rate": 9.837276704144174e-07, + "logits/chosen": -2.6920666694641113, + "logits/rejected": -2.690368175506592, + "logps/chosen": -1598.8590087890625, + "logps/rejected": -1336.1282958984375, + "loss": 0.6366, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4579346179962158, + "rewards/margins": 0.12214440107345581, + "rewards/rejected": -0.5800789594650269, + "step": 5630 + }, + { + "epoch": 1.48, + "learning_rate": 9.746632303851569e-07, + "logits/chosen": -2.7054312229156494, + "logits/rejected": -2.6833667755126953, + "logps/chosen": -1388.1898193359375, + "logps/rejected": -1140.6986083984375, + "loss": 0.6353, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4576742649078369, + "rewards/margins": 0.07921469211578369, + "rewards/rejected": -0.5368889570236206, + "step": 5640 + }, + { + "epoch": 1.48, + "learning_rate": 9.65630622557809e-07, + "logits/chosen": -2.681272029876709, + "logits/rejected": -2.6905956268310547, + "logps/chosen": -1354.0238037109375, + "logps/rejected": -1291.4276123046875, + "loss": 0.6635, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.45517176389694214, + "rewards/margins": 0.15478548407554626, + "rewards/rejected": -0.6099572777748108, + "step": 5650 + }, + { + "epoch": 1.48, + "learning_rate": 9.56630035433561e-07, + "logits/chosen": -2.6686363220214844, + "logits/rejected": -2.6861915588378906, + "logps/chosen": -1292.4571533203125, + "logps/rejected": -1379.446044921875, + "loss": 0.6768, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.47962379455566406, + "rewards/margins": 0.084124356508255, + "rewards/rejected": -0.5637482404708862, + "step": 5660 + }, + { + "epoch": 1.48, + "learning_rate": 9.476616568453659e-07, + "logits/chosen": -2.692587375640869, + "logits/rejected": -2.6729211807250977, + "logps/chosen": -1336.821533203125, + "logps/rejected": -1283.6484375, + "loss": 0.6387, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34590721130371094, + "rewards/margins": 0.15082183480262756, + "rewards/rejected": -0.4967290759086609, + "step": 5670 + }, + { + "epoch": 1.49, + "learning_rate": 9.387256739540162e-07, + "logits/chosen": -2.7087090015411377, + "logits/rejected": -2.679457664489746, + "logps/chosen": -1880.095947265625, + "logps/rejected": -1294.837890625, + "loss": 0.649, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.32686570286750793, + "rewards/margins": 0.17968794703483582, + "rewards/rejected": -0.506553590297699, + "step": 5680 + }, + { + "epoch": 1.49, + "learning_rate": 9.298222732442377e-07, + "logits/chosen": -2.685485363006592, + "logits/rejected": -2.6599249839782715, + "logps/chosen": -1625.441650390625, + "logps/rejected": -1470.1597900390625, + "loss": 0.6498, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36153444647789, + "rewards/margins": 0.1952248066663742, + "rewards/rejected": -0.5567591786384583, + "step": 5690 + }, + { + "epoch": 1.49, + "learning_rate": 9.20951640520803e-07, + "logits/chosen": -2.6474082469940186, + "logits/rejected": -2.6487858295440674, + "logps/chosen": -1660.1871337890625, + "logps/rejected": -1259.378662109375, + "loss": 0.6437, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3772795796394348, + "rewards/margins": 0.26258450746536255, + "rewards/rejected": -0.6398640871047974, + "step": 5700 + }, + { + "epoch": 1.49, + "eval_logits/chosen": -2.6972296237945557, + "eval_logits/rejected": -2.6883459091186523, + "eval_logps/chosen": -1603.052734375, + "eval_logps/rejected": -1407.21630859375, + "eval_loss": 0.6453641653060913, + "eval_rewards/accuracies": 0.6388888955116272, + "eval_rewards/chosen": -0.4050370156764984, + "eval_rewards/margins": 0.15047885477542877, + "eval_rewards/rejected": -0.5555158853530884, + "eval_runtime": 221.8964, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.284, + "step": 5700 + }, + { + "epoch": 1.49, + "learning_rate": 9.121139609046484e-07, + "logits/chosen": -2.665215015411377, + "logits/rejected": -2.653277635574341, + "logps/chosen": -1373.9349365234375, + "logps/rejected": -847.8890380859375, + "loss": 0.6369, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4055308401584625, + "rewards/margins": 0.24549469351768494, + "rewards/rejected": -0.6510254740715027, + "step": 5710 + }, + { + "epoch": 1.5, + "learning_rate": 9.033094188290121e-07, + "logits/chosen": -2.7135136127471924, + "logits/rejected": -2.6930954456329346, + "logps/chosen": -1529.5589599609375, + "logps/rejected": -1347.803466796875, + "loss": 0.6189, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42335405945777893, + "rewards/margins": 0.19795912504196167, + "rewards/rejected": -0.621313214302063, + "step": 5720 + }, + { + "epoch": 1.5, + "learning_rate": 8.945381980355889e-07, + "logits/chosen": -2.6918070316314697, + "logits/rejected": -2.694427013397217, + "logps/chosen": -1536.6533203125, + "logps/rejected": -1356.972412109375, + "loss": 0.6446, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35820502042770386, + "rewards/margins": 0.21345534920692444, + "rewards/rejected": -0.5716603994369507, + "step": 5730 + }, + { + "epoch": 1.5, + "learning_rate": 8.858004815706919e-07, + "logits/chosen": -2.695432186126709, + "logits/rejected": -2.684980869293213, + "logps/chosen": -1508.866943359375, + "logps/rejected": -1638.4742431640625, + "loss": 0.637, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40777310729026794, + "rewards/margins": 0.200104758143425, + "rewards/rejected": -0.6078779101371765, + "step": 5740 + }, + { + "epoch": 1.5, + "learning_rate": 8.77096451781432e-07, + "logits/chosen": -2.720970869064331, + "logits/rejected": -2.725588321685791, + "logps/chosen": -1464.5390625, + "logps/rejected": -1301.757568359375, + "loss": 0.653, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3903093636035919, + "rewards/margins": 0.22096291184425354, + "rewards/rejected": -0.6112722158432007, + "step": 5750 + }, + { + "epoch": 1.51, + "learning_rate": 8.684262903119165e-07, + "logits/chosen": -2.6991419792175293, + "logits/rejected": -2.6937973499298096, + "logps/chosen": -1612.3106689453125, + "logps/rejected": -1525.4544677734375, + "loss": 0.6388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38578009605407715, + "rewards/margins": 0.16267183423042297, + "rewards/rejected": -0.5484519600868225, + "step": 5760 + }, + { + "epoch": 1.51, + "learning_rate": 8.597901780994525e-07, + "logits/chosen": -2.7179884910583496, + "logits/rejected": -2.6874876022338867, + "logps/chosen": -1531.790283203125, + "logps/rejected": -1408.367919921875, + "loss": 0.6342, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4343856871128082, + "rewards/margins": 0.11403163522481918, + "rewards/rejected": -0.548417329788208, + "step": 5770 + }, + { + "epoch": 1.51, + "learning_rate": 8.511882953707773e-07, + "logits/chosen": -2.71705961227417, + "logits/rejected": -2.721656560897827, + "logps/chosen": -1742.287841796875, + "logps/rejected": -1498.193115234375, + "loss": 0.6361, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.40885210037231445, + "rewards/margins": 0.07669506967067719, + "rewards/rejected": -0.48554715514183044, + "step": 5780 + }, + { + "epoch": 1.52, + "learning_rate": 8.426208216382944e-07, + "logits/chosen": -2.681305170059204, + "logits/rejected": -2.688035726547241, + "logps/chosen": -1341.0355224609375, + "logps/rejected": -1515.4603271484375, + "loss": 0.6474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39439091086387634, + "rewards/margins": 0.21398195624351501, + "rewards/rejected": -0.6083729267120361, + "step": 5790 + }, + { + "epoch": 1.52, + "learning_rate": 8.340879356963245e-07, + "logits/chosen": -2.6872317790985107, + "logits/rejected": -2.678461790084839, + "logps/chosen": -1467.8450927734375, + "logps/rejected": -1348.451904296875, + "loss": 0.6289, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3274436891078949, + "rewards/margins": 0.23764677345752716, + "rewards/rejected": -0.565090537071228, + "step": 5800 + }, + { + "epoch": 1.52, + "eval_logits/chosen": -2.7093505859375, + "eval_logits/rejected": -2.7007176876068115, + "eval_logps/chosen": -1602.4105224609375, + "eval_logps/rejected": -1406.861083984375, + "eval_loss": 0.6442674398422241, + "eval_rewards/accuracies": 0.64682537317276, + "eval_rewards/chosen": -0.3986143171787262, + "eval_rewards/margins": 0.15335094928741455, + "eval_rewards/rejected": -0.5519652366638184, + "eval_runtime": 221.9954, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 5800 + }, + { + "epoch": 1.52, + "learning_rate": 8.255898156173777e-07, + "logits/chosen": -2.721546173095703, + "logits/rejected": -2.7181859016418457, + "logps/chosen": -1618.231201171875, + "logps/rejected": -1498.933837890625, + "loss": 0.6084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32977229356765747, + "rewards/margins": 0.3294047713279724, + "rewards/rejected": -0.6591770648956299, + "step": 5810 + }, + { + "epoch": 1.52, + "learning_rate": 8.171266387484389e-07, + "logits/chosen": -2.716600179672241, + "logits/rejected": -2.7120096683502197, + "logps/chosen": -1584.3115234375, + "logps/rejected": -1282.3509521484375, + "loss": 0.6282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.40199166536331177, + "rewards/margins": 0.1491709053516388, + "rewards/rejected": -0.5511625409126282, + "step": 5820 + }, + { + "epoch": 1.53, + "learning_rate": 8.086985817072604e-07, + "logits/chosen": -2.7204251289367676, + "logits/rejected": -2.71130108833313, + "logps/chosen": -1321.2801513671875, + "logps/rejected": -1095.7039794921875, + "loss": 0.6577, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.47304391860961914, + "rewards/margins": 0.024396944791078568, + "rewards/rejected": -0.4974408745765686, + "step": 5830 + }, + { + "epoch": 1.53, + "learning_rate": 8.003058203786835e-07, + "logits/chosen": -2.6963999271392822, + "logits/rejected": -2.680232048034668, + "logps/chosen": -1220.0439453125, + "logps/rejected": -1195.7838134765625, + "loss": 0.6135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43111515045166016, + "rewards/margins": 0.16377350687980652, + "rewards/rejected": -0.5948886871337891, + "step": 5840 + }, + { + "epoch": 1.53, + "learning_rate": 7.91948529910963e-07, + "logits/chosen": -2.681727409362793, + "logits/rejected": -2.673245906829834, + "logps/chosen": -1630.849853515625, + "logps/rejected": -1601.662353515625, + "loss": 0.6352, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.36390581727027893, + "rewards/margins": 0.21514716744422913, + "rewards/rejected": -0.5790529847145081, + "step": 5850 + }, + { + "epoch": 1.53, + "learning_rate": 7.836268847121126e-07, + "logits/chosen": -2.7189762592315674, + "logits/rejected": -2.7372474670410156, + "logps/chosen": -1822.910888671875, + "logps/rejected": -1758.435546875, + "loss": 0.651, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4066740572452545, + "rewards/margins": 0.16316178441047668, + "rewards/rejected": -0.569835901260376, + "step": 5860 + }, + { + "epoch": 1.54, + "learning_rate": 7.753410584462681e-07, + "logits/chosen": -2.733602285385132, + "logits/rejected": -2.725095272064209, + "logps/chosen": -1537.243896484375, + "logps/rejected": -1504.42431640625, + "loss": 0.6489, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3789084851741791, + "rewards/margins": 0.12120094150304794, + "rewards/rejected": -0.5001094937324524, + "step": 5870 + }, + { + "epoch": 1.54, + "learning_rate": 7.670912240300596e-07, + "logits/chosen": -2.6847469806671143, + "logits/rejected": -2.678020715713501, + "logps/chosen": -1495.7813720703125, + "logps/rejected": -1345.4945068359375, + "loss": 0.6543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3611437976360321, + "rewards/margins": 0.15165142714977264, + "rewards/rejected": -0.5127952694892883, + "step": 5880 + }, + { + "epoch": 1.54, + "learning_rate": 7.588775536290035e-07, + "logits/chosen": -2.6968045234680176, + "logits/rejected": -2.6823792457580566, + "logps/chosen": -1608.9373779296875, + "logps/rejected": -1043.4354248046875, + "loss": 0.63, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34363892674446106, + "rewards/margins": 0.16888666152954102, + "rewards/rejected": -0.5125256776809692, + "step": 5890 + }, + { + "epoch": 1.54, + "learning_rate": 7.507002186539147e-07, + "logits/chosen": -2.6805570125579834, + "logits/rejected": -2.675029993057251, + "logps/chosen": -1435.6429443359375, + "logps/rejected": -1435.0908203125, + "loss": 0.6361, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39160555601119995, + "rewards/margins": 0.1981893628835678, + "rewards/rejected": -0.589794933795929, + "step": 5900 + }, + { + "epoch": 1.54, + "eval_logits/chosen": -2.7046704292297363, + "eval_logits/rejected": -2.696174383163452, + "eval_logps/chosen": -1602.9124755859375, + "eval_logps/rejected": -1407.40869140625, + "eval_loss": 0.6442207098007202, + "eval_rewards/accuracies": 0.6408730149269104, + "eval_rewards/chosen": -0.4036337435245514, + "eval_rewards/margins": 0.1538066416978836, + "eval_rewards/rejected": -0.5574404001235962, + "eval_runtime": 221.9282, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 5900 + }, + { + "epoch": 1.55, + "learning_rate": 7.425593897573216e-07, + "logits/chosen": -2.693079948425293, + "logits/rejected": -2.6898703575134277, + "logps/chosen": -1721.556396484375, + "logps/rejected": -1643.6937255859375, + "loss": 0.632, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3537727892398834, + "rewards/margins": 0.3143417239189148, + "rewards/rejected": -0.6681144833564758, + "step": 5910 + }, + { + "epoch": 1.55, + "learning_rate": 7.344552368299088e-07, + "logits/chosen": -2.6862666606903076, + "logits/rejected": -2.694638967514038, + "logps/chosen": -1307.9896240234375, + "logps/rejected": -1450.29443359375, + "loss": 0.6364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4261978268623352, + "rewards/margins": 0.1255144625902176, + "rewards/rejected": -0.5517122745513916, + "step": 5920 + }, + { + "epoch": 1.55, + "learning_rate": 7.26387928996973e-07, + "logits/chosen": -2.692228317260742, + "logits/rejected": -2.698779344558716, + "logps/chosen": -1149.926025390625, + "logps/rejected": -1233.862548828125, + "loss": 0.6409, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.46561938524246216, + "rewards/margins": 0.2557544708251953, + "rewards/rejected": -0.7213739156723022, + "step": 5930 + }, + { + "epoch": 1.55, + "learning_rate": 7.183576346148899e-07, + "logits/chosen": -2.6880781650543213, + "logits/rejected": -2.6710593700408936, + "logps/chosen": -1776.842529296875, + "logps/rejected": -1320.5303955078125, + "loss": 0.639, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.30308040976524353, + "rewards/margins": 0.35630732774734497, + "rewards/rejected": -0.6593877673149109, + "step": 5940 + }, + { + "epoch": 1.56, + "learning_rate": 7.103645212676044e-07, + "logits/chosen": -2.6921093463897705, + "logits/rejected": -2.6773791313171387, + "logps/chosen": -1392.0758056640625, + "logps/rejected": -1509.171875, + "loss": 0.6312, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4809054434299469, + "rewards/margins": 0.11313720047473907, + "rewards/rejected": -0.5940426588058472, + "step": 5950 + }, + { + "epoch": 1.56, + "learning_rate": 7.024087557631318e-07, + "logits/chosen": -2.702073097229004, + "logits/rejected": -2.7095000743865967, + "logps/chosen": -1254.189697265625, + "logps/rejected": -1364.05615234375, + "loss": 0.6292, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44157737493515015, + "rewards/margins": 0.08752346783876419, + "rewards/rejected": -0.5291008353233337, + "step": 5960 + }, + { + "epoch": 1.56, + "learning_rate": 6.944905041300739e-07, + "logits/chosen": -2.6660048961639404, + "logits/rejected": -2.6542418003082275, + "logps/chosen": -1623.0982666015625, + "logps/rejected": -1455.97802734375, + "loss": 0.603, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.40555793046951294, + "rewards/margins": 0.33708950877189636, + "rewards/rejected": -0.7426473498344421, + "step": 5970 + }, + { + "epoch": 1.57, + "learning_rate": 6.866099316141606e-07, + "logits/chosen": -2.702817678451538, + "logits/rejected": -2.716486930847168, + "logps/chosen": -1421.0152587890625, + "logps/rejected": -1557.4527587890625, + "loss": 0.6305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42785245180130005, + "rewards/margins": 0.20500314235687256, + "rewards/rejected": -0.6328555941581726, + "step": 5980 + }, + { + "epoch": 1.57, + "learning_rate": 6.787672026747946e-07, + "logits/chosen": -2.698267698287964, + "logits/rejected": -2.677493095397949, + "logps/chosen": -1327.359375, + "logps/rejected": -1614.907470703125, + "loss": 0.6448, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5610150098800659, + "rewards/margins": 0.19009160995483398, + "rewards/rejected": -0.7511066198348999, + "step": 5990 + }, + { + "epoch": 1.57, + "learning_rate": 6.709624809816223e-07, + "logits/chosen": -2.701934337615967, + "logits/rejected": -2.692894458770752, + "logps/chosen": -1514.8740234375, + "logps/rejected": -1444.414794921875, + "loss": 0.6374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35164642333984375, + "rewards/margins": 0.16492925584316254, + "rewards/rejected": -0.5165756940841675, + "step": 6000 + }, + { + "epoch": 1.57, + "eval_logits/chosen": -2.7047617435455322, + "eval_logits/rejected": -2.6962783336639404, + "eval_logps/chosen": -1604.185302734375, + "eval_logps/rejected": -1408.8310546875, + "eval_loss": 0.644648551940918, + "eval_rewards/accuracies": 0.6428571343421936, + "eval_rewards/chosen": -0.4163608253002167, + "eval_rewards/margins": 0.15530355274677277, + "eval_rewards/rejected": -0.5716643929481506, + "eval_runtime": 221.9315, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.284, + "step": 6000 + }, + { + "epoch": 1.57, + "learning_rate": 6.6319592941112e-07, + "logits/chosen": -2.7169597148895264, + "logits/rejected": -2.6866252422332764, + "logps/chosen": -1711.214111328125, + "logps/rejected": -1542.6513671875, + "loss": 0.656, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2974657416343689, + "rewards/margins": 0.2659049928188324, + "rewards/rejected": -0.5633708238601685, + "step": 6010 + }, + { + "epoch": 1.58, + "learning_rate": 6.554677100431927e-07, + "logits/chosen": -2.733557939529419, + "logits/rejected": -2.715567111968994, + "logps/chosen": -1610.2353515625, + "logps/rejected": -1287.1390380859375, + "loss": 0.6268, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3757496178150177, + "rewards/margins": 0.29556283354759216, + "rewards/rejected": -0.6713123917579651, + "step": 6020 + }, + { + "epoch": 1.58, + "learning_rate": 6.4777798415779e-07, + "logits/chosen": -2.7137365341186523, + "logits/rejected": -2.7267062664031982, + "logps/chosen": -1452.125, + "logps/rejected": -1292.182373046875, + "loss": 0.5904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3056414723396301, + "rewards/margins": 0.2070426493883133, + "rewards/rejected": -0.512684166431427, + "step": 6030 + }, + { + "epoch": 1.58, + "learning_rate": 6.401269122315451e-07, + "logits/chosen": -2.7094180583953857, + "logits/rejected": -2.7037181854248047, + "logps/chosen": -1874.9847412109375, + "logps/rejected": -1374.64697265625, + "loss": 0.6367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3752003312110901, + "rewards/margins": 0.28431040048599243, + "rewards/rejected": -0.6595107316970825, + "step": 6040 + }, + { + "epoch": 1.58, + "learning_rate": 6.325146539344196e-07, + "logits/chosen": -2.6920981407165527, + "logits/rejected": -2.6991913318634033, + "logps/chosen": -1592.9559326171875, + "logps/rejected": -1228.34375, + "loss": 0.6741, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.42658185958862305, + "rewards/margins": 0.15396739542484283, + "rewards/rejected": -0.5805492997169495, + "step": 6050 + }, + { + "epoch": 1.59, + "learning_rate": 6.249413681263782e-07, + "logits/chosen": -2.6854660511016846, + "logits/rejected": -2.689037561416626, + "logps/chosen": -1591.2857666015625, + "logps/rejected": -1392.5634765625, + "loss": 0.6136, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3609497547149658, + "rewards/margins": 0.27985721826553345, + "rewards/rejected": -0.6408069133758545, + "step": 6060 + }, + { + "epoch": 1.59, + "learning_rate": 6.174072128540686e-07, + "logits/chosen": -2.7083308696746826, + "logits/rejected": -2.6896657943725586, + "logps/chosen": -1535.8818359375, + "logps/rejected": -1262.798095703125, + "loss": 0.6266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35114628076553345, + "rewards/margins": 0.3368942141532898, + "rewards/rejected": -0.6880404949188232, + "step": 6070 + }, + { + "epoch": 1.59, + "learning_rate": 6.099123453475245e-07, + "logits/chosen": -2.713439464569092, + "logits/rejected": -2.681724786758423, + "logps/chosen": -1554.7132568359375, + "logps/rejected": -1333.768798828125, + "loss": 0.6509, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4823225438594818, + "rewards/margins": 0.2098710536956787, + "rewards/rejected": -0.6921936273574829, + "step": 6080 + }, + { + "epoch": 1.59, + "learning_rate": 6.024569220168836e-07, + "logits/chosen": -2.6846044063568115, + "logits/rejected": -2.6816964149475098, + "logps/chosen": -1673.781494140625, + "logps/rejected": -1116.6241455078125, + "loss": 0.6224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39769247174263, + "rewards/margins": 0.258812814950943, + "rewards/rejected": -0.6565052270889282, + "step": 6090 + }, + { + "epoch": 1.6, + "learning_rate": 5.950410984491268e-07, + "logits/chosen": -2.684141159057617, + "logits/rejected": -2.654639720916748, + "logps/chosen": -1464.31103515625, + "logps/rejected": -1539.7857666015625, + "loss": 0.6423, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4491928517818451, + "rewards/margins": 0.1657174527645111, + "rewards/rejected": -0.6149102449417114, + "step": 6100 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.6991634368896484, + "eval_logits/rejected": -2.6905128955841064, + "eval_logps/chosen": -1604.669189453125, + "eval_logps/rejected": -1409.4735107421875, + "eval_loss": 0.6447591781616211, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.42120110988616943, + "eval_rewards/margins": 0.15688644349575043, + "eval_rewards/rejected": -0.5780875086784363, + "eval_runtime": 221.9398, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 6100 + }, + { + "epoch": 1.6, + "learning_rate": 5.876650294048262e-07, + "logits/chosen": -2.6803715229034424, + "logits/rejected": -2.6851279735565186, + "logps/chosen": -1618.722900390625, + "logps/rejected": -1358.609619140625, + "loss": 0.6144, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3551791310310364, + "rewards/margins": 0.24444207549095154, + "rewards/rejected": -0.5996211767196655, + "step": 6110 + }, + { + "epoch": 1.6, + "learning_rate": 5.8032886881492e-07, + "logits/chosen": -2.7037949562072754, + "logits/rejected": -2.673633575439453, + "logps/chosen": -1692.9459228515625, + "logps/rejected": -1603.280517578125, + "loss": 0.6317, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4877226948738098, + "rewards/margins": 0.14314484596252441, + "rewards/rejected": -0.6308675408363342, + "step": 6120 + }, + { + "epoch": 1.6, + "learning_rate": 5.730327697774988e-07, + "logits/chosen": -2.7081751823425293, + "logits/rejected": -2.6860859394073486, + "logps/chosen": -1419.7940673828125, + "logps/rejected": -1213.22412109375, + "loss": 0.6271, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3978734016418457, + "rewards/margins": 0.1460903137922287, + "rewards/rejected": -0.5439636707305908, + "step": 6130 + }, + { + "epoch": 1.61, + "learning_rate": 5.657768845546068e-07, + "logits/chosen": -2.7000532150268555, + "logits/rejected": -2.697673797607422, + "logps/chosen": -1345.072265625, + "logps/rejected": -1430.123046875, + "loss": 0.6313, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4374998211860657, + "rewards/margins": 0.19011881947517395, + "rewards/rejected": -0.627618670463562, + "step": 6140 + }, + { + "epoch": 1.61, + "learning_rate": 5.585613645690713e-07, + "logits/chosen": -2.675696849822998, + "logits/rejected": -2.6756703853607178, + "logps/chosen": -1513.296630859375, + "logps/rejected": -1221.71435546875, + "loss": 0.6418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5287090539932251, + "rewards/margins": 0.09966927766799927, + "rewards/rejected": -0.6283783912658691, + "step": 6150 + }, + { + "epoch": 1.61, + "learning_rate": 5.513863604013355e-07, + "logits/chosen": -2.7069761753082275, + "logits/rejected": -2.719494581222534, + "logps/chosen": -1532.029052734375, + "logps/rejected": -1490.412353515625, + "loss": 0.6218, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4088051915168762, + "rewards/margins": 0.22951212525367737, + "rewards/rejected": -0.6383172869682312, + "step": 6160 + }, + { + "epoch": 1.61, + "learning_rate": 5.442520217863215e-07, + "logits/chosen": -2.7155890464782715, + "logits/rejected": -2.706444263458252, + "logps/chosen": -1841.702392578125, + "logps/rejected": -1519.7572021484375, + "loss": 0.6097, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2945634722709656, + "rewards/margins": 0.31701231002807617, + "rewards/rejected": -0.6115757822990417, + "step": 6170 + }, + { + "epoch": 1.62, + "learning_rate": 5.371584976103034e-07, + "logits/chosen": -2.6755757331848145, + "logits/rejected": -2.682091236114502, + "logps/chosen": -1305.5174560546875, + "logps/rejected": -1578.818603515625, + "loss": 0.6232, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4950660169124603, + "rewards/margins": 0.0794348269701004, + "rewards/rejected": -0.5745008587837219, + "step": 6180 + }, + { + "epoch": 1.62, + "learning_rate": 5.301059359077987e-07, + "logits/chosen": -2.680753231048584, + "logits/rejected": -2.6786160469055176, + "logps/chosen": -1549.20654296875, + "logps/rejected": -1367.5374755859375, + "loss": 0.6435, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44760221242904663, + "rewards/margins": 0.1415398269891739, + "rewards/rejected": -0.5891419649124146, + "step": 6190 + }, + { + "epoch": 1.62, + "learning_rate": 5.230944838584806e-07, + "logits/chosen": -2.7030324935913086, + "logits/rejected": -2.6957592964172363, + "logps/chosen": -1537.458740234375, + "logps/rejected": -1226.929443359375, + "loss": 0.6611, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5085504651069641, + "rewards/margins": 0.0372183658182621, + "rewards/rejected": -0.5457688570022583, + "step": 6200 + }, + { + "epoch": 1.62, + "eval_logits/chosen": -2.7009968757629395, + "eval_logits/rejected": -2.6925363540649414, + "eval_logps/chosen": -1605.986572265625, + "eval_logps/rejected": -1410.8238525390625, + "eval_loss": 0.6452645063400269, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.4343767464160919, + "eval_rewards/margins": 0.1572161614894867, + "eval_rewards/rejected": -0.5915929675102234, + "eval_runtime": 221.9554, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 6200 + }, + { + "epoch": 1.63, + "learning_rate": 5.161242877841083e-07, + "logits/chosen": -2.710780620574951, + "logits/rejected": -2.714012622833252, + "logps/chosen": -1271.187255859375, + "logps/rejected": -1383.121337890625, + "loss": 0.6161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4201042652130127, + "rewards/margins": 0.12770195305347443, + "rewards/rejected": -0.5478062629699707, + "step": 6210 + }, + { + "epoch": 1.63, + "learning_rate": 5.091954931454682e-07, + "logits/chosen": -2.6867432594299316, + "logits/rejected": -2.6809990406036377, + "logps/chosen": -1415.892333984375, + "logps/rejected": -1244.710693359375, + "loss": 0.6515, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4569178521633148, + "rewards/margins": 0.14460769295692444, + "rewards/rejected": -0.601525604724884, + "step": 6220 + }, + { + "epoch": 1.63, + "learning_rate": 5.023082445393446e-07, + "logits/chosen": -2.648237705230713, + "logits/rejected": -2.663208246231079, + "logps/chosen": -1238.7562255859375, + "logps/rejected": -1183.0709228515625, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.41196736693382263, + "rewards/margins": 0.1221848726272583, + "rewards/rejected": -0.5341522097587585, + "step": 6230 + }, + { + "epoch": 1.63, + "learning_rate": 4.95462685695498e-07, + "logits/chosen": -2.716076135635376, + "logits/rejected": -2.702650547027588, + "logps/chosen": -1689.1253662109375, + "logps/rejected": -1441.07373046875, + "loss": 0.6646, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4031899869441986, + "rewards/margins": 0.01048656739294529, + "rewards/rejected": -0.41367655992507935, + "step": 6240 + }, + { + "epoch": 1.64, + "learning_rate": 4.88658959473666e-07, + "logits/chosen": -2.6755599975585938, + "logits/rejected": -2.678421974182129, + "logps/chosen": -1729.2132568359375, + "logps/rejected": -1477.9561767578125, + "loss": 0.6439, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4346516728401184, + "rewards/margins": 0.22722020745277405, + "rewards/rejected": -0.6618717908859253, + "step": 6250 + }, + { + "epoch": 1.64, + "learning_rate": 4.818972078605821e-07, + "logits/chosen": -2.67181134223938, + "logits/rejected": -2.669654130935669, + "logps/chosen": -1462.231689453125, + "logps/rejected": -1391.5391845703125, + "loss": 0.625, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4106716215610504, + "rewards/margins": 0.10334018617868423, + "rewards/rejected": -0.514011800289154, + "step": 6260 + }, + { + "epoch": 1.64, + "learning_rate": 4.7517757196701514e-07, + "logits/chosen": -2.7111282348632812, + "logits/rejected": -2.699073314666748, + "logps/chosen": -2008.9390869140625, + "logps/rejected": -1729.846923828125, + "loss": 0.6381, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.38991931080818176, + "rewards/margins": 0.37259799242019653, + "rewards/rejected": -0.7625172734260559, + "step": 6270 + }, + { + "epoch": 1.64, + "learning_rate": 4.6850019202482193e-07, + "logits/chosen": -2.7020959854125977, + "logits/rejected": -2.7092764377593994, + "logps/chosen": -1485.4185791015625, + "logps/rejected": -1375.3634033203125, + "loss": 0.6131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49320369958877563, + "rewards/margins": 0.2785857915878296, + "rewards/rejected": -0.7717894315719604, + "step": 6280 + }, + { + "epoch": 1.65, + "learning_rate": 4.618652073840188e-07, + "logits/chosen": -2.690502405166626, + "logits/rejected": -2.7097690105438232, + "logps/chosen": -1226.995361328125, + "logps/rejected": -1234.814697265625, + "loss": 0.6014, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3699643611907959, + "rewards/margins": 0.2853412628173828, + "rewards/rejected": -0.6553056240081787, + "step": 6290 + }, + { + "epoch": 1.65, + "learning_rate": 4.5527275650987965e-07, + "logits/chosen": -2.6978249549865723, + "logits/rejected": -2.7032647132873535, + "logps/chosen": -1490.8839111328125, + "logps/rejected": -1340.5758056640625, + "loss": 0.6355, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4975271224975586, + "rewards/margins": 0.1745329648256302, + "rewards/rejected": -0.6720601320266724, + "step": 6300 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.7008187770843506, + "eval_logits/rejected": -2.6922249794006348, + "eval_logps/chosen": -1605.803466796875, + "eval_logps/rejected": -1410.7569580078125, + "eval_loss": 0.6450992226600647, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.43254372477531433, + "eval_rewards/margins": 0.15837757289409637, + "eval_rewards/rejected": -0.5909213423728943, + "eval_runtime": 221.9031, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.284, + "step": 6300 + }, + { + "epoch": 1.65, + "learning_rate": 4.487229769800394e-07, + "logits/chosen": -2.652451515197754, + "logits/rejected": -2.6576945781707764, + "logps/chosen": -1265.826171875, + "logps/rejected": -1329.8004150390625, + "loss": 0.6298, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4063987135887146, + "rewards/margins": 0.31934913992881775, + "rewards/rejected": -0.72574782371521, + "step": 6310 + }, + { + "epoch": 1.65, + "learning_rate": 4.422160054816285e-07, + "logits/chosen": -2.680227756500244, + "logits/rejected": -2.671940803527832, + "logps/chosen": -1844.2279052734375, + "logps/rejected": -1543.527587890625, + "loss": 0.6514, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.42796629667282104, + "rewards/margins": 0.08723724633455276, + "rewards/rejected": -0.515203595161438, + "step": 6320 + }, + { + "epoch": 1.66, + "learning_rate": 4.35751977808416e-07, + "logits/chosen": -2.69242262840271, + "logits/rejected": -2.6834845542907715, + "logps/chosen": -1286.2861328125, + "logps/rejected": -1022.7605590820312, + "loss": 0.6204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41723084449768066, + "rewards/margins": 0.20962531864643097, + "rewards/rejected": -0.6268561482429504, + "step": 6330 + }, + { + "epoch": 1.66, + "learning_rate": 4.293310288579794e-07, + "logits/chosen": -2.7137248516082764, + "logits/rejected": -2.697584390640259, + "logps/chosen": -1610.983154296875, + "logps/rejected": -1428.408935546875, + "loss": 0.6083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3962358832359314, + "rewards/margins": 0.24171388149261475, + "rewards/rejected": -0.6379498243331909, + "step": 6340 + }, + { + "epoch": 1.66, + "learning_rate": 4.2295329262888733e-07, + "logits/chosen": -2.681994676589966, + "logits/rejected": -2.6608827114105225, + "logps/chosen": -1370.623779296875, + "logps/rejected": -1160.845703125, + "loss": 0.6433, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.448493093252182, + "rewards/margins": 0.1068970188498497, + "rewards/rejected": -0.5553901195526123, + "step": 6350 + }, + { + "epoch": 1.66, + "learning_rate": 4.1661890221790316e-07, + "logits/chosen": -2.685898780822754, + "logits/rejected": -2.6794886589050293, + "logps/chosen": -1368.075439453125, + "logps/rejected": -1313.705078125, + "loss": 0.6254, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4664105474948883, + "rewards/margins": 0.07501848042011261, + "rewards/rejected": -0.5414290428161621, + "step": 6360 + }, + { + "epoch": 1.67, + "learning_rate": 4.103279898172072e-07, + "logits/chosen": -2.738821268081665, + "logits/rejected": -2.7228472232818604, + "logps/chosen": -1854.404052734375, + "logps/rejected": -1583.7427978515625, + "loss": 0.6266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25630882382392883, + "rewards/margins": 0.2646670341491699, + "rewards/rejected": -0.5209758281707764, + "step": 6370 + }, + { + "epoch": 1.67, + "learning_rate": 4.040806867116401e-07, + "logits/chosen": -2.72967791557312, + "logits/rejected": -2.698141098022461, + "logps/chosen": -1598.608154296875, + "logps/rejected": -1285.6024169921875, + "loss": 0.6149, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44886884093284607, + "rewards/margins": 0.24276570975780487, + "rewards/rejected": -0.6916345953941345, + "step": 6380 + }, + { + "epoch": 1.67, + "learning_rate": 3.978771232759615e-07, + "logits/chosen": -2.6666808128356934, + "logits/rejected": -2.676840305328369, + "logps/chosen": -1426.3568115234375, + "logps/rejected": -1619.660888671875, + "loss": 0.6252, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4658185839653015, + "rewards/margins": 0.22935962677001953, + "rewards/rejected": -0.6951782703399658, + "step": 6390 + }, + { + "epoch": 1.67, + "learning_rate": 3.917174289721276e-07, + "logits/chosen": -2.674344301223755, + "logits/rejected": -2.6882286071777344, + "logps/chosen": -1507.402587890625, + "logps/rejected": -1146.4400634765625, + "loss": 0.6555, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5148229598999023, + "rewards/margins": 0.18879520893096924, + "rewards/rejected": -0.7036181688308716, + "step": 6400 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -2.7020721435546875, + "eval_logits/rejected": -2.6935031414031982, + "eval_logps/chosen": -1605.8125, + "eval_logps/rejected": -1410.7894287109375, + "eval_loss": 0.6451132297515869, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -0.43263548612594604, + "eval_rewards/margins": 0.15861284732818604, + "eval_rewards/rejected": -0.5912482142448425, + "eval_runtime": 222.0105, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 6400 + }, + { + "epoch": 1.68, + "learning_rate": 3.856017323465938e-07, + "logits/chosen": -2.706326723098755, + "logits/rejected": -2.713017225265503, + "logps/chosen": -1699.1292724609375, + "logps/rejected": -1701.164794921875, + "loss": 0.6221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49689310789108276, + "rewards/margins": 0.24944552779197693, + "rewards/rejected": -0.7463387250900269, + "step": 6410 + }, + { + "epoch": 1.68, + "learning_rate": 3.7953016102762695e-07, + "logits/chosen": -2.691145420074463, + "logits/rejected": -2.677729845046997, + "logps/chosen": -1404.14501953125, + "logps/rejected": -1107.845458984375, + "loss": 0.6219, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4841841757297516, + "rewards/margins": 0.13497625291347504, + "rewards/rejected": -0.6191604137420654, + "step": 6420 + }, + { + "epoch": 1.68, + "learning_rate": 3.7350284172264493e-07, + "logits/chosen": -2.677605390548706, + "logits/rejected": -2.670762538909912, + "logps/chosen": -1495.4324951171875, + "logps/rejected": -1399.22021484375, + "loss": 0.6222, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4789021909236908, + "rewards/margins": 0.1323304921388626, + "rewards/rejected": -0.6112326979637146, + "step": 6430 + }, + { + "epoch": 1.69, + "learning_rate": 3.67519900215573e-07, + "logits/chosen": -2.718522310256958, + "logits/rejected": -2.7079195976257324, + "logps/chosen": -1269.6953125, + "logps/rejected": -1307.8406982421875, + "loss": 0.6652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43117189407348633, + "rewards/margins": 0.12488353252410889, + "rewards/rejected": -0.5560554265975952, + "step": 6440 + }, + { + "epoch": 1.69, + "learning_rate": 3.615814613642174e-07, + "logits/chosen": -2.695288896560669, + "logits/rejected": -2.6770596504211426, + "logps/chosen": -1622.7061767578125, + "logps/rejected": -1300.5322265625, + "loss": 0.613, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.35286465287208557, + "rewards/margins": 0.2980819344520569, + "rewards/rejected": -0.6509465575218201, + "step": 6450 + }, + { + "epoch": 1.69, + "learning_rate": 3.5568764909765795e-07, + "logits/chosen": -2.6688880920410156, + "logits/rejected": -2.663649797439575, + "logps/chosen": -1231.46923828125, + "logps/rejected": -1042.579833984375, + "loss": 0.6217, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4582611620426178, + "rewards/margins": 0.24432484805583954, + "rewards/rejected": -0.7025860548019409, + "step": 6460 + }, + { + "epoch": 1.69, + "learning_rate": 3.498385864136672e-07, + "logits/chosen": -2.68622088432312, + "logits/rejected": -2.699253559112549, + "logps/chosen": -1828.1165771484375, + "logps/rejected": -1496.741943359375, + "loss": 0.6386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43254002928733826, + "rewards/margins": 0.17335142195224762, + "rewards/rejected": -0.6058914065361023, + "step": 6470 + }, + { + "epoch": 1.7, + "learning_rate": 3.440343953761363e-07, + "logits/chosen": -2.7274928092956543, + "logits/rejected": -2.7285075187683105, + "logps/chosen": -1574.9361572265625, + "logps/rejected": -1374.0662841796875, + "loss": 0.6368, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5280033349990845, + "rewards/margins": 0.18731360137462616, + "rewards/rejected": -0.715316891670227, + "step": 6480 + }, + { + "epoch": 1.7, + "learning_rate": 3.382751971125345e-07, + "logits/chosen": -2.6805925369262695, + "logits/rejected": -2.6991703510284424, + "logps/chosen": -1428.466552734375, + "logps/rejected": -1461.8367919921875, + "loss": 0.6573, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.44011393189430237, + "rewards/margins": 0.03761814907193184, + "rewards/rejected": -0.47773200273513794, + "step": 6490 + }, + { + "epoch": 1.7, + "learning_rate": 3.3256111181137753e-07, + "logits/chosen": -2.701589584350586, + "logits/rejected": -2.7013587951660156, + "logps/chosen": -1421.63134765625, + "logps/rejected": -1250.77880859375, + "loss": 0.6584, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4727795720100403, + "rewards/margins": 0.12564000487327576, + "rewards/rejected": -0.5984196066856384, + "step": 6500 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -2.6986544132232666, + "eval_logits/rejected": -2.6900033950805664, + "eval_logps/chosen": -1605.6461181640625, + "eval_logps/rejected": -1410.715087890625, + "eval_loss": 0.6449440121650696, + "eval_rewards/accuracies": 0.6269841194152832, + "eval_rewards/chosen": -0.43097057938575745, + "eval_rewards/margins": 0.15953212976455688, + "eval_rewards/rejected": -0.5905026793479919, + "eval_runtime": 221.9035, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.284, + "step": 6500 + }, + { + "epoch": 1.7, + "learning_rate": 3.2689225871971905e-07, + "logits/chosen": -2.681798219680786, + "logits/rejected": -2.6716222763061523, + "logps/chosen": -1813.3060302734375, + "logps/rejected": -1428.790283203125, + "loss": 0.6145, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3622683584690094, + "rewards/margins": 0.19917793571949005, + "rewards/rejected": -0.5614463090896606, + "step": 6510 + }, + { + "epoch": 1.71, + "learning_rate": 3.2126875614066523e-07, + "logits/chosen": -2.688297986984253, + "logits/rejected": -2.6739754676818848, + "logps/chosen": -1608.40673828125, + "logps/rejected": -1535.1259765625, + "loss": 0.6206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42029422521591187, + "rewards/margins": 0.12229009717702866, + "rewards/rejected": -0.5425843000411987, + "step": 6520 + }, + { + "epoch": 1.71, + "learning_rate": 3.156907214309024e-07, + "logits/chosen": -2.684985637664795, + "logits/rejected": -2.6793723106384277, + "logps/chosen": -1526.4219970703125, + "logps/rejected": -1399.494140625, + "loss": 0.6199, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3454342484474182, + "rewards/margins": 0.2345781773328781, + "rewards/rejected": -0.5800124406814575, + "step": 6530 + }, + { + "epoch": 1.71, + "learning_rate": 3.1015827099824923e-07, + "logits/chosen": -2.6967806816101074, + "logits/rejected": -2.685199737548828, + "logps/chosen": -1596.5302734375, + "logps/rejected": -1385.5205078125, + "loss": 0.6364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3673211634159088, + "rewards/margins": 0.29226335883140564, + "rewards/rejected": -0.6595844626426697, + "step": 6540 + }, + { + "epoch": 1.71, + "learning_rate": 3.0467152029922926e-07, + "logits/chosen": -2.720731258392334, + "logits/rejected": -2.7024600505828857, + "logps/chosen": -1980.4075927734375, + "logps/rejected": -1463.03515625, + "loss": 0.6366, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37323513627052307, + "rewards/margins": 0.40360528230667114, + "rewards/rejected": -0.7768403887748718, + "step": 6550 + }, + { + "epoch": 1.72, + "learning_rate": 2.992305838366591e-07, + "logits/chosen": -2.6530847549438477, + "logits/rejected": -2.667252540588379, + "logps/chosen": -1265.091552734375, + "logps/rejected": -1235.925537109375, + "loss": 0.6338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43354421854019165, + "rewards/margins": 0.2392309159040451, + "rewards/rejected": -0.672775149345398, + "step": 6560 + }, + { + "epoch": 1.72, + "learning_rate": 2.938355751572583e-07, + "logits/chosen": -2.712062358856201, + "logits/rejected": -2.7131266593933105, + "logps/chosen": -1501.1148681640625, + "logps/rejected": -1550.7257080078125, + "loss": 0.6582, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5113368630409241, + "rewards/margins": 0.08612775057554245, + "rewards/rejected": -0.5974645614624023, + "step": 6570 + }, + { + "epoch": 1.72, + "learning_rate": 2.8848660684928307e-07, + "logits/chosen": -2.6969046592712402, + "logits/rejected": -2.697132110595703, + "logps/chosen": -1749.205810546875, + "logps/rejected": -1346.0675048828125, + "loss": 0.6395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36252182722091675, + "rewards/margins": 0.21485641598701477, + "rewards/rejected": -0.5773781538009644, + "step": 6580 + }, + { + "epoch": 1.72, + "learning_rate": 2.8318379054017383e-07, + "logits/chosen": -2.666499376296997, + "logits/rejected": -2.661734104156494, + "logps/chosen": -1770.1800537109375, + "logps/rejected": -1656.2496337890625, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3796055018901825, + "rewards/margins": 0.18472369015216827, + "rewards/rejected": -0.5643291473388672, + "step": 6590 + }, + { + "epoch": 1.73, + "learning_rate": 2.779272368942246e-07, + "logits/chosen": -2.6835289001464844, + "logits/rejected": -2.713869094848633, + "logps/chosen": -1644.5257568359375, + "logps/rejected": -1615.684326171875, + "loss": 0.6371, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4157450199127197, + "rewards/margins": 0.14883123338222504, + "rewards/rejected": -0.564576268196106, + "step": 6600 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -2.6984503269195557, + "eval_logits/rejected": -2.689661741256714, + "eval_logps/chosen": -1605.211181640625, + "eval_logps/rejected": -1410.3033447265625, + "eval_loss": 0.644827663898468, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.4266229569911957, + "eval_rewards/margins": 0.15976297855377197, + "eval_rewards/rejected": -0.5863860249519348, + "eval_runtime": 222.1021, + "eval_samples_per_second": 9.005, + "eval_steps_per_second": 0.284, + "step": 6600 + }, + { + "epoch": 1.73, + "learning_rate": 2.7271705561027986e-07, + "logits/chosen": -2.6922175884246826, + "logits/rejected": -2.691366195678711, + "logps/chosen": -1414.966796875, + "logps/rejected": -1288.3101806640625, + "loss": 0.6145, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48495978116989136, + "rewards/margins": 0.1690906137228012, + "rewards/rejected": -0.6540504097938538, + "step": 6610 + }, + { + "epoch": 1.73, + "learning_rate": 2.6755335541943677e-07, + "logits/chosen": -2.6796720027923584, + "logits/rejected": -2.670698642730713, + "logps/chosen": -1534.7186279296875, + "logps/rejected": -1293.262939453125, + "loss": 0.6729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.531518280506134, + "rewards/margins": 0.024871502071619034, + "rewards/rejected": -0.5563897490501404, + "step": 6620 + }, + { + "epoch": 1.74, + "learning_rate": 2.62436244082781e-07, + "logits/chosen": -2.723823308944702, + "logits/rejected": -2.705821990966797, + "logps/chosen": -1626.5982666015625, + "logps/rejected": -1413.130615234375, + "loss": 0.6512, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4747316241264343, + "rewards/margins": 0.1358988732099533, + "rewards/rejected": -0.6106305122375488, + "step": 6630 + }, + { + "epoch": 1.74, + "learning_rate": 2.5736582838913836e-07, + "logits/chosen": -2.7102105617523193, + "logits/rejected": -2.702176809310913, + "logps/chosen": -1340.4083251953125, + "logps/rejected": -1251.639892578125, + "loss": 0.6995, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5650926232337952, + "rewards/margins": 0.00785607099533081, + "rewards/rejected": -0.5729486346244812, + "step": 6640 + }, + { + "epoch": 1.74, + "learning_rate": 2.5234221415284363e-07, + "logits/chosen": -2.7118804454803467, + "logits/rejected": -2.708615303039551, + "logps/chosen": -1556.7886962890625, + "logps/rejected": -1601.3795166015625, + "loss": 0.6605, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4189126491546631, + "rewards/margins": 0.17687158286571503, + "rewards/rejected": -0.5957843065261841, + "step": 6650 + }, + { + "epoch": 1.74, + "learning_rate": 2.4736550621153375e-07, + "logits/chosen": -2.7197377681732178, + "logits/rejected": -2.713139057159424, + "logps/chosen": -1586.169677734375, + "logps/rejected": -1249.861083984375, + "loss": 0.6616, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4749983251094818, + "rewards/margins": 0.15875229239463806, + "rewards/rejected": -0.6337506771087646, + "step": 6660 + }, + { + "epoch": 1.75, + "learning_rate": 2.424358084239609e-07, + "logits/chosen": -2.7143969535827637, + "logits/rejected": -2.7137789726257324, + "logps/chosen": -1450.5582275390625, + "logps/rejected": -1345.9810791015625, + "loss": 0.5912, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4128552973270416, + "rewards/margins": 0.13145416975021362, + "rewards/rejected": -0.5443094968795776, + "step": 6670 + }, + { + "epoch": 1.75, + "learning_rate": 2.3755322366782158e-07, + "logits/chosen": -2.655879497528076, + "logits/rejected": -2.6556153297424316, + "logps/chosen": -1635.095703125, + "logps/rejected": -1260.944091796875, + "loss": 0.6132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.36572757363319397, + "rewards/margins": 0.18873073160648346, + "rewards/rejected": -0.5544583201408386, + "step": 6680 + }, + { + "epoch": 1.75, + "learning_rate": 2.3271785383761431e-07, + "logits/chosen": -2.7286901473999023, + "logits/rejected": -2.691357374191284, + "logps/chosen": -1588.2955322265625, + "logps/rejected": -1274.890869140625, + "loss": 0.6173, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.39285022020339966, + "rewards/margins": 0.18746483325958252, + "rewards/rejected": -0.580315113067627, + "step": 6690 + }, + { + "epoch": 1.75, + "learning_rate": 2.2792979984250978e-07, + "logits/chosen": -2.6657614707946777, + "logits/rejected": -2.6573615074157715, + "logps/chosen": -1582.229736328125, + "logps/rejected": -956.1892700195312, + "loss": 0.6051, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43431225419044495, + "rewards/margins": 0.2427477389574051, + "rewards/rejected": -0.6770600080490112, + "step": 6700 + }, + { + "epoch": 1.75, + "eval_logits/chosen": -2.701227903366089, + "eval_logits/rejected": -2.6926941871643066, + "eval_logps/chosen": -1604.7469482421875, + "eval_logps/rejected": -1409.8746337890625, + "eval_loss": 0.6445795893669128, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.42197802662849426, + "eval_rewards/margins": 0.16012054681777954, + "eval_rewards/rejected": -0.5820986032485962, + "eval_runtime": 221.9478, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 6700 + }, + { + "epoch": 1.76, + "learning_rate": 2.231891616042453e-07, + "logits/chosen": -2.701498508453369, + "logits/rejected": -2.704812526702881, + "logps/chosen": -1382.1441650390625, + "logps/rejected": -1292.918212890625, + "loss": 0.669, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.4683159291744232, + "rewards/margins": 0.08218617737293243, + "rewards/rejected": -0.5505021810531616, + "step": 6710 + }, + { + "epoch": 1.76, + "learning_rate": 2.1849603805504328e-07, + "logits/chosen": -2.680833339691162, + "logits/rejected": -2.694401979446411, + "logps/chosen": -1717.7662353515625, + "logps/rejected": -1386.9920654296875, + "loss": 0.6116, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29614511132240295, + "rewards/margins": 0.2935263216495514, + "rewards/rejected": -0.5896713733673096, + "step": 6720 + }, + { + "epoch": 1.76, + "learning_rate": 2.1385052713554066e-07, + "logits/chosen": -2.6653406620025635, + "logits/rejected": -2.6641170978546143, + "logps/chosen": -1424.972412109375, + "logps/rejected": -1280.713623046875, + "loss": 0.6394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36866647005081177, + "rewards/margins": 0.2155831754207611, + "rewards/rejected": -0.5842496156692505, + "step": 6730 + }, + { + "epoch": 1.76, + "learning_rate": 2.0925272579274873e-07, + "logits/chosen": -2.6768558025360107, + "logits/rejected": -2.669332504272461, + "logps/chosen": -1590.535888671875, + "logps/rejected": -1380.893310546875, + "loss": 0.6491, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4337334632873535, + "rewards/margins": 0.05396395921707153, + "rewards/rejected": -0.48769742250442505, + "step": 6740 + }, + { + "epoch": 1.77, + "learning_rate": 2.047027299780302e-07, + "logits/chosen": -2.7149715423583984, + "logits/rejected": -2.712569236755371, + "logps/chosen": -1647.9619140625, + "logps/rejected": -1534.022705078125, + "loss": 0.6521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36180227994918823, + "rewards/margins": 0.08609117567539215, + "rewards/rejected": -0.44789353013038635, + "step": 6750 + }, + { + "epoch": 1.77, + "learning_rate": 2.0020063464509492e-07, + "logits/chosen": -2.6872036457061768, + "logits/rejected": -2.6867897510528564, + "logps/chosen": -1633.364990234375, + "logps/rejected": -1352.14111328125, + "loss": 0.6569, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3849290907382965, + "rewards/margins": 0.17690841853618622, + "rewards/rejected": -0.5618374943733215, + "step": 6760 + }, + { + "epoch": 1.77, + "learning_rate": 1.957465337480191e-07, + "logits/chosen": -2.6997156143188477, + "logits/rejected": -2.7141852378845215, + "logps/chosen": -1325.0081787109375, + "logps/rejected": -1167.6669921875, + "loss": 0.6109, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4304627478122711, + "rewards/margins": 0.1625189185142517, + "rewards/rejected": -0.5929816365242004, + "step": 6770 + }, + { + "epoch": 1.77, + "learning_rate": 1.9134052023928622e-07, + "logits/chosen": -2.725517749786377, + "logits/rejected": -2.714160203933716, + "logps/chosen": -1786.7330322265625, + "logps/rejected": -1655.354736328125, + "loss": 0.6293, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.39226698875427246, + "rewards/margins": 0.18630550801753998, + "rewards/rejected": -0.5785725116729736, + "step": 6780 + }, + { + "epoch": 1.78, + "learning_rate": 1.8698268606784392e-07, + "logits/chosen": -2.6865007877349854, + "logits/rejected": -2.686338424682617, + "logps/chosen": -1578.5491943359375, + "logps/rejected": -1273.3411865234375, + "loss": 0.6167, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.35822856426239014, + "rewards/margins": 0.25197267532348633, + "rewards/rejected": -0.6102012395858765, + "step": 6790 + }, + { + "epoch": 1.78, + "learning_rate": 1.826731221771866e-07, + "logits/chosen": -2.6549112796783447, + "logits/rejected": -2.6449966430664062, + "logps/chosen": -1780.359375, + "logps/rejected": -1476.2601318359375, + "loss": 0.6136, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3461179733276367, + "rewards/margins": 0.23195484280586243, + "rewards/rejected": -0.5780729055404663, + "step": 6800 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.702434539794922, + "eval_logits/rejected": -2.694024085998535, + "eval_logps/chosen": -1604.7393798828125, + "eval_logps/rejected": -1409.8861083984375, + "eval_loss": 0.6445672512054443, + "eval_rewards/accuracies": 0.6309523582458496, + "eval_rewards/chosen": -0.4219011664390564, + "eval_rewards/margins": 0.16031363606452942, + "eval_rewards/rejected": -0.5822148323059082, + "eval_runtime": 221.9949, + "eval_samples_per_second": 9.009, + "eval_steps_per_second": 0.284, + "step": 6800 + }, + { + "epoch": 1.78, + "learning_rate": 1.7841191850345967e-07, + "logits/chosen": -2.7168614864349365, + "logits/rejected": -2.7038416862487793, + "logps/chosen": -1741.0836181640625, + "logps/rejected": -1419.6201171875, + "loss": 0.6262, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44240108132362366, + "rewards/margins": 0.20116452872753143, + "rewards/rejected": -0.6435655951499939, + "step": 6810 + }, + { + "epoch": 1.78, + "learning_rate": 1.7419916397357905e-07, + "logits/chosen": -2.6968274116516113, + "logits/rejected": -2.6936259269714355, + "logps/chosen": -1782.3134765625, + "logps/rejected": -1579.941162109375, + "loss": 0.6335, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38433465361595154, + "rewards/margins": 0.12539786100387573, + "rewards/rejected": -0.5097325444221497, + "step": 6820 + }, + { + "epoch": 1.79, + "learning_rate": 1.700349465033782e-07, + "logits/chosen": -2.6470327377319336, + "logits/rejected": -2.642458438873291, + "logps/chosen": -1636.154296875, + "logps/rejected": -1551.156982421875, + "loss": 0.6415, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3473636507987976, + "rewards/margins": 0.3328436017036438, + "rewards/rejected": -0.6802071928977966, + "step": 6830 + }, + { + "epoch": 1.79, + "learning_rate": 1.6591935299577227e-07, + "logits/chosen": -2.7135472297668457, + "logits/rejected": -2.7110230922698975, + "logps/chosen": -1444.708984375, + "logps/rejected": -1374.4935302734375, + "loss": 0.6437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4015157222747803, + "rewards/margins": 0.1597764492034912, + "rewards/rejected": -0.5612921118736267, + "step": 6840 + }, + { + "epoch": 1.79, + "learning_rate": 1.6185246933894338e-07, + "logits/chosen": -2.6957876682281494, + "logits/rejected": -2.7021939754486084, + "logps/chosen": -1620.359619140625, + "logps/rejected": -1610.036865234375, + "loss": 0.6066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3133171796798706, + "rewards/margins": 0.36562633514404297, + "rewards/rejected": -0.6789435148239136, + "step": 6850 + }, + { + "epoch": 1.8, + "learning_rate": 1.5783438040455097e-07, + "logits/chosen": -2.7235379219055176, + "logits/rejected": -2.7061972618103027, + "logps/chosen": -1601.1849365234375, + "logps/rejected": -1397.846923828125, + "loss": 0.6452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4237557351589203, + "rewards/margins": 0.19110596179962158, + "rewards/rejected": -0.6148617267608643, + "step": 6860 + }, + { + "epoch": 1.8, + "learning_rate": 1.538651700459576e-07, + "logits/chosen": -2.691257953643799, + "logits/rejected": -2.6782546043395996, + "logps/chosen": -1259.538818359375, + "logps/rejected": -1211.7760009765625, + "loss": 0.6237, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.46185874938964844, + "rewards/margins": 0.11213238537311554, + "rewards/rejected": -0.5739911198616028, + "step": 6870 + }, + { + "epoch": 1.8, + "learning_rate": 1.4994492109648151e-07, + "logits/chosen": -2.708364486694336, + "logits/rejected": -2.6761398315429688, + "logps/chosen": -1647.710693359375, + "logps/rejected": -1118.4906005859375, + "loss": 0.6351, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5295812487602234, + "rewards/margins": 0.058738600462675095, + "rewards/rejected": -0.5883198976516724, + "step": 6880 + }, + { + "epoch": 1.8, + "learning_rate": 1.4607371536766695e-07, + "logits/chosen": -2.714329481124878, + "logits/rejected": -2.723923921585083, + "logps/chosen": -1706.525146484375, + "logps/rejected": -1742.4144287109375, + "loss": 0.6241, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40046390891075134, + "rewards/margins": 0.2260729968547821, + "rewards/rejected": -0.6265369057655334, + "step": 6890 + }, + { + "epoch": 1.81, + "learning_rate": 1.4225163364757655e-07, + "logits/chosen": -2.7081897258758545, + "logits/rejected": -2.6999526023864746, + "logps/chosen": -1839.845703125, + "logps/rejected": -1673.989013671875, + "loss": 0.6503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35931825637817383, + "rewards/margins": 0.24749819934368134, + "rewards/rejected": -0.6068164706230164, + "step": 6900 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.7030396461486816, + "eval_logits/rejected": -2.694699287414551, + "eval_logps/chosen": -1604.7735595703125, + "eval_logps/rejected": -1409.9207763671875, + "eval_loss": 0.644517719745636, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": -0.42224541306495667, + "eval_rewards/margins": 0.16031552851200104, + "eval_rewards/rejected": -0.5825609564781189, + "eval_runtime": 221.8784, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 0.284, + "step": 6900 + }, + { + "epoch": 1.81, + "learning_rate": 1.3847875569910462e-07, + "logits/chosen": -2.725914716720581, + "logits/rejected": -2.7312119007110596, + "logps/chosen": -1705.2923583984375, + "logps/rejected": -1547.054443359375, + "loss": 0.6337, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3760010004043579, + "rewards/margins": 0.27584725618362427, + "rewards/rejected": -0.6518482565879822, + "step": 6910 + }, + { + "epoch": 1.81, + "learning_rate": 1.3475516025831552e-07, + "logits/chosen": -2.7132887840270996, + "logits/rejected": -2.7062325477600098, + "logps/chosen": -1419.587646484375, + "logps/rejected": -1096.178466796875, + "loss": 0.6266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4135599136352539, + "rewards/margins": 0.21963760256767273, + "rewards/rejected": -0.633197546005249, + "step": 6920 + }, + { + "epoch": 1.81, + "learning_rate": 1.310809250327974e-07, + "logits/chosen": -2.7178874015808105, + "logits/rejected": -2.687227487564087, + "logps/chosen": -1344.69091796875, + "logps/rejected": -1032.5728759765625, + "loss": 0.6345, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42153486609458923, + "rewards/margins": 0.18064364790916443, + "rewards/rejected": -0.6021785140037537, + "step": 6930 + }, + { + "epoch": 1.82, + "learning_rate": 1.2745612670004153e-07, + "logits/chosen": -2.6864330768585205, + "logits/rejected": -2.6985878944396973, + "logps/chosen": -1467.687744140625, + "logps/rejected": -1250.664306640625, + "loss": 0.6185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4021376669406891, + "rewards/margins": 0.30139079689979553, + "rewards/rejected": -0.7035284638404846, + "step": 6940 + }, + { + "epoch": 1.82, + "learning_rate": 1.2388084090584395e-07, + "logits/chosen": -2.6969826221466064, + "logits/rejected": -2.687559127807617, + "logps/chosen": -1678.2601318359375, + "logps/rejected": -1455.1302490234375, + "loss": 0.6654, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5606560111045837, + "rewards/margins": 0.09997959434986115, + "rewards/rejected": -0.6606355905532837, + "step": 6950 + }, + { + "epoch": 1.82, + "learning_rate": 1.2035514226272305e-07, + "logits/chosen": -2.63489031791687, + "logits/rejected": -2.646099805831909, + "logps/chosen": -1444.5469970703125, + "logps/rejected": -1406.29541015625, + "loss": 0.6466, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43905216455459595, + "rewards/margins": 0.15955057740211487, + "rewards/rejected": -0.5986027717590332, + "step": 6960 + }, + { + "epoch": 1.82, + "learning_rate": 1.1687910434836607e-07, + "logits/chosen": -2.6887829303741455, + "logits/rejected": -2.684683084487915, + "logps/chosen": -1515.1943359375, + "logps/rejected": -1243.822998046875, + "loss": 0.6022, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3628450036048889, + "rewards/margins": 0.33365586400032043, + "rewards/rejected": -0.696500837802887, + "step": 6970 + }, + { + "epoch": 1.83, + "learning_rate": 1.1345279970409128e-07, + "logits/chosen": -2.6827330589294434, + "logits/rejected": -2.6747491359710693, + "logps/chosen": -1549.365966796875, + "logps/rejected": -1341.9677734375, + "loss": 0.639, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.497032105922699, + "rewards/margins": 0.08594363182783127, + "rewards/rejected": -0.5829757452011108, + "step": 6980 + }, + { + "epoch": 1.83, + "learning_rate": 1.1007629983333629e-07, + "logits/chosen": -2.6825199127197266, + "logits/rejected": -2.682945728302002, + "logps/chosen": -1833.2259521484375, + "logps/rejected": -1173.13427734375, + "loss": 0.6461, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.431971937417984, + "rewards/margins": 0.20928311347961426, + "rewards/rejected": -0.6412550210952759, + "step": 6990 + }, + { + "epoch": 1.83, + "learning_rate": 1.067496752001626e-07, + "logits/chosen": -2.7132515907287598, + "logits/rejected": -2.7097418308258057, + "logps/chosen": -1519.0521240234375, + "logps/rejected": -1273.3426513671875, + "loss": 0.6318, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4385862350463867, + "rewards/margins": 0.07204887270927429, + "rewards/rejected": -0.5106351375579834, + "step": 7000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -2.7010436058044434, + "eval_logits/rejected": -2.692471742630005, + "eval_logps/chosen": -1604.7110595703125, + "eval_logps/rejected": -1409.8387451171875, + "eval_loss": 0.6445296406745911, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.4216185212135315, + "eval_rewards/margins": 0.1601227968931198, + "eval_rewards/rejected": -0.581741213798523, + "eval_runtime": 222.0778, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 0.284, + "step": 7000 + }, + { + "epoch": 1.83, + "learning_rate": 1.0347299522778909e-07, + "logits/chosen": -2.683096408843994, + "logits/rejected": -2.65643572807312, + "logps/chosen": -1366.478759765625, + "logps/rejected": -1104.1004638671875, + "loss": 0.6471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4307782053947449, + "rewards/margins": 0.23234939575195312, + "rewards/rejected": -0.663127601146698, + "step": 7010 + }, + { + "epoch": 1.84, + "learning_rate": 1.0024632829713971e-07, + "logits/chosen": -2.6967613697052, + "logits/rejected": -2.6798155307769775, + "logps/chosen": -1273.128173828125, + "logps/rejected": -1150.955078125, + "loss": 0.6447, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39100247621536255, + "rewards/margins": 0.12937206029891968, + "rewards/rejected": -0.5203745365142822, + "step": 7020 + }, + { + "epoch": 1.84, + "learning_rate": 9.706974174541889e-08, + "logits/chosen": -2.6890580654144287, + "logits/rejected": -2.6844520568847656, + "logps/chosen": -1574.6275634765625, + "logps/rejected": -1529.202880859375, + "loss": 0.6431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4391850531101227, + "rewards/margins": 0.01236086804419756, + "rewards/rejected": -0.4515458941459656, + "step": 7030 + }, + { + "epoch": 1.84, + "learning_rate": 9.39433018647043e-08, + "logits/chosen": -2.6954872608184814, + "logits/rejected": -2.686889171600342, + "logps/chosen": -1622.2542724609375, + "logps/rejected": -1530.85888671875, + "loss": 0.6221, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32484784722328186, + "rewards/margins": 0.20420944690704346, + "rewards/rejected": -0.5290572643280029, + "step": 7040 + }, + { + "epoch": 1.85, + "learning_rate": 9.086707390056543e-08, + "logits/chosen": -2.716411828994751, + "logits/rejected": -2.7104315757751465, + "logps/chosen": -1577.9422607421875, + "logps/rejected": -1285.207763671875, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37132635712623596, + "rewards/margins": 0.163702130317688, + "rewards/rejected": -0.5350284576416016, + "step": 7050 + }, + { + "epoch": 1.85, + "learning_rate": 8.784112205070083e-08, + "logits/chosen": -2.6795337200164795, + "logits/rejected": -2.689896583557129, + "logps/chosen": -1706.0953369140625, + "logps/rejected": -1669.0413818359375, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45116591453552246, + "rewards/margins": 0.21349339187145233, + "rewards/rejected": -0.6646592617034912, + "step": 7060 + }, + { + "epoch": 1.85, + "learning_rate": 8.486550946359779e-08, + "logits/chosen": -2.7108492851257324, + "logits/rejected": -2.698906421661377, + "logps/chosen": -1502.298095703125, + "logps/rejected": -1157.237060546875, + "loss": 0.6307, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.48375552892684937, + "rewards/margins": 0.08426637947559357, + "rewards/rejected": -0.5680218935012817, + "step": 7070 + }, + { + "epoch": 1.85, + "learning_rate": 8.194029823721556e-08, + "logits/chosen": -2.697640895843506, + "logits/rejected": -2.6725358963012695, + "logps/chosen": -1769.990234375, + "logps/rejected": -1637.005126953125, + "loss": 0.6669, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3244650363922119, + "rewards/margins": 0.21794219315052032, + "rewards/rejected": -0.542407214641571, + "step": 7080 + }, + { + "epoch": 1.86, + "learning_rate": 7.906554941768896e-08, + "logits/chosen": -2.7125067710876465, + "logits/rejected": -2.7144935131073, + "logps/chosen": -1664.8450927734375, + "logps/rejected": -1524.383544921875, + "loss": 0.6529, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.49117860198020935, + "rewards/margins": 0.029221346601843834, + "rewards/rejected": -0.5203999876976013, + "step": 7090 + }, + { + "epoch": 1.86, + "learning_rate": 7.624132299805575e-08, + "logits/chosen": -2.6807377338409424, + "logits/rejected": -2.694531202316284, + "logps/chosen": -1521.4224853515625, + "logps/rejected": -1667.6331787109375, + "loss": 0.6493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5217372179031372, + "rewards/margins": 0.11972247064113617, + "rewards/rejected": -0.641459584236145, + "step": 7100 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.702463388442993, + "eval_logits/rejected": -2.6940252780914307, + "eval_logps/chosen": -1604.70263671875, + "eval_logps/rejected": -1409.81787109375, + "eval_loss": 0.6445424556732178, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.42153695225715637, + "eval_rewards/margins": 0.15999405086040497, + "eval_rewards/rejected": -0.5815309882164001, + "eval_runtime": 221.796, + "eval_samples_per_second": 9.017, + "eval_steps_per_second": 0.284, + "step": 7100 + }, + { + "epoch": 1.86, + "learning_rate": 7.346767791700127e-08, + "logits/chosen": -2.706827163696289, + "logits/rejected": -2.714289665222168, + "logps/chosen": -1348.616455078125, + "logps/rejected": -1254.7540283203125, + "loss": 0.6332, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47383370995521545, + "rewards/margins": 0.15515919029712677, + "rewards/rejected": -0.6289928555488586, + "step": 7110 + }, + { + "epoch": 1.86, + "learning_rate": 7.07446720576327e-08, + "logits/chosen": -2.7369492053985596, + "logits/rejected": -2.73970890045166, + "logps/chosen": -1749.2347412109375, + "logps/rejected": -1596.642333984375, + "loss": 0.6375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.46583041548728943, + "rewards/margins": 0.057837147265672684, + "rewards/rejected": -0.523667573928833, + "step": 7120 + }, + { + "epoch": 1.87, + "learning_rate": 6.807236224626701e-08, + "logits/chosen": -2.689509868621826, + "logits/rejected": -2.672372341156006, + "logps/chosen": -1553.8916015625, + "logps/rejected": -1422.921875, + "loss": 0.6388, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4380861222743988, + "rewards/margins": 0.23138687014579773, + "rewards/rejected": -0.6694729924201965, + "step": 7130 + }, + { + "epoch": 1.87, + "learning_rate": 6.545080425124888e-08, + "logits/chosen": -2.7350094318389893, + "logits/rejected": -2.7102842330932617, + "logps/chosen": -1565.059814453125, + "logps/rejected": -983.6837768554688, + "loss": 0.6344, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3876848518848419, + "rewards/margins": 0.2866832911968231, + "rewards/rejected": -0.6743682026863098, + "step": 7140 + }, + { + "epoch": 1.87, + "learning_rate": 6.288005278178382e-08, + "logits/chosen": -2.7340786457061768, + "logits/rejected": -2.7070319652557373, + "logps/chosen": -1651.146240234375, + "logps/rejected": -1310.5484619140625, + "loss": 0.6414, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4129902720451355, + "rewards/margins": 0.18922848999500275, + "rewards/rejected": -0.6022188067436218, + "step": 7150 + }, + { + "epoch": 1.87, + "learning_rate": 6.036016148679825e-08, + "logits/chosen": -2.685917377471924, + "logits/rejected": -2.673793315887451, + "logps/chosen": -1528.411865234375, + "logps/rejected": -1308.5904541015625, + "loss": 0.6236, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4521363377571106, + "rewards/margins": 0.14088140428066254, + "rewards/rejected": -0.5930176973342896, + "step": 7160 + }, + { + "epoch": 1.88, + "learning_rate": 5.7891182953819235e-08, + "logits/chosen": -2.68344783782959, + "logits/rejected": -2.680868625640869, + "logps/chosen": -1653.4996337890625, + "logps/rejected": -1406.031494140625, + "loss": 0.6425, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5184036493301392, + "rewards/margins": 0.12173338234424591, + "rewards/rejected": -0.6401370167732239, + "step": 7170 + }, + { + "epoch": 1.88, + "learning_rate": 5.547316870787689e-08, + "logits/chosen": -2.7209701538085938, + "logits/rejected": -2.7010300159454346, + "logps/chosen": -1677.9332275390625, + "logps/rejected": -1209.544677734375, + "loss": 0.6339, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5062988996505737, + "rewards/margins": 0.18664419651031494, + "rewards/rejected": -0.6929429769515991, + "step": 7180 + }, + { + "epoch": 1.88, + "learning_rate": 5.310616921042927e-08, + "logits/chosen": -2.660794734954834, + "logits/rejected": -2.6702022552490234, + "logps/chosen": -1407.279052734375, + "logps/rejected": -1222.977294921875, + "loss": 0.6418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39660215377807617, + "rewards/margins": 0.10538413375616074, + "rewards/rejected": -0.5019862651824951, + "step": 7190 + }, + { + "epoch": 1.88, + "learning_rate": 5.079023385830939e-08, + "logits/chosen": -2.6666197776794434, + "logits/rejected": -2.6605982780456543, + "logps/chosen": -1460.6802978515625, + "logps/rejected": -1327.25244140625, + "loss": 0.6292, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31100255250930786, + "rewards/margins": 0.2334582507610321, + "rewards/rejected": -0.5444608926773071, + "step": 7200 + }, + { + "epoch": 1.88, + "eval_logits/chosen": -2.702709674835205, + "eval_logits/rejected": -2.69431209564209, + "eval_logps/chosen": -1604.719482421875, + "eval_logps/rejected": -1409.822265625, + "eval_loss": 0.6445853114128113, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.4217035472393036, + "eval_rewards/margins": 0.15987294912338257, + "eval_rewards/rejected": -0.5815765261650085, + "eval_runtime": 221.9519, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 0.284, + "step": 7200 + }, + { + "epoch": 1.89, + "learning_rate": 4.8525410982695476e-08, + "logits/chosen": -2.654919147491455, + "logits/rejected": -2.6503231525421143, + "logps/chosen": -1528.534423828125, + "logps/rejected": -1125.8482666015625, + "loss": 0.6448, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33400973677635193, + "rewards/margins": 0.23223035037517548, + "rewards/rejected": -0.5662400722503662, + "step": 7210 + }, + { + "epoch": 1.89, + "learning_rate": 4.6311747848099e-08, + "logits/chosen": -2.6991829872131348, + "logits/rejected": -2.6920909881591797, + "logps/chosen": -1656.469970703125, + "logps/rejected": -1294.0587158203125, + "loss": 0.6691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.45552200078964233, + "rewards/margins": 0.07368168979883194, + "rewards/rejected": -0.5292037725448608, + "step": 7220 + }, + { + "epoch": 1.89, + "learning_rate": 4.4149290651382405e-08, + "logits/chosen": -2.652975082397461, + "logits/rejected": -2.6496188640594482, + "logps/chosen": -1439.092041015625, + "logps/rejected": -1270.6805419921875, + "loss": 0.6202, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.49305492639541626, + "rewards/margins": 0.0813017338514328, + "rewards/rejected": -0.5743566155433655, + "step": 7230 + }, + { + "epoch": 1.89, + "learning_rate": 4.203808452079211e-08, + "logits/chosen": -2.737536907196045, + "logits/rejected": -2.713040828704834, + "logps/chosen": -1595.1773681640625, + "logps/rejected": -1512.947021484375, + "loss": 0.6126, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3274495005607605, + "rewards/margins": 0.3223855197429657, + "rewards/rejected": -0.6498350501060486, + "step": 7240 + }, + { + "epoch": 1.9, + "learning_rate": 3.9978173515018427e-08, + "logits/chosen": -2.7084734439849854, + "logits/rejected": -2.69303035736084, + "logps/chosen": -1404.4498291015625, + "logps/rejected": -1423.9259033203125, + "loss": 0.6338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5082755088806152, + "rewards/margins": 0.11508085578680038, + "rewards/rejected": -0.623356282711029, + "step": 7250 + }, + { + "epoch": 1.9, + "learning_rate": 3.7969600622274614e-08, + "logits/chosen": -2.7099738121032715, + "logits/rejected": -2.6851272583007812, + "logps/chosen": -1558.6534423828125, + "logps/rejected": -1573.78076171875, + "loss": 0.6469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38475877046585083, + "rewards/margins": 0.12393616139888763, + "rewards/rejected": -0.5086949467658997, + "step": 7260 + }, + { + "epoch": 1.9, + "learning_rate": 3.601240775940151e-08, + "logits/chosen": -2.683004856109619, + "logits/rejected": -2.681644916534424, + "logps/chosen": -1144.97802734375, + "logps/rejected": -1189.7843017578125, + "loss": 0.6497, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4038308560848236, + "rewards/margins": 0.14985907077789307, + "rewards/rejected": -0.5536898970603943, + "step": 7270 + }, + { + "epoch": 1.91, + "learning_rate": 3.410663577099071e-08, + "logits/chosen": -2.709052562713623, + "logits/rejected": -2.668560266494751, + "logps/chosen": -1598.82861328125, + "logps/rejected": -1156.0931396484375, + "loss": 0.6498, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44105497002601624, + "rewards/margins": 0.27089494466781616, + "rewards/rejected": -0.7119500041007996, + "step": 7280 + }, + { + "epoch": 1.91, + "learning_rate": 3.2252324428534986e-08, + "logits/chosen": -2.734286069869995, + "logits/rejected": -2.7319412231445312, + "logps/chosen": -1638.3583984375, + "logps/rejected": -1431.2901611328125, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44060295820236206, + "rewards/margins": 0.13731783628463745, + "rewards/rejected": -0.5779208540916443, + "step": 7290 + }, + { + "epoch": 1.91, + "learning_rate": 3.0449512429594486e-08, + "logits/chosen": -2.7285077571868896, + "logits/rejected": -2.7121500968933105, + "logps/chosen": -1547.3211669921875, + "logps/rejected": -1328.563720703125, + "loss": 0.625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39875486493110657, + "rewards/margins": 0.27655380964279175, + "rewards/rejected": -0.6753085851669312, + "step": 7300 + }, + { + "epoch": 1.91, + "eval_logits/chosen": -2.702165365219116, + "eval_logits/rejected": -2.693725109100342, + "eval_logps/chosen": -1604.7012939453125, + "eval_logps/rejected": -1409.8218994140625, + "eval_loss": 0.6445257067680359, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -0.42152103781700134, + "eval_rewards/margins": 0.16004998981952667, + "eval_rewards/rejected": -0.5815710425376892, + "eval_runtime": 221.9011, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.284, + "step": 7300 + }, + { + "epoch": 1.91, + "learning_rate": 2.8698237396992956e-08, + "logits/chosen": -2.7125720977783203, + "logits/rejected": -2.710803508758545, + "logps/chosen": -1958.1751708984375, + "logps/rejected": -1697.903076171875, + "loss": 0.6355, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3638436198234558, + "rewards/margins": 0.1362782120704651, + "rewards/rejected": -0.5001217722892761, + "step": 7310 + }, + { + "epoch": 1.92, + "learning_rate": 2.6998535878030584e-08, + "logits/chosen": -2.687643527984619, + "logits/rejected": -2.6909823417663574, + "logps/chosen": -1759.569091796875, + "logps/rejected": -1552.631591796875, + "loss": 0.6333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39130860567092896, + "rewards/margins": 0.19722767174243927, + "rewards/rejected": -0.5885363221168518, + "step": 7320 + }, + { + "epoch": 1.92, + "learning_rate": 2.535044334372072e-08, + "logits/chosen": -2.698178768157959, + "logits/rejected": -2.7111730575561523, + "logps/chosen": -1635.5369873046875, + "logps/rejected": -1726.328369140625, + "loss": 0.649, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42986369132995605, + "rewards/margins": 0.16456125676631927, + "rewards/rejected": -0.5944249033927917, + "step": 7330 + }, + { + "epoch": 1.92, + "learning_rate": 2.3753994188051853e-08, + "logits/chosen": -2.7270450592041016, + "logits/rejected": -2.7080130577087402, + "logps/chosen": -1637.684814453125, + "logps/rejected": -1292.521484375, + "loss": 0.615, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38862666487693787, + "rewards/margins": 0.2333899438381195, + "rewards/rejected": -0.6220166087150574, + "step": 7340 + }, + { + "epoch": 1.92, + "learning_rate": 2.220922172726764e-08, + "logits/chosen": -2.6930315494537354, + "logits/rejected": -2.685628652572632, + "logps/chosen": -1648.4945068359375, + "logps/rejected": -1635.29248046875, + "loss": 0.6388, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.4903673529624939, + "rewards/margins": 0.09882920235395432, + "rewards/rejected": -0.5891965627670288, + "step": 7350 + }, + { + "epoch": 1.93, + "learning_rate": 2.071615819917244e-08, + "logits/chosen": -2.726003885269165, + "logits/rejected": -2.7233633995056152, + "logps/chosen": -1906.630615234375, + "logps/rejected": -1663.8984375, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3871636986732483, + "rewards/margins": 0.10992630571126938, + "rewards/rejected": -0.49709001183509827, + "step": 7360 + }, + { + "epoch": 1.93, + "learning_rate": 1.9274834762459393e-08, + "logits/chosen": -2.6870107650756836, + "logits/rejected": -2.6754543781280518, + "logps/chosen": -1455.197509765625, + "logps/rejected": -1335.19140625, + "loss": 0.6257, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4132939279079437, + "rewards/margins": 0.26970240473747253, + "rewards/rejected": -0.682996392250061, + "step": 7370 + }, + { + "epoch": 1.93, + "learning_rate": 1.7885281496058947e-08, + "logits/chosen": -2.7087411880493164, + "logits/rejected": -2.6738803386688232, + "logps/chosen": -1738.7806396484375, + "logps/rejected": -1245.742919921875, + "loss": 0.6571, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3533250391483307, + "rewards/margins": 0.21966774761676788, + "rewards/rejected": -0.5729928016662598, + "step": 7380 + }, + { + "epoch": 1.93, + "learning_rate": 1.654752739851134e-08, + "logits/chosen": -2.7014052867889404, + "logits/rejected": -2.693253993988037, + "logps/chosen": -1632.851806640625, + "logps/rejected": -1454.01416015625, + "loss": 0.6522, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.36678558588027954, + "rewards/margins": 0.1973138153553009, + "rewards/rejected": -0.5640994310379028, + "step": 7390 + }, + { + "epoch": 1.94, + "learning_rate": 1.526160038736235e-08, + "logits/chosen": -2.6802146434783936, + "logits/rejected": -2.6759490966796875, + "logps/chosen": -1576.840576171875, + "logps/rejected": -1372.6458740234375, + "loss": 0.6306, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5172873139381409, + "rewards/margins": 0.12491671741008759, + "rewards/rejected": -0.6422039270401001, + "step": 7400 + }, + { + "epoch": 1.94, + "eval_logits/chosen": -2.702148914337158, + "eval_logits/rejected": -2.6937015056610107, + "eval_logps/chosen": -1604.724365234375, + "eval_logps/rejected": -1409.8013916015625, + "eval_loss": 0.6445860862731934, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.4217517077922821, + "eval_rewards/margins": 0.15961501002311707, + "eval_rewards/rejected": -0.5813668370246887, + "eval_runtime": 221.8611, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 0.284, + "step": 7400 + }, + { + "epoch": 1.94, + "learning_rate": 1.402752729857959e-08, + "logits/chosen": -2.701080322265625, + "logits/rejected": -2.704324722290039, + "logps/chosen": -1448.97900390625, + "logps/rejected": -1565.101806640625, + "loss": 0.6573, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.45339909195899963, + "rewards/margins": 0.009469692595303059, + "rewards/rejected": -0.4628687798976898, + "step": 7410 + }, + { + "epoch": 1.94, + "learning_rate": 1.2845333885992683e-08, + "logits/chosen": -2.6895458698272705, + "logits/rejected": -2.691756010055542, + "logps/chosen": -1508.894287109375, + "logps/rejected": -1402.9140625, + "loss": 0.6277, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39680013060569763, + "rewards/margins": 0.17420104146003723, + "rewards/rejected": -0.5710011720657349, + "step": 7420 + }, + { + "epoch": 1.94, + "learning_rate": 1.171504482075675e-08, + "logits/chosen": -2.687668800354004, + "logits/rejected": -2.6926398277282715, + "logps/chosen": -1556.4830322265625, + "logps/rejected": -1661.039794921875, + "loss": 0.613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4860979914665222, + "rewards/margins": 0.2525389492511749, + "rewards/rejected": -0.7386370301246643, + "step": 7430 + }, + { + "epoch": 1.95, + "learning_rate": 1.0636683690836147e-08, + "logits/chosen": -2.7024621963500977, + "logits/rejected": -2.6893680095672607, + "logps/chosen": -1638.73046875, + "logps/rejected": -1485.549072265625, + "loss": 0.6639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5557451248168945, + "rewards/margins": -0.012395946308970451, + "rewards/rejected": -0.5433492064476013, + "step": 7440 + }, + { + "epoch": 1.95, + "learning_rate": 9.610273000513203e-09, + "logits/chosen": -2.6708273887634277, + "logits/rejected": -2.675523281097412, + "logps/chosen": -1538.5250244140625, + "logps/rejected": -1047.2149658203125, + "loss": 0.6475, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4156258702278137, + "rewards/margins": 0.1110345870256424, + "rewards/rejected": -0.5266603827476501, + "step": 7450 + }, + { + "epoch": 1.95, + "learning_rate": 8.635834169918312e-09, + "logits/chosen": -2.7205698490142822, + "logits/rejected": -2.7201004028320312, + "logps/chosen": -1408.6478271484375, + "logps/rejected": -1399.743408203125, + "loss": 0.6223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4861675798892975, + "rewards/margins": 0.19877155125141144, + "rewards/rejected": -0.6849390864372253, + "step": 7460 + }, + { + "epoch": 1.95, + "learning_rate": 7.713387534582506e-09, + "logits/chosen": -2.6804215908050537, + "logits/rejected": -2.6637444496154785, + "logps/chosen": -1541.433349609375, + "logps/rejected": -1571.2962646484375, + "loss": 0.6215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3475147783756256, + "rewards/margins": 0.16994670033454895, + "rewards/rejected": -0.5174614787101746, + "step": 7470 + }, + { + "epoch": 1.96, + "learning_rate": 6.84295234501392e-09, + "logits/chosen": -2.7089502811431885, + "logits/rejected": -2.712880849838257, + "logps/chosen": -1744.5091552734375, + "logps/rejected": -1501.75244140625, + "loss": 0.624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4259180426597595, + "rewards/margins": 0.2688801884651184, + "rewards/rejected": -0.6947982907295227, + "step": 7480 + }, + { + "epoch": 1.96, + "learning_rate": 6.024546766295325e-09, + "logits/chosen": -2.668334484100342, + "logits/rejected": -2.6527392864227295, + "logps/chosen": -1190.046142578125, + "logps/rejected": -1228.147216796875, + "loss": 0.6477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5390468835830688, + "rewards/margins": 0.11457918584346771, + "rewards/rejected": -0.653626024723053, + "step": 7490 + }, + { + "epoch": 1.96, + "learning_rate": 5.2581878777049895e-09, + "logits/chosen": -2.663644552230835, + "logits/rejected": -2.6548047065734863, + "logps/chosen": -1755.512451171875, + "logps/rejected": -1462.0538330078125, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4531777501106262, + "rewards/margins": 0.19328074157238007, + "rewards/rejected": -0.6464585065841675, + "step": 7500 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.702141284942627, + "eval_logits/rejected": -2.6936912536621094, + "eval_logps/chosen": -1604.7235107421875, + "eval_logps/rejected": -1409.80029296875, + "eval_loss": 0.6445866227149963, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.4217440187931061, + "eval_rewards/margins": 0.15961241722106934, + "eval_rewards/rejected": -0.5813564658164978, + "eval_runtime": 221.8894, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.284, + "step": 7500 + }, + { + "epoch": 1.97, + "learning_rate": 4.543891672361411e-09, + "logits/chosen": -2.713366746902466, + "logits/rejected": -2.734687328338623, + "logps/chosen": -1524.438232421875, + "logps/rejected": -1415.349365234375, + "loss": 0.6461, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3279759883880615, + "rewards/margins": 0.17746631801128387, + "rewards/rejected": -0.5054423213005066, + "step": 7510 + }, + { + "epoch": 1.97, + "learning_rate": 3.881673056887747e-09, + "logits/chosen": -2.6779403686523438, + "logits/rejected": -2.6494932174682617, + "logps/chosen": -1766.4049072265625, + "logps/rejected": -1750.7711181640625, + "loss": 0.6198, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4086696207523346, + "rewards/margins": 0.3214671015739441, + "rewards/rejected": -0.7301367521286011, + "step": 7520 + }, + { + "epoch": 1.97, + "learning_rate": 3.2715458511023425e-09, + "logits/chosen": -2.7254676818847656, + "logits/rejected": -2.7072927951812744, + "logps/chosen": -1588.3248291015625, + "logps/rejected": -1312.022705078125, + "loss": 0.6114, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3616897463798523, + "rewards/margins": 0.27325281500816345, + "rewards/rejected": -0.6349425315856934, + "step": 7530 + }, + { + "epoch": 1.97, + "learning_rate": 2.7135227877289617e-09, + "logits/chosen": -2.699939727783203, + "logits/rejected": -2.6992363929748535, + "logps/chosen": -1314.650146484375, + "logps/rejected": -1154.2691650390625, + "loss": 0.6657, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.4900095462799072, + "rewards/margins": 0.06088308244943619, + "rewards/rejected": -0.55089271068573, + "step": 7540 + }, + { + "epoch": 1.98, + "learning_rate": 2.2076155121328326e-09, + "logits/chosen": -2.7241501808166504, + "logits/rejected": -2.705294609069824, + "logps/chosen": -1565.5633544921875, + "logps/rejected": -1411.188720703125, + "loss": 0.6372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.36127161979675293, + "rewards/margins": 0.26764652132987976, + "rewards/rejected": -0.6289182305335999, + "step": 7550 + }, + { + "epoch": 1.98, + "learning_rate": 1.7538345820755641e-09, + "logits/chosen": -2.730531692504883, + "logits/rejected": -2.7179620265960693, + "logps/chosen": -1499.7626953125, + "logps/rejected": -1195.4996337890625, + "loss": 0.6541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4179004728794098, + "rewards/margins": 0.2592325508594513, + "rewards/rejected": -0.6771329641342163, + "step": 7560 + }, + { + "epoch": 1.98, + "learning_rate": 1.3521894674961567e-09, + "logits/chosen": -2.6926231384277344, + "logits/rejected": -2.711402177810669, + "logps/chosen": -1196.3023681640625, + "logps/rejected": -1280.1029052734375, + "loss": 0.6327, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47418227791786194, + "rewards/margins": 0.17429625988006592, + "rewards/rejected": -0.6484785676002502, + "step": 7570 + }, + { + "epoch": 1.98, + "learning_rate": 1.0026885503131023e-09, + "logits/chosen": -2.729646921157837, + "logits/rejected": -2.731315851211548, + "logps/chosen": -1656.604248046875, + "logps/rejected": -1487.271728515625, + "loss": 0.6553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5302258729934692, + "rewards/margins": 0.10667018592357635, + "rewards/rejected": -0.6368960738182068, + "step": 7580 + }, + { + "epoch": 1.99, + "learning_rate": 7.053391242492491e-10, + "logits/chosen": -2.6965746879577637, + "logits/rejected": -2.7005808353424072, + "logps/chosen": -1340.211181640625, + "logps/rejected": -1152.303955078125, + "loss": 0.6525, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4404354691505432, + "rewards/margins": 0.14564061164855957, + "rewards/rejected": -0.5860761404037476, + "step": 7590 + }, + { + "epoch": 1.99, + "learning_rate": 4.6014739467997725e-10, + "logits/chosen": -2.7206387519836426, + "logits/rejected": -2.7116284370422363, + "logps/chosen": -1512.256103515625, + "logps/rejected": -1473.5595703125, + "loss": 0.6394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44393259286880493, + "rewards/margins": 0.21005916595458984, + "rewards/rejected": -0.6539917588233948, + "step": 7600 + }, + { + "epoch": 1.99, + "eval_logits/chosen": -2.702141284942627, + "eval_logits/rejected": -2.6936912536621094, + "eval_logps/chosen": -1604.7235107421875, + "eval_logps/rejected": -1409.80029296875, + "eval_loss": 0.6445866227149963, + "eval_rewards/accuracies": 0.6289682388305664, + "eval_rewards/chosen": -0.4217440187931061, + "eval_rewards/margins": 0.15961241722106934, + "eval_rewards/rejected": -0.5813564658164978, + "eval_runtime": 222.104, + "eval_samples_per_second": 9.005, + "eval_steps_per_second": 0.284, + "step": 7600 + }, + { + "epoch": 1.99, + "learning_rate": 2.671184785033032e-10, + "logits/chosen": -2.687058925628662, + "logits/rejected": -2.6820104122161865, + "logps/chosen": -1634.839599609375, + "logps/rejected": -1303.875244140625, + "loss": 0.6623, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.48275700211524963, + "rewards/margins": -0.0027821571566164494, + "rewards/rejected": -0.4799748957157135, + "step": 7610 + }, + { + "epoch": 1.99, + "learning_rate": 1.2625640403302054e-10, + "logits/chosen": -2.698061466217041, + "logits/rejected": -2.6827805042266846, + "logps/chosen": -1436.16748046875, + "logps/rejected": -1356.511962890625, + "loss": 0.6492, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47642627358436584, + "rewards/margins": 0.1004827618598938, + "rewards/rejected": -0.5769090056419373, + "step": 7620 + }, + { + "epoch": 2.0, + "learning_rate": 3.756411091515588e-11, + "logits/chosen": -2.6850998401641846, + "logits/rejected": -2.6807219982147217, + "logps/chosen": -1597.2109375, + "logps/rejected": -1336.142578125, + "loss": 0.6746, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43785446882247925, + "rewards/margins": 0.1036500483751297, + "rewards/rejected": -0.5415045022964478, + "step": 7630 + }, + { + "epoch": 2.0, + "learning_rate": 1.0434500657963143e-12, + "logits/chosen": -2.703892946243286, + "logits/rejected": -2.7021007537841797, + "logps/chosen": -1478.006591796875, + "logps/rejected": -1315.51318359375, + "loss": 0.6228, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37986981868743896, + "rewards/margins": 0.44364243745803833, + "rewards/rejected": -0.8235122561454773, + "step": 7640 + }, + { + "epoch": 2.0, + "step": 7642, + "total_flos": 0.0, + "train_loss": 0.6517634629204897, + "train_runtime": 44544.264, + "train_samples_per_second": 2.745, + "train_steps_per_second": 0.172 + } + ], + "logging_steps": 10, + "max_steps": 7642, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}