{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 188.96707295865775, "learning_rate": 9.9856e-07, "logits/chosen": 0.617413341999054, "logits/rejected": 0.849047839641571, "logps/chosen": -233.625, "logps/rejected": -191.3874969482422, "loss": 0.6734, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.07488174736499786, "rewards/margins": 0.050000764429569244, "rewards/rejected": 0.02491302415728569, "step": 10 }, { "epoch": 0.0032, "grad_norm": 127.04732651118793, "learning_rate": 9.9696e-07, "logits/chosen": 0.587292492389679, "logits/rejected": 0.6776123046875, "logps/chosen": -205.1999969482422, "logps/rejected": -217.46249389648438, "loss": 0.6247, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.20032043755054474, "rewards/margins": 0.22720031440258026, "rewards/rejected": -0.02682647667825222, "step": 20 }, { "epoch": 0.0048, "grad_norm": 161.60402279249308, "learning_rate": 9.9536e-07, "logits/chosen": 0.36060792207717896, "logits/rejected": 0.5626770257949829, "logps/chosen": -249.3625030517578, "logps/rejected": -208.1999969482422, "loss": 0.5836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.3112548887729645, "rewards/margins": 0.4395080506801605, "rewards/rejected": -0.128204345703125, "step": 30 }, { "epoch": 0.0064, "grad_norm": 91.1270673616462, "learning_rate": 9.937599999999999e-07, "logits/chosen": 0.4085754454135895, "logits/rejected": 0.5856262445449829, "logps/chosen": -233.97500610351562, "logps/rejected": -200.47500610351562, "loss": 0.5388, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2527832090854645, "rewards/margins": 0.765045166015625, "rewards/rejected": -0.5127624273300171, "step": 40 }, { "epoch": 0.008, "grad_norm": 148.52864887160544, "learning_rate": 
9.9216e-07, "logits/chosen": 0.36283570528030396, "logits/rejected": 0.503063976764679, "logps/chosen": -238.33749389648438, "logps/rejected": -177.14999389648438, "loss": 0.6157, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0076141357421875, "rewards/margins": 0.596942126750946, "rewards/rejected": -0.589202880859375, "step": 50 }, { "epoch": 0.0096, "grad_norm": 168.5513543402595, "learning_rate": 9.9056e-07, "logits/chosen": 0.3789520263671875, "logits/rejected": 0.44984132051467896, "logps/chosen": -250.8249969482422, "logps/rejected": -211.7624969482422, "loss": 0.629, "rewards/accuracies": 0.65625, "rewards/chosen": -0.006365966983139515, "rewards/margins": 0.5185302495956421, "rewards/rejected": -0.5244476199150085, "step": 60 }, { "epoch": 0.0112, "grad_norm": 150.8862983294824, "learning_rate": 9.8896e-07, "logits/chosen": 0.4671691954135895, "logits/rejected": 0.549670398235321, "logps/chosen": -247.8249969482422, "logps/rejected": -210.5625, "loss": 0.6409, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12908324599266052, "rewards/margins": 0.538433849811554, "rewards/rejected": -0.40939027070999146, "step": 70 }, { "epoch": 0.0128, "grad_norm": 118.61684329282609, "learning_rate": 9.8736e-07, "logits/chosen": 0.4459472596645355, "logits/rejected": 0.63922119140625, "logps/chosen": -205.1125030517578, "logps/rejected": -187.71249389648438, "loss": 0.5634, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.20466919243335724, "rewards/margins": 0.6068969964981079, "rewards/rejected": -0.40214842557907104, "step": 80 }, { "epoch": 0.0144, "grad_norm": 146.2143514929028, "learning_rate": 9.8576e-07, "logits/chosen": 0.49005126953125, "logits/rejected": 0.613391101360321, "logps/chosen": -218.1125030517578, "logps/rejected": -202.9375, "loss": 0.5311, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.35814207792282104, "rewards/margins": 0.7134033441543579, "rewards/rejected": -0.35429686307907104, "step": 90 
}, { "epoch": 0.016, "grad_norm": 166.75102323489787, "learning_rate": 9.8416e-07, "logits/chosen": null, "logits/rejected": 0.561798095703125, "logps/chosen": -218.71249389648438, "logps/rejected": -194.25, "loss": 2.3935, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4615600109100342, "rewards/margins": -1.267401099205017, "rewards/rejected": -0.192108154296875, "step": 100 }, { "epoch": 0.0176, "grad_norm": 182.62908352832616, "learning_rate": 9.825599999999999e-07, "logits/chosen": 0.4417968690395355, "logits/rejected": 0.5448364019393921, "logps/chosen": -237.5500030517578, "logps/rejected": -206.1687469482422, "loss": 0.5911, "rewards/accuracies": 0.59375, "rewards/chosen": 0.390280157327652, "rewards/margins": 0.706127941608429, "rewards/rejected": -0.31534117460250854, "step": 110 }, { "epoch": 0.0192, "grad_norm": 85.05821239457613, "learning_rate": 9.8096e-07, "logits/chosen": 0.21943359076976776, "logits/rejected": 0.4193311631679535, "logps/chosen": -252.8562469482422, "logps/rejected": -206.1999969482422, "loss": 0.6544, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4583801329135895, "rewards/margins": 0.822583019733429, "rewards/rejected": -0.36375731229782104, "step": 120 }, { "epoch": 0.0208, "grad_norm": 207.83840040486416, "learning_rate": 9.7936e-07, "logits/chosen": 0.25749510526657104, "logits/rejected": 0.38177186250686646, "logps/chosen": -250.9250030517578, "logps/rejected": -205.3312530517578, "loss": 0.564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.63616943359375, "rewards/margins": 0.929370105266571, "rewards/rejected": -0.292794793844223, "step": 130 }, { "epoch": 0.0224, "grad_norm": 165.96996290683802, "learning_rate": 9.7776e-07, "logits/chosen": 0.3760986328125, "logits/rejected": 0.6401122808456421, "logps/chosen": -250.3000030517578, "logps/rejected": -182.0124969482422, "loss": 0.5294, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.46071165800094604, 
"rewards/margins": 0.9586852788925171, "rewards/rejected": -0.4972900450229645, "step": 140 }, { "epoch": 0.024, "grad_norm": 155.77040357001442, "learning_rate": 9.7616e-07, "logits/chosen": 0.3559814393520355, "logits/rejected": 0.571789562702179, "logps/chosen": -238.2375030517578, "logps/rejected": -184.0500030517578, "loss": 0.6026, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5387939214706421, "rewards/margins": 0.84075927734375, "rewards/rejected": -0.3012146055698395, "step": 150 }, { "epoch": 0.0256, "grad_norm": 184.65288063637075, "learning_rate": 9.7456e-07, "logits/chosen": 0.361907958984375, "logits/rejected": 0.5497070550918579, "logps/chosen": -204.75, "logps/rejected": -184.8625030517578, "loss": 0.5414, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6497558355331421, "rewards/margins": 1.035803198814392, "rewards/rejected": -0.3850952088832855, "step": 160 }, { "epoch": 0.0272, "grad_norm": 171.64591410676627, "learning_rate": 9.7296e-07, "logits/chosen": 0.3606201112270355, "logits/rejected": 0.5216919183731079, "logps/chosen": -233.89999389648438, "logps/rejected": -182.27499389648438, "loss": 0.5575, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.6731017827987671, "rewards/margins": 0.99462890625, "rewards/rejected": -0.3209289610385895, "step": 170 }, { "epoch": 0.0288, "grad_norm": 174.88263981633418, "learning_rate": 9.713599999999999e-07, "logits/chosen": 0.3581909239292145, "logits/rejected": 0.4918579161167145, "logps/chosen": -247.77499389648438, "logps/rejected": -195.21249389648438, "loss": 0.5137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.601947009563446, "rewards/margins": 1.007849097251892, "rewards/rejected": -0.4052734375, "step": 180 }, { "epoch": 0.0304, "grad_norm": 146.706418983595, "learning_rate": 9.6976e-07, "logits/chosen": 0.39574891328811646, "logits/rejected": 0.438995361328125, "logps/chosen": -236.60000610351562, "logps/rejected": -204.58749389648438, 
"loss": 0.5093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.611157238483429, "rewards/margins": 1.084570288658142, "rewards/rejected": -0.47248536348342896, "step": 190 }, { "epoch": 0.032, "grad_norm": 213.9524684156536, "learning_rate": 9.6816e-07, "logits/chosen": 0.28535157442092896, "logits/rejected": 0.521746814250946, "logps/chosen": -232.75, "logps/rejected": -191.4250030517578, "loss": 0.5784, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6331421136856079, "rewards/margins": 1.012536644935608, "rewards/rejected": -0.37884521484375, "step": 200 }, { "epoch": 0.0336, "grad_norm": 105.4039682414964, "learning_rate": 9.6656e-07, "logits/chosen": 0.31889647245407104, "logits/rejected": 0.4892578125, "logps/chosen": -236.21249389648438, "logps/rejected": -191.21875, "loss": 0.5758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8638641238212585, "rewards/margins": 1.10736083984375, "rewards/rejected": -0.2421875, "step": 210 }, { "epoch": 0.0352, "grad_norm": 167.49696995219443, "learning_rate": 9.6496e-07, "logits/chosen": 0.47358399629592896, "logits/rejected": 0.651965320110321, "logps/chosen": -232.8312530517578, "logps/rejected": -191.52499389648438, "loss": 0.5456, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.0089232921600342, "rewards/margins": 1.06304931640625, "rewards/rejected": -0.05415649339556694, "step": 220 }, { "epoch": 0.0368, "grad_norm": 186.6323210731834, "learning_rate": 9.6336e-07, "logits/chosen": 0.4628662168979645, "logits/rejected": 0.5611327886581421, "logps/chosen": -248.72500610351562, "logps/rejected": -200.1374969482422, "loss": 0.6409, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9612945318222046, "rewards/margins": 0.944653332233429, "rewards/rejected": 0.018463134765625, "step": 230 }, { "epoch": 0.0384, "grad_norm": 135.38651511576737, "learning_rate": 9.6176e-07, "logits/chosen": 0.42094725370407104, "logits/rejected": 0.578808605670929, "logps/chosen": 
-234.125, "logps/rejected": -186.9499969482422, "loss": 0.6017, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.994213879108429, "rewards/margins": 1.0519530773162842, "rewards/rejected": -0.05690918117761612, "step": 240 }, { "epoch": 0.04, "grad_norm": 102.89918872057203, "learning_rate": 9.601599999999999e-07, "logits/chosen": 0.35648804903030396, "logits/rejected": 0.5980468988418579, "logps/chosen": -231.03125, "logps/rejected": -200.08749389648438, "loss": 0.5448, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1788451671600342, "rewards/margins": 1.1294677257537842, "rewards/rejected": 0.05012207105755806, "step": 250 }, { "epoch": 0.0416, "grad_norm": 179.76763890726144, "learning_rate": 9.5856e-07, "logits/chosen": 0.44647216796875, "logits/rejected": 0.626293957233429, "logps/chosen": -248.8874969482422, "logps/rejected": -210.6875, "loss": 0.6038, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.0896484851837158, "rewards/margins": 0.9274383783340454, "rewards/rejected": 0.16181030869483948, "step": 260 }, { "epoch": 0.0432, "grad_norm": 158.6704182877787, "learning_rate": 9.5696e-07, "logits/chosen": 0.5301269292831421, "logits/rejected": 0.71337890625, "logps/chosen": -221.4375, "logps/rejected": -184.35000610351562, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": 1.0899658203125, "rewards/margins": 1.310827612876892, "rewards/rejected": -0.22025756537914276, "step": 270 }, { "epoch": 0.0448, "grad_norm": 107.76249983844443, "learning_rate": 9.5536e-07, "logits/chosen": 0.4189407229423523, "logits/rejected": 0.49842530488967896, "logps/chosen": -200.5749969482422, "logps/rejected": -182.1750030517578, "loss": 0.6811, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.591906726360321, "rewards/margins": 0.8349609375, "rewards/rejected": -0.2432403564453125, "step": 280 }, { "epoch": 0.0464, "grad_norm": 143.45298659347284, "learning_rate": 9.5376e-07, "logits/chosen": 0.479562371969223, 
"logits/rejected": 0.542065441608429, "logps/chosen": -239.52499389648438, "logps/rejected": -198.375, "loss": 0.6021, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.638928234577179, "rewards/margins": 0.9524170160293579, "rewards/rejected": -0.3132690489292145, "step": 290 }, { "epoch": 0.048, "grad_norm": 142.4942642293823, "learning_rate": 9.521599999999999e-07, "logits/chosen": 0.651074230670929, "logits/rejected": 0.792895495891571, "logps/chosen": -246.41250610351562, "logps/rejected": -189.4499969482422, "loss": 0.5631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.484414666891098, "rewards/margins": 1.043981909751892, "rewards/rejected": -0.5590454339981079, "step": 300 }, { "epoch": 0.0496, "grad_norm": 97.12525060196589, "learning_rate": 9.505599999999999e-07, "logits/chosen": 0.47523194551467896, "logits/rejected": 0.597582995891571, "logps/chosen": -260.5874938964844, "logps/rejected": -211.10000610351562, "loss": 0.5024, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.7660278081893921, "rewards/margins": 1.232446312904358, "rewards/rejected": -0.465972900390625, "step": 310 }, { "epoch": 0.0512, "grad_norm": 84.54019028280128, "learning_rate": 9.4896e-07, "logits/chosen": 0.4879516661167145, "logits/rejected": 0.562176525592804, "logps/chosen": -225.39999389648438, "logps/rejected": -218.64999389648438, "loss": 0.5291, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.929309070110321, "rewards/margins": 1.429443359375, "rewards/rejected": -0.4997802674770355, "step": 320 }, { "epoch": 0.0528, "grad_norm": 96.01963205383929, "learning_rate": 9.4736e-07, "logits/chosen": 0.5347534418106079, "logits/rejected": 0.767102062702179, "logps/chosen": -223.4499969482422, "logps/rejected": -192.5500030517578, "loss": 0.5327, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7455886602401733, "rewards/margins": 1.194067358970642, "rewards/rejected": -0.4479614198207855, "step": 330 }, { "epoch": 
0.0544, "grad_norm": 77.11780067800544, "learning_rate": 9.4576e-07, "logits/chosen": 0.5583511590957642, "logits/rejected": 0.6645233035087585, "logps/chosen": -248.08749389648438, "logps/rejected": -207.46875, "loss": 0.4887, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.5950988531112671, "rewards/margins": 1.3935668468475342, "rewards/rejected": -0.7992187738418579, "step": 340 }, { "epoch": 0.056, "grad_norm": 93.6401267342267, "learning_rate": 9.4416e-07, "logits/chosen": 0.58123779296875, "logits/rejected": 0.8092895746231079, "logps/chosen": -228.60000610351562, "logps/rejected": -221.1875, "loss": 0.5415, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.637072741985321, "rewards/margins": 1.106530785560608, "rewards/rejected": -0.4685913026332855, "step": 350 }, { "epoch": 0.0576, "grad_norm": 120.24218231073242, "learning_rate": 9.425599999999999e-07, "logits/chosen": 0.5223144292831421, "logits/rejected": 0.691784679889679, "logps/chosen": -262.92498779296875, "logps/rejected": -201.375, "loss": 0.4714, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.10186767578125, "rewards/margins": 1.6827881336212158, "rewards/rejected": -0.5799560546875, "step": 360 }, { "epoch": 0.0592, "grad_norm": 93.20561279822927, "learning_rate": 9.409599999999999e-07, "logits/chosen": 0.56829833984375, "logits/rejected": 0.5872802734375, "logps/chosen": -243.3874969482422, "logps/rejected": -217.5625, "loss": 0.584, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.0656859874725342, "rewards/margins": 1.1296875476837158, "rewards/rejected": -0.06270752102136612, "step": 370 }, { "epoch": 0.0608, "grad_norm": 220.04310755329988, "learning_rate": 9.393599999999999e-07, "logits/chosen": 0.5386688113212585, "logits/rejected": 0.604693591594696, "logps/chosen": -246.83749389648438, "logps/rejected": -210.66250610351562, "loss": 0.6331, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2078857421875, "rewards/margins": 
1.1964111328125, "rewards/rejected": -0.9888671636581421, "step": 380 }, { "epoch": 0.0624, "grad_norm": 153.71388777676697, "learning_rate": 9.3776e-07, "logits/chosen": 0.5551391839981079, "logits/rejected": 0.696215808391571, "logps/chosen": -224.8249969482422, "logps/rejected": -190.6875, "loss": 0.6437, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6016479730606079, "rewards/margins": 0.888671875, "rewards/rejected": -1.4897949695587158, "step": 390 }, { "epoch": 0.064, "grad_norm": 164.83305154590442, "learning_rate": 9.3616e-07, "logits/chosen": 0.599139392375946, "logits/rejected": 0.693896472454071, "logps/chosen": -248.0124969482422, "logps/rejected": -208.7687530517578, "loss": 0.516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7664337158203125, "rewards/margins": 1.3621826171875, "rewards/rejected": -0.5948120355606079, "step": 400 }, { "epoch": 0.0656, "grad_norm": 140.27477345376516, "learning_rate": 9.345599999999999e-07, "logits/chosen": 0.5668700933456421, "logits/rejected": 0.5928955078125, "logps/chosen": -232.0625, "logps/rejected": -188.28125, "loss": 0.5758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.029077172279358, "rewards/margins": 1.341821312904358, "rewards/rejected": -0.3129821717739105, "step": 410 }, { "epoch": 0.0672, "grad_norm": 100.50562469423578, "learning_rate": 9.3296e-07, "logits/chosen": 0.412200927734375, "logits/rejected": 0.569744884967804, "logps/chosen": -229.81875610351562, "logps/rejected": -201.2624969482422, "loss": 0.5084, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.9132324457168579, "rewards/margins": 1.3827636241912842, "rewards/rejected": -0.4690918028354645, "step": 420 }, { "epoch": 0.0688, "grad_norm": 188.88981052102017, "learning_rate": 9.313599999999999e-07, "logits/chosen": 0.617047131061554, "logits/rejected": 0.7023071050643921, "logps/chosen": -232.97500610351562, "logps/rejected": -210.2062530517578, "loss": 0.813, "rewards/accuracies": 
0.6937500238418579, "rewards/chosen": 0.011187744326889515, "rewards/margins": 0.7536865472793579, "rewards/rejected": -0.741137683391571, "step": 430 }, { "epoch": 0.0704, "grad_norm": 176.12435948391087, "learning_rate": 9.2976e-07, "logits/chosen": 0.658935546875, "logits/rejected": 0.7484985589981079, "logps/chosen": -214.39999389648438, "logps/rejected": -195.77499389648438, "loss": 0.6008, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4151855409145355, "rewards/margins": 0.9918457269668579, "rewards/rejected": -0.575482189655304, "step": 440 }, { "epoch": 0.072, "grad_norm": 113.65125304838726, "learning_rate": 9.281599999999999e-07, "logits/chosen": 0.47617799043655396, "logits/rejected": 0.6357421875, "logps/chosen": -217.46249389648438, "logps/rejected": -194.71249389648438, "loss": 0.677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.81634521484375, "rewards/margins": 1.197778344154358, "rewards/rejected": -0.38068848848342896, "step": 450 }, { "epoch": 0.0736, "grad_norm": 170.51462846152094, "learning_rate": 9.2656e-07, "logits/chosen": 0.38282471895217896, "logits/rejected": 0.5413573980331421, "logps/chosen": -217.21875, "logps/rejected": -202.64999389648438, "loss": 0.5367, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9895523190498352, "rewards/margins": 1.419976830482483, "rewards/rejected": -0.42982178926467896, "step": 460 }, { "epoch": 0.0752, "grad_norm": 83.30415570067892, "learning_rate": 9.2496e-07, "logits/chosen": 0.516162097454071, "logits/rejected": 0.7424072027206421, "logps/chosen": -210.91250610351562, "logps/rejected": -186.0749969482422, "loss": 0.5315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.657031238079071, "rewards/margins": 1.5942871570587158, "rewards/rejected": -0.9378722906112671, "step": 470 }, { "epoch": 0.0768, "grad_norm": 164.18122944417445, "learning_rate": 9.233599999999999e-07, "logits/chosen": 0.39228516817092896, "logits/rejected": 
0.6114746332168579, "logps/chosen": -229.59375, "logps/rejected": -175.7937469482422, "loss": 0.5856, "rewards/accuracies": 0.71875, "rewards/chosen": 0.847991943359375, "rewards/margins": 1.454003930091858, "rewards/rejected": -0.605908215045929, "step": 480 }, { "epoch": 0.0784, "grad_norm": 153.76935520907054, "learning_rate": 9.2176e-07, "logits/chosen": 0.4461669921875, "logits/rejected": 0.66845703125, "logps/chosen": -210.0500030517578, "logps/rejected": -196.72500610351562, "loss": 0.574, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.190460205078125, "rewards/margins": 1.479821801185608, "rewards/rejected": -0.2884887754917145, "step": 490 }, { "epoch": 0.08, "grad_norm": 69.17077946719137, "learning_rate": 9.2016e-07, "logits/chosen": 0.6010681390762329, "logits/rejected": 0.613201916217804, "logps/chosen": -218.47500610351562, "logps/rejected": -192.16250610351562, "loss": 0.598, "rewards/accuracies": 0.6875, "rewards/chosen": 1.64544677734375, "rewards/margins": 1.0168883800506592, "rewards/rejected": 0.629895031452179, "step": 500 }, { "epoch": 0.0816, "grad_norm": 153.36149168542698, "learning_rate": 9.1856e-07, "logits/chosen": 0.620227038860321, "logits/rejected": 0.6868652105331421, "logps/chosen": -207.0625, "logps/rejected": -181.66250610351562, "loss": 0.5993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.242895483970642, "rewards/margins": 1.214086890220642, "rewards/rejected": 0.02808837965130806, "step": 510 }, { "epoch": 0.0832, "grad_norm": 160.55985347142067, "learning_rate": 9.169599999999999e-07, "logits/chosen": 0.5306396484375, "logits/rejected": 0.6199951171875, "logps/chosen": -255.2937469482422, "logps/rejected": -212.3125, "loss": 0.6673, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.3274657726287842, "rewards/margins": 1.2119140625, "rewards/rejected": 0.11618652194738388, "step": 520 }, { "epoch": 0.0848, "grad_norm": 158.19809328321486, "learning_rate": 9.153599999999999e-07, 
"logits/chosen": 0.6107422113418579, "logits/rejected": 0.6884765625, "logps/chosen": -242.0, "logps/rejected": -218.6374969482422, "loss": 0.5668, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9986206293106079, "rewards/margins": 1.4765441417694092, "rewards/rejected": -0.47684937715530396, "step": 530 }, { "epoch": 0.0864, "grad_norm": 101.75003119652608, "learning_rate": 9.1376e-07, "logits/chosen": null, "logits/rejected": 0.7379394769668579, "logps/chosen": -237.02499389648438, "logps/rejected": -205.1125030517578, "loss": 0.7483, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.521221935749054, "rewards/margins": 0.9746338129043579, "rewards/rejected": -0.45225220918655396, "step": 540 }, { "epoch": 0.088, "grad_norm": 192.78145373461356, "learning_rate": 9.121599999999999e-07, "logits/chosen": 0.48966675996780396, "logits/rejected": 0.6687866449356079, "logps/chosen": -229.3000030517578, "logps/rejected": -182.84375, "loss": 0.5618, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.8734375238418579, "rewards/margins": 1.429443359375, "rewards/rejected": -0.5543457269668579, "step": 550 }, { "epoch": 0.0896, "grad_norm": 185.41776475061948, "learning_rate": 9.1056e-07, "logits/chosen": 0.5898407101631165, "logits/rejected": 0.702014148235321, "logps/chosen": -269.0375061035156, "logps/rejected": -250.9499969482422, "loss": 0.624, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.790844738483429, "rewards/margins": 1.3859374523162842, "rewards/rejected": -0.594103991985321, "step": 560 }, { "epoch": 0.0912, "grad_norm": 158.60838941811423, "learning_rate": 9.0896e-07, "logits/chosen": 0.5207931399345398, "logits/rejected": 0.6006835699081421, "logps/chosen": -240.875, "logps/rejected": -202.25, "loss": 0.5966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6866455078125, "rewards/margins": 1.5197875499725342, "rewards/rejected": -0.833264172077179, "step": 570 }, { "epoch": 0.0928, "grad_norm": 
107.10432515090957, "learning_rate": 9.0736e-07, "logits/chosen": 0.6126464605331421, "logits/rejected": 0.750012218952179, "logps/chosen": -261.7749938964844, "logps/rejected": -203.625, "loss": 0.5943, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8048736453056335, "rewards/margins": 1.493841528892517, "rewards/rejected": -0.6888183355331421, "step": 580 }, { "epoch": 0.0944, "grad_norm": 113.76654299254619, "learning_rate": 9.057599999999999e-07, "logits/chosen": 0.702807605266571, "logits/rejected": 0.796191394329071, "logps/chosen": -217.1125030517578, "logps/rejected": -179.9562530517578, "loss": 0.5994, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.0714600086212158, "rewards/margins": 1.6635010242462158, "rewards/rejected": -0.58984375, "step": 590 }, { "epoch": 0.096, "grad_norm": 181.81120309318217, "learning_rate": 9.041599999999999e-07, "logits/chosen": 0.5701324343681335, "logits/rejected": 0.7892822027206421, "logps/chosen": -214.64999389648438, "logps/rejected": -195.1374969482422, "loss": 0.7064, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0507080554962158, "rewards/margins": 1.3180968761444092, "rewards/rejected": -0.26771241426467896, "step": 600 }, { "epoch": 0.0976, "grad_norm": 156.00228395828861, "learning_rate": 9.0256e-07, "logits/chosen": 0.591906726360321, "logits/rejected": 0.7104736566543579, "logps/chosen": -253.5625, "logps/rejected": -205.125, "loss": 0.6561, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5225127935409546, "rewards/margins": 1.3517334461212158, "rewards/rejected": -0.828533947467804, "step": 610 }, { "epoch": 0.0992, "grad_norm": 170.10297111727013, "learning_rate": 9.0096e-07, "logits/chosen": 0.6283203363418579, "logits/rejected": 0.7509216070175171, "logps/chosen": -246.2624969482422, "logps/rejected": -214.1125030517578, "loss": 0.6152, "rewards/accuracies": 0.71875, "rewards/chosen": 0.25297850370407104, "rewards/margins": 1.396240234375, "rewards/rejected": -1.1436035633087158, 
"step": 620 }, { "epoch": 0.1008, "grad_norm": 118.66533522835886, "learning_rate": 8.9936e-07, "logits/chosen": null, "logits/rejected": 0.721606433391571, "logps/chosen": -231.5, "logps/rejected": -205.8625030517578, "loss": 0.4592, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.30303955078125, "rewards/margins": 1.613012671470642, "rewards/rejected": -1.914575219154358, "step": 630 }, { "epoch": 0.1024, "grad_norm": 192.83031904161524, "learning_rate": 8.9776e-07, "logits/chosen": 0.650177001953125, "logits/rejected": 0.820727527141571, "logps/chosen": -222.72500610351562, "logps/rejected": -222.03750610351562, "loss": 0.5942, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.92156982421875, "rewards/margins": 1.3645508289337158, "rewards/rejected": -2.2868409156799316, "step": 640 }, { "epoch": 0.104, "grad_norm": 139.10523387528514, "learning_rate": 8.961599999999999e-07, "logits/chosen": 0.55841064453125, "logits/rejected": 0.655139148235321, "logps/chosen": -237.75, "logps/rejected": -189.0, "loss": 0.6556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8970702886581421, "rewards/margins": 1.481530785560608, "rewards/rejected": -2.3784422874450684, "step": 650 }, { "epoch": 0.1056, "grad_norm": 149.70156672396524, "learning_rate": 8.945599999999999e-07, "logits/chosen": 0.5958496332168579, "logits/rejected": 0.919238269329071, "logps/chosen": -213.97500610351562, "logps/rejected": -200.97500610351562, "loss": 0.6233, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.39631348848342896, "rewards/margins": 1.480859398841858, "rewards/rejected": -1.876672387123108, "step": 660 }, { "epoch": 0.1072, "grad_norm": 259.21827982746464, "learning_rate": 8.929599999999999e-07, "logits/chosen": 0.60931396484375, "logits/rejected": 0.724578857421875, "logps/chosen": -244.2375030517578, "logps/rejected": -203.10000610351562, "loss": 0.7485, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08792724460363388, 
"rewards/margins": 1.45562744140625, "rewards/rejected": -1.368749976158142, "step": 670 }, { "epoch": 0.1088, "grad_norm": 53.139418528733586, "learning_rate": 8.9136e-07, "logits/chosen": 0.46925050020217896, "logits/rejected": 0.670977771282196, "logps/chosen": -281.17498779296875, "logps/rejected": -215.78750610351562, "loss": 0.6995, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.12225341796875, "rewards/margins": 1.174902319908142, "rewards/rejected": -1.0519530773162842, "step": 680 }, { "epoch": 0.1104, "grad_norm": 139.75086836723196, "learning_rate": 8.8976e-07, "logits/chosen": 0.6266266107559204, "logits/rejected": 0.733776867389679, "logps/chosen": -266.2749938964844, "logps/rejected": -198.1374969482422, "loss": 0.7087, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.833740234375, "rewards/margins": 1.559179663658142, "rewards/rejected": 0.2745605409145355, "step": 690 }, { "epoch": 0.112, "grad_norm": 100.71935082994477, "learning_rate": 8.881599999999999e-07, "logits/chosen": 0.520886242389679, "logits/rejected": 0.652819812297821, "logps/chosen": -247.77499389648438, "logps/rejected": -217.27499389648438, "loss": 0.698, "rewards/accuracies": 0.75, "rewards/chosen": 3.0052733421325684, "rewards/margins": 1.97900390625, "rewards/rejected": 1.0280883312225342, "step": 700 }, { "epoch": 0.1136, "grad_norm": 164.34654078944274, "learning_rate": 8.8656e-07, "logits/chosen": 0.6051269769668579, "logits/rejected": 0.628857433795929, "logps/chosen": -230.9875030517578, "logps/rejected": -193.78750610351562, "loss": 0.6858, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 3.296875, "rewards/margins": 1.0980713367462158, "rewards/rejected": 2.1981444358825684, "step": 710 }, { "epoch": 0.1152, "grad_norm": 213.9803580926428, "learning_rate": 8.8496e-07, "logits/chosen": 0.5404297113418579, "logits/rejected": 0.69940185546875, "logps/chosen": -209.8874969482422, "logps/rejected": -186.5437469482422, "loss": 0.7257, 
"rewards/accuracies": 0.6875, "rewards/chosen": 2.690051317214966, "rewards/margins": 1.4174072742462158, "rewards/rejected": 1.2728271484375, "step": 720 }, { "epoch": 0.1168, "grad_norm": 117.49949440550603, "learning_rate": 8.8336e-07, "logits/chosen": 0.5959228277206421, "logits/rejected": 0.8413330316543579, "logps/chosen": -211.0749969482422, "logps/rejected": -160.8874969482422, "loss": 0.5425, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 2.2091307640075684, "rewards/margins": 1.7058837413787842, "rewards/rejected": 0.5053039789199829, "step": 730 }, { "epoch": 0.1184, "grad_norm": 134.9554022539648, "learning_rate": 8.817599999999999e-07, "logits/chosen": 0.4033203125, "logits/rejected": 0.562853991985321, "logps/chosen": -257.95001220703125, "logps/rejected": -228.8000030517578, "loss": 0.6105, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.875292956829071, "rewards/margins": 1.619897484779358, "rewards/rejected": -0.7439209222793579, "step": 740 }, { "epoch": 0.12, "grad_norm": 127.0903032849387, "learning_rate": 8.8016e-07, "logits/chosen": 0.7146850824356079, "logits/rejected": 0.831524670124054, "logps/chosen": -211.5500030517578, "logps/rejected": -197.08749389648438, "loss": 0.5301, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.03471069410443306, "rewards/margins": 1.635498046875, "rewards/rejected": -1.669580101966858, "step": 750 }, { "epoch": 0.1216, "grad_norm": 61.376499890516335, "learning_rate": 8.7856e-07, "logits/chosen": 0.5489867925643921, "logits/rejected": 0.69769287109375, "logps/chosen": -265.875, "logps/rejected": -214.60000610351562, "loss": 0.5099, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18981322646141052, "rewards/margins": 1.97900390625, "rewards/rejected": -1.789086937904358, "step": 760 }, { "epoch": 0.1232, "grad_norm": 98.15226771118168, "learning_rate": 8.769599999999999e-07, "logits/chosen": 0.653149425983429, "logits/rejected": 0.7932159304618835, 
"logps/chosen": -221.0625, "logps/rejected": -201.52499389648438, "loss": 0.6054, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32438355684280396, "rewards/margins": 1.44525146484375, "rewards/rejected": -1.7690918445587158, "step": 770 }, { "epoch": 0.1248, "grad_norm": 76.04528024059353, "learning_rate": 8.7536e-07, "logits/chosen": 0.673754870891571, "logits/rejected": 0.8952392339706421, "logps/chosen": -259.95001220703125, "logps/rejected": -206.6750030517578, "loss": 0.6613, "rewards/accuracies": 0.75, "rewards/chosen": -0.02675781212747097, "rewards/margins": 1.7832520008087158, "rewards/rejected": -1.8090941905975342, "step": 780 }, { "epoch": 0.1264, "grad_norm": 167.8052421577353, "learning_rate": 8.7376e-07, "logits/chosen": 0.630688488483429, "logits/rejected": 0.774365246295929, "logps/chosen": -220.0749969482422, "logps/rejected": -207.8625030517578, "loss": 0.7177, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17859192192554474, "rewards/margins": 1.5286986827850342, "rewards/rejected": -1.349755883216858, "step": 790 }, { "epoch": 0.128, "grad_norm": 148.90117672723144, "learning_rate": 8.7216e-07, "logits/chosen": 0.6983276605606079, "logits/rejected": 0.767504870891571, "logps/chosen": -251.14999389648438, "logps/rejected": -192.4499969482422, "loss": 0.5486, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.07979736477136612, "rewards/margins": 1.6415526866912842, "rewards/rejected": -1.562139868736267, "step": 800 }, { "epoch": 0.1296, "grad_norm": 116.97445825639211, "learning_rate": 8.705599999999999e-07, "logits/chosen": 0.5964599847793579, "logits/rejected": 0.667004406452179, "logps/chosen": -229.75, "logps/rejected": -199.96249389648438, "loss": 0.6088, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3601928651332855, "rewards/margins": 1.6355102062225342, "rewards/rejected": -1.273901343345642, "step": 810 }, { "epoch": 0.1312, "grad_norm": 156.16982039252252, "learning_rate": 
8.689599999999999e-07, "logits/chosen": 0.565234363079071, "logits/rejected": 0.746228039264679, "logps/chosen": -228.35000610351562, "logps/rejected": -177.6999969482422, "loss": 0.5814, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.30317384004592896, "rewards/margins": 1.5808594226837158, "rewards/rejected": -1.2781982421875, "step": 820 }, { "epoch": 0.1328, "grad_norm": 275.54517951292905, "learning_rate": 8.6736e-07, "logits/chosen": 0.4869018495082855, "logits/rejected": 0.5818847417831421, "logps/chosen": -237.96249389648438, "logps/rejected": -208.9875030517578, "loss": 0.7748, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6251220703125, "rewards/margins": 1.017553687095642, "rewards/rejected": -1.6412353515625, "step": 830 }, { "epoch": 0.1344, "grad_norm": 201.91743120183784, "learning_rate": 8.657599999999999e-07, "logits/chosen": 0.49724119901657104, "logits/rejected": 0.678375244140625, "logps/chosen": -260.04998779296875, "logps/rejected": -209.8249969482422, "loss": 0.5836, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.07681884616613388, "rewards/margins": 1.828149437904358, "rewards/rejected": -1.905908226966858, "step": 840 }, { "epoch": 0.136, "grad_norm": 182.30632981903764, "learning_rate": 8.6416e-07, "logits/chosen": 0.6130126714706421, "logits/rejected": 0.7337890863418579, "logps/chosen": -221.72500610351562, "logps/rejected": -217.5812530517578, "loss": 0.659, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.5157470703125, "rewards/margins": 1.7698242664337158, "rewards/rejected": -1.2529296875, "step": 850 }, { "epoch": 0.1376, "grad_norm": 132.1614949362825, "learning_rate": 8.6256e-07, "logits/chosen": 0.584716796875, "logits/rejected": 0.748242199420929, "logps/chosen": -230.10000610351562, "logps/rejected": -209.6750030517578, "loss": 0.6764, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.27354127168655396, "rewards/margins": 1.34686279296875, 
"rewards/rejected": -1.073144555091858, "step": 860 }, { "epoch": 0.1392, "grad_norm": 99.36978387109168, "learning_rate": 8.6096e-07, "logits/chosen": 0.6260741949081421, "logits/rejected": 0.662109375, "logps/chosen": -280.8500061035156, "logps/rejected": -215.89999389648438, "loss": 0.521, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.61883544921875, "rewards/margins": 1.927392601966858, "rewards/rejected": -1.307397484779358, "step": 870 }, { "epoch": 0.1408, "grad_norm": 143.30028269130165, "learning_rate": 8.593599999999999e-07, "logits/chosen": 0.5112365484237671, "logits/rejected": 0.7010864019393921, "logps/chosen": -229.61874389648438, "logps/rejected": -189.0625, "loss": 0.5808, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.215087890625, "rewards/margins": 1.848388671875, "rewards/rejected": -1.6343262195587158, "step": 880 }, { "epoch": 0.1424, "grad_norm": 99.87627926547164, "learning_rate": 8.577599999999999e-07, "logits/chosen": 0.6656128168106079, "logits/rejected": 0.7993408441543579, "logps/chosen": -230.5, "logps/rejected": -183.5, "loss": 0.5814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6577209234237671, "rewards/margins": 1.8166992664337158, "rewards/rejected": -2.473339796066284, "step": 890 }, { "epoch": 0.144, "grad_norm": 70.90598723367252, "learning_rate": 8.5616e-07, "logits/chosen": 0.6458495855331421, "logits/rejected": 0.7682861089706421, "logps/chosen": -236.8625030517578, "logps/rejected": -208.97500610351562, "loss": 0.6384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21116943657398224, "rewards/margins": 1.6908447742462158, "rewards/rejected": -1.90167236328125, "step": 900 }, { "epoch": 0.1456, "grad_norm": 207.51596460839605, "learning_rate": 8.5456e-07, "logits/chosen": 0.531750500202179, "logits/rejected": null, "logps/chosen": -228.96875, "logps/rejected": -194.00625610351562, "loss": 0.9859, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 
-0.6894592046737671, "rewards/margins": 0.917919933795929, "rewards/rejected": -1.6078369617462158, "step": 910 }, { "epoch": 0.1472, "grad_norm": 162.01810612157482, "learning_rate": 8.5296e-07, "logits/chosen": 0.4954467713832855, "logits/rejected": 0.605175793170929, "logps/chosen": -234.0437469482422, "logps/rejected": -214.58749389648438, "loss": 0.7228, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.51507568359375, "rewards/margins": 1.352685570716858, "rewards/rejected": -1.8689758777618408, "step": 920 }, { "epoch": 0.1488, "grad_norm": 120.87988527253805, "learning_rate": 8.5136e-07, "logits/chosen": 0.42475587129592896, "logits/rejected": 0.51239013671875, "logps/chosen": -248.6437530517578, "logps/rejected": -209.1875, "loss": 0.7019, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.474945068359375, "rewards/margins": 1.7710449695587158, "rewards/rejected": -2.243725538253784, "step": 930 }, { "epoch": 0.1504, "grad_norm": 170.76548834666255, "learning_rate": 8.4976e-07, "logits/chosen": 0.5953003168106079, "logits/rejected": 0.6329345703125, "logps/chosen": -236.47500610351562, "logps/rejected": -216.10000610351562, "loss": 0.6148, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.56695556640625, "rewards/margins": 1.7373535633087158, "rewards/rejected": -2.3067383766174316, "step": 940 }, { "epoch": 0.152, "grad_norm": 118.40100982328019, "learning_rate": 8.481599999999999e-07, "logits/chosen": 0.596508800983429, "logits/rejected": 0.785693347454071, "logps/chosen": -218.7062530517578, "logps/rejected": -173.1125030517578, "loss": 0.4696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02598876878619194, "rewards/margins": 2.123364210128784, "rewards/rejected": -2.09814453125, "step": 950 }, { "epoch": 0.1536, "grad_norm": 261.17190886580204, "learning_rate": 8.465599999999999e-07, "logits/chosen": 0.44407957792282104, "logits/rejected": 0.668597400188446, "logps/chosen": -236.83749389648438, 
"logps/rejected": -212.3125, "loss": 0.7287, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6480163335800171, "rewards/margins": 1.688745141029358, "rewards/rejected": -1.0386962890625, "step": 960 }, { "epoch": 0.1552, "grad_norm": 172.2738448550876, "learning_rate": 8.4496e-07, "logits/chosen": 0.46885985136032104, "logits/rejected": 0.684277355670929, "logps/chosen": -235.25625610351562, "logps/rejected": -215.2937469482422, "loss": 0.4478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1743285655975342, "rewards/margins": 2.3404297828674316, "rewards/rejected": -1.1671142578125, "step": 970 }, { "epoch": 0.1568, "grad_norm": 128.2052978337301, "learning_rate": 8.4336e-07, "logits/chosen": 0.42870789766311646, "logits/rejected": 0.601635754108429, "logps/chosen": -222.8125, "logps/rejected": -184.22500610351562, "loss": 0.6399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9034668207168579, "rewards/margins": 1.605017066001892, "rewards/rejected": -0.70135498046875, "step": 980 }, { "epoch": 0.1584, "grad_norm": 174.52993401194558, "learning_rate": 8.417599999999999e-07, "logits/chosen": 0.53924560546875, "logits/rejected": 0.71917724609375, "logps/chosen": -230.0, "logps/rejected": -201.0, "loss": 0.6729, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8151611089706421, "rewards/margins": 1.8289794921875, "rewards/rejected": -2.64385986328125, "step": 990 }, { "epoch": 0.16, "grad_norm": 216.75742573999193, "learning_rate": 8.4016e-07, "logits/chosen": 0.4297851622104645, "logits/rejected": 0.538958728313446, "logps/chosen": -262.5874938964844, "logps/rejected": -229.91250610351562, "loss": 0.6837, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7505553960800171, "rewards/margins": 1.7178223133087158, "rewards/rejected": -2.4684815406799316, "step": 1000 }, { "epoch": 0.1616, "grad_norm": 163.72341770056494, "learning_rate": 8.3856e-07, "logits/chosen": 0.40788573026657104, 
"logits/rejected": 0.596600353717804, "logps/chosen": -256.5874938964844, "logps/rejected": -227.2375030517578, "loss": 0.53, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.11033935844898224, "rewards/margins": 2.212158203125, "rewards/rejected": -2.10205078125, "step": 1010 }, { "epoch": 0.1632, "grad_norm": 72.27724939242721, "learning_rate": 8.3696e-07, "logits/chosen": 0.38930052518844604, "logits/rejected": 0.660205066204071, "logps/chosen": -247.46249389648438, "logps/rejected": -229.6750030517578, "loss": 0.6349, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01606445387005806, "rewards/margins": 1.9524657726287842, "rewards/rejected": -1.9670898914337158, "step": 1020 }, { "epoch": 0.1648, "grad_norm": 166.92104816875317, "learning_rate": 8.353599999999999e-07, "logits/chosen": 0.4688964784145355, "logits/rejected": 0.693896472454071, "logps/chosen": -245.6999969482422, "logps/rejected": -230.4375, "loss": 0.5338, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.70458984375, "rewards/margins": 2.319091796875, "rewards/rejected": -1.615930199623108, "step": 1030 }, { "epoch": 0.1664, "grad_norm": 187.46017198511086, "learning_rate": 8.337599999999999e-07, "logits/chosen": 0.4047790467739105, "logits/rejected": 0.5993286371231079, "logps/chosen": -232.14999389648438, "logps/rejected": -208.0749969482422, "loss": 0.6713, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.28718262910842896, "rewards/margins": 2.008349657058716, "rewards/rejected": -1.7207520008087158, "step": 1040 }, { "epoch": 0.168, "grad_norm": 61.63947637071314, "learning_rate": 8.3216e-07, "logits/chosen": 0.570727527141571, "logits/rejected": 0.774340808391571, "logps/chosen": -277.7124938964844, "logps/rejected": -220.9499969482422, "loss": 0.645, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.0008422851678915322, "rewards/margins": 2.293164014816284, "rewards/rejected": -2.2911620140075684, "step": 1050 }, { "epoch": 
0.1696, "grad_norm": 123.19474363690848, "learning_rate": 8.305599999999999e-07, "logits/chosen": 0.61175537109375, "logits/rejected": 0.831372082233429, "logps/chosen": -248.7624969482422, "logps/rejected": -219.8125, "loss": 0.6467, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3232421875, "rewards/margins": 2.163403272628784, "rewards/rejected": -1.837792992591858, "step": 1060 }, { "epoch": 0.1712, "grad_norm": 79.24712672479099, "learning_rate": 8.2896e-07, "logits/chosen": 0.5492309331893921, "logits/rejected": 0.689379870891571, "logps/chosen": -234.8874969482422, "logps/rejected": -217.22500610351562, "loss": 0.6371, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.030548095703125, "rewards/margins": 2.2544922828674316, "rewards/rejected": -1.222192406654358, "step": 1070 }, { "epoch": 0.1728, "grad_norm": 99.65472555295491, "learning_rate": 8.2736e-07, "logits/chosen": 0.587750256061554, "logits/rejected": 0.729296863079071, "logps/chosen": -247.5625, "logps/rejected": -205.77499389648438, "loss": 0.5638, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.050244092941284, "rewards/margins": 2.3236327171325684, "rewards/rejected": -0.27055662870407104, "step": 1080 }, { "epoch": 0.1744, "grad_norm": 102.24554283941872, "learning_rate": 8.257600000000001e-07, "logits/chosen": null, "logits/rejected": 0.751696765422821, "logps/chosen": -235.1999969482422, "logps/rejected": -186.3000030517578, "loss": 0.5997, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.2540526390075684, "rewards/margins": 2.3299560546875, "rewards/rejected": -0.07541503757238388, "step": 1090 }, { "epoch": 0.176, "grad_norm": 137.36403915650212, "learning_rate": 8.241599999999999e-07, "logits/chosen": 0.6454712152481079, "logits/rejected": 0.79541015625, "logps/chosen": -240.4875030517578, "logps/rejected": -176.3125, "loss": 0.5881, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.5836913585662842, "rewards/margins": 
1.769287109375, "rewards/rejected": -0.18602295219898224, "step": 1100 }, { "epoch": 0.1776, "grad_norm": 196.74431190666138, "learning_rate": 8.225599999999999e-07, "logits/chosen": 0.600170910358429, "logits/rejected": 0.760546863079071, "logps/chosen": -228.6999969482422, "logps/rejected": -190.6999969482422, "loss": 0.6687, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1934082508087158, "rewards/margins": 1.6085205078125, "rewards/rejected": -0.41435545682907104, "step": 1110 }, { "epoch": 0.1792, "grad_norm": 201.46928121863493, "learning_rate": 8.2096e-07, "logits/chosen": 0.47845458984375, "logits/rejected": 0.657482922077179, "logps/chosen": -253.125, "logps/rejected": -216.96249389648438, "loss": 0.756, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7223266363143921, "rewards/margins": 1.937524437904358, "rewards/rejected": -1.2147216796875, "step": 1120 }, { "epoch": 0.1808, "grad_norm": 133.59213468222563, "learning_rate": 8.193599999999999e-07, "logits/chosen": 0.5575317144393921, "logits/rejected": 0.6769317388534546, "logps/chosen": -240.77499389648438, "logps/rejected": -207.0749969482422, "loss": 0.7023, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2568115293979645, "rewards/margins": 1.6218750476837158, "rewards/rejected": -1.8782958984375, "step": 1130 }, { "epoch": 0.1824, "grad_norm": 89.42901886452964, "learning_rate": 8.1776e-07, "logits/chosen": 0.590606689453125, "logits/rejected": 0.780224621295929, "logps/chosen": -233.5124969482422, "logps/rejected": -213.6750030517578, "loss": 0.777, "rewards/accuracies": 0.75, "rewards/chosen": -0.43272703886032104, "rewards/margins": 1.772863745689392, "rewards/rejected": -2.2056884765625, "step": 1140 }, { "epoch": 0.184, "grad_norm": 205.15400566164502, "learning_rate": 8.1616e-07, "logits/chosen": 0.561206042766571, "logits/rejected": 0.717968761920929, "logps/chosen": -244.6875, "logps/rejected": -216.875, "loss": 0.6696, "rewards/accuracies": 
0.6625000238418579, "rewards/chosen": 0.11097411811351776, "rewards/margins": 1.946557641029358, "rewards/rejected": -1.836889624595642, "step": 1150 }, { "epoch": 0.1856, "grad_norm": 181.60801822576326, "learning_rate": 8.1456e-07, "logits/chosen": 0.564648449420929, "logits/rejected": 0.5737762451171875, "logps/chosen": -246.3312530517578, "logps/rejected": -212.40625, "loss": 0.4882, "rewards/accuracies": 0.78125, "rewards/chosen": 0.04506225511431694, "rewards/margins": 2.0709471702575684, "rewards/rejected": -2.0231080055236816, "step": 1160 }, { "epoch": 0.1872, "grad_norm": 124.28207956892722, "learning_rate": 8.129599999999999e-07, "logits/chosen": 0.5801025629043579, "logits/rejected": 0.7880004644393921, "logps/chosen": -242.8000030517578, "logps/rejected": -203.6125030517578, "loss": 0.5827, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.25046998262405396, "rewards/margins": 2.2143311500549316, "rewards/rejected": -1.9604370594024658, "step": 1170 }, { "epoch": 0.1888, "grad_norm": 110.85159055066597, "learning_rate": 8.113599999999999e-07, "logits/chosen": 0.56549072265625, "logits/rejected": 0.7991577386856079, "logps/chosen": -233.83749389648438, "logps/rejected": -220.6374969482422, "loss": 0.5773, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6235901117324829, "rewards/margins": 1.920922875404358, "rewards/rejected": -2.5413575172424316, "step": 1180 }, { "epoch": 0.1904, "grad_norm": 120.73692866111101, "learning_rate": 8.0976e-07, "logits/chosen": 0.523327648639679, "logits/rejected": 0.655059814453125, "logps/chosen": -233.16250610351562, "logps/rejected": -197.91250610351562, "loss": 0.6488, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.070776343345642, "rewards/margins": 1.4341309070587158, "rewards/rejected": -2.5049805641174316, "step": 1190 }, { "epoch": 0.192, "grad_norm": 122.68074589493239, "learning_rate": 8.0816e-07, "logits/chosen": 0.4808593690395355, "logits/rejected": 
0.626635730266571, "logps/chosen": -226.4375, "logps/rejected": -207.7375030517578, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": -0.09896545112133026, "rewards/margins": 2.01220703125, "rewards/rejected": -2.1114745140075684, "step": 1200 }, { "epoch": 0.1936, "grad_norm": 72.86580057378845, "learning_rate": 8.0656e-07, "logits/chosen": 0.503979504108429, "logits/rejected": 0.6670898199081421, "logps/chosen": -227.0500030517578, "logps/rejected": -187.78750610351562, "loss": 0.572, "rewards/accuracies": 0.78125, "rewards/chosen": 0.643811047077179, "rewards/margins": 2.285351514816284, "rewards/rejected": -1.6405518054962158, "step": 1210 }, { "epoch": 0.1952, "grad_norm": 218.9297920115356, "learning_rate": 8.0496e-07, "logits/chosen": 0.4409728944301605, "logits/rejected": 0.618212878704071, "logps/chosen": -230.72500610351562, "logps/rejected": -195.6374969482422, "loss": 0.6299, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.668261706829071, "rewards/margins": 2.5193848609924316, "rewards/rejected": -1.8502197265625, "step": 1220 }, { "epoch": 0.1968, "grad_norm": 101.76931645643073, "learning_rate": 8.0336e-07, "logits/chosen": 0.5230652093887329, "logits/rejected": 0.734149158000946, "logps/chosen": -223.47500610351562, "logps/rejected": -187.66250610351562, "loss": 0.7038, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9327148199081421, "rewards/margins": 1.7838623523712158, "rewards/rejected": -2.717529296875, "step": 1230 }, { "epoch": 0.1984, "grad_norm": 52.66549419837037, "learning_rate": 8.0176e-07, "logits/chosen": 0.5707641839981079, "logits/rejected": 0.763989269733429, "logps/chosen": -246.6374969482422, "logps/rejected": -219.46249389648438, "loss": 0.5692, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5831054449081421, "rewards/margins": 2.3076171875, "rewards/rejected": -2.8880858421325684, "step": 1240 }, { "epoch": 0.2, "grad_norm": 268.2246243008476, "learning_rate": 
8.001599999999999e-07, "logits/chosen": 0.5026794672012329, "logits/rejected": 0.737500011920929, "logps/chosen": -228.6125030517578, "logps/rejected": -233.97500610351562, "loss": 0.7041, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8526366949081421, "rewards/margins": 2.0521240234375, "rewards/rejected": -2.9032225608825684, "step": 1250 }, { "epoch": 0.2016, "grad_norm": 136.65524552638988, "learning_rate": 7.9856e-07, "logits/chosen": 0.6878417730331421, "logits/rejected": 0.854687511920929, "logps/chosen": -215.5, "logps/rejected": -188.0625, "loss": 0.5934, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5452880859375, "rewards/margins": 2.008129835128784, "rewards/rejected": -2.5533690452575684, "step": 1260 }, { "epoch": 0.2032, "grad_norm": 97.19686782243825, "learning_rate": 7.9696e-07, "logits/chosen": 0.591784656047821, "logits/rejected": 0.694506824016571, "logps/chosen": -236.77499389648438, "logps/rejected": -188.39999389648438, "loss": 0.5231, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.901867687702179, "rewards/margins": 2.3388671875, "rewards/rejected": -1.435723900794983, "step": 1270 }, { "epoch": 0.2048, "grad_norm": 160.31407483122922, "learning_rate": 7.953599999999999e-07, "logits/chosen": 0.5442870855331421, "logits/rejected": 0.627001941204071, "logps/chosen": -223.875, "logps/rejected": -197.78750610351562, "loss": 0.7351, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.198205590248108, "rewards/margins": 1.678613305091858, "rewards/rejected": -0.4809326231479645, "step": 1280 }, { "epoch": 0.2064, "grad_norm": 169.50019923920786, "learning_rate": 7.9376e-07, "logits/chosen": 0.5093628168106079, "logits/rejected": 0.6820068359375, "logps/chosen": -243.8249969482422, "logps/rejected": -202.35000610351562, "loss": 0.6222, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.13181152939796448, "rewards/margins": 2.2594237327575684, "rewards/rejected": 
-2.391003370285034, "step": 1290 }, { "epoch": 0.208, "grad_norm": 186.16525364333114, "learning_rate": 7.9216e-07, "logits/chosen": 0.609509289264679, "logits/rejected": 0.7901855707168579, "logps/chosen": -204.35000610351562, "logps/rejected": -194.5124969482422, "loss": 0.6535, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2982666492462158, "rewards/margins": 1.902734398841858, "rewards/rejected": -3.19970703125, "step": 1300 }, { "epoch": 0.2096, "grad_norm": 134.73399832787567, "learning_rate": 7.9056e-07, "logits/chosen": null, "logits/rejected": 0.7830810546875, "logps/chosen": -258.29998779296875, "logps/rejected": -220.2375030517578, "loss": 0.6666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.087792992591858, "rewards/margins": 2.169872999191284, "rewards/rejected": -3.25830078125, "step": 1310 }, { "epoch": 0.2112, "grad_norm": 120.16015882395286, "learning_rate": 7.889599999999999e-07, "logits/chosen": 0.60833740234375, "logits/rejected": 0.77099609375, "logps/chosen": -241.59375, "logps/rejected": -202.8000030517578, "loss": 0.7344, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8186767101287842, "rewards/margins": 1.810546875, "rewards/rejected": -3.6296629905700684, "step": 1320 }, { "epoch": 0.2128, "grad_norm": 94.16428866501501, "learning_rate": 7.873599999999999e-07, "logits/chosen": 0.507861316204071, "logits/rejected": 0.6562134027481079, "logps/chosen": -238.6374969482422, "logps/rejected": -217.3249969482422, "loss": 0.4191, "rewards/accuracies": 0.8125, "rewards/chosen": -2.443066358566284, "rewards/margins": 2.3634276390075684, "rewards/rejected": -4.808203220367432, "step": 1330 }, { "epoch": 0.2144, "grad_norm": 151.48169575005033, "learning_rate": 7.8576e-07, "logits/chosen": 0.6117798089981079, "logits/rejected": 0.749646008014679, "logps/chosen": -239.1750030517578, "logps/rejected": -211.9375, "loss": 0.5664, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.647167921066284, 
"rewards/margins": 2.056884765625, "rewards/rejected": -4.7041015625, "step": 1340 }, { "epoch": 0.216, "grad_norm": 126.80092888813287, "learning_rate": 7.841599999999999e-07, "logits/chosen": 0.561999499797821, "logits/rejected": 0.693225085735321, "logps/chosen": -229.0749969482422, "logps/rejected": -207.03750610351562, "loss": 0.7138, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5824706554412842, "rewards/margins": 1.789209008216858, "rewards/rejected": -3.369921922683716, "step": 1350 }, { "epoch": 0.2176, "grad_norm": 211.81424394486982, "learning_rate": 7.8256e-07, "logits/chosen": 0.6391845941543579, "logits/rejected": 0.7094482183456421, "logps/chosen": -221.7624969482422, "logps/rejected": -222.41250610351562, "loss": 0.7258, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.871246337890625, "rewards/margins": 1.82470703125, "rewards/rejected": -2.6949219703674316, "step": 1360 }, { "epoch": 0.2192, "grad_norm": 104.04847293751368, "learning_rate": 7.8096e-07, "logits/chosen": 0.500805675983429, "logits/rejected": 0.6427978277206421, "logps/chosen": -225.6125030517578, "logps/rejected": -218.58749389648438, "loss": 0.7589, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2872803211212158, "rewards/margins": 1.876953125, "rewards/rejected": -3.1624999046325684, "step": 1370 }, { "epoch": 0.2208, "grad_norm": 141.11843012358895, "learning_rate": 7.793600000000001e-07, "logits/chosen": 0.47551268339157104, "logits/rejected": 0.6089111566543579, "logps/chosen": -249.71249389648438, "logps/rejected": -217.7624969482422, "loss": 0.535, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7452392578125, "rewards/margins": 2.7024168968200684, "rewards/rejected": -3.447460889816284, "step": 1380 }, { "epoch": 0.2224, "grad_norm": 58.73631673974729, "learning_rate": 7.777599999999999e-07, "logits/chosen": 0.34590452909469604, "logits/rejected": 0.5478271245956421, "logps/chosen": -243.22500610351562, 
"logps/rejected": -219.0749969482422, "loss": 0.5659, "rewards/accuracies": 0.75, "rewards/chosen": -1.144995093345642, "rewards/margins": 2.266918897628784, "rewards/rejected": -3.412890672683716, "step": 1390 }, { "epoch": 0.224, "grad_norm": 129.23752841225027, "learning_rate": 7.761599999999999e-07, "logits/chosen": 0.5350097417831421, "logits/rejected": 0.6661621332168579, "logps/chosen": -247.89999389648438, "logps/rejected": -216.39999389648438, "loss": 0.7026, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.095605492591858, "rewards/margins": 2.1724610328674316, "rewards/rejected": -3.2687745094299316, "step": 1400 }, { "epoch": 0.2256, "grad_norm": 77.21779891937061, "learning_rate": 7.7456e-07, "logits/chosen": 0.5290161371231079, "logits/rejected": 0.7059692144393921, "logps/chosen": -258.3374938964844, "logps/rejected": -214.4875030517578, "loss": 0.5684, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1089966297149658, "rewards/margins": 2.0660157203674316, "rewards/rejected": -3.1731934547424316, "step": 1410 }, { "epoch": 0.2272, "grad_norm": 124.87387807517305, "learning_rate": 7.729599999999999e-07, "logits/chosen": 0.4524902403354645, "logits/rejected": 0.6333252191543579, "logps/chosen": -243.91250610351562, "logps/rejected": -216.43124389648438, "loss": 0.6795, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.915692150592804, "rewards/margins": 2.53662109375, "rewards/rejected": -3.4503417015075684, "step": 1420 }, { "epoch": 0.2288, "grad_norm": 105.0781431987365, "learning_rate": 7.7136e-07, "logits/chosen": 0.4040283262729645, "logits/rejected": 0.5627182126045227, "logps/chosen": -253.5500030517578, "logps/rejected": -233.60000610351562, "loss": 0.6526, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5569092035293579, "rewards/margins": 2.224365234375, "rewards/rejected": -2.7821044921875, "step": 1430 }, { "epoch": 0.2304, "grad_norm": 98.17874940354532, "learning_rate": 7.6976e-07, 
"logits/chosen": 0.4698730409145355, "logits/rejected": 0.6697998046875, "logps/chosen": -225.7624969482422, "logps/rejected": -202.2375030517578, "loss": 0.6949, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5781615972518921, "rewards/margins": 2.172253370285034, "rewards/rejected": -2.7498536109924316, "step": 1440 }, { "epoch": 0.232, "grad_norm": 66.25513422343734, "learning_rate": 7.6816e-07, "logits/chosen": 0.4568115174770355, "logits/rejected": 0.57928466796875, "logps/chosen": -241.08749389648438, "logps/rejected": -236.3625030517578, "loss": 0.7066, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.935070812702179, "rewards/margins": 2.0193724632263184, "rewards/rejected": -2.9546875953674316, "step": 1450 }, { "epoch": 0.2336, "grad_norm": 180.99933806480348, "learning_rate": 7.6656e-07, "logits/chosen": 0.4901885986328125, "logits/rejected": 0.597973644733429, "logps/chosen": -262.9624938964844, "logps/rejected": -234.1374969482422, "loss": 0.6928, "rewards/accuracies": 0.75, "rewards/chosen": -0.758129894733429, "rewards/margins": 2.446484327316284, "rewards/rejected": -3.203295946121216, "step": 1460 }, { "epoch": 0.2352, "grad_norm": 86.02162196142035, "learning_rate": 7.649599999999999e-07, "logits/chosen": 0.4291748106479645, "logits/rejected": 0.6058593988418579, "logps/chosen": -235.8000030517578, "logps/rejected": -222.4250030517578, "loss": 0.5756, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6880737543106079, "rewards/margins": 2.1203246116638184, "rewards/rejected": -2.808789014816284, "step": 1470 }, { "epoch": 0.2368, "grad_norm": 168.8389925314071, "learning_rate": 7.6336e-07, "logits/chosen": 0.570910632610321, "logits/rejected": 0.697924792766571, "logps/chosen": -240.83749389648438, "logps/rejected": -218.4499969482422, "loss": 0.6798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.685009777545929, "rewards/margins": 2.06396484375, "rewards/rejected": -2.748706102371216, 
"step": 1480 }, { "epoch": 0.2384, "grad_norm": 77.4898331719457, "learning_rate": 7.617599999999999e-07, "logits/chosen": 0.562695324420929, "logits/rejected": 0.60992431640625, "logps/chosen": -260.9125061035156, "logps/rejected": -221.89999389648438, "loss": 0.6677, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.954486072063446, "rewards/margins": 1.8463256359100342, "rewards/rejected": -2.800585985183716, "step": 1490 }, { "epoch": 0.24, "grad_norm": 155.8045585530678, "learning_rate": 7.601599999999999e-07, "logits/chosen": 0.573229968547821, "logits/rejected": 0.7222900390625, "logps/chosen": -239.6750030517578, "logps/rejected": -225.47500610351562, "loss": 0.6409, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.556665062904358, "rewards/margins": 2.2752442359924316, "rewards/rejected": -3.830859422683716, "step": 1500 }, { "epoch": 0.2416, "grad_norm": 65.16549089314945, "learning_rate": 7.5856e-07, "logits/chosen": 0.61871337890625, "logits/rejected": 0.718945324420929, "logps/chosen": -240.3125, "logps/rejected": -186.3625030517578, "loss": 0.6232, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.348242163658142, "rewards/margins": 1.956091284751892, "rewards/rejected": -3.305468797683716, "step": 1510 }, { "epoch": 0.2432, "grad_norm": 65.24423370074571, "learning_rate": 7.5696e-07, "logits/chosen": 0.60675048828125, "logits/rejected": 0.7623046636581421, "logps/chosen": -234.47500610351562, "logps/rejected": -220.6750030517578, "loss": 0.5619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.091040015220642, "rewards/margins": 2.7364258766174316, "rewards/rejected": -3.8257079124450684, "step": 1520 }, { "epoch": 0.2448, "grad_norm": 133.44691034483841, "learning_rate": 7.5536e-07, "logits/chosen": 0.6266723871231079, "logits/rejected": 0.691455066204071, "logps/chosen": -245.8874969482422, "logps/rejected": -208.16250610351562, "loss": 0.7486, "rewards/accuracies": 0.737500011920929, 
"rewards/chosen": -1.0851562023162842, "rewards/margins": 1.665795922279358, "rewards/rejected": -2.7510986328125, "step": 1530 }, { "epoch": 0.2464, "grad_norm": 121.49917383450642, "learning_rate": 7.537599999999999e-07, "logits/chosen": 0.5602172613143921, "logits/rejected": 0.684826672077179, "logps/chosen": -211.1999969482422, "logps/rejected": -175.1125030517578, "loss": 0.54, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.225317358970642, "rewards/margins": 1.9990234375, "rewards/rejected": -3.222851514816284, "step": 1540 }, { "epoch": 0.248, "grad_norm": 90.45557580010014, "learning_rate": 7.5216e-07, "logits/chosen": 0.5130249261856079, "logits/rejected": 0.6347411870956421, "logps/chosen": -264.70001220703125, "logps/rejected": -230.4375, "loss": 0.7062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16748046875, "rewards/margins": 2.1143555641174316, "rewards/rejected": -2.2809691429138184, "step": 1550 }, { "epoch": 0.2496, "grad_norm": 103.26819213089799, "learning_rate": 7.5056e-07, "logits/chosen": 0.5665954351425171, "logits/rejected": 0.7652587890625, "logps/chosen": -231.1875, "logps/rejected": -201.33749389648438, "loss": 0.5992, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.43757325410842896, "rewards/margins": 2.503735303878784, "rewards/rejected": -2.0625977516174316, "step": 1560 }, { "epoch": 0.2512, "grad_norm": 191.83500922040844, "learning_rate": 7.489599999999999e-07, "logits/chosen": 0.614093005657196, "logits/rejected": 0.7955077886581421, "logps/chosen": -262.9125061035156, "logps/rejected": -206.47500610351562, "loss": 0.7483, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.974072277545929, "rewards/margins": 2.356982469558716, "rewards/rejected": -1.3817017078399658, "step": 1570 }, { "epoch": 0.2528, "grad_norm": 123.53275347529853, "learning_rate": 7.4736e-07, "logits/chosen": 0.761584460735321, "logits/rejected": 0.967041015625, "logps/chosen": -223.0, "logps/rejected": 
-185.3125, "loss": 0.5505, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.011474609375, "rewards/margins": 2.7007813453674316, "rewards/rejected": -2.6905274391174316, "step": 1580 }, { "epoch": 0.2544, "grad_norm": 142.91415975721222, "learning_rate": 7.4576e-07, "logits/chosen": 0.7645400762557983, "logits/rejected": 0.906689465045929, "logps/chosen": -213.3125, "logps/rejected": -205.25, "loss": 0.5752, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2771728038787842, "rewards/margins": 2.008007764816284, "rewards/rejected": -3.2876954078674316, "step": 1590 }, { "epoch": 0.256, "grad_norm": 126.73676058963068, "learning_rate": 7.4416e-07, "logits/chosen": 0.8503662347793579, "logits/rejected": 0.9769042730331421, "logps/chosen": -229.7624969482422, "logps/rejected": -209.4875030517578, "loss": 0.6422, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9767577648162842, "rewards/margins": 1.8313477039337158, "rewards/rejected": -3.806445360183716, "step": 1600 }, { "epoch": 0.2576, "grad_norm": 175.92636480428266, "learning_rate": 7.4256e-07, "logits/chosen": 0.83905029296875, "logits/rejected": 0.8807373046875, "logps/chosen": -242.4875030517578, "logps/rejected": -209.5749969482422, "loss": 0.5404, "rewards/accuracies": 0.78125, "rewards/chosen": -1.022729516029358, "rewards/margins": 2.636962890625, "rewards/rejected": -3.658252000808716, "step": 1610 }, { "epoch": 0.2592, "grad_norm": 56.03703213000988, "learning_rate": 7.409599999999999e-07, "logits/chosen": 0.788256824016571, "logits/rejected": 0.901074230670929, "logps/chosen": -247.22500610351562, "logps/rejected": -207.7375030517578, "loss": 0.616, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5648559331893921, "rewards/margins": 2.180126905441284, "rewards/rejected": -2.742480516433716, "step": 1620 }, { "epoch": 0.2608, "grad_norm": 192.17430686158704, "learning_rate": 7.3936e-07, "logits/chosen": 0.687915027141571, "logits/rejected": 
0.775524914264679, "logps/chosen": -251.2375030517578, "logps/rejected": -215.9875030517578, "loss": 0.6525, "rewards/accuracies": 0.75, "rewards/chosen": 0.19647216796875, "rewards/margins": 2.481738328933716, "rewards/rejected": -2.284069776535034, "step": 1630 }, { "epoch": 0.2624, "grad_norm": 232.72441602932307, "learning_rate": 7.377599999999999e-07, "logits/chosen": 0.668286144733429, "logits/rejected": 0.887615978717804, "logps/chosen": -227.6374969482422, "logps/rejected": -189.8249969482422, "loss": 0.7726, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.69921875, "rewards/margins": 1.630468726158142, "rewards/rejected": -2.3294920921325684, "step": 1640 }, { "epoch": 0.264, "grad_norm": 132.00517360839555, "learning_rate": 7.3616e-07, "logits/chosen": 0.637585461139679, "logits/rejected": 0.900097668170929, "logps/chosen": -244.28750610351562, "logps/rejected": -192.28750610351562, "loss": 0.6077, "rewards/accuracies": 0.75, "rewards/chosen": -0.9097900390625, "rewards/margins": 2.2595582008361816, "rewards/rejected": -3.16943359375, "step": 1650 }, { "epoch": 0.2656, "grad_norm": 109.92384534706407, "learning_rate": 7.3456e-07, "logits/chosen": 0.7176513671875, "logits/rejected": 0.8939453363418579, "logps/chosen": -256.0249938964844, "logps/rejected": -210.5124969482422, "loss": 0.5571, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.412988305091858, "rewards/margins": 2.295239210128784, "rewards/rejected": -3.706835985183716, "step": 1660 }, { "epoch": 0.2672, "grad_norm": 111.29954729013393, "learning_rate": 7.329599999999999e-07, "logits/chosen": 0.677288830280304, "logits/rejected": 0.929614245891571, "logps/chosen": -241.03125, "logps/rejected": -199.0749969482422, "loss": 0.6617, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.38414305448532104, "rewards/margins": 2.320343017578125, "rewards/rejected": -2.702685594558716, "step": 1670 }, { "epoch": 0.2688, "grad_norm": 216.1815545860013, "learning_rate": 
7.3136e-07, "logits/chosen": null, "logits/rejected": 0.721466064453125, "logps/chosen": -246.7624969482422, "logps/rejected": -409.82501220703125, "loss": 0.9829, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24089355766773224, "rewards/margins": 1.9621703624725342, "rewards/rejected": -1.7216675281524658, "step": 1680 }, { "epoch": 0.2704, "grad_norm": 127.60030311050626, "learning_rate": 7.297599999999999e-07, "logits/chosen": 0.6167968511581421, "logits/rejected": 0.8377441167831421, "logps/chosen": -256.9750061035156, "logps/rejected": -213.31875610351562, "loss": 0.6391, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.805419921875, "rewards/margins": 2.499804735183716, "rewards/rejected": -3.3050780296325684, "step": 1690 }, { "epoch": 0.272, "grad_norm": 107.50542644083856, "learning_rate": 7.2816e-07, "logits/chosen": 0.7699218988418579, "logits/rejected": 0.8668578863143921, "logps/chosen": -261.45001220703125, "logps/rejected": -237.02499389648438, "loss": 0.5342, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.748022437095642, "rewards/margins": 2.7164063453674316, "rewards/rejected": -4.461718559265137, "step": 1700 }, { "epoch": 0.2736, "grad_norm": 146.089664653905, "learning_rate": 7.265599999999999e-07, "logits/chosen": 0.741198718547821, "logits/rejected": 0.829541027545929, "logps/chosen": -208.71249389648438, "logps/rejected": -185.39999389648438, "loss": 0.6651, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.179248094558716, "rewards/margins": 1.9802734851837158, "rewards/rejected": -5.159765720367432, "step": 1710 }, { "epoch": 0.2752, "grad_norm": 67.86647527873484, "learning_rate": 7.2496e-07, "logits/chosen": 0.7754150629043579, "logits/rejected": 0.849291980266571, "logps/chosen": -240.6875, "logps/rejected": -203.66250610351562, "loss": 0.4522, "rewards/accuracies": 0.84375, "rewards/chosen": -1.499975562095642, "rewards/margins": 2.490649461746216, "rewards/rejected": 
-3.9908204078674316, "step": 1720 }, { "epoch": 0.2768, "grad_norm": 116.23381169673198, "learning_rate": 7.2336e-07, "logits/chosen": 0.8299804925918579, "logits/rejected": 0.901806652545929, "logps/chosen": -214.16250610351562, "logps/rejected": -185.375, "loss": 0.6066, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7080078125, "rewards/margins": 2.159374952316284, "rewards/rejected": -3.8656249046325684, "step": 1730 }, { "epoch": 0.2784, "grad_norm": 156.51827224773677, "learning_rate": 7.2176e-07, "logits/chosen": 0.725329577922821, "logits/rejected": 0.872607409954071, "logps/chosen": -243.25, "logps/rejected": -213.72500610351562, "loss": 0.7483, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.827746570110321, "rewards/margins": 1.9357421398162842, "rewards/rejected": -2.7651429176330566, "step": 1740 }, { "epoch": 0.28, "grad_norm": 112.03559323608981, "learning_rate": 7.2016e-07, "logits/chosen": 0.720410168170929, "logits/rejected": 0.875378429889679, "logps/chosen": -225.75, "logps/rejected": -206.72500610351562, "loss": 0.5216, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.34782105684280396, "rewards/margins": 2.362060546875, "rewards/rejected": -2.70751953125, "step": 1750 }, { "epoch": 0.2816, "grad_norm": 64.75012403664128, "learning_rate": 7.185599999999999e-07, "logits/chosen": 0.742785632610321, "logits/rejected": 0.8716796636581421, "logps/chosen": -241.6999969482422, "logps/rejected": -210.39999389648438, "loss": 0.5582, "rewards/accuracies": 0.75, "rewards/chosen": -0.028564453125, "rewards/margins": 2.4826416969299316, "rewards/rejected": -2.509417772293091, "step": 1760 }, { "epoch": 0.2832, "grad_norm": 136.360314219101, "learning_rate": 7.1696e-07, "logits/chosen": 0.7409423589706421, "logits/rejected": 0.8479858636856079, "logps/chosen": -231.9875030517578, "logps/rejected": -209.72500610351562, "loss": 0.6234, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.763317883014679, 
"rewards/margins": 2.140454053878784, "rewards/rejected": -2.904101610183716, "step": 1770 }, { "epoch": 0.2848, "grad_norm": 59.0192985240941, "learning_rate": 7.153599999999999e-07, "logits/chosen": 0.7479492425918579, "logits/rejected": 0.9000946283340454, "logps/chosen": -227.7624969482422, "logps/rejected": -203.6875, "loss": 0.7698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9976257085800171, "rewards/margins": 1.8917236328125, "rewards/rejected": -2.8868408203125, "step": 1780 }, { "epoch": 0.2864, "grad_norm": 126.92765809750003, "learning_rate": 7.137599999999999e-07, "logits/chosen": 0.8992919921875, "logits/rejected": 0.951855480670929, "logps/chosen": -215.77499389648438, "logps/rejected": -193.3625030517578, "loss": 0.5672, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.676416039466858, "rewards/margins": 2.236035108566284, "rewards/rejected": -3.911181688308716, "step": 1790 }, { "epoch": 0.288, "grad_norm": 144.66364890572726, "learning_rate": 7.1216e-07, "logits/chosen": 0.7940673828125, "logits/rejected": 0.8644043207168579, "logps/chosen": -255.5500030517578, "logps/rejected": -217.5500030517578, "loss": 0.5785, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.16455078125, "rewards/margins": 2.523852586746216, "rewards/rejected": -3.690673828125, "step": 1800 }, { "epoch": 0.2896, "grad_norm": 124.16822381154583, "learning_rate": 7.1056e-07, "logits/chosen": 0.768310546875, "logits/rejected": 0.8497314453125, "logps/chosen": -246.0124969482422, "logps/rejected": -216.77499389648438, "loss": 0.596, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.560571312904358, "rewards/margins": 2.1285157203674316, "rewards/rejected": -3.6910157203674316, "step": 1810 }, { "epoch": 0.2912, "grad_norm": 114.26057613848202, "learning_rate": 7.0896e-07, "logits/chosen": 0.6771606206893921, "logits/rejected": 0.8094238042831421, "logps/chosen": -217.6374969482422, "logps/rejected": -207.58749389648438, 
"loss": 0.5232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.828320324420929, "rewards/margins": 3.115185499191284, "rewards/rejected": -3.9449219703674316, "step": 1820 }, { "epoch": 0.2928, "grad_norm": 202.92650488026328, "learning_rate": 7.0736e-07, "logits/chosen": 0.6882568597793579, "logits/rejected": 0.814453125, "logps/chosen": -258.0249938964844, "logps/rejected": -222.1125030517578, "loss": 0.6405, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.898083508014679, "rewards/margins": 2.2346434593200684, "rewards/rejected": -3.133007764816284, "step": 1830 }, { "epoch": 0.2944, "grad_norm": 123.15221139781302, "learning_rate": 7.0576e-07, "logits/chosen": null, "logits/rejected": 0.836865246295929, "logps/chosen": -249.33749389648438, "logps/rejected": -230.6875, "loss": 0.7122, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8819580078125, "rewards/margins": 2.0771727561950684, "rewards/rejected": -3.959033250808716, "step": 1840 }, { "epoch": 0.296, "grad_norm": 121.14972849762013, "learning_rate": 7.0416e-07, "logits/chosen": 0.7655273675918579, "logits/rejected": 0.9085448980331421, "logps/chosen": -219.52499389648438, "logps/rejected": -204.52499389648438, "loss": 0.791, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9534423351287842, "rewards/margins": 2.163745164871216, "rewards/rejected": -4.1171875, "step": 1850 }, { "epoch": 0.2976, "grad_norm": 312.45507284925014, "learning_rate": 7.025599999999999e-07, "logits/chosen": 0.689257800579071, "logits/rejected": 0.8206787109375, "logps/chosen": -246.75, "logps/rejected": -208.22500610351562, "loss": 0.8676, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.53802490234375, "rewards/margins": 1.8668944835662842, "rewards/rejected": -3.4044432640075684, "step": 1860 }, { "epoch": 0.2992, "grad_norm": 115.32557519334594, "learning_rate": 7.0096e-07, "logits/chosen": 0.6763671636581421, "logits/rejected": 0.833386242389679, 
"logps/chosen": -223.0500030517578, "logps/rejected": -217.22500610351562, "loss": 0.5239, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0249207019805908, "rewards/margins": 2.557812452316284, "rewards/rejected": -3.5841307640075684, "step": 1870 }, { "epoch": 0.3008, "grad_norm": 136.65928398014532, "learning_rate": 6.9936e-07, "logits/chosen": 0.6982482671737671, "logits/rejected": 0.8886963129043579, "logps/chosen": -233.6750030517578, "logps/rejected": -213.5500030517578, "loss": 0.582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.255102515220642, "rewards/margins": 2.702343702316284, "rewards/rejected": -3.957714796066284, "step": 1880 }, { "epoch": 0.3024, "grad_norm": 170.64204451603538, "learning_rate": 6.9776e-07, "logits/chosen": 0.7345215082168579, "logits/rejected": 0.894116222858429, "logps/chosen": -230.53750610351562, "logps/rejected": -204.7375030517578, "loss": 0.5033, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.602148413658142, "rewards/margins": 2.6983399391174316, "rewards/rejected": -4.299218654632568, "step": 1890 }, { "epoch": 0.304, "grad_norm": 153.0045128435963, "learning_rate": 6.9616e-07, "logits/chosen": 0.728161633014679, "logits/rejected": 0.918286144733429, "logps/chosen": -243.9875030517578, "logps/rejected": -216.8000030517578, "loss": 0.6743, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.901879906654358, "rewards/margins": 1.9805176258087158, "rewards/rejected": -3.8857421875, "step": 1900 }, { "epoch": 0.3056, "grad_norm": 266.60970567114714, "learning_rate": 6.945599999999999e-07, "logits/chosen": 0.7692016363143921, "logits/rejected": 0.926806628704071, "logps/chosen": -245.875, "logps/rejected": -225.9875030517578, "loss": 0.5769, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2130858898162842, "rewards/margins": 2.6349120140075684, "rewards/rejected": -3.8453125953674316, "step": 1910 }, { "epoch": 0.3072, "grad_norm": 176.98761442864313, 
"learning_rate": 6.9296e-07, "logits/chosen": 0.645068347454071, "logits/rejected": 0.7796630859375, "logps/chosen": -248.72500610351562, "logps/rejected": -218.3625030517578, "loss": 0.5722, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4583984315395355, "rewards/margins": 2.6269774436950684, "rewards/rejected": -3.08642578125, "step": 1920 }, { "epoch": 0.3088, "grad_norm": 214.08888466926393, "learning_rate": 6.913599999999999e-07, "logits/chosen": 0.5599609613418579, "logits/rejected": 0.690258800983429, "logps/chosen": -259.2749938964844, "logps/rejected": -222.02499389648438, "loss": 0.5537, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.891040027141571, "rewards/margins": 2.9544920921325684, "rewards/rejected": -3.84375, "step": 1930 }, { "epoch": 0.3104, "grad_norm": 143.43362747178534, "learning_rate": 6.8976e-07, "logits/chosen": 0.6734619140625, "logits/rejected": 0.7696563601493835, "logps/chosen": -240.5, "logps/rejected": -229.7624969482422, "loss": 0.5484, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4538452625274658, "rewards/margins": 2.620800733566284, "rewards/rejected": -4.075097560882568, "step": 1940 }, { "epoch": 0.312, "grad_norm": 59.54211348820151, "learning_rate": 6.8816e-07, "logits/chosen": 0.674548327922821, "logits/rejected": 0.9041992425918579, "logps/chosen": -257.3500061035156, "logps/rejected": -225.9375, "loss": 0.7741, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.246069312095642, "rewards/margins": 2.31396484375, "rewards/rejected": -3.558056592941284, "step": 1950 }, { "epoch": 0.3136, "grad_norm": 120.78222068579896, "learning_rate": 6.865599999999999e-07, "logits/chosen": 0.7765136957168579, "logits/rejected": 0.909912109375, "logps/chosen": -241.8000030517578, "logps/rejected": -188.125, "loss": 0.595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9190918207168579, "rewards/margins": 2.442700147628784, "rewards/rejected": -3.361132860183716, "step": 
1960 }, { "epoch": 0.3152, "grad_norm": 94.67752643947595, "learning_rate": 6.8496e-07, "logits/chosen": 0.749371349811554, "logits/rejected": 0.8358734250068665, "logps/chosen": -220.97500610351562, "logps/rejected": -195.66250610351562, "loss": 0.5278, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.13158568739891052, "rewards/margins": 2.9180665016174316, "rewards/rejected": -3.0472655296325684, "step": 1970 }, { "epoch": 0.3168, "grad_norm": 127.09300156171604, "learning_rate": 6.833599999999999e-07, "logits/chosen": 0.6963135004043579, "logits/rejected": 0.883502185344696, "logps/chosen": -236.0124969482422, "logps/rejected": -224.53750610351562, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -0.21137695014476776, "rewards/margins": 2.6353759765625, "rewards/rejected": -2.844531297683716, "step": 1980 }, { "epoch": 0.3184, "grad_norm": 232.3092713651462, "learning_rate": 6.8176e-07, "logits/chosen": 0.8005615472793579, "logits/rejected": 0.893505871295929, "logps/chosen": -224.33749389648438, "logps/rejected": -206.2375030517578, "loss": 0.4565, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.31840819120407104, "rewards/margins": 2.365966796875, "rewards/rejected": -2.6835083961486816, "step": 1990 }, { "epoch": 0.32, "grad_norm": 149.31588657462103, "learning_rate": 6.801599999999999e-07, "logits/chosen": 0.8180176019668579, "logits/rejected": 0.878588855266571, "logps/chosen": -243.78750610351562, "logps/rejected": -226.03750610351562, "loss": 0.5278, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.17879638075828552, "rewards/margins": 2.4073729515075684, "rewards/rejected": -2.226855516433716, "step": 2000 }, { "epoch": 0.3216, "grad_norm": 125.89816321836773, "learning_rate": 6.7856e-07, "logits/chosen": 0.8502441644668579, "logits/rejected": 1.018945336341858, "logps/chosen": -231.47500610351562, "logps/rejected": -218.3874969482422, "loss": 0.6272, "rewards/accuracies": 0.731249988079071, 
"rewards/chosen": -0.29513245820999146, "rewards/margins": 2.4769043922424316, "rewards/rejected": -2.7708497047424316, "step": 2010 }, { "epoch": 0.3232, "grad_norm": 96.64226865971311, "learning_rate": 6.7696e-07, "logits/chosen": 0.830249011516571, "logits/rejected": 0.880816638469696, "logps/chosen": -246.875, "logps/rejected": -203.6999969482422, "loss": 0.5608, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1153564453125, "rewards/margins": 2.6416015625, "rewards/rejected": -2.757861375808716, "step": 2020 }, { "epoch": 0.3248, "grad_norm": 124.9842664411677, "learning_rate": 6.7536e-07, "logits/chosen": 0.7671142816543579, "logits/rejected": 0.829541027545929, "logps/chosen": -254.25, "logps/rejected": -228.33749389648438, "loss": 0.6383, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3365844786167145, "rewards/margins": 2.7112793922424316, "rewards/rejected": -3.04736328125, "step": 2030 }, { "epoch": 0.3264, "grad_norm": 152.62569676149516, "learning_rate": 6.7376e-07, "logits/chosen": 0.887805163860321, "logits/rejected": 0.9347900152206421, "logps/chosen": -210.71249389648438, "logps/rejected": -194.4812469482422, "loss": 0.6265, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.957067847251892, "rewards/margins": 2.081103563308716, "rewards/rejected": -4.03515625, "step": 2040 }, { "epoch": 0.328, "grad_norm": 123.19966797408996, "learning_rate": 6.7216e-07, "logits/chosen": 0.8014160394668579, "logits/rejected": 0.8954833745956421, "logps/chosen": -277.2124938964844, "logps/rejected": -228.3000030517578, "loss": 0.5705, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.569665551185608, "rewards/margins": 3.157031297683716, "rewards/rejected": -4.726366996765137, "step": 2050 }, { "epoch": 0.3296, "grad_norm": 158.1496766412668, "learning_rate": 6.7056e-07, "logits/chosen": 0.870190441608429, "logits/rejected": 0.982861340045929, "logps/chosen": -227.22500610351562, "logps/rejected": 
-210.16250610351562, "loss": 0.6492, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6833374500274658, "rewards/margins": 2.7630372047424316, "rewards/rejected": -4.444921970367432, "step": 2060 }, { "epoch": 0.3312, "grad_norm": 99.85596975700024, "learning_rate": 6.689599999999999e-07, "logits/chosen": 0.765795886516571, "logits/rejected": 0.9016357660293579, "logps/chosen": -219.3625030517578, "logps/rejected": -186.97500610351562, "loss": 0.6064, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.457763671875, "rewards/margins": 2.002880811691284, "rewards/rejected": -3.462207078933716, "step": 2070 }, { "epoch": 0.3328, "grad_norm": 104.02193278400794, "learning_rate": 6.673599999999999e-07, "logits/chosen": 0.715624988079071, "logits/rejected": 0.7900390625, "logps/chosen": -245.6999969482422, "logps/rejected": -209.78750610351562, "loss": 0.5939, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.07892455905675888, "rewards/margins": 2.489697217941284, "rewards/rejected": -2.409533739089966, "step": 2080 }, { "epoch": 0.3344, "grad_norm": 102.84484761432033, "learning_rate": 6.6576e-07, "logits/chosen": 0.835156261920929, "logits/rejected": 1.068017601966858, "logps/chosen": -228.58749389648438, "logps/rejected": -210.86874389648438, "loss": 0.5351, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8349853754043579, "rewards/margins": 2.714794874191284, "rewards/rejected": -1.8772461414337158, "step": 2090 }, { "epoch": 0.336, "grad_norm": 138.47130516338188, "learning_rate": 6.6416e-07, "logits/chosen": 0.7479248046875, "logits/rejected": 0.8705688714981079, "logps/chosen": -219.16250610351562, "logps/rejected": -202.85000610351562, "loss": 0.7408, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.9234497547149658, "rewards/margins": 2.2261719703674316, "rewards/rejected": -0.30218809843063354, "step": 2100 }, { "epoch": 0.3376, "grad_norm": 167.49902252661684, "learning_rate": 6.6256e-07, 
"logits/chosen": 0.774096667766571, "logits/rejected": 0.9251464605331421, "logps/chosen": -223.4375, "logps/rejected": -190.47500610351562, "loss": 0.6761, "rewards/accuracies": 0.6875, "rewards/chosen": 2.384570360183716, "rewards/margins": 1.784521460533142, "rewards/rejected": 0.598559558391571, "step": 2110 }, { "epoch": 0.3392, "grad_norm": 172.43541599897787, "learning_rate": 6.6096e-07, "logits/chosen": 0.635913074016571, "logits/rejected": 0.8281005620956421, "logps/chosen": -252.02499389648438, "logps/rejected": -223.8625030517578, "loss": 0.5798, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 2.0545897483825684, "rewards/margins": 2.454882860183716, "rewards/rejected": -0.39958494901657104, "step": 2120 }, { "epoch": 0.3408, "grad_norm": 128.70949619783258, "learning_rate": 6.593599999999999e-07, "logits/chosen": 0.6391662359237671, "logits/rejected": 0.8582763671875, "logps/chosen": -273.5249938964844, "logps/rejected": -222.97500610351562, "loss": 0.4865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1591796875, "rewards/margins": 3.8724608421325684, "rewards/rejected": -1.7156493663787842, "step": 2130 }, { "epoch": 0.3424, "grad_norm": 180.3243297125401, "learning_rate": 6.577599999999999e-07, "logits/chosen": 0.6510986089706421, "logits/rejected": 0.8548583984375, "logps/chosen": -223.71249389648438, "logps/rejected": -201.89999389648438, "loss": 0.5581, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.23251953721046448, "rewards/margins": 2.7448973655700684, "rewards/rejected": -2.9790282249450684, "step": 2140 }, { "epoch": 0.344, "grad_norm": 150.09954354936508, "learning_rate": 6.561599999999999e-07, "logits/chosen": 0.693981945514679, "logits/rejected": 0.8131347894668579, "logps/chosen": -216.89999389648438, "logps/rejected": -198.4375, "loss": 0.5962, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.680419921875, "rewards/margins": 2.2735838890075684, "rewards/rejected": -3.951953172683716, 
"step": 2150 }, { "epoch": 0.3456, "grad_norm": 135.7188994686067, "learning_rate": 6.5456e-07, "logits/chosen": 0.589794933795929, "logits/rejected": 0.7225097417831421, "logps/chosen": -235.27499389648438, "logps/rejected": -212.9499969482422, "loss": 0.7484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.252026319503784, "rewards/margins": 2.4186034202575684, "rewards/rejected": -4.671044826507568, "step": 2160 }, { "epoch": 0.3472, "grad_norm": 76.36584237950105, "learning_rate": 6.5296e-07, "logits/chosen": 0.6092773675918579, "logits/rejected": 0.706713855266571, "logps/chosen": -282.125, "logps/rejected": -216.3625030517578, "loss": 1.4931, "rewards/accuracies": 0.75, "rewards/chosen": -3.3864989280700684, "rewards/margins": 1.797753930091858, "rewards/rejected": -5.185937404632568, "step": 2170 }, { "epoch": 0.3488, "grad_norm": 110.67216672365826, "learning_rate": 6.5136e-07, "logits/chosen": 0.6737060546875, "logits/rejected": 0.785998523235321, "logps/chosen": -275.9750061035156, "logps/rejected": -234.9499969482422, "loss": 0.8654, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.852294921875, "rewards/margins": 2.0577149391174316, "rewards/rejected": -3.913281202316284, "step": 2180 }, { "epoch": 0.3504, "grad_norm": 273.31685348815483, "learning_rate": 6.4976e-07, "logits/chosen": 0.7088867425918579, "logits/rejected": 0.7575439214706421, "logps/chosen": -255.8000030517578, "logps/rejected": -205.83749389648438, "loss": 0.8244, "rewards/accuracies": 0.75, "rewards/chosen": -1.4041748046875, "rewards/margins": 2.078564405441284, "rewards/rejected": -3.4818358421325684, "step": 2190 }, { "epoch": 0.352, "grad_norm": 94.7449880972769, "learning_rate": 6.4816e-07, "logits/chosen": 0.637097179889679, "logits/rejected": 0.7564452886581421, "logps/chosen": -257.0625, "logps/rejected": -218.08749389648438, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": -0.628222644329071, "rewards/margins": 3.1845703125, 
"rewards/rejected": -3.811230421066284, "step": 2200 }, { "epoch": 0.3536, "grad_norm": 119.51464663107423, "learning_rate": 6.4656e-07, "logits/chosen": 0.772906482219696, "logits/rejected": 0.889575183391571, "logps/chosen": -228.94375610351562, "logps/rejected": -202.2375030517578, "loss": 0.7707, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7989257574081421, "rewards/margins": 2.475817918777466, "rewards/rejected": -3.2759766578674316, "step": 2210 }, { "epoch": 0.3552, "grad_norm": 183.84528316786364, "learning_rate": 6.449599999999999e-07, "logits/chosen": 0.8477783203125, "logits/rejected": 0.9874267578125, "logps/chosen": -241.96249389648438, "logps/rejected": -218.41250610351562, "loss": 0.6104, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4881347715854645, "rewards/margins": 2.8127198219299316, "rewards/rejected": -3.2998046875, "step": 2220 }, { "epoch": 0.3568, "grad_norm": 95.81252622763674, "learning_rate": 6.4336e-07, "logits/chosen": 0.8132079839706421, "logits/rejected": 0.9325195550918579, "logps/chosen": -230.72500610351562, "logps/rejected": -215.0749969482422, "loss": 0.6519, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.233697533607483, "rewards/margins": 2.5452637672424316, "rewards/rejected": -3.7757811546325684, "step": 2230 }, { "epoch": 0.3584, "grad_norm": 261.2794156838422, "learning_rate": 6.4176e-07, "logits/chosen": 0.870129406452179, "logits/rejected": 0.978588879108429, "logps/chosen": -255.2375030517578, "logps/rejected": -219.52499389648438, "loss": 0.5715, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.234106421470642, "rewards/margins": 2.695019483566284, "rewards/rejected": -3.931591749191284, "step": 2240 }, { "epoch": 0.36, "grad_norm": 115.75557407398367, "learning_rate": 6.401599999999999e-07, "logits/chosen": 0.787609875202179, "logits/rejected": 0.929003894329071, "logps/chosen": -238.9875030517578, "logps/rejected": -220.375, "loss": 0.579, 
"rewards/accuracies": 0.78125, "rewards/chosen": -1.2381103038787842, "rewards/margins": 3.004162549972534, "rewards/rejected": -4.243554592132568, "step": 2250 }, { "epoch": 0.3616, "grad_norm": 88.62182209177624, "learning_rate": 6.3856e-07, "logits/chosen": 0.8859008550643921, "logits/rejected": 0.9921875, "logps/chosen": -237.7375030517578, "logps/rejected": -205.9875030517578, "loss": 0.5923, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.027551293373108, "rewards/margins": 2.470947265625, "rewards/rejected": -3.4989256858825684, "step": 2260 }, { "epoch": 0.3632, "grad_norm": 38.11482514096183, "learning_rate": 6.3696e-07, "logits/chosen": 0.7615722417831421, "logits/rejected": 0.862622082233429, "logps/chosen": -250.33749389648438, "logps/rejected": -209.0500030517578, "loss": 0.573, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.598461925983429, "rewards/margins": 3.2533202171325684, "rewards/rejected": -3.849609375, "step": 2270 }, { "epoch": 0.3648, "grad_norm": 72.56849632041039, "learning_rate": 6.3536e-07, "logits/chosen": 0.7615722417831421, "logits/rejected": 0.962890625, "logps/chosen": -244.3000030517578, "logps/rejected": -235.6374969482422, "loss": 0.7781, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5746338367462158, "rewards/margins": 2.2955565452575684, "rewards/rejected": -3.8706297874450684, "step": 2280 }, { "epoch": 0.3664, "grad_norm": 35.55590716589917, "learning_rate": 6.337599999999999e-07, "logits/chosen": null, "logits/rejected": 0.96356201171875, "logps/chosen": -254.96249389648438, "logps/rejected": -217.0, "loss": 0.6925, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9073486328125, "rewards/margins": 2.370312452316284, "rewards/rejected": -4.272802829742432, "step": 2290 }, { "epoch": 0.368, "grad_norm": 182.36255213022955, "learning_rate": 6.3216e-07, "logits/chosen": 0.657214343547821, "logits/rejected": 0.829345703125, "logps/chosen": -238.52499389648438, 
"logps/rejected": -220.1750030517578, "loss": 0.6577, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5509154796600342, "rewards/margins": 2.3133788108825684, "rewards/rejected": -3.8617186546325684, "step": 2300 }, { "epoch": 0.3696, "grad_norm": 147.9272474981046, "learning_rate": 6.3056e-07, "logits/chosen": 0.5813354253768921, "logits/rejected": 0.737133800983429, "logps/chosen": -250.66250610351562, "logps/rejected": -236.625, "loss": 0.5689, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.114306688308716, "rewards/margins": 2.774609327316284, "rewards/rejected": -4.888867378234863, "step": 2310 }, { "epoch": 0.3712, "grad_norm": 116.06417023087921, "learning_rate": 6.289599999999999e-07, "logits/chosen": 0.6854492425918579, "logits/rejected": 0.9071044921875, "logps/chosen": -231.1750030517578, "logps/rejected": -219.9499969482422, "loss": 0.5702, "rewards/accuracies": 0.75, "rewards/chosen": -2.0658202171325684, "rewards/margins": 2.7840819358825684, "rewards/rejected": -4.845312595367432, "step": 2320 }, { "epoch": 0.3728, "grad_norm": 186.07827868246676, "learning_rate": 6.2736e-07, "logits/chosen": 0.7566894292831421, "logits/rejected": 0.9075927734375, "logps/chosen": -230.625, "logps/rejected": -220.0, "loss": 0.6314, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.627050757408142, "rewards/margins": 2.58013916015625, "rewards/rejected": -4.205273628234863, "step": 2330 }, { "epoch": 0.3744, "grad_norm": 115.63429376283501, "learning_rate": 6.2576e-07, "logits/chosen": 0.665515124797821, "logits/rejected": 0.774932861328125, "logps/chosen": -277.2124938964844, "logps/rejected": -212.53750610351562, "loss": 0.5375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.934497058391571, "rewards/margins": 2.901562452316284, "rewards/rejected": -3.832714796066284, "step": 2340 }, { "epoch": 0.376, "grad_norm": 122.91610972272822, "learning_rate": 6.2416e-07, "logits/chosen": 0.7156738042831421, 
"logits/rejected": 0.8314208984375, "logps/chosen": -240.22500610351562, "logps/rejected": -212.85000610351562, "loss": 0.9552, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.134033203125, "rewards/margins": 1.863037109375, "rewards/rejected": -2.9966797828674316, "step": 2350 }, { "epoch": 0.3776, "grad_norm": 101.90796265947546, "learning_rate": 6.225599999999999e-07, "logits/chosen": 0.8014770746231079, "logits/rejected": 0.96826171875, "logps/chosen": -251.03750610351562, "logps/rejected": -216.66250610351562, "loss": 0.5233, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.930468738079071, "rewards/margins": 2.815380811691284, "rewards/rejected": -3.746875047683716, "step": 2360 }, { "epoch": 0.3792, "grad_norm": 78.44230568033473, "learning_rate": 6.209599999999999e-07, "logits/chosen": 0.7076660394668579, "logits/rejected": 0.866198718547821, "logps/chosen": -228.0124969482422, "logps/rejected": -206.125, "loss": 0.5426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8232787847518921, "rewards/margins": 3.203808546066284, "rewards/rejected": -4.024218559265137, "step": 2370 }, { "epoch": 0.3808, "grad_norm": 80.3797240754834, "learning_rate": 6.1936e-07, "logits/chosen": 0.654650866985321, "logits/rejected": 0.772534191608429, "logps/chosen": -260.29998779296875, "logps/rejected": -216.41250610351562, "loss": 0.7298, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.428198218345642, "rewards/margins": 2.495410203933716, "rewards/rejected": -3.9224610328674316, "step": 2380 }, { "epoch": 0.3824, "grad_norm": 84.1325652881116, "learning_rate": 6.1776e-07, "logits/chosen": 0.6282714605331421, "logits/rejected": 0.8133789300918579, "logps/chosen": -247.91250610351562, "logps/rejected": -230.4499969482422, "loss": 0.4771, "rewards/accuracies": 0.8125, "rewards/chosen": -1.09307861328125, "rewards/margins": 2.9456787109375, "rewards/rejected": -4.039746284484863, "step": 2390 }, { "epoch": 0.384, 
"grad_norm": 100.02562309404017, "learning_rate": 6.1616e-07, "logits/chosen": 0.5626465082168579, "logits/rejected": 0.7557617425918579, "logps/chosen": -227.64999389648438, "logps/rejected": -223.3625030517578, "loss": 0.6324, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5999206304550171, "rewards/margins": 2.9884276390075684, "rewards/rejected": -3.591015577316284, "step": 2400 }, { "epoch": 0.3856, "grad_norm": 77.26975675790722, "learning_rate": 6.1456e-07, "logits/chosen": 0.6801818609237671, "logits/rejected": 0.848071277141571, "logps/chosen": -222.875, "logps/rejected": -201.0749969482422, "loss": 0.6516, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.416772484779358, "rewards/margins": 2.528125047683716, "rewards/rejected": -3.943164110183716, "step": 2410 }, { "epoch": 0.3872, "grad_norm": 171.53756616292787, "learning_rate": 6.1296e-07, "logits/chosen": 0.6691650152206421, "logits/rejected": 0.7833007574081421, "logps/chosen": -254.5500030517578, "logps/rejected": -226.16250610351562, "loss": 0.4868, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6100708246231079, "rewards/margins": 3.36865234375, "rewards/rejected": -3.975756883621216, "step": 2420 }, { "epoch": 0.3888, "grad_norm": 79.04109910807037, "learning_rate": 6.113599999999999e-07, "logits/chosen": 0.5101379156112671, "logits/rejected": 0.709826648235321, "logps/chosen": -253.22500610351562, "logps/rejected": -222.14999389648438, "loss": 0.6505, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7132751941680908, "rewards/margins": 2.599902391433716, "rewards/rejected": -4.312548637390137, "step": 2430 }, { "epoch": 0.3904, "grad_norm": 50.14019801331669, "learning_rate": 6.097599999999999e-07, "logits/chosen": 0.6258178949356079, "logits/rejected": 0.704882800579071, "logps/chosen": -237.85000610351562, "logps/rejected": -202.3874969482422, "loss": 0.6304, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 
-1.3046753406524658, "rewards/margins": 3.0289063453674316, "rewards/rejected": -4.3271484375, "step": 2440 }, { "epoch": 0.392, "grad_norm": 151.4877946009793, "learning_rate": 6.0816e-07, "logits/chosen": 0.5773345828056335, "logits/rejected": 0.741625964641571, "logps/chosen": -212.14999389648438, "logps/rejected": -187.1062469482422, "loss": 0.5621, "rewards/accuracies": 0.71875, "rewards/chosen": -1.169531226158142, "rewards/margins": 2.4952635765075684, "rewards/rejected": -3.6631836891174316, "step": 2450 }, { "epoch": 0.3936, "grad_norm": 88.39089504871788, "learning_rate": 6.0656e-07, "logits/chosen": 0.70355224609375, "logits/rejected": 0.8645995855331421, "logps/chosen": -220.1125030517578, "logps/rejected": -187.8874969482422, "loss": 0.423, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5592772960662842, "rewards/margins": 3.029980421066284, "rewards/rejected": -4.590039253234863, "step": 2460 }, { "epoch": 0.3952, "grad_norm": 37.26774464911289, "learning_rate": 6.0496e-07, "logits/chosen": 0.6357421875, "logits/rejected": 0.691088855266571, "logps/chosen": -241.2624969482422, "logps/rejected": -192.8125, "loss": 0.5322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1833007335662842, "rewards/margins": 2.824169874191284, "rewards/rejected": -4.00390625, "step": 2470 }, { "epoch": 0.3968, "grad_norm": 119.06985338945655, "learning_rate": 6.0336e-07, "logits/chosen": 0.5555664300918579, "logits/rejected": 0.7391601800918579, "logps/chosen": -244.8000030517578, "logps/rejected": -209.25, "loss": 0.6415, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4481933116912842, "rewards/margins": 2.5640625953674316, "rewards/rejected": -4.009863376617432, "step": 2480 }, { "epoch": 0.3984, "grad_norm": 62.34344902734852, "learning_rate": 6.0176e-07, "logits/chosen": 0.6489502191543579, "logits/rejected": 0.7471069097518921, "logps/chosen": -291.23748779296875, "logps/rejected": -265.26251220703125, "loss": 0.8521, 
"rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.806188941001892, "rewards/margins": 2.533886671066284, "rewards/rejected": -4.338281154632568, "step": 2490 }, { "epoch": 0.4, "grad_norm": 114.87512786976059, "learning_rate": 6.0016e-07, "logits/chosen": 0.5997864007949829, "logits/rejected": 0.725659191608429, "logps/chosen": -253.58749389648438, "logps/rejected": -226.4875030517578, "loss": 0.6254, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.17919921875, "rewards/margins": 2.7015624046325684, "rewards/rejected": -4.880273342132568, "step": 2500 }, { "epoch": 0.4016, "grad_norm": 153.7681943044082, "learning_rate": 5.985599999999999e-07, "logits/chosen": 0.693603515625, "logits/rejected": 0.8144165277481079, "logps/chosen": -269.38751220703125, "logps/rejected": -245.6875, "loss": 0.5658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4209227561950684, "rewards/margins": 2.5579590797424316, "rewards/rejected": -4.979784965515137, "step": 2510 }, { "epoch": 0.4032, "grad_norm": 140.17047949538733, "learning_rate": 5.9696e-07, "logits/chosen": 0.6505371332168579, "logits/rejected": 0.7375427484512329, "logps/chosen": -236.58749389648438, "logps/rejected": -221.2624969482422, "loss": 0.4705, "rewards/accuracies": 0.84375, "rewards/chosen": -2.586718797683716, "rewards/margins": 2.831249952316284, "rewards/rejected": -5.418554782867432, "step": 2520 }, { "epoch": 0.4048, "grad_norm": 105.4197420259573, "learning_rate": 5.9536e-07, "logits/chosen": 0.6558593511581421, "logits/rejected": 0.7953246831893921, "logps/chosen": -248.1374969482422, "logps/rejected": -216.1125030517578, "loss": 0.467, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6210448741912842, "rewards/margins": 3.527636766433716, "rewards/rejected": -5.144921779632568, "step": 2530 }, { "epoch": 0.4064, "grad_norm": 196.24923696319055, "learning_rate": 5.937599999999999e-07, "logits/chosen": 0.647595226764679, "logits/rejected": 0.78125, "logps/chosen": 
-267.17498779296875, "logps/rejected": -215.3625030517578, "loss": 0.6576, "rewards/accuracies": 0.78125, "rewards/chosen": -3.032910108566284, "rewards/margins": 2.757861375808716, "rewards/rejected": -5.790625095367432, "step": 2540 }, { "epoch": 0.408, "grad_norm": 190.18215900755632, "learning_rate": 5.9216e-07, "logits/chosen": 0.721789538860321, "logits/rejected": 0.892749011516571, "logps/chosen": -241.8000030517578, "logps/rejected": -199.75, "loss": 0.5397, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3612303733825684, "rewards/margins": 2.616748094558716, "rewards/rejected": -5.975781440734863, "step": 2550 }, { "epoch": 0.4096, "grad_norm": 171.42174379703474, "learning_rate": 5.9056e-07, "logits/chosen": 0.5922485589981079, "logits/rejected": 0.777905285358429, "logps/chosen": -265.32501220703125, "logps/rejected": -241.0, "loss": 0.7378, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.836230516433716, "rewards/margins": 2.822265625, "rewards/rejected": -5.662011623382568, "step": 2560 }, { "epoch": 0.4112, "grad_norm": 139.1503242572505, "learning_rate": 5.889600000000001e-07, "logits/chosen": 0.65142822265625, "logits/rejected": 0.8030334711074829, "logps/chosen": -230.7375030517578, "logps/rejected": -219.1125030517578, "loss": 0.6888, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5654540061950684, "rewards/margins": 2.033935546875, "rewards/rejected": -4.599511623382568, "step": 2570 }, { "epoch": 0.4128, "grad_norm": 154.34632557959182, "learning_rate": 5.873599999999999e-07, "logits/chosen": 0.5485168695449829, "logits/rejected": 0.669903576374054, "logps/chosen": -263.13751220703125, "logps/rejected": -224.5500030517578, "loss": 0.7721, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6251952648162842, "rewards/margins": 2.7750000953674316, "rewards/rejected": -4.397656440734863, "step": 2580 }, { "epoch": 0.4144, "grad_norm": 68.12304173249493, "learning_rate": 5.857599999999999e-07, 
"logits/chosen": 0.507952868938446, "logits/rejected": 0.678210437297821, "logps/chosen": -247.5124969482422, "logps/rejected": -197.1125030517578, "loss": 0.3836, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.708703637123108, "rewards/margins": 3.3333497047424316, "rewards/rejected": -5.044531345367432, "step": 2590 }, { "epoch": 0.416, "grad_norm": 166.18285041808818, "learning_rate": 5.8416e-07, "logits/chosen": 0.4722228944301605, "logits/rejected": 0.5632904171943665, "logps/chosen": -266.2875061035156, "logps/rejected": -226.35000610351562, "loss": 0.5015, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1387450695037842, "rewards/margins": 3.3224120140075684, "rewards/rejected": -4.460156440734863, "step": 2600 }, { "epoch": 0.4176, "grad_norm": 223.5573327863398, "learning_rate": 5.825599999999999e-07, "logits/chosen": 0.42528074979782104, "logits/rejected": 0.571826159954071, "logps/chosen": -251.39999389648438, "logps/rejected": -237.1374969482422, "loss": 0.674, "rewards/accuracies": 0.71875, "rewards/chosen": -1.968969702720642, "rewards/margins": 3.3397459983825684, "rewards/rejected": -5.305859565734863, "step": 2610 }, { "epoch": 0.4192, "grad_norm": 108.05776464667896, "learning_rate": 5.8096e-07, "logits/chosen": 0.5618225336074829, "logits/rejected": 0.723553478717804, "logps/chosen": -250.64999389648438, "logps/rejected": -220.3625030517578, "loss": 0.756, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2275147438049316, "rewards/margins": 2.3970704078674316, "rewards/rejected": -5.626172065734863, "step": 2620 }, { "epoch": 0.4208, "grad_norm": 140.94248775753996, "learning_rate": 5.7936e-07, "logits/chosen": NaN, "logits/rejected": 0.8041747808456421, "logps/chosen": -222.375, "logps/rejected": -199.85000610351562, "loss": 0.5667, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.053466796875, "rewards/margins": 2.876757860183716, "rewards/rejected": -5.92919921875, "step": 2630 }, { 
"epoch": 0.4224, "grad_norm": 201.0403423910384, "learning_rate": 5.777600000000001e-07, "logits/chosen": 0.7263549566268921, "logits/rejected": 0.8509155511856079, "logps/chosen": -217.77499389648438, "logps/rejected": -220.28750610351562, "loss": 0.8184, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9300658702850342, "rewards/margins": 2.7294921875, "rewards/rejected": -4.658007621765137, "step": 2640 }, { "epoch": 0.424, "grad_norm": 62.56802019265078, "learning_rate": 5.761599999999999e-07, "logits/chosen": 0.574084460735321, "logits/rejected": 0.6549926996231079, "logps/chosen": -254.77499389648438, "logps/rejected": -232.39999389648438, "loss": 0.5131, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.532617211341858, "rewards/margins": 3.368896484375, "rewards/rejected": -4.900781154632568, "step": 2650 }, { "epoch": 0.4256, "grad_norm": 90.46822024528534, "learning_rate": 5.745599999999999e-07, "logits/chosen": NaN, "logits/rejected": 0.76318359375, "logps/chosen": -247.83749389648438, "logps/rejected": -224.7624969482422, "loss": 0.5824, "rewards/accuracies": 0.75, "rewards/chosen": -2.5792479515075684, "rewards/margins": 2.9717774391174316, "rewards/rejected": -5.548828125, "step": 2660 }, { "epoch": 0.4272, "grad_norm": 131.23456292113664, "learning_rate": 5.7296e-07, "logits/chosen": 0.609997570514679, "logits/rejected": 0.7974869012832642, "logps/chosen": -261.2250061035156, "logps/rejected": -242.78750610351562, "loss": 0.4664, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.078369140625, "rewards/margins": 3.491406202316284, "rewards/rejected": -5.568359375, "step": 2670 }, { "epoch": 0.4288, "grad_norm": 127.49377458001084, "learning_rate": 5.7136e-07, "logits/chosen": 0.6330810785293579, "logits/rejected": 0.756542980670929, "logps/chosen": -241.97500610351562, "logps/rejected": -214.0749969482422, "loss": 0.5458, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.918115258216858, 
"rewards/margins": 3.219775438308716, "rewards/rejected": -5.13623046875, "step": 2680 }, { "epoch": 0.4304, "grad_norm": 208.6408353630427, "learning_rate": 5.6976e-07, "logits/chosen": 0.6503661870956421, "logits/rejected": 0.771557629108429, "logps/chosen": -282.42498779296875, "logps/rejected": -242.53750610351562, "loss": 0.7571, "rewards/accuracies": 0.78125, "rewards/chosen": -1.575415015220642, "rewards/margins": 3.038378953933716, "rewards/rejected": -4.612011909484863, "step": 2690 }, { "epoch": 0.432, "grad_norm": 91.73048595826702, "learning_rate": 5.6816e-07, "logits/chosen": 0.6047118902206421, "logits/rejected": 0.815966784954071, "logps/chosen": -231.3125, "logps/rejected": -205.6125030517578, "loss": 0.5449, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.814477562904358, "rewards/margins": 2.644726514816284, "rewards/rejected": -4.459765434265137, "step": 2700 }, { "epoch": 0.4336, "grad_norm": 147.87979802638233, "learning_rate": 5.6656e-07, "logits/chosen": 0.625396728515625, "logits/rejected": 0.772326648235321, "logps/chosen": -256.2124938964844, "logps/rejected": -238.41250610351562, "loss": 0.6432, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8225586414337158, "rewards/margins": 2.865283250808716, "rewards/rejected": -4.684374809265137, "step": 2710 }, { "epoch": 0.4352, "grad_norm": 166.74613871046267, "learning_rate": 5.649599999999999e-07, "logits/chosen": 0.6835082769393921, "logits/rejected": 0.793505847454071, "logps/chosen": -248.7375030517578, "logps/rejected": -223.7375030517578, "loss": 0.6812, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4861328601837158, "rewards/margins": 3.0428709983825684, "rewards/rejected": -4.530468940734863, "step": 2720 }, { "epoch": 0.4368, "grad_norm": 123.45277796372092, "learning_rate": 5.633599999999999e-07, "logits/chosen": 0.6211883425712585, "logits/rejected": 0.7761596441268921, "logps/chosen": -242.60000610351562, "logps/rejected": 
-226.7375030517578, "loss": 0.3879, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3355469703674316, "rewards/margins": 3.3915038108825684, "rewards/rejected": -5.727734565734863, "step": 2730 }, { "epoch": 0.4384, "grad_norm": 155.28248808234613, "learning_rate": 5.6176e-07, "logits/chosen": 0.6507323980331421, "logits/rejected": 0.809643566608429, "logps/chosen": -260.86248779296875, "logps/rejected": -233.9499969482422, "loss": 0.6403, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.678082227706909, "rewards/margins": 2.731152296066284, "rewards/rejected": -5.4111328125, "step": 2740 }, { "epoch": 0.44, "grad_norm": 100.65696691199474, "learning_rate": 5.6016e-07, "logits/chosen": 0.6201537847518921, "logits/rejected": 0.7697998285293579, "logps/chosen": -235.8125, "logps/rejected": -207.25, "loss": 0.445, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.562206983566284, "rewards/margins": 2.808789014816284, "rewards/rejected": -6.370312690734863, "step": 2750 }, { "epoch": 0.4416, "grad_norm": 166.06118257136015, "learning_rate": 5.585599999999999e-07, "logits/chosen": 0.771533191204071, "logits/rejected": 0.921826183795929, "logps/chosen": -249.375, "logps/rejected": -192.0500030517578, "loss": 0.5237, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.951709032058716, "rewards/margins": 2.931396484375, "rewards/rejected": -5.880078315734863, "step": 2760 }, { "epoch": 0.4432, "grad_norm": 128.45563999928487, "learning_rate": 5.5696e-07, "logits/chosen": 0.6591430902481079, "logits/rejected": 0.770825207233429, "logps/chosen": -244.6374969482422, "logps/rejected": -243.27499389648438, "loss": 0.672, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.149365186691284, "rewards/margins": 2.8584227561950684, "rewards/rejected": -5.003222465515137, "step": 2770 }, { "epoch": 0.4448, "grad_norm": 35.79960626531477, "learning_rate": 5.5536e-07, "logits/chosen": 0.751452624797821, "logits/rejected": 
0.8978637456893921, "logps/chosen": -256.54998779296875, "logps/rejected": -214.5, "loss": 0.4795, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.649169921875, "rewards/margins": 3.259960889816284, "rewards/rejected": -5.908251762390137, "step": 2780 }, { "epoch": 0.4464, "grad_norm": 107.7233118577926, "learning_rate": 5.537600000000001e-07, "logits/chosen": 0.5712035894393921, "logits/rejected": 0.673022449016571, "logps/chosen": -248.625, "logps/rejected": -217.97500610351562, "loss": 0.5474, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.155029296875, "rewards/margins": 2.9395508766174316, "rewards/rejected": -5.091796875, "step": 2790 }, { "epoch": 0.448, "grad_norm": 100.81904449373147, "learning_rate": 5.521599999999999e-07, "logits/chosen": 0.660449206829071, "logits/rejected": 0.8841797113418579, "logps/chosen": -239.4375, "logps/rejected": -228.85000610351562, "loss": 0.771, "rewards/accuracies": 0.75, "rewards/chosen": -2.456298828125, "rewards/margins": 2.580127000808716, "rewards/rejected": -5.036328315734863, "step": 2800 }, { "epoch": 0.4496, "grad_norm": 151.0311747124155, "learning_rate": 5.5056e-07, "logits/chosen": 0.6712890863418579, "logits/rejected": 0.8293396234512329, "logps/chosen": -233.5124969482422, "logps/rejected": -208.875, "loss": 0.6546, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.485913038253784, "rewards/margins": 2.6935057640075684, "rewards/rejected": -5.179345607757568, "step": 2810 }, { "epoch": 0.4512, "grad_norm": 141.71221498043155, "learning_rate": 5.4896e-07, "logits/chosen": 0.7000732421875, "logits/rejected": 0.8107376098632812, "logps/chosen": -234.8625030517578, "logps/rejected": -214.0124969482422, "loss": 0.5446, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4765625, "rewards/margins": 2.4646973609924316, "rewards/rejected": -4.941113471984863, "step": 2820 }, { "epoch": 0.4528, "grad_norm": 91.37375251440345, "learning_rate": 5.473599999999999e-07, 
"logits/chosen": 0.6611572504043579, "logits/rejected": 0.823046863079071, "logps/chosen": -253.33749389648438, "logps/rejected": -230.77499389648438, "loss": 0.563, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0724120140075684, "rewards/margins": 2.7376465797424316, "rewards/rejected": -5.80859375, "step": 2830 }, { "epoch": 0.4544, "grad_norm": 147.50535553291277, "learning_rate": 5.4576e-07, "logits/chosen": 0.786334216594696, "logits/rejected": 0.9217773675918579, "logps/chosen": -240.85000610351562, "logps/rejected": -229.5124969482422, "loss": 0.5069, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7281737327575684, "rewards/margins": 2.873242139816284, "rewards/rejected": -5.604101657867432, "step": 2840 }, { "epoch": 0.456, "grad_norm": 122.67157773324723, "learning_rate": 5.4416e-07, "logits/chosen": 0.613940417766571, "logits/rejected": 0.833587646484375, "logps/chosen": -220.6125030517578, "logps/rejected": -193.4250030517578, "loss": 0.6878, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2653260231018066, "rewards/margins": 2.818066358566284, "rewards/rejected": -5.084570407867432, "step": 2850 }, { "epoch": 0.4576, "grad_norm": 110.48867092094011, "learning_rate": 5.425600000000001e-07, "logits/chosen": 0.649169921875, "logits/rejected": 0.8060547113418579, "logps/chosen": -219.6374969482422, "logps/rejected": -212.1125030517578, "loss": 0.4667, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.68475341796875, "rewards/margins": 2.8666014671325684, "rewards/rejected": -4.55078125, "step": 2860 }, { "epoch": 0.4592, "grad_norm": 175.30276977491582, "learning_rate": 5.409599999999999e-07, "logits/chosen": 0.651690661907196, "logits/rejected": 0.8277832269668579, "logps/chosen": -220.75, "logps/rejected": -206.2624969482422, "loss": 0.7066, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.12493896484375, "rewards/margins": 2.908398389816284, "rewards/rejected": -4.03662109375, 
"step": 2870 }, { "epoch": 0.4608, "grad_norm": 334.8974576359162, "learning_rate": 5.393599999999999e-07, "logits/chosen": 0.695098876953125, "logits/rejected": NaN, "logps/chosen": -257.42498779296875, "logps/rejected": -220.8000030517578, "loss": 0.5169, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1355712413787842, "rewards/margins": 3.124316453933716, "rewards/rejected": -4.258203029632568, "step": 2880 }, { "epoch": 0.4624, "grad_norm": 195.72265332143817, "learning_rate": 5.3776e-07, "logits/chosen": 0.627185046672821, "logits/rejected": 0.8171173334121704, "logps/chosen": -258.67498779296875, "logps/rejected": -209.0124969482422, "loss": 0.5209, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5839111804962158, "rewards/margins": 3.0150389671325684, "rewards/rejected": -4.598242282867432, "step": 2890 }, { "epoch": 0.464, "grad_norm": 172.94830278641876, "learning_rate": 5.361599999999999e-07, "logits/chosen": 0.625378429889679, "logits/rejected": 0.737579345703125, "logps/chosen": -252.28750610351562, "logps/rejected": -212.96249389648438, "loss": 0.6763, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7634032964706421, "rewards/margins": 3.078320264816284, "rewards/rejected": -3.843066453933716, "step": 2900 }, { "epoch": 0.4656, "grad_norm": 147.21344187294133, "learning_rate": 5.3456e-07, "logits/chosen": 0.584307849407196, "logits/rejected": 0.775134265422821, "logps/chosen": -255.125, "logps/rejected": -207.2624969482422, "loss": 0.6531, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.730859398841858, "rewards/margins": 2.950732469558716, "rewards/rejected": -4.678906440734863, "step": 2910 }, { "epoch": 0.4672, "grad_norm": 194.75068980697992, "learning_rate": 5.3296e-07, "logits/chosen": 0.720013439655304, "logits/rejected": 0.8738037347793579, "logps/chosen": -277.42498779296875, "logps/rejected": -224.47500610351562, "loss": 0.5982, "rewards/accuracies": 0.762499988079071, 
"rewards/chosen": -1.822265625, "rewards/margins": 2.938720703125, "rewards/rejected": -4.761889457702637, "step": 2920 }, { "epoch": 0.4688, "grad_norm": 85.06649745696132, "learning_rate": 5.313600000000001e-07, "logits/chosen": 0.626220703125, "logits/rejected": 0.7823486328125, "logps/chosen": -274.79998779296875, "logps/rejected": -243.5, "loss": 0.6761, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.872607469558716, "rewards/margins": 2.91943359375, "rewards/rejected": -5.791406154632568, "step": 2930 }, { "epoch": 0.4704, "grad_norm": 67.6497858180999, "learning_rate": 5.2976e-07, "logits/chosen": 0.719500720500946, "logits/rejected": 0.7916015386581421, "logps/chosen": -264.5625, "logps/rejected": -251.53750610351562, "loss": 0.5633, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.876025438308716, "rewards/margins": 3.0220704078674316, "rewards/rejected": -5.896679878234863, "step": 2940 }, { "epoch": 0.472, "grad_norm": 86.30205368419041, "learning_rate": 5.281599999999999e-07, "logits/chosen": 0.7191406488418579, "logits/rejected": 0.8896728754043579, "logps/chosen": -241.9250030517578, "logps/rejected": -207.78750610351562, "loss": 0.697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.994433641433716, "rewards/margins": 2.38037109375, "rewards/rejected": -5.37646484375, "step": 2950 }, { "epoch": 0.4736, "grad_norm": 164.03342948632562, "learning_rate": 5.2656e-07, "logits/chosen": 0.512133777141571, "logits/rejected": 0.6598266363143921, "logps/chosen": -257.7875061035156, "logps/rejected": -231.8000030517578, "loss": 0.7769, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.976269483566284, "rewards/margins": 2.633593797683716, "rewards/rejected": -5.614062309265137, "step": 2960 }, { "epoch": 0.4752, "grad_norm": 109.52304801341582, "learning_rate": 5.2496e-07, "logits/chosen": 0.634552001953125, "logits/rejected": 0.804516613483429, "logps/chosen": -241.9375, "logps/rejected": -225.8125, 
"loss": 0.6068, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.103320360183716, "rewards/margins": 2.734814405441284, "rewards/rejected": -5.837109565734863, "step": 2970 }, { "epoch": 0.4768, "grad_norm": 118.84921304135302, "learning_rate": 5.2336e-07, "logits/chosen": 0.7454833984375, "logits/rejected": 0.878125011920929, "logps/chosen": -241.28750610351562, "logps/rejected": -232.125, "loss": 0.6293, "rewards/accuracies": 0.78125, "rewards/chosen": -2.80908203125, "rewards/margins": 2.956738233566284, "rewards/rejected": -5.768359184265137, "step": 2980 }, { "epoch": 0.4784, "grad_norm": 115.01969677634393, "learning_rate": 5.2176e-07, "logits/chosen": 0.805468738079071, "logits/rejected": 0.9224609136581421, "logps/chosen": -263.8999938964844, "logps/rejected": -228.4875030517578, "loss": 0.6016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.16748046875, "rewards/margins": 2.1290526390075684, "rewards/rejected": -5.297070503234863, "step": 2990 }, { "epoch": 0.48, "grad_norm": 138.14115989134893, "learning_rate": 5.2016e-07, "logits/chosen": 0.7022460699081421, "logits/rejected": 0.8174804449081421, "logps/chosen": -246.58749389648438, "logps/rejected": -235.77499389648438, "loss": 0.5927, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0460448265075684, "rewards/margins": 2.786083936691284, "rewards/rejected": -4.833398342132568, "step": 3000 }, { "epoch": 0.4816, "grad_norm": 97.70299666290603, "learning_rate": 5.1856e-07, "logits/chosen": 0.7508300542831421, "logits/rejected": 0.9314788579940796, "logps/chosen": -251.53750610351562, "logps/rejected": -204.7375030517578, "loss": 0.5541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8261961936950684, "rewards/margins": 2.510498046875, "rewards/rejected": -5.339648246765137, "step": 3010 }, { "epoch": 0.4832, "grad_norm": 116.19503453815493, "learning_rate": 5.169599999999999e-07, "logits/chosen": 0.8140869140625, "logits/rejected": 
1.035986304283142, "logps/chosen": -219.4375, "logps/rejected": -213.3874969482422, "loss": 0.6416, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.48834228515625, "rewards/margins": 2.423583984375, "rewards/rejected": -4.908593654632568, "step": 3020 }, { "epoch": 0.4848, "grad_norm": 211.3458083011255, "learning_rate": 5.1536e-07, "logits/chosen": 0.8699951171875, "logits/rejected": 0.9072021245956421, "logps/chosen": -245.47500610351562, "logps/rejected": -224.1374969482422, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": -2.343066453933716, "rewards/margins": 2.8250489234924316, "rewards/rejected": -5.172461032867432, "step": 3030 }, { "epoch": 0.4864, "grad_norm": 211.36257523476112, "learning_rate": 5.1376e-07, "logits/chosen": 0.7731689214706421, "logits/rejected": 0.888439953327179, "logps/chosen": -257.11248779296875, "logps/rejected": -202.16250610351562, "loss": 0.5196, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.317138671875, "rewards/margins": 2.562060594558716, "rewards/rejected": -4.879101753234863, "step": 3040 }, { "epoch": 0.488, "grad_norm": 90.10869896317409, "learning_rate": 5.121599999999999e-07, "logits/chosen": 0.808032214641571, "logits/rejected": 0.9343017339706421, "logps/chosen": -267.26251220703125, "logps/rejected": -219.5, "loss": 0.5436, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.707006812095642, "rewards/margins": 3.28076171875, "rewards/rejected": -4.984961032867432, "step": 3050 }, { "epoch": 0.4896, "grad_norm": 22.835919024810803, "learning_rate": 5.1056e-07, "logits/chosen": 0.7641845941543579, "logits/rejected": 0.922607421875, "logps/chosen": -244.9375, "logps/rejected": -228.3125, "loss": 0.6969, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.082922339439392, "rewards/margins": 3.2518553733825684, "rewards/rejected": -4.3359375, "step": 3060 }, { "epoch": 0.4912, "grad_norm": 122.70223042177881, "learning_rate": 5.0896e-07, "logits/chosen": 
0.75384521484375, "logits/rejected": 0.872174084186554, "logps/chosen": -245.5, "logps/rejected": -204.3125, "loss": 0.6831, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27756959199905396, "rewards/margins": 2.6890625953674316, "rewards/rejected": -2.965405225753784, "step": 3070 }, { "epoch": 0.4928, "grad_norm": 38.52734658804351, "learning_rate": 5.0736e-07, "logits/chosen": 0.707763671875, "logits/rejected": 0.8760741949081421, "logps/chosen": -240.2624969482422, "logps/rejected": -218.0500030517578, "loss": 0.6825, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1358153820037842, "rewards/margins": 3.143115282058716, "rewards/rejected": -4.283398628234863, "step": 3080 }, { "epoch": 0.4944, "grad_norm": 95.45157329991619, "learning_rate": 5.057599999999999e-07, "logits/chosen": NaN, "logits/rejected": 0.92315673828125, "logps/chosen": -243.9375, "logps/rejected": -434.3125, "loss": 0.5596, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7620849609375, "rewards/margins": 7.066552639007568, "rewards/rejected": -8.832568168640137, "step": 3090 }, { "epoch": 0.496, "grad_norm": 83.04598659902403, "learning_rate": 5.0416e-07, "logits/chosen": 0.7906249761581421, "logits/rejected": 0.9541015625, "logps/chosen": -238.8249969482422, "logps/rejected": -201.5749969482422, "loss": 0.5641, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5287353992462158, "rewards/margins": 3.3692870140075684, "rewards/rejected": -4.897558689117432, "step": 3100 }, { "epoch": 0.4976, "grad_norm": 71.39636567364606, "learning_rate": 5.0256e-07, "logits/chosen": 0.779675304889679, "logits/rejected": 0.8396240472793579, "logps/chosen": -265.2250061035156, "logps/rejected": -224.9499969482422, "loss": 0.4392, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.815893530845642, "rewards/margins": 3.5870361328125, "rewards/rejected": -5.401562690734863, "step": 3110 }, { "epoch": 0.4992, "grad_norm": 104.70805956869181, "learning_rate": 
5.009599999999999e-07, "logits/chosen": 0.8216308355331421, "logits/rejected": 0.858325183391571, "logps/chosen": -241.8874969482422, "logps/rejected": -218.9375, "loss": 0.8275, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.801806688308716, "rewards/margins": 2.213183641433716, "rewards/rejected": -5.014843940734863, "step": 3120 }, { "epoch": 0.5008, "grad_norm": 197.74988092504987, "learning_rate": 4.9936e-07, "logits/chosen": 0.774169921875, "logits/rejected": 0.9549804925918579, "logps/chosen": -240.5625, "logps/rejected": -217.8125, "loss": 0.6888, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0562500953674316, "rewards/margins": 2.3226318359375, "rewards/rejected": -5.377539157867432, "step": 3130 }, { "epoch": 0.5024, "grad_norm": 125.28365619103961, "learning_rate": 4.9776e-07, "logits/chosen": 0.779467761516571, "logits/rejected": 0.8573364019393921, "logps/chosen": -236.6875, "logps/rejected": -218.5124969482422, "loss": 0.5514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.673144578933716, "rewards/margins": 2.977099657058716, "rewards/rejected": -5.650390625, "step": 3140 }, { "epoch": 0.504, "grad_norm": 94.78844192555279, "learning_rate": 4.9616e-07, "logits/chosen": 0.7393432855606079, "logits/rejected": 1.0047180652618408, "logps/chosen": -238.2624969482422, "logps/rejected": -213.41250610351562, "loss": 0.6115, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.43896484375, "rewards/margins": 3.111035108566284, "rewards/rejected": -5.553027153015137, "step": 3150 }, { "epoch": 0.5056, "grad_norm": 54.1551006790649, "learning_rate": 4.9456e-07, "logits/chosen": 0.7741943597793579, "logits/rejected": 0.8946777582168579, "logps/chosen": -248.9375, "logps/rejected": -216.14999389648438, "loss": 0.5191, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.703320264816284, "rewards/margins": 2.891406297683716, "rewards/rejected": -5.596093654632568, "step": 3160 }, { "epoch": 0.5072, 
"grad_norm": 134.96077896786642, "learning_rate": 4.929599999999999e-07, "logits/chosen": 0.695629894733429, "logits/rejected": 0.8460693359375, "logps/chosen": -246.9250030517578, "logps/rejected": -219.83749389648438, "loss": 0.6073, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.126477003097534, "rewards/margins": 2.6314454078674316, "rewards/rejected": -5.762109279632568, "step": 3170 }, { "epoch": 0.5088, "grad_norm": 58.36066953015736, "learning_rate": 4.9136e-07, "logits/chosen": 0.714672863483429, "logits/rejected": 0.837603747844696, "logps/chosen": -251.5, "logps/rejected": -223.66250610351562, "loss": 0.5619, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.28143310546875, "rewards/margins": 2.962207078933716, "rewards/rejected": -5.243261814117432, "step": 3180 }, { "epoch": 0.5104, "grad_norm": 97.48518610456644, "learning_rate": 4.897599999999999e-07, "logits/chosen": 0.799267590045929, "logits/rejected": 0.981640636920929, "logps/chosen": -241.6125030517578, "logps/rejected": -225.9250030517578, "loss": 0.5776, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.087646484375, "rewards/margins": 3.292236328125, "rewards/rejected": -5.379687309265137, "step": 3190 }, { "epoch": 0.512, "grad_norm": 123.14339470000206, "learning_rate": 4.8816e-07, "logits/chosen": NaN, "logits/rejected": 0.9389404058456421, "logps/chosen": -268.92498779296875, "logps/rejected": -435.1625061035156, "loss": 0.638, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4115417003631592, "rewards/margins": 7.299218654632568, "rewards/rejected": -8.703516006469727, "step": 3200 }, { "epoch": 0.5136, "grad_norm": 60.36819379242825, "learning_rate": 4.8656e-07, "logits/chosen": 0.818408191204071, "logits/rejected": NaN, "logps/chosen": -243.72500610351562, "logps/rejected": -214.8125, "loss": 0.6064, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.979882836341858, "rewards/margins": 2.6727538108825684, "rewards/rejected": 
-4.650292873382568, "step": 3210 }, { "epoch": 0.5152, "grad_norm": 152.46107562208192, "learning_rate": 4.8496e-07, "logits/chosen": 0.7169433832168579, "logits/rejected": 0.8781677484512329, "logps/chosen": -248.0124969482422, "logps/rejected": -245.10000610351562, "loss": 0.6198, "rewards/accuracies": 0.71875, "rewards/chosen": -2.04833984375, "rewards/margins": 2.981982469558716, "rewards/rejected": -5.028124809265137, "step": 3220 }, { "epoch": 0.5168, "grad_norm": 80.70408311319929, "learning_rate": 4.8336e-07, "logits/chosen": NaN, "logits/rejected": 0.8918091058731079, "logps/chosen": -246.77499389648438, "logps/rejected": -226.03750610351562, "loss": 0.5888, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7969849109649658, "rewards/margins": 3.123730421066284, "rewards/rejected": -4.916577339172363, "step": 3230 }, { "epoch": 0.5184, "grad_norm": 157.9132291654122, "learning_rate": 4.8176e-07, "logits/chosen": 0.798382580280304, "logits/rejected": 0.930493175983429, "logps/chosen": -232.77499389648438, "logps/rejected": -216.25, "loss": 0.4944, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6298339366912842, "rewards/margins": 3.2694334983825684, "rewards/rejected": -4.897656440734863, "step": 3240 }, { "epoch": 0.52, "grad_norm": 82.73265905449881, "learning_rate": 4.801599999999999e-07, "logits/chosen": 0.788403332233429, "logits/rejected": 0.903247058391571, "logps/chosen": -254.72500610351562, "logps/rejected": -237.39999389648438, "loss": 0.4594, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6685059070587158, "rewards/margins": 3.4544434547424316, "rewards/rejected": -5.124560356140137, "step": 3250 }, { "epoch": 0.5216, "grad_norm": 110.97815427106897, "learning_rate": 4.785599999999999e-07, "logits/chosen": 0.7823486328125, "logits/rejected": 0.9935302734375, "logps/chosen": -276.4125061035156, "logps/rejected": -212.53750610351562, "loss": 0.4552, "rewards/accuracies": 0.78125, "rewards/chosen": 
-1.6578369140625, "rewards/margins": 3.224609375, "rewards/rejected": -4.8818359375, "step": 3260 }, { "epoch": 0.5232, "grad_norm": 167.52920987307832, "learning_rate": 4.7696e-07, "logits/chosen": 0.7235107421875, "logits/rejected": 0.820605456829071, "logps/chosen": -265.17498779296875, "logps/rejected": -232.47500610351562, "loss": 0.5266, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9161376953125, "rewards/margins": 2.925488233566284, "rewards/rejected": -4.841015815734863, "step": 3270 }, { "epoch": 0.5248, "grad_norm": 170.0889037280101, "learning_rate": 4.7536e-07, "logits/chosen": 0.71868896484375, "logits/rejected": 0.741564929485321, "logps/chosen": -256.38751220703125, "logps/rejected": -215.8625030517578, "loss": 0.5692, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.897900402545929, "rewards/margins": 3.1343750953674316, "rewards/rejected": -4.031445503234863, "step": 3280 }, { "epoch": 0.5264, "grad_norm": 49.759670099028575, "learning_rate": 4.7376e-07, "logits/chosen": 0.798388659954071, "logits/rejected": 0.8388916254043579, "logps/chosen": -257.36248779296875, "logps/rejected": -227.78750610351562, "loss": 0.4753, "rewards/accuracies": 0.8125, "rewards/chosen": -0.757080078125, "rewards/margins": 3.470507860183716, "rewards/rejected": -4.225976467132568, "step": 3290 }, { "epoch": 0.528, "grad_norm": 90.30277021743302, "learning_rate": 4.7216e-07, "logits/chosen": 0.7439941167831421, "logits/rejected": 0.852368175983429, "logps/chosen": -240.6125030517578, "logps/rejected": -236.9875030517578, "loss": 0.534, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8604980707168579, "rewards/margins": 3.36083984375, "rewards/rejected": -4.223535060882568, "step": 3300 }, { "epoch": 0.5296, "grad_norm": 206.04736136029345, "learning_rate": 4.7055999999999995e-07, "logits/chosen": 0.762377917766571, "logits/rejected": 0.9734863042831421, "logps/chosen": -246.16250610351562, "logps/rejected": -221.0500030517578, 
"loss": 0.702, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2774658203125, "rewards/margins": 2.578418016433716, "rewards/rejected": -3.8558592796325684, "step": 3310 }, { "epoch": 0.5312, "grad_norm": 127.03827936970102, "learning_rate": 4.6896e-07, "logits/chosen": 0.8592529296875, "logits/rejected": 0.974383533000946, "logps/chosen": -251.8625030517578, "logps/rejected": -212.96249389648438, "loss": 0.6049, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.490502953529358, "rewards/margins": 2.689013719558716, "rewards/rejected": -4.181066989898682, "step": 3320 }, { "epoch": 0.5328, "grad_norm": 138.80923225906295, "learning_rate": 4.6735999999999997e-07, "logits/chosen": 0.8569580316543579, "logits/rejected": 1.0310547351837158, "logps/chosen": -240.8625030517578, "logps/rejected": -222.5124969482422, "loss": 0.8517, "rewards/accuracies": 0.6875, "rewards/chosen": -2.277587890625, "rewards/margins": 2.26513671875, "rewards/rejected": -4.543554782867432, "step": 3330 }, { "epoch": 0.5344, "grad_norm": 64.34134604254014, "learning_rate": 4.6576e-07, "logits/chosen": null, "logits/rejected": 0.9678955078125, "logps/chosen": -265.88751220703125, "logps/rejected": -222.02499389648438, "loss": 0.502, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.662207007408142, "rewards/margins": 3.4117188453674316, "rewards/rejected": -5.070703029632568, "step": 3340 }, { "epoch": 0.536, "grad_norm": 222.75394743369253, "learning_rate": 4.6416e-07, "logits/chosen": 0.9559570550918579, "logits/rejected": 0.9713134765625, "logps/chosen": -248.125, "logps/rejected": -203.8125, "loss": 0.7047, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.226831078529358, "rewards/margins": 2.81591796875, "rewards/rejected": -4.041211128234863, "step": 3350 }, { "epoch": 0.5376, "grad_norm": 94.43619859892611, "learning_rate": 4.6256e-07, "logits/chosen": 0.7421509027481079, "logits/rejected": 0.783984363079071, "logps/chosen": 
-244.39999389648438, "logps/rejected": -238.8562469482422, "loss": 0.7834, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.069799780845642, "rewards/margins": 2.3690428733825684, "rewards/rejected": -3.4374756813049316, "step": 3360 }, { "epoch": 0.5392, "grad_norm": 155.12701813026763, "learning_rate": 4.6095999999999997e-07, "logits/chosen": 0.843518078327179, "logits/rejected": 0.992565929889679, "logps/chosen": -232.46249389648438, "logps/rejected": -199.6875, "loss": 0.4384, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.901318371295929, "rewards/margins": 2.883227586746216, "rewards/rejected": -3.784228563308716, "step": 3370 }, { "epoch": 0.5408, "grad_norm": 131.8639360698009, "learning_rate": 4.5935999999999995e-07, "logits/chosen": 0.8504272699356079, "logits/rejected": 0.887377917766571, "logps/chosen": -258.6000061035156, "logps/rejected": -238.3874969482422, "loss": 0.7722, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7423827648162842, "rewards/margins": 2.329730272293091, "rewards/rejected": -4.070019721984863, "step": 3380 }, { "epoch": 0.5424, "grad_norm": 52.16626174385673, "learning_rate": 4.5776e-07, "logits/chosen": 0.9061523675918579, "logits/rejected": 1.0849120616912842, "logps/chosen": -234.0, "logps/rejected": -223.8125, "loss": 0.3574, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1852049827575684, "rewards/margins": 2.982617139816284, "rewards/rejected": -5.163281440734863, "step": 3390 }, { "epoch": 0.544, "grad_norm": 154.77383776442343, "learning_rate": 4.5616e-07, "logits/chosen": 0.8178955316543579, "logits/rejected": 0.863476574420929, "logps/chosen": -243.0625, "logps/rejected": -199.2624969482422, "loss": 0.7327, "rewards/accuracies": 0.75, "rewards/chosen": -2.221435546875, "rewards/margins": 2.5313963890075684, "rewards/rejected": -4.752734184265137, "step": 3400 }, { "epoch": 0.5456, "grad_norm": 115.15474530940313, "learning_rate": 4.5456e-07, "logits/chosen": 
0.69232177734375, "logits/rejected": 0.914074718952179, "logps/chosen": -258.25, "logps/rejected": -226.39999389648438, "loss": 0.5691, "rewards/accuracies": 0.78125, "rewards/chosen": -1.358618140220642, "rewards/margins": 3.537646532058716, "rewards/rejected": -4.892870903015137, "step": 3410 }, { "epoch": 0.5472, "grad_norm": 223.71444240911254, "learning_rate": 4.5295999999999995e-07, "logits/chosen": 0.7466064691543579, "logits/rejected": 0.768078625202179, "logps/chosen": -273.32501220703125, "logps/rejected": -225.6750030517578, "loss": 0.4577, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.343994140625, "rewards/margins": 3.649707078933716, "rewards/rejected": -4.991113185882568, "step": 3420 }, { "epoch": 0.5488, "grad_norm": 72.18291989104627, "learning_rate": 4.5136e-07, "logits/chosen": 0.762377917766571, "logits/rejected": 0.971630871295929, "logps/chosen": -254.33749389648438, "logps/rejected": -212.8625030517578, "loss": 0.6529, "rewards/accuracies": 0.78125, "rewards/chosen": -2.150073289871216, "rewards/margins": 3.298290967941284, "rewards/rejected": -5.4453125, "step": 3430 }, { "epoch": 0.5504, "grad_norm": 36.15207859434299, "learning_rate": 4.4975999999999997e-07, "logits/chosen": 0.731951892375946, "logits/rejected": 0.9183349609375, "logps/chosen": -246.03750610351562, "logps/rejected": -210.33749389648438, "loss": 0.537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2860960960388184, "rewards/margins": 3.384960889816284, "rewards/rejected": -5.673828125, "step": 3440 }, { "epoch": 0.552, "grad_norm": 162.6630511041297, "learning_rate": 4.4815999999999996e-07, "logits/chosen": 0.7335449457168579, "logits/rejected": 0.899707019329071, "logps/chosen": -234.3874969482422, "logps/rejected": -213.3874969482422, "loss": 0.6214, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.533496141433716, "rewards/margins": 2.606250047683716, "rewards/rejected": -5.136328220367432, "step": 3450 }, { "epoch": 0.5536, 
"grad_norm": 113.56667526883294, "learning_rate": 4.4656e-07, "logits/chosen": 0.751171886920929, "logits/rejected": 0.9686523675918579, "logps/chosen": -259.2124938964844, "logps/rejected": -230.6374969482422, "loss": 0.6916, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5538086891174316, "rewards/margins": 2.832080125808716, "rewards/rejected": -5.387109279632568, "step": 3460 }, { "epoch": 0.5552, "grad_norm": 135.09068965341334, "learning_rate": 4.4496e-07, "logits/chosen": 0.7906128168106079, "logits/rejected": 0.9534912109375, "logps/chosen": -241.66250610351562, "logps/rejected": -246.5625, "loss": 0.9148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.225170850753784, "rewards/margins": 2.528027296066284, "rewards/rejected": -5.750390529632568, "step": 3470 }, { "epoch": 0.5568, "grad_norm": 135.2015080760663, "learning_rate": 4.4335999999999997e-07, "logits/chosen": 0.793896496295929, "logits/rejected": 0.9232422113418579, "logps/chosen": -246.10000610351562, "logps/rejected": -210.89999389648438, "loss": 0.7373, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.182421922683716, "rewards/margins": 2.8865723609924316, "rewards/rejected": -6.066210746765137, "step": 3480 }, { "epoch": 0.5584, "grad_norm": 157.9137854804258, "learning_rate": 4.4175999999999995e-07, "logits/chosen": 0.7548828125, "logits/rejected": 0.807690441608429, "logps/chosen": -250.25, "logps/rejected": -232.5625, "loss": 0.6934, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.626171827316284, "rewards/margins": 2.7628417015075684, "rewards/rejected": -6.386523246765137, "step": 3490 }, { "epoch": 0.56, "grad_norm": 140.55370387725654, "learning_rate": 4.4016e-07, "logits/chosen": 0.703717052936554, "logits/rejected": 0.912731945514679, "logps/chosen": -245.3625030517578, "logps/rejected": -231.53750610351562, "loss": 0.3665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2926268577575684, "rewards/margins": 
2.9754881858825684, "rewards/rejected": -6.266797065734863, "step": 3500 }, { "epoch": 0.5616, "grad_norm": 94.23539817223339, "learning_rate": 4.3856e-07, "logits/chosen": 0.655224621295929, "logits/rejected": 0.7891845703125, "logps/chosen": -236.9499969482422, "logps/rejected": -211.9499969482422, "loss": 0.3125, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.938232421875, "rewards/margins": 3.292236328125, "rewards/rejected": -6.230078220367432, "step": 3510 }, { "epoch": 0.5632, "grad_norm": 155.6400803443239, "learning_rate": 4.3696e-07, "logits/chosen": 0.6993163824081421, "logits/rejected": 0.8944336175918579, "logps/chosen": -244.8125, "logps/rejected": -220.5625, "loss": 0.5019, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.80047607421875, "rewards/margins": 3.1827149391174316, "rewards/rejected": -5.982421875, "step": 3520 }, { "epoch": 0.5648, "grad_norm": 106.75575530523697, "learning_rate": 4.3536e-07, "logits/chosen": 0.6878082156181335, "logits/rejected": 0.8208678960800171, "logps/chosen": -238.1125030517578, "logps/rejected": -248.0749969482422, "loss": 0.6864, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.875732421875, "rewards/margins": 2.69873046875, "rewards/rejected": -5.574023246765137, "step": 3530 }, { "epoch": 0.5664, "grad_norm": 111.30658526641723, "learning_rate": 4.3375999999999993e-07, "logits/chosen": 0.7451171875, "logits/rejected": 0.89697265625, "logps/chosen": -256.125, "logps/rejected": -212.0124969482422, "loss": 0.5464, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.552441358566284, "rewards/margins": 3.2855467796325684, "rewards/rejected": -5.838086128234863, "step": 3540 }, { "epoch": 0.568, "grad_norm": 104.51391073069158, "learning_rate": 4.3215999999999997e-07, "logits/chosen": 0.606640636920929, "logits/rejected": 0.825927734375, "logps/chosen": -251.0124969482422, "logps/rejected": -222.0500030517578, "loss": 0.4833, "rewards/accuracies": 
0.793749988079071, "rewards/chosen": -2.7357420921325684, "rewards/margins": 3.059521436691284, "rewards/rejected": -5.795702934265137, "step": 3550 }, { "epoch": 0.5696, "grad_norm": 118.60800916647578, "learning_rate": 4.3055999999999996e-07, "logits/chosen": 0.672314465045929, "logits/rejected": 0.8662109375, "logps/chosen": -218.83749389648438, "logps/rejected": -217.41250610351562, "loss": 0.5692, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.583544969558716, "rewards/margins": 2.8711915016174316, "rewards/rejected": -5.455859184265137, "step": 3560 }, { "epoch": 0.5712, "grad_norm": 164.67742998607955, "learning_rate": 4.2896e-07, "logits/chosen": 0.72576904296875, "logits/rejected": 0.894335925579071, "logps/chosen": -237.625, "logps/rejected": -239.75, "loss": 0.5613, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6636719703674316, "rewards/margins": 2.982177734375, "rewards/rejected": -5.641797065734863, "step": 3570 }, { "epoch": 0.5728, "grad_norm": 99.73825626131074, "learning_rate": 4.2736e-07, "logits/chosen": 0.626416027545929, "logits/rejected": 0.88165283203125, "logps/chosen": -255.4250030517578, "logps/rejected": -253.8249969482422, "loss": 0.6637, "rewards/accuracies": 0.75, "rewards/chosen": -2.4070677757263184, "rewards/margins": 3.074902296066284, "rewards/rejected": -5.48046875, "step": 3580 }, { "epoch": 0.5744, "grad_norm": 146.36720680379634, "learning_rate": 4.2576e-07, "logits/chosen": 0.7412353754043579, "logits/rejected": 0.884625256061554, "logps/chosen": -254.8125, "logps/rejected": -226.75, "loss": 0.705, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5702881813049316, "rewards/margins": 2.428759813308716, "rewards/rejected": -4.998632907867432, "step": 3590 }, { "epoch": 0.576, "grad_norm": 148.52349741892837, "learning_rate": 4.2415999999999995e-07, "logits/chosen": 0.690356433391571, "logits/rejected": 0.74951171875, "logps/chosen": -250.2624969482422, "logps/rejected": 
-201.8625030517578, "loss": 0.4318, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9990723133087158, "rewards/margins": 2.882128953933716, "rewards/rejected": -4.882616996765137, "step": 3600 }, { "epoch": 0.5776, "grad_norm": 204.32006190940368, "learning_rate": 4.2256e-07, "logits/chosen": 0.6988281011581421, "logits/rejected": 0.829028308391571, "logps/chosen": -258.6625061035156, "logps/rejected": -247.4375, "loss": 0.58, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4901123046875, "rewards/margins": 3.091992139816284, "rewards/rejected": -4.581445217132568, "step": 3610 }, { "epoch": 0.5792, "grad_norm": 151.54100029125564, "learning_rate": 4.2096e-07, "logits/chosen": 0.753466784954071, "logits/rejected": 0.88385009765625, "logps/chosen": -243.41250610351562, "logps/rejected": -210.3000030517578, "loss": 0.4652, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.320703148841858, "rewards/margins": 3.331591844558716, "rewards/rejected": -4.650000095367432, "step": 3620 }, { "epoch": 0.5808, "grad_norm": 97.99028458601735, "learning_rate": 4.1936e-07, "logits/chosen": 0.6845008730888367, "logits/rejected": 0.792187511920929, "logps/chosen": -246.66250610351562, "logps/rejected": -220.6750030517578, "loss": 0.4478, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.3924682140350342, "rewards/margins": 3.653613328933716, "rewards/rejected": -5.043164253234863, "step": 3630 }, { "epoch": 0.5824, "grad_norm": 50.11660675103012, "learning_rate": 4.1776e-07, "logits/chosen": 0.7339447140693665, "logits/rejected": 0.8564453125, "logps/chosen": -251.9499969482422, "logps/rejected": -258.32501220703125, "loss": 0.5759, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.310620069503784, "rewards/margins": 3.0391602516174316, "rewards/rejected": -5.34619140625, "step": 3640 }, { "epoch": 0.584, "grad_norm": 119.98717736463216, "learning_rate": 4.1615999999999994e-07, "logits/chosen": 0.718768298625946, 
"logits/rejected": 0.798046886920929, "logps/chosen": -239.33749389648438, "logps/rejected": -215.22500610351562, "loss": 0.4716, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3174805641174316, "rewards/margins": 3.108593702316284, "rewards/rejected": -5.429785251617432, "step": 3650 }, { "epoch": 0.5856, "grad_norm": 71.38481005890736, "learning_rate": 4.1456e-07, "logits/chosen": 0.635333240032196, "logits/rejected": 0.6854248046875, "logps/chosen": -243.71249389648438, "logps/rejected": -231.1999969482422, "loss": 0.6502, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.4076170921325684, "rewards/margins": 2.2418456077575684, "rewards/rejected": -5.650781154632568, "step": 3660 }, { "epoch": 0.5872, "grad_norm": 85.47030125397784, "learning_rate": 4.1295999999999996e-07, "logits/chosen": 0.6491333246231079, "logits/rejected": 0.791796863079071, "logps/chosen": -264.5, "logps/rejected": -225.53750610351562, "loss": 0.5762, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.4432616233825684, "rewards/margins": 3.119824171066284, "rewards/rejected": -6.561327934265137, "step": 3670 }, { "epoch": 0.5888, "grad_norm": 66.66156632064603, "learning_rate": 4.1136e-07, "logits/chosen": 0.694348156452179, "logits/rejected": 0.771728515625, "logps/chosen": -248.64999389648438, "logps/rejected": -232.3874969482422, "loss": 0.4744, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.017382860183716, "rewards/margins": 3.1041016578674316, "rewards/rejected": -6.120703220367432, "step": 3680 }, { "epoch": 0.5904, "grad_norm": 167.90263616676322, "learning_rate": 4.0976e-07, "logits/chosen": 0.7219177484512329, "logits/rejected": 0.8450683355331421, "logps/chosen": -254.96249389648438, "logps/rejected": -233.5, "loss": 0.4907, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0536131858825684, "rewards/margins": 3.1053223609924316, "rewards/rejected": -6.157812595367432, "step": 3690 }, { "epoch": 0.592, "grad_norm": 
23.690368847496593, "learning_rate": 4.0816e-07, "logits/chosen": 0.671679675579071, "logits/rejected": 0.903839111328125, "logps/chosen": -268.8125, "logps/rejected": -271.2875061035156, "loss": 0.7089, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.874951124191284, "rewards/margins": 3.2841796875, "rewards/rejected": -6.160546779632568, "step": 3700 }, { "epoch": 0.5936, "grad_norm": 68.85284351417087, "learning_rate": 4.0655999999999996e-07, "logits/chosen": 0.74017333984375, "logits/rejected": 0.8541015386581421, "logps/chosen": -236.14999389648438, "logps/rejected": -217.96249389648438, "loss": 0.6554, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.817089796066284, "rewards/margins": 2.43798828125, "rewards/rejected": -5.256249904632568, "step": 3710 }, { "epoch": 0.5952, "grad_norm": 193.19479935862444, "learning_rate": 4.0496e-07, "logits/chosen": 0.70257568359375, "logits/rejected": 0.81927490234375, "logps/chosen": -246.64999389648438, "logps/rejected": -234.4875030517578, "loss": 0.65, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.959863305091858, "rewards/margins": 3.5055174827575684, "rewards/rejected": -5.467968940734863, "step": 3720 }, { "epoch": 0.5968, "grad_norm": 72.52955716861426, "learning_rate": 4.0336e-07, "logits/chosen": 0.7730957269668579, "logits/rejected": 0.928295910358429, "logps/chosen": -260.5375061035156, "logps/rejected": -213.6999969482422, "loss": 0.504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8497314453125, "rewards/margins": 2.853076219558716, "rewards/rejected": -4.703906059265137, "step": 3730 }, { "epoch": 0.5984, "grad_norm": 129.9080173196447, "learning_rate": 4.0175999999999997e-07, "logits/chosen": 0.798999011516571, "logits/rejected": 0.8811065554618835, "logps/chosen": -236.8874969482422, "logps/rejected": -239.5124969482422, "loss": 0.5237, "rewards/accuracies": 0.75, "rewards/chosen": -1.27142333984375, "rewards/margins": 3.2975831031799316, 
"rewards/rejected": -4.569628715515137, "step": 3740 }, { "epoch": 0.6, "grad_norm": 141.47847522376875, "learning_rate": 4.0016e-07, "logits/chosen": 0.7231384515762329, "logits/rejected": 0.900097668170929, "logps/chosen": -228.6374969482422, "logps/rejected": -220.8000030517578, "loss": 0.5184, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6506836414337158, "rewards/margins": 3.0199217796325684, "rewards/rejected": -4.670507907867432, "step": 3750 }, { "epoch": 0.6016, "grad_norm": 168.01308001635374, "learning_rate": 3.9856e-07, "logits/chosen": 0.708789050579071, "logits/rejected": 0.8398681879043579, "logps/chosen": -247.3625030517578, "logps/rejected": -224.10000610351562, "loss": 0.5524, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.342541456222534, "rewards/margins": 2.7401366233825684, "rewards/rejected": -5.081933498382568, "step": 3760 }, { "epoch": 0.6032, "grad_norm": 95.4965923043596, "learning_rate": 3.9696e-07, "logits/chosen": 0.7160888910293579, "logits/rejected": 0.905468761920929, "logps/chosen": -252.64999389648438, "logps/rejected": -225.71249389648438, "loss": 0.6255, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0108399391174316, "rewards/margins": 2.3823485374450684, "rewards/rejected": -5.392578125, "step": 3770 }, { "epoch": 0.6048, "grad_norm": 191.77657587550627, "learning_rate": 3.9535999999999996e-07, "logits/chosen": 0.7446655035018921, "logits/rejected": 0.8780517578125, "logps/chosen": -256.1499938964844, "logps/rejected": -232.10000610351562, "loss": 0.6089, "rewards/accuracies": 0.78125, "rewards/chosen": -1.044921875, "rewards/margins": 3.4188475608825684, "rewards/rejected": -4.462573051452637, "step": 3780 }, { "epoch": 0.6064, "grad_norm": 204.23977540578528, "learning_rate": 3.9376e-07, "logits/chosen": 0.802539050579071, "logits/rejected": 0.908435046672821, "logps/chosen": -237.5749969482422, "logps/rejected": -204.5, "loss": 0.6863, "rewards/accuracies": 0.762499988079071, 
"rewards/chosen": -2.216601610183716, "rewards/margins": 2.6817383766174316, "rewards/rejected": -4.902148246765137, "step": 3790 }, { "epoch": 0.608, "grad_norm": 142.01758093382426, "learning_rate": 3.9216e-07, "logits/chosen": 0.786791980266571, "logits/rejected": 0.942431628704071, "logps/chosen": -245.83749389648438, "logps/rejected": -209.41250610351562, "loss": 0.5158, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6932616233825684, "rewards/margins": 2.742724657058716, "rewards/rejected": -5.434765815734863, "step": 3800 }, { "epoch": 0.6096, "grad_norm": 204.9264282010986, "learning_rate": 3.9056e-07, "logits/chosen": 0.723236083984375, "logits/rejected": 0.8797057867050171, "logps/chosen": -266.0375061035156, "logps/rejected": -229.5500030517578, "loss": 0.7987, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.209667921066284, "rewards/margins": 2.751025438308716, "rewards/rejected": -4.961230278015137, "step": 3810 }, { "epoch": 0.6112, "grad_norm": 97.84708049426999, "learning_rate": 3.8895999999999996e-07, "logits/chosen": 0.805041491985321, "logits/rejected": 0.868823230266571, "logps/chosen": -243.0, "logps/rejected": -215.3625030517578, "loss": 0.6057, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8158690929412842, "rewards/margins": 2.8870911598205566, "rewards/rejected": -4.704150199890137, "step": 3820 }, { "epoch": 0.6128, "grad_norm": 104.49882329896423, "learning_rate": 3.8735999999999994e-07, "logits/chosen": 0.794689953327179, "logits/rejected": 0.8484131097793579, "logps/chosen": -250.0625, "logps/rejected": -237.5124969482422, "loss": 0.644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.983251929283142, "rewards/margins": 2.831249952316284, "rewards/rejected": -4.812890529632568, "step": 3830 }, { "epoch": 0.6144, "grad_norm": 81.06331224197352, "learning_rate": 3.8576e-07, "logits/chosen": 0.698962390422821, "logits/rejected": 0.849322497844696, "logps/chosen": -241.375, 
"logps/rejected": -203.89999389648438, "loss": 0.6694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7626221179962158, "rewards/margins": 2.751708984375, "rewards/rejected": -4.51416015625, "step": 3840 }, { "epoch": 0.616, "grad_norm": 129.3659847745369, "learning_rate": 3.8415999999999997e-07, "logits/chosen": 0.772998034954071, "logits/rejected": 0.956347644329071, "logps/chosen": -254.3249969482422, "logps/rejected": -217.77499389648438, "loss": 0.5509, "rewards/accuracies": 0.75, "rewards/chosen": -2.155139207839966, "rewards/margins": 2.773193359375, "rewards/rejected": -4.929589748382568, "step": 3850 }, { "epoch": 0.6176, "grad_norm": 170.04367206829875, "learning_rate": 3.8256e-07, "logits/chosen": 0.8574463129043579, "logits/rejected": 0.88848876953125, "logps/chosen": -234.60000610351562, "logps/rejected": -211.5625, "loss": 0.5093, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2385926246643066, "rewards/margins": 2.9419922828674316, "rewards/rejected": -5.181836128234863, "step": 3860 }, { "epoch": 0.6192, "grad_norm": 119.49715250908717, "learning_rate": 3.8096e-07, "logits/chosen": 0.72686767578125, "logits/rejected": 0.8303467035293579, "logps/chosen": -250.53750610351562, "logps/rejected": -225.64999389648438, "loss": 0.5437, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0412840843200684, "rewards/margins": 3.303955078125, "rewards/rejected": -5.344336032867432, "step": 3870 }, { "epoch": 0.6208, "grad_norm": 65.00114363165702, "learning_rate": 3.7936e-07, "logits/chosen": 0.7477782964706421, "logits/rejected": 0.92529296875, "logps/chosen": -265.3500061035156, "logps/rejected": -234.47500610351562, "loss": 0.5812, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.42919921875, "rewards/margins": 3.059326171875, "rewards/rejected": -5.485156059265137, "step": 3880 }, { "epoch": 0.6224, "grad_norm": 173.69289309733537, "learning_rate": 3.7775999999999996e-07, "logits/chosen": 0.884521484375, 
"logits/rejected": 1.023095726966858, "logps/chosen": -236.4499969482422, "logps/rejected": -204.91250610351562, "loss": 0.5351, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8209471702575684, "rewards/margins": 2.968945264816284, "rewards/rejected": -5.7919921875, "step": 3890 }, { "epoch": 0.624, "grad_norm": 118.14937319593942, "learning_rate": 3.7616e-07, "logits/chosen": 0.7898803949356079, "logits/rejected": 0.8680664300918579, "logps/chosen": -242.96249389648438, "logps/rejected": -218.9250030517578, "loss": 0.5668, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.472607374191284, "rewards/margins": 3.1023926734924316, "rewards/rejected": -5.573828220367432, "step": 3900 }, { "epoch": 0.6256, "grad_norm": 146.07069245689496, "learning_rate": 3.7456e-07, "logits/chosen": 0.6823364496231079, "logits/rejected": 0.820556640625, "logps/chosen": -227.25, "logps/rejected": -207.91250610351562, "loss": 0.434, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.210253953933716, "rewards/margins": 3.503222703933716, "rewards/rejected": -5.713281154632568, "step": 3910 }, { "epoch": 0.6272, "grad_norm": 133.6504022719185, "learning_rate": 3.7295999999999997e-07, "logits/chosen": 0.613757312297821, "logits/rejected": 0.783557116985321, "logps/chosen": -264.20001220703125, "logps/rejected": -229.3625030517578, "loss": 0.469, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.8385863304138184, "rewards/margins": 3.360156297683716, "rewards/rejected": -6.200390815734863, "step": 3920 }, { "epoch": 0.6288, "grad_norm": 175.639079579612, "learning_rate": 3.7136e-07, "logits/chosen": 0.734088122844696, "logits/rejected": 0.936694324016571, "logps/chosen": -254.28750610351562, "logps/rejected": -208.6374969482422, "loss": 0.6021, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.183154344558716, "rewards/margins": 2.870800733566284, "rewards/rejected": -6.053906440734863, "step": 3930 }, { "epoch": 0.6304, 
"grad_norm": 69.45805198913405, "learning_rate": 3.6975999999999994e-07, "logits/chosen": 0.830761730670929, "logits/rejected": 1.0082519054412842, "logps/chosen": -229.1125030517578, "logps/rejected": -216.5500030517578, "loss": 0.4305, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.696484327316284, "rewards/margins": 2.95263671875, "rewards/rejected": -6.647656440734863, "step": 3940 }, { "epoch": 0.632, "grad_norm": 105.32655276305401, "learning_rate": 3.6816e-07, "logits/chosen": 0.750244140625, "logits/rejected": 0.9693847894668579, "logps/chosen": -256.0, "logps/rejected": -231.25, "loss": 0.6828, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.5176758766174316, "rewards/margins": 2.4066405296325684, "rewards/rejected": -5.921191215515137, "step": 3950 }, { "epoch": 0.6336, "grad_norm": 188.82003982861116, "learning_rate": 3.6655999999999997e-07, "logits/chosen": 0.6719970703125, "logits/rejected": null, "logps/chosen": -251.77499389648438, "logps/rejected": -225.08749389648438, "loss": 0.7166, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.499072313308716, "rewards/margins": 2.263867139816284, "rewards/rejected": -5.763867378234863, "step": 3960 }, { "epoch": 0.6352, "grad_norm": 72.3565128577758, "learning_rate": 3.6496e-07, "logits/chosen": 0.872729480266571, "logits/rejected": 1.015771508216858, "logps/chosen": -230.8000030517578, "logps/rejected": -222.41250610351562, "loss": 0.5167, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9489502906799316, "rewards/margins": 2.686474561691284, "rewards/rejected": -5.637109279632568, "step": 3970 }, { "epoch": 0.6368, "grad_norm": 131.04670652553375, "learning_rate": 3.6336e-07, "logits/chosen": 0.71337890625, "logits/rejected": 0.868115246295929, "logps/chosen": -238.9250030517578, "logps/rejected": -223.3000030517578, "loss": 0.4368, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4249510765075684, "rewards/margins": 3.52197265625, 
"rewards/rejected": -5.946093559265137, "step": 3980 }, { "epoch": 0.6384, "grad_norm": 58.21486435576126, "learning_rate": 3.6176000000000003e-07, "logits/chosen": 0.732897937297821, "logits/rejected": 0.896557629108429, "logps/chosen": -270.73748779296875, "logps/rejected": -252.14999389648438, "loss": 0.6658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0274415016174316, "rewards/margins": 3.223388671875, "rewards/rejected": -5.250390529632568, "step": 3990 }, { "epoch": 0.64, "grad_norm": 97.05265659603872, "learning_rate": 3.6015999999999996e-07, "logits/chosen": 0.862841784954071, "logits/rejected": 0.927539050579071, "logps/chosen": -249.9375, "logps/rejected": -234.10000610351562, "loss": 0.4449, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2833495140075684, "rewards/margins": 3.556347608566284, "rewards/rejected": -5.839257717132568, "step": 4000 }, { "epoch": 0.6416, "grad_norm": 112.018069753924, "learning_rate": 3.5855999999999995e-07, "logits/chosen": 0.8719238042831421, "logits/rejected": 1.013403296470642, "logps/chosen": -265.7124938964844, "logps/rejected": -215.6750030517578, "loss": 0.5282, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9124023914337158, "rewards/margins": 3.1434569358825684, "rewards/rejected": -5.0576171875, "step": 4010 }, { "epoch": 0.6432, "grad_norm": 108.68670347505017, "learning_rate": 3.5696e-07, "logits/chosen": 0.7332519292831421, "logits/rejected": 0.8372802734375, "logps/chosen": -278.5625, "logps/rejected": -232.6374969482422, "loss": 0.4206, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.309716820716858, "rewards/margins": 3.7557616233825684, "rewards/rejected": -5.067822456359863, "step": 4020 }, { "epoch": 0.6448, "grad_norm": 65.70521338742213, "learning_rate": 3.5536e-07, "logits/chosen": 0.878002941608429, "logits/rejected": 0.974804699420929, "logps/chosen": -272.95001220703125, "logps/rejected": -237.58749389648438, "loss": 0.6186, "rewards/accuracies": 
0.824999988079071, "rewards/chosen": -2.333300828933716, "rewards/margins": 3.263964891433716, "rewards/rejected": -5.592968940734863, "step": 4030 }, { "epoch": 0.6464, "grad_norm": 180.56877630188325, "learning_rate": 3.5376e-07, "logits/chosen": 0.8079468011856079, "logits/rejected": 1.00146484375, "logps/chosen": -250.5749969482422, "logps/rejected": -245.97500610351562, "loss": 0.5719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6470704078674316, "rewards/margins": 3.4745116233825684, "rewards/rejected": -6.120800971984863, "step": 4040 }, { "epoch": 0.648, "grad_norm": 144.2594508028891, "learning_rate": 3.5215999999999995e-07, "logits/chosen": 0.8202148675918579, "logits/rejected": 0.8984740972518921, "logps/chosen": -239.25, "logps/rejected": -227.8625030517578, "loss": 0.5256, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.633471727371216, "rewards/margins": 3.2870116233825684, "rewards/rejected": -5.916601657867432, "step": 4050 }, { "epoch": 0.6496, "grad_norm": 35.53737513202295, "learning_rate": 3.5056e-07, "logits/chosen": 0.7031494379043579, "logits/rejected": 0.881591796875, "logps/chosen": -268.75, "logps/rejected": -213.875, "loss": 0.4878, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.127636671066284, "rewards/margins": 2.913623094558716, "rewards/rejected": -6.040625095367432, "step": 4060 }, { "epoch": 0.6512, "grad_norm": 196.66108346564522, "learning_rate": 3.4895999999999997e-07, "logits/chosen": 0.669140636920929, "logits/rejected": 0.868481457233429, "logps/chosen": -255.5124969482422, "logps/rejected": -227.3249969482422, "loss": 0.6461, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.391040086746216, "rewards/margins": 2.8857421875, "rewards/rejected": -6.276171684265137, "step": 4070 }, { "epoch": 0.6528, "grad_norm": 117.07151140120186, "learning_rate": 3.4736e-07, "logits/chosen": 0.775097668170929, "logits/rejected": 0.907958984375, "logps/chosen": -261.5874938964844, 
"logps/rejected": -230.9250030517578, "loss": 0.4764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9532713890075684, "rewards/margins": 3.32666015625, "rewards/rejected": -6.28125, "step": 4080 }, { "epoch": 0.6544, "grad_norm": 136.48385564641328, "learning_rate": 3.4576e-07, "logits/chosen": 0.8624023199081421, "logits/rejected": 0.979907214641571, "logps/chosen": -252.28750610351562, "logps/rejected": -227.4375, "loss": 0.7671, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.732226610183716, "rewards/margins": 2.6241211891174316, "rewards/rejected": -6.358202934265137, "step": 4090 }, { "epoch": 0.656, "grad_norm": 69.87619213756248, "learning_rate": 3.4416e-07, "logits/chosen": 0.6455078125, "logits/rejected": 0.913037121295929, "logps/chosen": -248.52499389648438, "logps/rejected": -220.4499969482422, "loss": 0.5661, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8243651390075684, "rewards/margins": 3.455127000808716, "rewards/rejected": -6.275390625, "step": 4100 }, { "epoch": 0.6576, "grad_norm": 78.5085806094459, "learning_rate": 3.4255999999999997e-07, "logits/chosen": 0.781298816204071, "logits/rejected": 0.959716796875, "logps/chosen": -245.5749969482422, "logps/rejected": -208.72500610351562, "loss": 0.3416, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.2950072288513184, "rewards/margins": 4.238867282867432, "rewards/rejected": -6.5289306640625, "step": 4110 }, { "epoch": 0.6592, "grad_norm": 54.56351363806769, "learning_rate": 3.4095999999999995e-07, "logits/chosen": 0.72894287109375, "logits/rejected": 0.8223632574081421, "logps/chosen": -247.3249969482422, "logps/rejected": -224.58749389648438, "loss": 0.5422, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9384276866912842, "rewards/margins": 3.5370116233825684, "rewards/rejected": -5.471240043640137, "step": 4120 }, { "epoch": 0.6608, "grad_norm": 83.35200131810579, "learning_rate": 3.3936e-07, "logits/chosen": 
0.7434074282646179, "logits/rejected": 0.860644519329071, "logps/chosen": -255.25, "logps/rejected": -233.66250610351562, "loss": 0.4757, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.223583936691284, "rewards/margins": 3.2012696266174316, "rewards/rejected": -5.422753810882568, "step": 4130 }, { "epoch": 0.6624, "grad_norm": 142.51598638722098, "learning_rate": 3.3776e-07, "logits/chosen": 0.9012695550918579, "logits/rejected": 0.9286864995956421, "logps/chosen": -250.66250610351562, "logps/rejected": -231.0625, "loss": 0.3888, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0341553688049316, "rewards/margins": 3.365771532058716, "rewards/rejected": -5.400000095367432, "step": 4140 }, { "epoch": 0.664, "grad_norm": 38.66555372778911, "learning_rate": 3.3616e-07, "logits/chosen": 0.8710571527481079, "logits/rejected": 0.955615222454071, "logps/chosen": -234.8249969482422, "logps/rejected": -193.5625, "loss": 0.429, "rewards/accuracies": 0.8125, "rewards/chosen": -2.092724561691284, "rewards/margins": 3.475146532058716, "rewards/rejected": -5.565625190734863, "step": 4150 }, { "epoch": 0.6656, "grad_norm": 137.7005478423889, "learning_rate": 3.3456e-07, "logits/chosen": 0.790881335735321, "logits/rejected": 0.8761962652206421, "logps/chosen": -250.53750610351562, "logps/rejected": -235.875, "loss": 0.517, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.185986280441284, "rewards/margins": 3.232714891433716, "rewards/rejected": -5.417187690734863, "step": 4160 }, { "epoch": 0.6672, "grad_norm": 137.67129458437674, "learning_rate": 3.3296e-07, "logits/chosen": 0.7396606206893921, "logits/rejected": 0.896624743938446, "logps/chosen": -240.58749389648438, "logps/rejected": -212.875, "loss": 0.5574, "rewards/accuracies": 0.8125, "rewards/chosen": -2.620361328125, "rewards/margins": 3.3904786109924316, "rewards/rejected": -6.010351657867432, "step": 4170 }, { "epoch": 0.6688, "grad_norm": 137.9515178913636, "learning_rate": 
3.3135999999999997e-07, "logits/chosen": 0.825439453125, "logits/rejected": 0.964111328125, "logps/chosen": -270.7875061035156, "logps/rejected": -225.0625, "loss": 0.4813, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6363525390625, "rewards/margins": 3.6913084983825684, "rewards/rejected": -5.323437690734863, "step": 4180 }, { "epoch": 0.6704, "grad_norm": 66.00272594745516, "learning_rate": 3.2975999999999996e-07, "logits/chosen": 0.8145996332168579, "logits/rejected": 0.947314441204071, "logps/chosen": -252.3249969482422, "logps/rejected": -239.6125030517578, "loss": 0.5183, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0152831077575684, "rewards/margins": 3.30029296875, "rewards/rejected": -5.316308498382568, "step": 4190 }, { "epoch": 0.672, "grad_norm": 223.386486686091, "learning_rate": 3.2816e-07, "logits/chosen": 0.746533215045929, "logits/rejected": 0.858715832233429, "logps/chosen": -261.57501220703125, "logps/rejected": -221.16250610351562, "loss": 0.5498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.332934617996216, "rewards/margins": 3.3037109375, "rewards/rejected": -5.63671875, "step": 4200 }, { "epoch": 0.6736, "grad_norm": 112.28263160884279, "learning_rate": 3.2656e-07, "logits/chosen": 0.8147217035293579, "logits/rejected": 0.9227050542831421, "logps/chosen": -232.5124969482422, "logps/rejected": -211.39999389648438, "loss": 0.5221, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4079833030700684, "rewards/margins": 3.201464891433716, "rewards/rejected": -5.607421875, "step": 4210 }, { "epoch": 0.6752, "grad_norm": 83.31906120195201, "learning_rate": 3.2496e-07, "logits/chosen": 0.7175048589706421, "logits/rejected": 0.857666015625, "logps/chosen": -235.9375, "logps/rejected": -209.08749389648438, "loss": 0.3604, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8209228515625, "rewards/margins": 3.182324171066284, "rewards/rejected": -5.000195503234863, "step": 4220 }, 
{ "epoch": 0.6768, "grad_norm": 133.75700886959865, "learning_rate": 3.2335999999999995e-07, "logits/chosen": 0.7268127202987671, "logits/rejected": 0.8312011957168579, "logps/chosen": -235.6999969482422, "logps/rejected": -222.77499389648438, "loss": 0.4645, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.5624022483825684, "rewards/margins": 2.989941358566284, "rewards/rejected": -5.554101467132568, "step": 4230 }, { "epoch": 0.6784, "grad_norm": 119.19860684160687, "learning_rate": 3.2176e-07, "logits/chosen": 0.835156261920929, "logits/rejected": 1.0308105945587158, "logps/chosen": -230.9250030517578, "logps/rejected": -198.3874969482422, "loss": 0.5349, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7073121070861816, "rewards/margins": 3.083740234375, "rewards/rejected": -5.785546779632568, "step": 4240 }, { "epoch": 0.68, "grad_norm": 120.1712821778965, "learning_rate": 3.2016e-07, "logits/chosen": 0.8142455816268921, "logits/rejected": 0.945849597454071, "logps/chosen": -228.78750610351562, "logps/rejected": -221.5124969482422, "loss": 0.4318, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.20654296875, "rewards/margins": 3.066113233566284, "rewards/rejected": -6.271874904632568, "step": 4250 }, { "epoch": 0.6816, "grad_norm": 117.79315114853281, "learning_rate": 3.1856e-07, "logits/chosen": 0.770587146282196, "logits/rejected": 0.9659179449081421, "logps/chosen": -231.25, "logps/rejected": -207.7375030517578, "loss": 0.5887, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.567187547683716, "rewards/margins": 2.4107422828674316, "rewards/rejected": -5.975390434265137, "step": 4260 }, { "epoch": 0.6832, "grad_norm": 104.7916716981138, "learning_rate": 3.1696e-07, "logits/chosen": 0.7374267578125, "logits/rejected": 0.8310791254043579, "logps/chosen": -242.02499389648438, "logps/rejected": -228.8000030517578, "loss": 0.5841, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9706053733825684, 
"rewards/margins": 2.8362793922424316, "rewards/rejected": -5.8056640625, "step": 4270 }, { "epoch": 0.6848, "grad_norm": 74.63993490928631, "learning_rate": 3.1535999999999993e-07, "logits/chosen": 0.731494128704071, "logits/rejected": 0.838452160358429, "logps/chosen": -254.625, "logps/rejected": -206.8125, "loss": 0.3804, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.504150390625, "rewards/margins": 3.0579590797424316, "rewards/rejected": -5.5615234375, "step": 4280 }, { "epoch": 0.6864, "grad_norm": 79.51822342691126, "learning_rate": 3.1375999999999997e-07, "logits/chosen": 0.7542480230331421, "logits/rejected": 0.849841296672821, "logps/chosen": -255.1875, "logps/rejected": -223.78750610351562, "loss": 0.427, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8310058116912842, "rewards/margins": 3.3680663108825684, "rewards/rejected": -5.199316501617432, "step": 4290 }, { "epoch": 0.688, "grad_norm": 211.87219148033762, "learning_rate": 3.1215999999999996e-07, "logits/chosen": 0.6625610589981079, "logits/rejected": 0.8041747808456421, "logps/chosen": -210.0, "logps/rejected": -190.52499389648438, "loss": 0.5575, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8691527843475342, "rewards/margins": 3.04443359375, "rewards/rejected": -4.913476467132568, "step": 4300 }, { "epoch": 0.6896, "grad_norm": 105.9343163797224, "learning_rate": 3.1056e-07, "logits/chosen": 0.6974121332168579, "logits/rejected": 0.800732433795929, "logps/chosen": -240.5437469482422, "logps/rejected": -225.89999389648438, "loss": 0.4432, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.027587890625, "rewards/margins": 3.3248047828674316, "rewards/rejected": -4.356152534484863, "step": 4310 }, { "epoch": 0.6912, "grad_norm": 349.4655794444877, "learning_rate": 3.0896e-07, "logits/chosen": 0.762744128704071, "logits/rejected": 0.9122070074081421, "logps/chosen": -248.7624969482422, "logps/rejected": -245.97500610351562, "loss": 0.8631, 
"rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2655029296875, "rewards/margins": 2.814648389816284, "rewards/rejected": -4.078222751617432, "step": 4320 }, { "epoch": 0.6928, "grad_norm": 93.74763395888954, "learning_rate": 3.0736e-07, "logits/chosen": 0.6366943120956421, "logits/rejected": 0.748425304889679, "logps/chosen": -248.39999389648438, "logps/rejected": -210.0124969482422, "loss": 0.4662, "rewards/accuracies": 0.8125, "rewards/chosen": -1.050329566001892, "rewards/margins": 3.245800733566284, "rewards/rejected": -4.291601657867432, "step": 4330 }, { "epoch": 0.6944, "grad_norm": 171.8891880971162, "learning_rate": 3.0575999999999995e-07, "logits/chosen": 0.6362854242324829, "logits/rejected": 0.733935534954071, "logps/chosen": -243.3625030517578, "logps/rejected": -215.7375030517578, "loss": 0.7264, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0159423351287842, "rewards/margins": 2.6922850608825684, "rewards/rejected": -3.70654296875, "step": 4340 }, { "epoch": 0.696, "grad_norm": 76.31629464006157, "learning_rate": 3.0416e-07, "logits/chosen": 0.7487548589706421, "logits/rejected": 0.904541015625, "logps/chosen": -231.25, "logps/rejected": -204.1999969482422, "loss": 0.4635, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7296386957168579, "rewards/margins": 3.60284423828125, "rewards/rejected": -4.328906059265137, "step": 4350 }, { "epoch": 0.6976, "grad_norm": 61.730520244477844, "learning_rate": 3.0256e-07, "logits/chosen": 0.654980480670929, "logits/rejected": 0.838092029094696, "logps/chosen": -247.35000610351562, "logps/rejected": -211.3000030517578, "loss": 0.4772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2656738758087158, "rewards/margins": 3.588085889816284, "rewards/rejected": -4.8525390625, "step": 4360 }, { "epoch": 0.6992, "grad_norm": 142.90939931734852, "learning_rate": 3.0096e-07, "logits/chosen": 0.694287121295929, "logits/rejected": 0.8456786870956421, "logps/chosen": 
-233.35000610351562, "logps/rejected": -207.9250030517578, "loss": 0.4681, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9881591796875, "rewards/margins": 3.484570264816284, "rewards/rejected": -5.473828315734863, "step": 4370 }, { "epoch": 0.7008, "grad_norm": 86.60258048213143, "learning_rate": 2.9936e-07, "logits/chosen": 0.623583972454071, "logits/rejected": 0.810864269733429, "logps/chosen": -246.1999969482422, "logps/rejected": -213.8874969482422, "loss": 0.3287, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.662664771080017, "rewards/margins": 3.9827637672424316, "rewards/rejected": -5.64453125, "step": 4380 }, { "epoch": 0.7024, "grad_norm": 49.128611588469745, "learning_rate": 2.9776e-07, "logits/chosen": 0.7290283441543579, "logits/rejected": 0.773754894733429, "logps/chosen": -234.375, "logps/rejected": -214.8000030517578, "loss": 0.4234, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.833837866783142, "rewards/margins": 3.441210985183716, "rewards/rejected": -5.274218559265137, "step": 4390 }, { "epoch": 0.704, "grad_norm": 39.59762652784034, "learning_rate": 2.9615999999999997e-07, "logits/chosen": 0.730908215045929, "logits/rejected": 0.8455810546875, "logps/chosen": -251.6999969482422, "logps/rejected": -205.5625, "loss": 0.4025, "rewards/accuracies": 0.84375, "rewards/chosen": -1.675195336341858, "rewards/margins": 3.3373780250549316, "rewards/rejected": -5.012011528015137, "step": 4400 }, { "epoch": 0.7056, "grad_norm": 223.70576473307696, "learning_rate": 2.9455999999999996e-07, "logits/chosen": 0.629913330078125, "logits/rejected": 0.7994445562362671, "logps/chosen": -237.375, "logps/rejected": -229.14999389648438, "loss": 0.5539, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.001208543777466, "rewards/margins": 3.63427734375, "rewards/rejected": -5.637499809265137, "step": 4410 }, { "epoch": 0.7072, "grad_norm": 72.22462893178663, "learning_rate": 2.9296e-07, "logits/chosen": 0.651440441608429, 
"logits/rejected": 0.838940441608429, "logps/chosen": -253.4875030517578, "logps/rejected": -228.9499969482422, "loss": 0.8552, "rewards/accuracies": 0.78125, "rewards/chosen": -2.91461181640625, "rewards/margins": 2.688525438308716, "rewards/rejected": -5.60546875, "step": 4420 }, { "epoch": 0.7088, "grad_norm": 105.57859956619657, "learning_rate": 2.9136e-07, "logits/chosen": 0.620373547077179, "logits/rejected": 0.7374206781387329, "logps/chosen": -262.61248779296875, "logps/rejected": -249.8874969482422, "loss": 0.408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.662280321121216, "rewards/margins": 3.9417481422424316, "rewards/rejected": -6.602734565734863, "step": 4430 }, { "epoch": 0.7104, "grad_norm": 188.53558423457233, "learning_rate": 2.8976e-07, "logits/chosen": 0.708789050579071, "logits/rejected": 0.8447265625, "logps/chosen": -227.1374969482422, "logps/rejected": -210.2624969482422, "loss": 0.3888, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9842286109924316, "rewards/margins": 3.50146484375, "rewards/rejected": -6.483593940734863, "step": 4440 }, { "epoch": 0.712, "grad_norm": 177.44934740803313, "learning_rate": 2.8816e-07, "logits/chosen": 0.7583984136581421, "logits/rejected": 0.867401123046875, "logps/chosen": -244.96249389648438, "logps/rejected": -224.25, "loss": 0.7234, "rewards/accuracies": 0.75, "rewards/chosen": -3.667651414871216, "rewards/margins": 2.6597657203674316, "rewards/rejected": -6.3291015625, "step": 4450 }, { "epoch": 0.7136, "grad_norm": 112.98146551733457, "learning_rate": 2.8656e-07, "logits/chosen": 0.5724731683731079, "logits/rejected": 0.753955066204071, "logps/chosen": -280.86248779296875, "logps/rejected": -236.6125030517578, "loss": 0.375, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.900952100753784, "rewards/margins": 3.600390672683716, "rewards/rejected": -6.504687309265137, "step": 4460 }, { "epoch": 0.7152, "grad_norm": 80.68484857953526, "learning_rate": 2.8496e-07, 
"logits/chosen": 0.800000011920929, "logits/rejected": 0.9327636957168579, "logps/chosen": -242.0625, "logps/rejected": -220.0625, "loss": 0.5006, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3425049781799316, "rewards/margins": 3.2493653297424316, "rewards/rejected": -6.583984375, "step": 4470 }, { "epoch": 0.7168, "grad_norm": 131.00930901520476, "learning_rate": 2.8335999999999996e-07, "logits/chosen": 0.7206786870956421, "logits/rejected": 0.9078124761581421, "logps/chosen": -241.71249389648438, "logps/rejected": -202.5124969482422, "loss": 0.5174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.987109422683716, "rewards/margins": 3.1705079078674316, "rewards/rejected": -6.158398628234863, "step": 4480 }, { "epoch": 0.7184, "grad_norm": 204.57929852011236, "learning_rate": 2.8176e-07, "logits/chosen": 0.718127429485321, "logits/rejected": 0.8491455316543579, "logps/chosen": -246.125, "logps/rejected": -222.0625, "loss": 0.4864, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2947754859924316, "rewards/margins": 3.63134765625, "rewards/rejected": -6.921875, "step": 4490 }, { "epoch": 0.72, "grad_norm": 127.89779105974324, "learning_rate": 2.8016e-07, "logits/chosen": 0.6959228515625, "logits/rejected": 0.836840808391571, "logps/chosen": -230.9499969482422, "logps/rejected": -213.5625, "loss": 0.45, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3189454078674316, "rewards/margins": 3.09814453125, "rewards/rejected": -6.415625095367432, "step": 4500 }, { "epoch": 0.7216, "grad_norm": 75.25093293396107, "learning_rate": 2.7856e-07, "logits/chosen": 0.7814391851425171, "logits/rejected": 0.8532470464706421, "logps/chosen": -233.5625, "logps/rejected": -217.16250610351562, "loss": 0.8246, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.707653760910034, "rewards/margins": 2.648632764816284, "rewards/rejected": -5.354882717132568, "step": 4510 }, { "epoch": 0.7232, "grad_norm": 188.65361775737563, 
"learning_rate": 2.7695999999999996e-07, "logits/chosen": 0.793774425983429, "logits/rejected": 0.963916003704071, "logps/chosen": -217.5, "logps/rejected": -199.77499389648438, "loss": 0.5909, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.20458984375, "rewards/margins": 3.0272459983825684, "rewards/rejected": -5.231835842132568, "step": 4520 }, { "epoch": 0.7248, "grad_norm": 110.92185512369647, "learning_rate": 2.7536e-07, "logits/chosen": 0.698352038860321, "logits/rejected": NaN, "logps/chosen": -263.0249938964844, "logps/rejected": -228.125, "loss": 0.6366, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4579100608825684, "rewards/margins": 2.8345704078674316, "rewards/rejected": -5.292382717132568, "step": 4530 }, { "epoch": 0.7264, "grad_norm": 92.38290941737819, "learning_rate": 2.7376e-07, "logits/chosen": 0.7570785284042358, "logits/rejected": 0.876953125, "logps/chosen": -225.8625030517578, "logps/rejected": -214.3000030517578, "loss": 0.5309, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.634033203125, "rewards/margins": 3.0556640625, "rewards/rejected": -5.685937404632568, "step": 4540 }, { "epoch": 0.728, "grad_norm": 167.50293350727597, "learning_rate": 2.7216e-07, "logits/chosen": NaN, "logits/rejected": 0.903735339641571, "logps/chosen": -244.0124969482422, "logps/rejected": -232.97500610351562, "loss": 0.3889, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.167919874191284, "rewards/margins": 3.945385694503784, "rewards/rejected": -6.1083984375, "step": 4550 }, { "epoch": 0.7296, "grad_norm": 147.187714414269, "learning_rate": 2.7056e-07, "logits/chosen": 0.766308605670929, "logits/rejected": 0.8929198980331421, "logps/chosen": -244.4499969482422, "logps/rejected": -216.58749389648438, "loss": 0.5353, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.476611375808716, "rewards/margins": 3.004199266433716, "rewards/rejected": -5.481640815734863, "step": 4560 }, { "epoch": 
0.7312, "grad_norm": 156.90272950778925, "learning_rate": 2.6895999999999994e-07, "logits/chosen": 0.601916491985321, "logits/rejected": 0.7897094488143921, "logps/chosen": -263.4125061035156, "logps/rejected": -206.9375, "loss": 0.5911, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.590441942214966, "rewards/margins": 2.9390625953674316, "rewards/rejected": -5.52734375, "step": 4570 }, { "epoch": 0.7328, "grad_norm": 97.30317685000733, "learning_rate": 2.6736e-07, "logits/chosen": 0.650463879108429, "logits/rejected": 0.7693237066268921, "logps/chosen": -254.8125, "logps/rejected": -237.0749969482422, "loss": 0.4406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.224169969558716, "rewards/margins": 3.5606446266174316, "rewards/rejected": -5.786718845367432, "step": 4580 }, { "epoch": 0.7344, "grad_norm": 106.87990738369065, "learning_rate": 2.6575999999999997e-07, "logits/chosen": 0.7257080078125, "logits/rejected": 0.839221179485321, "logps/chosen": -237.0500030517578, "logps/rejected": -224.85000610351562, "loss": 0.5296, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6367430686950684, "rewards/margins": 3.1437010765075684, "rewards/rejected": -5.782422065734863, "step": 4590 }, { "epoch": 0.736, "grad_norm": 172.31477614266814, "learning_rate": 2.6416e-07, "logits/chosen": 0.786376953125, "logits/rejected": 0.835742175579071, "logps/chosen": -248.2624969482422, "logps/rejected": -220.0500030517578, "loss": 0.6067, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.756542921066284, "rewards/margins": 3.1329588890075684, "rewards/rejected": -5.894629001617432, "step": 4600 }, { "epoch": 0.7376, "grad_norm": 183.90616639987144, "learning_rate": 2.6256e-07, "logits/chosen": 0.761645495891571, "logits/rejected": 0.8932129144668579, "logps/chosen": -233.03750610351562, "logps/rejected": -214.4375, "loss": 0.3645, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5531249046325684, "rewards/margins": 
3.8080077171325684, "rewards/rejected": -6.358007907867432, "step": 4610 }, { "epoch": 0.7392, "grad_norm": 127.40753801202388, "learning_rate": 2.6096000000000003e-07, "logits/chosen": NaN, "logits/rejected": 0.738207995891571, "logps/chosen": -263.42498779296875, "logps/rejected": -266.6499938964844, "loss": 0.524, "rewards/accuracies": 0.78125, "rewards/chosen": -2.328784227371216, "rewards/margins": 3.4569091796875, "rewards/rejected": -5.788671970367432, "step": 4620 }, { "epoch": 0.7408, "grad_norm": 122.37842012545053, "learning_rate": 2.5935999999999996e-07, "logits/chosen": 0.801684558391571, "logits/rejected": 0.9060424566268921, "logps/chosen": -244.4375, "logps/rejected": -223.8000030517578, "loss": 0.46, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.405517578125, "rewards/margins": 3.777636766433716, "rewards/rejected": -6.185937404632568, "step": 4630 }, { "epoch": 0.7424, "grad_norm": 88.24560277774144, "learning_rate": 2.5776e-07, "logits/chosen": 0.7765136957168579, "logits/rejected": 1.005712866783142, "logps/chosen": -269.48748779296875, "logps/rejected": -251.83749389648438, "loss": 0.4703, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7287964820861816, "rewards/margins": 3.3275389671325684, "rewards/rejected": -6.052734375, "step": 4640 }, { "epoch": 0.744, "grad_norm": 80.70400831033598, "learning_rate": 2.5616e-07, "logits/chosen": 0.6723998785018921, "logits/rejected": 0.8667968511581421, "logps/chosen": -238.77499389648438, "logps/rejected": -221.4375, "loss": 0.3906, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.334338426589966, "rewards/margins": 3.924853563308716, "rewards/rejected": -6.263671875, "step": 4650 }, { "epoch": 0.7456, "grad_norm": 96.59724069626816, "learning_rate": 2.5455999999999997e-07, "logits/chosen": 0.6678100824356079, "logits/rejected": 0.8526366949081421, "logps/chosen": -244.125, "logps/rejected": -228.1999969482422, "loss": 0.5305, "rewards/accuracies": 
0.768750011920929, "rewards/chosen": -2.985888719558716, "rewards/margins": 3.0103516578674316, "rewards/rejected": -5.993750095367432, "step": 4660 }, { "epoch": 0.7472, "grad_norm": 158.91756692470943, "learning_rate": 2.5296e-07, "logits/chosen": 0.7024170160293579, "logits/rejected": 0.7936035394668579, "logps/chosen": -239.91250610351562, "logps/rejected": -220.8874969482422, "loss": 0.5711, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1900391578674316, "rewards/margins": 2.956835985183716, "rewards/rejected": -6.149609565734863, "step": 4670 }, { "epoch": 0.7488, "grad_norm": 164.05108080877758, "learning_rate": 2.5135999999999994e-07, "logits/chosen": 0.70458984375, "logits/rejected": 0.8341614007949829, "logps/chosen": -272.23748779296875, "logps/rejected": -218.27499389648438, "loss": 0.5523, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.540722608566284, "rewards/margins": 3.704882860183716, "rewards/rejected": -6.2470703125, "step": 4680 }, { "epoch": 0.7504, "grad_norm": 118.1989788847542, "learning_rate": 2.4976e-07, "logits/chosen": 0.649768054485321, "logits/rejected": 0.765148937702179, "logps/chosen": -261.625, "logps/rejected": -224.75, "loss": 0.3288, "rewards/accuracies": 0.8125, "rewards/chosen": -2.22998046875, "rewards/margins": 4.038378715515137, "rewards/rejected": -6.271484375, "step": 4690 }, { "epoch": 0.752, "grad_norm": 148.82632997325845, "learning_rate": 2.4815999999999997e-07, "logits/chosen": 0.7658325433731079, "logits/rejected": 0.880627453327179, "logps/chosen": -238.9875030517578, "logps/rejected": -211.4499969482422, "loss": 0.41, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2547364234924316, "rewards/margins": 3.495312452316284, "rewards/rejected": -6.747656345367432, "step": 4700 }, { "epoch": 0.7536, "grad_norm": 127.78397625788263, "learning_rate": 2.4656e-07, "logits/chosen": 0.7449951171875, "logits/rejected": 0.8744872808456421, "logps/chosen": -255.8125, 
"logps/rejected": -226.8000030517578, "loss": 0.6671, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.837451219558716, "rewards/margins": 2.27734375, "rewards/rejected": -6.111718654632568, "step": 4710 }, { "epoch": 0.7552, "grad_norm": 107.00117598566146, "learning_rate": 2.4496e-07, "logits/chosen": 0.7197173833847046, "logits/rejected": 0.871630847454071, "logps/chosen": -252.9250030517578, "logps/rejected": -213.0124969482422, "loss": 0.4695, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.6993408203125, "rewards/margins": 3.100878953933716, "rewards/rejected": -6.798437595367432, "step": 4720 }, { "epoch": 0.7568, "grad_norm": 131.1199142409411, "learning_rate": 2.4336e-07, "logits/chosen": 0.6417388916015625, "logits/rejected": 0.806103527545929, "logps/chosen": -262.45001220703125, "logps/rejected": -231.66250610351562, "loss": 0.5316, "rewards/accuracies": 0.8125, "rewards/chosen": -3.778515577316284, "rewards/margins": 3.091601610183716, "rewards/rejected": -6.865234375, "step": 4730 }, { "epoch": 0.7584, "grad_norm": 55.664780991577985, "learning_rate": 2.4176e-07, "logits/chosen": NaN, "logits/rejected": 0.805126965045929, "logps/chosen": -252.3874969482422, "logps/rejected": -221.21249389648438, "loss": 0.6731, "rewards/accuracies": 0.8125, "rewards/chosen": -3.686279296875, "rewards/margins": 3.2767577171325684, "rewards/rejected": -6.962500095367432, "step": 4740 }, { "epoch": 0.76, "grad_norm": 101.86980052215877, "learning_rate": 2.4016e-07, "logits/chosen": 0.698077380657196, "logits/rejected": 0.7491455078125, "logps/chosen": -257.0375061035156, "logps/rejected": -195.875, "loss": 0.6586, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.641308546066284, "rewards/margins": 2.7938475608825684, "rewards/rejected": -6.430859565734863, "step": 4750 }, { "epoch": 0.7616, "grad_norm": 83.10404830151462, "learning_rate": 2.3856e-07, "logits/chosen": 0.820605456829071, "logits/rejected": 0.9024413824081421, 
"logps/chosen": -233.10000610351562, "logps/rejected": -230.96249389648438, "loss": 0.5417, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.567333936691284, "rewards/margins": 3.056835889816284, "rewards/rejected": -6.624316215515137, "step": 4760 }, { "epoch": 0.7632, "grad_norm": 105.94953521687607, "learning_rate": 2.3696e-07, "logits/chosen": 0.648193359375, "logits/rejected": 0.776611328125, "logps/chosen": -252.0124969482422, "logps/rejected": -216.14999389648438, "loss": 0.5168, "rewards/accuracies": 0.78125, "rewards/chosen": -3.344921827316284, "rewards/margins": 3.2854981422424316, "rewards/rejected": -6.62890625, "step": 4770 }, { "epoch": 0.7648, "grad_norm": 138.66065400947713, "learning_rate": 2.3536e-07, "logits/chosen": 0.7433716058731079, "logits/rejected": 0.9263550043106079, "logps/chosen": -255.2375030517578, "logps/rejected": -212.625, "loss": 0.4319, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6490235328674316, "rewards/margins": 3.5849609375, "rewards/rejected": -6.231640815734863, "step": 4780 }, { "epoch": 0.7664, "grad_norm": 41.47413025165156, "learning_rate": 2.3376e-07, "logits/chosen": NaN, "logits/rejected": 0.8062499761581421, "logps/chosen": -244.13125610351562, "logps/rejected": -443.5249938964844, "loss": 0.4948, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.221630811691284, "rewards/margins": 7.351660251617432, "rewards/rejected": -10.570703506469727, "step": 4790 }, { "epoch": 0.768, "grad_norm": 152.2003913963623, "learning_rate": 2.3215999999999998e-07, "logits/chosen": 0.83099365234375, "logits/rejected": 0.9326171875, "logps/chosen": -252.6875, "logps/rejected": -247.0749969482422, "loss": 0.5032, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7562499046325684, "rewards/margins": 3.3158202171325684, "rewards/rejected": -6.0693359375, "step": 4800 }, { "epoch": 0.7696, "grad_norm": 125.65263225586965, "learning_rate": 2.3055999999999997e-07, "logits/chosen": 
0.5713699460029602, "logits/rejected": 0.7340332269668579, "logps/chosen": -245.10000610351562, "logps/rejected": -224.27499389648438, "loss": 0.3523, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4559569358825684, "rewards/margins": 3.967529296875, "rewards/rejected": -6.420702934265137, "step": 4810 }, { "epoch": 0.7712, "grad_norm": 155.66622645513803, "learning_rate": 2.2895999999999998e-07, "logits/chosen": 0.635668933391571, "logits/rejected": 0.78656005859375, "logps/chosen": -279.4750061035156, "logps/rejected": -230.3125, "loss": 0.4136, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0514159202575684, "rewards/margins": 3.9376463890075684, "rewards/rejected": -5.9873046875, "step": 4820 }, { "epoch": 0.7728, "grad_norm": 68.73729840958894, "learning_rate": 2.2736e-07, "logits/chosen": 0.699450671672821, "logits/rejected": 0.7542358636856079, "logps/chosen": -252.5749969482422, "logps/rejected": -217.1999969482422, "loss": 0.4601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.045092821121216, "rewards/margins": 3.56982421875, "rewards/rejected": -6.609765529632568, "step": 4830 }, { "epoch": 0.7744, "grad_norm": 131.01884784582515, "learning_rate": 2.2575999999999998e-07, "logits/chosen": 0.7751830816268921, "logits/rejected": 0.985107421875, "logps/chosen": -233.7375030517578, "logps/rejected": -219.6374969482422, "loss": 0.4027, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.632702589035034, "rewards/margins": 3.522656202316284, "rewards/rejected": -6.154296875, "step": 4840 }, { "epoch": 0.776, "grad_norm": 106.78593622031735, "learning_rate": 2.2416e-07, "logits/chosen": 0.63714599609375, "logits/rejected": 0.800567626953125, "logps/chosen": -232.6750030517578, "logps/rejected": -220.77499389648438, "loss": 0.6256, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.403979539871216, "rewards/margins": 3.221484422683716, "rewards/rejected": -6.630078315734863, "step": 4850 }, { "epoch": 
0.7776, "grad_norm": 103.48253657495688, "learning_rate": 2.2256e-07, "logits/chosen": 0.7359374761581421, "logits/rejected": 0.8999999761581421, "logps/chosen": -238.28750610351562, "logps/rejected": -216.8000030517578, "loss": 0.4034, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.231738328933716, "rewards/margins": 3.225781202316284, "rewards/rejected": -6.45703125, "step": 4860 }, { "epoch": 0.7792, "grad_norm": 146.1142690642875, "learning_rate": 2.2096e-07, "logits/chosen": 0.7311767339706421, "logits/rejected": 0.890942394733429, "logps/chosen": -251.33749389648438, "logps/rejected": -219.3249969482422, "loss": 0.4652, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.322314500808716, "rewards/margins": 3.3470702171325684, "rewards/rejected": -6.666796684265137, "step": 4870 }, { "epoch": 0.7808, "grad_norm": 110.48268460856758, "learning_rate": 2.1936e-07, "logits/chosen": 0.7849365472793579, "logits/rejected": 0.905322253704071, "logps/chosen": -228.77499389648438, "logps/rejected": -201.91250610351562, "loss": 0.5198, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.48583984375, "rewards/margins": 3.0111327171325684, "rewards/rejected": -6.49609375, "step": 4880 }, { "epoch": 0.7824, "grad_norm": 43.43963559916475, "learning_rate": 2.1776e-07, "logits/chosen": 0.617932140827179, "logits/rejected": 0.781542956829071, "logps/chosen": -252.33749389648438, "logps/rejected": -232.8874969482422, "loss": 0.3366, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8257813453674316, "rewards/margins": 3.8993163108825684, "rewards/rejected": -6.723437309265137, "step": 4890 }, { "epoch": 0.784, "grad_norm": 66.29354367776646, "learning_rate": 2.1615999999999997e-07, "logits/chosen": 0.682543933391571, "logits/rejected": 0.8688720464706421, "logps/chosen": -280.3374938964844, "logps/rejected": -247.33749389648438, "loss": 0.651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8091187477111816, 
"rewards/margins": 3.3681640625, "rewards/rejected": -6.176562309265137, "step": 4900 }, { "epoch": 0.7856, "grad_norm": 120.5151734140636, "learning_rate": 2.1455999999999998e-07, "logits/chosen": 0.712890625, "logits/rejected": 0.865185558795929, "logps/chosen": -238.78750610351562, "logps/rejected": -230.1875, "loss": 0.4593, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.29638671875, "rewards/margins": 3.1636719703674316, "rewards/rejected": -6.460156440734863, "step": 4910 }, { "epoch": 0.7872, "grad_norm": 323.4602112245534, "learning_rate": 2.1296e-07, "logits/chosen": 0.7481689453125, "logits/rejected": 0.8528198003768921, "logps/chosen": -279.63751220703125, "logps/rejected": -235.0625, "loss": 0.6346, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.489990234375, "rewards/margins": 3.435839891433716, "rewards/rejected": -5.927343845367432, "step": 4920 }, { "epoch": 0.7888, "grad_norm": 180.3626912521013, "learning_rate": 2.1135999999999998e-07, "logits/chosen": 0.691149890422821, "logits/rejected": 0.7875732183456421, "logps/chosen": -269.3999938964844, "logps/rejected": -239.8874969482422, "loss": 0.4416, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7798094749450684, "rewards/margins": 3.5360350608825684, "rewards/rejected": -6.319921970367432, "step": 4930 }, { "epoch": 0.7904, "grad_norm": 164.0951947172822, "learning_rate": 2.0976e-07, "logits/chosen": 0.660296618938446, "logits/rejected": 0.828417956829071, "logps/chosen": -227.14999389648438, "logps/rejected": -207.41250610351562, "loss": 0.4387, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9486327171325684, "rewards/margins": 2.9745116233825684, "rewards/rejected": -6.9228515625, "step": 4940 }, { "epoch": 0.792, "grad_norm": 191.23239530604707, "learning_rate": 2.0816e-07, "logits/chosen": 0.686083972454071, "logits/rejected": 0.8687499761581421, "logps/chosen": -240.5124969482422, "logps/rejected": -221.3249969482422, "loss": 0.6319, 
"rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.768969774246216, "rewards/margins": 3.1958985328674316, "rewards/rejected": -5.965039253234863, "step": 4950 }, { "epoch": 0.7936, "grad_norm": 113.03234432843766, "learning_rate": 2.0656e-07, "logits/chosen": 0.792431652545929, "logits/rejected": 0.8931640386581421, "logps/chosen": -253.125, "logps/rejected": -250.5749969482422, "loss": 0.4751, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2921385765075684, "rewards/margins": 3.1460938453674316, "rewards/rejected": -6.440625190734863, "step": 4960 }, { "epoch": 0.7952, "grad_norm": 159.1741780306477, "learning_rate": 2.0496e-07, "logits/chosen": 0.7822510004043579, "logits/rejected": 0.87158203125, "logps/chosen": -236.9499969482422, "logps/rejected": -224.52499389648438, "loss": 0.3965, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1451172828674316, "rewards/margins": 3.7701172828674316, "rewards/rejected": -6.915625095367432, "step": 4970 }, { "epoch": 0.7968, "grad_norm": 101.96924834755038, "learning_rate": 2.0336000000000002e-07, "logits/chosen": 0.7122802734375, "logits/rejected": 0.866381824016571, "logps/chosen": -268.3500061035156, "logps/rejected": -228.28750610351562, "loss": 0.3821, "rewards/accuracies": 0.8125, "rewards/chosen": -2.845141649246216, "rewards/margins": 3.633984327316284, "rewards/rejected": -6.4765625, "step": 4980 }, { "epoch": 0.7984, "grad_norm": 109.7030246318179, "learning_rate": 2.0175999999999997e-07, "logits/chosen": 0.656481921672821, "logits/rejected": 0.794677734375, "logps/chosen": -240.9875030517578, "logps/rejected": -230.8625030517578, "loss": 0.5533, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.588671922683716, "rewards/margins": 2.927050828933716, "rewards/rejected": -6.513671875, "step": 4990 }, { "epoch": 0.8, "grad_norm": 697.4971026815771, "learning_rate": 2.0016e-07, "logits/chosen": 0.7895568609237671, "logits/rejected": 0.9032958745956421, 
"logps/chosen": -257.79998779296875, "logps/rejected": -255.0625, "loss": 0.5188, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9630370140075684, "rewards/margins": 3.595263719558716, "rewards/rejected": -6.558203220367432, "step": 5000 }, { "epoch": 0.8016, "grad_norm": 70.41092183262872, "learning_rate": 1.9855999999999997e-07, "logits/chosen": 0.688854992389679, "logits/rejected": 0.878125011920929, "logps/chosen": -234.8125, "logps/rejected": -220.5749969482422, "loss": 0.4585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2562499046325684, "rewards/margins": 3.30810546875, "rewards/rejected": -6.563867092132568, "step": 5010 }, { "epoch": 0.8032, "grad_norm": 92.00812000799544, "learning_rate": 1.9695999999999998e-07, "logits/chosen": 0.731213390827179, "logits/rejected": 0.912158191204071, "logps/chosen": -231.2624969482422, "logps/rejected": -205.58749389648438, "loss": 0.6209, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.106250047683716, "rewards/margins": 2.326855421066284, "rewards/rejected": -5.4375, "step": 5020 }, { "epoch": 0.8048, "grad_norm": 109.51976540064136, "learning_rate": 1.9536e-07, "logits/chosen": 0.7126709222793579, "logits/rejected": 0.8275390863418579, "logps/chosen": -258.70001220703125, "logps/rejected": -232.6999969482422, "loss": 0.3862, "rewards/accuracies": 0.84375, "rewards/chosen": -3.023364305496216, "rewards/margins": 3.7113280296325684, "rewards/rejected": -6.734570503234863, "step": 5030 }, { "epoch": 0.8064, "grad_norm": 62.87373385560829, "learning_rate": 1.9375999999999998e-07, "logits/chosen": 0.784069836139679, "logits/rejected": 0.89990234375, "logps/chosen": -235.27499389648438, "logps/rejected": -207.1750030517578, "loss": 0.3983, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1298828125, "rewards/margins": 2.9129395484924316, "rewards/rejected": -6.041015625, "step": 5040 }, { "epoch": 0.808, "grad_norm": 48.474942013889944, "learning_rate": 
1.9216e-07, "logits/chosen": 0.7084716558456421, "logits/rejected": 0.92889404296875, "logps/chosen": -228.6999969482422, "logps/rejected": -214.66250610351562, "loss": 0.4192, "rewards/accuracies": 0.78125, "rewards/chosen": -3.402636766433716, "rewards/margins": 2.93505859375, "rewards/rejected": -6.338671684265137, "step": 5050 }, { "epoch": 0.8096, "grad_norm": 143.32837810616394, "learning_rate": 1.9056e-07, "logits/chosen": 0.722332775592804, "logits/rejected": 0.927490234375, "logps/chosen": -261.7124938964844, "logps/rejected": -206.89999389648438, "loss": 0.4928, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.786032199859619, "rewards/margins": 3.309497117996216, "rewards/rejected": -6.097265720367432, "step": 5060 }, { "epoch": 0.8112, "grad_norm": 112.25717927784372, "learning_rate": 1.8896e-07, "logits/chosen": 0.7779083251953125, "logits/rejected": 0.9240051507949829, "logps/chosen": -234.15625, "logps/rejected": -217.0625, "loss": 0.6402, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.672473192214966, "rewards/margins": 2.814526319503784, "rewards/rejected": -5.482226371765137, "step": 5070 }, { "epoch": 0.8128, "grad_norm": 161.78295674473077, "learning_rate": 1.8735999999999998e-07, "logits/chosen": 0.679492175579071, "logits/rejected": 0.9078124761581421, "logps/chosen": -235.91250610351562, "logps/rejected": -221.08749389648438, "loss": 0.3998, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.149487257003784, "rewards/margins": 3.83544921875, "rewards/rejected": -5.979687690734863, "step": 5080 }, { "epoch": 0.8144, "grad_norm": 37.54688131873625, "learning_rate": 1.8576e-07, "logits/chosen": 0.7539459466934204, "logits/rejected": 0.8429840207099915, "logps/chosen": -249.8249969482422, "logps/rejected": -235.3000030517578, "loss": 0.5133, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5462403297424316, "rewards/margins": 2.929003953933716, "rewards/rejected": -5.480370998382568, "step": 
5090 }, { "epoch": 0.816, "grad_norm": 85.10881611317315, "learning_rate": 1.8415999999999998e-07, "logits/chosen": 0.6759277582168579, "logits/rejected": 0.867504894733429, "logps/chosen": -246.9875030517578, "logps/rejected": -222.4499969482422, "loss": 0.4213, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8863282203674316, "rewards/margins": 3.5225586891174316, "rewards/rejected": -6.403906345367432, "step": 5100 }, { "epoch": 0.8176, "grad_norm": 136.9701522580087, "learning_rate": 1.8256e-07, "logits/chosen": null, "logits/rejected": 0.931884765625, "logps/chosen": -249.35000610351562, "logps/rejected": -227.1125030517578, "loss": 0.4587, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.731396436691284, "rewards/margins": 3.1668944358825684, "rewards/rejected": -5.899609565734863, "step": 5110 }, { "epoch": 0.8192, "grad_norm": 98.37438226160965, "learning_rate": 1.8096e-07, "logits/chosen": 0.806713879108429, "logits/rejected": 1.0500609874725342, "logps/chosen": -255.83749389648438, "logps/rejected": -224.2375030517578, "loss": 0.5877, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0142579078674316, "rewards/margins": 2.91796875, "rewards/rejected": -5.934765815734863, "step": 5120 }, { "epoch": 0.8208, "grad_norm": 38.765843744358165, "learning_rate": 1.7935999999999999e-07, "logits/chosen": 0.7550018429756165, "logits/rejected": 0.98046875, "logps/chosen": -261.0, "logps/rejected": -235.91250610351562, "loss": 0.408, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.8628907203674316, "rewards/margins": 3.441113233566284, "rewards/rejected": -6.302734375, "step": 5130 }, { "epoch": 0.8224, "grad_norm": 163.3856164231553, "learning_rate": 1.7776e-07, "logits/chosen": 0.7954956293106079, "logits/rejected": 0.9321655035018921, "logps/chosen": -246.8125, "logps/rejected": -233.8125, "loss": 0.3267, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.816943407058716, "rewards/margins": 
3.567919969558716, "rewards/rejected": -6.384375095367432, "step": 5140 }, { "epoch": 0.824, "grad_norm": 143.78550484674562, "learning_rate": 1.7616e-07, "logits/chosen": 0.9021545648574829, "logits/rejected": 1.045507788658142, "logps/chosen": -223.64999389648438, "logps/rejected": -206.91250610351562, "loss": 0.5122, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.003613233566284, "rewards/margins": 2.9131836891174316, "rewards/rejected": -5.914843559265137, "step": 5150 }, { "epoch": 0.8256, "grad_norm": 141.94433718846628, "learning_rate": 1.7456e-07, "logits/chosen": 0.754730224609375, "logits/rejected": 0.8282470703125, "logps/chosen": -247.89999389648438, "logps/rejected": -235.9250030517578, "loss": 0.5714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7900390625, "rewards/margins": 2.713671922683716, "rewards/rejected": -5.501757621765137, "step": 5160 }, { "epoch": 0.8272, "grad_norm": 42.55683464894449, "learning_rate": 1.7295999999999998e-07, "logits/chosen": 0.848034679889679, "logits/rejected": 1.042333960533142, "logps/chosen": -232.14999389648438, "logps/rejected": -216.4375, "loss": 0.413, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5018310546875, "rewards/margins": 3.1966552734375, "rewards/rejected": -5.697461128234863, "step": 5170 }, { "epoch": 0.8288, "grad_norm": 177.37535932379083, "learning_rate": 1.7136e-07, "logits/chosen": 0.751239001750946, "logits/rejected": 0.8775879144668579, "logps/chosen": -243.97500610351562, "logps/rejected": -199.0124969482422, "loss": 0.4325, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7982909679412842, "rewards/margins": 3.2360472679138184, "rewards/rejected": -5.032324314117432, "step": 5180 }, { "epoch": 0.8304, "grad_norm": 44.28937039816328, "learning_rate": 1.6975999999999998e-07, "logits/chosen": 0.6136474609375, "logits/rejected": 0.7484496831893921, "logps/chosen": -249.91250610351562, "logps/rejected": -219.72500610351562, "loss": 
0.3991, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.685510277748108, "rewards/margins": 3.694628953933716, "rewards/rejected": -5.384961128234863, "step": 5190 }, { "epoch": 0.832, "grad_norm": 55.59704275676949, "learning_rate": 1.6816e-07, "logits/chosen": 0.6494385004043579, "logits/rejected": 0.836108386516571, "logps/chosen": -253.375, "logps/rejected": -231.33749389648438, "loss": 0.4131, "rewards/accuracies": 0.84375, "rewards/chosen": -2.249096632003784, "rewards/margins": 3.779296875, "rewards/rejected": -6.026562690734863, "step": 5200 }, { "epoch": 0.8336, "grad_norm": 45.717558733801, "learning_rate": 1.6656e-07, "logits/chosen": 0.799487292766571, "logits/rejected": 0.916430652141571, "logps/chosen": -231.125, "logps/rejected": -193.9375, "loss": 0.6229, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6874022483825684, "rewards/margins": 3.071484327316284, "rewards/rejected": -5.755859375, "step": 5210 }, { "epoch": 0.8352, "grad_norm": 99.13729319084904, "learning_rate": 1.6496e-07, "logits/chosen": 0.7907959222793579, "logits/rejected": 0.855114758014679, "logps/chosen": -273.29998779296875, "logps/rejected": -245.77499389648438, "loss": 0.552, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5616698265075684, "rewards/margins": 3.612597703933716, "rewards/rejected": -6.1767578125, "step": 5220 }, { "epoch": 0.8368, "grad_norm": 54.6166799871322, "learning_rate": 1.6336e-07, "logits/chosen": 0.813281238079071, "logits/rejected": 0.824414074420929, "logps/chosen": -265.4750061035156, "logps/rejected": -235.22500610351562, "loss": 0.5381, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.1238036155700684, "rewards/margins": 2.955029249191284, "rewards/rejected": -6.074609279632568, "step": 5230 }, { "epoch": 0.8384, "grad_norm": 83.40123221219854, "learning_rate": 1.6176e-07, "logits/chosen": 0.7939087152481079, "logits/rejected": 0.877124011516571, "logps/chosen": -270.0, "logps/rejected": 
-242.8000030517578, "loss": 0.5537, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2626953125, "rewards/margins": 3.5374999046325684, "rewards/rejected": -5.798242092132568, "step": 5240 }, { "epoch": 0.84, "grad_norm": 121.26210701265896, "learning_rate": 1.6016e-07, "logits/chosen": 0.7676757574081421, "logits/rejected": 0.892871081829071, "logps/chosen": -246.0, "logps/rejected": -239.4875030517578, "loss": 0.5045, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2921385765075684, "rewards/margins": 3.3136229515075684, "rewards/rejected": -5.607812404632568, "step": 5250 }, { "epoch": 0.8416, "grad_norm": 71.85394562532795, "learning_rate": 1.5856e-07, "logits/chosen": 0.766430675983429, "logits/rejected": 0.854565441608429, "logps/chosen": -223.9250030517578, "logps/rejected": -206.0124969482422, "loss": 0.4114, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.615771532058716, "rewards/margins": 3.330371141433716, "rewards/rejected": -5.942968845367432, "step": 5260 }, { "epoch": 0.8432, "grad_norm": 164.73863159114947, "learning_rate": 1.5695999999999997e-07, "logits/chosen": 0.6207641363143921, "logits/rejected": null, "logps/chosen": -241.7624969482422, "logps/rejected": -208.625, "loss": 0.5082, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8675780296325684, "rewards/margins": 3.3856444358825684, "rewards/rejected": -6.253125190734863, "step": 5270 }, { "epoch": 0.8448, "grad_norm": 96.21155125559778, "learning_rate": 1.5535999999999998e-07, "logits/chosen": 0.6164825558662415, "logits/rejected": 0.8075805902481079, "logps/chosen": -254.1125030517578, "logps/rejected": -252.6750030517578, "loss": 0.4718, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4251465797424316, "rewards/margins": 3.731738328933716, "rewards/rejected": -6.157031059265137, "step": 5280 }, { "epoch": 0.8464, "grad_norm": 179.85386119389923, "learning_rate": 1.5376e-07, "logits/chosen": 0.8577026128768921, 
"logits/rejected": 0.951367199420929, "logps/chosen": -244.03750610351562, "logps/rejected": -257.7124938964844, "loss": 0.4776, "rewards/accuracies": 0.78125, "rewards/chosen": -3.025805711746216, "rewards/margins": 3.901171922683716, "rewards/rejected": -6.927734375, "step": 5290 }, { "epoch": 0.848, "grad_norm": 141.56374742650934, "learning_rate": 1.5215999999999998e-07, "logits/chosen": 0.649975597858429, "logits/rejected": 0.8560546636581421, "logps/chosen": -250.125, "logps/rejected": -229.0500030517578, "loss": 0.4731, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3070311546325684, "rewards/margins": 3.3587889671325684, "rewards/rejected": -6.666455268859863, "step": 5300 }, { "epoch": 0.8496, "grad_norm": 69.5494248221738, "learning_rate": 1.5056e-07, "logits/chosen": 0.7088378667831421, "logits/rejected": 0.863232433795929, "logps/chosen": -240.52499389648438, "logps/rejected": -203.46249389648438, "loss": 0.3328, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4907469749450684, "rewards/margins": 3.548632860183716, "rewards/rejected": -6.036328315734863, "step": 5310 }, { "epoch": 0.8512, "grad_norm": 151.26668637913838, "learning_rate": 1.4896e-07, "logits/chosen": 0.697216808795929, "logits/rejected": 0.8481811285018921, "logps/chosen": -242.1374969482422, "logps/rejected": -222.9375, "loss": 0.4897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.682177782058716, "rewards/margins": 3.003369092941284, "rewards/rejected": -5.686425685882568, "step": 5320 }, { "epoch": 0.8528, "grad_norm": 211.86565532542676, "learning_rate": 1.4736e-07, "logits/chosen": 0.638842761516571, "logits/rejected": null, "logps/chosen": -242.8249969482422, "logps/rejected": -212.6875, "loss": 0.4992, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.530810594558716, "rewards/margins": 3.301220655441284, "rewards/rejected": -5.831250190734863, "step": 5330 }, { "epoch": 0.8544, "grad_norm": 82.8233021268761, "learning_rate": 
1.4576e-07, "logits/chosen": null, "logits/rejected": 0.985595703125, "logps/chosen": -251.1999969482422, "logps/rejected": -235.125, "loss": 0.4004, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.201489210128784, "rewards/margins": 3.194628953933716, "rewards/rejected": -5.396484375, "step": 5340 }, { "epoch": 0.856, "grad_norm": 80.4206497677409, "learning_rate": 1.4416000000000002e-07, "logits/chosen": 0.8599609136581421, "logits/rejected": 0.879150390625, "logps/chosen": -251.75, "logps/rejected": -216.1374969482422, "loss": 0.4287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3568358421325684, "rewards/margins": 3.54736328125, "rewards/rejected": -5.903515815734863, "step": 5350 }, { "epoch": 0.8576, "grad_norm": 128.0455863879162, "learning_rate": 1.4255999999999997e-07, "logits/chosen": 0.706225574016571, "logits/rejected": 0.7742919921875, "logps/chosen": -248.60000610351562, "logps/rejected": -215.3249969482422, "loss": 0.5089, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.57855224609375, "rewards/margins": 3.2099609375, "rewards/rejected": -5.790429592132568, "step": 5360 }, { "epoch": 0.8592, "grad_norm": 109.96553588812276, "learning_rate": 1.4095999999999999e-07, "logits/chosen": 0.783374011516571, "logits/rejected": 0.981701672077179, "logps/chosen": -249.3625030517578, "logps/rejected": -229.1125030517578, "loss": 0.3998, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3104491233825684, "rewards/margins": 3.634472608566284, "rewards/rejected": -5.945703029632568, "step": 5370 }, { "epoch": 0.8608, "grad_norm": 112.2424572588941, "learning_rate": 1.3936e-07, "logits/chosen": 0.753002941608429, "logits/rejected": 0.8773437738418579, "logps/chosen": -244.3249969482422, "logps/rejected": -223.5500030517578, "loss": 0.5409, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0920166969299316, "rewards/margins": 2.710742235183716, "rewards/rejected": -5.799609184265137, "step": 5380 }, 
{ "epoch": 0.8624, "grad_norm": 162.05546456020252, "learning_rate": 1.3775999999999998e-07, "logits/chosen": 0.7800658941268921, "logits/rejected": 0.8680175542831421, "logps/chosen": -266.79998779296875, "logps/rejected": -213.5625, "loss": 0.5253, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2705321311950684, "rewards/margins": 3.1922850608825684, "rewards/rejected": -5.4609375, "step": 5390 }, { "epoch": 0.864, "grad_norm": 83.1775762816067, "learning_rate": 1.3616e-07, "logits/chosen": 0.7473510503768921, "logits/rejected": 0.9189208745956421, "logps/chosen": -241.41250610351562, "logps/rejected": -241.125, "loss": 0.5046, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.2747435569763184, "rewards/margins": 3.407470703125, "rewards/rejected": -5.68359375, "step": 5400 }, { "epoch": 0.8656, "grad_norm": 135.25573859262502, "learning_rate": 1.3456e-07, "logits/chosen": 0.777026355266571, "logits/rejected": 0.9043213129043579, "logps/chosen": -228.64999389648438, "logps/rejected": -202.5, "loss": 0.5931, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5813536643981934, "rewards/margins": 2.919970750808716, "rewards/rejected": -5.499609470367432, "step": 5410 }, { "epoch": 0.8672, "grad_norm": 126.19031673210073, "learning_rate": 1.3296e-07, "logits/chosen": 0.7387939691543579, "logits/rejected": 0.936999499797821, "logps/chosen": -242.0749969482422, "logps/rejected": -242.7375030517578, "loss": 0.4696, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.781146287918091, "rewards/margins": 3.278759717941284, "rewards/rejected": -6.058203220367432, "step": 5420 }, { "epoch": 0.8688, "grad_norm": 27.626882328914228, "learning_rate": 1.3136e-07, "logits/chosen": 0.7903076410293579, "logits/rejected": 0.9297851324081421, "logps/chosen": -248.3249969482422, "logps/rejected": -226.2624969482422, "loss": 0.4751, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6717286109924316, "rewards/margins": 2.826171875, 
"rewards/rejected": -5.498046875, "step": 5430 }, { "epoch": 0.8704, "grad_norm": 107.80354169954688, "learning_rate": 1.2976e-07, "logits/chosen": 0.7203124761581421, "logits/rejected": 0.890887439250946, "logps/chosen": -245.9499969482422, "logps/rejected": -215.1125030517578, "loss": 0.3859, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.051013231277466, "rewards/margins": 3.368847608566284, "rewards/rejected": -5.418749809265137, "step": 5440 }, { "epoch": 0.872, "grad_norm": 136.36213671599293, "learning_rate": 1.2815999999999998e-07, "logits/chosen": 0.82275390625, "logits/rejected": 0.950976550579071, "logps/chosen": -248.3249969482422, "logps/rejected": -228.3874969482422, "loss": 0.5454, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5074706077575684, "rewards/margins": 3.3380370140075684, "rewards/rejected": -5.840087890625, "step": 5450 }, { "epoch": 0.8736, "grad_norm": 96.94391111425666, "learning_rate": 1.2656e-07, "logits/chosen": 0.740966796875, "logits/rejected": 0.8712402582168579, "logps/chosen": -219.7375030517578, "logps/rejected": -192.28750610351562, "loss": 0.5422, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.267871141433716, "rewards/margins": 2.789379835128784, "rewards/rejected": -5.057226657867432, "step": 5460 }, { "epoch": 0.8752, "grad_norm": 145.5511407455464, "learning_rate": 1.2496e-07, "logits/chosen": 0.8241943120956421, "logits/rejected": 0.931774914264679, "logps/chosen": -247.53750610351562, "logps/rejected": -216.5500030517578, "loss": 0.5187, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7843506336212158, "rewards/margins": 3.2958006858825684, "rewards/rejected": -5.07861328125, "step": 5470 }, { "epoch": 0.8768, "grad_norm": 93.39994322664981, "learning_rate": 1.2336e-07, "logits/chosen": 0.675854504108429, "logits/rejected": 0.796679675579071, "logps/chosen": -242.2624969482422, "logps/rejected": -217.83749389648438, "loss": 0.3416, "rewards/accuracies": 
0.8374999761581421, "rewards/chosen": -1.560815453529358, "rewards/margins": 4.175195217132568, "rewards/rejected": -5.737500190734863, "step": 5480 }, { "epoch": 0.8784, "grad_norm": 86.10862620511, "learning_rate": 1.2176e-07, "logits/chosen": 0.7316650152206421, "logits/rejected": 0.8541015386581421, "logps/chosen": -281.5625, "logps/rejected": -208.1125030517578, "loss": 0.4654, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6822509765625, "rewards/margins": 3.6797852516174316, "rewards/rejected": -5.364306449890137, "step": 5490 }, { "epoch": 0.88, "grad_norm": 74.23543138366509, "learning_rate": 1.2015999999999999e-07, "logits/chosen": 0.768414318561554, "logits/rejected": 0.794873058795929, "logps/chosen": -252.4875030517578, "logps/rejected": -219.7624969482422, "loss": 0.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9785645008087158, "rewards/margins": 3.438525438308716, "rewards/rejected": -5.417187690734863, "step": 5500 }, { "epoch": 0.8816, "grad_norm": 66.38480173065068, "learning_rate": 1.1856e-07, "logits/chosen": 0.784533679485321, "logits/rejected": 0.938305675983429, "logps/chosen": -228.2624969482422, "logps/rejected": -208.375, "loss": 0.4813, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2565550804138184, "rewards/margins": 2.909960985183716, "rewards/rejected": -5.163476467132568, "step": 5510 }, { "epoch": 0.8832, "grad_norm": 180.52903288169463, "learning_rate": 1.1695999999999998e-07, "logits/chosen": 0.691577136516571, "logits/rejected": 0.834545910358429, "logps/chosen": -227.97500610351562, "logps/rejected": -217.28750610351562, "loss": 0.7437, "rewards/accuracies": 0.75, "rewards/chosen": -2.075634717941284, "rewards/margins": 2.710192918777466, "rewards/rejected": -4.785742282867432, "step": 5520 }, { "epoch": 0.8848, "grad_norm": 178.78442418802155, "learning_rate": 1.1536e-07, "logits/chosen": 0.781933605670929, "logits/rejected": 0.952563464641571, "logps/chosen": 
-258.88751220703125, "logps/rejected": -231.39999389648438, "loss": 0.5367, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.854882836341858, "rewards/margins": 3.8470702171325684, "rewards/rejected": -5.700781345367432, "step": 5530 }, { "epoch": 0.8864, "grad_norm": 143.83293762785786, "learning_rate": 1.1376e-07, "logits/chosen": 0.707611083984375, "logits/rejected": 0.920135498046875, "logps/chosen": -245.77499389648438, "logps/rejected": -217.0124969482422, "loss": 0.2936, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3772461414337158, "rewards/margins": 3.9073729515075684, "rewards/rejected": -5.287890434265137, "step": 5540 }, { "epoch": 0.888, "grad_norm": 166.5956653724222, "learning_rate": 1.1215999999999999e-07, "logits/chosen": 0.725781261920929, "logits/rejected": 0.9873291254043579, "logps/chosen": -257.25, "logps/rejected": -228.33749389648438, "loss": 0.5208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9843261241912842, "rewards/margins": 3.356738328933716, "rewards/rejected": -5.340234279632568, "step": 5550 }, { "epoch": 0.8896, "grad_norm": 20.016545223911166, "learning_rate": 1.1056e-07, "logits/chosen": 0.715014636516571, "logits/rejected": 0.8681640625, "logps/chosen": -246.4250030517578, "logps/rejected": -230.8000030517578, "loss": 0.4851, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4799072742462158, "rewards/margins": 3.624706983566284, "rewards/rejected": -5.107617378234863, "step": 5560 }, { "epoch": 0.8912, "grad_norm": 102.65427490185255, "learning_rate": 1.0895999999999999e-07, "logits/chosen": null, "logits/rejected": 0.80029296875, "logps/chosen": -278.1625061035156, "logps/rejected": -235.22500610351562, "loss": 0.4981, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.650781273841858, "rewards/margins": 3.5921387672424316, "rewards/rejected": -5.2431640625, "step": 5570 }, { "epoch": 0.8928, "grad_norm": 108.50468689202776, "learning_rate": 
1.0735999999999999e-07, "logits/chosen": 0.7050536870956421, "logits/rejected": 0.794506847858429, "logps/chosen": -260.25, "logps/rejected": -233.60000610351562, "loss": 0.4687, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.044726610183716, "rewards/margins": 3.23779296875, "rewards/rejected": -5.278710842132568, "step": 5580 }, { "epoch": 0.8944, "grad_norm": 75.84279674693725, "learning_rate": 1.0576e-07, "logits/chosen": 0.652722179889679, "logits/rejected": 0.838854968547821, "logps/chosen": -264.75, "logps/rejected": -235.02499389648438, "loss": 0.3568, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.618737816810608, "rewards/margins": 4.0400390625, "rewards/rejected": -5.658203125, "step": 5590 }, { "epoch": 0.896, "grad_norm": 109.9749976130802, "learning_rate": 1.0416e-07, "logits/chosen": 0.7532104253768921, "logits/rejected": 0.835705578327179, "logps/chosen": -219.6750030517578, "logps/rejected": -193.77499389648438, "loss": 0.5026, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.100659132003784, "rewards/margins": 3.046154737472534, "rewards/rejected": -5.142968654632568, "step": 5600 }, { "epoch": 0.8976, "grad_norm": 234.84255227766377, "learning_rate": 1.0256e-07, "logits/chosen": 0.7079101800918579, "logits/rejected": 0.8302246332168579, "logps/chosen": -276.9125061035156, "logps/rejected": -246.89999389648438, "loss": 0.5655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.948583960533142, "rewards/margins": 3.451171875, "rewards/rejected": -5.398828029632568, "step": 5610 }, { "epoch": 0.8992, "grad_norm": 82.70187292998422, "learning_rate": 1.0095999999999999e-07, "logits/chosen": 0.6893676519393921, "logits/rejected": 0.77386474609375, "logps/chosen": -249.9875030517578, "logps/rejected": -222.89999389648438, "loss": 0.3706, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.577551245689392, "rewards/margins": 4.083593845367432, "rewards/rejected": -5.661913871765137, 
"step": 5620 }, { "epoch": 0.9008, "grad_norm": 165.53524572940069, "learning_rate": 9.936e-08, "logits/chosen": 0.84814453125, "logits/rejected": 0.897265613079071, "logps/chosen": -242.1999969482422, "logps/rejected": -217.10000610351562, "loss": 0.4852, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5575499534606934, "rewards/margins": 3.474902391433716, "rewards/rejected": -6.033203125, "step": 5630 }, { "epoch": 0.9024, "grad_norm": 132.59825794106712, "learning_rate": 9.776e-08, "logits/chosen": 0.8019775152206421, "logits/rejected": 0.898510754108429, "logps/chosen": -260.6625061035156, "logps/rejected": -226.4499969482422, "loss": 0.4865, "rewards/accuracies": 0.8125, "rewards/chosen": -2.47119140625, "rewards/margins": 3.3932127952575684, "rewards/rejected": -5.86328125, "step": 5640 }, { "epoch": 0.904, "grad_norm": 127.37879365690272, "learning_rate": 9.616e-08, "logits/chosen": 0.738696277141571, "logits/rejected": 0.9205322265625, "logps/chosen": -247.14999389648438, "logps/rejected": -204.85000610351562, "loss": 0.4221, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.4730467796325684, "rewards/margins": 3.4283204078674316, "rewards/rejected": -5.901171684265137, "step": 5650 }, { "epoch": 0.9056, "grad_norm": 172.6907429725849, "learning_rate": 9.456e-08, "logits/chosen": 0.7578490972518921, "logits/rejected": 0.8451293706893921, "logps/chosen": -230.2624969482422, "logps/rejected": -216.89999389648438, "loss": 0.4325, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.45220947265625, "rewards/margins": 3.201122999191284, "rewards/rejected": -5.652148246765137, "step": 5660 }, { "epoch": 0.9072, "grad_norm": 217.5496731117547, "learning_rate": 9.295999999999999e-08, "logits/chosen": 0.774487316608429, "logits/rejected": 0.9878174066543579, "logps/chosen": -259.23748779296875, "logps/rejected": -232.46249389648438, "loss": 0.4107, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.703588843345642, 
"rewards/margins": 3.550488233566284, "rewards/rejected": -5.253515720367432, "step": 5670 }, { "epoch": 0.9088, "grad_norm": 129.51358778972963, "learning_rate": 9.135999999999999e-08, "logits/chosen": 0.662841796875, "logits/rejected": 0.8354736566543579, "logps/chosen": -239.8249969482422, "logps/rejected": -239.8125, "loss": 0.3904, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.046875, "rewards/margins": 4.084424018859863, "rewards/rejected": -6.126562595367432, "step": 5680 }, { "epoch": 0.9104, "grad_norm": 63.84073598512455, "learning_rate": 8.976e-08, "logits/chosen": 0.7355057001113892, "logits/rejected": 0.818713366985321, "logps/chosen": -245.64999389648438, "logps/rejected": -214.97500610351562, "loss": 0.4136, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.371630907058716, "rewards/margins": 3.0418944358825684, "rewards/rejected": -5.414843559265137, "step": 5690 }, { "epoch": 0.912, "grad_norm": 117.00106730745055, "learning_rate": 8.816e-08, "logits/chosen": 0.8664306402206421, "logits/rejected": 0.861254870891571, "logps/chosen": -236.375, "logps/rejected": -222.21249389648438, "loss": 0.6877, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.4551758766174316, "rewards/margins": 2.788378953933716, "rewards/rejected": -5.244433403015137, "step": 5700 }, { "epoch": 0.9136, "grad_norm": 125.5312296853126, "learning_rate": 8.655999999999999e-08, "logits/chosen": 0.831860363483429, "logits/rejected": 0.940704345703125, "logps/chosen": -252.5749969482422, "logps/rejected": -254.6999969482422, "loss": 0.5322, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9778320789337158, "rewards/margins": 3.74267578125, "rewards/rejected": -5.717577934265137, "step": 5710 }, { "epoch": 0.9152, "grad_norm": 36.97723830137855, "learning_rate": 8.495999999999999e-08, "logits/chosen": 0.595874011516571, "logits/rejected": 0.8716064691543579, "logps/chosen": -255.125, "logps/rejected": -225.6875, "loss": 0.564, 
"rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.058837890625, "rewards/margins": 3.252392530441284, "rewards/rejected": -5.314453125, "step": 5720 }, { "epoch": 0.9168, "grad_norm": 23.415722911741554, "learning_rate": 8.336e-08, "logits/chosen": 0.8123779296875, "logits/rejected": 0.9681640863418579, "logps/chosen": -232.4375, "logps/rejected": -212.39999389648438, "loss": 0.4529, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.39019775390625, "rewards/margins": 3.0596680641174316, "rewards/rejected": -5.448193550109863, "step": 5730 }, { "epoch": 0.9184, "grad_norm": 203.14364520003403, "learning_rate": 8.176e-08, "logits/chosen": 0.751599133014679, "logits/rejected": 0.938000500202179, "logps/chosen": -259.42498779296875, "logps/rejected": -233.4375, "loss": 0.5874, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.629638671875, "rewards/margins": 3.423290967941284, "rewards/rejected": -6.051367282867432, "step": 5740 }, { "epoch": 0.92, "grad_norm": 164.10667311769546, "learning_rate": 8.016e-08, "logits/chosen": 0.7019287347793579, "logits/rejected": 0.833251953125, "logps/chosen": -246.33749389648438, "logps/rejected": -233.375, "loss": 0.5768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.658496141433716, "rewards/margins": 3.011425733566284, "rewards/rejected": -5.670996189117432, "step": 5750 }, { "epoch": 0.9216, "grad_norm": 53.26172199339559, "learning_rate": 7.856e-08, "logits/chosen": 0.743090808391571, "logits/rejected": 0.859692394733429, "logps/chosen": -234.27499389648438, "logps/rejected": -205.375, "loss": 0.3648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.201611280441284, "rewards/margins": 3.587939500808716, "rewards/rejected": -5.791796684265137, "step": 5760 }, { "epoch": 0.9232, "grad_norm": 88.78257102751517, "learning_rate": 7.696e-08, "logits/chosen": 0.87701416015625, "logits/rejected": 1.006933569908142, "logps/chosen": -236.1875, "logps/rejected": 
-229.9499969482422, "loss": 0.4494, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6363768577575684, "rewards/margins": 3.1756834983825684, "rewards/rejected": -5.810937404632568, "step": 5770 }, { "epoch": 0.9248, "grad_norm": 72.55029288085152, "learning_rate": 7.536e-08, "logits/chosen": 0.8631836175918579, "logits/rejected": 0.93603515625, "logps/chosen": -242.3249969482422, "logps/rejected": -209.10000610351562, "loss": 0.4576, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6482176780700684, "rewards/margins": 3.347180128097534, "rewards/rejected": -5.996484279632568, "step": 5780 }, { "epoch": 0.9264, "grad_norm": 85.74176546162789, "learning_rate": 7.376000000000001e-08, "logits/chosen": 0.787487804889679, "logits/rejected": 0.9005981683731079, "logps/chosen": -265.125, "logps/rejected": -216.4499969482422, "loss": 0.3477, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3274168968200684, "rewards/margins": 3.919921875, "rewards/rejected": -6.248437404632568, "step": 5790 }, { "epoch": 0.928, "grad_norm": 73.83297944611415, "learning_rate": 7.215999999999999e-08, "logits/chosen": 0.739636242389679, "logits/rejected": 0.8722168207168579, "logps/chosen": -240.6999969482422, "logps/rejected": -226.97500610351562, "loss": 0.5314, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.193554639816284, "rewards/margins": 3.198046922683716, "rewards/rejected": -5.392578125, "step": 5800 }, { "epoch": 0.9296, "grad_norm": 109.23774538866047, "learning_rate": 7.055999999999999e-08, "logits/chosen": 0.786267101764679, "logits/rejected": 1.05126953125, "logps/chosen": -213.6750030517578, "logps/rejected": -202.8625030517578, "loss": 0.3721, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.104724168777466, "rewards/margins": 4.017773628234863, "rewards/rejected": -6.123437404632568, "step": 5810 }, { "epoch": 0.9312, "grad_norm": 103.1211280691931, "learning_rate": 6.895999999999999e-08, "logits/chosen": 
0.6321166753768921, "logits/rejected": 0.845166027545929, "logps/chosen": -229.91250610351562, "logps/rejected": -235.35000610351562, "loss": 0.541, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.871337890625, "rewards/margins": 3.283764600753784, "rewards/rejected": -5.157031059265137, "step": 5820 }, { "epoch": 0.9328, "grad_norm": 129.0314690581027, "learning_rate": 6.736e-08, "logits/chosen": 0.6815124750137329, "logits/rejected": 0.9108642339706421, "logps/chosen": -234.0749969482422, "logps/rejected": -206.8000030517578, "loss": 0.5548, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.196856737136841, "rewards/margins": 3.341357469558716, "rewards/rejected": -5.53515625, "step": 5830 }, { "epoch": 0.9344, "grad_norm": 117.28327253739101, "learning_rate": 6.576e-08, "logits/chosen": 0.6954590082168579, "logits/rejected": 0.8276001214981079, "logps/chosen": -228.8000030517578, "logps/rejected": -220.2375030517578, "loss": 0.4766, "rewards/accuracies": 0.8125, "rewards/chosen": -2.82177734375, "rewards/margins": 3.525097608566284, "rewards/rejected": -6.350341796875, "step": 5840 }, { "epoch": 0.936, "grad_norm": 227.18069552469427, "learning_rate": 6.415999999999999e-08, "logits/chosen": 0.8011474609375, "logits/rejected": 0.8559325933456421, "logps/chosen": -274.3999938964844, "logps/rejected": -232.8249969482422, "loss": 0.6198, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.06597900390625, "rewards/margins": 3.722363233566284, "rewards/rejected": -5.791406154632568, "step": 5850 }, { "epoch": 0.9376, "grad_norm": 90.9950805278143, "learning_rate": 6.256e-08, "logits/chosen": 0.80364990234375, "logits/rejected": 0.9842773675918579, "logps/chosen": -211.02499389648438, "logps/rejected": -184.08749389648438, "loss": 0.387, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.9671387672424316, "rewards/margins": 2.9605469703674316, "rewards/rejected": -5.925976753234863, "step": 5860 }, { "epoch": 0.9392, 
"grad_norm": 146.0678528216964, "learning_rate": 6.096e-08, "logits/chosen": 0.643237292766571, "logits/rejected": 0.8125, "logps/chosen": -254.28750610351562, "logps/rejected": -220.56875610351562, "loss": 0.3574, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.465625047683716, "rewards/margins": 3.4169921875, "rewards/rejected": -5.886328220367432, "step": 5870 }, { "epoch": 0.9408, "grad_norm": 85.17467978095657, "learning_rate": 5.9360000000000003e-08, "logits/chosen": 0.6548217535018921, "logits/rejected": 0.759204089641571, "logps/chosen": -232.4375, "logps/rejected": -215.47500610351562, "loss": 0.3981, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5816650390625, "rewards/margins": 3.499560594558716, "rewards/rejected": -6.079687595367432, "step": 5880 }, { "epoch": 0.9424, "grad_norm": 39.384071601787376, "learning_rate": 5.7759999999999996e-08, "logits/chosen": 0.6951659917831421, "logits/rejected": 0.833251953125, "logps/chosen": -273.13751220703125, "logps/rejected": -237.78750610351562, "loss": 0.367, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.904199242591858, "rewards/margins": 4.095898628234863, "rewards/rejected": -6.001855373382568, "step": 5890 }, { "epoch": 0.944, "grad_norm": 98.6116021729792, "learning_rate": 5.616e-08, "logits/chosen": 0.7753051519393921, "logits/rejected": null, "logps/chosen": -228.72500610351562, "logps/rejected": -204.1875, "loss": 0.4162, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.986743211746216, "rewards/margins": 3.3485350608825684, "rewards/rejected": -6.333984375, "step": 5900 }, { "epoch": 0.9456, "grad_norm": 126.7962184378763, "learning_rate": 5.4559999999999994e-08, "logits/chosen": 0.6955673098564148, "logits/rejected": 0.8438720703125, "logps/chosen": -244.64999389648438, "logps/rejected": -223.85000610351562, "loss": 0.4467, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.54638671875, "rewards/margins": 3.1884765625, 
"rewards/rejected": -5.734570503234863, "step": 5910 }, { "epoch": 0.9472, "grad_norm": 87.905810623802, "learning_rate": 5.296e-08, "logits/chosen": 0.7859741449356079, "logits/rejected": 0.8678954839706421, "logps/chosen": -243.6750030517578, "logps/rejected": -228.8874969482422, "loss": 0.4604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3330078125, "rewards/margins": 3.7286133766174316, "rewards/rejected": -6.060351371765137, "step": 5920 }, { "epoch": 0.9488, "grad_norm": 168.61011036746308, "learning_rate": 5.136e-08, "logits/chosen": 0.7694091796875, "logits/rejected": 0.94091796875, "logps/chosen": -243.375, "logps/rejected": -221.39999389648438, "loss": 0.4485, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7824950218200684, "rewards/margins": 3.1786131858825684, "rewards/rejected": -5.964257717132568, "step": 5930 }, { "epoch": 0.9504, "grad_norm": 97.75720437529789, "learning_rate": 4.976e-08, "logits/chosen": 0.668286144733429, "logits/rejected": 0.781933605670929, "logps/chosen": -235.875, "logps/rejected": -227.5, "loss": 0.3474, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.2009520530700684, "rewards/margins": 3.632128953933716, "rewards/rejected": -5.831445217132568, "step": 5940 }, { "epoch": 0.952, "grad_norm": 61.81785368344826, "learning_rate": 4.816e-08, "logits/chosen": 0.7411133050918579, "logits/rejected": 0.885729968547821, "logps/chosen": -252.9375, "logps/rejected": -237.0, "loss": 0.5206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4683594703674316, "rewards/margins": 3.433398485183716, "rewards/rejected": -5.902148246765137, "step": 5950 }, { "epoch": 0.9536, "grad_norm": 81.0160148907843, "learning_rate": 4.6559999999999995e-08, "logits/chosen": 0.720141589641571, "logits/rejected": 0.8459717035293579, "logps/chosen": -255.2624969482422, "logps/rejected": -217.0625, "loss": 0.4755, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7251954078674316, "rewards/margins": 
3.370361328125, "rewards/rejected": -6.093554496765137, "step": 5960 }, { "epoch": 0.9552, "grad_norm": 130.72898507063695, "learning_rate": 4.496e-08, "logits/chosen": 0.6896728277206421, "logits/rejected": 0.8962157964706421, "logps/chosen": -250.1999969482422, "logps/rejected": -223.27499389648438, "loss": 0.4399, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.486621141433716, "rewards/margins": 3.7012696266174316, "rewards/rejected": -6.190625190734863, "step": 5970 }, { "epoch": 0.9568, "grad_norm": 145.94062972079325, "learning_rate": 4.336e-08, "logits/chosen": 0.7087646722793579, "logits/rejected": 0.876635730266571, "logps/chosen": -227.85000610351562, "logps/rejected": -224.3625030517578, "loss": 0.441, "rewards/accuracies": 0.78125, "rewards/chosen": -2.54229736328125, "rewards/margins": 3.188281297683716, "rewards/rejected": -5.73046875, "step": 5980 }, { "epoch": 0.9584, "grad_norm": 199.94549237183003, "learning_rate": 4.176e-08, "logits/chosen": 0.742810070514679, "logits/rejected": 0.928295910358429, "logps/chosen": -256.32501220703125, "logps/rejected": -236.1750030517578, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.608154296875, "rewards/margins": 3.293750047683716, "rewards/rejected": -5.903124809265137, "step": 5990 }, { "epoch": 0.96, "grad_norm": 1096.3576581036525, "learning_rate": 4.016e-08, "logits/chosen": 0.9157959222793579, "logits/rejected": 0.9022461175918579, "logps/chosen": -267.2875061035156, "logps/rejected": -210.71249389648438, "loss": 0.9072, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7989258766174316, "rewards/margins": 3.01123046875, "rewards/rejected": -5.802734375, "step": 6000 }, { "epoch": 0.9616, "grad_norm": 117.5190996906667, "learning_rate": 3.8559999999999996e-08, "logits/chosen": 0.736804187297821, "logits/rejected": 0.862231433391571, "logps/chosen": -260.3125, "logps/rejected": -220.375, "loss": 0.4471, "rewards/accuracies": 0.8062499761581421, 
"rewards/chosen": -1.8083984851837158, "rewards/margins": 3.830078125, "rewards/rejected": -5.634375095367432, "step": 6010 }, { "epoch": 0.9632, "grad_norm": 169.99985754877667, "learning_rate": 3.696e-08, "logits/chosen": 0.7377716302871704, "logits/rejected": 0.898120105266571, "logps/chosen": -266.125, "logps/rejected": -244.125, "loss": 0.5402, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.456249952316284, "rewards/margins": 3.2662110328674316, "rewards/rejected": -5.72021484375, "step": 6020 }, { "epoch": 0.9648, "grad_norm": 90.49360753102371, "learning_rate": 3.536e-08, "logits/chosen": 0.6639251708984375, "logits/rejected": 0.848767101764679, "logps/chosen": -261.5, "logps/rejected": -220.3874969482422, "loss": 0.5122, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0738768577575684, "rewards/margins": 3.5103516578674316, "rewards/rejected": -5.582421779632568, "step": 6030 }, { "epoch": 0.9664, "grad_norm": 72.65315852204151, "learning_rate": 3.376e-08, "logits/chosen": 0.685791015625, "logits/rejected": 0.763427734375, "logps/chosen": -240.25, "logps/rejected": -214.1875, "loss": 0.5176, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.632885694503784, "rewards/margins": 2.993945360183716, "rewards/rejected": -5.628515720367432, "step": 6040 }, { "epoch": 0.968, "grad_norm": 65.01561228027529, "learning_rate": 3.216e-08, "logits/chosen": 0.862548828125, "logits/rejected": 0.9971923828125, "logps/chosen": -249.02499389648438, "logps/rejected": -228.9499969482422, "loss": 0.3856, "rewards/accuracies": 0.8125, "rewards/chosen": -1.898284912109375, "rewards/margins": 3.8363280296325684, "rewards/rejected": -5.732421875, "step": 6050 }, { "epoch": 0.9696, "grad_norm": 45.23269830376272, "learning_rate": 3.056e-08, "logits/chosen": 0.72296142578125, "logits/rejected": 0.8470214605331421, "logps/chosen": -251.22500610351562, "logps/rejected": -220.3125, "loss": 0.3433, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 
-1.982995629310608, "rewards/margins": 3.86083984375, "rewards/rejected": -5.845507621765137, "step": 6060 }, { "epoch": 0.9712, "grad_norm": 169.5772885754547, "learning_rate": 2.896e-08, "logits/chosen": 0.7316039800643921, "logits/rejected": 0.9727538824081421, "logps/chosen": -263.1875, "logps/rejected": -229.8249969482422, "loss": 0.4216, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.349658250808716, "rewards/margins": 3.674609422683716, "rewards/rejected": -6.024609565734863, "step": 6070 }, { "epoch": 0.9728, "grad_norm": 85.1466051787499, "learning_rate": 2.7359999999999998e-08, "logits/chosen": 0.8734130859375, "logits/rejected": 0.924243152141571, "logps/chosen": -261.04998779296875, "logps/rejected": -223.35000610351562, "loss": 0.4575, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0282959938049316, "rewards/margins": 3.227099657058716, "rewards/rejected": -5.254492282867432, "step": 6080 }, { "epoch": 0.9744, "grad_norm": 66.94384465770702, "learning_rate": 2.576e-08, "logits/chosen": 0.720996081829071, "logits/rejected": 0.9080566167831421, "logps/chosen": -233.125, "logps/rejected": -207.10000610351562, "loss": 0.3383, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.247967481613159, "rewards/margins": 3.373242139816284, "rewards/rejected": -5.618554592132568, "step": 6090 }, { "epoch": 0.976, "grad_norm": 98.37105403123294, "learning_rate": 2.416e-08, "logits/chosen": 0.888256847858429, "logits/rejected": 1.0355713367462158, "logps/chosen": -238.3249969482422, "logps/rejected": -209.6374969482422, "loss": 0.4518, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4561767578125, "rewards/margins": 3.485058546066284, "rewards/rejected": -5.939843654632568, "step": 6100 }, { "epoch": 0.9776, "grad_norm": 119.91154008718273, "learning_rate": 2.2559999999999998e-08, "logits/chosen": 0.733642578125, "logits/rejected": 0.9045044183731079, "logps/chosen": -242.72500610351562, "logps/rejected": 
-194.0749969482422, "loss": 0.5077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7498536109924316, "rewards/margins": 3.034912109375, "rewards/rejected": -5.787890434265137, "step": 6110 }, { "epoch": 0.9792, "grad_norm": 188.81364737901487, "learning_rate": 2.0959999999999997e-08, "logits/chosen": 0.801647961139679, "logits/rejected": 0.9349731206893921, "logps/chosen": -223.875, "logps/rejected": -193.41250610351562, "loss": 0.4491, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6341490745544434, "rewards/margins": 2.685839891433716, "rewards/rejected": -5.3203125, "step": 6120 }, { "epoch": 0.9808, "grad_norm": 53.981313755207154, "learning_rate": 1.936e-08, "logits/chosen": 0.7764648199081421, "logits/rejected": 0.9073730707168579, "logps/chosen": -251.91250610351562, "logps/rejected": -228.9499969482422, "loss": 0.4789, "rewards/accuracies": 0.75, "rewards/chosen": -2.5196290016174316, "rewards/margins": 3.4813475608825684, "rewards/rejected": -6.001562595367432, "step": 6130 }, { "epoch": 0.9824, "grad_norm": 64.84098348274188, "learning_rate": 1.776e-08, "logits/chosen": 0.8289794921875, "logits/rejected": 0.869921863079071, "logps/chosen": -267.6000061035156, "logps/rejected": -225.7624969482422, "loss": 0.5215, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1158690452575684, "rewards/margins": 3.648144483566284, "rewards/rejected": -5.760156154632568, "step": 6140 }, { "epoch": 0.984, "grad_norm": 82.52301611040912, "learning_rate": 1.616e-08, "logits/chosen": 0.575146496295929, "logits/rejected": 0.785961925983429, "logps/chosen": -251.77499389648438, "logps/rejected": -210.71249389648438, "loss": 0.4527, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.149853467941284, "rewards/margins": 3.6631836891174316, "rewards/rejected": -5.813086032867432, "step": 6150 }, { "epoch": 0.9856, "grad_norm": 67.07844517728084, "learning_rate": 1.456e-08, "logits/chosen": 0.622637927532196, "logits/rejected": 
0.7719482183456421, "logps/chosen": -244.5, "logps/rejected": -228.78750610351562, "loss": 0.5069, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.959570288658142, "rewards/margins": 3.5005860328674316, "rewards/rejected": -5.460156440734863, "step": 6160 }, { "epoch": 0.9872, "grad_norm": 90.59471507550936, "learning_rate": 1.2959999999999998e-08, "logits/chosen": 0.72100830078125, "logits/rejected": 0.8014465570449829, "logps/chosen": -256.6499938964844, "logps/rejected": -256.0874938964844, "loss": 0.5816, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.910375952720642, "rewards/margins": 3.2080078125, "rewards/rejected": -5.118945121765137, "step": 6170 }, { "epoch": 0.9888, "grad_norm": 27.252573332929735, "learning_rate": 1.136e-08, "logits/chosen": 0.77197265625, "logits/rejected": 0.9393554925918579, "logps/chosen": -252.8000030517578, "logps/rejected": -232.0500030517578, "loss": 0.4787, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.975976586341858, "rewards/margins": 3.8001952171325684, "rewards/rejected": -5.778906345367432, "step": 6180 }, { "epoch": 0.9904, "grad_norm": 95.59927222601527, "learning_rate": 9.76e-09, "logits/chosen": 0.726635754108429, "logits/rejected": 0.8445800542831421, "logps/chosen": -235.53750610351562, "logps/rejected": -219.25, "loss": 0.5457, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.374072313308716, "rewards/margins": 2.943066358566284, "rewards/rejected": -5.318163871765137, "step": 6190 }, { "epoch": 0.992, "grad_norm": 65.87523283687784, "learning_rate": 8.16e-09, "logits/chosen": 0.7586669921875, "logits/rejected": 0.854931652545929, "logps/chosen": -214.4250030517578, "logps/rejected": -197.3000030517578, "loss": 0.3844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5888733863830566, "rewards/margins": 3.2403321266174316, "rewards/rejected": -5.826171875, "step": 6200 }, { "epoch": 0.9936, "grad_norm": 191.31745464950043, "learning_rate": 
6.56e-09, "logits/chosen": 0.743786633014679, "logits/rejected": 0.8792724609375, "logps/chosen": -279.0625, "logps/rejected": -232.9875030517578, "loss": 0.5572, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2726807594299316, "rewards/margins": 3.4114747047424316, "rewards/rejected": -5.683203220367432, "step": 6210 }, { "epoch": 0.9952, "grad_norm": 30.0572775149899, "learning_rate": 4.9599999999999994e-09, "logits/chosen": 0.8102051019668579, "logits/rejected": 0.946154773235321, "logps/chosen": -247.0625, "logps/rejected": -236.7375030517578, "loss": 0.5002, "rewards/accuracies": 0.78125, "rewards/chosen": -2.258715867996216, "rewards/margins": 3.527294874191284, "rewards/rejected": -5.784765720367432, "step": 6220 }, { "epoch": 0.9968, "grad_norm": 77.5434982337483, "learning_rate": 3.36e-09, "logits/chosen": 0.6787048578262329, "logits/rejected": 0.8617798089981079, "logps/chosen": -235.46249389648438, "logps/rejected": -223.2375030517578, "loss": 0.4249, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0650391578674316, "rewards/margins": 2.934375047683716, "rewards/rejected": -6.001562595367432, "step": 6230 }, { "epoch": 0.9984, "grad_norm": 106.74692234624403, "learning_rate": 1.76e-09, "logits/chosen": 0.7248290777206421, "logits/rejected": 0.839160144329071, "logps/chosen": -232.97500610351562, "logps/rejected": -223.7687530517578, "loss": 0.4639, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5500121116638184, "rewards/margins": 3.0849609375, "rewards/rejected": -5.637304782867432, "step": 6240 }, { "epoch": 1.0, "grad_norm": 58.94747460947692, "learning_rate": 1.6000000000000002e-10, "logits/chosen": 0.6669921875, "logits/rejected": 0.753857433795929, "logps/chosen": -244.53750610351562, "logps/rejected": -224.53750610351562, "loss": 0.4529, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.633410692214966, "rewards/margins": 2.945507764816284, "rewards/rejected": -5.578125, "step": 6250 } ], 
"logging_steps": 10, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }