{ "best_global_step": 845, "best_metric": 4.954884052276611, "best_model_checkpoint": "/tmp/svadugur/39817/informativity_and_cost_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=True-contexts=medium-39817/checkpoint-845", "epoch": 2.60638707195075, "eval_steps": 65, "global_step": 845, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.198549747467041, "eval_logits/rejected": -2.1211564540863037, "eval_logps/chosen": -58.27378845214844, "eval_logps/rejected": -80.16583251953125, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 659.6204, "eval_samples_per_second": 0.525, "eval_steps_per_second": 0.262, "step": 0 }, { "epoch": 0.0030781069642170067, "grad_norm": 2.760434150695801, "learning_rate": 1e-06, "logits/chosen": -2.212428331375122, "logits/rejected": -2.156350612640381, "logps/chosen": -61.298439025878906, "logps/rejected": -76.69519805908203, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0061562139284340135, "grad_norm": 3.093526840209961, "learning_rate": 9.996913580246914e-07, "logits/chosen": -2.2405107021331787, "logits/rejected": -2.183007001876831, "logps/chosen": -60.78013610839844, "logps/rejected": -71.30288696289062, "loss": 0.9963, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0006319168023765087, "rewards/margins": 0.014772631227970123, "rewards/rejected": -0.014140713959932327, "step": 2 }, { "epoch": 0.00923432089265102, "grad_norm": 3.1163671016693115, "learning_rate": 9.993827160493825e-07, "logits/chosen": -2.251145839691162, "logits/rejected": -2.2253992557525635, "logps/chosen": -63.33245086669922, "logps/rejected": -85.2788314819336, "loss": 0.9976, "rewards/accuracies": 0.53125, "rewards/chosen": -0.004229939077049494, "rewards/margins": 0.009759711101651192, "rewards/rejected": -0.013989650644361973, "step": 3 }, { "epoch": 0.012312427856868027, "grad_norm": 3.112494468688965, "learning_rate": 9.990740740740742e-07, "logits/chosen": -2.2609639167785645, "logits/rejected": -2.1294665336608887, "logps/chosen": -58.44734573364258, "logps/rejected": -77.84696197509766, "loss": 1.0008, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01653284952044487, "rewards/margins": -0.0032810927368700504, "rewards/rejected": -0.013251757249236107, "step": 4 }, { "epoch": 0.015390534821085032, "grad_norm": 3.078284502029419, "learning_rate": 9.987654320987654e-07, "logits/chosen": -2.190129280090332, "logits/rejected": -2.141735076904297, "logps/chosen": -61.01892852783203, "logps/rejected": -76.51071166992188, "loss": 1.0061, "rewards/accuracies": 0.25, "rewards/chosen": -0.02672964334487915, "rewards/margins": -0.024284232407808304, "rewards/rejected": -0.00244541116990149, "step": 5 }, { "epoch": 0.01846864178530204, "grad_norm": 2.9404070377349854, "learning_rate": 9.984567901234568e-07, "logits/chosen": -2.098156690597534, "logits/rejected": -2.0781586170196533, "logps/chosen": -61.50341033935547, "logps/rejected": -83.98672485351562, "loss": 1.0025, "rewards/accuracies": 0.40625, "rewards/chosen": -0.017817987129092216, "rewards/margins": -0.01012104656547308, "rewards/rejected": -0.0076969382353127, "step": 6 }, { "epoch": 0.021546748749519045, "grad_norm": 8.745540618896484, "learning_rate": 9.981481481481482e-07, "logits/chosen": -2.1993627548217773, "logits/rejected": -2.082402467727661, "logps/chosen": -61.36785888671875, "logps/rejected": -80.3177261352539, "loss": 0.9918, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010181797668337822, "rewards/margins": 0.03294633701443672, "rewards/rejected": -0.0227645393460989, "step": 7 }, { "epoch": 0.024624855713736054, "grad_norm": 3.1729001998901367, "learning_rate": 9.978395061728394e-07, "logits/chosen": -2.2417190074920654, "logits/rejected": -2.1730921268463135, "logps/chosen": -57.85318374633789, "logps/rejected": -80.72602081298828, "loss": 0.9932, "rewards/accuracies": 0.65625, "rewards/chosen": 0.013833988457918167, "rewards/margins": 0.027112936601042747, "rewards/rejected": -0.01327895000576973, "step": 8 }, { "epoch": 0.02770296267795306, "grad_norm": 2.78568172454834, "learning_rate": 9.975308641975308e-07, "logits/chosen": -2.205286979675293, "logits/rejected": -2.168771266937256, "logps/chosen": -56.66880798339844, "logps/rejected": -76.43110656738281, "loss": 0.9938, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0007154345512390137, "rewards/margins": 0.02476816065609455, "rewards/rejected": -0.025483597069978714, "step": 9 }, { "epoch": 0.030781069642170065, "grad_norm": 3.3062009811401367, "learning_rate": 9.972222222222222e-07, "logits/chosen": -2.2399327754974365, "logits/rejected": -2.2229652404785156, "logps/chosen": -60.350337982177734, "logps/rejected": -79.1253433227539, "loss": 1.0037, "rewards/accuracies": 0.46875, "rewards/chosen": -0.018859578296542168, "rewards/margins": -0.014578782953321934, "rewards/rejected": -0.004280793014913797, "step": 10 }, { "epoch": 0.03385917660638707, "grad_norm": 2.947038173675537, "learning_rate": 9.969135802469136e-07, "logits/chosen": -2.2654433250427246, "logits/rejected": -2.1599783897399902, "logps/chosen": -55.81011962890625, "logps/rejected": -78.66986083984375, "loss": 1.0009, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005487596616148949, "rewards/margins": -0.00353343621827662, "rewards/rejected": -0.0019541624933481216, "step": 11 }, { "epoch": 0.03693728357060408, "grad_norm": 3.3006091117858887, "learning_rate": 9.966049382716048e-07, "logits/chosen": -2.2351465225219727, "logits/rejected": -2.1484122276306152, "logps/chosen": -59.68974304199219, "logps/rejected": -77.71825408935547, "loss": 0.9992, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01713475212454796, "rewards/margins": 0.0032503963448107243, "rewards/rejected": -0.020385149866342545, "step": 12 }, { "epoch": 0.04001539053482109, "grad_norm": 3.2862443923950195, "learning_rate": 9.962962962962964e-07, "logits/chosen": -2.3424508571624756, "logits/rejected": -2.2130074501037598, "logps/chosen": -62.42084503173828, "logps/rejected": -86.286376953125, "loss": 1.0031, "rewards/accuracies": 0.375, "rewards/chosen": -0.03127181529998779, "rewards/margins": -0.012489665299654007, "rewards/rejected": -0.018782151862978935, "step": 13 }, { "epoch": 0.04309349749903809, "grad_norm": 2.7819066047668457, "learning_rate": 9.959876543209876e-07, "logits/chosen": -2.213747024536133, "logits/rejected": -2.162879705429077, "logps/chosen": -59.6649055480957, "logps/rejected": -73.16128540039062, "loss": 0.9934, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0015632514841854572, "rewards/margins": 0.026406943798065186, "rewards/rejected": -0.027970194816589355, "step": 14 }, { "epoch": 0.0461716044632551, "grad_norm": 3.0421876907348633, "learning_rate": 9.95679012345679e-07, "logits/chosen": -2.167269706726074, "logits/rejected": -2.0946216583251953, "logps/chosen": -58.48046112060547, "logps/rejected": -81.56053161621094, "loss": 1.0041, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03926767036318779, "rewards/margins": -0.016627728939056396, "rewards/rejected": -0.022639941424131393, "step": 15 }, { "epoch": 0.04924971142747211, "grad_norm": 3.0790579319000244, "learning_rate": 9.953703703703704e-07, "logits/chosen": -2.2285053730010986, "logits/rejected": -2.1023221015930176, "logps/chosen": -57.63072967529297, "logps/rejected": -77.09473419189453, "loss": 0.9915, "rewards/accuracies": 0.65625, "rewards/chosen": 0.006297777872532606, "rewards/margins": 0.03402144834399223, "rewards/rejected": -0.02772367186844349, "step": 16 }, { "epoch": 0.05232781839168911, "grad_norm": 3.9801316261291504, "learning_rate": 9.950617283950616e-07, "logits/chosen": -2.2885091304779053, "logits/rejected": -2.177225112915039, "logps/chosen": -57.19432067871094, "logps/rejected": -79.25652313232422, "loss": 1.0049, "rewards/accuracies": 0.3125, "rewards/chosen": -0.029390549287199974, "rewards/margins": -0.019782472401857376, "rewards/rejected": -0.009608077816665173, "step": 17 }, { "epoch": 0.05540592535590612, "grad_norm": 3.1918065547943115, "learning_rate": 9.947530864197532e-07, "logits/chosen": -2.281702756881714, "logits/rejected": -2.1874310970306396, "logps/chosen": -62.961326599121094, "logps/rejected": -82.3508071899414, "loss": 0.9901, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0168455857783556, "rewards/margins": 0.0400349386036396, "rewards/rejected": -0.05688052624464035, "step": 18 }, { "epoch": 0.05848403232012313, "grad_norm": 3.161085844039917, "learning_rate": 9.944444444444444e-07, "logits/chosen": -2.239755868911743, "logits/rejected": -2.147164821624756, "logps/chosen": -61.52384948730469, "logps/rejected": -84.83968353271484, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": -0.01258298009634018, "rewards/margins": 0.005095994099974632, "rewards/rejected": -0.01767897605895996, "step": 19 }, { "epoch": 0.06156213928434013, "grad_norm": 2.923187017440796, "learning_rate": 9.941358024691358e-07, "logits/chosen": -2.245314836502075, "logits/rejected": -2.1538376808166504, "logps/chosen": -57.89520263671875, "logps/rejected": -76.67986297607422, "loss": 1.0011, "rewards/accuracies": 0.53125, "rewards/chosen": -0.02832282893359661, "rewards/margins": -0.004395483992993832, "rewards/rejected": -0.023927344009280205, "step": 20 }, { "epoch": 0.06464024624855713, "grad_norm": 2.8608133792877197, "learning_rate": 9.938271604938272e-07, "logits/chosen": -2.217172145843506, "logits/rejected": -2.139185905456543, "logps/chosen": -63.44913101196289, "logps/rejected": -83.15200805664062, "loss": 0.9929, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012262867763638496, "rewards/margins": 0.028597917407751083, "rewards/rejected": -0.04086078330874443, "step": 21 }, { "epoch": 0.06771835321277414, "grad_norm": 3.1124982833862305, "learning_rate": 9.935185185185184e-07, "logits/chosen": -2.168670415878296, "logits/rejected": -2.149062156677246, "logps/chosen": -63.164669036865234, "logps/rejected": -83.50331115722656, "loss": 0.9896, "rewards/accuracies": 0.59375, "rewards/chosen": -0.034411944448947906, "rewards/margins": 0.04175690561532974, "rewards/rejected": -0.07616884261369705, "step": 22 }, { "epoch": 0.07079646017699115, "grad_norm": 3.2578768730163574, "learning_rate": 9.9320987654321e-07, "logits/chosen": -2.223332643508911, "logits/rejected": -2.1755528450012207, "logps/chosen": -55.6052360534668, "logps/rejected": -78.43476104736328, "loss": 0.9839, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005784798413515091, "rewards/margins": 0.06439611315727234, "rewards/rejected": -0.07018091529607773, "step": 23 }, { "epoch": 0.07387456714120816, "grad_norm": 3.0148141384124756, "learning_rate": 9.929012345679012e-07, "logits/chosen": -2.262368679046631, "logits/rejected": -2.197526216506958, "logps/chosen": -60.62251663208008, "logps/rejected": -77.68494415283203, "loss": 0.9875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04118910804390907, "rewards/margins": 0.050187256187200546, "rewards/rejected": -0.09137637168169022, "step": 24 }, { "epoch": 0.07695267410542517, "grad_norm": 3.447997570037842, "learning_rate": 9.925925925925926e-07, "logits/chosen": -2.283607006072998, "logits/rejected": -2.183025360107422, "logps/chosen": -57.5501823425293, "logps/rejected": -74.55166625976562, "loss": 0.995, "rewards/accuracies": 0.5, "rewards/chosen": -0.04635731875896454, "rewards/margins": 0.020058488473296165, "rewards/rejected": -0.06641580909490585, "step": 25 }, { "epoch": 0.08003078106964218, "grad_norm": 5.352074146270752, "learning_rate": 9.92283950617284e-07, "logits/chosen": -2.2345051765441895, "logits/rejected": -2.1711032390594482, "logps/chosen": -56.9674186706543, "logps/rejected": -80.67253112792969, "loss": 0.9859, "rewards/accuracies": 0.625, "rewards/chosen": -0.026602959260344505, "rewards/margins": 0.05645959451794624, "rewards/rejected": -0.0830625593662262, "step": 26 }, { "epoch": 0.08310888803385917, "grad_norm": 3.0193841457366943, "learning_rate": 9.919753086419754e-07, "logits/chosen": -2.2448549270629883, "logits/rejected": -2.156141757965088, "logps/chosen": -60.8770751953125, "logps/rejected": -79.49710083007812, "loss": 0.988, "rewards/accuracies": 0.75, "rewards/chosen": -0.04813968762755394, "rewards/margins": 0.04821355640888214, "rewards/rejected": -0.09635324776172638, "step": 27 }, { "epoch": 0.08618699499807618, "grad_norm": 3.4293904304504395, "learning_rate": 9.916666666666666e-07, "logits/chosen": -2.292771100997925, "logits/rejected": -2.16139554977417, "logps/chosen": -59.50252151489258, "logps/rejected": -81.00930786132812, "loss": 0.9789, "rewards/accuracies": 0.71875, "rewards/chosen": -0.030825400725007057, "rewards/margins": 0.08478343486785889, "rewards/rejected": -0.1156088337302208, "step": 28 }, { "epoch": 0.08926510196229319, "grad_norm": 3.1370913982391357, "learning_rate": 9.91358024691358e-07, "logits/chosen": -2.281458854675293, "logits/rejected": -2.2069571018218994, "logps/chosen": -58.6175651550293, "logps/rejected": -76.66333770751953, "loss": 0.9829, "rewards/accuracies": 0.625, "rewards/chosen": -0.03666277229785919, "rewards/margins": 0.06849025189876556, "rewards/rejected": -0.10515303164720535, "step": 29 }, { "epoch": 0.0923432089265102, "grad_norm": 4.853809356689453, "learning_rate": 9.910493827160494e-07, "logits/chosen": -2.1902480125427246, "logits/rejected": -2.0606184005737305, "logps/chosen": -59.44179153442383, "logps/rejected": -78.15677642822266, "loss": 0.9909, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07196469604969025, "rewards/margins": 0.036517515778541565, "rewards/rejected": -0.10848219692707062, "step": 30 }, { "epoch": 0.0954213158907272, "grad_norm": 3.3656957149505615, "learning_rate": 9.907407407407406e-07, "logits/chosen": -2.321260452270508, "logits/rejected": -2.1691291332244873, "logps/chosen": -61.82666778564453, "logps/rejected": -76.67313385009766, "loss": 0.9929, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03541330248117447, "rewards/margins": 0.02863396517932415, "rewards/rejected": -0.06404726207256317, "step": 31 }, { "epoch": 0.09849942285494422, "grad_norm": 3.1112794876098633, "learning_rate": 9.904320987654322e-07, "logits/chosen": -2.224329710006714, "logits/rejected": -2.1616554260253906, "logps/chosen": -56.566017150878906, "logps/rejected": -77.48594665527344, "loss": 0.9936, "rewards/accuracies": 0.6875, "rewards/chosen": -0.058195747435092926, "rewards/margins": 0.02550373040139675, "rewards/rejected": -0.08369947224855423, "step": 32 }, { "epoch": 0.10157752981916121, "grad_norm": 3.4538862705230713, "learning_rate": 9.901234567901234e-07, "logits/chosen": -2.2762815952301025, "logits/rejected": -2.1878809928894043, "logps/chosen": -54.25384521484375, "logps/rejected": -74.90228271484375, "loss": 0.9811, "rewards/accuracies": 0.75, "rewards/chosen": -0.04837929084897041, "rewards/margins": 0.07612262666225433, "rewards/rejected": -0.12450192123651505, "step": 33 }, { "epoch": 0.10465563678337822, "grad_norm": 3.587700605392456, "learning_rate": 9.898148148148148e-07, "logits/chosen": -2.320556163787842, "logits/rejected": -2.2019248008728027, "logps/chosen": -57.729366302490234, "logps/rejected": -76.26050567626953, "loss": 0.9856, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05632612854242325, "rewards/margins": 0.05803961306810379, "rewards/rejected": -0.11436574906110764, "step": 34 }, { "epoch": 0.10773374374759523, "grad_norm": 3.782381772994995, "learning_rate": 9.895061728395062e-07, "logits/chosen": -2.276524543762207, "logits/rejected": -2.119481325149536, "logps/chosen": -61.59113311767578, "logps/rejected": -83.87635040283203, "loss": 0.9838, "rewards/accuracies": 0.71875, "rewards/chosen": -0.041968926787376404, "rewards/margins": 0.06480035930871964, "rewards/rejected": -0.10676929354667664, "step": 35 }, { "epoch": 0.11081185071181224, "grad_norm": 3.686326265335083, "learning_rate": 9.891975308641974e-07, "logits/chosen": -2.269723653793335, "logits/rejected": -2.160050630569458, "logps/chosen": -56.069725036621094, "logps/rejected": -80.2764663696289, "loss": 0.976, "rewards/accuracies": 0.84375, "rewards/chosen": -0.059176042675971985, "rewards/margins": 0.09642143547534943, "rewards/rejected": -0.1555974781513214, "step": 36 }, { "epoch": 0.11388995767602925, "grad_norm": 3.9102907180786133, "learning_rate": 9.88888888888889e-07, "logits/chosen": -2.2250523567199707, "logits/rejected": -2.0899412631988525, "logps/chosen": -62.72491455078125, "logps/rejected": -78.77276611328125, "loss": 0.9799, "rewards/accuracies": 0.75, "rewards/chosen": -0.07582690566778183, "rewards/margins": 0.08061216026544571, "rewards/rejected": -0.15643906593322754, "step": 37 }, { "epoch": 0.11696806464024626, "grad_norm": 5.301410675048828, "learning_rate": 9.885802469135802e-07, "logits/chosen": -2.212362051010132, "logits/rejected": -2.186871290206909, "logps/chosen": -57.28205490112305, "logps/rejected": -78.02888488769531, "loss": 0.9822, "rewards/accuracies": 0.78125, "rewards/chosen": -0.07280739396810532, "rewards/margins": 0.07196379452943802, "rewards/rejected": -0.14477120339870453, "step": 38 }, { "epoch": 0.12004617160446325, "grad_norm": 3.4122724533081055, "learning_rate": 9.882716049382716e-07, "logits/chosen": -2.1833531856536865, "logits/rejected": -2.1645638942718506, "logps/chosen": -57.92047119140625, "logps/rejected": -77.45208740234375, "loss": 0.9729, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06806151568889618, "rewards/margins": 0.10932442545890808, "rewards/rejected": -0.17738592624664307, "step": 39 }, { "epoch": 0.12312427856868026, "grad_norm": 3.6089096069335938, "learning_rate": 9.87962962962963e-07, "logits/chosen": -2.16552472114563, "logits/rejected": -2.0891635417938232, "logps/chosen": -58.80733871459961, "logps/rejected": -92.33660125732422, "loss": 0.9643, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1047932580113411, "rewards/margins": 0.14473743736743927, "rewards/rejected": -0.24953071773052216, "step": 40 }, { "epoch": 0.12620238553289725, "grad_norm": 3.501697063446045, "learning_rate": 9.876543209876542e-07, "logits/chosen": -2.2717325687408447, "logits/rejected": -2.126877546310425, "logps/chosen": -57.497352600097656, "logps/rejected": -83.76193237304688, "loss": 0.968, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07871709018945694, "rewards/margins": 0.129386767745018, "rewards/rejected": -0.20810388028621674, "step": 41 }, { "epoch": 0.12928049249711426, "grad_norm": 4.074934959411621, "learning_rate": 9.873456790123456e-07, "logits/chosen": -2.225627899169922, "logits/rejected": -2.124523878097534, "logps/chosen": -57.14786148071289, "logps/rejected": -74.8714599609375, "loss": 0.9742, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06322498619556427, "rewards/margins": 0.10389897227287292, "rewards/rejected": -0.1671239584684372, "step": 42 }, { "epoch": 0.13235859946133127, "grad_norm": 3.719116449356079, "learning_rate": 9.87037037037037e-07, "logits/chosen": -2.3459887504577637, "logits/rejected": -2.2011959552764893, "logps/chosen": -64.35167694091797, "logps/rejected": -82.92507934570312, "loss": 0.9725, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07776975631713867, "rewards/margins": 0.11089575290679932, "rewards/rejected": -0.18866552412509918, "step": 43 }, { "epoch": 0.13543670642554828, "grad_norm": 3.604400157928467, "learning_rate": 9.867283950617284e-07, "logits/chosen": -2.2657525539398193, "logits/rejected": -2.182033061981201, "logps/chosen": -65.20905303955078, "logps/rejected": -87.38866424560547, "loss": 0.9752, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12803567945957184, "rewards/margins": 0.10005297511816025, "rewards/rejected": -0.22808866202831268, "step": 44 }, { "epoch": 0.1385148133897653, "grad_norm": 3.661207675933838, "learning_rate": 9.864197530864198e-07, "logits/chosen": -2.206930637359619, "logits/rejected": -2.176046371459961, "logps/chosen": -56.84519577026367, "logps/rejected": -81.57957458496094, "loss": 0.967, "rewards/accuracies": 0.75, "rewards/chosen": -0.09688656032085419, "rewards/margins": 0.13345104455947876, "rewards/rejected": -0.23033761978149414, "step": 45 }, { "epoch": 0.1415929203539823, "grad_norm": 2.9730844497680664, "learning_rate": 9.861111111111112e-07, "logits/chosen": -2.2427144050598145, "logits/rejected": -2.1515886783599854, "logps/chosen": -59.927486419677734, "logps/rejected": -76.84014892578125, "loss": 0.9742, "rewards/accuracies": 0.78125, "rewards/chosen": -0.10506591945886612, "rewards/margins": 0.10456842184066772, "rewards/rejected": -0.20963434875011444, "step": 46 }, { "epoch": 0.1446710273181993, "grad_norm": 4.020444869995117, "learning_rate": 9.858024691358024e-07, "logits/chosen": -2.3047637939453125, "logits/rejected": -2.2220194339752197, "logps/chosen": -58.31633758544922, "logps/rejected": -79.56758117675781, "loss": 0.9565, "rewards/accuracies": 0.875, "rewards/chosen": -0.09774318337440491, "rewards/margins": 0.1764446198940277, "rewards/rejected": -0.2741878032684326, "step": 47 }, { "epoch": 0.14774913428241632, "grad_norm": 3.6129517555236816, "learning_rate": 9.854938271604938e-07, "logits/chosen": -2.2509055137634277, "logits/rejected": -2.139814853668213, "logps/chosen": -59.530147552490234, "logps/rejected": -86.39602661132812, "loss": 0.955, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1024000346660614, "rewards/margins": 0.1819656789302826, "rewards/rejected": -0.284365713596344, "step": 48 }, { "epoch": 0.15082724124663333, "grad_norm": 3.6025545597076416, "learning_rate": 9.851851851851852e-07, "logits/chosen": -2.2729640007019043, "logits/rejected": -2.2010421752929688, "logps/chosen": -57.310707092285156, "logps/rejected": -78.63579559326172, "loss": 0.96, "rewards/accuracies": 0.875, "rewards/chosen": -0.08767473697662354, "rewards/margins": 0.1653064489364624, "rewards/rejected": -0.25298118591308594, "step": 49 }, { "epoch": 0.15390534821085033, "grad_norm": 3.7518036365509033, "learning_rate": 9.848765432098764e-07, "logits/chosen": -2.2569632530212402, "logits/rejected": -2.156235694885254, "logps/chosen": -57.60464096069336, "logps/rejected": -79.68107604980469, "loss": 0.9615, "rewards/accuracies": 0.875, "rewards/chosen": -0.0973990336060524, "rewards/margins": 0.1563912183046341, "rewards/rejected": -0.2537902295589447, "step": 50 }, { "epoch": 0.15698345517506734, "grad_norm": 10.124414443969727, "learning_rate": 9.84567901234568e-07, "logits/chosen": -2.3087525367736816, "logits/rejected": -2.2243282794952393, "logps/chosen": -57.12834167480469, "logps/rejected": -75.76184844970703, "loss": 0.9562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06769189238548279, "rewards/margins": 0.17752349376678467, "rewards/rejected": -0.24521538615226746, "step": 51 }, { "epoch": 0.16006156213928435, "grad_norm": 3.002558469772339, "learning_rate": 9.842592592592592e-07, "logits/chosen": -2.2086381912231445, "logits/rejected": -2.147963523864746, "logps/chosen": -65.40888977050781, "logps/rejected": -81.30929565429688, "loss": 0.9648, "rewards/accuracies": 0.84375, "rewards/chosen": -0.18431836366653442, "rewards/margins": 0.14368771016597748, "rewards/rejected": -0.3280060887336731, "step": 52 }, { "epoch": 0.16313966910350133, "grad_norm": 4.053997039794922, "learning_rate": 9.839506172839506e-07, "logits/chosen": -2.2838587760925293, "logits/rejected": -2.2519800662994385, "logps/chosen": -61.59000015258789, "logps/rejected": -78.03919982910156, "loss": 0.9616, "rewards/accuracies": 0.875, "rewards/chosen": -0.15717877447605133, "rewards/margins": 0.15703755617141724, "rewards/rejected": -0.3142163157463074, "step": 53 }, { "epoch": 0.16621777606771834, "grad_norm": 3.3281757831573486, "learning_rate": 9.83641975308642e-07, "logits/chosen": -2.233320474624634, "logits/rejected": -2.19301700592041, "logps/chosen": -58.10341262817383, "logps/rejected": -71.9045639038086, "loss": 0.9534, "rewards/accuracies": 0.875, "rewards/chosen": -0.10385079681873322, "rewards/margins": 0.18974019587039948, "rewards/rejected": -0.2935909628868103, "step": 54 }, { "epoch": 0.16929588303193535, "grad_norm": 3.359407663345337, "learning_rate": 9.833333333333332e-07, "logits/chosen": -2.2518796920776367, "logits/rejected": -2.1709399223327637, "logps/chosen": -55.558876037597656, "logps/rejected": -79.19924926757812, "loss": 0.9444, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08251329511404037, "rewards/margins": 0.22703827917575836, "rewards/rejected": -0.3095515966415405, "step": 55 }, { "epoch": 0.17237398999615236, "grad_norm": 3.107761859893799, "learning_rate": 9.830246913580248e-07, "logits/chosen": -2.2195911407470703, "logits/rejected": -2.10892653465271, "logps/chosen": -60.266937255859375, "logps/rejected": -80.12210845947266, "loss": 0.9556, "rewards/accuracies": 0.90625, "rewards/chosen": -0.19765178859233856, "rewards/margins": 0.18449318408966064, "rewards/rejected": -0.382144957780838, "step": 56 }, { "epoch": 0.17545209696036937, "grad_norm": 3.250537395477295, "learning_rate": 9.82716049382716e-07, "logits/chosen": -2.219567060470581, "logits/rejected": -2.137305498123169, "logps/chosen": -59.20704650878906, "logps/rejected": -78.99317169189453, "loss": 0.9516, "rewards/accuracies": 0.90625, "rewards/chosen": -0.15500369668006897, "rewards/margins": 0.19814884662628174, "rewards/rejected": -0.3531525731086731, "step": 57 }, { "epoch": 0.17853020392458638, "grad_norm": 3.197932481765747, "learning_rate": 9.824074074074074e-07, "logits/chosen": -2.325204849243164, "logits/rejected": -2.2280678749084473, "logps/chosen": -58.076045989990234, "logps/rejected": -77.997314453125, "loss": 0.9527, "rewards/accuracies": 0.84375, "rewards/chosen": -0.15414543449878693, "rewards/margins": 0.19392552971839905, "rewards/rejected": -0.3480709493160248, "step": 58 }, { "epoch": 0.1816083108888034, "grad_norm": 3.554746150970459, "learning_rate": 9.820987654320988e-07, "logits/chosen": -2.2738447189331055, "logits/rejected": -2.132795810699463, "logps/chosen": -56.52858352661133, "logps/rejected": -76.50516510009766, "loss": 0.9518, "rewards/accuracies": 0.84375, "rewards/chosen": -0.15572671592235565, "rewards/margins": 0.19799663126468658, "rewards/rejected": -0.3537233769893646, "step": 59 }, { "epoch": 0.1846864178530204, "grad_norm": 3.276746988296509, "learning_rate": 9.817901234567902e-07, "logits/chosen": -2.3634731769561768, "logits/rejected": -2.2647314071655273, "logps/chosen": -60.28427505493164, "logps/rejected": -85.69828033447266, "loss": 0.9455, "rewards/accuracies": 0.875, "rewards/chosen": -0.18803244829177856, "rewards/margins": 0.22484959661960602, "rewards/rejected": -0.4128820598125458, "step": 60 }, { "epoch": 0.1877645248172374, "grad_norm": 3.212106943130493, "learning_rate": 9.814814814814814e-07, "logits/chosen": -2.3033721446990967, "logits/rejected": -2.2170302867889404, "logps/chosen": -60.0906982421875, "logps/rejected": -81.34297180175781, "loss": 0.9351, "rewards/accuracies": 0.875, "rewards/chosen": -0.12968984246253967, "rewards/margins": 0.267114520072937, "rewards/rejected": -0.3968043625354767, "step": 61 }, { "epoch": 0.1908426317814544, "grad_norm": 3.687067985534668, "learning_rate": 9.811728395061728e-07, "logits/chosen": -2.235355854034424, "logits/rejected": -2.1549837589263916, "logps/chosen": -65.79511260986328, "logps/rejected": -88.52996063232422, "loss": 0.9392, "rewards/accuracies": 0.90625, "rewards/chosen": -0.23313261568546295, "rewards/margins": 0.25255000591278076, "rewards/rejected": -0.4856826066970825, "step": 62 }, { "epoch": 0.19392073874567142, "grad_norm": 3.6252548694610596, "learning_rate": 9.808641975308642e-07, "logits/chosen": -2.279280662536621, "logits/rejected": -2.159705638885498, "logps/chosen": -61.53561019897461, "logps/rejected": -88.97927856445312, "loss": 0.9273, "rewards/accuracies": 0.96875, "rewards/chosen": -0.19020836055278778, "rewards/margins": 0.3087114989757538, "rewards/rejected": -0.49891987442970276, "step": 63 }, { "epoch": 0.19699884570988843, "grad_norm": 3.1547324657440186, "learning_rate": 9.805555555555554e-07, "logits/chosen": -2.2471373081207275, "logits/rejected": -2.1963589191436768, "logps/chosen": -61.47068786621094, "logps/rejected": -86.71387481689453, "loss": 0.9131, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18542787432670593, "rewards/margins": 0.3688512444496155, "rewards/rejected": -0.554279088973999, "step": 64 }, { "epoch": 0.2000769526741054, "grad_norm": 6.335638999938965, "learning_rate": 9.80246913580247e-07, "logits/chosen": -2.239534378051758, "logits/rejected": -2.135599374771118, "logps/chosen": -62.233097076416016, "logps/rejected": -89.16571044921875, "loss": 0.9216, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2122250497341156, "rewards/margins": 0.33135947585105896, "rewards/rejected": -0.5435845255851746, "step": 65 }, { "epoch": 0.2000769526741054, "eval_logits/chosen": -2.248974323272705, "eval_logits/rejected": -2.1742188930511475, "eval_logps/chosen": -60.0706672668457, "eval_logps/rejected": -84.7643814086914, "eval_loss": 0.9331058859825134, "eval_rewards/accuracies": 0.8670520186424255, "eval_rewards/chosen": -0.17968739569187164, "eval_rewards/margins": 0.28016793727874756, "eval_rewards/rejected": -0.4598553478717804, "eval_runtime": 640.1524, "eval_samples_per_second": 0.54, "eval_steps_per_second": 0.27, "step": 65 }, { "epoch": 0.20315505963832242, "grad_norm": 3.262861967086792, "learning_rate": 9.799382716049382e-07, "logits/chosen": -2.2573695182800293, "logits/rejected": -2.1649317741394043, "logps/chosen": -55.57811737060547, "logps/rejected": -82.70205688476562, "loss": 0.9238, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18610188364982605, "rewards/margins": 0.3231062889099121, "rewards/rejected": -0.5092081427574158, "step": 66 }, { "epoch": 0.20623316660253943, "grad_norm": 3.857335329055786, "learning_rate": 9.796296296296296e-07, "logits/chosen": -2.3400137424468994, "logits/rejected": -2.2532856464385986, "logps/chosen": -59.448585510253906, "logps/rejected": -78.99351501464844, "loss": 0.9302, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1669238656759262, "rewards/margins": 0.2925806939601898, "rewards/rejected": -0.45950454473495483, "step": 67 }, { "epoch": 0.20931127356675644, "grad_norm": 3.2657387256622314, "learning_rate": 9.79320987654321e-07, "logits/chosen": -2.2345848083496094, "logits/rejected": -2.1721432209014893, "logps/chosen": -61.616268157958984, "logps/rejected": -79.17733764648438, "loss": 0.9164, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1450514942407608, "rewards/margins": 0.34572839736938477, "rewards/rejected": -0.49077990651130676, "step": 68 }, { "epoch": 0.21238938053097345, "grad_norm": 3.5109872817993164, "learning_rate": 9.790123456790122e-07, "logits/chosen": -2.308786153793335, "logits/rejected": -2.1853137016296387, "logps/chosen": -58.50771713256836, "logps/rejected": -87.32928466796875, "loss": 0.8997, "rewards/accuracies": 0.96875, "rewards/chosen": -0.167616069316864, "rewards/margins": 0.42413130402565, "rewards/rejected": -0.5917474031448364, "step": 69 }, { "epoch": 0.21546748749519046, "grad_norm": 3.58642578125, "learning_rate": 9.787037037037038e-07, "logits/chosen": -2.2631120681762695, "logits/rejected": -2.18281888961792, "logps/chosen": -57.08592987060547, "logps/rejected": -92.07450866699219, "loss": 0.9092, "rewards/accuracies": 0.875, "rewards/chosen": -0.27796050906181335, "rewards/margins": 0.39447835087776184, "rewards/rejected": -0.6724388599395752, "step": 70 }, { "epoch": 0.21854559445940747, "grad_norm": 3.2762656211853027, "learning_rate": 9.78395061728395e-07, "logits/chosen": -2.2645668983459473, "logits/rejected": -2.1786742210388184, "logps/chosen": -62.60870361328125, "logps/rejected": -77.73043060302734, "loss": 0.9226, "rewards/accuracies": 0.84375, "rewards/chosen": -0.26466161012649536, "rewards/margins": 0.33028727769851685, "rewards/rejected": -0.5949488878250122, "step": 71 }, { "epoch": 0.22162370142362448, "grad_norm": 4.046759605407715, "learning_rate": 9.780864197530864e-07, "logits/chosen": -2.272505760192871, "logits/rejected": -2.2138490676879883, "logps/chosen": -65.99662780761719, "logps/rejected": -97.06468963623047, "loss": 0.8919, "rewards/accuracies": 0.96875, "rewards/chosen": -0.27730366587638855, "rewards/margins": 0.47314703464508057, "rewards/rejected": -0.7504507303237915, "step": 72 }, { "epoch": 0.22470180838784148, "grad_norm": 3.1704912185668945, "learning_rate": 9.777777777777778e-07, "logits/chosen": -2.272125244140625, "logits/rejected": -2.172887086868286, "logps/chosen": -65.15667724609375, "logps/rejected": -91.46578979492188, "loss": 0.9205, "rewards/accuracies": 0.875, "rewards/chosen": -0.3450431227684021, "rewards/margins": 0.345802366733551, "rewards/rejected": -0.6908454895019531, "step": 73 }, { "epoch": 0.2277799153520585, "grad_norm": 9.949706077575684, "learning_rate": 9.774691358024692e-07, "logits/chosen": -2.2340939044952393, "logits/rejected": -2.1859405040740967, "logps/chosen": -58.95082473754883, "logps/rejected": -87.41283416748047, "loss": 0.8915, "rewards/accuracies": 0.96875, "rewards/chosen": -0.15377593040466309, "rewards/margins": 0.4614659249782562, "rewards/rejected": -0.6152418851852417, "step": 74 }, { "epoch": 0.2308580223162755, "grad_norm": 3.393841505050659, "learning_rate": 9.771604938271606e-07, "logits/chosen": -2.3248980045318604, "logits/rejected": -2.155360221862793, "logps/chosen": -56.251220703125, "logps/rejected": -74.44638061523438, "loss": 0.9003, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07746972143650055, "rewards/margins": 0.4245290160179138, "rewards/rejected": -0.501998782157898, "step": 75 }, { "epoch": 0.2339361292804925, "grad_norm": 3.427189826965332, "learning_rate": 9.768518518518518e-07, "logits/chosen": -2.324207067489624, "logits/rejected": -2.179961681365967, "logps/chosen": -59.77492904663086, "logps/rejected": -86.68727111816406, "loss": 0.8911, "rewards/accuracies": 0.875, "rewards/chosen": -0.1953316479921341, "rewards/margins": 0.4725766181945801, "rewards/rejected": -0.6679081916809082, "step": 76 }, { "epoch": 0.2370142362447095, "grad_norm": 3.3243277072906494, "learning_rate": 9.765432098765432e-07, "logits/chosen": -2.2577786445617676, "logits/rejected": -2.1716489791870117, "logps/chosen": -61.15438461303711, "logps/rejected": -85.33625793457031, "loss": 0.8841, "rewards/accuracies": 0.875, "rewards/chosen": -0.2715519666671753, "rewards/margins": 0.5054634809494019, "rewards/rejected": -0.7770154476165771, "step": 77 }, { "epoch": 0.2400923432089265, "grad_norm": 4.082183837890625, "learning_rate": 9.762345679012346e-07, "logits/chosen": -2.3001325130462646, "logits/rejected": -2.2209160327911377, "logps/chosen": -57.67006301879883, "logps/rejected": -85.9696044921875, "loss": 0.8789, "rewards/accuracies": 0.90625, "rewards/chosen": -0.18207187950611115, "rewards/margins": 0.5222326517105103, "rewards/rejected": -0.7043045163154602, "step": 78 }, { "epoch": 0.2431704501731435, "grad_norm": 3.560321092605591, "learning_rate": 9.75925925925926e-07, "logits/chosen": -2.2787177562713623, "logits/rejected": -2.2303502559661865, "logps/chosen": -63.31169891357422, "logps/rejected": -86.29326629638672, "loss": 0.8909, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2820747494697571, "rewards/margins": 0.47888270020484924, "rewards/rejected": -0.7609574794769287, "step": 79 }, { "epoch": 0.24624855713736052, "grad_norm": 3.214315891265869, "learning_rate": 9.756172839506172e-07, "logits/chosen": -2.312192678451538, "logits/rejected": -2.198129415512085, "logps/chosen": -60.23138427734375, "logps/rejected": -83.08747100830078, "loss": 0.8989, "rewards/accuracies": 0.90625, "rewards/chosen": -0.29435208439826965, "rewards/margins": 0.45246875286102295, "rewards/rejected": -0.7468208074569702, "step": 80 }, { "epoch": 0.24932666410157753, "grad_norm": 3.2780160903930664, "learning_rate": 9.753086419753086e-07, "logits/chosen": -2.2892682552337646, "logits/rejected": -2.1795501708984375, "logps/chosen": -60.205448150634766, "logps/rejected": -85.36122131347656, "loss": 0.8907, "rewards/accuracies": 0.875, "rewards/chosen": -0.2603846490383148, "rewards/margins": 0.49043431878089905, "rewards/rejected": -0.7508189678192139, "step": 81 }, { "epoch": 0.2524047710657945, "grad_norm": 3.997421979904175, "learning_rate": 9.75e-07, "logits/chosen": -2.2681455612182617, "logits/rejected": -2.182377338409424, "logps/chosen": -54.64767837524414, "logps/rejected": -80.727783203125, "loss": 0.8726, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19685660302639008, "rewards/margins": 0.5698971748352051, "rewards/rejected": -0.7667537331581116, "step": 82 }, { "epoch": 0.2554828780300115, "grad_norm": 3.322955846786499, "learning_rate": 9.746913580246912e-07, "logits/chosen": -2.2654616832733154, "logits/rejected": -2.19730806350708, "logps/chosen": -54.515785217285156, "logps/rejected": -79.12677001953125, "loss": 0.8666, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1839078962802887, "rewards/margins": 0.5890008211135864, "rewards/rejected": -0.772908627986908, "step": 83 }, { "epoch": 0.2585609849942285, "grad_norm": 3.9470789432525635, "learning_rate": 9.743827160493828e-07, "logits/chosen": -2.3072993755340576, "logits/rejected": -2.2342190742492676, "logps/chosen": -65.36041259765625, "logps/rejected": -80.3888168334961, "loss": 0.9154, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3571280837059021, "rewards/margins": 0.36848214268684387, "rewards/rejected": -0.7256101965904236, "step": 84 }, { "epoch": 0.26163909195844554, "grad_norm": 4.965789318084717, "learning_rate": 9.74074074074074e-07, "logits/chosen": -2.304424285888672, "logits/rejected": -2.2884490489959717, "logps/chosen": -62.77370834350586, "logps/rejected": -86.9851303100586, "loss": 0.8807, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3677733838558197, "rewards/margins": 0.5459418892860413, "rewards/rejected": -0.9137152433395386, "step": 85 }, { "epoch": 0.26471719892266254, "grad_norm": 3.3636722564697266, "learning_rate": 9.737654320987654e-07, "logits/chosen": -2.2683815956115723, "logits/rejected": -2.1771559715270996, "logps/chosen": -66.08097076416016, "logps/rejected": -90.68563842773438, "loss": 0.8668, "rewards/accuracies": 0.96875, "rewards/chosen": -0.29032009840011597, "rewards/margins": 0.5963868498802185, "rewards/rejected": -0.8867068886756897, "step": 86 }, { "epoch": 0.26779530588687955, "grad_norm": 3.095409870147705, "learning_rate": 9.734567901234568e-07, "logits/chosen": -2.253359317779541, "logits/rejected": -2.2450077533721924, "logps/chosen": -59.73688507080078, "logps/rejected": -82.9068832397461, "loss": 0.8721, "rewards/accuracies": 0.96875, "rewards/chosen": -0.26962199807167053, "rewards/margins": 0.5777297616004944, "rewards/rejected": -0.8473517298698425, "step": 87 }, { "epoch": 0.27087341285109656, "grad_norm": 3.118361711502075, "learning_rate": 9.73148148148148e-07, "logits/chosen": -2.311661720275879, "logits/rejected": -2.2663488388061523, "logps/chosen": -66.26983642578125, "logps/rejected": -86.41145324707031, "loss": 0.8779, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3569314181804657, "rewards/margins": 0.5739119648933411, "rewards/rejected": -0.9308434128761292, "step": 88 }, { "epoch": 0.27395151981531357, "grad_norm": 3.4721314907073975, "learning_rate": 9.728395061728396e-07, "logits/chosen": -2.2027947902679443, "logits/rejected": -2.1815905570983887, "logps/chosen": -63.136383056640625, "logps/rejected": -89.72218322753906, "loss": 0.8494, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3215439021587372, "rewards/margins": 0.7052149176597595, "rewards/rejected": -1.0267587900161743, "step": 89 }, { "epoch": 0.2770296267795306, "grad_norm": 3.4242873191833496, "learning_rate": 9.725308641975308e-07, "logits/chosen": -2.3028602600097656, "logits/rejected": -2.242278575897217, "logps/chosen": -61.054283142089844, "logps/rejected": -85.11837768554688, "loss": 0.844, "rewards/accuracies": 0.875, "rewards/chosen": -0.1641973853111267, "rewards/margins": 0.705703616142273, "rewards/rejected": -0.8699010610580444, "step": 90 }, { "epoch": 0.2801077337437476, "grad_norm": 3.5261170864105225, "learning_rate": 9.722222222222222e-07, "logits/chosen": -2.4203124046325684, "logits/rejected": -2.2051470279693604, "logps/chosen": -60.611907958984375, "logps/rejected": -90.39794921875, "loss": 0.8424, "rewards/accuracies": 0.96875, "rewards/chosen": -0.24181726574897766, "rewards/margins": 0.7465718984603882, "rewards/rejected": -0.988389253616333, "step": 91 }, { "epoch": 0.2831858407079646, "grad_norm": 3.670621395111084, "learning_rate": 9.719135802469136e-07, "logits/chosen": -2.3934247493743896, "logits/rejected": -2.2731659412384033, "logps/chosen": -61.94556427001953, "logps/rejected": -94.53395080566406, "loss": 0.8284, "rewards/accuracies": 0.875, "rewards/chosen": -0.1664586365222931, "rewards/margins": 0.8053984045982361, "rewards/rejected": -0.9718570113182068, "step": 92 }, { "epoch": 0.2862639476721816, "grad_norm": 3.42252516746521, "learning_rate": 9.71604938271605e-07, "logits/chosen": -2.380155086517334, "logits/rejected": -2.287362575531006, "logps/chosen": -63.72702407836914, "logps/rejected": -86.15191650390625, "loss": 0.8536, "rewards/accuracies": 0.875, "rewards/chosen": -0.31710949540138245, "rewards/margins": 0.6939618587493896, "rewards/rejected": -1.0110714435577393, "step": 93 }, { "epoch": 0.2893420546363986, "grad_norm": 3.761611223220825, "learning_rate": 9.712962962962962e-07, "logits/chosen": -2.406691551208496, "logits/rejected": -2.306828498840332, "logps/chosen": -56.92264175415039, "logps/rejected": -85.11669921875, "loss": 0.8131, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04687336087226868, "rewards/margins": 0.8399583101272583, "rewards/rejected": -0.8868316411972046, "step": 94 }, { "epoch": 0.2924201616006156, "grad_norm": 3.6854825019836426, "learning_rate": 9.709876543209876e-07, "logits/chosen": -2.286405324935913, "logits/rejected": -2.2411515712738037, "logps/chosen": -58.40718460083008, "logps/rejected": -80.09916687011719, "loss": 0.8219, "rewards/accuracies": 0.96875, "rewards/chosen": -0.041688017547130585, "rewards/margins": 0.8049023151397705, "rewards/rejected": -0.8465902805328369, "step": 95 }, { "epoch": 0.29549826856483263, "grad_norm": 3.2137868404388428, "learning_rate": 9.70679012345679e-07, "logits/chosen": -2.313157558441162, "logits/rejected": -2.1915037631988525, "logps/chosen": -59.99639892578125, "logps/rejected": -90.53729248046875, "loss": 0.8155, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18190164864063263, "rewards/margins": 0.8659375905990601, "rewards/rejected": -1.0478392839431763, "step": 96 }, { "epoch": 0.29857637552904964, "grad_norm": 3.5776710510253906, "learning_rate": 9.703703703703704e-07, "logits/chosen": -2.298112392425537, "logits/rejected": -2.2356016635894775, "logps/chosen": -57.81808090209961, "logps/rejected": -78.5626220703125, "loss": 0.8393, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09086186438798904, "rewards/margins": 0.7402079105377197, "rewards/rejected": -0.831069827079773, "step": 97 }, { "epoch": 0.30165448249326665, "grad_norm": 3.418916940689087, "learning_rate": 9.700617283950618e-07, "logits/chosen": -2.3538246154785156, "logits/rejected": -2.2357232570648193, "logps/chosen": -61.82401657104492, "logps/rejected": -89.89620208740234, "loss": 0.8221, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1468951255083084, "rewards/margins": 0.8347296714782715, "rewards/rejected": -0.9816248416900635, "step": 98 }, { "epoch": 0.30473258945748366, "grad_norm": 3.715747594833374, "learning_rate": 9.69753086419753e-07, "logits/chosen": -2.3388681411743164, "logits/rejected": -2.237071990966797, "logps/chosen": -58.26868438720703, "logps/rejected": -82.41633605957031, "loss": 0.8109, "rewards/accuracies": 0.90625, "rewards/chosen": -0.012538306415081024, "rewards/margins": 0.8756400942802429, "rewards/rejected": -0.8881784081459045, "step": 99 }, { "epoch": 0.30781069642170067, "grad_norm": 3.9271152019500732, "learning_rate": 9.694444444444444e-07, "logits/chosen": -2.3832170963287354, "logits/rejected": -2.2920727729797363, "logps/chosen": -53.686851501464844, "logps/rejected": -78.56365966796875, "loss": 0.8027, "rewards/accuracies": 0.90625, "rewards/chosen": 0.11209321767091751, "rewards/margins": 0.8737419843673706, "rewards/rejected": -0.7616487145423889, "step": 100 }, { "epoch": 0.3108888033859177, "grad_norm": 3.3285398483276367, "learning_rate": 9.691358024691358e-07, "logits/chosen": -2.3472542762756348, "logits/rejected": -2.252619743347168, "logps/chosen": -62.09461212158203, "logps/rejected": -90.05934143066406, "loss": 0.838, "rewards/accuracies": 0.90625, "rewards/chosen": -0.28647029399871826, "rewards/margins": 0.8164421319961548, "rewards/rejected": -1.1029125452041626, "step": 101 }, { "epoch": 0.3139669103501347, "grad_norm": 3.6469316482543945, "learning_rate": 9.68827160493827e-07, "logits/chosen": -2.453035831451416, "logits/rejected": -2.3673622608184814, "logps/chosen": -61.291778564453125, "logps/rejected": -81.7885513305664, "loss": 0.8251, "rewards/accuracies": 0.90625, "rewards/chosen": -0.05301966518163681, "rewards/margins": 0.8107079863548279, "rewards/rejected": -0.8637275695800781, "step": 102 }, { "epoch": 0.3170450173143517, "grad_norm": 3.6249091625213623, "learning_rate": 9.685185185185186e-07, "logits/chosen": -2.3674609661102295, "logits/rejected": -2.3141849040985107, "logps/chosen": -62.16651916503906, "logps/rejected": -88.12458801269531, "loss": 0.7958, "rewards/accuracies": 0.90625, "rewards/chosen": -0.11704695224761963, "rewards/margins": 0.9446767568588257, "rewards/rejected": -1.0617237091064453, "step": 103 }, { "epoch": 0.3201231242785687, "grad_norm": 3.470633029937744, "learning_rate": 9.682098765432098e-07, "logits/chosen": -2.342864751815796, "logits/rejected": -2.2730820178985596, "logps/chosen": -56.4071159362793, "logps/rejected": -82.97390747070312, "loss": 0.7884, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050670966506004333, "rewards/margins": 1.001274824142456, "rewards/rejected": -1.0519458055496216, "step": 104 }, { "epoch": 0.3232012312427857, "grad_norm": 2.98177433013916, "learning_rate": 9.679012345679012e-07, "logits/chosen": -2.371933937072754, "logits/rejected": -2.217571973800659, "logps/chosen": -63.78718566894531, "logps/rejected": -85.73196411132812, "loss": 0.8348, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0527506023645401, "rewards/margins": 0.8140705823898315, "rewards/rejected": -0.8668211698532104, "step": 105 }, { "epoch": 0.32627933820700267, "grad_norm": 3.547852039337158, "learning_rate": 9.675925925925926e-07, "logits/chosen": -2.3186607360839844, "logits/rejected": -2.188603401184082, "logps/chosen": -54.650840759277344, "logps/rejected": -87.87281036376953, "loss": 0.7673, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0542915053665638, "rewards/margins": 1.1231998205184937, "rewards/rejected": -1.0689083337783813, "step": 106 }, { "epoch": 0.3293574451712197, "grad_norm": 2.8182201385498047, "learning_rate": 9.67283950617284e-07, "logits/chosen": -2.363981246948242, "logits/rejected": -2.3066697120666504, "logps/chosen": -60.131378173828125, "logps/rejected": -82.8123779296875, "loss": 0.8307, "rewards/accuracies": 0.875, "rewards/chosen": -0.18422923982143402, "rewards/margins": 0.7980085611343384, "rewards/rejected": -0.9822378754615784, "step": 107 }, { "epoch": 0.3324355521354367, "grad_norm": 3.07796573638916, "learning_rate": 9.669753086419754e-07, "logits/chosen": -2.3651411533355713, "logits/rejected": -2.278515338897705, "logps/chosen": -63.411956787109375, "logps/rejected": -86.82524108886719, "loss": 0.8482, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2812073230743408, "rewards/margins": 0.7783653736114502, "rewards/rejected": -1.059572696685791, "step": 108 }, { "epoch": 0.3355136590996537, "grad_norm": 3.643038511276245, "learning_rate": 9.666666666666666e-07, "logits/chosen": -2.446803331375122, "logits/rejected": -2.306865930557251, "logps/chosen": -58.712467193603516, "logps/rejected": -89.505126953125, "loss": 0.7692, "rewards/accuracies": 0.90625, "rewards/chosen": 0.004267971962690353, "rewards/margins": 1.130439043045044, "rewards/rejected": -1.1261709928512573, "step": 109 }, { "epoch": 0.3385917660638707, "grad_norm": 3.291058301925659, "learning_rate": 9.66358024691358e-07, "logits/chosen": -2.36527419090271, "logits/rejected": -2.2723374366760254, "logps/chosen": -62.55238723754883, "logps/rejected": -98.1033935546875, "loss": 0.7496, "rewards/accuracies": 0.90625, "rewards/chosen": -0.17612019181251526, "rewards/margins": 1.233510971069336, "rewards/rejected": -1.4096312522888184, "step": 110 }, { "epoch": 0.3416698730280877, "grad_norm": 3.0664825439453125, "learning_rate": 9.660493827160494e-07, "logits/chosen": -2.3900222778320312, "logits/rejected": -2.301774740219116, "logps/chosen": -56.730159759521484, "logps/rejected": -91.50101470947266, "loss": 0.7782, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05444101616740227, "rewards/margins": 1.0558232069015503, "rewards/rejected": -1.1102643013000488, "step": 111 }, { "epoch": 0.3447479799923047, "grad_norm": 3.611542224884033, "learning_rate": 9.657407407407408e-07, "logits/chosen": -2.2933735847473145, "logits/rejected": -2.241957187652588, "logps/chosen": -56.95295333862305, "logps/rejected": -89.16055297851562, "loss": 0.7561, "rewards/accuracies": 0.96875, "rewards/chosen": 0.050991665571928024, "rewards/margins": 1.1792148351669312, "rewards/rejected": -1.128223180770874, "step": 112 }, { "epoch": 0.34782608695652173, "grad_norm": 3.0230672359466553, "learning_rate": 9.65432098765432e-07, "logits/chosen": -2.3862719535827637, "logits/rejected": -2.2841689586639404, "logps/chosen": -62.180667877197266, "logps/rejected": -90.24922180175781, "loss": 0.8033, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1268545240163803, "rewards/margins": 0.9729818105697632, "rewards/rejected": -1.0998363494873047, "step": 113 }, { "epoch": 0.35090419392073874, "grad_norm": 2.960042715072632, "learning_rate": 9.651234567901234e-07, "logits/chosen": -2.3378987312316895, "logits/rejected": -2.2584328651428223, "logps/chosen": -55.6922607421875, "logps/rejected": -90.51654815673828, "loss": 0.7683, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0770270824432373, "rewards/margins": 1.20180344581604, "rewards/rejected": -1.2788302898406982, "step": 114 }, { "epoch": 0.35398230088495575, "grad_norm": 3.7731504440307617, "learning_rate": 9.648148148148148e-07, "logits/chosen": -2.3724253177642822, "logits/rejected": -2.2811460494995117, "logps/chosen": -54.361907958984375, "logps/rejected": -81.98551940917969, "loss": 0.7826, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2036108374595642, "rewards/margins": 1.0242570638656616, "rewards/rejected": -0.8206461071968079, "step": 115 }, { "epoch": 0.35706040784917276, "grad_norm": 3.1443960666656494, "learning_rate": 9.64506172839506e-07, "logits/chosen": -2.3677468299865723, "logits/rejected": -2.2856428623199463, "logps/chosen": -56.01827621459961, "logps/rejected": -87.1959228515625, "loss": 0.7588, "rewards/accuracies": 0.9375, "rewards/chosen": 0.061868585646152496, "rewards/margins": 1.2548561096191406, "rewards/rejected": -1.1929874420166016, "step": 116 }, { "epoch": 0.36013851481338977, "grad_norm": 3.2562167644500732, "learning_rate": 9.641975308641976e-07, "logits/chosen": -2.431447982788086, "logits/rejected": -2.3702526092529297, "logps/chosen": -57.017093658447266, "logps/rejected": -82.01689147949219, "loss": 0.8165, "rewards/accuracies": 0.84375, "rewards/chosen": -0.03684867545962334, "rewards/margins": 0.8680396676063538, "rewards/rejected": -0.9048882722854614, "step": 117 }, { "epoch": 0.3632166217776068, "grad_norm": 3.109278917312622, "learning_rate": 9.638888888888888e-07, "logits/chosen": -2.291869640350342, "logits/rejected": -2.169773817062378, "logps/chosen": -59.253387451171875, "logps/rejected": -90.9432373046875, "loss": 0.7848, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1318141222000122, "rewards/margins": 1.0798982381820679, "rewards/rejected": -1.2117122411727905, "step": 118 }, { "epoch": 0.3662947287418238, "grad_norm": 3.458963394165039, "learning_rate": 9.635802469135802e-07, "logits/chosen": -2.393425226211548, "logits/rejected": -2.2729625701904297, "logps/chosen": -61.43100357055664, "logps/rejected": -98.45223999023438, "loss": 0.7839, "rewards/accuracies": 0.90625, "rewards/chosen": -0.16939164698123932, "rewards/margins": 1.2427661418914795, "rewards/rejected": -1.4121578931808472, "step": 119 }, { "epoch": 0.3693728357060408, "grad_norm": 3.290189743041992, "learning_rate": 9.632716049382716e-07, "logits/chosen": -2.3703365325927734, "logits/rejected": -2.2910923957824707, "logps/chosen": -54.59475326538086, "logps/rejected": -82.12841796875, "loss": 0.738, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3581835925579071, "rewards/margins": 1.323901653289795, "rewards/rejected": -0.9657179117202759, "step": 120 }, { "epoch": 0.3724509426702578, "grad_norm": 3.229985475540161, "learning_rate": 9.629629629629628e-07, "logits/chosen": -2.3682103157043457, "logits/rejected": -2.3260884284973145, "logps/chosen": -63.12253952026367, "logps/rejected": -90.03965759277344, "loss": 0.7838, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18079297244548798, "rewards/margins": 1.14082932472229, "rewards/rejected": -1.321622371673584, "step": 121 }, { "epoch": 0.3755290496344748, "grad_norm": 2.960737466812134, "learning_rate": 9.626543209876544e-07, "logits/chosen": -2.3248164653778076, "logits/rejected": -2.240534782409668, "logps/chosen": -64.0333480834961, "logps/rejected": -91.0220947265625, "loss": 0.7586, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09113496541976929, "rewards/margins": 1.221632480621338, "rewards/rejected": -1.3127673864364624, "step": 122 }, { "epoch": 0.3786071565986918, "grad_norm": 3.614871025085449, "learning_rate": 9.623456790123456e-07, "logits/chosen": -2.380082607269287, "logits/rejected": -2.265094757080078, "logps/chosen": -54.359127044677734, "logps/rejected": -81.33898162841797, "loss": 0.7418, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3697752058506012, "rewards/margins": 1.2866675853729248, "rewards/rejected": -0.9168924689292908, "step": 123 }, { "epoch": 0.3816852635629088, "grad_norm": 3.295248508453369, "learning_rate": 9.62037037037037e-07, "logits/chosen": -2.375596046447754, "logits/rejected": -2.334869861602783, "logps/chosen": -60.350372314453125, "logps/rejected": -93.51887512207031, "loss": 0.7204, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10107779502868652, "rewards/margins": 1.4231960773468018, "rewards/rejected": -1.3221181631088257, "step": 124 }, { "epoch": 0.38476337052712584, "grad_norm": 3.4858434200286865, "learning_rate": 9.617283950617284e-07, "logits/chosen": -2.439857006072998, "logits/rejected": -2.37984037399292, "logps/chosen": -58.77433395385742, "logps/rejected": -89.63716125488281, "loss": 0.7104, "rewards/accuracies": 0.875, "rewards/chosen": 0.1576184332370758, "rewards/margins": 1.4318394660949707, "rewards/rejected": -1.2742209434509277, "step": 125 }, { "epoch": 0.38784147749134285, "grad_norm": 3.249058961868286, "learning_rate": 9.614197530864198e-07, "logits/chosen": -2.3974337577819824, "logits/rejected": -2.3102939128875732, "logps/chosen": -57.67648696899414, "logps/rejected": -88.82600402832031, "loss": 0.7283, "rewards/accuracies": 0.84375, "rewards/chosen": 0.06284304708242416, "rewards/margins": 1.3340119123458862, "rewards/rejected": -1.271168828010559, "step": 126 }, { "epoch": 0.39091958445555985, "grad_norm": 3.7678768634796143, "learning_rate": 9.61111111111111e-07, "logits/chosen": -2.393399953842163, "logits/rejected": -2.3294458389282227, "logps/chosen": -53.96731185913086, "logps/rejected": -92.71165466308594, "loss": 0.6868, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3086266815662384, "rewards/margins": 1.6495660543441772, "rewards/rejected": -1.3409392833709717, "step": 127 }, { "epoch": 0.39399769141977686, "grad_norm": 3.3481016159057617, "learning_rate": 9.608024691358024e-07, "logits/chosen": -2.347627639770508, "logits/rejected": -2.236880302429199, "logps/chosen": -59.347137451171875, "logps/rejected": -96.31621551513672, "loss": 0.719, "rewards/accuracies": 0.84375, "rewards/chosen": 0.14623260498046875, "rewards/margins": 1.4534380435943604, "rewards/rejected": -1.3072054386138916, "step": 128 }, { "epoch": 0.3970757983839938, "grad_norm": 2.8981707096099854, "learning_rate": 9.604938271604938e-07, "logits/chosen": -2.32326078414917, "logits/rejected": -2.25354266166687, "logps/chosen": -58.88190460205078, "logps/rejected": -94.9834976196289, "loss": 0.6964, "rewards/accuracies": 0.90625, "rewards/chosen": 0.23565585911273956, "rewards/margins": 1.552744746208191, "rewards/rejected": -1.3170888423919678, "step": 129 }, { "epoch": 0.4001539053482108, "grad_norm": 2.8510990142822266, "learning_rate": 9.601851851851852e-07, "logits/chosen": -2.337348699569702, "logits/rejected": -2.2869303226470947, "logps/chosen": -63.36298751831055, "logps/rejected": -93.15448760986328, "loss": 0.7913, "rewards/accuracies": 0.96875, "rewards/chosen": -0.20966944098472595, "rewards/margins": 1.1014941930770874, "rewards/rejected": -1.3111637830734253, "step": 130 }, { "epoch": 0.4001539053482108, "eval_logits/chosen": -2.349191427230835, "eval_logits/rejected": -2.3033416271209717, "eval_logps/chosen": -55.57199478149414, "eval_logps/rejected": -91.04338836669922, "eval_loss": 0.7403624653816223, "eval_rewards/accuracies": 0.8988439440727234, "eval_rewards/chosen": 0.2701801359653473, "eval_rewards/margins": 1.3579354286193848, "eval_rewards/rejected": -1.0877552032470703, "eval_runtime": 641.8624, "eval_samples_per_second": 0.539, "eval_steps_per_second": 0.27, "step": 130 }, { "epoch": 0.40323201231242783, "grad_norm": 2.7351269721984863, "learning_rate": 9.598765432098766e-07, "logits/chosen": -2.3452868461608887, "logits/rejected": -2.358386516571045, "logps/chosen": -58.17988967895508, "logps/rejected": -92.3962173461914, "loss": 0.6838, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3787102699279785, "rewards/margins": 1.7227580547332764, "rewards/rejected": -1.3440477848052979, "step": 131 }, { "epoch": 0.40631011927664484, "grad_norm": 3.989720344543457, "learning_rate": 9.595679012345678e-07, "logits/chosen": -2.310577869415283, "logits/rejected": -2.2414705753326416, "logps/chosen": -57.90480041503906, "logps/rejected": -94.56487274169922, "loss": 0.7232, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09299667924642563, "rewards/margins": 1.4803472757339478, "rewards/rejected": -1.3873504400253296, "step": 132 }, { "epoch": 0.40938822624086185, "grad_norm": 2.930600166320801, "learning_rate": 9.592592592592592e-07, "logits/chosen": -2.341740608215332, "logits/rejected": -2.2522521018981934, "logps/chosen": -62.03016662597656, "logps/rejected": -93.3059310913086, "loss": 0.7559, "rewards/accuracies": 0.96875, "rewards/chosen": 0.06462980061769485, "rewards/margins": 1.2864925861358643, "rewards/rejected": -1.22186279296875, "step": 133 }, { "epoch": 0.41246633320507886, "grad_norm": 3.0921688079833984, "learning_rate": 9.589506172839506e-07, "logits/chosen": -2.3788983821868896, "logits/rejected": -2.3218729496002197, "logps/chosen": -60.25922775268555, "logps/rejected": -89.8333969116211, "loss": 0.7573, "rewards/accuracies": 0.96875, "rewards/chosen": -0.03085894137620926, "rewards/margins": 1.3399478197097778, "rewards/rejected": -1.3708069324493408, "step": 134 }, { "epoch": 0.41554444016929587, "grad_norm": 2.9138007164001465, "learning_rate": 9.586419753086418e-07, "logits/chosen": -2.386998176574707, "logits/rejected": -2.3325088024139404, "logps/chosen": -50.57230758666992, "logps/rejected": -87.88773345947266, "loss": 0.6423, "rewards/accuracies": 1.0, "rewards/chosen": 0.7195413708686829, "rewards/margins": 1.9140251874923706, "rewards/rejected": -1.194483757019043, "step": 135 }, { "epoch": 0.4186225471335129, "grad_norm": 3.3823022842407227, "learning_rate": 9.583333333333334e-07, "logits/chosen": -2.355806350708008, "logits/rejected": -2.2594542503356934, "logps/chosen": -49.17155456542969, "logps/rejected": -85.3759994506836, "loss": 0.6805, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5944402813911438, "rewards/margins": 1.6740398406982422, "rewards/rejected": -1.0795994997024536, "step": 136 }, { "epoch": 0.4217006540977299, "grad_norm": 3.218871593475342, "learning_rate": 9.580246913580246e-07, "logits/chosen": -2.3808720111846924, "logits/rejected": -2.2865099906921387, "logps/chosen": -55.233821868896484, "logps/rejected": -90.5654067993164, "loss": 0.7352, "rewards/accuracies": 0.875, "rewards/chosen": 0.3482764661312103, "rewards/margins": 1.398787498474121, "rewards/rejected": -1.0505110025405884, "step": 137 }, { "epoch": 0.4247787610619469, "grad_norm": 3.8406877517700195, "learning_rate": 9.57716049382716e-07, "logits/chosen": -2.425323247909546, "logits/rejected": -2.332698345184326, "logps/chosen": -55.33529281616211, "logps/rejected": -91.5009765625, "loss": 0.7072, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3075845539569855, "rewards/margins": 1.6563973426818848, "rewards/rejected": -1.3488128185272217, "step": 138 }, { "epoch": 0.4278568680261639, "grad_norm": 2.975431203842163, "learning_rate": 9.574074074074074e-07, "logits/chosen": -2.419753074645996, "logits/rejected": -2.3230249881744385, "logps/chosen": -48.877899169921875, "logps/rejected": -97.04456329345703, "loss": 0.6207, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5350260138511658, "rewards/margins": 2.0331344604492188, "rewards/rejected": -1.4981082677841187, "step": 139 }, { "epoch": 0.4309349749903809, "grad_norm": 2.9479238986968994, "learning_rate": 9.570987654320988e-07, "logits/chosen": -2.406111001968384, "logits/rejected": -2.266131639480591, "logps/chosen": -54.148006439208984, "logps/rejected": -100.2186508178711, "loss": 0.6409, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3392511010169983, "rewards/margins": 1.9801757335662842, "rewards/rejected": -1.6409246921539307, "step": 140 }, { "epoch": 0.4340130819545979, "grad_norm": 2.8575828075408936, "learning_rate": 9.567901234567902e-07, "logits/chosen": -2.331556558609009, "logits/rejected": -2.277465581893921, "logps/chosen": -63.89174270629883, "logps/rejected": -87.3812484741211, "loss": 0.7847, "rewards/accuracies": 0.875, "rewards/chosen": -0.08250722289085388, "rewards/margins": 1.1660081148147583, "rewards/rejected": -1.2485151290893555, "step": 141 }, { "epoch": 0.43709118891881493, "grad_norm": 2.9750633239746094, "learning_rate": 9.564814814814814e-07, "logits/chosen": -2.4354124069213867, "logits/rejected": -2.385509490966797, "logps/chosen": -49.845455169677734, "logps/rejected": -89.96946716308594, "loss": 0.7015, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5034540891647339, "rewards/margins": 1.7045538425445557, "rewards/rejected": -1.2010996341705322, "step": 142 }, { "epoch": 0.44016929588303194, "grad_norm": 3.223254442214966, "learning_rate": 9.561728395061728e-07, "logits/chosen": -2.485191583633423, "logits/rejected": -2.402041435241699, "logps/chosen": -59.350128173828125, "logps/rejected": -89.07848358154297, "loss": 0.6925, "rewards/accuracies": 0.875, "rewards/chosen": 0.5649563074111938, "rewards/margins": 1.7118489742279053, "rewards/rejected": -1.1468926668167114, "step": 143 }, { "epoch": 0.44324740284724895, "grad_norm": 3.759110450744629, "learning_rate": 9.558641975308642e-07, "logits/chosen": -2.3778553009033203, "logits/rejected": -2.3218822479248047, "logps/chosen": -56.973533630371094, "logps/rejected": -90.88619995117188, "loss": 0.7094, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3408811092376709, "rewards/margins": 1.5467872619628906, "rewards/rejected": -1.2059061527252197, "step": 144 }, { "epoch": 0.44632550981146596, "grad_norm": 3.148838758468628, "learning_rate": 9.555555555555556e-07, "logits/chosen": -2.327157974243164, "logits/rejected": -2.324303150177002, "logps/chosen": -52.817626953125, "logps/rejected": -94.0676040649414, "loss": 0.665, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4238552451133728, "rewards/margins": 1.9328645467758179, "rewards/rejected": -1.5090094804763794, "step": 145 }, { "epoch": 0.44940361677568297, "grad_norm": 3.2902302742004395, "learning_rate": 9.552469135802468e-07, "logits/chosen": -2.4257211685180664, "logits/rejected": -2.357758045196533, "logps/chosen": -59.79077911376953, "logps/rejected": -98.19409942626953, "loss": 0.6948, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04811589792370796, "rewards/margins": 1.7886967658996582, "rewards/rejected": -1.7405807971954346, "step": 146 }, { "epoch": 0.4524817237399, "grad_norm": 2.990460157394409, "learning_rate": 9.549382716049382e-07, "logits/chosen": -2.4389750957489014, "logits/rejected": -2.376420021057129, "logps/chosen": -54.17119598388672, "logps/rejected": -93.17366790771484, "loss": 0.6229, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6108043789863586, "rewards/margins": 2.1362993717193604, "rewards/rejected": -1.525494933128357, "step": 147 }, { "epoch": 0.455559830704117, "grad_norm": 3.246271848678589, "learning_rate": 9.546296296296296e-07, "logits/chosen": -2.417654037475586, "logits/rejected": -2.3569703102111816, "logps/chosen": -50.42028045654297, "logps/rejected": -84.9511947631836, "loss": 0.6824, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6510617733001709, "rewards/margins": 1.8540288209915161, "rewards/rejected": -1.2029671669006348, "step": 148 }, { "epoch": 0.458637937668334, "grad_norm": 3.0190272331237793, "learning_rate": 9.543209876543208e-07, "logits/chosen": -2.3829596042633057, "logits/rejected": -2.367797613143921, "logps/chosen": -49.94941329956055, "logps/rejected": -92.59821319580078, "loss": 0.6215, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6040168404579163, "rewards/margins": 2.1457107067108154, "rewards/rejected": -1.541693925857544, "step": 149 }, { "epoch": 0.461716044632551, "grad_norm": 2.6618223190307617, "learning_rate": 9.540123456790124e-07, "logits/chosen": -2.4454894065856934, "logits/rejected": -2.360128402709961, "logps/chosen": -57.56269836425781, "logps/rejected": -91.02347564697266, "loss": 0.6852, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4994054436683655, "rewards/margins": 1.8750057220458984, "rewards/rejected": -1.3756003379821777, "step": 150 }, { "epoch": 0.464794151596768, "grad_norm": 2.9470932483673096, "learning_rate": 9.537037037037036e-07, "logits/chosen": -2.4891064167022705, "logits/rejected": -2.357296943664551, "logps/chosen": -60.11787796020508, "logps/rejected": -103.63739776611328, "loss": 0.6477, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1077844649553299, "rewards/margins": 2.161799430847168, "rewards/rejected": -2.0540151596069336, "step": 151 }, { "epoch": 0.467872258560985, "grad_norm": 12.647138595581055, "learning_rate": 9.53395061728395e-07, "logits/chosen": -2.454115629196167, "logits/rejected": -2.3961164951324463, "logps/chosen": -54.38486099243164, "logps/rejected": -95.17996215820312, "loss": 0.6533, "rewards/accuracies": 0.84375, "rewards/chosen": 0.49435263872146606, "rewards/margins": 2.0549285411834717, "rewards/rejected": -1.5605759620666504, "step": 152 }, { "epoch": 0.470950365525202, "grad_norm": 2.6998515129089355, "learning_rate": 9.530864197530863e-07, "logits/chosen": -2.497526168823242, "logits/rejected": -2.4297268390655518, "logps/chosen": -63.1408576965332, "logps/rejected": -95.77769470214844, "loss": 0.7367, "rewards/accuracies": 0.90625, "rewards/chosen": 0.004490181803703308, "rewards/margins": 1.5459473133087158, "rewards/rejected": -1.541457176208496, "step": 153 }, { "epoch": 0.474028472489419, "grad_norm": 4.737138271331787, "learning_rate": 9.527777777777777e-07, "logits/chosen": -2.4100804328918457, "logits/rejected": -2.3580939769744873, "logps/chosen": -51.9671516418457, "logps/rejected": -92.1158447265625, "loss": 0.6493, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5416803359985352, "rewards/margins": 2.000169038772583, "rewards/rejected": -1.4584888219833374, "step": 154 }, { "epoch": 0.477106579453636, "grad_norm": 3.4176666736602783, "learning_rate": 9.524691358024691e-07, "logits/chosen": -2.4179108142852783, "logits/rejected": -2.330063819885254, "logps/chosen": -47.61423110961914, "logps/rejected": -100.94380950927734, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 0.8077340722084045, "rewards/margins": 2.8309593200683594, "rewards/rejected": -2.0232253074645996, "step": 155 }, { "epoch": 0.480184686417853, "grad_norm": 3.0015335083007812, "learning_rate": 9.521604938271605e-07, "logits/chosen": -2.440092086791992, "logits/rejected": -2.3606011867523193, "logps/chosen": -61.505218505859375, "logps/rejected": -107.69963073730469, "loss": 0.6354, "rewards/accuracies": 0.96875, "rewards/chosen": 0.15792518854141235, "rewards/margins": 2.386098861694336, "rewards/rejected": -2.2281737327575684, "step": 156 }, { "epoch": 0.48326279338207, "grad_norm": 2.916036605834961, "learning_rate": 9.518518518518518e-07, "logits/chosen": -2.415865898132324, "logits/rejected": -2.3111281394958496, "logps/chosen": -50.87012481689453, "logps/rejected": -99.05119323730469, "loss": 0.6162, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7376286387443542, "rewards/margins": 2.3508365154266357, "rewards/rejected": -1.6132079362869263, "step": 157 }, { "epoch": 0.486340900346287, "grad_norm": 3.3261666297912598, "learning_rate": 9.515432098765431e-07, "logits/chosen": -2.417018413543701, "logits/rejected": -2.3601229190826416, "logps/chosen": -54.34171676635742, "logps/rejected": -102.18463134765625, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 0.33187344670295715, "rewards/margins": 2.250150680541992, "rewards/rejected": -1.9182770252227783, "step": 158 }, { "epoch": 0.48941900731050403, "grad_norm": 3.189695358276367, "learning_rate": 9.512345679012346e-07, "logits/chosen": -2.423196315765381, "logits/rejected": -2.384220838546753, "logps/chosen": -50.386749267578125, "logps/rejected": -82.91292572021484, "loss": 0.6633, "rewards/accuracies": 0.875, "rewards/chosen": 0.5881660580635071, "rewards/margins": 1.8550424575805664, "rewards/rejected": -1.2668763399124146, "step": 159 }, { "epoch": 0.49249711427472104, "grad_norm": 3.044964551925659, "learning_rate": 9.509259259259259e-07, "logits/chosen": -2.484602689743042, "logits/rejected": -2.4092607498168945, "logps/chosen": -50.013671875, "logps/rejected": -93.76225280761719, "loss": 0.6618, "rewards/accuracies": 0.96875, "rewards/chosen": 0.42952045798301697, "rewards/margins": 2.0300707817077637, "rewards/rejected": -1.6005504131317139, "step": 160 }, { "epoch": 0.49557522123893805, "grad_norm": 2.92433762550354, "learning_rate": 9.506172839506172e-07, "logits/chosen": -2.461122751235962, "logits/rejected": -2.377786159515381, "logps/chosen": -54.87761306762695, "logps/rejected": -90.5372085571289, "loss": 0.6748, "rewards/accuracies": 0.9375, "rewards/chosen": 0.33767735958099365, "rewards/margins": 1.9756239652633667, "rewards/rejected": -1.6379467248916626, "step": 161 }, { "epoch": 0.49865332820315506, "grad_norm": 3.1729371547698975, "learning_rate": 9.503086419753086e-07, "logits/chosen": -2.430629014968872, "logits/rejected": -2.4028310775756836, "logps/chosen": -54.75767135620117, "logps/rejected": -96.23094177246094, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 0.38419097661972046, "rewards/margins": 2.0859737396240234, "rewards/rejected": -1.7017827033996582, "step": 162 }, { "epoch": 0.5017314351673721, "grad_norm": 3.3290648460388184, "learning_rate": 9.499999999999999e-07, "logits/chosen": -2.493173122406006, "logits/rejected": -2.450010061264038, "logps/chosen": -53.32080078125, "logps/rejected": -91.20701599121094, "loss": 0.7047, "rewards/accuracies": 1.0, "rewards/chosen": 0.5700912475585938, "rewards/margins": 2.0040459632873535, "rewards/rejected": -1.4339547157287598, "step": 163 }, { "epoch": 0.504809542131589, "grad_norm": 2.9724442958831787, "learning_rate": 9.496913580246913e-07, "logits/chosen": -2.3315913677215576, "logits/rejected": -2.307602882385254, "logps/chosen": -60.60820007324219, "logps/rejected": -102.05558776855469, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.14813652634620667, "rewards/margins": 2.009459972381592, "rewards/rejected": -1.861323356628418, "step": 164 }, { "epoch": 0.5078876490958061, "grad_norm": 3.0711050033569336, "learning_rate": 9.493827160493827e-07, "logits/chosen": -2.400763511657715, "logits/rejected": -2.3830010890960693, "logps/chosen": -56.85745620727539, "logps/rejected": -100.58525848388672, "loss": 0.6685, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5390109419822693, "rewards/margins": 2.335085868835449, "rewards/rejected": -1.7960749864578247, "step": 165 }, { "epoch": 0.510965756060023, "grad_norm": 3.875068187713623, "learning_rate": 9.49074074074074e-07, "logits/chosen": -2.323957920074463, "logits/rejected": -2.3299994468688965, "logps/chosen": -43.76856231689453, "logps/rejected": -93.14926147460938, "loss": 0.5331, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0353261232376099, "rewards/margins": 2.5975470542907715, "rewards/rejected": -1.5622210502624512, "step": 166 }, { "epoch": 0.5140438630242401, "grad_norm": 2.556124687194824, "learning_rate": 9.487654320987654e-07, "logits/chosen": -2.4636995792388916, "logits/rejected": -2.3857030868530273, "logps/chosen": -47.64371109008789, "logps/rejected": -89.66706848144531, "loss": 0.6662, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8086596727371216, "rewards/margins": 2.2125062942504883, "rewards/rejected": -1.4038467407226562, "step": 167 }, { "epoch": 0.517121969988457, "grad_norm": 3.2217578887939453, "learning_rate": 9.484567901234567e-07, "logits/chosen": -2.49454402923584, "logits/rejected": -2.4099411964416504, "logps/chosen": -54.103965759277344, "logps/rejected": -94.96905517578125, "loss": 0.6765, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5298041701316833, "rewards/margins": 2.0449068546295166, "rewards/rejected": -1.5151026248931885, "step": 168 }, { "epoch": 0.5202000769526741, "grad_norm": 2.9076905250549316, "learning_rate": 9.481481481481481e-07, "logits/chosen": -2.3849501609802246, "logits/rejected": -2.325199842453003, "logps/chosen": -48.76258850097656, "logps/rejected": -91.29879760742188, "loss": 0.608, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7636001706123352, "rewards/margins": 2.263770341873169, "rewards/rejected": -1.5001699924468994, "step": 169 }, { "epoch": 0.5232781839168911, "grad_norm": 4.924505710601807, "learning_rate": 9.478395061728395e-07, "logits/chosen": -2.4528417587280273, "logits/rejected": -2.4363222122192383, "logps/chosen": -53.36818313598633, "logps/rejected": -105.33737182617188, "loss": 0.578, "rewards/accuracies": 1.0, "rewards/chosen": 0.48011618852615356, "rewards/margins": 2.766507148742676, "rewards/rejected": -2.286391258239746, "step": 170 }, { "epoch": 0.5263562908811081, "grad_norm": 3.5291030406951904, "learning_rate": 9.475308641975308e-07, "logits/chosen": -2.48616623878479, "logits/rejected": -2.4377081394195557, "logps/chosen": -47.975345611572266, "logps/rejected": -83.79718780517578, "loss": 0.681, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6680082082748413, "rewards/margins": 1.8872753381729126, "rewards/rejected": -1.2192671298980713, "step": 171 }, { "epoch": 0.5294343978453251, "grad_norm": 2.7510974407196045, "learning_rate": 9.472222222222221e-07, "logits/chosen": -2.4462637901306152, "logits/rejected": -2.4196536540985107, "logps/chosen": -59.70549011230469, "logps/rejected": -97.62277221679688, "loss": 0.6624, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15860964357852936, "rewards/margins": 2.1446824073791504, "rewards/rejected": -1.9860727787017822, "step": 172 }, { "epoch": 0.5325125048095422, "grad_norm": 3.3543457984924316, "learning_rate": 9.469135802469136e-07, "logits/chosen": -2.431657314300537, "logits/rejected": -2.404188632965088, "logps/chosen": -46.46947479248047, "logps/rejected": -78.01934051513672, "loss": 0.7028, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8444581627845764, "rewards/margins": 1.7518669366836548, "rewards/rejected": -0.9074087142944336, "step": 173 }, { "epoch": 0.5355906117737591, "grad_norm": 2.453125476837158, "learning_rate": 9.466049382716049e-07, "logits/chosen": -2.3910465240478516, "logits/rejected": -2.364084005355835, "logps/chosen": -63.523040771484375, "logps/rejected": -94.32888793945312, "loss": 0.7418, "rewards/accuracies": 0.875, "rewards/chosen": -0.19492198526859283, "rewards/margins": 1.6581616401672363, "rewards/rejected": -1.8530837297439575, "step": 174 }, { "epoch": 0.5386687187379762, "grad_norm": 3.370898485183716, "learning_rate": 9.462962962962962e-07, "logits/chosen": -2.4009971618652344, "logits/rejected": -2.368461847305298, "logps/chosen": -58.59605407714844, "logps/rejected": -99.36207580566406, "loss": 0.6266, "rewards/accuracies": 0.96875, "rewards/chosen": 0.37723100185394287, "rewards/margins": 2.5182125568389893, "rewards/rejected": -2.1409811973571777, "step": 175 }, { "epoch": 0.5417468257021931, "grad_norm": 3.400818109512329, "learning_rate": 9.459876543209876e-07, "logits/chosen": -2.533757448196411, "logits/rejected": -2.452385425567627, "logps/chosen": -51.45113754272461, "logps/rejected": -82.9143295288086, "loss": 0.7265, "rewards/accuracies": 0.875, "rewards/chosen": 0.6980175375938416, "rewards/margins": 1.7065876722335815, "rewards/rejected": -1.0085700750350952, "step": 176 }, { "epoch": 0.5448249326664102, "grad_norm": 3.017878532409668, "learning_rate": 9.456790123456789e-07, "logits/chosen": -2.462146282196045, "logits/rejected": -2.3281548023223877, "logps/chosen": -53.845703125, "logps/rejected": -91.43633270263672, "loss": 0.7643, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14741948246955872, "rewards/margins": 1.7748405933380127, "rewards/rejected": -1.6274211406707764, "step": 177 }, { "epoch": 0.5479030396306271, "grad_norm": 3.1482412815093994, "learning_rate": 9.453703703703704e-07, "logits/chosen": -2.604809522628784, "logits/rejected": -2.5332272052764893, "logps/chosen": -56.75010681152344, "logps/rejected": -94.68323516845703, "loss": 0.6734, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3095078766345978, "rewards/margins": 2.0122950077056885, "rewards/rejected": -1.702787160873413, "step": 178 }, { "epoch": 0.5509811465948442, "grad_norm": 2.806612491607666, "learning_rate": 9.450617283950617e-07, "logits/chosen": -2.520390033721924, "logits/rejected": -2.4647679328918457, "logps/chosen": -50.446861267089844, "logps/rejected": -89.80989837646484, "loss": 0.6402, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8557029366493225, "rewards/margins": 2.322169065475464, "rewards/rejected": -1.466465950012207, "step": 179 }, { "epoch": 0.5540592535590612, "grad_norm": 3.057952880859375, "learning_rate": 9.44753086419753e-07, "logits/chosen": -2.5132110118865967, "logits/rejected": -2.462681531906128, "logps/chosen": -50.41011428833008, "logps/rejected": -94.62890625, "loss": 0.5866, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9181693196296692, "rewards/margins": 2.7807912826538086, "rewards/rejected": -1.862621784210205, "step": 180 }, { "epoch": 0.5571373605232782, "grad_norm": 3.5119974613189697, "learning_rate": 9.444444444444444e-07, "logits/chosen": -2.393418312072754, "logits/rejected": -2.3486387729644775, "logps/chosen": -59.59280014038086, "logps/rejected": -111.87150573730469, "loss": 0.639, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1756262183189392, "rewards/margins": 2.6586008071899414, "rewards/rejected": -2.4829745292663574, "step": 181 }, { "epoch": 0.5602154674874952, "grad_norm": 2.785459280014038, "learning_rate": 9.441358024691357e-07, "logits/chosen": -2.5364413261413574, "logits/rejected": -2.4217896461486816, "logps/chosen": -51.115997314453125, "logps/rejected": -91.44586181640625, "loss": 0.6834, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7738412022590637, "rewards/margins": 2.1956870555877686, "rewards/rejected": -1.4218459129333496, "step": 182 }, { "epoch": 0.5632935744517122, "grad_norm": 3.898836135864258, "learning_rate": 9.438271604938271e-07, "logits/chosen": -2.5164575576782227, "logits/rejected": -2.418027877807617, "logps/chosen": -45.60322952270508, "logps/rejected": -99.04080200195312, "loss": 0.5239, "rewards/accuracies": 0.96875, "rewards/chosen": 1.069001317024231, "rewards/margins": 2.9433836936950684, "rewards/rejected": -1.874382495880127, "step": 183 }, { "epoch": 0.5663716814159292, "grad_norm": 3.0302186012268066, "learning_rate": 9.435185185185185e-07, "logits/chosen": -2.530912160873413, "logits/rejected": -2.577169895172119, "logps/chosen": -52.30243682861328, "logps/rejected": -96.05391693115234, "loss": 0.6092, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6854079365730286, "rewards/margins": 2.606445550918579, "rewards/rejected": -1.9210375547409058, "step": 184 }, { "epoch": 0.5694497883801463, "grad_norm": 3.811326503753662, "learning_rate": 9.432098765432098e-07, "logits/chosen": -2.5026817321777344, "logits/rejected": -2.46850323677063, "logps/chosen": -54.10247039794922, "logps/rejected": -104.70036315917969, "loss": 0.5908, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4248885214328766, "rewards/margins": 2.943228244781494, "rewards/rejected": -2.5183396339416504, "step": 185 }, { "epoch": 0.5725278953443632, "grad_norm": 3.352590322494507, "learning_rate": 9.429012345679011e-07, "logits/chosen": -2.508408546447754, "logits/rejected": -2.4808380603790283, "logps/chosen": -50.45133972167969, "logps/rejected": -99.90965270996094, "loss": 0.58, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8158664107322693, "rewards/margins": 2.790006399154663, "rewards/rejected": -1.974139928817749, "step": 186 }, { "epoch": 0.5756060023085803, "grad_norm": 3.256568193435669, "learning_rate": 9.425925925925925e-07, "logits/chosen": -2.4806301593780518, "logits/rejected": -2.451636552810669, "logps/chosen": -46.26686477661133, "logps/rejected": -96.68470001220703, "loss": 0.5293, "rewards/accuracies": 1.0, "rewards/chosen": 0.9856312274932861, "rewards/margins": 2.9678826332092285, "rewards/rejected": -1.9822516441345215, "step": 187 }, { "epoch": 0.5786841092727972, "grad_norm": 3.3700196743011475, "learning_rate": 9.422839506172839e-07, "logits/chosen": -2.4717044830322266, "logits/rejected": -2.37485408782959, "logps/chosen": -46.48899459838867, "logps/rejected": -95.29428100585938, "loss": 0.5615, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0647377967834473, "rewards/margins": 2.9646799564361572, "rewards/rejected": -1.89994215965271, "step": 188 }, { "epoch": 0.5817622162370142, "grad_norm": 3.0509564876556396, "learning_rate": 9.419753086419753e-07, "logits/chosen": -2.530578136444092, "logits/rejected": -2.4289095401763916, "logps/chosen": -58.52287292480469, "logps/rejected": -107.07351684570312, "loss": 0.6029, "rewards/accuracies": 0.90625, "rewards/chosen": 0.30171021819114685, "rewards/margins": 2.89652156829834, "rewards/rejected": -2.5948116779327393, "step": 189 }, { "epoch": 0.5848403232012312, "grad_norm": 3.201169729232788, "learning_rate": 9.416666666666666e-07, "logits/chosen": -2.476519823074341, "logits/rejected": -2.48606538772583, "logps/chosen": -55.6549186706543, "logps/rejected": -91.80580139160156, "loss": 0.7197, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5917803645133972, "rewards/margins": 2.150052070617676, "rewards/rejected": -1.5582717657089233, "step": 190 }, { "epoch": 0.5879184301654482, "grad_norm": 3.1990060806274414, "learning_rate": 9.413580246913579e-07, "logits/chosen": -2.530785083770752, "logits/rejected": -2.4930763244628906, "logps/chosen": -53.1740837097168, "logps/rejected": -96.0416259765625, "loss": 0.6157, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6592293381690979, "rewards/margins": 2.6536271572113037, "rewards/rejected": -1.9943978786468506, "step": 191 }, { "epoch": 0.5909965371296653, "grad_norm": 2.927205801010132, "learning_rate": 9.410493827160494e-07, "logits/chosen": -2.5585522651672363, "logits/rejected": -2.4799323081970215, "logps/chosen": -48.724849700927734, "logps/rejected": -98.6844482421875, "loss": 0.5937, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9114745855331421, "rewards/margins": 2.9259629249572754, "rewards/rejected": -2.0144882202148438, "step": 192 }, { "epoch": 0.5940746440938822, "grad_norm": 3.1642916202545166, "learning_rate": 9.407407407407407e-07, "logits/chosen": -2.490614414215088, "logits/rejected": -2.4689362049102783, "logps/chosen": -58.82742691040039, "logps/rejected": -111.56881713867188, "loss": 0.5962, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17882564663887024, "rewards/margins": 2.9047188758850098, "rewards/rejected": -2.725893497467041, "step": 193 }, { "epoch": 0.5971527510580993, "grad_norm": 3.068449020385742, "learning_rate": 9.40432098765432e-07, "logits/chosen": -2.5402164459228516, "logits/rejected": -2.4976110458374023, "logps/chosen": -47.848724365234375, "logps/rejected": -91.43160247802734, "loss": 0.6184, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8665890693664551, "rewards/margins": 2.6092193126678467, "rewards/rejected": -1.7426303625106812, "step": 194 }, { "epoch": 0.6002308580223162, "grad_norm": 3.78131103515625, "learning_rate": 9.401234567901234e-07, "logits/chosen": -2.456618070602417, "logits/rejected": -2.408468723297119, "logps/chosen": -51.813045501708984, "logps/rejected": -102.75614166259766, "loss": 0.6443, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6546244621276855, "rewards/margins": 2.7353382110595703, "rewards/rejected": -2.0807137489318848, "step": 195 }, { "epoch": 0.6002308580223162, "eval_logits/chosen": -2.4553024768829346, "eval_logits/rejected": -2.454810380935669, "eval_logps/chosen": -50.62422561645508, "eval_logps/rejected": -98.03628540039062, "eval_loss": 0.6410006284713745, "eval_rewards/accuracies": 0.910404622554779, "eval_rewards/chosen": 0.7649564146995544, "eval_rewards/margins": 2.552001714706421, "eval_rewards/rejected": -1.7870454788208008, "eval_runtime": 639.1807, "eval_samples_per_second": 0.541, "eval_steps_per_second": 0.271, "step": 195 }, { "epoch": 0.6033089649865333, "grad_norm": 2.8251187801361084, "learning_rate": 9.398148148148147e-07, "logits/chosen": -2.51753306388855, "logits/rejected": -2.4937663078308105, "logps/chosen": -47.750244140625, "logps/rejected": -97.08165740966797, "loss": 0.6153, "rewards/accuracies": 0.875, "rewards/chosen": 0.43865451216697693, "rewards/margins": 2.642551898956299, "rewards/rejected": -2.20389723777771, "step": 196 }, { "epoch": 0.6063870719507503, "grad_norm": 3.216726541519165, "learning_rate": 9.395061728395062e-07, "logits/chosen": -2.478604793548584, "logits/rejected": -2.4659204483032227, "logps/chosen": -51.1640739440918, "logps/rejected": -92.93955993652344, "loss": 0.6288, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8214237093925476, "rewards/margins": 2.6350455284118652, "rewards/rejected": -1.8136217594146729, "step": 197 }, { "epoch": 0.6094651789149673, "grad_norm": 4.124622821807861, "learning_rate": 9.391975308641975e-07, "logits/chosen": -2.554816484451294, "logits/rejected": -2.4062976837158203, "logps/chosen": -53.479225158691406, "logps/rejected": -111.77421569824219, "loss": 0.52, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7531198263168335, "rewards/margins": 3.192842483520508, "rewards/rejected": -2.439722776412964, "step": 198 }, { "epoch": 0.6125432858791843, "grad_norm": 2.740999698638916, "learning_rate": 9.388888888888888e-07, "logits/chosen": -2.5349647998809814, "logits/rejected": -2.5169179439544678, "logps/chosen": -43.76857376098633, "logps/rejected": -102.37848663330078, "loss": 0.5349, "rewards/accuracies": 1.0, "rewards/chosen": 1.1554709672927856, "rewards/margins": 3.384432792663574, "rewards/rejected": -2.228961944580078, "step": 199 }, { "epoch": 0.6156213928434013, "grad_norm": 3.2258331775665283, "learning_rate": 9.385802469135802e-07, "logits/chosen": -2.543311357498169, "logits/rejected": -2.440028190612793, "logps/chosen": -54.585201263427734, "logps/rejected": -97.01506805419922, "loss": 0.6951, "rewards/accuracies": 0.84375, "rewards/chosen": 0.44891688227653503, "rewards/margins": 2.2617740631103516, "rewards/rejected": -1.8128573894500732, "step": 200 }, { "epoch": 0.6186994998076183, "grad_norm": 3.5308072566986084, "learning_rate": 9.382716049382715e-07, "logits/chosen": -2.560608386993408, "logits/rejected": -2.490032434463501, "logps/chosen": -47.79678726196289, "logps/rejected": -99.77317810058594, "loss": 0.5166, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8798754215240479, "rewards/margins": 2.9981529712677, "rewards/rejected": -2.1182777881622314, "step": 201 }, { "epoch": 0.6217776067718354, "grad_norm": 2.736955165863037, "learning_rate": 9.379629629629629e-07, "logits/chosen": -2.554363250732422, "logits/rejected": -2.5390679836273193, "logps/chosen": -49.21474075317383, "logps/rejected": -90.04205322265625, "loss": 0.658, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8129448294639587, "rewards/margins": 2.4557876586914062, "rewards/rejected": -1.6428430080413818, "step": 202 }, { "epoch": 0.6248557137360523, "grad_norm": 3.691833972930908, "learning_rate": 9.376543209876543e-07, "logits/chosen": -2.489957332611084, "logits/rejected": -2.447296142578125, "logps/chosen": -53.453041076660156, "logps/rejected": -94.9090576171875, "loss": 0.6257, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7735847234725952, "rewards/margins": 2.6583011150360107, "rewards/rejected": -1.8847166299819946, "step": 203 }, { "epoch": 0.6279338207002694, "grad_norm": 3.220451831817627, "learning_rate": 9.373456790123456e-07, "logits/chosen": -2.4636645317077637, "logits/rejected": -2.4433348178863525, "logps/chosen": -49.670989990234375, "logps/rejected": -101.84941101074219, "loss": 0.5664, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0443823337554932, "rewards/margins": 3.1252517700195312, "rewards/rejected": -2.080869197845459, "step": 204 }, { "epoch": 0.6310119276644863, "grad_norm": 3.5722100734710693, "learning_rate": 9.370370370370369e-07, "logits/chosen": -2.507133722305298, "logits/rejected": -2.4803340435028076, "logps/chosen": -44.90645217895508, "logps/rejected": -94.41202545166016, "loss": 0.5796, "rewards/accuracies": 0.9375, "rewards/chosen": 1.055967092514038, "rewards/margins": 2.905322313308716, "rewards/rejected": -1.849354863166809, "step": 205 }, { "epoch": 0.6340900346287034, "grad_norm": 4.921390056610107, "learning_rate": 9.367283950617284e-07, "logits/chosen": -2.4887919425964355, "logits/rejected": -2.498070001602173, "logps/chosen": -54.77848434448242, "logps/rejected": -102.27349853515625, "loss": 0.6261, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5191159844398499, "rewards/margins": 2.8685851097106934, "rewards/rejected": -2.3494694232940674, "step": 206 }, { "epoch": 0.6371681415929203, "grad_norm": 2.940537214279175, "learning_rate": 9.364197530864197e-07, "logits/chosen": -2.5379788875579834, "logits/rejected": -2.5557796955108643, "logps/chosen": -49.5312614440918, "logps/rejected": -97.61029052734375, "loss": 0.6044, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9622249603271484, "rewards/margins": 3.002868890762329, "rewards/rejected": -2.0406439304351807, "step": 207 }, { "epoch": 0.6402462485571374, "grad_norm": 2.966557741165161, "learning_rate": 9.361111111111111e-07, "logits/chosen": -2.5257387161254883, "logits/rejected": -2.5017929077148438, "logps/chosen": -53.396270751953125, "logps/rejected": -108.76403045654297, "loss": 0.6664, "rewards/accuracies": 0.9375, "rewards/chosen": 0.46376800537109375, "rewards/margins": 2.87304949760437, "rewards/rejected": -2.4092817306518555, "step": 208 }, { "epoch": 0.6433243555213544, "grad_norm": 3.3986783027648926, "learning_rate": 9.358024691358024e-07, "logits/chosen": -2.5608034133911133, "logits/rejected": -2.5680229663848877, "logps/chosen": -45.82611083984375, "logps/rejected": -105.4083023071289, "loss": 0.5056, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0905905961990356, "rewards/margins": 3.442164182662964, "rewards/rejected": -2.3515734672546387, "step": 209 }, { "epoch": 0.6464024624855714, "grad_norm": 3.118433713912964, "learning_rate": 9.354938271604937e-07, "logits/chosen": -2.55564022064209, "logits/rejected": -2.5344080924987793, "logps/chosen": -41.100669860839844, "logps/rejected": -96.50792694091797, "loss": 0.5605, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5822601318359375, "rewards/margins": 3.254073143005371, "rewards/rejected": -1.6718130111694336, "step": 210 }, { "epoch": 0.6494805694497884, "grad_norm": 3.066647529602051, "learning_rate": 9.351851851851852e-07, "logits/chosen": -2.5580806732177734, "logits/rejected": -2.5831427574157715, "logps/chosen": -60.09915542602539, "logps/rejected": -100.76399230957031, "loss": 0.6673, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4332161545753479, "rewards/margins": 2.566448211669922, "rewards/rejected": -2.1332321166992188, "step": 211 }, { "epoch": 0.6525586764140053, "grad_norm": 4.442399978637695, "learning_rate": 9.348765432098765e-07, "logits/chosen": -2.5718297958374023, "logits/rejected": -2.534317970275879, "logps/chosen": -51.263816833496094, "logps/rejected": -109.04266357421875, "loss": 0.574, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2528579235076904, "rewards/margins": 3.4304466247558594, "rewards/rejected": -2.177588939666748, "step": 212 }, { "epoch": 0.6556367833782224, "grad_norm": 3.5111756324768066, "learning_rate": 9.345679012345678e-07, "logits/chosen": -2.502255916595459, "logits/rejected": -2.4789981842041016, "logps/chosen": -46.233131408691406, "logps/rejected": -100.84355163574219, "loss": 0.5962, "rewards/accuracies": 0.96875, "rewards/chosen": 1.152850866317749, "rewards/margins": 3.1026506423950195, "rewards/rejected": -1.9497997760772705, "step": 213 }, { "epoch": 0.6587148903424394, "grad_norm": 4.444263458251953, "learning_rate": 9.342592592592592e-07, "logits/chosen": -2.5514445304870605, "logits/rejected": -2.4989359378814697, "logps/chosen": -40.264793395996094, "logps/rejected": -101.05160522460938, "loss": 0.4685, "rewards/accuracies": 1.0, "rewards/chosen": 1.5204670429229736, "rewards/margins": 3.7046332359313965, "rewards/rejected": -2.184166431427002, "step": 214 }, { "epoch": 0.6617929973066564, "grad_norm": 3.1408798694610596, "learning_rate": 9.339506172839505e-07, "logits/chosen": -2.553867816925049, "logits/rejected": -2.513965606689453, "logps/chosen": -49.0800895690918, "logps/rejected": -93.6390151977539, "loss": 0.6265, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1388617753982544, "rewards/margins": 2.8115992546081543, "rewards/rejected": -1.6727373600006104, "step": 215 }, { "epoch": 0.6648711042708734, "grad_norm": 3.1061856746673584, "learning_rate": 9.336419753086419e-07, "logits/chosen": -2.518256664276123, "logits/rejected": -2.412074089050293, "logps/chosen": -50.70615768432617, "logps/rejected": -113.40647888183594, "loss": 0.5812, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6807650923728943, "rewards/margins": 3.319185256958008, "rewards/rejected": -2.6384201049804688, "step": 216 }, { "epoch": 0.6679492112350904, "grad_norm": 3.043384075164795, "learning_rate": 9.333333333333333e-07, "logits/chosen": -2.516871690750122, "logits/rejected": -2.488454818725586, "logps/chosen": -47.60067367553711, "logps/rejected": -85.99879455566406, "loss": 0.7203, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0176938772201538, "rewards/margins": 2.0867412090301514, "rewards/rejected": -1.069047451019287, "step": 217 }, { "epoch": 0.6710273181993074, "grad_norm": 3.247565507888794, "learning_rate": 9.330246913580246e-07, "logits/chosen": -2.520442008972168, "logits/rejected": -2.486067533493042, "logps/chosen": -50.44165802001953, "logps/rejected": -89.75894165039062, "loss": 0.6717, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7913181781768799, "rewards/margins": 2.406400680541992, "rewards/rejected": -1.6150826215744019, "step": 218 }, { "epoch": 0.6741054251635245, "grad_norm": 3.0026774406433105, "learning_rate": 9.32716049382716e-07, "logits/chosen": -2.528803825378418, "logits/rejected": -2.5354979038238525, "logps/chosen": -42.0608024597168, "logps/rejected": -100.26925659179688, "loss": 0.4872, "rewards/accuracies": 1.0, "rewards/chosen": 1.6282458305358887, "rewards/margins": 3.7547240257263184, "rewards/rejected": -2.1264781951904297, "step": 219 }, { "epoch": 0.6771835321277414, "grad_norm": 3.3445754051208496, "learning_rate": 9.324074074074074e-07, "logits/chosen": -2.570845365524292, "logits/rejected": -2.5090298652648926, "logps/chosen": -52.621559143066406, "logps/rejected": -115.46925354003906, "loss": 0.4979, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9329714775085449, "rewards/margins": 3.95100474357605, "rewards/rejected": -3.018033266067505, "step": 220 }, { "epoch": 0.6802616390919585, "grad_norm": 3.7310776710510254, "learning_rate": 9.320987654320987e-07, "logits/chosen": -2.469003677368164, "logits/rejected": -2.4623286724090576, "logps/chosen": -52.08897399902344, "logps/rejected": -106.3140640258789, "loss": 0.5849, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8858616352081299, "rewards/margins": 3.1920344829559326, "rewards/rejected": -2.306173086166382, "step": 221 }, { "epoch": 0.6833397460561754, "grad_norm": 3.3953840732574463, "learning_rate": 9.317901234567901e-07, "logits/chosen": -2.5116162300109863, "logits/rejected": -2.429924726486206, "logps/chosen": -47.85185623168945, "logps/rejected": -104.40858459472656, "loss": 0.5118, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0194709300994873, "rewards/margins": 3.352532148361206, "rewards/rejected": -2.3330612182617188, "step": 222 }, { "epoch": 0.6864178530203925, "grad_norm": 3.3618006706237793, "learning_rate": 9.314814814814814e-07, "logits/chosen": -2.467696189880371, "logits/rejected": -2.481786012649536, "logps/chosen": -43.1153450012207, "logps/rejected": -105.46810150146484, "loss": 0.4589, "rewards/accuracies": 1.0, "rewards/chosen": 1.4241039752960205, "rewards/margins": 3.9625608921051025, "rewards/rejected": -2.538456678390503, "step": 223 }, { "epoch": 0.6894959599846094, "grad_norm": 4.20452356338501, "learning_rate": 9.311728395061727e-07, "logits/chosen": -2.5215771198272705, "logits/rejected": -2.5173768997192383, "logps/chosen": -46.09497833251953, "logps/rejected": -92.93782043457031, "loss": 0.5875, "rewards/accuracies": 0.90625, "rewards/chosen": 1.319096326828003, "rewards/margins": 2.820160388946533, "rewards/rejected": -1.5010641813278198, "step": 224 }, { "epoch": 0.6925740669488265, "grad_norm": 3.467524528503418, "learning_rate": 9.308641975308642e-07, "logits/chosen": -2.501818895339966, "logits/rejected": -2.472160816192627, "logps/chosen": -46.178611755371094, "logps/rejected": -84.02984619140625, "loss": 0.6753, "rewards/accuracies": 0.875, "rewards/chosen": 1.0798590183258057, "rewards/margins": 2.2138943672180176, "rewards/rejected": -1.1340354681015015, "step": 225 }, { "epoch": 0.6956521739130435, "grad_norm": 3.096839427947998, "learning_rate": 9.305555555555555e-07, "logits/chosen": -2.5720081329345703, "logits/rejected": -2.5215792655944824, "logps/chosen": -39.49162292480469, "logps/rejected": -79.63432312011719, "loss": 0.6996, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5910142660140991, "rewards/margins": 2.2971439361572266, "rewards/rejected": -0.7061295509338379, "step": 226 }, { "epoch": 0.6987302808772605, "grad_norm": 3.419701337814331, "learning_rate": 9.302469135802468e-07, "logits/chosen": -2.6412086486816406, "logits/rejected": -2.529240131378174, "logps/chosen": -40.15169906616211, "logps/rejected": -94.27589416503906, "loss": 0.5401, "rewards/accuracies": 0.9375, "rewards/chosen": 1.788132905960083, "rewards/margins": 3.304734706878662, "rewards/rejected": -1.5166016817092896, "step": 227 }, { "epoch": 0.7018083878414775, "grad_norm": 3.309004783630371, "learning_rate": 9.299382716049382e-07, "logits/chosen": -2.5037436485290527, "logits/rejected": -2.4452710151672363, "logps/chosen": -45.087764739990234, "logps/rejected": -102.48275756835938, "loss": 0.5911, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2781305313110352, "rewards/margins": 3.319096803665161, "rewards/rejected": -2.040966033935547, "step": 228 }, { "epoch": 0.7048864948056945, "grad_norm": 3.422783374786377, "learning_rate": 9.296296296296295e-07, "logits/chosen": -2.5662174224853516, "logits/rejected": -2.5752267837524414, "logps/chosen": -44.515342712402344, "logps/rejected": -95.11038970947266, "loss": 0.6108, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0526137351989746, "rewards/margins": 2.9060566425323486, "rewards/rejected": -1.8534425497055054, "step": 229 }, { "epoch": 0.7079646017699115, "grad_norm": 3.0696091651916504, "learning_rate": 9.29320987654321e-07, "logits/chosen": -2.612668752670288, "logits/rejected": -2.5397000312805176, "logps/chosen": -51.63105392456055, "logps/rejected": -89.94660949707031, "loss": 0.7235, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0308852195739746, "rewards/margins": 2.5575764179229736, "rewards/rejected": -1.5266913175582886, "step": 230 }, { "epoch": 0.7110427087341286, "grad_norm": 3.882664680480957, "learning_rate": 9.290123456790123e-07, "logits/chosen": -2.54378080368042, "logits/rejected": -2.52396297454834, "logps/chosen": -40.62314224243164, "logps/rejected": -89.8819808959961, "loss": 0.5807, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6651129722595215, "rewards/margins": 3.213552474975586, "rewards/rejected": -1.548439383506775, "step": 231 }, { "epoch": 0.7141208156983455, "grad_norm": 3.1344501972198486, "learning_rate": 9.287037037037036e-07, "logits/chosen": -2.526731491088867, "logits/rejected": -2.482027053833008, "logps/chosen": -50.149662017822266, "logps/rejected": -113.96255493164062, "loss": 0.518, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0519269704818726, "rewards/margins": 3.937784433364868, "rewards/rejected": -2.885857582092285, "step": 232 }, { "epoch": 0.7171989226625626, "grad_norm": 2.7000153064727783, "learning_rate": 9.28395061728395e-07, "logits/chosen": -2.507040500640869, "logits/rejected": -2.5098633766174316, "logps/chosen": -52.90354537963867, "logps/rejected": -114.14091491699219, "loss": 0.5438, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7778097987174988, "rewards/margins": 3.5883290767669678, "rewards/rejected": -2.810518980026245, "step": 233 }, { "epoch": 0.7202770296267795, "grad_norm": 4.076379776000977, "learning_rate": 9.280864197530863e-07, "logits/chosen": -2.56345534324646, "logits/rejected": -2.552523136138916, "logps/chosen": -52.41379165649414, "logps/rejected": -111.97518157958984, "loss": 0.5753, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9435467720031738, "rewards/margins": 3.690955400466919, "rewards/rejected": -2.747408866882324, "step": 234 }, { "epoch": 0.7233551365909965, "grad_norm": 2.6957125663757324, "learning_rate": 9.277777777777777e-07, "logits/chosen": -2.5583081245422363, "logits/rejected": -2.4843006134033203, "logps/chosen": -47.99848556518555, "logps/rejected": -100.234619140625, "loss": 0.6312, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1611839532852173, "rewards/margins": 3.1978657245635986, "rewards/rejected": -2.036681652069092, "step": 235 }, { "epoch": 0.7264332435552135, "grad_norm": 3.3783583641052246, "learning_rate": 9.274691358024691e-07, "logits/chosen": -2.551907777786255, "logits/rejected": -2.523765802383423, "logps/chosen": -45.67497253417969, "logps/rejected": -94.94474792480469, "loss": 0.6175, "rewards/accuracies": 0.9375, "rewards/chosen": 1.155747890472412, "rewards/margins": 3.0969693660736084, "rewards/rejected": -1.9412213563919067, "step": 236 }, { "epoch": 0.7295113505194305, "grad_norm": 3.7196810245513916, "learning_rate": 9.271604938271604e-07, "logits/chosen": -2.5668044090270996, "logits/rejected": -2.551241874694824, "logps/chosen": -54.32753372192383, "logps/rejected": -104.71369934082031, "loss": 0.6124, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6207692623138428, "rewards/margins": 2.9897031784057617, "rewards/rejected": -2.368934154510498, "step": 237 }, { "epoch": 0.7325894574836476, "grad_norm": 3.4727768898010254, "learning_rate": 9.268518518518517e-07, "logits/chosen": -2.495330810546875, "logits/rejected": -2.5032951831817627, "logps/chosen": -47.331729888916016, "logps/rejected": -111.26133728027344, "loss": 0.5642, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9328625202178955, "rewards/margins": 3.6606810092926025, "rewards/rejected": -2.727818489074707, "step": 238 }, { "epoch": 0.7356675644478645, "grad_norm": 2.9821414947509766, "learning_rate": 9.265432098765432e-07, "logits/chosen": -2.6322712898254395, "logits/rejected": -2.6013407707214355, "logps/chosen": -47.84272766113281, "logps/rejected": -110.04986572265625, "loss": 0.4788, "rewards/accuracies": 0.96875, "rewards/chosen": 1.226229190826416, "rewards/margins": 3.9312100410461426, "rewards/rejected": -2.7049803733825684, "step": 239 }, { "epoch": 0.7387456714120816, "grad_norm": 3.497087001800537, "learning_rate": 9.262345679012345e-07, "logits/chosen": -2.6222681999206543, "logits/rejected": -2.588829278945923, "logps/chosen": -49.748409271240234, "logps/rejected": -100.35984802246094, "loss": 0.5958, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8824483156204224, "rewards/margins": 3.108867883682251, "rewards/rejected": -2.226419687271118, "step": 240 }, { "epoch": 0.7418237783762985, "grad_norm": 2.9141933917999268, "learning_rate": 9.259259259259259e-07, "logits/chosen": -2.5263912677764893, "logits/rejected": -2.5696027278900146, "logps/chosen": -46.75834274291992, "logps/rejected": -102.6002426147461, "loss": 0.5431, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1143786907196045, "rewards/margins": 3.558305025100708, "rewards/rejected": -2.4439265727996826, "step": 241 }, { "epoch": 0.7449018853405156, "grad_norm": 4.224743366241455, "learning_rate": 9.256172839506172e-07, "logits/chosen": -2.4639387130737305, "logits/rejected": -2.506413221359253, "logps/chosen": -53.285152435302734, "logps/rejected": -107.24271392822266, "loss": 0.6386, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9325734376907349, "rewards/margins": 3.2211289405822754, "rewards/rejected": -2.288555145263672, "step": 242 }, { "epoch": 0.7479799923047326, "grad_norm": 4.312895774841309, "learning_rate": 9.253086419753085e-07, "logits/chosen": -2.5898053646087646, "logits/rejected": -2.5536856651306152, "logps/chosen": -48.32197570800781, "logps/rejected": -94.77409362792969, "loss": 0.6043, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8433923721313477, "rewards/margins": 2.856217861175537, "rewards/rejected": -2.0128254890441895, "step": 243 }, { "epoch": 0.7510580992689496, "grad_norm": 3.332305908203125, "learning_rate": 9.25e-07, "logits/chosen": -2.5708000659942627, "logits/rejected": -2.5196785926818848, "logps/chosen": -49.80482482910156, "logps/rejected": -109.53822326660156, "loss": 0.5564, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9400083422660828, "rewards/margins": 3.863356113433838, "rewards/rejected": -2.9233477115631104, "step": 244 }, { "epoch": 0.7541362062331666, "grad_norm": 3.3482964038848877, "learning_rate": 9.246913580246913e-07, "logits/chosen": -2.5216317176818848, "logits/rejected": -2.540832042694092, "logps/chosen": -44.4945068359375, "logps/rejected": -96.19371795654297, "loss": 0.5515, "rewards/accuracies": 0.875, "rewards/chosen": 1.116837501525879, "rewards/margins": 3.3529131412506104, "rewards/rejected": -2.2360761165618896, "step": 245 }, { "epoch": 0.7572143131973836, "grad_norm": 5.075887203216553, "learning_rate": 9.243827160493826e-07, "logits/chosen": -2.615999460220337, "logits/rejected": -2.5575764179229736, "logps/chosen": -52.859683990478516, "logps/rejected": -108.1877670288086, "loss": 0.6323, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6069881319999695, "rewards/margins": 3.26786470413208, "rewards/rejected": -2.660876750946045, "step": 246 }, { "epoch": 0.7602924201616006, "grad_norm": 3.3875999450683594, "learning_rate": 9.24074074074074e-07, "logits/chosen": -2.526608943939209, "logits/rejected": -2.5712876319885254, "logps/chosen": -45.126426696777344, "logps/rejected": -108.39273834228516, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 1.2379045486450195, "rewards/margins": 3.949145793914795, "rewards/rejected": -2.7112414836883545, "step": 247 }, { "epoch": 0.7633705271258177, "grad_norm": 4.473629951477051, "learning_rate": 9.237654320987653e-07, "logits/chosen": -2.6431241035461426, "logits/rejected": -2.5246474742889404, "logps/chosen": -51.81708526611328, "logps/rejected": -102.17719268798828, "loss": 0.6357, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6328962445259094, "rewards/margins": 2.8739304542541504, "rewards/rejected": -2.2410342693328857, "step": 248 }, { "epoch": 0.7664486340900346, "grad_norm": 2.9779584407806396, "learning_rate": 9.234567901234567e-07, "logits/chosen": -2.6081316471099854, "logits/rejected": -2.53676700592041, "logps/chosen": -58.790218353271484, "logps/rejected": -122.87825012207031, "loss": 0.5503, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5050345063209534, "rewards/margins": 3.8579306602478027, "rewards/rejected": -3.352895975112915, "step": 249 }, { "epoch": 0.7695267410542517, "grad_norm": 3.3829710483551025, "learning_rate": 9.231481481481481e-07, "logits/chosen": -2.630014419555664, "logits/rejected": -2.6422760486602783, "logps/chosen": -50.98275375366211, "logps/rejected": -118.30909729003906, "loss": 0.4726, "rewards/accuracies": 0.875, "rewards/chosen": 0.9460659623146057, "rewards/margins": 4.084068298339844, "rewards/rejected": -3.138002395629883, "step": 250 }, { "epoch": 0.7726048480184686, "grad_norm": 3.5048184394836426, "learning_rate": 9.228395061728394e-07, "logits/chosen": -2.479278087615967, "logits/rejected": -2.517411708831787, "logps/chosen": -41.95247268676758, "logps/rejected": -99.82443237304688, "loss": 0.528, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4078240394592285, "rewards/margins": 3.8416762351989746, "rewards/rejected": -2.433851718902588, "step": 251 }, { "epoch": 0.7756829549826857, "grad_norm": 3.2299489974975586, "learning_rate": 9.225308641975308e-07, "logits/chosen": -2.523660182952881, "logits/rejected": -2.4890618324279785, "logps/chosen": -42.24872970581055, "logps/rejected": -94.74623107910156, "loss": 0.5161, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5426329374313354, "rewards/margins": 3.6483969688415527, "rewards/rejected": -2.1057639122009277, "step": 252 }, { "epoch": 0.7787610619469026, "grad_norm": 3.874493360519409, "learning_rate": 9.222222222222222e-07, "logits/chosen": -2.597382068634033, "logits/rejected": -2.575188159942627, "logps/chosen": -44.61785125732422, "logps/rejected": -95.53608703613281, "loss": 0.5549, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4300986528396606, "rewards/margins": 3.383997678756714, "rewards/rejected": -1.9538987874984741, "step": 253 }, { "epoch": 0.7818391689111197, "grad_norm": 4.142974853515625, "learning_rate": 9.219135802469135e-07, "logits/chosen": -2.656071186065674, "logits/rejected": -2.656618118286133, "logps/chosen": -40.18168640136719, "logps/rejected": -95.2264175415039, "loss": 0.53, "rewards/accuracies": 1.0, "rewards/chosen": 1.8479722738265991, "rewards/margins": 3.7213053703308105, "rewards/rejected": -1.8733329772949219, "step": 254 }, { "epoch": 0.7849172758753367, "grad_norm": 4.609100341796875, "learning_rate": 9.216049382716049e-07, "logits/chosen": -2.7219128608703613, "logits/rejected": -2.648160457611084, "logps/chosen": -59.1294059753418, "logps/rejected": -111.6597900390625, "loss": 0.6789, "rewards/accuracies": 0.9375, "rewards/chosen": 0.32596486806869507, "rewards/margins": 3.149057388305664, "rewards/rejected": -2.8230926990509033, "step": 255 }, { "epoch": 0.7879953828395537, "grad_norm": 3.253755569458008, "learning_rate": 9.212962962962962e-07, "logits/chosen": -2.646648406982422, "logits/rejected": -2.6170434951782227, "logps/chosen": -48.94092559814453, "logps/rejected": -110.5947494506836, "loss": 0.498, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0706961154937744, "rewards/margins": 3.9399967193603516, "rewards/rejected": -2.8693008422851562, "step": 256 }, { "epoch": 0.7910734898037707, "grad_norm": 3.5564117431640625, "learning_rate": 9.209876543209875e-07, "logits/chosen": -2.531296968460083, "logits/rejected": -2.567146062850952, "logps/chosen": -54.65451431274414, "logps/rejected": -95.7993392944336, "loss": 0.6552, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7143691182136536, "rewards/margins": 2.878884792327881, "rewards/rejected": -2.164515733718872, "step": 257 }, { "epoch": 0.7941515967679876, "grad_norm": 4.90385103225708, "learning_rate": 9.20679012345679e-07, "logits/chosen": -2.5180277824401855, "logits/rejected": -2.507877826690674, "logps/chosen": -49.494991302490234, "logps/rejected": -113.27205657958984, "loss": 0.5153, "rewards/accuracies": 0.96875, "rewards/chosen": 0.921296238899231, "rewards/margins": 3.794828176498413, "rewards/rejected": -2.8735315799713135, "step": 258 }, { "epoch": 0.7972297037322047, "grad_norm": 3.775928258895874, "learning_rate": 9.203703703703703e-07, "logits/chosen": -2.552502155303955, "logits/rejected": -2.5897955894470215, "logps/chosen": -47.24555969238281, "logps/rejected": -109.56806945800781, "loss": 0.4936, "rewards/accuracies": 0.875, "rewards/chosen": 1.448692798614502, "rewards/margins": 3.9566311836242676, "rewards/rejected": -2.5079383850097656, "step": 259 }, { "epoch": 0.8003078106964217, "grad_norm": 3.1077682971954346, "learning_rate": 9.200617283950616e-07, "logits/chosen": -2.532046318054199, "logits/rejected": -2.514073371887207, "logps/chosen": -53.859352111816406, "logps/rejected": -112.24592590332031, "loss": 0.5739, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9820669889450073, "rewards/margins": 3.7067031860351562, "rewards/rejected": -2.7246358394622803, "step": 260 }, { "epoch": 0.8003078106964217, "eval_logits/chosen": -2.5416691303253174, "eval_logits/rejected": -2.57967472076416, "eval_logps/chosen": -44.511356353759766, "eval_logps/rejected": -99.21448516845703, "eval_loss": 0.6007125973701477, "eval_rewards/accuracies": 0.8959537744522095, "eval_rewards/chosen": 1.3762434720993042, "eval_rewards/margins": 3.2811086177825928, "eval_rewards/rejected": -1.9048649072647095, "eval_runtime": 647.0895, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.267, "step": 260 }, { "epoch": 0.8033859176606387, "grad_norm": 3.699167013168335, "learning_rate": 9.19753086419753e-07, "logits/chosen": -2.626288414001465, "logits/rejected": -2.5732531547546387, "logps/chosen": -38.27876281738281, "logps/rejected": -86.52983856201172, "loss": 0.5892, "rewards/accuracies": 1.0, "rewards/chosen": 1.7244510650634766, "rewards/margins": 3.3113484382629395, "rewards/rejected": -1.5868972539901733, "step": 261 }, { "epoch": 0.8064640246248557, "grad_norm": 3.868537425994873, "learning_rate": 9.194444444444443e-07, "logits/chosen": -2.513434410095215, "logits/rejected": -2.5445282459259033, "logps/chosen": -51.536949157714844, "logps/rejected": -93.27861785888672, "loss": 0.6112, "rewards/accuracies": 0.90625, "rewards/chosen": 1.075799584388733, "rewards/margins": 2.742117404937744, "rewards/rejected": -1.6663177013397217, "step": 262 }, { "epoch": 0.8095421315890727, "grad_norm": 3.376837730407715, "learning_rate": 9.191358024691358e-07, "logits/chosen": -2.6189305782318115, "logits/rejected": -2.6494741439819336, "logps/chosen": -39.3719482421875, "logps/rejected": -102.66836547851562, "loss": 0.5055, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7938640117645264, "rewards/margins": 4.163243770599365, "rewards/rejected": -2.369379997253418, "step": 263 }, { "epoch": 0.8126202385532897, "grad_norm": 3.311459541320801, "learning_rate": 9.188271604938271e-07, "logits/chosen": -2.626796007156372, "logits/rejected": -2.5530292987823486, "logps/chosen": -33.239585876464844, "logps/rejected": -96.651611328125, "loss": 0.5286, "rewards/accuracies": 0.9375, "rewards/chosen": 1.987099528312683, "rewards/margins": 3.8389511108398438, "rewards/rejected": -1.8518518209457397, "step": 264 }, { "epoch": 0.8156983455175068, "grad_norm": 2.8475067615509033, "learning_rate": 9.185185185185184e-07, "logits/chosen": -2.6181538105010986, "logits/rejected": -2.605443000793457, "logps/chosen": -48.62809753417969, "logps/rejected": -110.28319549560547, "loss": 0.516, "rewards/accuracies": 0.96875, "rewards/chosen": 1.313554286956787, "rewards/margins": 4.182537078857422, "rewards/rejected": -2.8689825534820557, "step": 265 }, { "epoch": 0.8187764524817237, "grad_norm": 3.3830230236053467, "learning_rate": 9.182098765432098e-07, "logits/chosen": -2.6446099281311035, "logits/rejected": -2.5973987579345703, "logps/chosen": -47.07724380493164, "logps/rejected": -102.31806182861328, "loss": 0.5547, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3093924522399902, "rewards/margins": 3.5921270847320557, "rewards/rejected": -2.2827346324920654, "step": 266 }, { "epoch": 0.8218545594459408, "grad_norm": 2.874486207962036, "learning_rate": 9.179012345679011e-07, "logits/chosen": -2.600055694580078, "logits/rejected": -2.588064193725586, "logps/chosen": -42.54057312011719, "logps/rejected": -81.1456069946289, "loss": 0.6457, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6188712120056152, "rewards/margins": 2.6559910774230957, "rewards/rejected": -1.037119746208191, "step": 267 }, { "epoch": 0.8249326664101577, "grad_norm": 3.0185985565185547, "learning_rate": 9.175925925925925e-07, "logits/chosen": -2.5944533348083496, "logits/rejected": -2.5412731170654297, "logps/chosen": -36.99090576171875, "logps/rejected": -84.58300018310547, "loss": 0.6062, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9659675359725952, "rewards/margins": 3.1968326568603516, "rewards/rejected": -1.2308650016784668, "step": 268 }, { "epoch": 0.8280107733743748, "grad_norm": 4.907352447509766, "learning_rate": 9.172839506172839e-07, "logits/chosen": -2.560718059539795, "logits/rejected": -2.5487844944000244, "logps/chosen": -45.9289436340332, "logps/rejected": -101.8675308227539, "loss": 0.5865, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3746099472045898, "rewards/margins": 3.500842571258545, "rewards/rejected": -2.126232624053955, "step": 269 }, { "epoch": 0.8310888803385917, "grad_norm": 2.948448419570923, "learning_rate": 9.169753086419752e-07, "logits/chosen": -2.605248212814331, "logits/rejected": -2.5903377532958984, "logps/chosen": -49.0601806640625, "logps/rejected": -102.77564239501953, "loss": 0.6076, "rewards/accuracies": 0.90625, "rewards/chosen": 1.024146556854248, "rewards/margins": 3.3453238010406494, "rewards/rejected": -2.3211772441864014, "step": 270 }, { "epoch": 0.8341669873028088, "grad_norm": 4.008134365081787, "learning_rate": 9.166666666666665e-07, "logits/chosen": -2.650695562362671, "logits/rejected": -2.589987277984619, "logps/chosen": -37.05559158325195, "logps/rejected": -100.7747573852539, "loss": 0.4505, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7386938333511353, "rewards/margins": 4.071256160736084, "rewards/rejected": -2.3325624465942383, "step": 271 }, { "epoch": 0.8372450942670258, "grad_norm": 3.984403371810913, "learning_rate": 9.16358024691358e-07, "logits/chosen": -2.635608196258545, "logits/rejected": -2.623614549636841, "logps/chosen": -37.398983001708984, "logps/rejected": -99.07244873046875, "loss": 0.525, "rewards/accuracies": 0.9375, "rewards/chosen": 1.80023193359375, "rewards/margins": 4.0154194831848145, "rewards/rejected": -2.2151875495910645, "step": 272 }, { "epoch": 0.8403232012312428, "grad_norm": 4.791906356811523, "learning_rate": 9.160493827160493e-07, "logits/chosen": -2.6596875190734863, "logits/rejected": -2.6760733127593994, "logps/chosen": -42.93864440917969, "logps/rejected": -100.00141906738281, "loss": 0.5171, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4562852382659912, "rewards/margins": 3.7708518505096436, "rewards/rejected": -2.3145666122436523, "step": 273 }, { "epoch": 0.8434013081954598, "grad_norm": 3.4116759300231934, "learning_rate": 9.157407407407407e-07, "logits/chosen": -2.5909152030944824, "logits/rejected": -2.576144218444824, "logps/chosen": -37.546451568603516, "logps/rejected": -107.27818298339844, "loss": 0.4597, "rewards/accuracies": 0.9375, "rewards/chosen": 2.036391258239746, "rewards/margins": 4.676547527313232, "rewards/rejected": -2.6401565074920654, "step": 274 }, { "epoch": 0.8464794151596768, "grad_norm": 3.3608598709106445, "learning_rate": 9.15432098765432e-07, "logits/chosen": -2.586519479751587, "logits/rejected": -2.5948901176452637, "logps/chosen": -41.87301254272461, "logps/rejected": -92.47911834716797, "loss": 0.6473, "rewards/accuracies": 0.875, "rewards/chosen": 1.5057698488235474, "rewards/margins": 3.005998373031616, "rewards/rejected": -1.500228762626648, "step": 275 }, { "epoch": 0.8495575221238938, "grad_norm": 4.364643573760986, "learning_rate": 9.151234567901233e-07, "logits/chosen": -2.650465965270996, "logits/rejected": -2.6089437007904053, "logps/chosen": -64.65325927734375, "logps/rejected": -121.88529968261719, "loss": 0.6091, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14463753998279572, "rewards/margins": 3.8103315830230713, "rewards/rejected": -3.665693759918213, "step": 276 }, { "epoch": 0.8526356290881109, "grad_norm": 3.3818018436431885, "learning_rate": 9.148148148148148e-07, "logits/chosen": -2.680659532546997, "logits/rejected": -2.6984338760375977, "logps/chosen": -60.16276550292969, "logps/rejected": -113.06404876708984, "loss": 0.5941, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2844860255718231, "rewards/margins": 3.667433738708496, "rewards/rejected": -3.3829479217529297, "step": 277 }, { "epoch": 0.8557137360523278, "grad_norm": 4.736813545227051, "learning_rate": 9.145061728395061e-07, "logits/chosen": -2.6688835620880127, "logits/rejected": -2.7358038425445557, "logps/chosen": -43.91510772705078, "logps/rejected": -100.51078796386719, "loss": 0.6117, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2702950239181519, "rewards/margins": 3.6494436264038086, "rewards/rejected": -2.3791487216949463, "step": 278 }, { "epoch": 0.8587918430165449, "grad_norm": 4.244693756103516, "learning_rate": 9.141975308641974e-07, "logits/chosen": -2.6358840465545654, "logits/rejected": -2.7252132892608643, "logps/chosen": -50.12438201904297, "logps/rejected": -120.81317138671875, "loss": 0.5304, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6083054542541504, "rewards/margins": 4.49307918548584, "rewards/rejected": -3.8847742080688477, "step": 279 }, { "epoch": 0.8618699499807618, "grad_norm": 4.459984302520752, "learning_rate": 9.138888888888888e-07, "logits/chosen": -2.662658452987671, "logits/rejected": -2.6300771236419678, "logps/chosen": -50.2463264465332, "logps/rejected": -118.06233978271484, "loss": 0.5649, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6665768623352051, "rewards/margins": 4.132020473480225, "rewards/rejected": -3.4654433727264404, "step": 280 }, { "epoch": 0.8649480569449788, "grad_norm": 2.9463999271392822, "learning_rate": 9.135802469135801e-07, "logits/chosen": -2.6409459114074707, "logits/rejected": -2.519758939743042, "logps/chosen": -52.72004318237305, "logps/rejected": -104.6892318725586, "loss": 0.6493, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6716410517692566, "rewards/margins": 3.2777175903320312, "rewards/rejected": -2.60607647895813, "step": 281 }, { "epoch": 0.8680261639091958, "grad_norm": 3.9166276454925537, "learning_rate": 9.132716049382717e-07, "logits/chosen": -2.592158317565918, "logits/rejected": -2.649557113647461, "logps/chosen": -42.475914001464844, "logps/rejected": -103.35853576660156, "loss": 0.5295, "rewards/accuracies": 0.96875, "rewards/chosen": 1.509856104850769, "rewards/margins": 4.044781684875488, "rewards/rejected": -2.534925699234009, "step": 282 }, { "epoch": 0.8711042708734128, "grad_norm": 3.301525831222534, "learning_rate": 9.12962962962963e-07, "logits/chosen": -2.713449239730835, "logits/rejected": -2.626551866531372, "logps/chosen": -52.89474105834961, "logps/rejected": -108.40576171875, "loss": 0.5842, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7588895559310913, "rewards/margins": 3.536956310272217, "rewards/rejected": -2.778066635131836, "step": 283 }, { "epoch": 0.8741823778376299, "grad_norm": 4.576640605926514, "learning_rate": 9.126543209876542e-07, "logits/chosen": -2.68461012840271, "logits/rejected": -2.7133474349975586, "logps/chosen": -43.836700439453125, "logps/rejected": -114.19890594482422, "loss": 0.5427, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2264739274978638, "rewards/margins": 4.604991436004639, "rewards/rejected": -3.3785176277160645, "step": 284 }, { "epoch": 0.8772604848018468, "grad_norm": 4.1799092292785645, "learning_rate": 9.123456790123456e-07, "logits/chosen": -2.7501914501190186, "logits/rejected": -2.700298547744751, "logps/chosen": -56.2108154296875, "logps/rejected": -107.11463165283203, "loss": 0.6073, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4108406603336334, "rewards/margins": 3.428520917892456, "rewards/rejected": -3.0176806449890137, "step": 285 }, { "epoch": 0.8803385917660639, "grad_norm": 3.686326026916504, "learning_rate": 9.12037037037037e-07, "logits/chosen": -2.709312677383423, "logits/rejected": -2.684494972229004, "logps/chosen": -52.485679626464844, "logps/rejected": -112.1817855834961, "loss": 0.6567, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7174609899520874, "rewards/margins": 3.4807491302490234, "rewards/rejected": -2.7632877826690674, "step": 286 }, { "epoch": 0.8834166987302808, "grad_norm": 2.955463171005249, "learning_rate": 9.117283950617283e-07, "logits/chosen": -2.626614570617676, "logits/rejected": -2.6589488983154297, "logps/chosen": -48.58728790283203, "logps/rejected": -115.83495330810547, "loss": 0.459, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9966035485267639, "rewards/margins": 4.5555644035339355, "rewards/rejected": -3.5589609146118164, "step": 287 }, { "epoch": 0.8864948056944979, "grad_norm": 3.4917564392089844, "learning_rate": 9.114197530864197e-07, "logits/chosen": -2.679860830307007, "logits/rejected": -2.650186538696289, "logps/chosen": -41.65509033203125, "logps/rejected": -78.83633422851562, "loss": 0.6655, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4008560180664062, "rewards/margins": 2.4616973400115967, "rewards/rejected": -1.0608413219451904, "step": 288 }, { "epoch": 0.8895729126587149, "grad_norm": 4.436812877655029, "learning_rate": 9.11111111111111e-07, "logits/chosen": -2.5696001052856445, "logits/rejected": -2.603553056716919, "logps/chosen": -42.00364303588867, "logps/rejected": -101.50830841064453, "loss": 0.5512, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2449259757995605, "rewards/margins": 3.8347628116607666, "rewards/rejected": -2.589836835861206, "step": 289 }, { "epoch": 0.8926510196229319, "grad_norm": 4.79538106918335, "learning_rate": 9.108024691358023e-07, "logits/chosen": -2.6288750171661377, "logits/rejected": -2.6468093395233154, "logps/chosen": -38.405433654785156, "logps/rejected": -118.0263671875, "loss": 0.42, "rewards/accuracies": 1.0, "rewards/chosen": 1.8366925716400146, "rewards/margins": 5.111565589904785, "rewards/rejected": -3.2748730182647705, "step": 290 }, { "epoch": 0.8957291265871489, "grad_norm": 3.049189329147339, "learning_rate": 9.104938271604939e-07, "logits/chosen": -2.624251365661621, "logits/rejected": -2.640745162963867, "logps/chosen": -60.32903289794922, "logps/rejected": -123.60403442382812, "loss": 0.6409, "rewards/accuracies": 0.90625, "rewards/chosen": 0.11675992608070374, "rewards/margins": 3.9239351749420166, "rewards/rejected": -3.8071751594543457, "step": 291 }, { "epoch": 0.8988072335513659, "grad_norm": 5.215387344360352, "learning_rate": 9.101851851851851e-07, "logits/chosen": -2.627676486968994, "logits/rejected": -2.643030881881714, "logps/chosen": -51.14905548095703, "logps/rejected": -95.04225158691406, "loss": 0.7289, "rewards/accuracies": 0.90625, "rewards/chosen": 0.842557966709137, "rewards/margins": 2.7004787921905518, "rewards/rejected": -1.8579206466674805, "step": 292 }, { "epoch": 0.9018853405155829, "grad_norm": 3.6167376041412354, "learning_rate": 9.098765432098766e-07, "logits/chosen": -2.686946153640747, "logits/rejected": -2.664135217666626, "logps/chosen": -59.158634185791016, "logps/rejected": -118.00509643554688, "loss": 0.5931, "rewards/accuracies": 0.96875, "rewards/chosen": 0.27408891916275024, "rewards/margins": 3.934382677078247, "rewards/rejected": -3.6602935791015625, "step": 293 }, { "epoch": 0.9049634474798, "grad_norm": 4.194294452667236, "learning_rate": 9.095679012345678e-07, "logits/chosen": -2.710738182067871, "logits/rejected": -2.6961846351623535, "logps/chosen": -52.50547790527344, "logps/rejected": -114.26297760009766, "loss": 0.5913, "rewards/accuracies": 0.875, "rewards/chosen": 0.6981056928634644, "rewards/margins": 4.123798370361328, "rewards/rejected": -3.4256930351257324, "step": 294 }, { "epoch": 0.9080415544440169, "grad_norm": 3.9154460430145264, "learning_rate": 9.092592592592591e-07, "logits/chosen": -2.5912232398986816, "logits/rejected": -2.6137654781341553, "logps/chosen": -51.5435791015625, "logps/rejected": -125.25563049316406, "loss": 0.4728, "rewards/accuracies": 1.0, "rewards/chosen": 0.8556053638458252, "rewards/margins": 4.839925765991211, "rewards/rejected": -3.9843201637268066, "step": 295 }, { "epoch": 0.911119661408234, "grad_norm": 3.3704211711883545, "learning_rate": 9.089506172839507e-07, "logits/chosen": -2.6644251346588135, "logits/rejected": -2.694603681564331, "logps/chosen": -47.71076965332031, "logps/rejected": -105.44503784179688, "loss": 0.5664, "rewards/accuracies": 0.875, "rewards/chosen": 1.280562400817871, "rewards/margins": 3.9188804626464844, "rewards/rejected": -2.638317584991455, "step": 296 }, { "epoch": 0.9141977683724509, "grad_norm": 3.150693655014038, "learning_rate": 9.08641975308642e-07, "logits/chosen": -2.6575980186462402, "logits/rejected": -2.638939380645752, "logps/chosen": -35.28346633911133, "logps/rejected": -96.69977569580078, "loss": 0.5942, "rewards/accuracies": 0.875, "rewards/chosen": 1.9772037267684937, "rewards/margins": 4.0758819580078125, "rewards/rejected": -2.0986781120300293, "step": 297 }, { "epoch": 0.917275875336668, "grad_norm": 3.9854462146759033, "learning_rate": 9.083333333333332e-07, "logits/chosen": -2.656719207763672, "logits/rejected": -2.7101457118988037, "logps/chosen": -52.4990119934082, "logps/rejected": -118.94621276855469, "loss": 0.5573, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0982532501220703, "rewards/margins": 4.56098747253418, "rewards/rejected": -3.4627342224121094, "step": 298 }, { "epoch": 0.9203539823008849, "grad_norm": 3.722395181655884, "learning_rate": 9.080246913580247e-07, "logits/chosen": -2.6429102420806885, "logits/rejected": -2.705986976623535, "logps/chosen": -41.44295120239258, "logps/rejected": -100.18450164794922, "loss": 0.4942, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5801758766174316, "rewards/margins": 4.077601432800293, "rewards/rejected": -2.4974260330200195, "step": 299 }, { "epoch": 0.923432089265102, "grad_norm": 3.769412040710449, "learning_rate": 9.07716049382716e-07, "logits/chosen": -2.689605712890625, "logits/rejected": -2.650691509246826, "logps/chosen": -41.68489456176758, "logps/rejected": -104.90951538085938, "loss": 0.5683, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6875312328338623, "rewards/margins": 4.009076118469238, "rewards/rejected": -2.321545124053955, "step": 300 }, { "epoch": 0.926510196229319, "grad_norm": 3.5756216049194336, "learning_rate": 9.074074074074074e-07, "logits/chosen": -2.6158666610717773, "logits/rejected": -2.6512646675109863, "logps/chosen": -40.90281677246094, "logps/rejected": -113.00299072265625, "loss": 0.4583, "rewards/accuracies": 0.96875, "rewards/chosen": 1.836726427078247, "rewards/margins": 5.115339279174805, "rewards/rejected": -3.2786126136779785, "step": 301 }, { "epoch": 0.929588303193536, "grad_norm": 4.000990390777588, "learning_rate": 9.070987654320988e-07, "logits/chosen": -2.701000928878784, "logits/rejected": -2.653493881225586, "logps/chosen": -50.9619026184082, "logps/rejected": -101.31248474121094, "loss": 0.6021, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0367242097854614, "rewards/margins": 3.495206594467163, "rewards/rejected": -2.458482265472412, "step": 302 }, { "epoch": 0.932666410157753, "grad_norm": 5.14107084274292, "learning_rate": 9.0679012345679e-07, "logits/chosen": -2.63743257522583, "logits/rejected": -2.657294273376465, "logps/chosen": -34.704444885253906, "logps/rejected": -98.37327575683594, "loss": 0.527, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8218048810958862, "rewards/margins": 4.2754225730896, "rewards/rejected": -2.453617572784424, "step": 303 }, { "epoch": 0.93574451712197, "grad_norm": 5.214193344116211, "learning_rate": 9.064814814814815e-07, "logits/chosen": -2.7314817905426025, "logits/rejected": -2.662621021270752, "logps/chosen": -49.984066009521484, "logps/rejected": -117.05342102050781, "loss": 0.5202, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0210986137390137, "rewards/margins": 4.420319557189941, "rewards/rejected": -3.3992204666137695, "step": 304 }, { "epoch": 0.938822624086187, "grad_norm": 4.092897415161133, "learning_rate": 9.061728395061729e-07, "logits/chosen": -2.721301555633545, "logits/rejected": -2.7135043144226074, "logps/chosen": -39.488582611083984, "logps/rejected": -98.36294555664062, "loss": 0.5399, "rewards/accuracies": 1.0, "rewards/chosen": 1.548521876335144, "rewards/margins": 3.9371182918548584, "rewards/rejected": -2.388596773147583, "step": 305 }, { "epoch": 0.941900731050404, "grad_norm": 2.8785624504089355, "learning_rate": 9.058641975308642e-07, "logits/chosen": -2.738095760345459, "logits/rejected": -2.7224507331848145, "logps/chosen": -57.32669448852539, "logps/rejected": -107.10794830322266, "loss": 0.6525, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5964069366455078, "rewards/margins": 3.497321128845215, "rewards/rejected": -2.900913715362549, "step": 306 }, { "epoch": 0.944978838014621, "grad_norm": 3.3993546962738037, "learning_rate": 9.055555555555556e-07, "logits/chosen": -2.6394052505493164, "logits/rejected": -2.5717062950134277, "logps/chosen": -46.67266845703125, "logps/rejected": -94.64692687988281, "loss": 0.647, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2342429161071777, "rewards/margins": 3.034646511077881, "rewards/rejected": -1.8004037141799927, "step": 307 }, { "epoch": 0.948056944978838, "grad_norm": 3.461409568786621, "learning_rate": 9.052469135802469e-07, "logits/chosen": -2.650928258895874, "logits/rejected": -2.6736574172973633, "logps/chosen": -53.135650634765625, "logps/rejected": -126.42446899414062, "loss": 0.5349, "rewards/accuracies": 0.96875, "rewards/chosen": 0.47532689571380615, "rewards/margins": 4.490276336669922, "rewards/rejected": -4.014949798583984, "step": 308 }, { "epoch": 0.951135051943055, "grad_norm": 3.3775217533111572, "learning_rate": 9.049382716049381e-07, "logits/chosen": -2.658357858657837, "logits/rejected": -2.6257119178771973, "logps/chosen": -49.84489440917969, "logps/rejected": -98.0178451538086, "loss": 0.6623, "rewards/accuracies": 0.96875, "rewards/chosen": 1.137123465538025, "rewards/margins": 3.2096307277679443, "rewards/rejected": -2.072507381439209, "step": 309 }, { "epoch": 0.954213158907272, "grad_norm": 3.783550262451172, "learning_rate": 9.046296296296297e-07, "logits/chosen": -2.7423033714294434, "logits/rejected": -2.8023040294647217, "logps/chosen": -46.81342697143555, "logps/rejected": -101.11741638183594, "loss": 0.5487, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3937184810638428, "rewards/margins": 4.091033458709717, "rewards/rejected": -2.697314739227295, "step": 310 }, { "epoch": 0.957291265871489, "grad_norm": 3.5796148777008057, "learning_rate": 9.04320987654321e-07, "logits/chosen": -2.65371036529541, "logits/rejected": -2.6603519916534424, "logps/chosen": -43.74306106567383, "logps/rejected": -112.42642974853516, "loss": 0.4798, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7734375, "rewards/margins": 4.856256008148193, "rewards/rejected": -3.0828189849853516, "step": 311 }, { "epoch": 0.960369372835706, "grad_norm": 4.41948127746582, "learning_rate": 9.040123456790123e-07, "logits/chosen": -2.816708564758301, "logits/rejected": -2.7694406509399414, "logps/chosen": -60.96818542480469, "logps/rejected": -112.64524841308594, "loss": 0.6634, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0720091462135315, "rewards/margins": 3.408776044845581, "rewards/rejected": -3.336766481399536, "step": 312 }, { "epoch": 0.9634474797999231, "grad_norm": 3.9719090461730957, "learning_rate": 9.037037037037037e-07, "logits/chosen": -2.7623097896575928, "logits/rejected": -2.673126459121704, "logps/chosen": -39.66309356689453, "logps/rejected": -97.40425109863281, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 1.5031688213348389, "rewards/margins": 3.845468521118164, "rewards/rejected": -2.342299461364746, "step": 313 }, { "epoch": 0.96652558676414, "grad_norm": 3.668475389480591, "learning_rate": 9.03395061728395e-07, "logits/chosen": -2.7139832973480225, "logits/rejected": -2.6366591453552246, "logps/chosen": -58.019371032714844, "logps/rejected": -110.25265502929688, "loss": 0.5665, "rewards/accuracies": 0.875, "rewards/chosen": 0.36129292845726013, "rewards/margins": 3.425837755203247, "rewards/rejected": -3.064544677734375, "step": 314 }, { "epoch": 0.9696036937283571, "grad_norm": 3.1930770874023438, "learning_rate": 9.030864197530865e-07, "logits/chosen": -2.6773509979248047, "logits/rejected": -2.6592249870300293, "logps/chosen": -43.46270751953125, "logps/rejected": -97.58978271484375, "loss": 0.6478, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4468774795532227, "rewards/margins": 3.629112482070923, "rewards/rejected": -2.182234764099121, "step": 315 }, { "epoch": 0.972681800692574, "grad_norm": 4.014804840087891, "learning_rate": 9.027777777777778e-07, "logits/chosen": -2.719963550567627, "logits/rejected": -2.712210178375244, "logps/chosen": -44.64863967895508, "logps/rejected": -107.87019348144531, "loss": 0.5081, "rewards/accuracies": 0.96875, "rewards/chosen": 1.592881441116333, "rewards/margins": 4.594295978546143, "rewards/rejected": -3.0014142990112305, "step": 316 }, { "epoch": 0.9757599076567911, "grad_norm": 3.463143825531006, "learning_rate": 9.024691358024691e-07, "logits/chosen": -2.719482183456421, "logits/rejected": -2.6756792068481445, "logps/chosen": -43.86948776245117, "logps/rejected": -91.16435241699219, "loss": 0.6453, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5642277002334595, "rewards/margins": 3.241011142730713, "rewards/rejected": -1.6767833232879639, "step": 317 }, { "epoch": 0.9788380146210081, "grad_norm": 3.5040504932403564, "learning_rate": 9.021604938271605e-07, "logits/chosen": -2.7102527618408203, "logits/rejected": -2.650912046432495, "logps/chosen": -52.117881774902344, "logps/rejected": -114.46489715576172, "loss": 0.5329, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8370294570922852, "rewards/margins": 4.305965423583984, "rewards/rejected": -3.46893572807312, "step": 318 }, { "epoch": 0.9819161215852251, "grad_norm": 3.0398805141448975, "learning_rate": 9.018518518518519e-07, "logits/chosen": -2.8296122550964355, "logits/rejected": -2.7144885063171387, "logps/chosen": -37.86615753173828, "logps/rejected": -114.54821014404297, "loss": 0.476, "rewards/accuracies": 0.84375, "rewards/chosen": 1.594226598739624, "rewards/margins": 4.870185852050781, "rewards/rejected": -3.275958776473999, "step": 319 }, { "epoch": 0.9849942285494421, "grad_norm": 3.2945077419281006, "learning_rate": 9.015432098765432e-07, "logits/chosen": -2.748678207397461, "logits/rejected": -2.726768970489502, "logps/chosen": -54.14866256713867, "logps/rejected": -99.33917999267578, "loss": 0.6569, "rewards/accuracies": 0.875, "rewards/chosen": 0.9254288077354431, "rewards/margins": 3.2087087631225586, "rewards/rejected": -2.28328013420105, "step": 320 }, { "epoch": 0.9880723355136591, "grad_norm": 3.3935093879699707, "learning_rate": 9.012345679012346e-07, "logits/chosen": -2.5685489177703857, "logits/rejected": -2.5882632732391357, "logps/chosen": -50.33378601074219, "logps/rejected": -115.06824493408203, "loss": 0.5796, "rewards/accuracies": 0.875, "rewards/chosen": 1.2011758089065552, "rewards/margins": 4.230671405792236, "rewards/rejected": -3.0294954776763916, "step": 321 }, { "epoch": 0.9911504424778761, "grad_norm": 3.8573336601257324, "learning_rate": 9.009259259259259e-07, "logits/chosen": -2.7004692554473877, "logits/rejected": -2.6781225204467773, "logps/chosen": -51.67336654663086, "logps/rejected": -113.28419494628906, "loss": 0.5481, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8802011013031006, "rewards/margins": 4.014837741851807, "rewards/rejected": -3.134636878967285, "step": 322 }, { "epoch": 0.9942285494420932, "grad_norm": 3.051135301589966, "learning_rate": 9.006172839506172e-07, "logits/chosen": -2.654686450958252, "logits/rejected": -2.6806235313415527, "logps/chosen": -34.10821533203125, "logps/rejected": -106.44022369384766, "loss": 0.4743, "rewards/accuracies": 1.0, "rewards/chosen": 1.7857944965362549, "rewards/margins": 4.741148948669434, "rewards/rejected": -2.9553544521331787, "step": 323 }, { "epoch": 0.9973066564063101, "grad_norm": 3.4334757328033447, "learning_rate": 9.003086419753087e-07, "logits/chosen": -2.7081947326660156, "logits/rejected": -2.6696572303771973, "logps/chosen": -49.518165588378906, "logps/rejected": -103.13909912109375, "loss": 0.6144, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8965988755226135, "rewards/margins": 3.5096864700317383, "rewards/rejected": -2.6130874156951904, "step": 324 }, { "epoch": 1.003078106964217, "grad_norm": 5.218608379364014, "learning_rate": 9e-07, "logits/chosen": -2.6044321060180664, "logits/rejected": -2.623185873031616, "logps/chosen": -44.983219146728516, "logps/rejected": -103.23857116699219, "loss": 0.9355, "rewards/accuracies": 0.9629629850387573, "rewards/chosen": 1.3472987413406372, "rewards/margins": 4.1769700050354, "rewards/rejected": -2.8296709060668945, "step": 325 }, { "epoch": 1.003078106964217, "eval_logits/chosen": -2.6485178470611572, "eval_logits/rejected": -2.7063655853271484, "eval_logps/chosen": -45.910789489746094, "eval_logps/rejected": -108.13349914550781, "eval_loss": 0.5891013145446777, "eval_rewards/accuracies": 0.913294792175293, "eval_rewards/chosen": 1.2363004684448242, "eval_rewards/margins": 4.033067226409912, "eval_rewards/rejected": -2.796766757965088, "eval_runtime": 635.1396, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.272, "step": 325 }, { "epoch": 1.006156213928434, "grad_norm": 5.323425769805908, "learning_rate": 8.996913580246914e-07, "logits/chosen": -2.730741024017334, "logits/rejected": -2.6199841499328613, "logps/chosen": -68.0400619506836, "logps/rejected": -132.04075622558594, "loss": 0.6809, "rewards/accuracies": 0.875, "rewards/chosen": -0.5178602933883667, "rewards/margins": 3.9500532150268555, "rewards/rejected": -4.46791410446167, "step": 326 }, { "epoch": 1.009234320892651, "grad_norm": 3.092712640762329, "learning_rate": 8.993827160493827e-07, "logits/chosen": -2.788191318511963, "logits/rejected": -2.7816405296325684, "logps/chosen": -51.94953155517578, "logps/rejected": -96.34440612792969, "loss": 0.6778, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8126041889190674, "rewards/margins": 3.0640974044799805, "rewards/rejected": -2.251492977142334, "step": 327 }, { "epoch": 1.012312427856868, "grad_norm": 3.9435622692108154, "learning_rate": 8.99074074074074e-07, "logits/chosen": -2.692615032196045, "logits/rejected": -2.7255125045776367, "logps/chosen": -51.513328552246094, "logps/rejected": -108.37971496582031, "loss": 0.6172, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8339427709579468, "rewards/margins": 3.829017162322998, "rewards/rejected": -2.995074510574341, "step": 328 }, { "epoch": 1.015390534821085, "grad_norm": 3.6982107162475586, "learning_rate": 8.987654320987655e-07, "logits/chosen": -2.7197582721710205, "logits/rejected": -2.7498795986175537, "logps/chosen": -53.13407897949219, "logps/rejected": -113.52041625976562, "loss": 0.5911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5907331705093384, "rewards/margins": 3.7252910137176514, "rewards/rejected": -3.1345579624176025, "step": 329 }, { "epoch": 1.018468641785302, "grad_norm": 3.055035352706909, "learning_rate": 8.984567901234568e-07, "logits/chosen": -2.8062970638275146, "logits/rejected": -2.7185568809509277, "logps/chosen": -53.70412063598633, "logps/rejected": -116.15768432617188, "loss": 0.5423, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5908687114715576, "rewards/margins": 4.133205890655518, "rewards/rejected": -3.54233717918396, "step": 330 }, { "epoch": 1.0215467487495191, "grad_norm": 4.8526530265808105, "learning_rate": 8.981481481481481e-07, "logits/chosen": -2.757310152053833, "logits/rejected": -2.7566561698913574, "logps/chosen": -38.05644226074219, "logps/rejected": -87.90837860107422, "loss": 0.5605, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8818612098693848, "rewards/margins": 3.58345103263855, "rewards/rejected": -1.701589822769165, "step": 331 }, { "epoch": 1.024624855713736, "grad_norm": 3.4363696575164795, "learning_rate": 8.978395061728395e-07, "logits/chosen": -2.718587636947632, "logits/rejected": -2.6956708431243896, "logps/chosen": -39.79774475097656, "logps/rejected": -113.75038146972656, "loss": 0.4635, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5058660507202148, "rewards/margins": 4.811707019805908, "rewards/rejected": -3.3058412075042725, "step": 332 }, { "epoch": 1.027702962677953, "grad_norm": 3.3787589073181152, "learning_rate": 8.975308641975308e-07, "logits/chosen": -2.7535200119018555, "logits/rejected": -2.6979761123657227, "logps/chosen": -33.02869415283203, "logps/rejected": -101.9928970336914, "loss": 0.4714, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2571351528167725, "rewards/margins": 4.852896213531494, "rewards/rejected": -2.595761299133301, "step": 333 }, { "epoch": 1.03078106964217, "grad_norm": 4.058920383453369, "learning_rate": 8.972222222222222e-07, "logits/chosen": -2.746760368347168, "logits/rejected": -2.803250312805176, "logps/chosen": -50.467559814453125, "logps/rejected": -105.32086944580078, "loss": 0.5993, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0643501281738281, "rewards/margins": 3.9025487899780273, "rewards/rejected": -2.838198661804199, "step": 334 }, { "epoch": 1.0338591766063872, "grad_norm": 4.307451248168945, "learning_rate": 8.969135802469136e-07, "logits/chosen": -2.737602949142456, "logits/rejected": -2.739133596420288, "logps/chosen": -36.632049560546875, "logps/rejected": -91.28084564208984, "loss": 0.5464, "rewards/accuracies": 0.90625, "rewards/chosen": 2.002199172973633, "rewards/margins": 3.991565704345703, "rewards/rejected": -1.9893665313720703, "step": 335 }, { "epoch": 1.036937283570604, "grad_norm": 3.9597299098968506, "learning_rate": 8.966049382716049e-07, "logits/chosen": -2.772561550140381, "logits/rejected": -2.7493650913238525, "logps/chosen": -43.31679153442383, "logps/rejected": -110.99324035644531, "loss": 0.5021, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4030365943908691, "rewards/margins": 4.611698627471924, "rewards/rejected": -3.2086620330810547, "step": 336 }, { "epoch": 1.040015390534821, "grad_norm": 4.034379482269287, "learning_rate": 8.962962962962963e-07, "logits/chosen": -2.796607494354248, "logits/rejected": -2.699756145477295, "logps/chosen": -44.677791595458984, "logps/rejected": -104.65818786621094, "loss": 0.5662, "rewards/accuracies": 0.90625, "rewards/chosen": 1.436598777770996, "rewards/margins": 3.977604389190674, "rewards/rejected": -2.5410051345825195, "step": 337 }, { "epoch": 1.043093497499038, "grad_norm": 3.3386878967285156, "learning_rate": 8.959876543209877e-07, "logits/chosen": -2.7629008293151855, "logits/rejected": -2.772430419921875, "logps/chosen": -55.692481994628906, "logps/rejected": -110.84430694580078, "loss": 0.65, "rewards/accuracies": 1.0, "rewards/chosen": 0.31896379590034485, "rewards/margins": 3.7604920864105225, "rewards/rejected": -3.441528081893921, "step": 338 }, { "epoch": 1.0461716044632552, "grad_norm": 4.1624250411987305, "learning_rate": 8.95679012345679e-07, "logits/chosen": -2.730869770050049, "logits/rejected": -2.6672661304473877, "logps/chosen": -49.42149353027344, "logps/rejected": -106.19185638427734, "loss": 0.6334, "rewards/accuracies": 0.875, "rewards/chosen": 0.9901663661003113, "rewards/margins": 3.8257341384887695, "rewards/rejected": -2.8355674743652344, "step": 339 }, { "epoch": 1.0492497114274721, "grad_norm": 3.950373888015747, "learning_rate": 8.953703703703704e-07, "logits/chosen": -2.6917495727539062, "logits/rejected": -2.710750102996826, "logps/chosen": -39.928436279296875, "logps/rejected": -96.36076354980469, "loss": 0.5276, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7990548610687256, "rewards/margins": 4.067576885223389, "rewards/rejected": -2.268522024154663, "step": 340 }, { "epoch": 1.052327818391689, "grad_norm": 4.322996616363525, "learning_rate": 8.950617283950617e-07, "logits/chosen": -2.8124618530273438, "logits/rejected": -2.7355165481567383, "logps/chosen": -46.620094299316406, "logps/rejected": -115.33992004394531, "loss": 0.5505, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5180715322494507, "rewards/margins": 4.63954496383667, "rewards/rejected": -3.1214733123779297, "step": 341 }, { "epoch": 1.055405925355906, "grad_norm": 4.276078224182129, "learning_rate": 8.94753086419753e-07, "logits/chosen": -2.744480848312378, "logits/rejected": -2.814993143081665, "logps/chosen": -41.58894729614258, "logps/rejected": -104.35924530029297, "loss": 0.5319, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5145652294158936, "rewards/margins": 4.39456033706665, "rewards/rejected": -2.879995346069336, "step": 342 }, { "epoch": 1.0584840323201232, "grad_norm": 3.9617340564727783, "learning_rate": 8.944444444444445e-07, "logits/chosen": -2.7602765560150146, "logits/rejected": -2.760115146636963, "logps/chosen": -44.58925247192383, "logps/rejected": -114.80441284179688, "loss": 0.4743, "rewards/accuracies": 1.0, "rewards/chosen": 1.637721061706543, "rewards/margins": 5.183080673217773, "rewards/rejected": -3.5453598499298096, "step": 343 }, { "epoch": 1.0615621392843402, "grad_norm": 4.640761852264404, "learning_rate": 8.941358024691358e-07, "logits/chosen": -2.7944083213806152, "logits/rejected": -2.7681732177734375, "logps/chosen": -43.136810302734375, "logps/rejected": -104.14981079101562, "loss": 0.5038, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9264949560165405, "rewards/margins": 4.717092037200928, "rewards/rejected": -2.7905972003936768, "step": 344 }, { "epoch": 1.0646402462485571, "grad_norm": 3.6528191566467285, "learning_rate": 8.938271604938271e-07, "logits/chosen": -2.7337937355041504, "logits/rejected": -2.675611734390259, "logps/chosen": -55.465694427490234, "logps/rejected": -107.46930694580078, "loss": 0.6138, "rewards/accuracies": 0.875, "rewards/chosen": 0.5346077084541321, "rewards/margins": 3.943742036819458, "rewards/rejected": -3.4091341495513916, "step": 345 }, { "epoch": 1.067718353212774, "grad_norm": 5.650940418243408, "learning_rate": 8.935185185185185e-07, "logits/chosen": -2.5876729488372803, "logits/rejected": -2.716843605041504, "logps/chosen": -35.84689712524414, "logps/rejected": -102.65965270996094, "loss": 0.5036, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9604861736297607, "rewards/margins": 4.581793308258057, "rewards/rejected": -2.621306896209717, "step": 346 }, { "epoch": 1.0707964601769913, "grad_norm": 5.115732669830322, "learning_rate": 8.932098765432098e-07, "logits/chosen": -2.672154664993286, "logits/rejected": -2.675381898880005, "logps/chosen": -51.32032775878906, "logps/rejected": -124.57972717285156, "loss": 0.5723, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9136536717414856, "rewards/margins": 4.818439483642578, "rewards/rejected": -3.90478515625, "step": 347 }, { "epoch": 1.0738745671412082, "grad_norm": 4.012619495391846, "learning_rate": 8.929012345679013e-07, "logits/chosen": -2.7187044620513916, "logits/rejected": -2.7826719284057617, "logps/chosen": -38.30029296875, "logps/rejected": -114.68107604980469, "loss": 0.4185, "rewards/accuracies": 1.0, "rewards/chosen": 1.6602286100387573, "rewards/margins": 5.280893802642822, "rewards/rejected": -3.620664596557617, "step": 348 }, { "epoch": 1.0769526741054252, "grad_norm": 3.721613645553589, "learning_rate": 8.925925925925926e-07, "logits/chosen": -2.6544766426086426, "logits/rejected": -2.6592884063720703, "logps/chosen": -39.728797912597656, "logps/rejected": -129.98170471191406, "loss": 0.4365, "rewards/accuracies": 1.0, "rewards/chosen": 1.6159262657165527, "rewards/margins": 5.98516321182251, "rewards/rejected": -4.369236946105957, "step": 349 }, { "epoch": 1.0800307810696421, "grad_norm": 3.5033891201019287, "learning_rate": 8.922839506172839e-07, "logits/chosen": -2.690636157989502, "logits/rejected": -2.7193212509155273, "logps/chosen": -53.086524963378906, "logps/rejected": -111.57524108886719, "loss": 0.6097, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7947431802749634, "rewards/margins": 3.9574410915374756, "rewards/rejected": -3.1626977920532227, "step": 350 }, { "epoch": 1.083108888033859, "grad_norm": 3.298316717147827, "learning_rate": 8.919753086419753e-07, "logits/chosen": -2.7421886920928955, "logits/rejected": -2.782541513442993, "logps/chosen": -47.190895080566406, "logps/rejected": -122.34271240234375, "loss": 0.4847, "rewards/accuracies": 0.90625, "rewards/chosen": 1.403032660484314, "rewards/margins": 5.068779945373535, "rewards/rejected": -3.6657474040985107, "step": 351 }, { "epoch": 1.0861869949980762, "grad_norm": 3.774052143096924, "learning_rate": 8.916666666666667e-07, "logits/chosen": -2.664945125579834, "logits/rejected": -2.712296485900879, "logps/chosen": -49.38701629638672, "logps/rejected": -129.40866088867188, "loss": 0.4939, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0856178998947144, "rewards/margins": 5.513049602508545, "rewards/rejected": -4.427431583404541, "step": 352 }, { "epoch": 1.0892651019622932, "grad_norm": 3.543613910675049, "learning_rate": 8.91358024691358e-07, "logits/chosen": -2.716691493988037, "logits/rejected": -2.7255849838256836, "logps/chosen": -45.59185791015625, "logps/rejected": -107.37541198730469, "loss": 0.5385, "rewards/accuracies": 0.9375, "rewards/chosen": 1.183633804321289, "rewards/margins": 4.172430992126465, "rewards/rejected": -2.9887964725494385, "step": 353 }, { "epoch": 1.0923432089265102, "grad_norm": 3.6768486499786377, "learning_rate": 8.910493827160494e-07, "logits/chosen": -2.777639627456665, "logits/rejected": -2.7530343532562256, "logps/chosen": -44.169254302978516, "logps/rejected": -101.0901107788086, "loss": 0.5878, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5162932872772217, "rewards/margins": 4.112740516662598, "rewards/rejected": -2.596447229385376, "step": 354 }, { "epoch": 1.095421315890727, "grad_norm": 3.598644256591797, "learning_rate": 8.907407407407407e-07, "logits/chosen": -2.6808905601501465, "logits/rejected": -2.6884753704071045, "logps/chosen": -42.19908142089844, "logps/rejected": -90.96024322509766, "loss": 0.6947, "rewards/accuracies": 0.90625, "rewards/chosen": 1.304225206375122, "rewards/margins": 3.3126559257507324, "rewards/rejected": -2.0084309577941895, "step": 355 }, { "epoch": 1.0984994228549443, "grad_norm": 3.3739144802093506, "learning_rate": 8.90432098765432e-07, "logits/chosen": -2.6795921325683594, "logits/rejected": -2.720015048980713, "logps/chosen": -51.12667465209961, "logps/rejected": -105.48377990722656, "loss": 0.6276, "rewards/accuracies": 0.875, "rewards/chosen": 1.090147852897644, "rewards/margins": 3.9866435527801514, "rewards/rejected": -2.896495819091797, "step": 356 }, { "epoch": 1.1015775298191612, "grad_norm": 3.166325569152832, "learning_rate": 8.901234567901235e-07, "logits/chosen": -2.6555874347686768, "logits/rejected": -2.7267277240753174, "logps/chosen": -43.61629867553711, "logps/rejected": -108.19656372070312, "loss": 0.5389, "rewards/accuracies": 0.9375, "rewards/chosen": 1.310362696647644, "rewards/margins": 4.498234748840332, "rewards/rejected": -3.1878721714019775, "step": 357 }, { "epoch": 1.1046556367833782, "grad_norm": 2.6871891021728516, "learning_rate": 8.898148148148148e-07, "logits/chosen": -2.6536033153533936, "logits/rejected": -2.6485583782196045, "logps/chosen": -45.293121337890625, "logps/rejected": -124.67941284179688, "loss": 0.5445, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1603127717971802, "rewards/margins": 5.127030372619629, "rewards/rejected": -3.966717481613159, "step": 358 }, { "epoch": 1.1077337437475951, "grad_norm": 4.715346336364746, "learning_rate": 8.895061728395062e-07, "logits/chosen": -2.8443920612335205, "logits/rejected": -2.777366876602173, "logps/chosen": -35.18056106567383, "logps/rejected": -94.21670532226562, "loss": 0.5587, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7952032089233398, "rewards/margins": 4.008256912231445, "rewards/rejected": -2.2130537033081055, "step": 359 }, { "epoch": 1.1108118507118123, "grad_norm": 4.518795013427734, "learning_rate": 8.891975308641975e-07, "logits/chosen": -2.6534738540649414, "logits/rejected": -2.7456917762756348, "logps/chosen": -47.77797317504883, "logps/rejected": -109.61837005615234, "loss": 0.5982, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9982984066009521, "rewards/margins": 4.510091304779053, "rewards/rejected": -3.511793375015259, "step": 360 }, { "epoch": 1.1138899576760293, "grad_norm": 4.432154655456543, "learning_rate": 8.888888888888888e-07, "logits/chosen": -2.7547104358673096, "logits/rejected": -2.7364842891693115, "logps/chosen": -40.37964630126953, "logps/rejected": -116.76275634765625, "loss": 0.4892, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7340819835662842, "rewards/margins": 5.22028112411499, "rewards/rejected": -3.486198902130127, "step": 361 }, { "epoch": 1.1169680646402462, "grad_norm": 4.517007350921631, "learning_rate": 8.885802469135803e-07, "logits/chosen": -2.7734215259552, "logits/rejected": -2.6568241119384766, "logps/chosen": -45.93791198730469, "logps/rejected": -122.17041015625, "loss": 0.4578, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3174760341644287, "rewards/margins": 5.36116886138916, "rewards/rejected": -4.0436930656433105, "step": 362 }, { "epoch": 1.1200461716044632, "grad_norm": 4.214285373687744, "learning_rate": 8.882716049382716e-07, "logits/chosen": -2.693202257156372, "logits/rejected": -2.70584774017334, "logps/chosen": -50.75274658203125, "logps/rejected": -111.38531494140625, "loss": 0.5523, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8933073282241821, "rewards/margins": 4.237460136413574, "rewards/rejected": -3.3441526889801025, "step": 363 }, { "epoch": 1.1231242785686804, "grad_norm": 3.931877374649048, "learning_rate": 8.879629629629629e-07, "logits/chosen": -2.6300954818725586, "logits/rejected": -2.661055564880371, "logps/chosen": -44.262786865234375, "logps/rejected": -115.10452270507812, "loss": 0.5727, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2935853004455566, "rewards/margins": 4.632660388946533, "rewards/rejected": -3.3390753269195557, "step": 364 }, { "epoch": 1.1262023855328973, "grad_norm": 4.138123989105225, "learning_rate": 8.876543209876543e-07, "logits/chosen": -2.774716377258301, "logits/rejected": -2.818450450897217, "logps/chosen": -57.978145599365234, "logps/rejected": -126.81806182861328, "loss": 0.5712, "rewards/accuracies": 0.875, "rewards/chosen": 0.48086681962013245, "rewards/margins": 4.69692325592041, "rewards/rejected": -4.2160563468933105, "step": 365 }, { "epoch": 1.1292804924971143, "grad_norm": 3.649038076400757, "learning_rate": 8.873456790123457e-07, "logits/chosen": -2.704690933227539, "logits/rejected": -2.756229877471924, "logps/chosen": -41.930519104003906, "logps/rejected": -116.65167236328125, "loss": 0.4449, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7011501789093018, "rewards/margins": 5.368694305419922, "rewards/rejected": -3.6675446033477783, "step": 366 }, { "epoch": 1.1323585994613312, "grad_norm": 4.8762640953063965, "learning_rate": 8.870370370370371e-07, "logits/chosen": -2.7187936305999756, "logits/rejected": -2.736192226409912, "logps/chosen": -52.506309509277344, "logps/rejected": -122.34837341308594, "loss": 0.5358, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7146642208099365, "rewards/margins": 4.839643478393555, "rewards/rejected": -4.124979019165039, "step": 367 }, { "epoch": 1.1354367064255482, "grad_norm": 4.63283634185791, "learning_rate": 8.867283950617284e-07, "logits/chosen": -2.81618332862854, "logits/rejected": -2.7426939010620117, "logps/chosen": -42.573814392089844, "logps/rejected": -99.93598937988281, "loss": 0.6497, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2832177877426147, "rewards/margins": 3.6457626819610596, "rewards/rejected": -2.3625447750091553, "step": 368 }, { "epoch": 1.1385148133897653, "grad_norm": 3.4557621479034424, "learning_rate": 8.864197530864197e-07, "logits/chosen": -2.7813947200775146, "logits/rejected": -2.7738683223724365, "logps/chosen": -38.561439514160156, "logps/rejected": -103.76374053955078, "loss": 0.5285, "rewards/accuracies": 0.96875, "rewards/chosen": 1.709555745124817, "rewards/margins": 4.457563400268555, "rewards/rejected": -2.748007297515869, "step": 369 }, { "epoch": 1.1415929203539823, "grad_norm": 3.2802047729492188, "learning_rate": 8.861111111111111e-07, "logits/chosen": -2.780029773712158, "logits/rejected": -2.7310004234313965, "logps/chosen": -54.09687042236328, "logps/rejected": -117.84721374511719, "loss": 0.5624, "rewards/accuracies": 0.875, "rewards/chosen": 1.096437931060791, "rewards/margins": 4.676052570343018, "rewards/rejected": -3.5796144008636475, "step": 370 }, { "epoch": 1.1446710273181993, "grad_norm": 3.6363954544067383, "learning_rate": 8.858024691358025e-07, "logits/chosen": -2.745181083679199, "logits/rejected": -2.7590723037719727, "logps/chosen": -55.171939849853516, "logps/rejected": -105.5958251953125, "loss": 0.6337, "rewards/accuracies": 0.90625, "rewards/chosen": 0.39757293462753296, "rewards/margins": 3.4821395874023438, "rewards/rejected": -3.084566593170166, "step": 371 }, { "epoch": 1.1477491342824164, "grad_norm": 2.945056200027466, "learning_rate": 8.854938271604938e-07, "logits/chosen": -2.683396339416504, "logits/rejected": -2.7194385528564453, "logps/chosen": -43.507179260253906, "logps/rejected": -108.69664764404297, "loss": 0.5395, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3916597366333008, "rewards/margins": 4.472705841064453, "rewards/rejected": -3.0810461044311523, "step": 372 }, { "epoch": 1.1508272412466334, "grad_norm": 3.8618948459625244, "learning_rate": 8.851851851851852e-07, "logits/chosen": -2.7857112884521484, "logits/rejected": -2.762308120727539, "logps/chosen": -50.24229431152344, "logps/rejected": -114.96027374267578, "loss": 0.5925, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8880966901779175, "rewards/margins": 4.526221752166748, "rewards/rejected": -3.638125419616699, "step": 373 }, { "epoch": 1.1539053482108503, "grad_norm": 4.5215935707092285, "learning_rate": 8.848765432098765e-07, "logits/chosen": -2.6855087280273438, "logits/rejected": -2.7722558975219727, "logps/chosen": -41.296024322509766, "logps/rejected": -94.70060729980469, "loss": 0.5528, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7068345546722412, "rewards/margins": 4.066295146942139, "rewards/rejected": -2.3594605922698975, "step": 374 }, { "epoch": 1.1569834551750673, "grad_norm": 6.182140350341797, "learning_rate": 8.845679012345678e-07, "logits/chosen": -2.756608009338379, "logits/rejected": -2.7753639221191406, "logps/chosen": -40.47132873535156, "logps/rejected": -103.40464782714844, "loss": 0.5203, "rewards/accuracies": 0.875, "rewards/chosen": 1.8746219873428345, "rewards/margins": 4.592782020568848, "rewards/rejected": -2.718160629272461, "step": 375 }, { "epoch": 1.1600615621392842, "grad_norm": 4.682372570037842, "learning_rate": 8.842592592592593e-07, "logits/chosen": -2.901252269744873, "logits/rejected": -2.9137001037597656, "logps/chosen": -42.8128547668457, "logps/rejected": -115.0589370727539, "loss": 0.4889, "rewards/accuracies": 1.0, "rewards/chosen": 1.6384817361831665, "rewards/margins": 5.2621941566467285, "rewards/rejected": -3.6237120628356934, "step": 376 }, { "epoch": 1.1631396691035014, "grad_norm": 5.9883928298950195, "learning_rate": 8.839506172839506e-07, "logits/chosen": -2.715937614440918, "logits/rejected": -2.6734890937805176, "logps/chosen": -42.894203186035156, "logps/rejected": -92.49557495117188, "loss": 0.6376, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3967735767364502, "rewards/margins": 3.399667263031006, "rewards/rejected": -2.0028939247131348, "step": 377 }, { "epoch": 1.1662177760677184, "grad_norm": 4.44272518157959, "learning_rate": 8.83641975308642e-07, "logits/chosen": -2.718559741973877, "logits/rejected": -2.7365503311157227, "logps/chosen": -38.95957565307617, "logps/rejected": -87.49272155761719, "loss": 0.6436, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9435096979141235, "rewards/margins": 3.711543083190918, "rewards/rejected": -1.7680332660675049, "step": 378 }, { "epoch": 1.1692958830319353, "grad_norm": 4.544284343719482, "learning_rate": 8.833333333333333e-07, "logits/chosen": -2.7843191623687744, "logits/rejected": -2.6910276412963867, "logps/chosen": -42.57975387573242, "logps/rejected": -108.86529541015625, "loss": 0.5578, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3248802423477173, "rewards/margins": 4.419844627380371, "rewards/rejected": -3.0949645042419434, "step": 379 }, { "epoch": 1.1723739899961523, "grad_norm": 3.6255228519439697, "learning_rate": 8.830246913580246e-07, "logits/chosen": -2.7396669387817383, "logits/rejected": -2.8280060291290283, "logps/chosen": -42.64948272705078, "logps/rejected": -113.22039794921875, "loss": 0.5074, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4200334548950195, "rewards/margins": 5.149721145629883, "rewards/rejected": -3.729687452316284, "step": 380 }, { "epoch": 1.1754520969603695, "grad_norm": 4.395378589630127, "learning_rate": 8.827160493827161e-07, "logits/chosen": -2.7359161376953125, "logits/rejected": -2.7629754543304443, "logps/chosen": -45.00737380981445, "logps/rejected": -104.59160614013672, "loss": 0.477, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6893123388290405, "rewards/margins": 4.5435872077941895, "rewards/rejected": -2.8542747497558594, "step": 381 }, { "epoch": 1.1785302039245864, "grad_norm": 4.119265556335449, "learning_rate": 8.824074074074074e-07, "logits/chosen": -2.67144513130188, "logits/rejected": -2.6877222061157227, "logps/chosen": -38.05738830566406, "logps/rejected": -107.5048599243164, "loss": 0.4319, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8918123245239258, "rewards/margins": 4.9774556159973145, "rewards/rejected": -3.0856435298919678, "step": 382 }, { "epoch": 1.1816083108888034, "grad_norm": 3.9775023460388184, "learning_rate": 8.820987654320987e-07, "logits/chosen": -2.8012664318084717, "logits/rejected": -2.710698127746582, "logps/chosen": -50.45160675048828, "logps/rejected": -110.59567260742188, "loss": 0.5128, "rewards/accuracies": 0.9375, "rewards/chosen": 1.105674147605896, "rewards/margins": 4.215999603271484, "rewards/rejected": -3.110325336456299, "step": 383 }, { "epoch": 1.1846864178530203, "grad_norm": 4.087917804718018, "learning_rate": 8.817901234567901e-07, "logits/chosen": -2.778381586074829, "logits/rejected": -2.763965129852295, "logps/chosen": -42.75944519042969, "logps/rejected": -95.60743713378906, "loss": 0.68, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4188687801361084, "rewards/margins": 3.3930673599243164, "rewards/rejected": -1.974198341369629, "step": 384 }, { "epoch": 1.1877645248172375, "grad_norm": 7.003907203674316, "learning_rate": 8.814814814814815e-07, "logits/chosen": -2.790501594543457, "logits/rejected": -2.8394217491149902, "logps/chosen": -40.990394592285156, "logps/rejected": -90.89460754394531, "loss": 0.5781, "rewards/accuracies": 0.90625, "rewards/chosen": 1.621785044670105, "rewards/margins": 3.5677664279937744, "rewards/rejected": -1.9459812641143799, "step": 385 }, { "epoch": 1.1908426317814544, "grad_norm": 4.665971755981445, "learning_rate": 8.811728395061728e-07, "logits/chosen": -2.750939130783081, "logits/rejected": -2.7117910385131836, "logps/chosen": -46.30043411254883, "logps/rejected": -134.78652954101562, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 1.1272308826446533, "rewards/margins": 5.742195129394531, "rewards/rejected": -4.614964008331299, "step": 386 }, { "epoch": 1.1939207387456714, "grad_norm": 4.247708320617676, "learning_rate": 8.808641975308642e-07, "logits/chosen": -2.7190685272216797, "logits/rejected": -2.7011618614196777, "logps/chosen": -46.037620544433594, "logps/rejected": -112.35049438476562, "loss": 0.4679, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3517944812774658, "rewards/margins": 4.802014350891113, "rewards/rejected": -3.4502201080322266, "step": 387 }, { "epoch": 1.1969988457098883, "grad_norm": 3.9806885719299316, "learning_rate": 8.805555555555555e-07, "logits/chosen": -2.818506956100464, "logits/rejected": -2.843794584274292, "logps/chosen": -52.86320877075195, "logps/rejected": -105.41265869140625, "loss": 0.6593, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8092026114463806, "rewards/margins": 3.846311092376709, "rewards/rejected": -3.0371084213256836, "step": 388 }, { "epoch": 1.2000769526741055, "grad_norm": 3.568552255630493, "learning_rate": 8.802469135802469e-07, "logits/chosen": -2.62540340423584, "logits/rejected": -2.713799476623535, "logps/chosen": -41.057430267333984, "logps/rejected": -112.48632049560547, "loss": 0.5023, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7688567638397217, "rewards/margins": 5.224759101867676, "rewards/rejected": -3.455901861190796, "step": 389 }, { "epoch": 1.2031550596383225, "grad_norm": 4.719287395477295, "learning_rate": 8.799382716049383e-07, "logits/chosen": -2.7596094608306885, "logits/rejected": -2.7684595584869385, "logps/chosen": -49.46556091308594, "logps/rejected": -106.55522918701172, "loss": 0.6264, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8091545104980469, "rewards/margins": 3.664018392562866, "rewards/rejected": -2.8548638820648193, "step": 390 }, { "epoch": 1.2031550596383225, "eval_logits/chosen": -2.687856674194336, "eval_logits/rejected": -2.744150161743164, "eval_logps/chosen": -45.79682540893555, "eval_logps/rejected": -111.00682830810547, "eval_loss": 0.5839963555335999, "eval_rewards/accuracies": 0.913294792175293, "eval_rewards/chosen": 1.2476974725723267, "eval_rewards/margins": 4.331796646118164, "eval_rewards/rejected": -3.084099292755127, "eval_runtime": 635.9178, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.272, "step": 390 }, { "epoch": 1.2062331666025394, "grad_norm": 3.6343436241149902, "learning_rate": 8.796296296296296e-07, "logits/chosen": -2.7481017112731934, "logits/rejected": -2.7649924755096436, "logps/chosen": -44.71501922607422, "logps/rejected": -107.89212036132812, "loss": 0.5245, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6210863590240479, "rewards/margins": 4.220058917999268, "rewards/rejected": -2.598973035812378, "step": 391 }, { "epoch": 1.2093112735667564, "grad_norm": 4.611740589141846, "learning_rate": 8.79320987654321e-07, "logits/chosen": -2.7481260299682617, "logits/rejected": -2.807015895843506, "logps/chosen": -39.964317321777344, "logps/rejected": -99.78328704833984, "loss": 0.5697, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7954607009887695, "rewards/margins": 4.327472686767578, "rewards/rejected": -2.5320119857788086, "step": 392 }, { "epoch": 1.2123893805309733, "grad_norm": 4.048872470855713, "learning_rate": 8.790123456790123e-07, "logits/chosen": -2.7801361083984375, "logits/rejected": -2.716434955596924, "logps/chosen": -39.31647872924805, "logps/rejected": -86.64339447021484, "loss": 0.6758, "rewards/accuracies": 0.8125, "rewards/chosen": 1.9223971366882324, "rewards/margins": 3.275338888168335, "rewards/rejected": -1.352941870689392, "step": 393 }, { "epoch": 1.2154674874951905, "grad_norm": 3.3962974548339844, "learning_rate": 8.787037037037036e-07, "logits/chosen": -2.755117416381836, "logits/rejected": -2.7769839763641357, "logps/chosen": -52.68999481201172, "logps/rejected": -122.42757415771484, "loss": 0.4966, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9712950587272644, "rewards/margins": 5.171798229217529, "rewards/rejected": -4.200503349304199, "step": 394 }, { "epoch": 1.2185455944594075, "grad_norm": 3.6326985359191895, "learning_rate": 8.783950617283951e-07, "logits/chosen": -2.6539652347564697, "logits/rejected": -2.6764674186706543, "logps/chosen": -37.88435745239258, "logps/rejected": -94.44776916503906, "loss": 0.5953, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8023390769958496, "rewards/margins": 4.039123058319092, "rewards/rejected": -2.236783981323242, "step": 395 }, { "epoch": 1.2216237014236244, "grad_norm": 4.576152801513672, "learning_rate": 8.780864197530864e-07, "logits/chosen": -2.821197271347046, "logits/rejected": -2.7506351470947266, "logps/chosen": -39.18404769897461, "logps/rejected": -108.52210235595703, "loss": 0.5062, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8705236911773682, "rewards/margins": 4.843180179595947, "rewards/rejected": -2.972656488418579, "step": 396 }, { "epoch": 1.2247018083878416, "grad_norm": 3.892291784286499, "learning_rate": 8.777777777777777e-07, "logits/chosen": -2.6985061168670654, "logits/rejected": -2.739896059036255, "logps/chosen": -32.76089859008789, "logps/rejected": -100.24835205078125, "loss": 0.4331, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9243106842041016, "rewards/margins": 4.770061492919922, "rewards/rejected": -2.845750570297241, "step": 397 }, { "epoch": 1.2277799153520585, "grad_norm": 4.24825382232666, "learning_rate": 8.774691358024691e-07, "logits/chosen": -2.7549479007720947, "logits/rejected": -2.7557783126831055, "logps/chosen": -50.44514083862305, "logps/rejected": -112.47740173339844, "loss": 0.5173, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3395371437072754, "rewards/margins": 4.474265098571777, "rewards/rejected": -3.134727954864502, "step": 398 }, { "epoch": 1.2308580223162755, "grad_norm": 3.5681443214416504, "learning_rate": 8.771604938271605e-07, "logits/chosen": -2.7386560440063477, "logits/rejected": -2.732107639312744, "logps/chosen": -41.863399505615234, "logps/rejected": -101.63407135009766, "loss": 0.5734, "rewards/accuracies": 0.9375, "rewards/chosen": 1.467279076576233, "rewards/margins": 4.107944488525391, "rewards/rejected": -2.6406655311584473, "step": 399 }, { "epoch": 1.2339361292804925, "grad_norm": 5.167920112609863, "learning_rate": 8.768518518518519e-07, "logits/chosen": -2.774913787841797, "logits/rejected": -2.758551836013794, "logps/chosen": -51.275211334228516, "logps/rejected": -115.53167724609375, "loss": 0.6058, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5299093723297119, "rewards/margins": 4.045196533203125, "rewards/rejected": -3.515287160873413, "step": 400 }, { "epoch": 1.2370142362447094, "grad_norm": 4.343350887298584, "learning_rate": 8.765432098765432e-07, "logits/chosen": -2.740726947784424, "logits/rejected": -2.7538862228393555, "logps/chosen": -55.026851654052734, "logps/rejected": -101.32000732421875, "loss": 0.6253, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9346969127655029, "rewards/margins": 3.6324357986450195, "rewards/rejected": -2.6977391242980957, "step": 401 }, { "epoch": 1.2400923432089266, "grad_norm": 4.159266948699951, "learning_rate": 8.762345679012345e-07, "logits/chosen": -2.851757287979126, "logits/rejected": -2.827547550201416, "logps/chosen": -47.114925384521484, "logps/rejected": -111.549072265625, "loss": 0.596, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9019561409950256, "rewards/margins": 4.2549591064453125, "rewards/rejected": -3.3530030250549316, "step": 402 }, { "epoch": 1.2431704501731435, "grad_norm": 5.664335250854492, "learning_rate": 8.759259259259259e-07, "logits/chosen": -2.760502576828003, "logits/rejected": -2.739288091659546, "logps/chosen": -41.53828048706055, "logps/rejected": -109.32524871826172, "loss": 0.5359, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4804935455322266, "rewards/margins": 4.789822578430176, "rewards/rejected": -3.30932879447937, "step": 403 }, { "epoch": 1.2462485571373605, "grad_norm": 3.9847660064697266, "learning_rate": 8.756172839506173e-07, "logits/chosen": -2.8159430027008057, "logits/rejected": -2.7152926921844482, "logps/chosen": -45.12547302246094, "logps/rejected": -104.45260620117188, "loss": 0.5553, "rewards/accuracies": 0.875, "rewards/chosen": 1.2264795303344727, "rewards/margins": 4.209685802459717, "rewards/rejected": -2.9832065105438232, "step": 404 }, { "epoch": 1.2493266641015774, "grad_norm": 4.23655891418457, "learning_rate": 8.753086419753086e-07, "logits/chosen": -2.824434757232666, "logits/rejected": -2.7592520713806152, "logps/chosen": -39.55470657348633, "logps/rejected": -108.9683609008789, "loss": 0.5448, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7643996477127075, "rewards/margins": 4.727837562561035, "rewards/rejected": -2.963438034057617, "step": 405 }, { "epoch": 1.2524047710657946, "grad_norm": 3.1874170303344727, "learning_rate": 8.75e-07, "logits/chosen": -2.7314674854278564, "logits/rejected": -2.755495548248291, "logps/chosen": -46.703880310058594, "logps/rejected": -115.47087097167969, "loss": 0.4835, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2950985431671143, "rewards/margins": 4.880054950714111, "rewards/rejected": -3.584956169128418, "step": 406 }, { "epoch": 1.2554828780300116, "grad_norm": 5.3430094718933105, "learning_rate": 8.746913580246913e-07, "logits/chosen": -2.8015618324279785, "logits/rejected": -2.8209776878356934, "logps/chosen": -36.69210433959961, "logps/rejected": -96.4420166015625, "loss": 0.5413, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7368175983428955, "rewards/margins": 4.372384071350098, "rewards/rejected": -2.6355669498443604, "step": 407 }, { "epoch": 1.2585609849942285, "grad_norm": 3.2205517292022705, "learning_rate": 8.743827160493826e-07, "logits/chosen": -2.7637081146240234, "logits/rejected": -2.7788145542144775, "logps/chosen": -42.26286315917969, "logps/rejected": -95.42975616455078, "loss": 0.5811, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4612537622451782, "rewards/margins": 3.681138515472412, "rewards/rejected": -2.2198848724365234, "step": 408 }, { "epoch": 1.2616390919584455, "grad_norm": 3.5751688480377197, "learning_rate": 8.740740740740741e-07, "logits/chosen": -2.7530369758605957, "logits/rejected": -2.8344497680664062, "logps/chosen": -53.012489318847656, "logps/rejected": -125.36436462402344, "loss": 0.51, "rewards/accuracies": 1.0, "rewards/chosen": 1.111279010772705, "rewards/margins": 5.22649621963501, "rewards/rejected": -4.115217685699463, "step": 409 }, { "epoch": 1.2647171989226624, "grad_norm": 4.005385398864746, "learning_rate": 8.737654320987654e-07, "logits/chosen": -2.7423601150512695, "logits/rejected": -2.8012008666992188, "logps/chosen": -46.6833610534668, "logps/rejected": -105.52845001220703, "loss": 0.5657, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1227003335952759, "rewards/margins": 4.186243057250977, "rewards/rejected": -3.063542366027832, "step": 410 }, { "epoch": 1.2677953058868796, "grad_norm": 4.191664218902588, "learning_rate": 8.734567901234568e-07, "logits/chosen": -2.8003318309783936, "logits/rejected": -2.7853074073791504, "logps/chosen": -44.61855697631836, "logps/rejected": -108.2998275756836, "loss": 0.5819, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4292372465133667, "rewards/margins": 4.437068939208984, "rewards/rejected": -3.007831573486328, "step": 411 }, { "epoch": 1.2708734128510966, "grad_norm": 5.4680047035217285, "learning_rate": 8.731481481481481e-07, "logits/chosen": -2.815734386444092, "logits/rejected": -2.8227999210357666, "logps/chosen": -33.470985412597656, "logps/rejected": -105.64934539794922, "loss": 0.4307, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3643381595611572, "rewards/margins": 4.954202175140381, "rewards/rejected": -2.5898637771606445, "step": 412 }, { "epoch": 1.2739515198153135, "grad_norm": 3.8178205490112305, "learning_rate": 8.728395061728394e-07, "logits/chosen": -2.7439229488372803, "logits/rejected": -2.750035285949707, "logps/chosen": -50.62183380126953, "logps/rejected": -112.79438781738281, "loss": 0.4929, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1857227087020874, "rewards/margins": 4.5864033699035645, "rewards/rejected": -3.4006807804107666, "step": 413 }, { "epoch": 1.2770296267795307, "grad_norm": 4.306251049041748, "learning_rate": 8.725308641975309e-07, "logits/chosen": -2.814032793045044, "logits/rejected": -2.747260093688965, "logps/chosen": -53.693115234375, "logps/rejected": -108.53501892089844, "loss": 0.6227, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6861941814422607, "rewards/margins": 3.8322811126708984, "rewards/rejected": -3.1460869312286377, "step": 414 }, { "epoch": 1.2801077337437476, "grad_norm": 4.169206619262695, "learning_rate": 8.722222222222222e-07, "logits/chosen": -2.652799367904663, "logits/rejected": -2.7232608795166016, "logps/chosen": -49.7322883605957, "logps/rejected": -124.3753890991211, "loss": 0.5158, "rewards/accuracies": 1.0, "rewards/chosen": 0.8464288711547852, "rewards/margins": 5.128560543060303, "rewards/rejected": -4.282132625579834, "step": 415 }, { "epoch": 1.2831858407079646, "grad_norm": 4.3528337478637695, "learning_rate": 8.719135802469135e-07, "logits/chosen": -2.7473764419555664, "logits/rejected": -2.719116449356079, "logps/chosen": -43.95294189453125, "logps/rejected": -111.89099884033203, "loss": 0.5594, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5239964723587036, "rewards/margins": 4.770825386047363, "rewards/rejected": -3.2468292713165283, "step": 416 }, { "epoch": 1.2862639476721816, "grad_norm": 4.364219665527344, "learning_rate": 8.716049382716049e-07, "logits/chosen": -2.7297487258911133, "logits/rejected": -2.734745740890503, "logps/chosen": -39.39838409423828, "logps/rejected": -102.95865631103516, "loss": 0.5235, "rewards/accuracies": 0.84375, "rewards/chosen": 1.714596152305603, "rewards/margins": 4.613806247711182, "rewards/rejected": -2.8992104530334473, "step": 417 }, { "epoch": 1.2893420546363985, "grad_norm": 5.719569206237793, "learning_rate": 8.712962962962963e-07, "logits/chosen": -2.7433440685272217, "logits/rejected": -2.71722674369812, "logps/chosen": -56.1038818359375, "logps/rejected": -123.09749603271484, "loss": 0.5582, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6779492497444153, "rewards/margins": 4.571812152862549, "rewards/rejected": -3.893862247467041, "step": 418 }, { "epoch": 1.2924201616006157, "grad_norm": 3.57623291015625, "learning_rate": 8.709876543209877e-07, "logits/chosen": -2.8055615425109863, "logits/rejected": -2.82206654548645, "logps/chosen": -44.66054916381836, "logps/rejected": -101.71852111816406, "loss": 0.6086, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4368252754211426, "rewards/margins": 3.965644359588623, "rewards/rejected": -2.5288190841674805, "step": 419 }, { "epoch": 1.2954982685648326, "grad_norm": 3.763871908187866, "learning_rate": 8.70679012345679e-07, "logits/chosen": -2.646576404571533, "logits/rejected": -2.648977518081665, "logps/chosen": -44.26051330566406, "logps/rejected": -123.39755249023438, "loss": 0.4811, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4681687355041504, "rewards/margins": 5.326776027679443, "rewards/rejected": -3.858607292175293, "step": 420 }, { "epoch": 1.2985763755290496, "grad_norm": 3.474337100982666, "learning_rate": 8.703703703703703e-07, "logits/chosen": -2.823594093322754, "logits/rejected": -2.7970194816589355, "logps/chosen": -50.2730598449707, "logps/rejected": -132.7025604248047, "loss": 0.4502, "rewards/accuracies": 0.875, "rewards/chosen": 0.9012991189956665, "rewards/margins": 5.500693321228027, "rewards/rejected": -4.599394798278809, "step": 421 }, { "epoch": 1.3016544824932668, "grad_norm": 4.990508079528809, "learning_rate": 8.700617283950617e-07, "logits/chosen": -2.7320799827575684, "logits/rejected": -2.749818801879883, "logps/chosen": -48.56172180175781, "logps/rejected": -106.7072982788086, "loss": 0.6162, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0326958894729614, "rewards/margins": 3.9157471656799316, "rewards/rejected": -2.8830513954162598, "step": 422 }, { "epoch": 1.3047325894574837, "grad_norm": 4.172371864318848, "learning_rate": 8.697530864197531e-07, "logits/chosen": -2.785754680633545, "logits/rejected": -2.7940049171447754, "logps/chosen": -55.55989074707031, "logps/rejected": -130.09628295898438, "loss": 0.5425, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5384551286697388, "rewards/margins": 4.965723991394043, "rewards/rejected": -4.427268981933594, "step": 423 }, { "epoch": 1.3078106964217007, "grad_norm": 4.082664489746094, "learning_rate": 8.694444444444444e-07, "logits/chosen": -2.7497775554656982, "logits/rejected": -2.754957914352417, "logps/chosen": -38.7867546081543, "logps/rejected": -99.6485595703125, "loss": 0.5648, "rewards/accuracies": 1.0, "rewards/chosen": 1.6725142002105713, "rewards/margins": 4.300405025482178, "rewards/rejected": -2.6278905868530273, "step": 424 }, { "epoch": 1.3108888033859176, "grad_norm": 3.6814491748809814, "learning_rate": 8.691358024691358e-07, "logits/chosen": -2.7620692253112793, "logits/rejected": -2.7648208141326904, "logps/chosen": -48.038185119628906, "logps/rejected": -115.8727035522461, "loss": 0.5203, "rewards/accuracies": 1.0, "rewards/chosen": 1.0630323886871338, "rewards/margins": 4.737104415893555, "rewards/rejected": -3.674072265625, "step": 425 }, { "epoch": 1.3139669103501346, "grad_norm": 3.8516266345977783, "learning_rate": 8.688271604938271e-07, "logits/chosen": -2.6656715869903564, "logits/rejected": -2.6719634532928467, "logps/chosen": -37.884342193603516, "logps/rejected": -102.41546630859375, "loss": 0.5169, "rewards/accuracies": 0.84375, "rewards/chosen": 2.043219804763794, "rewards/margins": 4.551488876342773, "rewards/rejected": -2.5082690715789795, "step": 426 }, { "epoch": 1.3170450173143518, "grad_norm": 5.083359241485596, "learning_rate": 8.685185185185184e-07, "logits/chosen": -2.761073112487793, "logits/rejected": -2.788426637649536, "logps/chosen": -56.08763122558594, "logps/rejected": -112.26666259765625, "loss": 0.5796, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7147833108901978, "rewards/margins": 4.10665225982666, "rewards/rejected": -3.391868829727173, "step": 427 }, { "epoch": 1.3201231242785687, "grad_norm": 4.706692218780518, "learning_rate": 8.682098765432099e-07, "logits/chosen": -2.810370683670044, "logits/rejected": -2.8585875034332275, "logps/chosen": -45.62620544433594, "logps/rejected": -112.70231628417969, "loss": 0.5018, "rewards/accuracies": 1.0, "rewards/chosen": 1.2999098300933838, "rewards/margins": 4.6240644454956055, "rewards/rejected": -3.3241543769836426, "step": 428 }, { "epoch": 1.3232012312427857, "grad_norm": 3.91945219039917, "learning_rate": 8.679012345679012e-07, "logits/chosen": -2.730541467666626, "logits/rejected": -2.7371068000793457, "logps/chosen": -41.34614944458008, "logps/rejected": -113.8946533203125, "loss": 0.5436, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6069722175598145, "rewards/margins": 4.755295276641846, "rewards/rejected": -3.1483235359191895, "step": 429 }, { "epoch": 1.3262793382070026, "grad_norm": 4.7202067375183105, "learning_rate": 8.675925925925926e-07, "logits/chosen": -2.7395288944244385, "logits/rejected": -2.747620105743408, "logps/chosen": -40.39515686035156, "logps/rejected": -90.7668228149414, "loss": 0.6081, "rewards/accuracies": 0.96875, "rewards/chosen": 1.580759882926941, "rewards/margins": 3.506600856781006, "rewards/rejected": -1.925840973854065, "step": 430 }, { "epoch": 1.3293574451712198, "grad_norm": 5.599898815155029, "learning_rate": 8.672839506172839e-07, "logits/chosen": -2.7213940620422363, "logits/rejected": -2.809908866882324, "logps/chosen": -35.7892951965332, "logps/rejected": -97.60623931884766, "loss": 0.4844, "rewards/accuracies": 0.96875, "rewards/chosen": 2.444486618041992, "rewards/margins": 4.524001121520996, "rewards/rejected": -2.079514265060425, "step": 431 }, { "epoch": 1.3324355521354367, "grad_norm": 3.367170572280884, "learning_rate": 8.669753086419753e-07, "logits/chosen": -2.8493778705596924, "logits/rejected": -2.842350482940674, "logps/chosen": -38.4129753112793, "logps/rejected": -115.49964904785156, "loss": 0.4923, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8082537651062012, "rewards/margins": 5.097301959991455, "rewards/rejected": -3.289048433303833, "step": 432 }, { "epoch": 1.3355136590996537, "grad_norm": 3.9252638816833496, "learning_rate": 8.666666666666667e-07, "logits/chosen": -2.761268138885498, "logits/rejected": -2.778806209564209, "logps/chosen": -46.227569580078125, "logps/rejected": -99.90704345703125, "loss": 0.5881, "rewards/accuracies": 0.96875, "rewards/chosen": 1.471848726272583, "rewards/margins": 3.7992312908172607, "rewards/rejected": -2.327382802963257, "step": 433 }, { "epoch": 1.3385917660638706, "grad_norm": 3.7837283611297607, "learning_rate": 8.66358024691358e-07, "logits/chosen": -2.7369070053100586, "logits/rejected": -2.7823123931884766, "logps/chosen": -39.36227798461914, "logps/rejected": -107.62454223632812, "loss": 0.5328, "rewards/accuracies": 1.0, "rewards/chosen": 1.678908348083496, "rewards/margins": 5.017334461212158, "rewards/rejected": -3.338426351547241, "step": 434 }, { "epoch": 1.3416698730280876, "grad_norm": 4.546927452087402, "learning_rate": 8.660493827160493e-07, "logits/chosen": -2.66511607170105, "logits/rejected": -2.702864170074463, "logps/chosen": -42.61738967895508, "logps/rejected": -112.20984649658203, "loss": 0.4994, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4439866542816162, "rewards/margins": 4.807631015777588, "rewards/rejected": -3.3636443614959717, "step": 435 }, { "epoch": 1.3447479799923048, "grad_norm": 4.817926406860352, "learning_rate": 8.657407407407407e-07, "logits/chosen": -2.7604658603668213, "logits/rejected": -2.7420966625213623, "logps/chosen": -54.656211853027344, "logps/rejected": -107.05135345458984, "loss": 0.6945, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5567015409469604, "rewards/margins": 3.4949402809143066, "rewards/rejected": -2.9382388591766357, "step": 436 }, { "epoch": 1.3478260869565217, "grad_norm": 7.603365898132324, "learning_rate": 8.654320987654321e-07, "logits/chosen": -2.719834804534912, "logits/rejected": -2.8059539794921875, "logps/chosen": -41.211273193359375, "logps/rejected": -99.74800872802734, "loss": 0.592, "rewards/accuracies": 1.0, "rewards/chosen": 1.8514821529388428, "rewards/margins": 4.171772003173828, "rewards/rejected": -2.3202900886535645, "step": 437 }, { "epoch": 1.3509041939207387, "grad_norm": 4.508315086364746, "learning_rate": 8.651234567901234e-07, "logits/chosen": -2.691967487335205, "logits/rejected": -2.7226686477661133, "logps/chosen": -40.294921875, "logps/rejected": -98.00861358642578, "loss": 0.5656, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6495106220245361, "rewards/margins": 4.000494480133057, "rewards/rejected": -2.3509838581085205, "step": 438 }, { "epoch": 1.3539823008849559, "grad_norm": 3.8041727542877197, "learning_rate": 8.648148148148148e-07, "logits/chosen": -2.789583206176758, "logits/rejected": -2.780623197555542, "logps/chosen": -49.177940368652344, "logps/rejected": -114.3790054321289, "loss": 0.573, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0403761863708496, "rewards/margins": 4.417273044586182, "rewards/rejected": -3.376896858215332, "step": 439 }, { "epoch": 1.3570604078491728, "grad_norm": 3.688650608062744, "learning_rate": 8.645061728395061e-07, "logits/chosen": -2.7136764526367188, "logits/rejected": -2.7568881511688232, "logps/chosen": -42.38011169433594, "logps/rejected": -113.3343734741211, "loss": 0.5049, "rewards/accuracies": 0.90625, "rewards/chosen": 1.433027744293213, "rewards/margins": 4.842779636383057, "rewards/rejected": -3.4097516536712646, "step": 440 }, { "epoch": 1.3601385148133898, "grad_norm": 4.274280548095703, "learning_rate": 8.641975308641974e-07, "logits/chosen": -2.705796480178833, "logits/rejected": -2.7297279834747314, "logps/chosen": -42.652950286865234, "logps/rejected": -99.96900177001953, "loss": 0.5369, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6642564535140991, "rewards/margins": 4.230477333068848, "rewards/rejected": -2.56622052192688, "step": 441 }, { "epoch": 1.3632166217776067, "grad_norm": 4.020526885986328, "learning_rate": 8.638888888888889e-07, "logits/chosen": -2.702043056488037, "logits/rejected": -2.7203590869903564, "logps/chosen": -36.402854919433594, "logps/rejected": -111.12738037109375, "loss": 0.5023, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6602091789245605, "rewards/margins": 4.8507256507873535, "rewards/rejected": -3.190516233444214, "step": 442 }, { "epoch": 1.3662947287418237, "grad_norm": 3.6833999156951904, "learning_rate": 8.635802469135802e-07, "logits/chosen": -2.7540693283081055, "logits/rejected": -2.8118085861206055, "logps/chosen": -49.679771423339844, "logps/rejected": -102.7165756225586, "loss": 0.651, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1465413570404053, "rewards/margins": 3.825741767883301, "rewards/rejected": -2.6792001724243164, "step": 443 }, { "epoch": 1.3693728357060408, "grad_norm": 3.7894771099090576, "learning_rate": 8.632716049382716e-07, "logits/chosen": -2.8049275875091553, "logits/rejected": -2.7075889110565186, "logps/chosen": -48.55695724487305, "logps/rejected": -117.91169738769531, "loss": 0.5112, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0430810451507568, "rewards/margins": 4.7302021980285645, "rewards/rejected": -3.6871211528778076, "step": 444 }, { "epoch": 1.3724509426702578, "grad_norm": 4.465208530426025, "learning_rate": 8.629629629629629e-07, "logits/chosen": -2.8548927307128906, "logits/rejected": -2.8120052814483643, "logps/chosen": -42.870391845703125, "logps/rejected": -110.44439697265625, "loss": 0.4813, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3320591449737549, "rewards/margins": 4.603089809417725, "rewards/rejected": -3.2710304260253906, "step": 445 }, { "epoch": 1.3755290496344748, "grad_norm": 5.908717155456543, "learning_rate": 8.626543209876542e-07, "logits/chosen": -2.7812459468841553, "logits/rejected": -2.683037757873535, "logps/chosen": -49.79789733886719, "logps/rejected": -129.16587829589844, "loss": 0.5269, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8924311995506287, "rewards/margins": 5.279480457305908, "rewards/rejected": -4.387049198150635, "step": 446 }, { "epoch": 1.378607156598692, "grad_norm": 3.487874746322632, "learning_rate": 8.623456790123457e-07, "logits/chosen": -2.7989914417266846, "logits/rejected": -2.837334156036377, "logps/chosen": -51.3070068359375, "logps/rejected": -119.54878234863281, "loss": 0.5232, "rewards/accuracies": 0.875, "rewards/chosen": 1.2119083404541016, "rewards/margins": 4.818431854248047, "rewards/rejected": -3.6065235137939453, "step": 447 }, { "epoch": 1.3816852635629089, "grad_norm": 4.595246315002441, "learning_rate": 8.62037037037037e-07, "logits/chosen": -2.767251491546631, "logits/rejected": -2.764256238937378, "logps/chosen": -52.61651611328125, "logps/rejected": -124.51499938964844, "loss": 0.5218, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6744053959846497, "rewards/margins": 4.858223915100098, "rewards/rejected": -4.183818817138672, "step": 448 }, { "epoch": 1.3847633705271258, "grad_norm": 5.793344974517822, "learning_rate": 8.617283950617283e-07, "logits/chosen": -2.7051472663879395, "logits/rejected": -2.7776145935058594, "logps/chosen": -43.37232971191406, "logps/rejected": -125.5089111328125, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 1.774674892425537, "rewards/margins": 6.277159690856934, "rewards/rejected": -4.502484321594238, "step": 449 }, { "epoch": 1.3878414774913428, "grad_norm": 4.399031639099121, "learning_rate": 8.614197530864197e-07, "logits/chosen": -2.7175965309143066, "logits/rejected": -2.7511327266693115, "logps/chosen": -36.09843444824219, "logps/rejected": -103.72041320800781, "loss": 0.5362, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8823171854019165, "rewards/margins": 4.467890739440918, "rewards/rejected": -2.58557391166687, "step": 450 }, { "epoch": 1.3909195844555597, "grad_norm": 3.97684383392334, "learning_rate": 8.611111111111111e-07, "logits/chosen": -2.693230628967285, "logits/rejected": -2.729067802429199, "logps/chosen": -48.26095962524414, "logps/rejected": -121.0561294555664, "loss": 0.5026, "rewards/accuracies": 1.0, "rewards/chosen": 1.4328532218933105, "rewards/margins": 5.330498218536377, "rewards/rejected": -3.8976449966430664, "step": 451 }, { "epoch": 1.393997691419777, "grad_norm": 3.6091504096984863, "learning_rate": 8.608024691358025e-07, "logits/chosen": -2.680449962615967, "logits/rejected": -2.783282995223999, "logps/chosen": -33.49571990966797, "logps/rejected": -125.88920593261719, "loss": 0.2801, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3205008506774902, "rewards/margins": 6.543156147003174, "rewards/rejected": -4.222654342651367, "step": 452 }, { "epoch": 1.3970757983839939, "grad_norm": 4.876512050628662, "learning_rate": 8.604938271604938e-07, "logits/chosen": -2.747699737548828, "logits/rejected": -2.7924516201019287, "logps/chosen": -36.552913665771484, "logps/rejected": -100.43305969238281, "loss": 0.5571, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8997764587402344, "rewards/margins": 4.491486072540283, "rewards/rejected": -2.591709852218628, "step": 453 }, { "epoch": 1.4001539053482108, "grad_norm": 4.204183101654053, "learning_rate": 8.601851851851851e-07, "logits/chosen": -2.803800106048584, "logits/rejected": -2.724268674850464, "logps/chosen": -61.47042465209961, "logps/rejected": -121.11611938476562, "loss": 0.5921, "rewards/accuracies": 0.90625, "rewards/chosen": 0.22753366827964783, "rewards/margins": 4.2161431312561035, "rewards/rejected": -3.9886093139648438, "step": 454 }, { "epoch": 1.4032320123124278, "grad_norm": 5.101119041442871, "learning_rate": 8.598765432098765e-07, "logits/chosen": -2.8225765228271484, "logits/rejected": -2.76560115814209, "logps/chosen": -54.4891242980957, "logps/rejected": -120.37102508544922, "loss": 0.5883, "rewards/accuracies": 0.875, "rewards/chosen": 0.49022218585014343, "rewards/margins": 4.422501087188721, "rewards/rejected": -3.932278871536255, "step": 455 }, { "epoch": 1.4032320123124278, "eval_logits/chosen": -2.699073314666748, "eval_logits/rejected": -2.7519779205322266, "eval_logps/chosen": -44.70673370361328, "eval_logps/rejected": -111.03082275390625, "eval_loss": 0.5795396566390991, "eval_rewards/accuracies": 0.9017341136932373, "eval_rewards/chosen": 1.3567060232162476, "eval_rewards/margins": 4.443203449249268, "eval_rewards/rejected": -3.0864977836608887, "eval_runtime": 639.8749, "eval_samples_per_second": 0.541, "eval_steps_per_second": 0.27, "step": 455 }, { "epoch": 1.406310119276645, "grad_norm": 3.900136947631836, "learning_rate": 8.595679012345679e-07, "logits/chosen": -2.869934558868408, "logits/rejected": -2.7735679149627686, "logps/chosen": -46.8817253112793, "logps/rejected": -101.2826919555664, "loss": 0.6062, "rewards/accuracies": 0.875, "rewards/chosen": 1.6103826761245728, "rewards/margins": 3.992750406265259, "rewards/rejected": -2.3823678493499756, "step": 456 }, { "epoch": 1.409388226240862, "grad_norm": 5.451925277709961, "learning_rate": 8.592592592592592e-07, "logits/chosen": -2.774148464202881, "logits/rejected": -2.7565040588378906, "logps/chosen": -55.878334045410156, "logps/rejected": -126.4359130859375, "loss": 0.5107, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7300463914871216, "rewards/margins": 5.004105091094971, "rewards/rejected": -4.2740583419799805, "step": 457 }, { "epoch": 1.4124663332050789, "grad_norm": 5.268304347991943, "learning_rate": 8.589506172839506e-07, "logits/chosen": -2.763251781463623, "logits/rejected": -2.7484254837036133, "logps/chosen": -38.150203704833984, "logps/rejected": -121.60379791259766, "loss": 0.4691, "rewards/accuracies": 0.96875, "rewards/chosen": 1.892861247062683, "rewards/margins": 5.663969039916992, "rewards/rejected": -3.7711071968078613, "step": 458 }, { "epoch": 1.4155444401692958, "grad_norm": 5.390244483947754, "learning_rate": 8.586419753086419e-07, "logits/chosen": -2.8265419006347656, "logits/rejected": -2.750502109527588, "logps/chosen": -43.00567626953125, "logps/rejected": -101.92303466796875, "loss": 0.5634, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2533810138702393, "rewards/margins": 3.961381435394287, "rewards/rejected": -2.708000421524048, "step": 459 }, { "epoch": 1.4186225471335128, "grad_norm": 5.336375713348389, "learning_rate": 8.583333333333332e-07, "logits/chosen": -2.669814109802246, "logits/rejected": -2.7152791023254395, "logps/chosen": -42.86963653564453, "logps/rejected": -107.57072448730469, "loss": 0.5587, "rewards/accuracies": 0.9375, "rewards/chosen": 1.63015878200531, "rewards/margins": 4.494204998016357, "rewards/rejected": -2.864046573638916, "step": 460 }, { "epoch": 1.42170065409773, "grad_norm": 3.9117894172668457, "learning_rate": 8.580246913580247e-07, "logits/chosen": -2.7859628200531006, "logits/rejected": -2.804560899734497, "logps/chosen": -55.475486755371094, "logps/rejected": -117.69107055664062, "loss": 0.5611, "rewards/accuracies": 0.875, "rewards/chosen": 0.7327648997306824, "rewards/margins": 4.108468055725098, "rewards/rejected": -3.3757033348083496, "step": 461 }, { "epoch": 1.424778761061947, "grad_norm": 4.001144886016846, "learning_rate": 8.57716049382716e-07, "logits/chosen": -2.707159996032715, "logits/rejected": -2.7570066452026367, "logps/chosen": -49.60067367553711, "logps/rejected": -120.72731018066406, "loss": 0.5567, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9583505988121033, "rewards/margins": 4.495384693145752, "rewards/rejected": -3.537034273147583, "step": 462 }, { "epoch": 1.4278568680261639, "grad_norm": 4.3204522132873535, "learning_rate": 8.574074074074074e-07, "logits/chosen": -2.6577513217926025, "logits/rejected": -2.6638057231903076, "logps/chosen": -36.86759948730469, "logps/rejected": -101.45079803466797, "loss": 0.5864, "rewards/accuracies": 0.84375, "rewards/chosen": 1.913230061531067, "rewards/margins": 4.280379772186279, "rewards/rejected": -2.367149829864502, "step": 463 }, { "epoch": 1.430934974990381, "grad_norm": 3.874907970428467, "learning_rate": 8.570987654320987e-07, "logits/chosen": -2.7028591632843018, "logits/rejected": -2.720193862915039, "logps/chosen": -53.59135437011719, "logps/rejected": -111.0986328125, "loss": 0.5986, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8201597929000854, "rewards/margins": 4.225282669067383, "rewards/rejected": -3.4051225185394287, "step": 464 }, { "epoch": 1.434013081954598, "grad_norm": 4.283587455749512, "learning_rate": 8.567901234567901e-07, "logits/chosen": -2.743178367614746, "logits/rejected": -2.7282283306121826, "logps/chosen": -35.65962219238281, "logps/rejected": -102.79766845703125, "loss": 0.4939, "rewards/accuracies": 0.90625, "rewards/chosen": 2.1263110637664795, "rewards/margins": 4.871950149536133, "rewards/rejected": -2.7456390857696533, "step": 465 }, { "epoch": 1.437091188918815, "grad_norm": 3.0908451080322266, "learning_rate": 8.564814814814815e-07, "logits/chosen": -2.6986613273620605, "logits/rejected": -2.730302095413208, "logps/chosen": -51.93602752685547, "logps/rejected": -115.27721405029297, "loss": 0.511, "rewards/accuracies": 0.875, "rewards/chosen": 0.9131894707679749, "rewards/margins": 4.454510688781738, "rewards/rejected": -3.54132080078125, "step": 466 }, { "epoch": 1.4401692958830319, "grad_norm": 7.409972667694092, "learning_rate": 8.561728395061728e-07, "logits/chosen": -2.7605392932891846, "logits/rejected": -2.7636618614196777, "logps/chosen": -39.9962272644043, "logps/rejected": -109.25057983398438, "loss": 0.5227, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8460521697998047, "rewards/margins": 4.830957889556885, "rewards/rejected": -2.984905958175659, "step": 467 }, { "epoch": 1.4432474028472488, "grad_norm": 4.7528228759765625, "learning_rate": 8.558641975308641e-07, "logits/chosen": -2.662910223007202, "logits/rejected": -2.756408214569092, "logps/chosen": -36.1470947265625, "logps/rejected": -96.879150390625, "loss": 0.5546, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9447842836380005, "rewards/margins": 4.270630836486816, "rewards/rejected": -2.3258466720581055, "step": 468 }, { "epoch": 1.446325509811466, "grad_norm": 4.504846096038818, "learning_rate": 8.555555555555555e-07, "logits/chosen": -2.8626699447631836, "logits/rejected": -2.822871685028076, "logps/chosen": -43.89458084106445, "logps/rejected": -105.03414154052734, "loss": 0.5619, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4150751829147339, "rewards/margins": 4.379446983337402, "rewards/rejected": -2.9643714427948, "step": 469 }, { "epoch": 1.449403616775683, "grad_norm": 4.702287197113037, "learning_rate": 8.552469135802469e-07, "logits/chosen": -2.6789777278900146, "logits/rejected": -2.743854284286499, "logps/chosen": -34.14011001586914, "logps/rejected": -117.08099365234375, "loss": 0.4575, "rewards/accuracies": 0.96875, "rewards/chosen": 2.345226287841797, "rewards/margins": 5.83968448638916, "rewards/rejected": -3.4944584369659424, "step": 470 }, { "epoch": 1.4524817237399, "grad_norm": 4.136104583740234, "learning_rate": 8.549382716049382e-07, "logits/chosen": -2.7864415645599365, "logits/rejected": -2.7234816551208496, "logps/chosen": -40.768917083740234, "logps/rejected": -102.11028289794922, "loss": 0.5443, "rewards/accuracies": 0.9375, "rewards/chosen": 1.630904197692871, "rewards/margins": 4.295895576477051, "rewards/rejected": -2.6649909019470215, "step": 471 }, { "epoch": 1.455559830704117, "grad_norm": 5.651308059692383, "learning_rate": 8.546296296296296e-07, "logits/chosen": -2.717282295227051, "logits/rejected": -2.7192752361297607, "logps/chosen": -48.56663513183594, "logps/rejected": -123.0557861328125, "loss": 0.4946, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2100317478179932, "rewards/margins": 5.061887741088867, "rewards/rejected": -3.851855754852295, "step": 472 }, { "epoch": 1.458637937668334, "grad_norm": 4.237666130065918, "learning_rate": 8.543209876543209e-07, "logits/chosen": -2.761305809020996, "logits/rejected": -2.7937214374542236, "logps/chosen": -41.512596130371094, "logps/rejected": -110.2935562133789, "loss": 0.6033, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3353779315948486, "rewards/margins": 4.732450008392334, "rewards/rejected": -3.3970723152160645, "step": 473 }, { "epoch": 1.461716044632551, "grad_norm": 4.949310302734375, "learning_rate": 8.540123456790123e-07, "logits/chosen": -2.762453079223633, "logits/rejected": -2.763327121734619, "logps/chosen": -56.74636459350586, "logps/rejected": -117.57953643798828, "loss": 0.6399, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2792595326900482, "rewards/margins": 3.7911365032196045, "rewards/rejected": -3.5118765830993652, "step": 474 }, { "epoch": 1.464794151596768, "grad_norm": 4.949736595153809, "learning_rate": 8.537037037037037e-07, "logits/chosen": -2.7186837196350098, "logits/rejected": -2.7776665687561035, "logps/chosen": -42.17243194580078, "logps/rejected": -118.80086517333984, "loss": 0.5352, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3910211324691772, "rewards/margins": 5.00819730758667, "rewards/rejected": -3.617176055908203, "step": 475 }, { "epoch": 1.467872258560985, "grad_norm": 4.415830135345459, "learning_rate": 8.53395061728395e-07, "logits/chosen": -2.755032539367676, "logits/rejected": -2.7535457611083984, "logps/chosen": -50.48781967163086, "logps/rejected": -116.37399291992188, "loss": 0.6095, "rewards/accuracies": 1.0, "rewards/chosen": 1.0270335674285889, "rewards/margins": 4.5157389640808105, "rewards/rejected": -3.4887051582336426, "step": 476 }, { "epoch": 1.4709503655252019, "grad_norm": 5.002704620361328, "learning_rate": 8.530864197530864e-07, "logits/chosen": -2.7335546016693115, "logits/rejected": -2.7002451419830322, "logps/chosen": -52.007293701171875, "logps/rejected": -120.78654479980469, "loss": 0.5194, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6105967164039612, "rewards/margins": 4.6751532554626465, "rewards/rejected": -4.06455659866333, "step": 477 }, { "epoch": 1.474028472489419, "grad_norm": 5.162557125091553, "learning_rate": 8.527777777777777e-07, "logits/chosen": -2.7832419872283936, "logits/rejected": -2.7200634479522705, "logps/chosen": -33.15583801269531, "logps/rejected": -114.92369842529297, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 2.1984264850616455, "rewards/margins": 5.643194198608398, "rewards/rejected": -3.444767713546753, "step": 478 }, { "epoch": 1.477106579453636, "grad_norm": 3.964069128036499, "learning_rate": 8.524691358024691e-07, "logits/chosen": -2.721179485321045, "logits/rejected": -2.77116060256958, "logps/chosen": -44.868438720703125, "logps/rejected": -108.08790588378906, "loss": 0.5329, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2856192588806152, "rewards/margins": 4.270518779754639, "rewards/rejected": -2.9848997592926025, "step": 479 }, { "epoch": 1.480184686417853, "grad_norm": 5.6409125328063965, "learning_rate": 8.521604938271605e-07, "logits/chosen": -2.708221435546875, "logits/rejected": -2.7126007080078125, "logps/chosen": -44.64219284057617, "logps/rejected": -125.114990234375, "loss": 0.4798, "rewards/accuracies": 0.96875, "rewards/chosen": 1.384803056716919, "rewards/margins": 5.536199569702148, "rewards/rejected": -4.151396751403809, "step": 480 }, { "epoch": 1.4832627933820701, "grad_norm": 3.8323006629943848, "learning_rate": 8.518518518518518e-07, "logits/chosen": -2.7432656288146973, "logits/rejected": -2.6711394786834717, "logps/chosen": -38.02014923095703, "logps/rejected": -116.26927947998047, "loss": 0.4897, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7245490550994873, "rewards/margins": 5.287727355957031, "rewards/rejected": -3.563178539276123, "step": 481 }, { "epoch": 1.486340900346287, "grad_norm": 4.859666347503662, "learning_rate": 8.515432098765431e-07, "logits/chosen": -2.8119473457336426, "logits/rejected": -2.7510855197906494, "logps/chosen": -38.08110809326172, "logps/rejected": -112.38801574707031, "loss": 0.4906, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1018898487091064, "rewards/margins": 5.201578617095947, "rewards/rejected": -3.099688768386841, "step": 482 }, { "epoch": 1.489419007310504, "grad_norm": 4.32905387878418, "learning_rate": 8.512345679012345e-07, "logits/chosen": -2.710266351699829, "logits/rejected": -2.7388720512390137, "logps/chosen": -33.68647384643555, "logps/rejected": -112.65975952148438, "loss": 0.3807, "rewards/accuracies": 0.9375, "rewards/chosen": 2.191981315612793, "rewards/margins": 5.735570430755615, "rewards/rejected": -3.543588638305664, "step": 483 }, { "epoch": 1.492497114274721, "grad_norm": 4.2683491706848145, "learning_rate": 8.509259259259259e-07, "logits/chosen": -2.7253239154815674, "logits/rejected": -2.756741523742676, "logps/chosen": -35.77650833129883, "logps/rejected": -96.03495025634766, "loss": 0.511, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0169856548309326, "rewards/margins": 4.601327896118164, "rewards/rejected": -2.5843424797058105, "step": 484 }, { "epoch": 1.495575221238938, "grad_norm": 3.812722682952881, "learning_rate": 8.506172839506173e-07, "logits/chosen": -2.6540040969848633, "logits/rejected": -2.7000749111175537, "logps/chosen": -33.93217468261719, "logps/rejected": -127.37173461914062, "loss": 0.3835, "rewards/accuracies": 0.90625, "rewards/chosen": 2.1578807830810547, "rewards/margins": 6.447052001953125, "rewards/rejected": -4.28917121887207, "step": 485 }, { "epoch": 1.498653328203155, "grad_norm": 4.962221622467041, "learning_rate": 8.503086419753086e-07, "logits/chosen": -2.822633981704712, "logits/rejected": -2.7484469413757324, "logps/chosen": -39.87115478515625, "logps/rejected": -107.38420104980469, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": 1.771332025527954, "rewards/margins": 4.690587520599365, "rewards/rejected": -2.919255018234253, "step": 486 }, { "epoch": 1.501731435167372, "grad_norm": 3.7859408855438232, "learning_rate": 8.499999999999999e-07, "logits/chosen": -2.7708358764648438, "logits/rejected": -2.735015869140625, "logps/chosen": -43.36330795288086, "logps/rejected": -120.37162780761719, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": 1.4874130487442017, "rewards/margins": 5.425991058349609, "rewards/rejected": -3.9385783672332764, "step": 487 }, { "epoch": 1.504809542131589, "grad_norm": 3.870828628540039, "learning_rate": 8.496913580246913e-07, "logits/chosen": -2.7577390670776367, "logits/rejected": -2.8216488361358643, "logps/chosen": -41.11439514160156, "logps/rejected": -122.983154296875, "loss": 0.4393, "rewards/accuracies": 0.875, "rewards/chosen": 1.7824617624282837, "rewards/margins": 5.7971625328063965, "rewards/rejected": -4.014700889587402, "step": 488 }, { "epoch": 1.5078876490958062, "grad_norm": 3.5948283672332764, "learning_rate": 8.493827160493827e-07, "logits/chosen": -2.805957555770874, "logits/rejected": -2.7967541217803955, "logps/chosen": -48.15707015991211, "logps/rejected": -115.50585174560547, "loss": 0.6186, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7616106271743774, "rewards/margins": 4.23968505859375, "rewards/rejected": -3.478074073791504, "step": 489 }, { "epoch": 1.5109657560600231, "grad_norm": 4.667148113250732, "learning_rate": 8.49074074074074e-07, "logits/chosen": -2.6765687465667725, "logits/rejected": -2.7689473628997803, "logps/chosen": -38.41400909423828, "logps/rejected": -94.75465393066406, "loss": 0.5567, "rewards/accuracies": 0.9375, "rewards/chosen": 1.552908182144165, "rewards/margins": 4.0382232666015625, "rewards/rejected": -2.4853148460388184, "step": 490 }, { "epoch": 1.51404386302424, "grad_norm": 3.4014790058135986, "learning_rate": 8.487654320987654e-07, "logits/chosen": -2.7238664627075195, "logits/rejected": -2.7551400661468506, "logps/chosen": -32.76210403442383, "logps/rejected": -95.5943374633789, "loss": 0.5407, "rewards/accuracies": 0.84375, "rewards/chosen": 2.392857074737549, "rewards/margins": 4.456665992736816, "rewards/rejected": -2.0638089179992676, "step": 491 }, { "epoch": 1.517121969988457, "grad_norm": 3.339066505432129, "learning_rate": 8.484567901234567e-07, "logits/chosen": -2.8800277709960938, "logits/rejected": -2.815422534942627, "logps/chosen": -44.78589630126953, "logps/rejected": -104.27519989013672, "loss": 0.5849, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4154911041259766, "rewards/margins": 4.169369697570801, "rewards/rejected": -2.753878593444824, "step": 492 }, { "epoch": 1.520200076952674, "grad_norm": 3.881460428237915, "learning_rate": 8.48148148148148e-07, "logits/chosen": -2.798307180404663, "logits/rejected": -2.770236015319824, "logps/chosen": -41.99332046508789, "logps/rejected": -107.03468322753906, "loss": 0.5463, "rewards/accuracies": 0.96875, "rewards/chosen": 1.771614670753479, "rewards/margins": 4.9616475105285645, "rewards/rejected": -3.190033197402954, "step": 493 }, { "epoch": 1.523278183916891, "grad_norm": 6.376860618591309, "learning_rate": 8.478395061728395e-07, "logits/chosen": -2.8084936141967773, "logits/rejected": -2.8227500915527344, "logps/chosen": -57.712100982666016, "logps/rejected": -127.9955825805664, "loss": 0.5295, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5201593637466431, "rewards/margins": 4.954164028167725, "rewards/rejected": -4.434004783630371, "step": 494 }, { "epoch": 1.5263562908811081, "grad_norm": 5.038609981536865, "learning_rate": 8.475308641975308e-07, "logits/chosen": -2.745231866836548, "logits/rejected": -2.6965017318725586, "logps/chosen": -35.140708923339844, "logps/rejected": -103.59317016601562, "loss": 0.5057, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0185327529907227, "rewards/margins": 4.5576372146606445, "rewards/rejected": -2.539104461669922, "step": 495 }, { "epoch": 1.529434397845325, "grad_norm": 4.636759281158447, "learning_rate": 8.472222222222222e-07, "logits/chosen": -2.6532914638519287, "logits/rejected": -2.7322375774383545, "logps/chosen": -31.883405685424805, "logps/rejected": -111.83598327636719, "loss": 0.4428, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1752500534057617, "rewards/margins": 5.7372894287109375, "rewards/rejected": -3.562040328979492, "step": 496 }, { "epoch": 1.5325125048095423, "grad_norm": 4.782121181488037, "learning_rate": 8.469135802469135e-07, "logits/chosen": -2.706293821334839, "logits/rejected": -2.7068746089935303, "logps/chosen": -36.80719757080078, "logps/rejected": -121.6998519897461, "loss": 0.5152, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9718390703201294, "rewards/margins": 5.856916904449463, "rewards/rejected": -3.885077953338623, "step": 497 }, { "epoch": 1.5355906117737592, "grad_norm": 3.7541115283966064, "learning_rate": 8.466049382716049e-07, "logits/chosen": -2.810309410095215, "logits/rejected": -2.886561393737793, "logps/chosen": -44.89790344238281, "logps/rejected": -96.82682037353516, "loss": 0.5839, "rewards/accuracies": 0.75, "rewards/chosen": 1.5667283535003662, "rewards/margins": 3.9840102195739746, "rewards/rejected": -2.4172821044921875, "step": 498 }, { "epoch": 1.5386687187379762, "grad_norm": 5.609348297119141, "learning_rate": 8.462962962962963e-07, "logits/chosen": -2.6939167976379395, "logits/rejected": -2.7316713333129883, "logps/chosen": -57.28856658935547, "logps/rejected": -113.75658416748047, "loss": 0.6534, "rewards/accuracies": 0.96875, "rewards/chosen": 0.43136340379714966, "rewards/margins": 4.164360046386719, "rewards/rejected": -3.7329964637756348, "step": 499 }, { "epoch": 1.5417468257021931, "grad_norm": 4.3728437423706055, "learning_rate": 8.459876543209876e-07, "logits/chosen": -2.7580032348632812, "logits/rejected": -2.7517099380493164, "logps/chosen": -40.345462799072266, "logps/rejected": -103.32365417480469, "loss": 0.5079, "rewards/accuracies": 1.0, "rewards/chosen": 1.8741755485534668, "rewards/margins": 4.727820873260498, "rewards/rejected": -2.853645086288452, "step": 500 }, { "epoch": 1.54482493266641, "grad_norm": 4.261357307434082, "learning_rate": 8.456790123456789e-07, "logits/chosen": -2.666248083114624, "logits/rejected": -2.6998023986816406, "logps/chosen": -36.84687042236328, "logps/rejected": -120.39249420166016, "loss": 0.4565, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1397266387939453, "rewards/margins": 5.794426441192627, "rewards/rejected": -3.6546990871429443, "step": 501 }, { "epoch": 1.547903039630627, "grad_norm": 5.07733678817749, "learning_rate": 8.453703703703703e-07, "logits/chosen": -2.7421677112579346, "logits/rejected": -2.774655818939209, "logps/chosen": -44.8667106628418, "logps/rejected": -102.56123352050781, "loss": 0.568, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5368496179580688, "rewards/margins": 4.300107955932617, "rewards/rejected": -2.7632577419281006, "step": 502 }, { "epoch": 1.5509811465948442, "grad_norm": 3.6294636726379395, "learning_rate": 8.450617283950617e-07, "logits/chosen": -2.652747631072998, "logits/rejected": -2.670536518096924, "logps/chosen": -44.06578826904297, "logps/rejected": -109.22113800048828, "loss": 0.5693, "rewards/accuracies": 1.0, "rewards/chosen": 1.4648761749267578, "rewards/margins": 4.539705276489258, "rewards/rejected": -3.074828624725342, "step": 503 }, { "epoch": 1.5540592535590612, "grad_norm": 4.5922675132751465, "learning_rate": 8.447530864197531e-07, "logits/chosen": -2.8413479328155518, "logits/rejected": -2.815887212753296, "logps/chosen": -43.25503158569336, "logps/rejected": -110.00958251953125, "loss": 0.5137, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5584557056427002, "rewards/margins": 4.777260780334473, "rewards/rejected": -3.2188053131103516, "step": 504 }, { "epoch": 1.5571373605232783, "grad_norm": 4.4778361320495605, "learning_rate": 8.444444444444444e-07, "logits/chosen": -2.743068218231201, "logits/rejected": -2.7870683670043945, "logps/chosen": -34.78040313720703, "logps/rejected": -101.14128875732422, "loss": 0.4484, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2400217056274414, "rewards/margins": 4.814328670501709, "rewards/rejected": -2.5743069648742676, "step": 505 }, { "epoch": 1.5602154674874953, "grad_norm": 3.2315032482147217, "learning_rate": 8.441358024691357e-07, "logits/chosen": -2.7309696674346924, "logits/rejected": -2.727827310562134, "logps/chosen": -55.318172454833984, "logps/rejected": -111.39413452148438, "loss": 0.6231, "rewards/accuracies": 0.875, "rewards/chosen": 0.7146507501602173, "rewards/margins": 4.252204418182373, "rewards/rejected": -3.537553310394287, "step": 506 }, { "epoch": 1.5632935744517122, "grad_norm": 4.121606826782227, "learning_rate": 8.438271604938271e-07, "logits/chosen": -2.798715829849243, "logits/rejected": -2.751662492752075, "logps/chosen": -59.149532318115234, "logps/rejected": -121.52283477783203, "loss": 0.5711, "rewards/accuracies": 0.96875, "rewards/chosen": 0.48452848196029663, "rewards/margins": 4.474173545837402, "rewards/rejected": -3.989644765853882, "step": 507 }, { "epoch": 1.5663716814159292, "grad_norm": 5.301509380340576, "learning_rate": 8.435185185185185e-07, "logits/chosen": -2.8409085273742676, "logits/rejected": -2.7984955310821533, "logps/chosen": -58.15578842163086, "logps/rejected": -129.79901123046875, "loss": 0.5001, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45676153898239136, "rewards/margins": 5.003209114074707, "rewards/rejected": -4.546446800231934, "step": 508 }, { "epoch": 1.5694497883801461, "grad_norm": 4.088747024536133, "learning_rate": 8.432098765432098e-07, "logits/chosen": -2.7595841884613037, "logits/rejected": -2.7096688747406006, "logps/chosen": -52.821632385253906, "logps/rejected": -125.05866241455078, "loss": 0.5769, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7380809783935547, "rewards/margins": 4.929512023925781, "rewards/rejected": -4.191431999206543, "step": 509 }, { "epoch": 1.572527895344363, "grad_norm": 4.709401607513428, "learning_rate": 8.429012345679012e-07, "logits/chosen": -2.8028042316436768, "logits/rejected": -2.7442028522491455, "logps/chosen": -49.486454010009766, "logps/rejected": -112.45415496826172, "loss": 0.5738, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9891164302825928, "rewards/margins": 4.472884654998779, "rewards/rejected": -3.4837684631347656, "step": 510 }, { "epoch": 1.5756060023085803, "grad_norm": 5.187496185302734, "learning_rate": 8.425925925925925e-07, "logits/chosen": -2.7903099060058594, "logits/rejected": -2.7877988815307617, "logps/chosen": -42.41993713378906, "logps/rejected": -103.96977233886719, "loss": 0.5609, "rewards/accuracies": 0.875, "rewards/chosen": 1.6963454484939575, "rewards/margins": 4.379185676574707, "rewards/rejected": -2.682840347290039, "step": 511 }, { "epoch": 1.5786841092727972, "grad_norm": 4.790981769561768, "learning_rate": 8.422839506172839e-07, "logits/chosen": -2.806591272354126, "logits/rejected": -2.850968599319458, "logps/chosen": -46.455047607421875, "logps/rejected": -98.87523651123047, "loss": 0.6302, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4359986782073975, "rewards/margins": 3.6839537620544434, "rewards/rejected": -2.247955083847046, "step": 512 }, { "epoch": 1.5817622162370142, "grad_norm": 5.927247047424316, "learning_rate": 8.419753086419753e-07, "logits/chosen": -2.736030101776123, "logits/rejected": -2.7722554206848145, "logps/chosen": -62.2666130065918, "logps/rejected": -134.27569580078125, "loss": 0.5893, "rewards/accuracies": 0.875, "rewards/chosen": 0.22744455933570862, "rewards/margins": 4.851947784423828, "rewards/rejected": -4.624503135681152, "step": 513 }, { "epoch": 1.5848403232012314, "grad_norm": 5.178743839263916, "learning_rate": 8.416666666666666e-07, "logits/chosen": -2.742907762527466, "logits/rejected": -2.7399637699127197, "logps/chosen": -40.50349426269531, "logps/rejected": -114.65771484375, "loss": 0.4655, "rewards/accuracies": 1.0, "rewards/chosen": 1.983993649482727, "rewards/margins": 5.3552985191345215, "rewards/rejected": -3.371304750442505, "step": 514 }, { "epoch": 1.5879184301654483, "grad_norm": 4.536141395568848, "learning_rate": 8.41358024691358e-07, "logits/chosen": -2.7290210723876953, "logits/rejected": -2.8036437034606934, "logps/chosen": -38.876556396484375, "logps/rejected": -106.41812133789062, "loss": 0.5638, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6684341430664062, "rewards/margins": 4.627778053283691, "rewards/rejected": -2.9593443870544434, "step": 515 }, { "epoch": 1.5909965371296653, "grad_norm": 5.796743869781494, "learning_rate": 8.410493827160493e-07, "logits/chosen": -2.7719008922576904, "logits/rejected": -2.8026058673858643, "logps/chosen": -51.075931549072266, "logps/rejected": -109.79564666748047, "loss": 0.6309, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6038020253181458, "rewards/margins": 3.862971544265747, "rewards/rejected": -3.259169578552246, "step": 516 }, { "epoch": 1.5940746440938822, "grad_norm": 4.330446243286133, "learning_rate": 8.407407407407407e-07, "logits/chosen": -2.795693874359131, "logits/rejected": -2.742950201034546, "logps/chosen": -45.17516326904297, "logps/rejected": -112.06242370605469, "loss": 0.5648, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2843668460845947, "rewards/margins": 4.530392646789551, "rewards/rejected": -3.246025562286377, "step": 517 }, { "epoch": 1.5971527510580992, "grad_norm": 3.1671829223632812, "learning_rate": 8.404320987654321e-07, "logits/chosen": -2.64774489402771, "logits/rejected": -2.6915366649627686, "logps/chosen": -33.06139373779297, "logps/rejected": -122.30655670166016, "loss": 0.4275, "rewards/accuracies": 1.0, "rewards/chosen": 2.079439401626587, "rewards/margins": 6.290975093841553, "rewards/rejected": -4.211535453796387, "step": 518 }, { "epoch": 1.6002308580223161, "grad_norm": 5.147547245025635, "learning_rate": 8.401234567901234e-07, "logits/chosen": -2.8206825256347656, "logits/rejected": -2.804516553878784, "logps/chosen": -31.278053283691406, "logps/rejected": -96.90584564208984, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 2.424659490585327, "rewards/margins": 4.58096981048584, "rewards/rejected": -2.1563103199005127, "step": 519 }, { "epoch": 1.6033089649865333, "grad_norm": 5.566901206970215, "learning_rate": 8.398148148148147e-07, "logits/chosen": -2.8209993839263916, "logits/rejected": -2.8194003105163574, "logps/chosen": -37.60646057128906, "logps/rejected": -95.83033752441406, "loss": 0.598, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0057709217071533, "rewards/margins": 4.071435928344727, "rewards/rejected": -2.0656654834747314, "step": 520 }, { "epoch": 1.6033089649865333, "eval_logits/chosen": -2.700025796890259, "eval_logits/rejected": -2.753866672515869, "eval_logps/chosen": -44.736572265625, "eval_logps/rejected": -113.02586364746094, "eval_loss": 0.5745362043380737, "eval_rewards/accuracies": 0.9046242833137512, "eval_rewards/chosen": 1.3537219762802124, "eval_rewards/margins": 4.6397247314453125, "eval_rewards/rejected": -3.2860026359558105, "eval_runtime": 641.5493, "eval_samples_per_second": 0.539, "eval_steps_per_second": 0.27, "step": 520 }, { "epoch": 1.6063870719507503, "grad_norm": 4.772017002105713, "learning_rate": 8.395061728395061e-07, "logits/chosen": -2.912543296813965, "logits/rejected": -2.8180339336395264, "logps/chosen": -37.83483123779297, "logps/rejected": -115.2734603881836, "loss": 0.5, "rewards/accuracies": 0.875, "rewards/chosen": 1.8587650060653687, "rewards/margins": 5.392540454864502, "rewards/rejected": -3.5337753295898438, "step": 521 }, { "epoch": 1.6094651789149674, "grad_norm": 4.170019626617432, "learning_rate": 8.391975308641975e-07, "logits/chosen": -2.7965431213378906, "logits/rejected": -2.813425064086914, "logps/chosen": -48.42216110229492, "logps/rejected": -119.71609497070312, "loss": 0.5458, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2532944679260254, "rewards/margins": 5.066788196563721, "rewards/rejected": -3.8134937286376953, "step": 522 }, { "epoch": 1.6125432858791844, "grad_norm": 6.2316789627075195, "learning_rate": 8.388888888888888e-07, "logits/chosen": -2.7937779426574707, "logits/rejected": -2.7240707874298096, "logps/chosen": -42.598968505859375, "logps/rejected": -102.9259262084961, "loss": 0.5794, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5740327835083008, "rewards/margins": 4.291625499725342, "rewards/rejected": -2.717592716217041, "step": 523 }, { "epoch": 1.6156213928434013, "grad_norm": 4.7185893058776855, "learning_rate": 8.385802469135802e-07, "logits/chosen": -2.7160696983337402, "logits/rejected": -2.7269859313964844, "logps/chosen": -45.619544982910156, "logps/rejected": -114.65634155273438, "loss": 0.5596, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2003138065338135, "rewards/margins": 5.034206867218018, "rewards/rejected": -3.833893299102783, "step": 524 }, { "epoch": 1.6186994998076183, "grad_norm": 5.276822090148926, "learning_rate": 8.382716049382715e-07, "logits/chosen": -2.642098903656006, "logits/rejected": -2.66418194770813, "logps/chosen": -41.14723205566406, "logps/rejected": -99.87583923339844, "loss": 0.5977, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4101917743682861, "rewards/margins": 3.859034538269043, "rewards/rejected": -2.448842763900757, "step": 525 }, { "epoch": 1.6217776067718352, "grad_norm": 5.17531681060791, "learning_rate": 8.379629629629629e-07, "logits/chosen": -2.8557260036468506, "logits/rejected": -2.896378993988037, "logps/chosen": -46.44449234008789, "logps/rejected": -120.08366394042969, "loss": 0.4587, "rewards/accuracies": 1.0, "rewards/chosen": 1.5344505310058594, "rewards/margins": 5.408830165863037, "rewards/rejected": -3.8743796348571777, "step": 526 }, { "epoch": 1.6248557137360522, "grad_norm": 3.773433208465576, "learning_rate": 8.376543209876543e-07, "logits/chosen": -2.777895927429199, "logits/rejected": -2.7725586891174316, "logps/chosen": -53.08352279663086, "logps/rejected": -116.89885711669922, "loss": 0.5748, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7723179459571838, "rewards/margins": 4.572566032409668, "rewards/rejected": -3.800248146057129, "step": 527 }, { "epoch": 1.6279338207002694, "grad_norm": 4.778159141540527, "learning_rate": 8.373456790123456e-07, "logits/chosen": -2.741117000579834, "logits/rejected": -2.746461868286133, "logps/chosen": -48.39366912841797, "logps/rejected": -118.52122497558594, "loss": 0.5251, "rewards/accuracies": 1.0, "rewards/chosen": 1.0771385431289673, "rewards/margins": 4.783949851989746, "rewards/rejected": -3.7068119049072266, "step": 528 }, { "epoch": 1.6310119276644863, "grad_norm": 5.061434268951416, "learning_rate": 8.37037037037037e-07, "logits/chosen": -2.7763288021087646, "logits/rejected": -2.763533353805542, "logps/chosen": -44.006622314453125, "logps/rejected": -105.15216827392578, "loss": 0.4773, "rewards/accuracies": 0.875, "rewards/chosen": 1.589280366897583, "rewards/margins": 4.604839324951172, "rewards/rejected": -3.0155584812164307, "step": 529 }, { "epoch": 1.6340900346287035, "grad_norm": 4.6599555015563965, "learning_rate": 8.367283950617283e-07, "logits/chosen": -2.7483179569244385, "logits/rejected": -2.781324863433838, "logps/chosen": -52.470584869384766, "logps/rejected": -114.3968505859375, "loss": 0.5379, "rewards/accuracies": 0.875, "rewards/chosen": 1.0338879823684692, "rewards/margins": 4.57088041305542, "rewards/rejected": -3.5369927883148193, "step": 530 }, { "epoch": 1.6371681415929205, "grad_norm": 4.470277309417725, "learning_rate": 8.364197530864197e-07, "logits/chosen": -2.8195102214813232, "logits/rejected": -2.8254354000091553, "logps/chosen": -41.6094970703125, "logps/rejected": -109.1093521118164, "loss": 0.5007, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4761462211608887, "rewards/margins": 4.90103816986084, "rewards/rejected": -3.424891710281372, "step": 531 }, { "epoch": 1.6402462485571374, "grad_norm": 4.26698112487793, "learning_rate": 8.361111111111111e-07, "logits/chosen": -2.777982234954834, "logits/rejected": -2.800537586212158, "logps/chosen": -36.487667083740234, "logps/rejected": -111.05683898925781, "loss": 0.3741, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8741334676742554, "rewards/margins": 5.503963947296143, "rewards/rejected": -3.6298301219940186, "step": 532 }, { "epoch": 1.6433243555213544, "grad_norm": 4.76511287689209, "learning_rate": 8.358024691358024e-07, "logits/chosen": -2.7822251319885254, "logits/rejected": -2.812281847000122, "logps/chosen": -51.25007629394531, "logps/rejected": -104.97879028320312, "loss": 0.5464, "rewards/accuracies": 1.0, "rewards/chosen": 0.7188231945037842, "rewards/margins": 3.997093915939331, "rewards/rejected": -3.278270721435547, "step": 533 }, { "epoch": 1.6464024624855713, "grad_norm": 3.5384271144866943, "learning_rate": 8.354938271604937e-07, "logits/chosen": -2.842836380004883, "logits/rejected": -2.7885217666625977, "logps/chosen": -54.60448455810547, "logps/rejected": -131.4335479736328, "loss": 0.5217, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40869665145874023, "rewards/margins": 5.443045616149902, "rewards/rejected": -5.03434944152832, "step": 534 }, { "epoch": 1.6494805694497883, "grad_norm": 3.840157985687256, "learning_rate": 8.351851851851851e-07, "logits/chosen": -2.7710418701171875, "logits/rejected": -2.7643356323242188, "logps/chosen": -46.969478607177734, "logps/rejected": -118.15895080566406, "loss": 0.5726, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3425853252410889, "rewards/margins": 4.953789234161377, "rewards/rejected": -3.611203908920288, "step": 535 }, { "epoch": 1.6525586764140052, "grad_norm": 4.075953483581543, "learning_rate": 8.348765432098765e-07, "logits/chosen": -2.7875092029571533, "logits/rejected": -2.780473232269287, "logps/chosen": -45.73774337768555, "logps/rejected": -116.96512603759766, "loss": 0.4828, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2232680320739746, "rewards/margins": 5.247304439544678, "rewards/rejected": -4.024036884307861, "step": 536 }, { "epoch": 1.6556367833782224, "grad_norm": 4.822761058807373, "learning_rate": 8.345679012345679e-07, "logits/chosen": -2.7757325172424316, "logits/rejected": -2.7774364948272705, "logps/chosen": -37.900665283203125, "logps/rejected": -106.13534545898438, "loss": 0.5053, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7947278022766113, "rewards/margins": 4.742358684539795, "rewards/rejected": -2.9476308822631836, "step": 537 }, { "epoch": 1.6587148903424394, "grad_norm": 4.459584712982178, "learning_rate": 8.342592592592592e-07, "logits/chosen": -2.7939705848693848, "logits/rejected": -2.802330255508423, "logps/chosen": -45.133174896240234, "logps/rejected": -124.80856323242188, "loss": 0.4379, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3764115571975708, "rewards/margins": 5.657584190368652, "rewards/rejected": -4.281172752380371, "step": 538 }, { "epoch": 1.6617929973066565, "grad_norm": 4.304520130157471, "learning_rate": 8.339506172839505e-07, "logits/chosen": -2.6820034980773926, "logits/rejected": -2.671043634414673, "logps/chosen": -56.23412322998047, "logps/rejected": -107.44029235839844, "loss": 0.6676, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6152776479721069, "rewards/margins": 3.623617649078369, "rewards/rejected": -3.0083398818969727, "step": 539 }, { "epoch": 1.6648711042708735, "grad_norm": 4.2851481437683105, "learning_rate": 8.336419753086419e-07, "logits/chosen": -2.8086647987365723, "logits/rejected": -2.747087001800537, "logps/chosen": -43.0950813293457, "logps/rejected": -105.5395278930664, "loss": 0.5702, "rewards/accuracies": 1.0, "rewards/chosen": 1.448526382446289, "rewards/margins": 4.4713335037231445, "rewards/rejected": -3.0228068828582764, "step": 540 }, { "epoch": 1.6679492112350904, "grad_norm": 4.236932754516602, "learning_rate": 8.333333333333333e-07, "logits/chosen": -2.740388870239258, "logits/rejected": -2.7429497241973877, "logps/chosen": -40.98859786987305, "logps/rejected": -115.96564483642578, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 1.4018762111663818, "rewards/margins": 5.25253963470459, "rewards/rejected": -3.850663423538208, "step": 541 }, { "epoch": 1.6710273181993074, "grad_norm": 4.387599468231201, "learning_rate": 8.330246913580246e-07, "logits/chosen": -2.687406301498413, "logits/rejected": -2.7051095962524414, "logps/chosen": -51.79591751098633, "logps/rejected": -117.76292419433594, "loss": 0.5438, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8477526903152466, "rewards/margins": 4.753791332244873, "rewards/rejected": -3.906038522720337, "step": 542 }, { "epoch": 1.6741054251635243, "grad_norm": 5.771734714508057, "learning_rate": 8.32716049382716e-07, "logits/chosen": -2.713292360305786, "logits/rejected": -2.772505760192871, "logps/chosen": -52.84925079345703, "logps/rejected": -113.53691864013672, "loss": 0.5144, "rewards/accuracies": 0.9375, "rewards/chosen": 0.684005856513977, "rewards/margins": 4.785798072814941, "rewards/rejected": -4.101792335510254, "step": 543 }, { "epoch": 1.6771835321277413, "grad_norm": 3.17818546295166, "learning_rate": 8.324074074074073e-07, "logits/chosen": -2.809704542160034, "logits/rejected": -2.7547476291656494, "logps/chosen": -66.78567504882812, "logps/rejected": -119.91973114013672, "loss": 0.6469, "rewards/accuracies": 0.875, "rewards/chosen": -0.2767106890678406, "rewards/margins": 3.9285125732421875, "rewards/rejected": -4.205223560333252, "step": 544 }, { "epoch": 1.6802616390919585, "grad_norm": 3.6845574378967285, "learning_rate": 8.320987654320987e-07, "logits/chosen": -2.743264675140381, "logits/rejected": -2.6997029781341553, "logps/chosen": -61.54037094116211, "logps/rejected": -118.62265014648438, "loss": 0.6071, "rewards/accuracies": 0.84375, "rewards/chosen": 0.21599604189395905, "rewards/margins": 3.9527089595794678, "rewards/rejected": -3.736712694168091, "step": 545 }, { "epoch": 1.6833397460561754, "grad_norm": 5.207515716552734, "learning_rate": 8.317901234567901e-07, "logits/chosen": -2.8306727409362793, "logits/rejected": -2.7719576358795166, "logps/chosen": -49.65234375, "logps/rejected": -127.62135314941406, "loss": 0.4659, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9178534150123596, "rewards/margins": 5.618062496185303, "rewards/rejected": -4.700209140777588, "step": 546 }, { "epoch": 1.6864178530203926, "grad_norm": 5.569070339202881, "learning_rate": 8.314814814814814e-07, "logits/chosen": -2.6809184551239014, "logits/rejected": -2.7545950412750244, "logps/chosen": -43.25208282470703, "logps/rejected": -121.96465301513672, "loss": 0.4414, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5176652669906616, "rewards/margins": 5.808688163757324, "rewards/rejected": -4.291022777557373, "step": 547 }, { "epoch": 1.6894959599846096, "grad_norm": 4.578601360321045, "learning_rate": 8.311728395061728e-07, "logits/chosen": -2.834775686264038, "logits/rejected": -2.7904036045074463, "logps/chosen": -46.22577667236328, "logps/rejected": -123.95277404785156, "loss": 0.4955, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3019592761993408, "rewards/margins": 5.5989532470703125, "rewards/rejected": -4.296994209289551, "step": 548 }, { "epoch": 1.6925740669488265, "grad_norm": 4.3135085105896, "learning_rate": 8.308641975308641e-07, "logits/chosen": -2.7521941661834717, "logits/rejected": -2.7618463039398193, "logps/chosen": -47.077667236328125, "logps/rejected": -101.16253662109375, "loss": 0.6034, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7696244716644287, "rewards/margins": 4.060938358306885, "rewards/rejected": -3.291314125061035, "step": 549 }, { "epoch": 1.6956521739130435, "grad_norm": 5.280683994293213, "learning_rate": 8.305555555555555e-07, "logits/chosen": -2.7859411239624023, "logits/rejected": -2.764540433883667, "logps/chosen": -34.539756774902344, "logps/rejected": -113.29496765136719, "loss": 0.4735, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8496322631835938, "rewards/margins": 5.493692874908447, "rewards/rejected": -3.6440608501434326, "step": 550 }, { "epoch": 1.6987302808772604, "grad_norm": 5.336928367614746, "learning_rate": 8.302469135802469e-07, "logits/chosen": -2.7077457904815674, "logits/rejected": -2.802110433578491, "logps/chosen": -51.723594665527344, "logps/rejected": -104.40586853027344, "loss": 0.6102, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8058369755744934, "rewards/margins": 3.909451484680176, "rewards/rejected": -3.103614330291748, "step": 551 }, { "epoch": 1.7018083878414774, "grad_norm": 4.212486743927002, "learning_rate": 8.299382716049382e-07, "logits/chosen": -2.7723588943481445, "logits/rejected": -2.7760279178619385, "logps/chosen": -58.55997085571289, "logps/rejected": -136.97097778320312, "loss": 0.5068, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41213661432266235, "rewards/margins": 5.736687660217285, "rewards/rejected": -5.324551105499268, "step": 552 }, { "epoch": 1.7048864948056945, "grad_norm": 4.422597885131836, "learning_rate": 8.296296296296295e-07, "logits/chosen": -2.779038429260254, "logits/rejected": -2.838407039642334, "logps/chosen": -39.53129959106445, "logps/rejected": -113.14078521728516, "loss": 0.4558, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4317162036895752, "rewards/margins": 5.555215358734131, "rewards/rejected": -4.123499393463135, "step": 553 }, { "epoch": 1.7079646017699115, "grad_norm": 4.9304118156433105, "learning_rate": 8.293209876543209e-07, "logits/chosen": -2.8577845096588135, "logits/rejected": -2.7810206413269043, "logps/chosen": -42.60234451293945, "logps/rejected": -113.71824645996094, "loss": 0.531, "rewards/accuracies": 0.9375, "rewards/chosen": 1.163018822669983, "rewards/margins": 5.081546306610107, "rewards/rejected": -3.918527126312256, "step": 554 }, { "epoch": 1.7110427087341287, "grad_norm": 5.503544330596924, "learning_rate": 8.290123456790123e-07, "logits/chosen": -2.800434112548828, "logits/rejected": -2.7546842098236084, "logps/chosen": -52.021976470947266, "logps/rejected": -135.24127197265625, "loss": 0.4414, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7573094367980957, "rewards/margins": 6.2309465408325195, "rewards/rejected": -5.473636627197266, "step": 555 }, { "epoch": 1.7141208156983456, "grad_norm": 3.7153778076171875, "learning_rate": 8.287037037037036e-07, "logits/chosen": -2.8590333461761475, "logits/rejected": -2.8626468181610107, "logps/chosen": -61.54450607299805, "logps/rejected": -116.6192398071289, "loss": 0.5984, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1931619942188263, "rewards/margins": 4.3831987380981445, "rewards/rejected": -4.190036773681641, "step": 556 }, { "epoch": 1.7171989226625626, "grad_norm": 7.474632740020752, "learning_rate": 8.28395061728395e-07, "logits/chosen": -2.8293685913085938, "logits/rejected": -2.7609760761260986, "logps/chosen": -50.455543518066406, "logps/rejected": -118.98310089111328, "loss": 0.5485, "rewards/accuracies": 0.875, "rewards/chosen": 0.7071155309677124, "rewards/margins": 4.982056617736816, "rewards/rejected": -4.2749409675598145, "step": 557 }, { "epoch": 1.7202770296267795, "grad_norm": 6.676851272583008, "learning_rate": 8.280864197530863e-07, "logits/chosen": -2.7407145500183105, "logits/rejected": -2.7451765537261963, "logps/chosen": -43.28877258300781, "logps/rejected": -120.1311264038086, "loss": 0.5262, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7262498140335083, "rewards/margins": 5.692273139953613, "rewards/rejected": -3.9660234451293945, "step": 558 }, { "epoch": 1.7233551365909965, "grad_norm": 4.60856819152832, "learning_rate": 8.277777777777777e-07, "logits/chosen": -2.8212621212005615, "logits/rejected": -2.8089511394500732, "logps/chosen": -56.96446990966797, "logps/rejected": -131.3027801513672, "loss": 0.5973, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6010758876800537, "rewards/margins": 5.465061187744141, "rewards/rejected": -4.86398458480835, "step": 559 }, { "epoch": 1.7264332435552134, "grad_norm": 4.84807825088501, "learning_rate": 8.274691358024691e-07, "logits/chosen": -2.7318902015686035, "logits/rejected": -2.7260336875915527, "logps/chosen": -59.99148941040039, "logps/rejected": -117.07235717773438, "loss": 0.6181, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0760895162820816, "rewards/margins": 4.057740688323975, "rewards/rejected": -4.1338300704956055, "step": 560 }, { "epoch": 1.7295113505194304, "grad_norm": 9.472513198852539, "learning_rate": 8.271604938271604e-07, "logits/chosen": -2.7676210403442383, "logits/rejected": -2.7781050205230713, "logps/chosen": -51.19005584716797, "logps/rejected": -138.58946228027344, "loss": 0.514, "rewards/accuracies": 1.0, "rewards/chosen": 0.7215098142623901, "rewards/margins": 6.086462020874023, "rewards/rejected": -5.364952087402344, "step": 561 }, { "epoch": 1.7325894574836476, "grad_norm": 5.455531597137451, "learning_rate": 8.268518518518518e-07, "logits/chosen": -2.7835769653320312, "logits/rejected": -2.7542734146118164, "logps/chosen": -52.9112548828125, "logps/rejected": -111.10183715820312, "loss": 0.6435, "rewards/accuracies": 1.0, "rewards/chosen": 0.8015432357788086, "rewards/margins": 4.372526168823242, "rewards/rejected": -3.5709824562072754, "step": 562 }, { "epoch": 1.7356675644478645, "grad_norm": 5.384328842163086, "learning_rate": 8.265432098765431e-07, "logits/chosen": -2.8194713592529297, "logits/rejected": -2.8627946376800537, "logps/chosen": -42.3115234375, "logps/rejected": -107.59199523925781, "loss": 0.4897, "rewards/accuracies": 0.875, "rewards/chosen": 1.3066678047180176, "rewards/margins": 4.487341403961182, "rewards/rejected": -3.1806740760803223, "step": 563 }, { "epoch": 1.7387456714120817, "grad_norm": 4.810291290283203, "learning_rate": 8.262345679012345e-07, "logits/chosen": -2.7699012756347656, "logits/rejected": -2.794985294342041, "logps/chosen": -58.529327392578125, "logps/rejected": -119.7598648071289, "loss": 0.5585, "rewards/accuracies": 0.90625, "rewards/chosen": 0.24361379444599152, "rewards/margins": 4.564545631408691, "rewards/rejected": -4.320931911468506, "step": 564 }, { "epoch": 1.7418237783762986, "grad_norm": 3.637054681777954, "learning_rate": 8.259259259259259e-07, "logits/chosen": -2.764756917953491, "logits/rejected": -2.777580738067627, "logps/chosen": -64.0434341430664, "logps/rejected": -117.38088989257812, "loss": 0.6373, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13741691410541534, "rewards/margins": 3.7515835762023926, "rewards/rejected": -3.8889999389648438, "step": 565 }, { "epoch": 1.7449018853405156, "grad_norm": 5.0616865158081055, "learning_rate": 8.256172839506172e-07, "logits/chosen": -2.72855281829834, "logits/rejected": -2.75590181350708, "logps/chosen": -38.798789978027344, "logps/rejected": -102.56137084960938, "loss": 0.5167, "rewards/accuracies": 0.875, "rewards/chosen": 1.941861867904663, "rewards/margins": 4.651712894439697, "rewards/rejected": -2.709851026535034, "step": 566 }, { "epoch": 1.7479799923047326, "grad_norm": 4.106424808502197, "learning_rate": 8.253086419753085e-07, "logits/chosen": -2.7791390419006348, "logits/rejected": -2.812635660171509, "logps/chosen": -51.233558654785156, "logps/rejected": -115.5835952758789, "loss": 0.5611, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8171654939651489, "rewards/margins": 4.888144016265869, "rewards/rejected": -4.070979595184326, "step": 567 }, { "epoch": 1.7510580992689495, "grad_norm": 6.285093307495117, "learning_rate": 8.249999999999999e-07, "logits/chosen": -2.78556489944458, "logits/rejected": -2.7441372871398926, "logps/chosen": -49.465946197509766, "logps/rejected": -114.5757064819336, "loss": 0.5179, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8056058287620544, "rewards/margins": 4.833130836486816, "rewards/rejected": -4.027524948120117, "step": 568 }, { "epoch": 1.7541362062331665, "grad_norm": 5.580617427825928, "learning_rate": 8.246913580246913e-07, "logits/chosen": -2.8117873668670654, "logits/rejected": -2.785968065261841, "logps/chosen": -49.70905685424805, "logps/rejected": -110.17823791503906, "loss": 0.579, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0721570253372192, "rewards/margins": 4.593324661254883, "rewards/rejected": -3.521167278289795, "step": 569 }, { "epoch": 1.7572143131973836, "grad_norm": 4.030699729919434, "learning_rate": 8.243827160493827e-07, "logits/chosen": -2.7360236644744873, "logits/rejected": -2.7064805030822754, "logps/chosen": -61.32687759399414, "logps/rejected": -131.61459350585938, "loss": 0.5301, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2307163029909134, "rewards/margins": 5.024377346038818, "rewards/rejected": -4.793661117553711, "step": 570 }, { "epoch": 1.7602924201616006, "grad_norm": 5.610108375549316, "learning_rate": 8.24074074074074e-07, "logits/chosen": -2.799968957901001, "logits/rejected": -2.769183397293091, "logps/chosen": -36.90653610229492, "logps/rejected": -122.99288940429688, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 2.1034772396087646, "rewards/margins": 6.142984390258789, "rewards/rejected": -4.0395073890686035, "step": 571 }, { "epoch": 1.7633705271258178, "grad_norm": 5.061585903167725, "learning_rate": 8.237654320987653e-07, "logits/chosen": -2.7872302532196045, "logits/rejected": -2.7263309955596924, "logps/chosen": -58.99483108520508, "logps/rejected": -115.73563385009766, "loss": 0.6119, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40904611349105835, "rewards/margins": 3.9589900970458984, "rewards/rejected": -3.5499441623687744, "step": 572 }, { "epoch": 1.7664486340900347, "grad_norm": 4.97266960144043, "learning_rate": 8.234567901234567e-07, "logits/chosen": -2.7731785774230957, "logits/rejected": -2.7410149574279785, "logps/chosen": -50.192420959472656, "logps/rejected": -137.59739685058594, "loss": 0.5185, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7465214729309082, "rewards/margins": 5.963581562042236, "rewards/rejected": -5.217060089111328, "step": 573 }, { "epoch": 1.7695267410542517, "grad_norm": 6.969456195831299, "learning_rate": 8.231481481481481e-07, "logits/chosen": -2.764641046524048, "logits/rejected": -2.7339487075805664, "logps/chosen": -57.56258773803711, "logps/rejected": -132.71853637695312, "loss": 0.6303, "rewards/accuracies": 0.9375, "rewards/chosen": 0.006936401128768921, "rewards/margins": 5.250393390655518, "rewards/rejected": -5.243456840515137, "step": 574 }, { "epoch": 1.7726048480184686, "grad_norm": 5.049924373626709, "learning_rate": 8.228395061728394e-07, "logits/chosen": -2.7344956398010254, "logits/rejected": -2.808924436569214, "logps/chosen": -42.66386795043945, "logps/rejected": -109.57945251464844, "loss": 0.5542, "rewards/accuracies": 0.875, "rewards/chosen": 1.3834909200668335, "rewards/margins": 4.624438285827637, "rewards/rejected": -3.2409472465515137, "step": 575 }, { "epoch": 1.7756829549826856, "grad_norm": 4.113063812255859, "learning_rate": 8.225308641975308e-07, "logits/chosen": -2.855900764465332, "logits/rejected": -2.817495346069336, "logps/chosen": -41.95174026489258, "logps/rejected": -133.54954528808594, "loss": 0.4774, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4252480268478394, "rewards/margins": 6.018917083740234, "rewards/rejected": -4.593669414520264, "step": 576 }, { "epoch": 1.7787610619469025, "grad_norm": 4.61047887802124, "learning_rate": 8.222222222222221e-07, "logits/chosen": -2.6928765773773193, "logits/rejected": -2.6937718391418457, "logps/chosen": -46.15195083618164, "logps/rejected": -100.41596221923828, "loss": 0.5765, "rewards/accuracies": 0.96875, "rewards/chosen": 1.453756332397461, "rewards/margins": 3.9870712757110596, "rewards/rejected": -2.5333151817321777, "step": 577 }, { "epoch": 1.7818391689111197, "grad_norm": 5.335884094238281, "learning_rate": 8.219135802469136e-07, "logits/chosen": -2.837308645248413, "logits/rejected": -2.7703046798706055, "logps/chosen": -48.83916091918945, "logps/rejected": -125.20684814453125, "loss": 0.4782, "rewards/accuracies": 1.0, "rewards/chosen": 1.2213101387023926, "rewards/margins": 5.86259126663208, "rewards/rejected": -4.6412811279296875, "step": 578 }, { "epoch": 1.7849172758753367, "grad_norm": 4.482906818389893, "learning_rate": 8.216049382716049e-07, "logits/chosen": -2.7926042079925537, "logits/rejected": -2.770320177078247, "logps/chosen": -49.56791305541992, "logps/rejected": -114.43638610839844, "loss": 0.5091, "rewards/accuracies": 0.96875, "rewards/chosen": 1.015669345855713, "rewards/margins": 4.591315746307373, "rewards/rejected": -3.57564640045166, "step": 579 }, { "epoch": 1.7879953828395538, "grad_norm": 4.533597946166992, "learning_rate": 8.212962962962962e-07, "logits/chosen": -2.7075114250183105, "logits/rejected": -2.747523307800293, "logps/chosen": -37.83313751220703, "logps/rejected": -131.46253967285156, "loss": 0.4273, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9011529684066772, "rewards/margins": 6.315458297729492, "rewards/rejected": -4.414306163787842, "step": 580 }, { "epoch": 1.7910734898037708, "grad_norm": 3.9369709491729736, "learning_rate": 8.209876543209876e-07, "logits/chosen": -2.719036102294922, "logits/rejected": -2.7029852867126465, "logps/chosen": -44.88811492919922, "logps/rejected": -108.39762115478516, "loss": 0.5063, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4504053592681885, "rewards/margins": 4.593606472015381, "rewards/rejected": -3.143200635910034, "step": 581 }, { "epoch": 1.7941515967679877, "grad_norm": 3.5052073001861572, "learning_rate": 8.206790123456789e-07, "logits/chosen": -2.820040225982666, "logits/rejected": -2.798954725265503, "logps/chosen": -48.07337951660156, "logps/rejected": -131.62403869628906, "loss": 0.4821, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2789188623428345, "rewards/margins": 5.951666831970215, "rewards/rejected": -4.67274808883667, "step": 582 }, { "epoch": 1.7972297037322047, "grad_norm": 4.2694573402404785, "learning_rate": 8.203703703703703e-07, "logits/chosen": -2.682615041732788, "logits/rejected": -2.7513504028320312, "logps/chosen": -44.511878967285156, "logps/rejected": -115.5698471069336, "loss": 0.5269, "rewards/accuracies": 0.875, "rewards/chosen": 1.292746663093567, "rewards/margins": 4.88408899307251, "rewards/rejected": -3.5913424491882324, "step": 583 }, { "epoch": 1.8003078106964217, "grad_norm": 5.975352764129639, "learning_rate": 8.200617283950617e-07, "logits/chosen": -2.744567394256592, "logits/rejected": -2.725109577178955, "logps/chosen": -33.8690071105957, "logps/rejected": -107.57404327392578, "loss": 0.498, "rewards/accuracies": 1.0, "rewards/chosen": 2.2368671894073486, "rewards/margins": 5.109851360321045, "rewards/rejected": -2.8729841709136963, "step": 584 }, { "epoch": 1.8033859176606386, "grad_norm": 4.511993408203125, "learning_rate": 8.19753086419753e-07, "logits/chosen": -2.7571969032287598, "logits/rejected": -2.7205042839050293, "logps/chosen": -40.10261535644531, "logps/rejected": -106.77888488769531, "loss": 0.5603, "rewards/accuracies": 0.875, "rewards/chosen": 2.058502197265625, "rewards/margins": 4.780135631561279, "rewards/rejected": -2.721633195877075, "step": 585 }, { "epoch": 1.8033859176606386, "eval_logits/chosen": -2.6930418014526367, "eval_logits/rejected": -2.744455575942993, "eval_logps/chosen": -43.68545913696289, "eval_logps/rejected": -111.54931640625, "eval_loss": 0.5756955742835999, "eval_rewards/accuracies": 0.9046242833137512, "eval_rewards/chosen": 1.4588334560394287, "eval_rewards/margins": 4.597181797027588, "eval_rewards/rejected": -3.138348340988159, "eval_runtime": 637.6743, "eval_samples_per_second": 0.543, "eval_steps_per_second": 0.271, "step": 585 }, { "epoch": 1.8064640246248556, "grad_norm": 3.953626871109009, "learning_rate": 8.194444444444443e-07, "logits/chosen": -2.847034454345703, "logits/rejected": -2.7508981227874756, "logps/chosen": -49.067081451416016, "logps/rejected": -104.24158477783203, "loss": 0.5158, "rewards/accuracies": 0.9375, "rewards/chosen": 1.264807939529419, "rewards/margins": 4.22362756729126, "rewards/rejected": -2.958819627761841, "step": 586 }, { "epoch": 1.8095421315890727, "grad_norm": 4.929173469543457, "learning_rate": 8.191358024691357e-07, "logits/chosen": -2.847533702850342, "logits/rejected": -2.7274386882781982, "logps/chosen": -38.48179626464844, "logps/rejected": -100.11824798583984, "loss": 0.5591, "rewards/accuracies": 1.0, "rewards/chosen": 2.076723575592041, "rewards/margins": 4.513071060180664, "rewards/rejected": -2.436347484588623, "step": 587 }, { "epoch": 1.8126202385532897, "grad_norm": 4.910367012023926, "learning_rate": 8.188271604938271e-07, "logits/chosen": -2.786238193511963, "logits/rejected": -2.685290575027466, "logps/chosen": -39.37531280517578, "logps/rejected": -98.1924057006836, "loss": 0.5806, "rewards/accuracies": 0.96875, "rewards/chosen": 2.036031484603882, "rewards/margins": 4.259962558746338, "rewards/rejected": -2.223930835723877, "step": 588 }, { "epoch": 1.8156983455175069, "grad_norm": 5.521851062774658, "learning_rate": 8.185185185185185e-07, "logits/chosen": -2.6791188716888428, "logits/rejected": -2.6829328536987305, "logps/chosen": -44.884857177734375, "logps/rejected": -109.8424072265625, "loss": 0.589, "rewards/accuracies": 0.875, "rewards/chosen": 1.341733694076538, "rewards/margins": 4.622644424438477, "rewards/rejected": -3.2809107303619385, "step": 589 }, { "epoch": 1.8187764524817238, "grad_norm": 4.292459964752197, "learning_rate": 8.182098765432098e-07, "logits/chosen": -2.733626365661621, "logits/rejected": -2.723519802093506, "logps/chosen": -37.34818649291992, "logps/rejected": -130.97378540039062, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 1.9218952655792236, "rewards/margins": 6.511597633361816, "rewards/rejected": -4.589702606201172, "step": 590 }, { "epoch": 1.8218545594459408, "grad_norm": 5.443973064422607, "learning_rate": 8.179012345679011e-07, "logits/chosen": -2.6950523853302, "logits/rejected": -2.798297882080078, "logps/chosen": -47.091453552246094, "logps/rejected": -126.01717376708984, "loss": 0.4751, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2399325370788574, "rewards/margins": 5.900646209716797, "rewards/rejected": -4.6607136726379395, "step": 591 }, { "epoch": 1.8249326664101577, "grad_norm": 4.107137203216553, "learning_rate": 8.175925925925925e-07, "logits/chosen": -2.7048683166503906, "logits/rejected": -2.703315496444702, "logps/chosen": -51.74125671386719, "logps/rejected": -116.93836975097656, "loss": 0.6252, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9980874061584473, "rewards/margins": 4.354103088378906, "rewards/rejected": -3.356015682220459, "step": 592 }, { "epoch": 1.8280107733743747, "grad_norm": 4.167624473571777, "learning_rate": 8.17283950617284e-07, "logits/chosen": -2.8199219703674316, "logits/rejected": -2.7707831859588623, "logps/chosen": -36.21611022949219, "logps/rejected": -115.95658111572266, "loss": 0.5312, "rewards/accuracies": 0.96875, "rewards/chosen": 2.10127854347229, "rewards/margins": 5.396409034729004, "rewards/rejected": -3.2951302528381348, "step": 593 }, { "epoch": 1.8310888803385916, "grad_norm": 5.551128387451172, "learning_rate": 8.169753086419752e-07, "logits/chosen": -2.7298624515533447, "logits/rejected": -2.709125280380249, "logps/chosen": -39.875709533691406, "logps/rejected": -114.54557800292969, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 1.7763322591781616, "rewards/margins": 5.362733364105225, "rewards/rejected": -3.5864009857177734, "step": 594 }, { "epoch": 1.8341669873028088, "grad_norm": 3.4309356212615967, "learning_rate": 8.166666666666666e-07, "logits/chosen": -2.7848727703094482, "logits/rejected": -2.811943769454956, "logps/chosen": -33.11189270019531, "logps/rejected": -122.65503692626953, "loss": 0.3822, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3462040424346924, "rewards/margins": 6.492448329925537, "rewards/rejected": -4.146244049072266, "step": 595 }, { "epoch": 1.8372450942670258, "grad_norm": 5.3807454109191895, "learning_rate": 8.163580246913579e-07, "logits/chosen": -2.646963596343994, "logits/rejected": -2.7089293003082275, "logps/chosen": -43.32307434082031, "logps/rejected": -97.53388977050781, "loss": 0.6374, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5444234609603882, "rewards/margins": 3.9388766288757324, "rewards/rejected": -2.3944530487060547, "step": 596 }, { "epoch": 1.840323201231243, "grad_norm": 3.70381760597229, "learning_rate": 8.160493827160493e-07, "logits/chosen": -2.6989874839782715, "logits/rejected": -2.710636854171753, "logps/chosen": -54.46627426147461, "logps/rejected": -116.57514190673828, "loss": 0.5579, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7927922010421753, "rewards/margins": 4.437745094299316, "rewards/rejected": -3.6449530124664307, "step": 597 }, { "epoch": 1.8434013081954599, "grad_norm": 3.9562294483184814, "learning_rate": 8.157407407407407e-07, "logits/chosen": -2.6298508644104004, "logits/rejected": -2.6412811279296875, "logps/chosen": -41.330989837646484, "logps/rejected": -99.40579986572266, "loss": 0.5513, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7197022438049316, "rewards/margins": 4.407790660858154, "rewards/rejected": -2.6880886554718018, "step": 598 }, { "epoch": 1.8464794151596768, "grad_norm": 6.954900741577148, "learning_rate": 8.15432098765432e-07, "logits/chosen": -2.833095073699951, "logits/rejected": -2.7681901454925537, "logps/chosen": -38.57568359375, "logps/rejected": -103.07420349121094, "loss": 0.5715, "rewards/accuracies": 0.90625, "rewards/chosen": 1.512209177017212, "rewards/margins": 4.373490810394287, "rewards/rejected": -2.8612818717956543, "step": 599 }, { "epoch": 1.8495575221238938, "grad_norm": 3.976748466491699, "learning_rate": 8.151234567901234e-07, "logits/chosen": -2.8326985836029053, "logits/rejected": -2.7562499046325684, "logps/chosen": -41.01713562011719, "logps/rejected": -110.71296691894531, "loss": 0.5539, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7073429822921753, "rewards/margins": 4.964303016662598, "rewards/rejected": -3.256960153579712, "step": 600 }, { "epoch": 1.8526356290881107, "grad_norm": 3.8283042907714844, "learning_rate": 8.148148148148147e-07, "logits/chosen": -2.6942458152770996, "logits/rejected": -2.778195858001709, "logps/chosen": -57.50828552246094, "logps/rejected": -129.78900146484375, "loss": 0.4979, "rewards/accuracies": 0.9375, "rewards/chosen": 0.687754213809967, "rewards/margins": 5.329473972320557, "rewards/rejected": -4.641719818115234, "step": 601 }, { "epoch": 1.8557137360523277, "grad_norm": 5.053724765777588, "learning_rate": 8.145061728395061e-07, "logits/chosen": -2.759593963623047, "logits/rejected": -2.748262882232666, "logps/chosen": -44.89959716796875, "logps/rejected": -97.7231674194336, "loss": 0.6307, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3142521381378174, "rewards/margins": 3.679645299911499, "rewards/rejected": -2.3653929233551025, "step": 602 }, { "epoch": 1.8587918430165449, "grad_norm": 4.40705680847168, "learning_rate": 8.141975308641976e-07, "logits/chosen": -2.759988784790039, "logits/rejected": -2.7335703372955322, "logps/chosen": -51.077781677246094, "logps/rejected": -133.39715576171875, "loss": 0.4533, "rewards/accuracies": 0.96875, "rewards/chosen": 1.160249948501587, "rewards/margins": 5.899967670440674, "rewards/rejected": -4.739717960357666, "step": 603 }, { "epoch": 1.8618699499807618, "grad_norm": 4.189136028289795, "learning_rate": 8.138888888888888e-07, "logits/chosen": -2.731043815612793, "logits/rejected": -2.686831474304199, "logps/chosen": -45.510833740234375, "logps/rejected": -124.77099609375, "loss": 0.485, "rewards/accuracies": 1.0, "rewards/chosen": 1.0919773578643799, "rewards/margins": 5.406651973724365, "rewards/rejected": -4.314673900604248, "step": 604 }, { "epoch": 1.8649480569449788, "grad_norm": 4.812796592712402, "learning_rate": 8.135802469135801e-07, "logits/chosen": -2.7687437534332275, "logits/rejected": -2.771812915802002, "logps/chosen": -44.19822692871094, "logps/rejected": -100.43087768554688, "loss": 0.5743, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5468920469284058, "rewards/margins": 4.024961471557617, "rewards/rejected": -2.478070020675659, "step": 605 }, { "epoch": 1.868026163909196, "grad_norm": 3.9161930084228516, "learning_rate": 8.132716049382715e-07, "logits/chosen": -2.806382417678833, "logits/rejected": -2.86199688911438, "logps/chosen": -50.88847732543945, "logps/rejected": -137.39112854003906, "loss": 0.3739, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9438698291778564, "rewards/margins": 6.273854732513428, "rewards/rejected": -5.329984664916992, "step": 606 }, { "epoch": 1.871104270873413, "grad_norm": 4.971579074859619, "learning_rate": 8.12962962962963e-07, "logits/chosen": -2.7048370838165283, "logits/rejected": -2.7048912048339844, "logps/chosen": -51.63713455200195, "logps/rejected": -133.32212829589844, "loss": 0.5648, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8455559015274048, "rewards/margins": 5.608076572418213, "rewards/rejected": -4.762521266937256, "step": 607 }, { "epoch": 1.8741823778376299, "grad_norm": 5.109352111816406, "learning_rate": 8.126543209876542e-07, "logits/chosen": -2.740542411804199, "logits/rejected": -2.7700154781341553, "logps/chosen": -50.57904815673828, "logps/rejected": -123.98869323730469, "loss": 0.6071, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8967276215553284, "rewards/margins": 5.0680952072143555, "rewards/rejected": -4.17136812210083, "step": 608 }, { "epoch": 1.8772604848018468, "grad_norm": 4.821654796600342, "learning_rate": 8.123456790123457e-07, "logits/chosen": -2.7573466300964355, "logits/rejected": -2.6822071075439453, "logps/chosen": -54.02415084838867, "logps/rejected": -111.59371948242188, "loss": 0.5654, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0977542400360107, "rewards/margins": 4.5246124267578125, "rewards/rejected": -3.4268579483032227, "step": 609 }, { "epoch": 1.8803385917660638, "grad_norm": 3.9814438819885254, "learning_rate": 8.12037037037037e-07, "logits/chosen": -2.7697081565856934, "logits/rejected": -2.809833526611328, "logps/chosen": -53.374420166015625, "logps/rejected": -117.51517486572266, "loss": 0.6521, "rewards/accuracies": 0.875, "rewards/chosen": 0.4374583065509796, "rewards/margins": 4.3618879318237305, "rewards/rejected": -3.924429178237915, "step": 610 }, { "epoch": 1.8834166987302807, "grad_norm": 4.413801193237305, "learning_rate": 8.117283950617285e-07, "logits/chosen": -2.756950616836548, "logits/rejected": -2.7697198390960693, "logps/chosen": -57.100616455078125, "logps/rejected": -131.68222045898438, "loss": 0.5218, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5000920295715332, "rewards/margins": 5.308391571044922, "rewards/rejected": -4.808299541473389, "step": 611 }, { "epoch": 1.886494805694498, "grad_norm": 4.028811931610107, "learning_rate": 8.114197530864198e-07, "logits/chosen": -2.810025453567505, "logits/rejected": -2.767141103744507, "logps/chosen": -49.06450653076172, "logps/rejected": -117.09075927734375, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 1.0158945322036743, "rewards/margins": 5.08049201965332, "rewards/rejected": -4.064598083496094, "step": 612 }, { "epoch": 1.8895729126587149, "grad_norm": 5.466759204864502, "learning_rate": 8.11111111111111e-07, "logits/chosen": -2.705018997192383, "logits/rejected": -2.784501552581787, "logps/chosen": -47.91410827636719, "logps/rejected": -115.8839111328125, "loss": 0.5305, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0349637269973755, "rewards/margins": 5.023154258728027, "rewards/rejected": -3.9881906509399414, "step": 613 }, { "epoch": 1.892651019622932, "grad_norm": 4.2902374267578125, "learning_rate": 8.108024691358025e-07, "logits/chosen": -2.7357873916625977, "logits/rejected": -2.7295570373535156, "logps/chosen": -49.36305236816406, "logps/rejected": -116.09113311767578, "loss": 0.5194, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1724119186401367, "rewards/margins": 4.813697814941406, "rewards/rejected": -3.6412861347198486, "step": 614 }, { "epoch": 1.895729126587149, "grad_norm": 6.617118835449219, "learning_rate": 8.104938271604938e-07, "logits/chosen": -2.8473615646362305, "logits/rejected": -2.839357852935791, "logps/chosen": -44.45550537109375, "logps/rejected": -130.21890258789062, "loss": 0.5042, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4282457828521729, "rewards/margins": 6.1498284339904785, "rewards/rejected": -4.721582889556885, "step": 615 }, { "epoch": 1.898807233551366, "grad_norm": 4.683820724487305, "learning_rate": 8.101851851851852e-07, "logits/chosen": -2.7855169773101807, "logits/rejected": -2.661708354949951, "logps/chosen": -45.43803787231445, "logps/rejected": -104.26305389404297, "loss": 0.5545, "rewards/accuracies": 0.875, "rewards/chosen": 1.4588632583618164, "rewards/margins": 4.29115104675293, "rewards/rejected": -2.8322877883911133, "step": 616 }, { "epoch": 1.901885340515583, "grad_norm": 4.171562671661377, "learning_rate": 8.098765432098766e-07, "logits/chosen": -2.758878707885742, "logits/rejected": -2.820800304412842, "logps/chosen": -50.38412094116211, "logps/rejected": -130.88699340820312, "loss": 0.5068, "rewards/accuracies": 0.9375, "rewards/chosen": 0.734041690826416, "rewards/margins": 5.621922016143799, "rewards/rejected": -4.887879371643066, "step": 617 }, { "epoch": 1.9049634474797998, "grad_norm": 3.6444408893585205, "learning_rate": 8.095679012345679e-07, "logits/chosen": -2.773305892944336, "logits/rejected": -2.6951000690460205, "logps/chosen": -41.898643493652344, "logps/rejected": -126.93212127685547, "loss": 0.5018, "rewards/accuracies": 0.9375, "rewards/chosen": 1.302836298942566, "rewards/margins": 5.948100566864014, "rewards/rejected": -4.645264625549316, "step": 618 }, { "epoch": 1.9080415544440168, "grad_norm": 4.2175140380859375, "learning_rate": 8.092592592592592e-07, "logits/chosen": -2.725403070449829, "logits/rejected": -2.683187961578369, "logps/chosen": -44.81927490234375, "logps/rejected": -122.5419921875, "loss": 0.4795, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7612580060958862, "rewards/margins": 5.708377838134766, "rewards/rejected": -3.9471194744110107, "step": 619 }, { "epoch": 1.911119661408234, "grad_norm": 5.100200176239014, "learning_rate": 8.089506172839506e-07, "logits/chosen": -2.8294806480407715, "logits/rejected": -2.8380978107452393, "logps/chosen": -40.912174224853516, "logps/rejected": -105.497314453125, "loss": 0.5392, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4755781888961792, "rewards/margins": 4.833164691925049, "rewards/rejected": -3.35758638381958, "step": 620 }, { "epoch": 1.914197768372451, "grad_norm": 4.832522869110107, "learning_rate": 8.08641975308642e-07, "logits/chosen": -2.817017078399658, "logits/rejected": -2.7478370666503906, "logps/chosen": -42.322757720947266, "logps/rejected": -116.70622253417969, "loss": 0.4446, "rewards/accuracies": 0.90625, "rewards/chosen": 1.785298228263855, "rewards/margins": 5.597977161407471, "rewards/rejected": -3.812678575515747, "step": 621 }, { "epoch": 1.917275875336668, "grad_norm": 3.182448625564575, "learning_rate": 8.083333333333334e-07, "logits/chosen": -2.8028054237365723, "logits/rejected": -2.7710988521575928, "logps/chosen": -39.89966583251953, "logps/rejected": -123.4151382446289, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 1.6621458530426025, "rewards/margins": 6.114780426025391, "rewards/rejected": -4.452634334564209, "step": 622 }, { "epoch": 1.920353982300885, "grad_norm": 4.524631023406982, "learning_rate": 8.080246913580247e-07, "logits/chosen": -2.699313163757324, "logits/rejected": -2.7282602787017822, "logps/chosen": -40.88827896118164, "logps/rejected": -100.81407928466797, "loss": 0.5317, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7320271730422974, "rewards/margins": 4.46021032333374, "rewards/rejected": -2.7281830310821533, "step": 623 }, { "epoch": 1.923432089265102, "grad_norm": 4.9097185134887695, "learning_rate": 8.07716049382716e-07, "logits/chosen": -2.702125072479248, "logits/rejected": -2.694037675857544, "logps/chosen": -54.45298767089844, "logps/rejected": -124.83631896972656, "loss": 0.5511, "rewards/accuracies": 0.875, "rewards/chosen": 0.7266004681587219, "rewards/margins": 5.04270076751709, "rewards/rejected": -4.316099643707275, "step": 624 }, { "epoch": 1.926510196229319, "grad_norm": 4.617408752441406, "learning_rate": 8.074074074074075e-07, "logits/chosen": -2.7144956588745117, "logits/rejected": -2.6780238151550293, "logps/chosen": -43.686973571777344, "logps/rejected": -105.56072235107422, "loss": 0.5992, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5008277893066406, "rewards/margins": 4.436856269836426, "rewards/rejected": -2.936028003692627, "step": 625 }, { "epoch": 1.929588303193536, "grad_norm": 3.5232338905334473, "learning_rate": 8.070987654320988e-07, "logits/chosen": -2.671996593475342, "logits/rejected": -2.7273449897766113, "logps/chosen": -49.358665466308594, "logps/rejected": -110.17159271240234, "loss": 0.5663, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0946751832962036, "rewards/margins": 4.70956563949585, "rewards/rejected": -3.6148903369903564, "step": 626 }, { "epoch": 1.9326664101577529, "grad_norm": 5.401504039764404, "learning_rate": 8.067901234567901e-07, "logits/chosen": -2.821162462234497, "logits/rejected": -2.797048330307007, "logps/chosen": -40.86854934692383, "logps/rejected": -111.6106185913086, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.7815474271774292, "rewards/margins": 5.074953079223633, "rewards/rejected": -3.293405771255493, "step": 627 }, { "epoch": 1.93574451712197, "grad_norm": 5.4347405433654785, "learning_rate": 8.064814814814815e-07, "logits/chosen": -2.7665233612060547, "logits/rejected": -2.7954745292663574, "logps/chosen": -45.07377624511719, "logps/rejected": -124.73404693603516, "loss": 0.5343, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2168360948562622, "rewards/margins": 5.644065856933594, "rewards/rejected": -4.427229404449463, "step": 628 }, { "epoch": 1.938822624086187, "grad_norm": 5.333725929260254, "learning_rate": 8.061728395061728e-07, "logits/chosen": -2.773499011993408, "logits/rejected": -2.7416207790374756, "logps/chosen": -44.72056198120117, "logps/rejected": -109.49330139160156, "loss": 0.5446, "rewards/accuracies": 0.875, "rewards/chosen": 1.6093757152557373, "rewards/margins": 4.990245819091797, "rewards/rejected": -3.3808698654174805, "step": 629 }, { "epoch": 1.941900731050404, "grad_norm": 5.329519271850586, "learning_rate": 8.058641975308642e-07, "logits/chosen": -2.792727470397949, "logits/rejected": -2.843590497970581, "logps/chosen": -39.53407287597656, "logps/rejected": -116.93571472167969, "loss": 0.4876, "rewards/accuracies": 0.9375, "rewards/chosen": 1.673721432685852, "rewards/margins": 5.448455810546875, "rewards/rejected": -3.7747347354888916, "step": 630 }, { "epoch": 1.9449788380146211, "grad_norm": 4.699075222015381, "learning_rate": 8.055555555555556e-07, "logits/chosen": -2.75331711769104, "logits/rejected": -2.7611656188964844, "logps/chosen": -43.11050033569336, "logps/rejected": -113.1224365234375, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 1.5150359869003296, "rewards/margins": 4.958576202392578, "rewards/rejected": -3.443540096282959, "step": 631 }, { "epoch": 1.948056944978838, "grad_norm": 4.305717945098877, "learning_rate": 8.052469135802469e-07, "logits/chosen": -2.6614208221435547, "logits/rejected": -2.6815695762634277, "logps/chosen": -49.53041076660156, "logps/rejected": -109.85497283935547, "loss": 0.5669, "rewards/accuracies": 0.875, "rewards/chosen": 1.3014371395111084, "rewards/margins": 4.38945198059082, "rewards/rejected": -3.088014841079712, "step": 632 }, { "epoch": 1.951135051943055, "grad_norm": 4.168779373168945, "learning_rate": 8.049382716049383e-07, "logits/chosen": -2.77297043800354, "logits/rejected": -2.7690789699554443, "logps/chosen": -47.57619094848633, "logps/rejected": -123.38126373291016, "loss": 0.4175, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4214162826538086, "rewards/margins": 5.73532247543335, "rewards/rejected": -4.313905715942383, "step": 633 }, { "epoch": 1.954213158907272, "grad_norm": 4.933310508728027, "learning_rate": 8.046296296296296e-07, "logits/chosen": -2.7966086864471436, "logits/rejected": -2.7712345123291016, "logps/chosen": -44.5945930480957, "logps/rejected": -112.67228698730469, "loss": 0.5379, "rewards/accuracies": 0.9375, "rewards/chosen": 1.05181884765625, "rewards/margins": 4.697638988494873, "rewards/rejected": -3.645819902420044, "step": 634 }, { "epoch": 1.957291265871489, "grad_norm": 4.360702037811279, "learning_rate": 8.04320987654321e-07, "logits/chosen": -2.7409539222717285, "logits/rejected": -2.725520133972168, "logps/chosen": -51.901851654052734, "logps/rejected": -117.96556854248047, "loss": 0.609, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9879247546195984, "rewards/margins": 4.518933296203613, "rewards/rejected": -3.531008243560791, "step": 635 }, { "epoch": 1.960369372835706, "grad_norm": 4.035224914550781, "learning_rate": 8.040123456790124e-07, "logits/chosen": -2.688600778579712, "logits/rejected": -2.699836492538452, "logps/chosen": -42.35466384887695, "logps/rejected": -138.84719848632812, "loss": 0.4099, "rewards/accuracies": 1.0, "rewards/chosen": 1.6935343742370605, "rewards/margins": 6.84776496887207, "rewards/rejected": -5.154231071472168, "step": 636 }, { "epoch": 1.963447479799923, "grad_norm": 3.949718475341797, "learning_rate": 8.037037037037037e-07, "logits/chosen": -2.691197395324707, "logits/rejected": -2.752479076385498, "logps/chosen": -54.46730422973633, "logps/rejected": -117.21868896484375, "loss": 0.6518, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5940196514129639, "rewards/margins": 4.31953239440918, "rewards/rejected": -3.7255125045776367, "step": 637 }, { "epoch": 1.96652558676414, "grad_norm": 4.268950939178467, "learning_rate": 8.03395061728395e-07, "logits/chosen": -2.8068175315856934, "logits/rejected": -2.7595362663269043, "logps/chosen": -44.28553009033203, "logps/rejected": -103.55217742919922, "loss": 0.5394, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4993038177490234, "rewards/margins": 4.204148292541504, "rewards/rejected": -2.7048442363739014, "step": 638 }, { "epoch": 1.9696036937283572, "grad_norm": 4.3949666023254395, "learning_rate": 8.030864197530864e-07, "logits/chosen": -2.751133441925049, "logits/rejected": -2.7273154258728027, "logps/chosen": -39.198795318603516, "logps/rejected": -118.8099136352539, "loss": 0.3149, "rewards/accuracies": 0.96875, "rewards/chosen": 2.301459789276123, "rewards/margins": 6.154610633850098, "rewards/rejected": -3.853151321411133, "step": 639 }, { "epoch": 1.9726818006925741, "grad_norm": 3.8866608142852783, "learning_rate": 8.027777777777778e-07, "logits/chosen": -2.772488594055176, "logits/rejected": -2.7236931324005127, "logps/chosen": -38.28326416015625, "logps/rejected": -119.67526245117188, "loss": 0.4895, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7368762493133545, "rewards/margins": 5.760923385620117, "rewards/rejected": -4.024046897888184, "step": 640 }, { "epoch": 1.975759907656791, "grad_norm": 4.371628284454346, "learning_rate": 8.024691358024691e-07, "logits/chosen": -2.75663685798645, "logits/rejected": -2.7696738243103027, "logps/chosen": -53.44837951660156, "logps/rejected": -109.71154022216797, "loss": 0.6425, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7599049806594849, "rewards/margins": 4.001981258392334, "rewards/rejected": -3.2420763969421387, "step": 641 }, { "epoch": 1.978838014621008, "grad_norm": 4.9094109535217285, "learning_rate": 8.021604938271605e-07, "logits/chosen": -2.666954517364502, "logits/rejected": -2.6846961975097656, "logps/chosen": -35.50911331176758, "logps/rejected": -115.41810607910156, "loss": 0.4096, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0582892894744873, "rewards/margins": 5.502712726593018, "rewards/rejected": -3.444423198699951, "step": 642 }, { "epoch": 1.981916121585225, "grad_norm": 4.021296501159668, "learning_rate": 8.018518518518518e-07, "logits/chosen": -2.7391357421875, "logits/rejected": -2.694931983947754, "logps/chosen": -39.9237060546875, "logps/rejected": -115.0733413696289, "loss": 0.5091, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7192749977111816, "rewards/margins": 5.382310390472412, "rewards/rejected": -3.663034677505493, "step": 643 }, { "epoch": 1.984994228549442, "grad_norm": 6.718598365783691, "learning_rate": 8.015432098765433e-07, "logits/chosen": -2.825012445449829, "logits/rejected": -2.7303659915924072, "logps/chosen": -39.19097137451172, "logps/rejected": -101.32571411132812, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 2.016385078430176, "rewards/margins": 4.587304592132568, "rewards/rejected": -2.570919990539551, "step": 644 }, { "epoch": 1.9880723355136591, "grad_norm": 4.541456699371338, "learning_rate": 8.012345679012346e-07, "logits/chosen": -2.6515183448791504, "logits/rejected": -2.7708616256713867, "logps/chosen": -46.057098388671875, "logps/rejected": -109.03398132324219, "loss": 0.5544, "rewards/accuracies": 0.875, "rewards/chosen": 1.5606086254119873, "rewards/margins": 4.575100898742676, "rewards/rejected": -3.0144922733306885, "step": 645 }, { "epoch": 1.991150442477876, "grad_norm": 4.891855716705322, "learning_rate": 8.009259259259259e-07, "logits/chosen": -2.7728214263916016, "logits/rejected": -2.775893211364746, "logps/chosen": -41.577125549316406, "logps/rejected": -114.87741088867188, "loss": 0.5008, "rewards/accuracies": 1.0, "rewards/chosen": 1.5694385766983032, "rewards/margins": 5.251520156860352, "rewards/rejected": -3.6820812225341797, "step": 646 }, { "epoch": 1.9942285494420933, "grad_norm": 3.873626232147217, "learning_rate": 8.006172839506173e-07, "logits/chosen": -2.8108303546905518, "logits/rejected": -2.7551448345184326, "logps/chosen": -38.613380432128906, "logps/rejected": -106.10430145263672, "loss": 0.5313, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5283617973327637, "rewards/margins": 4.902468204498291, "rewards/rejected": -3.3741064071655273, "step": 647 }, { "epoch": 1.9973066564063102, "grad_norm": 3.1766135692596436, "learning_rate": 8.003086419753086e-07, "logits/chosen": -2.6911747455596924, "logits/rejected": -2.6701509952545166, "logps/chosen": -42.378684997558594, "logps/rejected": -117.22125244140625, "loss": 0.5337, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6248308420181274, "rewards/margins": 5.247743129730225, "rewards/rejected": -3.6229124069213867, "step": 648 }, { "epoch": 2.003078106964217, "grad_norm": 5.584718704223633, "learning_rate": 8e-07, "logits/chosen": -2.7622487545013428, "logits/rejected": -2.741386890411377, "logps/chosen": -51.91221237182617, "logps/rejected": -112.75147247314453, "loss": 0.9167, "rewards/accuracies": 0.9259259104728699, "rewards/chosen": 0.8886984586715698, "rewards/margins": 4.477855682373047, "rewards/rejected": -3.5891575813293457, "step": 649 }, { "epoch": 2.006156213928434, "grad_norm": 7.1809000968933105, "learning_rate": 7.996913580246914e-07, "logits/chosen": -2.778204917907715, "logits/rejected": -2.796128034591675, "logps/chosen": -35.79905319213867, "logps/rejected": -101.55854797363281, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 2.1403162479400635, "rewards/margins": 4.783703327178955, "rewards/rejected": -2.6433873176574707, "step": 650 }, { "epoch": 2.006156213928434, "eval_logits/chosen": -2.7025980949401855, "eval_logits/rejected": -2.746889352798462, "eval_logps/chosen": -44.32548141479492, "eval_logps/rejected": -113.70758819580078, "eval_loss": 0.5755375027656555, "eval_rewards/accuracies": 0.8930636048316956, "eval_rewards/chosen": 1.3948311805725098, "eval_rewards/margins": 4.749007701873779, "eval_rewards/rejected": -3.3541767597198486, "eval_runtime": 641.0047, "eval_samples_per_second": 0.54, "eval_steps_per_second": 0.27, "step": 650 }, { "epoch": 2.009234320892651, "grad_norm": 4.015069007873535, "learning_rate": 7.993827160493827e-07, "logits/chosen": -2.6990749835968018, "logits/rejected": -2.7149949073791504, "logps/chosen": -49.014434814453125, "logps/rejected": -113.08331298828125, "loss": 0.5489, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2447543144226074, "rewards/margins": 4.524658679962158, "rewards/rejected": -3.279904842376709, "step": 651 }, { "epoch": 2.012312427856868, "grad_norm": 3.755449056625366, "learning_rate": 7.99074074074074e-07, "logits/chosen": -2.7917535305023193, "logits/rejected": -2.7334487438201904, "logps/chosen": -43.7401237487793, "logps/rejected": -124.21673583984375, "loss": 0.4954, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5981028079986572, "rewards/margins": 5.905927658081055, "rewards/rejected": -4.307824611663818, "step": 652 }, { "epoch": 2.015390534821085, "grad_norm": 4.379607677459717, "learning_rate": 7.987654320987654e-07, "logits/chosen": -2.7554075717926025, "logits/rejected": -2.756934404373169, "logps/chosen": -60.19337844848633, "logps/rejected": -127.28446960449219, "loss": 0.5685, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5173967480659485, "rewards/margins": 5.130213737487793, "rewards/rejected": -4.61281681060791, "step": 653 }, { "epoch": 2.018468641785302, "grad_norm": 4.15785026550293, "learning_rate": 7.984567901234568e-07, "logits/chosen": -2.784454107284546, "logits/rejected": -2.7362022399902344, "logps/chosen": -48.35429382324219, "logps/rejected": -127.69639587402344, "loss": 0.4879, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8432615995407104, "rewards/margins": 5.19171142578125, "rewards/rejected": -4.348450183868408, "step": 654 }, { "epoch": 2.021546748749519, "grad_norm": 4.809130668640137, "learning_rate": 7.981481481481482e-07, "logits/chosen": -2.797994613647461, "logits/rejected": -2.8027501106262207, "logps/chosen": -38.70281982421875, "logps/rejected": -126.45524597167969, "loss": 0.4234, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8164044618606567, "rewards/margins": 6.404676914215088, "rewards/rejected": -4.5882720947265625, "step": 655 }, { "epoch": 2.024624855713736, "grad_norm": 5.753055095672607, "learning_rate": 7.978395061728395e-07, "logits/chosen": -2.780259370803833, "logits/rejected": -2.7647528648376465, "logps/chosen": -45.04132843017578, "logps/rejected": -111.1014404296875, "loss": 0.5932, "rewards/accuracies": 0.875, "rewards/chosen": 1.2387452125549316, "rewards/margins": 4.534432888031006, "rewards/rejected": -3.295687437057495, "step": 656 }, { "epoch": 2.027702962677953, "grad_norm": 3.3351075649261475, "learning_rate": 7.975308641975308e-07, "logits/chosen": -2.734373092651367, "logits/rejected": -2.7522311210632324, "logps/chosen": -39.808502197265625, "logps/rejected": -102.89078521728516, "loss": 0.495, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9802136421203613, "rewards/margins": 4.9527082443237305, "rewards/rejected": -2.9724948406219482, "step": 657 }, { "epoch": 2.03078106964217, "grad_norm": 4.486507415771484, "learning_rate": 7.972222222222223e-07, "logits/chosen": -2.7321078777313232, "logits/rejected": -2.7624738216400146, "logps/chosen": -49.94172286987305, "logps/rejected": -113.49569702148438, "loss": 0.5931, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0769327878952026, "rewards/margins": 4.615248680114746, "rewards/rejected": -3.538315773010254, "step": 658 }, { "epoch": 2.033859176606387, "grad_norm": 3.881791353225708, "learning_rate": 7.969135802469136e-07, "logits/chosen": -2.863917350769043, "logits/rejected": -2.8049817085266113, "logps/chosen": -48.04943084716797, "logps/rejected": -127.39347839355469, "loss": 0.5176, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9720737934112549, "rewards/margins": 5.4408369064331055, "rewards/rejected": -4.468762397766113, "step": 659 }, { "epoch": 2.036937283570604, "grad_norm": 4.060637474060059, "learning_rate": 7.966049382716049e-07, "logits/chosen": -2.7493011951446533, "logits/rejected": -2.709439992904663, "logps/chosen": -53.35963439941406, "logps/rejected": -115.90830993652344, "loss": 0.5899, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6996996402740479, "rewards/margins": 4.338224411010742, "rewards/rejected": -3.638524055480957, "step": 660 }, { "epoch": 2.0400153905348213, "grad_norm": 4.891465187072754, "learning_rate": 7.962962962962963e-07, "logits/chosen": -2.7276668548583984, "logits/rejected": -2.7658965587615967, "logps/chosen": -44.735931396484375, "logps/rejected": -107.75801086425781, "loss": 0.5279, "rewards/accuracies": 0.875, "rewards/chosen": 1.2511955499649048, "rewards/margins": 4.4297871589660645, "rewards/rejected": -3.178591728210449, "step": 661 }, { "epoch": 2.0430934974990382, "grad_norm": 4.146290302276611, "learning_rate": 7.959876543209876e-07, "logits/chosen": -2.8316683769226074, "logits/rejected": -2.8450069427490234, "logps/chosen": -41.7783203125, "logps/rejected": -110.97156524658203, "loss": 0.5471, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5042564868927002, "rewards/margins": 4.926462173461914, "rewards/rejected": -3.422205924987793, "step": 662 }, { "epoch": 2.046171604463255, "grad_norm": 5.131987571716309, "learning_rate": 7.956790123456791e-07, "logits/chosen": -2.836730718612671, "logits/rejected": -2.7130179405212402, "logps/chosen": -41.858985900878906, "logps/rejected": -134.71267700195312, "loss": 0.4246, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7076425552368164, "rewards/margins": 6.587222099304199, "rewards/rejected": -4.879579544067383, "step": 663 }, { "epoch": 2.049249711427472, "grad_norm": 5.17915153503418, "learning_rate": 7.953703703703704e-07, "logits/chosen": -2.8701844215393066, "logits/rejected": -2.849757671356201, "logps/chosen": -46.630062103271484, "logps/rejected": -116.93299102783203, "loss": 0.5617, "rewards/accuracies": 1.0, "rewards/chosen": 1.5092217922210693, "rewards/margins": 5.032148838043213, "rewards/rejected": -3.5229272842407227, "step": 664 }, { "epoch": 2.052327818391689, "grad_norm": 4.835546016693115, "learning_rate": 7.950617283950617e-07, "logits/chosen": -2.8169138431549072, "logits/rejected": -2.814084529876709, "logps/chosen": -49.63850021362305, "logps/rejected": -115.96846008300781, "loss": 0.5923, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9517965316772461, "rewards/margins": 4.676342010498047, "rewards/rejected": -3.72454571723938, "step": 665 }, { "epoch": 2.055405925355906, "grad_norm": 4.360071182250977, "learning_rate": 7.947530864197531e-07, "logits/chosen": -2.7537643909454346, "logits/rejected": -2.7063419818878174, "logps/chosen": -44.915321350097656, "logps/rejected": -118.30360412597656, "loss": 0.5306, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3567335605621338, "rewards/margins": 5.275709629058838, "rewards/rejected": -3.918976306915283, "step": 666 }, { "epoch": 2.058484032320123, "grad_norm": 4.7259979248046875, "learning_rate": 7.944444444444444e-07, "logits/chosen": -2.800274610519409, "logits/rejected": -2.7707161903381348, "logps/chosen": -40.979248046875, "logps/rejected": -124.23307800292969, "loss": 0.3548, "rewards/accuracies": 1.0, "rewards/chosen": 1.9086381196975708, "rewards/margins": 6.267760276794434, "rewards/rejected": -4.359122276306152, "step": 667 }, { "epoch": 2.06156213928434, "grad_norm": 4.810956001281738, "learning_rate": 7.941358024691358e-07, "logits/chosen": -2.7862937450408936, "logits/rejected": -2.7695534229278564, "logps/chosen": -36.936195373535156, "logps/rejected": -114.73802947998047, "loss": 0.4379, "rewards/accuracies": 1.0, "rewards/chosen": 2.0123584270477295, "rewards/margins": 5.737522602081299, "rewards/rejected": -3.7251641750335693, "step": 668 }, { "epoch": 2.0646402462485574, "grad_norm": 5.349696159362793, "learning_rate": 7.938271604938272e-07, "logits/chosen": -2.8074073791503906, "logits/rejected": -2.7363007068634033, "logps/chosen": -50.805198669433594, "logps/rejected": -115.14579772949219, "loss": 0.5402, "rewards/accuracies": 0.875, "rewards/chosen": 0.9600033760070801, "rewards/margins": 4.445902347564697, "rewards/rejected": -3.485898733139038, "step": 669 }, { "epoch": 2.0677183532127743, "grad_norm": 3.6070923805236816, "learning_rate": 7.935185185185185e-07, "logits/chosen": -2.757133960723877, "logits/rejected": -2.6979165077209473, "logps/chosen": -50.37788009643555, "logps/rejected": -129.25743103027344, "loss": 0.5118, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0104663372039795, "rewards/margins": 5.890657424926758, "rewards/rejected": -4.880190849304199, "step": 670 }, { "epoch": 2.0707964601769913, "grad_norm": 5.88468074798584, "learning_rate": 7.932098765432098e-07, "logits/chosen": -2.8460607528686523, "logits/rejected": -2.82342267036438, "logps/chosen": -38.557125091552734, "logps/rejected": -99.8722152709961, "loss": 0.5733, "rewards/accuracies": 0.875, "rewards/chosen": 1.9440178871154785, "rewards/margins": 4.297142505645752, "rewards/rejected": -2.3531248569488525, "step": 671 }, { "epoch": 2.073874567141208, "grad_norm": 5.583738803863525, "learning_rate": 7.929012345679012e-07, "logits/chosen": -2.6199722290039062, "logits/rejected": -2.713294267654419, "logps/chosen": -39.6611328125, "logps/rejected": -120.55096435546875, "loss": 0.3861, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6577013731002808, "rewards/margins": 5.78753662109375, "rewards/rejected": -4.12983512878418, "step": 672 }, { "epoch": 2.076952674105425, "grad_norm": 4.183536529541016, "learning_rate": 7.925925925925926e-07, "logits/chosen": -2.8192596435546875, "logits/rejected": -2.7775168418884277, "logps/chosen": -66.49462890625, "logps/rejected": -119.8758544921875, "loss": 0.6826, "rewards/accuracies": 0.90625, "rewards/chosen": -0.19493134319782257, "rewards/margins": 3.7622785568237305, "rewards/rejected": -3.957210063934326, "step": 673 }, { "epoch": 2.080030781069642, "grad_norm": 4.478097438812256, "learning_rate": 7.92283950617284e-07, "logits/chosen": -2.81367826461792, "logits/rejected": -2.7209067344665527, "logps/chosen": -41.412654876708984, "logps/rejected": -121.87914276123047, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 1.5062192678451538, "rewards/margins": 5.765106201171875, "rewards/rejected": -4.258886814117432, "step": 674 }, { "epoch": 2.083108888033859, "grad_norm": 4.347833156585693, "learning_rate": 7.919753086419753e-07, "logits/chosen": -2.7477900981903076, "logits/rejected": -2.797072649002075, "logps/chosen": -53.561397552490234, "logps/rejected": -119.69253540039062, "loss": 0.5833, "rewards/accuracies": 0.875, "rewards/chosen": 0.8091406226158142, "rewards/margins": 4.887243270874023, "rewards/rejected": -4.0781025886535645, "step": 675 }, { "epoch": 2.086186994998076, "grad_norm": 5.622611999511719, "learning_rate": 7.916666666666666e-07, "logits/chosen": -2.719532012939453, "logits/rejected": -2.715569257736206, "logps/chosen": -46.63997268676758, "logps/rejected": -114.44036865234375, "loss": 0.5446, "rewards/accuracies": 0.96875, "rewards/chosen": 1.184370756149292, "rewards/margins": 4.831629753112793, "rewards/rejected": -3.64725923538208, "step": 676 }, { "epoch": 2.089265101962293, "grad_norm": 3.744312286376953, "learning_rate": 7.913580246913581e-07, "logits/chosen": -2.74735164642334, "logits/rejected": -2.7428038120269775, "logps/chosen": -36.46965789794922, "logps/rejected": -100.28988647460938, "loss": 0.5118, "rewards/accuracies": 0.96875, "rewards/chosen": 2.06321120262146, "rewards/margins": 4.802245140075684, "rewards/rejected": -2.7390341758728027, "step": 677 }, { "epoch": 2.0923432089265104, "grad_norm": 6.416472911834717, "learning_rate": 7.910493827160494e-07, "logits/chosen": -2.7457034587860107, "logits/rejected": -2.713500499725342, "logps/chosen": -54.736419677734375, "logps/rejected": -129.5870361328125, "loss": 0.5622, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5358284711837769, "rewards/margins": 5.148530006408691, "rewards/rejected": -4.612701416015625, "step": 678 }, { "epoch": 2.0954213158907273, "grad_norm": 5.055352687835693, "learning_rate": 7.907407407407407e-07, "logits/chosen": -2.797982692718506, "logits/rejected": -2.73544979095459, "logps/chosen": -40.84853744506836, "logps/rejected": -116.2738265991211, "loss": 0.459, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5061488151550293, "rewards/margins": 5.574873924255371, "rewards/rejected": -4.068725109100342, "step": 679 }, { "epoch": 2.0984994228549443, "grad_norm": 3.8145830631256104, "learning_rate": 7.904320987654321e-07, "logits/chosen": -2.7023346424102783, "logits/rejected": -2.688183307647705, "logps/chosen": -51.68669128417969, "logps/rejected": -120.0810775756836, "loss": 0.5422, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9016408324241638, "rewards/margins": 4.805737018585205, "rewards/rejected": -3.9040961265563965, "step": 680 }, { "epoch": 2.1015775298191612, "grad_norm": 5.999112606048584, "learning_rate": 7.901234567901234e-07, "logits/chosen": -2.7200536727905273, "logits/rejected": -2.7040257453918457, "logps/chosen": -39.11464309692383, "logps/rejected": -113.26041412353516, "loss": 0.4954, "rewards/accuracies": 0.875, "rewards/chosen": 1.8124170303344727, "rewards/margins": 5.184593677520752, "rewards/rejected": -3.372175931930542, "step": 681 }, { "epoch": 2.104655636783378, "grad_norm": 4.333581924438477, "learning_rate": 7.898148148148148e-07, "logits/chosen": -2.813277244567871, "logits/rejected": -2.8483591079711914, "logps/chosen": -48.070098876953125, "logps/rejected": -119.23896789550781, "loss": 0.4849, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2774133682250977, "rewards/margins": 5.033026695251465, "rewards/rejected": -3.7556140422821045, "step": 682 }, { "epoch": 2.107733743747595, "grad_norm": 4.600674152374268, "learning_rate": 7.895061728395062e-07, "logits/chosen": -2.765043258666992, "logits/rejected": -2.7724666595458984, "logps/chosen": -53.12898635864258, "logps/rejected": -124.60729217529297, "loss": 0.5199, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8167502880096436, "rewards/margins": 5.294604301452637, "rewards/rejected": -4.477853775024414, "step": 683 }, { "epoch": 2.110811850711812, "grad_norm": 4.4392924308776855, "learning_rate": 7.891975308641975e-07, "logits/chosen": -2.7401156425476074, "logits/rejected": -2.7401630878448486, "logps/chosen": -45.64852523803711, "logps/rejected": -117.08628845214844, "loss": 0.5579, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2271050214767456, "rewards/margins": 5.240917682647705, "rewards/rejected": -4.01381254196167, "step": 684 }, { "epoch": 2.113889957676029, "grad_norm": 6.422277927398682, "learning_rate": 7.888888888888889e-07, "logits/chosen": -2.775425910949707, "logits/rejected": -2.788558006286621, "logps/chosen": -42.779518127441406, "logps/rejected": -139.33262634277344, "loss": 0.4345, "rewards/accuracies": 0.96875, "rewards/chosen": 1.216821312904358, "rewards/margins": 6.768884181976318, "rewards/rejected": -5.552062511444092, "step": 685 }, { "epoch": 2.1169680646402464, "grad_norm": 4.567877769470215, "learning_rate": 7.885802469135802e-07, "logits/chosen": -2.771615743637085, "logits/rejected": -2.754513740539551, "logps/chosen": -40.607975006103516, "logps/rejected": -114.84574890136719, "loss": 0.4465, "rewards/accuracies": 0.9375, "rewards/chosen": 1.747258186340332, "rewards/margins": 5.459510326385498, "rewards/rejected": -3.712252140045166, "step": 686 }, { "epoch": 2.1200461716044634, "grad_norm": 5.623806953430176, "learning_rate": 7.882716049382716e-07, "logits/chosen": -2.805680751800537, "logits/rejected": -2.7886950969696045, "logps/chosen": -48.92798614501953, "logps/rejected": -120.12184143066406, "loss": 0.4604, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0270220041275024, "rewards/margins": 5.425939083099365, "rewards/rejected": -4.398916721343994, "step": 687 }, { "epoch": 2.1231242785686804, "grad_norm": 4.341281414031982, "learning_rate": 7.87962962962963e-07, "logits/chosen": -2.6841983795166016, "logits/rejected": -2.779669761657715, "logps/chosen": -39.771915435791016, "logps/rejected": -108.19911193847656, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 1.860548973083496, "rewards/margins": 5.119799613952637, "rewards/rejected": -3.2592501640319824, "step": 688 }, { "epoch": 2.1262023855328973, "grad_norm": 6.458545684814453, "learning_rate": 7.876543209876543e-07, "logits/chosen": -2.674532175064087, "logits/rejected": -2.727562665939331, "logps/chosen": -45.7528076171875, "logps/rejected": -107.51600646972656, "loss": 0.4498, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2419449090957642, "rewards/margins": 4.578522205352783, "rewards/rejected": -3.3365776538848877, "step": 689 }, { "epoch": 2.1292804924971143, "grad_norm": 5.211511611938477, "learning_rate": 7.873456790123456e-07, "logits/chosen": -2.760814666748047, "logits/rejected": -2.715049982070923, "logps/chosen": -46.4532585144043, "logps/rejected": -130.3008270263672, "loss": 0.4206, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1750166416168213, "rewards/margins": 6.090671539306641, "rewards/rejected": -4.915655136108398, "step": 690 }, { "epoch": 2.132358599461331, "grad_norm": 5.167674541473389, "learning_rate": 7.870370370370371e-07, "logits/chosen": -2.7528722286224365, "logits/rejected": -2.7942709922790527, "logps/chosen": -41.222721099853516, "logps/rejected": -111.04878234863281, "loss": 0.4376, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9181214570999146, "rewards/margins": 5.381793022155762, "rewards/rejected": -3.4636716842651367, "step": 691 }, { "epoch": 2.135436706425548, "grad_norm": 4.77363395690918, "learning_rate": 7.867283950617284e-07, "logits/chosen": -2.751138210296631, "logits/rejected": -2.74243426322937, "logps/chosen": -40.69942092895508, "logps/rejected": -141.6184539794922, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 1.7946714162826538, "rewards/margins": 7.347445487976074, "rewards/rejected": -5.552773475646973, "step": 692 }, { "epoch": 2.138514813389765, "grad_norm": 3.86557936668396, "learning_rate": 7.864197530864197e-07, "logits/chosen": -2.696645975112915, "logits/rejected": -2.771338701248169, "logps/chosen": -51.29063034057617, "logps/rejected": -114.82000732421875, "loss": 0.5665, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5694479942321777, "rewards/margins": 4.830585479736328, "rewards/rejected": -4.26113748550415, "step": 693 }, { "epoch": 2.1415929203539825, "grad_norm": 6.1495208740234375, "learning_rate": 7.861111111111111e-07, "logits/chosen": -2.721668243408203, "logits/rejected": -2.7269487380981445, "logps/chosen": -56.36588668823242, "logps/rejected": -133.6355743408203, "loss": 0.5922, "rewards/accuracies": 1.0, "rewards/chosen": 0.5951437950134277, "rewards/margins": 5.530899524688721, "rewards/rejected": -4.935755729675293, "step": 694 }, { "epoch": 2.1446710273181995, "grad_norm": 6.056707859039307, "learning_rate": 7.858024691358024e-07, "logits/chosen": -2.7854135036468506, "logits/rejected": -2.7777328491210938, "logps/chosen": -42.553985595703125, "logps/rejected": -93.567626953125, "loss": 0.5933, "rewards/accuracies": 0.96875, "rewards/chosen": 1.690410852432251, "rewards/margins": 4.116422653198242, "rewards/rejected": -2.426011562347412, "step": 695 }, { "epoch": 2.1477491342824164, "grad_norm": 4.539950370788574, "learning_rate": 7.854938271604939e-07, "logits/chosen": -2.7280709743499756, "logits/rejected": -2.727569103240967, "logps/chosen": -38.01907730102539, "logps/rejected": -118.45905303955078, "loss": 0.424, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6878721714019775, "rewards/margins": 5.770634174346924, "rewards/rejected": -4.082761764526367, "step": 696 }, { "epoch": 2.1508272412466334, "grad_norm": 4.559895992279053, "learning_rate": 7.851851851851852e-07, "logits/chosen": -2.68996524810791, "logits/rejected": -2.701000452041626, "logps/chosen": -42.885684967041016, "logps/rejected": -117.84453582763672, "loss": 0.52, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4171955585479736, "rewards/margins": 5.103404998779297, "rewards/rejected": -3.6862099170684814, "step": 697 }, { "epoch": 2.1539053482108503, "grad_norm": 4.820269584655762, "learning_rate": 7.848765432098765e-07, "logits/chosen": -2.6914544105529785, "logits/rejected": -2.735107660293579, "logps/chosen": -43.792198181152344, "logps/rejected": -120.3994369506836, "loss": 0.4594, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5275516510009766, "rewards/margins": 5.620519638061523, "rewards/rejected": -4.092967987060547, "step": 698 }, { "epoch": 2.1569834551750673, "grad_norm": 4.3422322273254395, "learning_rate": 7.845679012345679e-07, "logits/chosen": -2.8476855754852295, "logits/rejected": -2.878347635269165, "logps/chosen": -50.714176177978516, "logps/rejected": -110.72317504882812, "loss": 0.6039, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7456731796264648, "rewards/margins": 4.387017250061035, "rewards/rejected": -3.6413443088531494, "step": 699 }, { "epoch": 2.1600615621392842, "grad_norm": 4.429203987121582, "learning_rate": 7.842592592592592e-07, "logits/chosen": -2.7662460803985596, "logits/rejected": -2.771484613418579, "logps/chosen": -41.872413635253906, "logps/rejected": -107.00012969970703, "loss": 0.5232, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5992881059646606, "rewards/margins": 4.462174415588379, "rewards/rejected": -2.862886428833008, "step": 700 }, { "epoch": 2.163139669103501, "grad_norm": 4.933664321899414, "learning_rate": 7.839506172839506e-07, "logits/chosen": -2.765636444091797, "logits/rejected": -2.7656445503234863, "logps/chosen": -35.81806182861328, "logps/rejected": -113.5915756225586, "loss": 0.4499, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1879539489746094, "rewards/margins": 5.656737327575684, "rewards/rejected": -3.468783378601074, "step": 701 }, { "epoch": 2.166217776067718, "grad_norm": 5.3687744140625, "learning_rate": 7.83641975308642e-07, "logits/chosen": -2.719531297683716, "logits/rejected": -2.715420961380005, "logps/chosen": -54.9886589050293, "logps/rejected": -124.69091796875, "loss": 0.5688, "rewards/accuracies": 0.875, "rewards/chosen": 0.6109014749526978, "rewards/margins": 4.915565013885498, "rewards/rejected": -4.30466365814209, "step": 702 }, { "epoch": 2.1692958830319355, "grad_norm": 4.272634983062744, "learning_rate": 7.833333333333333e-07, "logits/chosen": -2.753598690032959, "logits/rejected": -2.8421289920806885, "logps/chosen": -47.791534423828125, "logps/rejected": -126.87093353271484, "loss": 0.4816, "rewards/accuracies": 1.0, "rewards/chosen": 1.1001861095428467, "rewards/margins": 5.648505210876465, "rewards/rejected": -4.548319339752197, "step": 703 }, { "epoch": 2.1723739899961525, "grad_norm": 4.334601879119873, "learning_rate": 7.830246913580246e-07, "logits/chosen": -2.787433624267578, "logits/rejected": -2.7951111793518066, "logps/chosen": -40.270259857177734, "logps/rejected": -106.03976440429688, "loss": 0.5312, "rewards/accuracies": 0.875, "rewards/chosen": 2.0778346061706543, "rewards/margins": 4.990180015563965, "rewards/rejected": -2.9123454093933105, "step": 704 }, { "epoch": 2.1754520969603695, "grad_norm": 5.663484573364258, "learning_rate": 7.82716049382716e-07, "logits/chosen": -2.677539587020874, "logits/rejected": -2.733924388885498, "logps/chosen": -37.36760711669922, "logps/rejected": -100.25263977050781, "loss": 0.5383, "rewards/accuracies": 0.875, "rewards/chosen": 1.8684698343276978, "rewards/margins": 4.476319789886475, "rewards/rejected": -2.6078500747680664, "step": 705 }, { "epoch": 2.1785302039245864, "grad_norm": 5.682709693908691, "learning_rate": 7.824074074074074e-07, "logits/chosen": -2.7771711349487305, "logits/rejected": -2.7706429958343506, "logps/chosen": -50.48872756958008, "logps/rejected": -141.9869842529297, "loss": 0.451, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8013976216316223, "rewards/margins": 6.438893795013428, "rewards/rejected": -5.6374969482421875, "step": 706 }, { "epoch": 2.1816083108888034, "grad_norm": 3.939972162246704, "learning_rate": 7.820987654320988e-07, "logits/chosen": -2.7441904544830322, "logits/rejected": -2.74775767326355, "logps/chosen": -42.25829315185547, "logps/rejected": -125.83683013916016, "loss": 0.4896, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3684403896331787, "rewards/margins": 5.856513023376465, "rewards/rejected": -4.488071918487549, "step": 707 }, { "epoch": 2.1846864178530203, "grad_norm": 5.35056734085083, "learning_rate": 7.817901234567901e-07, "logits/chosen": -2.6306333541870117, "logits/rejected": -2.6491189002990723, "logps/chosen": -46.438175201416016, "logps/rejected": -116.99906921386719, "loss": 0.4354, "rewards/accuracies": 0.90625, "rewards/chosen": 1.287505865097046, "rewards/margins": 5.18880558013916, "rewards/rejected": -3.901299476623535, "step": 708 }, { "epoch": 2.1877645248172373, "grad_norm": 4.110047817230225, "learning_rate": 7.814814814814814e-07, "logits/chosen": -2.7049312591552734, "logits/rejected": -2.621037721633911, "logps/chosen": -40.84932327270508, "logps/rejected": -113.0994644165039, "loss": 0.4565, "rewards/accuracies": 0.96875, "rewards/chosen": 1.701019525527954, "rewards/margins": 5.295424938201904, "rewards/rejected": -3.594405174255371, "step": 709 }, { "epoch": 2.190842631781454, "grad_norm": 7.056813716888428, "learning_rate": 7.811728395061729e-07, "logits/chosen": -2.7021877765655518, "logits/rejected": -2.669762134552002, "logps/chosen": -39.1102294921875, "logps/rejected": -86.96807098388672, "loss": 0.589, "rewards/accuracies": 0.84375, "rewards/chosen": 1.924004077911377, "rewards/margins": 3.5685110092163086, "rewards/rejected": -1.6445066928863525, "step": 710 }, { "epoch": 2.1939207387456716, "grad_norm": 3.706124782562256, "learning_rate": 7.808641975308642e-07, "logits/chosen": -2.805250406265259, "logits/rejected": -2.79190731048584, "logps/chosen": -39.43463134765625, "logps/rejected": -108.42373657226562, "loss": 0.5315, "rewards/accuracies": 0.9375, "rewards/chosen": 1.764954686164856, "rewards/margins": 5.128754615783691, "rewards/rejected": -3.363799571990967, "step": 711 }, { "epoch": 2.1969988457098886, "grad_norm": 3.702368974685669, "learning_rate": 7.805555555555555e-07, "logits/chosen": -2.7647621631622314, "logits/rejected": -2.739956855773926, "logps/chosen": -44.33161926269531, "logps/rejected": -120.80648803710938, "loss": 0.5048, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5278308391571045, "rewards/margins": 5.613738059997559, "rewards/rejected": -4.085906982421875, "step": 712 }, { "epoch": 2.2000769526741055, "grad_norm": 5.010550022125244, "learning_rate": 7.802469135802469e-07, "logits/chosen": -2.7628629207611084, "logits/rejected": -2.6942172050476074, "logps/chosen": -50.704627990722656, "logps/rejected": -133.28076171875, "loss": 0.5435, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9214507341384888, "rewards/margins": 5.606963634490967, "rewards/rejected": -4.685513019561768, "step": 713 }, { "epoch": 2.2031550596383225, "grad_norm": 3.0857839584350586, "learning_rate": 7.799382716049382e-07, "logits/chosen": -2.6815147399902344, "logits/rejected": -2.7244861125946045, "logps/chosen": -44.198150634765625, "logps/rejected": -111.67005157470703, "loss": 0.5564, "rewards/accuracies": 0.875, "rewards/chosen": 1.3576266765594482, "rewards/margins": 4.958220958709717, "rewards/rejected": -3.6005940437316895, "step": 714 }, { "epoch": 2.2062331666025394, "grad_norm": 5.964361667633057, "learning_rate": 7.796296296296296e-07, "logits/chosen": -2.773184061050415, "logits/rejected": -2.8214871883392334, "logps/chosen": -52.753936767578125, "logps/rejected": -117.2070541381836, "loss": 0.5667, "rewards/accuracies": 0.875, "rewards/chosen": 0.8778300881385803, "rewards/margins": 4.551321506500244, "rewards/rejected": -3.673491954803467, "step": 715 }, { "epoch": 2.2062331666025394, "eval_logits/chosen": -2.6935629844665527, "eval_logits/rejected": -2.737121343612671, "eval_logps/chosen": -44.78866195678711, "eval_logps/rejected": -115.40980529785156, "eval_loss": 0.5767555832862854, "eval_rewards/accuracies": 0.9017341136932373, "eval_rewards/chosen": 1.3485136032104492, "eval_rewards/margins": 4.872910022735596, "eval_rewards/rejected": -3.5243961811065674, "eval_runtime": 638.4804, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.271, "step": 715 }, { "epoch": 2.2093112735667564, "grad_norm": 5.137154579162598, "learning_rate": 7.79320987654321e-07, "logits/chosen": -2.69399094581604, "logits/rejected": -2.695570468902588, "logps/chosen": -36.731964111328125, "logps/rejected": -118.92082214355469, "loss": 0.4282, "rewards/accuracies": 0.96875, "rewards/chosen": 1.984549641609192, "rewards/margins": 6.029061794281006, "rewards/rejected": -4.044511795043945, "step": 716 }, { "epoch": 2.2123893805309733, "grad_norm": 4.581099987030029, "learning_rate": 7.790123456790123e-07, "logits/chosen": -2.7574357986450195, "logits/rejected": -2.76393461227417, "logps/chosen": -37.405479431152344, "logps/rejected": -108.22081756591797, "loss": 0.4833, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9559075832366943, "rewards/margins": 5.371187686920166, "rewards/rejected": -3.415280342102051, "step": 717 }, { "epoch": 2.2154674874951903, "grad_norm": 4.336902141571045, "learning_rate": 7.787037037037037e-07, "logits/chosen": -2.6800074577331543, "logits/rejected": -2.786555528640747, "logps/chosen": -52.25389862060547, "logps/rejected": -134.56651306152344, "loss": 0.465, "rewards/accuracies": 1.0, "rewards/chosen": 0.9368507862091064, "rewards/margins": 6.236032485961914, "rewards/rejected": -5.2991814613342285, "step": 718 }, { "epoch": 2.2185455944594077, "grad_norm": 5.837581157684326, "learning_rate": 7.78395061728395e-07, "logits/chosen": -2.7315475940704346, "logits/rejected": -2.7442262172698975, "logps/chosen": -38.30225372314453, "logps/rejected": -104.62523651123047, "loss": 0.5178, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9446748495101929, "rewards/margins": 4.782560348510742, "rewards/rejected": -2.8378853797912598, "step": 719 }, { "epoch": 2.2216237014236246, "grad_norm": 5.259190082550049, "learning_rate": 7.780864197530864e-07, "logits/chosen": -2.779827833175659, "logits/rejected": -2.7953779697418213, "logps/chosen": -36.745548248291016, "logps/rejected": -91.73506164550781, "loss": 0.5722, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0211799144744873, "rewards/margins": 3.8937790393829346, "rewards/rejected": -1.8725991249084473, "step": 720 }, { "epoch": 2.2247018083878416, "grad_norm": 5.710188388824463, "learning_rate": 7.777777777777778e-07, "logits/chosen": -2.699481248855591, "logits/rejected": -2.695807456970215, "logps/chosen": -54.427734375, "logps/rejected": -123.64415740966797, "loss": 0.5627, "rewards/accuracies": 0.9375, "rewards/chosen": 0.666202187538147, "rewards/margins": 4.780368804931641, "rewards/rejected": -4.114166736602783, "step": 721 }, { "epoch": 2.2277799153520585, "grad_norm": 6.781929016113281, "learning_rate": 7.774691358024691e-07, "logits/chosen": -2.7685630321502686, "logits/rejected": -2.6915597915649414, "logps/chosen": -32.77450180053711, "logps/rejected": -105.25363159179688, "loss": 0.4748, "rewards/accuracies": 0.9375, "rewards/chosen": 2.334587335586548, "rewards/margins": 5.174844741821289, "rewards/rejected": -2.8402578830718994, "step": 722 }, { "epoch": 2.2308580223162755, "grad_norm": 5.611055374145508, "learning_rate": 7.771604938271604e-07, "logits/chosen": -2.777209758758545, "logits/rejected": -2.8475890159606934, "logps/chosen": -41.51122283935547, "logps/rejected": -116.65461730957031, "loss": 0.4241, "rewards/accuracies": 0.96875, "rewards/chosen": 1.777789831161499, "rewards/margins": 5.4264912605285645, "rewards/rejected": -3.6487014293670654, "step": 723 }, { "epoch": 2.2339361292804925, "grad_norm": 5.6109700202941895, "learning_rate": 7.768518518518519e-07, "logits/chosen": -2.7806057929992676, "logits/rejected": -2.7510173320770264, "logps/chosen": -51.23543930053711, "logps/rejected": -124.82920837402344, "loss": 0.5524, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9841482639312744, "rewards/margins": 5.336513519287109, "rewards/rejected": -4.352364540100098, "step": 724 }, { "epoch": 2.2370142362447094, "grad_norm": 4.4726786613464355, "learning_rate": 7.765432098765432e-07, "logits/chosen": -2.778571128845215, "logits/rejected": -2.7573065757751465, "logps/chosen": -39.374000549316406, "logps/rejected": -119.75334930419922, "loss": 0.4504, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7310631275177002, "rewards/margins": 6.136997699737549, "rewards/rejected": -4.4059343338012695, "step": 725 }, { "epoch": 2.2400923432089264, "grad_norm": 6.098813056945801, "learning_rate": 7.762345679012345e-07, "logits/chosen": -2.8292155265808105, "logits/rejected": -2.8519973754882812, "logps/chosen": -43.24686813354492, "logps/rejected": -129.82545471191406, "loss": 0.4723, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8309727907180786, "rewards/margins": 6.515277862548828, "rewards/rejected": -4.684304714202881, "step": 726 }, { "epoch": 2.2431704501731433, "grad_norm": 5.975611209869385, "learning_rate": 7.759259259259259e-07, "logits/chosen": -2.8549275398254395, "logits/rejected": -2.746501922607422, "logps/chosen": -51.54450225830078, "logps/rejected": -122.17623901367188, "loss": 0.5133, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9132692813873291, "rewards/margins": 5.0826873779296875, "rewards/rejected": -4.169417858123779, "step": 727 }, { "epoch": 2.2462485571373607, "grad_norm": 3.9743826389312744, "learning_rate": 7.756172839506172e-07, "logits/chosen": -2.7416434288024902, "logits/rejected": -2.6738991737365723, "logps/chosen": -35.11714172363281, "logps/rejected": -103.49589538574219, "loss": 0.5189, "rewards/accuracies": 0.96875, "rewards/chosen": 1.945455551147461, "rewards/margins": 4.975276470184326, "rewards/rejected": -3.0298211574554443, "step": 728 }, { "epoch": 2.2493266641015777, "grad_norm": 3.7954180240631104, "learning_rate": 7.753086419753087e-07, "logits/chosen": -2.6786203384399414, "logits/rejected": -2.72318959236145, "logps/chosen": -35.34229278564453, "logps/rejected": -100.13231658935547, "loss": 0.5561, "rewards/accuracies": 0.90625, "rewards/chosen": 2.018073081970215, "rewards/margins": 4.875677108764648, "rewards/rejected": -2.8576035499572754, "step": 729 }, { "epoch": 2.2524047710657946, "grad_norm": 3.3727636337280273, "learning_rate": 7.75e-07, "logits/chosen": -2.6874520778656006, "logits/rejected": -2.727527618408203, "logps/chosen": -54.69327163696289, "logps/rejected": -121.51007080078125, "loss": 0.6488, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7170959711074829, "rewards/margins": 4.754605293273926, "rewards/rejected": -4.037509441375732, "step": 730 }, { "epoch": 2.2554828780300116, "grad_norm": 5.265850067138672, "learning_rate": 7.746913580246913e-07, "logits/chosen": -2.714725971221924, "logits/rejected": -2.6570348739624023, "logps/chosen": -45.090576171875, "logps/rejected": -110.0130615234375, "loss": 0.5302, "rewards/accuracies": 0.90625, "rewards/chosen": 1.642831802368164, "rewards/margins": 4.831244945526123, "rewards/rejected": -3.18841290473938, "step": 731 }, { "epoch": 2.2585609849942285, "grad_norm": 5.492014408111572, "learning_rate": 7.743827160493827e-07, "logits/chosen": -2.7121543884277344, "logits/rejected": -2.70626163482666, "logps/chosen": -45.38597869873047, "logps/rejected": -119.36134338378906, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 1.5398982763290405, "rewards/margins": 5.408300399780273, "rewards/rejected": -3.8684024810791016, "step": 732 }, { "epoch": 2.2616390919584455, "grad_norm": 5.3935346603393555, "learning_rate": 7.74074074074074e-07, "logits/chosen": -2.7626359462738037, "logits/rejected": -2.7242093086242676, "logps/chosen": -46.17121124267578, "logps/rejected": -126.88533020019531, "loss": 0.571, "rewards/accuracies": 1.0, "rewards/chosen": 1.3178786039352417, "rewards/margins": 5.519165515899658, "rewards/rejected": -4.201287269592285, "step": 733 }, { "epoch": 2.2647171989226624, "grad_norm": 5.503937721252441, "learning_rate": 7.737654320987654e-07, "logits/chosen": -2.7663562297821045, "logits/rejected": -2.8004653453826904, "logps/chosen": -34.394813537597656, "logps/rejected": -113.0761947631836, "loss": 0.3855, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1828861236572266, "rewards/margins": 5.875726699829102, "rewards/rejected": -3.692840814590454, "step": 734 }, { "epoch": 2.2677953058868794, "grad_norm": 5.93931770324707, "learning_rate": 7.734567901234568e-07, "logits/chosen": -2.800166606903076, "logits/rejected": -2.7563111782073975, "logps/chosen": -56.41410827636719, "logps/rejected": -119.40133666992188, "loss": 0.5286, "rewards/accuracies": 1.0, "rewards/chosen": 0.41539955139160156, "rewards/margins": 4.538458824157715, "rewards/rejected": -4.123059272766113, "step": 735 }, { "epoch": 2.2708734128510963, "grad_norm": 4.499582290649414, "learning_rate": 7.731481481481481e-07, "logits/chosen": -2.796593189239502, "logits/rejected": -2.658470630645752, "logps/chosen": -44.88767623901367, "logps/rejected": -109.48661804199219, "loss": 0.5768, "rewards/accuracies": 0.90625, "rewards/chosen": 1.175431728363037, "rewards/margins": 4.533977508544922, "rewards/rejected": -3.3585457801818848, "step": 736 }, { "epoch": 2.2739515198153137, "grad_norm": 4.4064788818359375, "learning_rate": 7.728395061728394e-07, "logits/chosen": -2.7705678939819336, "logits/rejected": -2.728102445602417, "logps/chosen": -41.619850158691406, "logps/rejected": -101.04845428466797, "loss": 0.5917, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3691589832305908, "rewards/margins": 4.31081485748291, "rewards/rejected": -2.941655397415161, "step": 737 }, { "epoch": 2.2770296267795307, "grad_norm": 5.6192307472229, "learning_rate": 7.725308641975308e-07, "logits/chosen": -2.762657880783081, "logits/rejected": -2.731760025024414, "logps/chosen": -41.68611145019531, "logps/rejected": -100.3373031616211, "loss": 0.5885, "rewards/accuracies": 0.875, "rewards/chosen": 1.3896853923797607, "rewards/margins": 4.541407585144043, "rewards/rejected": -3.1517229080200195, "step": 738 }, { "epoch": 2.2801077337437476, "grad_norm": 4.453019142150879, "learning_rate": 7.722222222222222e-07, "logits/chosen": -2.7248752117156982, "logits/rejected": -2.735945463180542, "logps/chosen": -46.354549407958984, "logps/rejected": -127.3680191040039, "loss": 0.4708, "rewards/accuracies": 1.0, "rewards/chosen": 1.3703488111495972, "rewards/margins": 5.899259567260742, "rewards/rejected": -4.528911113739014, "step": 739 }, { "epoch": 2.2831858407079646, "grad_norm": 5.6703643798828125, "learning_rate": 7.719135802469136e-07, "logits/chosen": -2.6478209495544434, "logits/rejected": -2.730762004852295, "logps/chosen": -44.49723434448242, "logps/rejected": -124.88117218017578, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 1.3482670783996582, "rewards/margins": 5.931793212890625, "rewards/rejected": -4.583526134490967, "step": 740 }, { "epoch": 2.2862639476721816, "grad_norm": 4.128565311431885, "learning_rate": 7.716049382716049e-07, "logits/chosen": -2.7137913703918457, "logits/rejected": -2.7268259525299072, "logps/chosen": -49.92973709106445, "logps/rejected": -130.03118896484375, "loss": 0.4992, "rewards/accuracies": 0.90625, "rewards/chosen": 1.212810754776001, "rewards/margins": 5.835278511047363, "rewards/rejected": -4.622467517852783, "step": 741 }, { "epoch": 2.2893420546363985, "grad_norm": 4.858769416809082, "learning_rate": 7.712962962962962e-07, "logits/chosen": -2.705662965774536, "logits/rejected": -2.773768663406372, "logps/chosen": -40.20744705200195, "logps/rejected": -114.24485778808594, "loss": 0.4422, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8372018337249756, "rewards/margins": 5.471485137939453, "rewards/rejected": -3.6342835426330566, "step": 742 }, { "epoch": 2.2924201616006155, "grad_norm": 5.797990322113037, "learning_rate": 7.709876543209877e-07, "logits/chosen": -2.8227720260620117, "logits/rejected": -2.7584824562072754, "logps/chosen": -60.94245529174805, "logps/rejected": -127.00942993164062, "loss": 0.5857, "rewards/accuracies": 0.875, "rewards/chosen": 0.469637930393219, "rewards/margins": 4.795527458190918, "rewards/rejected": -4.3258891105651855, "step": 743 }, { "epoch": 2.295498268564833, "grad_norm": 10.93332290649414, "learning_rate": 7.70679012345679e-07, "logits/chosen": -2.6775994300842285, "logits/rejected": -2.6769049167633057, "logps/chosen": -46.04383850097656, "logps/rejected": -116.42697143554688, "loss": 0.5468, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3653807640075684, "rewards/margins": 5.193789005279541, "rewards/rejected": -3.828408718109131, "step": 744 }, { "epoch": 2.29857637552905, "grad_norm": 5.447676658630371, "learning_rate": 7.703703703703703e-07, "logits/chosen": -2.7611804008483887, "logits/rejected": -2.7913691997528076, "logps/chosen": -34.317779541015625, "logps/rejected": -111.16848754882812, "loss": 0.4044, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4702367782592773, "rewards/margins": 6.013836860656738, "rewards/rejected": -3.543600559234619, "step": 745 }, { "epoch": 2.3016544824932668, "grad_norm": 6.501634120941162, "learning_rate": 7.700617283950617e-07, "logits/chosen": -2.693004608154297, "logits/rejected": -2.7292613983154297, "logps/chosen": -46.13459396362305, "logps/rejected": -101.3446273803711, "loss": 0.6553, "rewards/accuracies": 0.875, "rewards/chosen": 1.2047654390335083, "rewards/margins": 4.057995319366455, "rewards/rejected": -2.8532299995422363, "step": 746 }, { "epoch": 2.3047325894574837, "grad_norm": 4.748025894165039, "learning_rate": 7.69753086419753e-07, "logits/chosen": -2.586886405944824, "logits/rejected": -2.712784767150879, "logps/chosen": -48.02671432495117, "logps/rejected": -110.34345245361328, "loss": 0.5608, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9296358227729797, "rewards/margins": 4.451066493988037, "rewards/rejected": -3.521430253982544, "step": 747 }, { "epoch": 2.3078106964217007, "grad_norm": 5.702347755432129, "learning_rate": 7.694444444444445e-07, "logits/chosen": -2.7276759147644043, "logits/rejected": -2.7212767601013184, "logps/chosen": -50.69841384887695, "logps/rejected": -127.81804656982422, "loss": 0.4828, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9029543399810791, "rewards/margins": 5.589543342590332, "rewards/rejected": -4.686589241027832, "step": 748 }, { "epoch": 2.3108888033859176, "grad_norm": 6.507685661315918, "learning_rate": 7.691358024691358e-07, "logits/chosen": -2.696474552154541, "logits/rejected": -2.6626932621002197, "logps/chosen": -41.612789154052734, "logps/rejected": -117.7003402709961, "loss": 0.466, "rewards/accuracies": 1.0, "rewards/chosen": 1.600748062133789, "rewards/margins": 5.5128493309021, "rewards/rejected": -3.9121010303497314, "step": 749 }, { "epoch": 2.3139669103501346, "grad_norm": 5.432562351226807, "learning_rate": 7.688271604938271e-07, "logits/chosen": -2.79021954536438, "logits/rejected": -2.771630048751831, "logps/chosen": -48.50384521484375, "logps/rejected": -119.32493591308594, "loss": 0.4779, "rewards/accuracies": 1.0, "rewards/chosen": 1.3048279285430908, "rewards/margins": 5.098104000091553, "rewards/rejected": -3.793276786804199, "step": 750 }, { "epoch": 2.3170450173143515, "grad_norm": 5.1725640296936035, "learning_rate": 7.685185185185185e-07, "logits/chosen": -2.7242610454559326, "logits/rejected": -2.7088706493377686, "logps/chosen": -42.18305587768555, "logps/rejected": -121.64606475830078, "loss": 0.4959, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5102968215942383, "rewards/margins": 5.584077835083008, "rewards/rejected": -4.0737810134887695, "step": 751 }, { "epoch": 2.3201231242785685, "grad_norm": 5.6656599044799805, "learning_rate": 7.682098765432098e-07, "logits/chosen": -2.849191188812256, "logits/rejected": -2.7882397174835205, "logps/chosen": -46.42090606689453, "logps/rejected": -107.017333984375, "loss": 0.5916, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4011441469192505, "rewards/margins": 4.427544593811035, "rewards/rejected": -3.026400566101074, "step": 752 }, { "epoch": 2.323201231242786, "grad_norm": 5.542544841766357, "learning_rate": 7.679012345679012e-07, "logits/chosen": -2.8030612468719482, "logits/rejected": -2.830216407775879, "logps/chosen": -46.514869689941406, "logps/rejected": -125.3962631225586, "loss": 0.4345, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5192714929580688, "rewards/margins": 5.977634429931641, "rewards/rejected": -4.458362579345703, "step": 753 }, { "epoch": 2.326279338207003, "grad_norm": 5.2905964851379395, "learning_rate": 7.675925925925926e-07, "logits/chosen": -2.7609400749206543, "logits/rejected": -2.689150333404541, "logps/chosen": -38.2645378112793, "logps/rejected": -104.04777526855469, "loss": 0.5848, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8524984121322632, "rewards/margins": 4.6129021644592285, "rewards/rejected": -2.760403633117676, "step": 754 }, { "epoch": 2.32935744517122, "grad_norm": 3.6922426223754883, "learning_rate": 7.672839506172839e-07, "logits/chosen": -2.759018898010254, "logits/rejected": -2.7780284881591797, "logps/chosen": -40.02683639526367, "logps/rejected": -104.41864776611328, "loss": 0.5502, "rewards/accuracies": 0.78125, "rewards/chosen": 1.797154426574707, "rewards/margins": 4.741345405578613, "rewards/rejected": -2.9441909790039062, "step": 755 }, { "epoch": 2.3324355521354367, "grad_norm": 3.3620097637176514, "learning_rate": 7.669753086419752e-07, "logits/chosen": -2.7791430950164795, "logits/rejected": -2.7694172859191895, "logps/chosen": -49.6381950378418, "logps/rejected": -112.73805236816406, "loss": 0.5732, "rewards/accuracies": 0.875, "rewards/chosen": 1.1739015579223633, "rewards/margins": 4.812044143676758, "rewards/rejected": -3.6381421089172363, "step": 756 }, { "epoch": 2.3355136590996537, "grad_norm": 5.20897102355957, "learning_rate": 7.666666666666667e-07, "logits/chosen": -2.7928881645202637, "logits/rejected": -2.7676761150360107, "logps/chosen": -58.325157165527344, "logps/rejected": -130.27996826171875, "loss": 0.5456, "rewards/accuracies": 0.90625, "rewards/chosen": 0.31438612937927246, "rewards/margins": 5.2095441818237305, "rewards/rejected": -4.895157814025879, "step": 757 }, { "epoch": 2.3385917660638706, "grad_norm": 4.972716808319092, "learning_rate": 7.66358024691358e-07, "logits/chosen": -2.6407742500305176, "logits/rejected": -2.6423277854919434, "logps/chosen": -42.5771484375, "logps/rejected": -115.09931182861328, "loss": 0.5054, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7642232179641724, "rewards/margins": 5.356192588806152, "rewards/rejected": -3.5919690132141113, "step": 758 }, { "epoch": 2.3416698730280876, "grad_norm": 4.9358062744140625, "learning_rate": 7.660493827160494e-07, "logits/chosen": -2.756882667541504, "logits/rejected": -2.7814924716949463, "logps/chosen": -42.05128479003906, "logps/rejected": -117.98738098144531, "loss": 0.4729, "rewards/accuracies": 0.96875, "rewards/chosen": 1.551372766494751, "rewards/margins": 5.322296142578125, "rewards/rejected": -3.770923137664795, "step": 759 }, { "epoch": 2.3447479799923046, "grad_norm": 4.796830177307129, "learning_rate": 7.657407407407407e-07, "logits/chosen": -2.7362442016601562, "logits/rejected": -2.753638982772827, "logps/chosen": -47.169246673583984, "logps/rejected": -106.07298278808594, "loss": 0.6677, "rewards/accuracies": 0.875, "rewards/chosen": 1.0337034463882446, "rewards/margins": 4.224587917327881, "rewards/rejected": -3.190884590148926, "step": 760 }, { "epoch": 2.3478260869565215, "grad_norm": 4.837957859039307, "learning_rate": 7.65432098765432e-07, "logits/chosen": -2.703338146209717, "logits/rejected": -2.679704189300537, "logps/chosen": -48.29157638549805, "logps/rejected": -120.60589599609375, "loss": 0.5436, "rewards/accuracies": 0.875, "rewards/chosen": 1.289488434791565, "rewards/margins": 4.939645767211914, "rewards/rejected": -3.6501574516296387, "step": 761 }, { "epoch": 2.350904193920739, "grad_norm": 4.825180530548096, "learning_rate": 7.651234567901235e-07, "logits/chosen": -2.7215585708618164, "logits/rejected": -2.7142906188964844, "logps/chosen": -30.80881118774414, "logps/rejected": -116.57345581054688, "loss": 0.4166, "rewards/accuracies": 1.0, "rewards/chosen": 2.653218984603882, "rewards/margins": 6.38267707824707, "rewards/rejected": -3.7294580936431885, "step": 762 }, { "epoch": 2.353982300884956, "grad_norm": 6.093669414520264, "learning_rate": 7.648148148148148e-07, "logits/chosen": -2.693673610687256, "logits/rejected": -2.7323157787323, "logps/chosen": -53.31671905517578, "logps/rejected": -112.4281005859375, "loss": 0.619, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7810168266296387, "rewards/margins": 4.347545623779297, "rewards/rejected": -3.5665283203125, "step": 763 }, { "epoch": 2.357060407849173, "grad_norm": 6.063239574432373, "learning_rate": 7.645061728395061e-07, "logits/chosen": -2.749734401702881, "logits/rejected": -2.7226951122283936, "logps/chosen": -43.19233322143555, "logps/rejected": -119.25390625, "loss": 0.5393, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4870116710662842, "rewards/margins": 5.58094596862793, "rewards/rejected": -4.093934535980225, "step": 764 }, { "epoch": 2.3601385148133898, "grad_norm": 5.304174900054932, "learning_rate": 7.641975308641975e-07, "logits/chosen": -2.7578155994415283, "logits/rejected": -2.633923053741455, "logps/chosen": -43.52983093261719, "logps/rejected": -108.27580261230469, "loss": 0.5756, "rewards/accuracies": 0.9375, "rewards/chosen": 1.411332607269287, "rewards/margins": 4.79555606842041, "rewards/rejected": -3.384223699569702, "step": 765 }, { "epoch": 2.3632166217776067, "grad_norm": 6.018707275390625, "learning_rate": 7.638888888888888e-07, "logits/chosen": -2.701996326446533, "logits/rejected": -2.707244396209717, "logps/chosen": -41.391414642333984, "logps/rejected": -106.0760498046875, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": 1.80685293674469, "rewards/margins": 4.697969913482666, "rewards/rejected": -2.8911168575286865, "step": 766 }, { "epoch": 2.3662947287418237, "grad_norm": 4.778380870819092, "learning_rate": 7.635802469135802e-07, "logits/chosen": -2.684591054916382, "logits/rejected": -2.6732945442199707, "logps/chosen": -32.30967712402344, "logps/rejected": -123.24079132080078, "loss": 0.339, "rewards/accuracies": 1.0, "rewards/chosen": 2.448469400405884, "rewards/margins": 6.922240257263184, "rewards/rejected": -4.473771095275879, "step": 767 }, { "epoch": 2.3693728357060406, "grad_norm": 4.36652946472168, "learning_rate": 7.632716049382716e-07, "logits/chosen": -2.656235933303833, "logits/rejected": -2.6555535793304443, "logps/chosen": -43.30083084106445, "logps/rejected": -117.63874053955078, "loss": 0.4803, "rewards/accuracies": 0.875, "rewards/chosen": 1.6415152549743652, "rewards/margins": 5.48613166809082, "rewards/rejected": -3.844616413116455, "step": 768 }, { "epoch": 2.372450942670258, "grad_norm": 5.204097270965576, "learning_rate": 7.629629629629629e-07, "logits/chosen": -2.7611682415008545, "logits/rejected": -2.7409207820892334, "logps/chosen": -36.45071029663086, "logps/rejected": -113.36222076416016, "loss": 0.4696, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1898419857025146, "rewards/margins": 5.52997350692749, "rewards/rejected": -3.3401312828063965, "step": 769 }, { "epoch": 2.375529049634475, "grad_norm": 4.774354934692383, "learning_rate": 7.626543209876543e-07, "logits/chosen": -2.714541435241699, "logits/rejected": -2.7433507442474365, "logps/chosen": -54.33820343017578, "logps/rejected": -108.85099029541016, "loss": 0.5731, "rewards/accuracies": 0.875, "rewards/chosen": 0.856283962726593, "rewards/margins": 4.320939540863037, "rewards/rejected": -3.464655876159668, "step": 770 }, { "epoch": 2.378607156598692, "grad_norm": 5.370654582977295, "learning_rate": 7.623456790123457e-07, "logits/chosen": -2.8254830837249756, "logits/rejected": -2.793095827102661, "logps/chosen": -53.989768981933594, "logps/rejected": -116.38390350341797, "loss": 0.5989, "rewards/accuracies": 0.96875, "rewards/chosen": 0.948235273361206, "rewards/margins": 4.510865688323975, "rewards/rejected": -3.5626308917999268, "step": 771 }, { "epoch": 2.381685263562909, "grad_norm": 4.291588306427002, "learning_rate": 7.62037037037037e-07, "logits/chosen": -2.581911563873291, "logits/rejected": -2.6681108474731445, "logps/chosen": -37.372840881347656, "logps/rejected": -102.80875396728516, "loss": 0.4876, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8180415630340576, "rewards/margins": 4.764037132263184, "rewards/rejected": -2.945996046066284, "step": 772 }, { "epoch": 2.384763370527126, "grad_norm": 6.166382789611816, "learning_rate": 7.617283950617284e-07, "logits/chosen": -2.6398799419403076, "logits/rejected": -2.7612814903259277, "logps/chosen": -45.736976623535156, "logps/rejected": -95.95069122314453, "loss": 0.5271, "rewards/accuracies": 0.875, "rewards/chosen": 1.3337836265563965, "rewards/margins": 4.030664920806885, "rewards/rejected": -2.6968812942504883, "step": 773 }, { "epoch": 2.387841477491343, "grad_norm": 5.043137550354004, "learning_rate": 7.614197530864197e-07, "logits/chosen": -2.72160005569458, "logits/rejected": -2.668081521987915, "logps/chosen": -45.85280227661133, "logps/rejected": -113.88223266601562, "loss": 0.4974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6013025045394897, "rewards/margins": 5.127467155456543, "rewards/rejected": -3.5261645317077637, "step": 774 }, { "epoch": 2.3909195844555597, "grad_norm": 3.702986240386963, "learning_rate": 7.61111111111111e-07, "logits/chosen": -2.739804744720459, "logits/rejected": -2.7403314113616943, "logps/chosen": -46.082576751708984, "logps/rejected": -122.30223846435547, "loss": 0.5053, "rewards/accuracies": 0.96875, "rewards/chosen": 1.361241340637207, "rewards/margins": 5.437224388122559, "rewards/rejected": -4.075982570648193, "step": 775 }, { "epoch": 2.3939976914197767, "grad_norm": 4.0628662109375, "learning_rate": 7.608024691358025e-07, "logits/chosen": -2.8191261291503906, "logits/rejected": -2.7616944313049316, "logps/chosen": -41.655357360839844, "logps/rejected": -118.66954040527344, "loss": 0.5015, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4230238199234009, "rewards/margins": 5.255645751953125, "rewards/rejected": -3.8326220512390137, "step": 776 }, { "epoch": 2.3970757983839937, "grad_norm": 6.390618801116943, "learning_rate": 7.604938271604938e-07, "logits/chosen": -2.687725305557251, "logits/rejected": -2.6788997650146484, "logps/chosen": -56.866783142089844, "logps/rejected": -131.05194091796875, "loss": 0.582, "rewards/accuracies": 0.9375, "rewards/chosen": 0.48488909006118774, "rewards/margins": 5.000405788421631, "rewards/rejected": -4.51551628112793, "step": 777 }, { "epoch": 2.400153905348211, "grad_norm": 4.383968830108643, "learning_rate": 7.601851851851851e-07, "logits/chosen": -2.753901958465576, "logits/rejected": -2.6678104400634766, "logps/chosen": -40.78691101074219, "logps/rejected": -120.53553771972656, "loss": 0.4739, "rewards/accuracies": 0.9375, "rewards/chosen": 1.965306043624878, "rewards/margins": 5.890353679656982, "rewards/rejected": -3.925046920776367, "step": 778 }, { "epoch": 2.403232012312428, "grad_norm": 5.205958843231201, "learning_rate": 7.598765432098765e-07, "logits/chosen": -2.726255416870117, "logits/rejected": -2.663527250289917, "logps/chosen": -45.23704528808594, "logps/rejected": -113.81511688232422, "loss": 0.4896, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1755746603012085, "rewards/margins": 4.785064697265625, "rewards/rejected": -3.609489917755127, "step": 779 }, { "epoch": 2.406310119276645, "grad_norm": 5.1723198890686035, "learning_rate": 7.595679012345678e-07, "logits/chosen": -2.701198101043701, "logits/rejected": -2.6653287410736084, "logps/chosen": -52.917850494384766, "logps/rejected": -120.9527359008789, "loss": 0.5725, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6265257596969604, "rewards/margins": 4.623675346374512, "rewards/rejected": -3.997149705886841, "step": 780 }, { "epoch": 2.406310119276645, "eval_logits/chosen": -2.669494867324829, "eval_logits/rejected": -2.7036006450653076, "eval_logps/chosen": -46.49936294555664, "eval_logps/rejected": -117.78170013427734, "eval_loss": 0.5788290500640869, "eval_rewards/accuracies": 0.9046242833137512, "eval_rewards/chosen": 1.1774436235427856, "eval_rewards/margins": 4.939029693603516, "eval_rewards/rejected": -3.7615861892700195, "eval_runtime": 638.1392, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.271, "step": 780 }, { "epoch": 2.409388226240862, "grad_norm": 5.167616367340088, "learning_rate": 7.592592592592593e-07, "logits/chosen": -2.783294200897217, "logits/rejected": -2.7618215084075928, "logps/chosen": -38.4740104675293, "logps/rejected": -103.42992401123047, "loss": 0.4684, "rewards/accuracies": 0.96875, "rewards/chosen": 2.27963924407959, "rewards/margins": 5.135547161102295, "rewards/rejected": -2.855907917022705, "step": 781 }, { "epoch": 2.412466333205079, "grad_norm": 8.313228607177734, "learning_rate": 7.589506172839506e-07, "logits/chosen": -2.715538501739502, "logits/rejected": -2.619720220565796, "logps/chosen": -42.701072692871094, "logps/rejected": -150.10691833496094, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 1.577957034111023, "rewards/margins": 7.296627998352051, "rewards/rejected": -5.718670845031738, "step": 782 }, { "epoch": 2.415544440169296, "grad_norm": 4.492687225341797, "learning_rate": 7.586419753086419e-07, "logits/chosen": -2.829857349395752, "logits/rejected": -2.75801157951355, "logps/chosen": -32.953670501708984, "logps/rejected": -106.8319320678711, "loss": 0.5185, "rewards/accuracies": 0.875, "rewards/chosen": 2.321429967880249, "rewards/margins": 5.443865776062012, "rewards/rejected": -3.1224358081817627, "step": 783 }, { "epoch": 2.4186225471335128, "grad_norm": 4.471456050872803, "learning_rate": 7.583333333333333e-07, "logits/chosen": -2.7483558654785156, "logits/rejected": -2.6962943077087402, "logps/chosen": -48.44480895996094, "logps/rejected": -127.29681396484375, "loss": 0.5201, "rewards/accuracies": 0.875, "rewards/chosen": 0.9961825013160706, "rewards/margins": 5.565019607543945, "rewards/rejected": -4.5688371658325195, "step": 784 }, { "epoch": 2.4217006540977297, "grad_norm": 6.029494285583496, "learning_rate": 7.580246913580246e-07, "logits/chosen": -2.7043025493621826, "logits/rejected": -2.680358648300171, "logps/chosen": -42.57699966430664, "logps/rejected": -108.84367370605469, "loss": 0.5291, "rewards/accuracies": 0.875, "rewards/chosen": 1.408258080482483, "rewards/margins": 4.930988788604736, "rewards/rejected": -3.522730827331543, "step": 785 }, { "epoch": 2.4247787610619467, "grad_norm": 5.327023983001709, "learning_rate": 7.57716049382716e-07, "logits/chosen": -2.6550681591033936, "logits/rejected": -2.5692741870880127, "logps/chosen": -43.02910614013672, "logps/rejected": -104.44395446777344, "loss": 0.5381, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2883384227752686, "rewards/margins": 4.19838285446167, "rewards/rejected": -2.9100446701049805, "step": 786 }, { "epoch": 2.427856868026164, "grad_norm": 5.461986541748047, "learning_rate": 7.574074074074074e-07, "logits/chosen": -2.734321355819702, "logits/rejected": -2.706057071685791, "logps/chosen": -60.06419372558594, "logps/rejected": -135.28378295898438, "loss": 0.5968, "rewards/accuracies": 0.875, "rewards/chosen": 0.08841607719659805, "rewards/margins": 5.000537872314453, "rewards/rejected": -4.912121295928955, "step": 787 }, { "epoch": 2.430934974990381, "grad_norm": 5.573113918304443, "learning_rate": 7.570987654320987e-07, "logits/chosen": -2.7127671241760254, "logits/rejected": -2.712456703186035, "logps/chosen": -31.122196197509766, "logps/rejected": -103.75628662109375, "loss": 0.4612, "rewards/accuracies": 0.90625, "rewards/chosen": 2.385709047317505, "rewards/margins": 5.349811553955078, "rewards/rejected": -2.9641027450561523, "step": 788 }, { "epoch": 2.434013081954598, "grad_norm": 4.788643836975098, "learning_rate": 7.5679012345679e-07, "logits/chosen": -2.770775318145752, "logits/rejected": -2.7679009437561035, "logps/chosen": -39.62640380859375, "logps/rejected": -121.43771362304688, "loss": 0.4083, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9226694107055664, "rewards/margins": 5.8798418045043945, "rewards/rejected": -3.9571726322174072, "step": 789 }, { "epoch": 2.437091188918815, "grad_norm": 4.415814399719238, "learning_rate": 7.564814814814815e-07, "logits/chosen": -2.807936906814575, "logits/rejected": -2.7255191802978516, "logps/chosen": -42.502784729003906, "logps/rejected": -129.03704833984375, "loss": 0.4589, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2349509000778198, "rewards/margins": 6.212737083435059, "rewards/rejected": -4.977785587310791, "step": 790 }, { "epoch": 2.440169295883032, "grad_norm": 4.6317057609558105, "learning_rate": 7.561728395061728e-07, "logits/chosen": -2.7532260417938232, "logits/rejected": -2.789789915084839, "logps/chosen": -51.25208282470703, "logps/rejected": -123.66594696044922, "loss": 0.5275, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7740688920021057, "rewards/margins": 5.149168968200684, "rewards/rejected": -4.3750996589660645, "step": 791 }, { "epoch": 2.443247402847249, "grad_norm": 5.936807155609131, "learning_rate": 7.558641975308642e-07, "logits/chosen": -2.665146589279175, "logits/rejected": -2.6759755611419678, "logps/chosen": -47.76023864746094, "logps/rejected": -134.56190490722656, "loss": 0.4812, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0627431869506836, "rewards/margins": 6.385815620422363, "rewards/rejected": -5.32307243347168, "step": 792 }, { "epoch": 2.446325509811466, "grad_norm": 4.860722064971924, "learning_rate": 7.555555555555555e-07, "logits/chosen": -2.7157034873962402, "logits/rejected": -2.748020648956299, "logps/chosen": -39.22316360473633, "logps/rejected": -112.46150970458984, "loss": 0.4065, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1346609592437744, "rewards/margins": 5.919085502624512, "rewards/rejected": -3.784424304962158, "step": 793 }, { "epoch": 2.449403616775683, "grad_norm": 3.655348300933838, "learning_rate": 7.552469135802468e-07, "logits/chosen": -2.6630985736846924, "logits/rejected": -2.6255500316619873, "logps/chosen": -46.246437072753906, "logps/rejected": -142.7498779296875, "loss": 0.4769, "rewards/accuracies": 1.0, "rewards/chosen": 1.1657166481018066, "rewards/margins": 6.4318318367004395, "rewards/rejected": -5.266115188598633, "step": 794 }, { "epoch": 2.4524817237399, "grad_norm": 4.74520206451416, "learning_rate": 7.549382716049383e-07, "logits/chosen": -2.750290870666504, "logits/rejected": -2.736517906188965, "logps/chosen": -57.01538848876953, "logps/rejected": -115.51576232910156, "loss": 0.5023, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6719371676445007, "rewards/margins": 4.783926963806152, "rewards/rejected": -4.111989974975586, "step": 795 }, { "epoch": 2.455559830704117, "grad_norm": 3.6850123405456543, "learning_rate": 7.546296296296296e-07, "logits/chosen": -2.7857041358947754, "logits/rejected": -2.790308952331543, "logps/chosen": -50.445228576660156, "logps/rejected": -121.787841796875, "loss": 0.6307, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0752304792404175, "rewards/margins": 5.007518291473389, "rewards/rejected": -3.9322874546051025, "step": 796 }, { "epoch": 2.458637937668334, "grad_norm": 4.913079738616943, "learning_rate": 7.543209876543209e-07, "logits/chosen": -2.778103828430176, "logits/rejected": -2.734828472137451, "logps/chosen": -44.46849060058594, "logps/rejected": -115.84346771240234, "loss": 0.4921, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5405030250549316, "rewards/margins": 5.291162967681885, "rewards/rejected": -3.750659942626953, "step": 797 }, { "epoch": 2.461716044632551, "grad_norm": 6.012718677520752, "learning_rate": 7.540123456790123e-07, "logits/chosen": -2.7210073471069336, "logits/rejected": -2.724698066711426, "logps/chosen": -45.883174896240234, "logps/rejected": -123.81379699707031, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 1.217510461807251, "rewards/margins": 5.4958271980285645, "rewards/rejected": -4.278316497802734, "step": 798 }, { "epoch": 2.464794151596768, "grad_norm": 4.2132134437561035, "learning_rate": 7.537037037037036e-07, "logits/chosen": -2.6851909160614014, "logits/rejected": -2.7361505031585693, "logps/chosen": -51.19783401489258, "logps/rejected": -110.3349380493164, "loss": 0.6401, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7486779689788818, "rewards/margins": 4.234760284423828, "rewards/rejected": -3.486081838607788, "step": 799 }, { "epoch": 2.467872258560985, "grad_norm": 5.580009937286377, "learning_rate": 7.533950617283951e-07, "logits/chosen": -2.7401905059814453, "logits/rejected": -2.751258134841919, "logps/chosen": -35.73492431640625, "logps/rejected": -106.3349380493164, "loss": 0.4939, "rewards/accuracies": 1.0, "rewards/chosen": 2.5547661781311035, "rewards/margins": 5.568514823913574, "rewards/rejected": -3.0137486457824707, "step": 800 }, { "epoch": 2.470950365525202, "grad_norm": 5.750882148742676, "learning_rate": 7.530864197530864e-07, "logits/chosen": -2.7292368412017822, "logits/rejected": -2.6851296424865723, "logps/chosen": -42.218746185302734, "logps/rejected": -115.09042358398438, "loss": 0.5392, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4504001140594482, "rewards/margins": 5.368910789489746, "rewards/rejected": -3.9185104370117188, "step": 801 }, { "epoch": 2.474028472489419, "grad_norm": 4.4848198890686035, "learning_rate": 7.527777777777777e-07, "logits/chosen": -2.7358341217041016, "logits/rejected": -2.7871158123016357, "logps/chosen": -51.161537170410156, "logps/rejected": -121.56041717529297, "loss": 0.5526, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9175221920013428, "rewards/margins": 4.865943908691406, "rewards/rejected": -3.9484214782714844, "step": 802 }, { "epoch": 2.477106579453636, "grad_norm": 4.9622721672058105, "learning_rate": 7.524691358024691e-07, "logits/chosen": -2.7216756343841553, "logits/rejected": -2.7113542556762695, "logps/chosen": -40.62596893310547, "logps/rejected": -104.45045471191406, "loss": 0.5468, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7217903137207031, "rewards/margins": 4.766770362854004, "rewards/rejected": -3.04498028755188, "step": 803 }, { "epoch": 2.480184686417853, "grad_norm": 5.395832538604736, "learning_rate": 7.521604938271605e-07, "logits/chosen": -2.8602569103240967, "logits/rejected": -2.735159158706665, "logps/chosen": -35.559593200683594, "logps/rejected": -100.986328125, "loss": 0.5432, "rewards/accuracies": 1.0, "rewards/chosen": 2.1868534088134766, "rewards/margins": 5.065361976623535, "rewards/rejected": -2.8785085678100586, "step": 804 }, { "epoch": 2.48326279338207, "grad_norm": 6.315579891204834, "learning_rate": 7.518518518518518e-07, "logits/chosen": -2.7715775966644287, "logits/rejected": -2.7266578674316406, "logps/chosen": -35.38755798339844, "logps/rejected": -106.91474914550781, "loss": 0.4533, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2207493782043457, "rewards/margins": 5.422447681427002, "rewards/rejected": -3.201698064804077, "step": 805 }, { "epoch": 2.486340900346287, "grad_norm": 8.765884399414062, "learning_rate": 7.515432098765432e-07, "logits/chosen": -2.704282283782959, "logits/rejected": -2.710874080657959, "logps/chosen": -46.16485595703125, "logps/rejected": -134.87940979003906, "loss": 0.457, "rewards/accuracies": 0.96875, "rewards/chosen": 1.062707543373108, "rewards/margins": 6.393315315246582, "rewards/rejected": -5.3306074142456055, "step": 806 }, { "epoch": 2.489419007310504, "grad_norm": 6.207128524780273, "learning_rate": 7.512345679012345e-07, "logits/chosen": -2.6526384353637695, "logits/rejected": -2.6548211574554443, "logps/chosen": -45.11623764038086, "logps/rejected": -124.67787170410156, "loss": 0.4968, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6054952144622803, "rewards/margins": 5.950144290924072, "rewards/rejected": -4.344648838043213, "step": 807 }, { "epoch": 2.492497114274721, "grad_norm": 5.3438239097595215, "learning_rate": 7.509259259259258e-07, "logits/chosen": -2.7459640502929688, "logits/rejected": -2.7475059032440186, "logps/chosen": -44.37455749511719, "logps/rejected": -121.18937683105469, "loss": 0.3807, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3480231761932373, "rewards/margins": 5.863309383392334, "rewards/rejected": -4.515285968780518, "step": 808 }, { "epoch": 2.495575221238938, "grad_norm": 5.431687831878662, "learning_rate": 7.506172839506173e-07, "logits/chosen": -2.791597366333008, "logits/rejected": -2.7528574466705322, "logps/chosen": -40.59736633300781, "logps/rejected": -120.67196655273438, "loss": 0.473, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9659364223480225, "rewards/margins": 5.930593967437744, "rewards/rejected": -3.9646568298339844, "step": 809 }, { "epoch": 2.498653328203155, "grad_norm": 6.038602828979492, "learning_rate": 7.503086419753086e-07, "logits/chosen": -2.782902240753174, "logits/rejected": -2.7446086406707764, "logps/chosen": -46.21637725830078, "logps/rejected": -112.5072250366211, "loss": 0.5845, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9357312917709351, "rewards/margins": 4.535285472869873, "rewards/rejected": -3.5995540618896484, "step": 810 }, { "epoch": 2.501731435167372, "grad_norm": 5.4766669273376465, "learning_rate": 7.5e-07, "logits/chosen": -2.6419003009796143, "logits/rejected": -2.6547670364379883, "logps/chosen": -44.86552047729492, "logps/rejected": -114.94369506835938, "loss": 0.4719, "rewards/accuracies": 0.96875, "rewards/chosen": 1.292327880859375, "rewards/margins": 5.139604568481445, "rewards/rejected": -3.847276210784912, "step": 811 }, { "epoch": 2.5048095421315892, "grad_norm": 3.4185750484466553, "learning_rate": 7.496913580246913e-07, "logits/chosen": -2.762028217315674, "logits/rejected": -2.759286403656006, "logps/chosen": -29.96500015258789, "logps/rejected": -106.67548370361328, "loss": 0.4882, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6264450550079346, "rewards/margins": 5.710204124450684, "rewards/rejected": -3.083759069442749, "step": 812 }, { "epoch": 2.507887649095806, "grad_norm": 6.127938747406006, "learning_rate": 7.493827160493826e-07, "logits/chosen": -2.6748437881469727, "logits/rejected": -2.7169947624206543, "logps/chosen": -58.45743942260742, "logps/rejected": -132.57028198242188, "loss": 0.5531, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23944130539894104, "rewards/margins": 5.067846775054932, "rewards/rejected": -4.828405380249023, "step": 813 }, { "epoch": 2.510965756060023, "grad_norm": 5.468734264373779, "learning_rate": 7.490740740740741e-07, "logits/chosen": -2.737002372741699, "logits/rejected": -2.7737417221069336, "logps/chosen": -42.2847785949707, "logps/rejected": -128.435302734375, "loss": 0.4244, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7580416202545166, "rewards/margins": 6.281239032745361, "rewards/rejected": -4.523197650909424, "step": 814 }, { "epoch": 2.51404386302424, "grad_norm": 8.246940612792969, "learning_rate": 7.487654320987654e-07, "logits/chosen": -2.7593889236450195, "logits/rejected": -2.698507785797119, "logps/chosen": -44.535194396972656, "logps/rejected": -134.86558532714844, "loss": 0.4214, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5438718795776367, "rewards/margins": 6.541757583618164, "rewards/rejected": -4.9978861808776855, "step": 815 }, { "epoch": 2.517121969988457, "grad_norm": 5.53248405456543, "learning_rate": 7.484567901234567e-07, "logits/chosen": -2.668332815170288, "logits/rejected": -2.7342445850372314, "logps/chosen": -33.770233154296875, "logps/rejected": -103.19413757324219, "loss": 0.5348, "rewards/accuracies": 1.0, "rewards/chosen": 2.540225028991699, "rewards/margins": 5.242027759552002, "rewards/rejected": -2.7018027305603027, "step": 816 }, { "epoch": 2.520200076952674, "grad_norm": 6.602954864501953, "learning_rate": 7.481481481481481e-07, "logits/chosen": -2.6661980152130127, "logits/rejected": -2.653657913208008, "logps/chosen": -31.62140655517578, "logps/rejected": -87.44820404052734, "loss": 0.5592, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2969963550567627, "rewards/margins": 4.076028823852539, "rewards/rejected": -1.7790322303771973, "step": 817 }, { "epoch": 2.523278183916891, "grad_norm": 8.938089370727539, "learning_rate": 7.478395061728394e-07, "logits/chosen": -2.6717631816864014, "logits/rejected": -2.630709648132324, "logps/chosen": -45.1915283203125, "logps/rejected": -117.55667877197266, "loss": 0.5553, "rewards/accuracies": 1.0, "rewards/chosen": 1.2460851669311523, "rewards/margins": 4.98126220703125, "rewards/rejected": -3.7351770401000977, "step": 818 }, { "epoch": 2.5263562908811084, "grad_norm": 6.130400657653809, "learning_rate": 7.475308641975308e-07, "logits/chosen": -2.6524832248687744, "logits/rejected": -2.643697500228882, "logps/chosen": -43.90400314331055, "logps/rejected": -129.6461639404297, "loss": 0.4183, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1094446182250977, "rewards/margins": 5.936629772186279, "rewards/rejected": -4.827184677124023, "step": 819 }, { "epoch": 2.529434397845325, "grad_norm": 5.510296821594238, "learning_rate": 7.472222222222222e-07, "logits/chosen": -2.7366394996643066, "logits/rejected": -2.6856398582458496, "logps/chosen": -47.91456604003906, "logps/rejected": -116.89794921875, "loss": 0.6205, "rewards/accuracies": 0.90625, "rewards/chosen": 1.029609203338623, "rewards/margins": 4.893352508544922, "rewards/rejected": -3.8637430667877197, "step": 820 }, { "epoch": 2.5325125048095423, "grad_norm": 5.486467361450195, "learning_rate": 7.469135802469135e-07, "logits/chosen": -2.780190944671631, "logits/rejected": -2.7868542671203613, "logps/chosen": -40.92790603637695, "logps/rejected": -122.84722900390625, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 1.7108964920043945, "rewards/margins": 6.280796051025391, "rewards/rejected": -4.569899559020996, "step": 821 }, { "epoch": 2.535590611773759, "grad_norm": 7.444575786590576, "learning_rate": 7.466049382716048e-07, "logits/chosen": -2.6325201988220215, "logits/rejected": -2.700157880783081, "logps/chosen": -45.809139251708984, "logps/rejected": -98.61060333251953, "loss": 0.5337, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4507975578308105, "rewards/margins": 4.009893894195557, "rewards/rejected": -2.559096097946167, "step": 822 }, { "epoch": 2.538668718737976, "grad_norm": 4.867286205291748, "learning_rate": 7.462962962962963e-07, "logits/chosen": -2.7055959701538086, "logits/rejected": -2.7582030296325684, "logps/chosen": -39.607913970947266, "logps/rejected": -125.88938903808594, "loss": 0.4528, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6784265041351318, "rewards/margins": 6.292168140411377, "rewards/rejected": -4.613741874694824, "step": 823 }, { "epoch": 2.541746825702193, "grad_norm": 5.221421241760254, "learning_rate": 7.459876543209876e-07, "logits/chosen": -2.788327693939209, "logits/rejected": -2.743610382080078, "logps/chosen": -46.697998046875, "logps/rejected": -115.60615539550781, "loss": 0.5713, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3706653118133545, "rewards/margins": 4.73123025894165, "rewards/rejected": -3.360564947128296, "step": 824 }, { "epoch": 2.54482493266641, "grad_norm": 5.294747352600098, "learning_rate": 7.45679012345679e-07, "logits/chosen": -2.753774642944336, "logits/rejected": -2.703603982925415, "logps/chosen": -33.55255889892578, "logps/rejected": -99.8165512084961, "loss": 0.4861, "rewards/accuracies": 0.9375, "rewards/chosen": 2.051344394683838, "rewards/margins": 4.610053062438965, "rewards/rejected": -2.558708906173706, "step": 825 }, { "epoch": 2.547903039630627, "grad_norm": 5.869159698486328, "learning_rate": 7.453703703703703e-07, "logits/chosen": -2.733489990234375, "logits/rejected": -2.722134828567505, "logps/chosen": -33.53962326049805, "logps/rejected": -98.25291442871094, "loss": 0.4522, "rewards/accuracies": 0.9375, "rewards/chosen": 2.4097280502319336, "rewards/margins": 4.887970447540283, "rewards/rejected": -2.4782423973083496, "step": 826 }, { "epoch": 2.550981146594844, "grad_norm": 3.7990386486053467, "learning_rate": 7.450617283950616e-07, "logits/chosen": -2.769411563873291, "logits/rejected": -2.7586669921875, "logps/chosen": -44.35859680175781, "logps/rejected": -118.96722412109375, "loss": 0.5589, "rewards/accuracies": 1.0, "rewards/chosen": 1.8417468070983887, "rewards/margins": 5.317167282104492, "rewards/rejected": -3.4754202365875244, "step": 827 }, { "epoch": 2.5540592535590614, "grad_norm": 7.0478644371032715, "learning_rate": 7.447530864197531e-07, "logits/chosen": -2.7031381130218506, "logits/rejected": -2.7192134857177734, "logps/chosen": -52.73657989501953, "logps/rejected": -124.4120864868164, "loss": 0.5594, "rewards/accuracies": 1.0, "rewards/chosen": 1.0424940586090088, "rewards/margins": 5.120949745178223, "rewards/rejected": -4.078455924987793, "step": 828 }, { "epoch": 2.5571373605232783, "grad_norm": 6.137929439544678, "learning_rate": 7.444444444444444e-07, "logits/chosen": -2.645951747894287, "logits/rejected": -2.635615110397339, "logps/chosen": -47.4234504699707, "logps/rejected": -128.36822509765625, "loss": 0.4509, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4141101837158203, "rewards/margins": 5.608643054962158, "rewards/rejected": -4.194532871246338, "step": 829 }, { "epoch": 2.5602154674874953, "grad_norm": 5.393021583557129, "learning_rate": 7.441358024691357e-07, "logits/chosen": -2.6651322841644287, "logits/rejected": -2.712942361831665, "logps/chosen": -38.458805084228516, "logps/rejected": -123.8851547241211, "loss": 0.3951, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9811381101608276, "rewards/margins": 6.053623199462891, "rewards/rejected": -4.072485446929932, "step": 830 }, { "epoch": 2.5632935744517122, "grad_norm": 3.750803232192993, "learning_rate": 7.438271604938271e-07, "logits/chosen": -2.7041759490966797, "logits/rejected": -2.69907808303833, "logps/chosen": -47.20735168457031, "logps/rejected": -115.073974609375, "loss": 0.5261, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5197724103927612, "rewards/margins": 5.0595383644104, "rewards/rejected": -3.539766311645508, "step": 831 }, { "epoch": 2.566371681415929, "grad_norm": 6.894580841064453, "learning_rate": 7.435185185185184e-07, "logits/chosen": -2.712785482406616, "logits/rejected": -2.740577220916748, "logps/chosen": -52.11052703857422, "logps/rejected": -105.98057556152344, "loss": 0.6065, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2751810550689697, "rewards/margins": 3.9024569988250732, "rewards/rejected": -2.6272759437561035, "step": 832 }, { "epoch": 2.569449788380146, "grad_norm": 4.367819786071777, "learning_rate": 7.432098765432099e-07, "logits/chosen": -2.7409186363220215, "logits/rejected": -2.720942497253418, "logps/chosen": -51.579750061035156, "logps/rejected": -115.85975646972656, "loss": 0.5685, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9270248413085938, "rewards/margins": 4.655708312988281, "rewards/rejected": -3.7286832332611084, "step": 833 }, { "epoch": 2.572527895344363, "grad_norm": 4.268983840942383, "learning_rate": 7.429012345679012e-07, "logits/chosen": -2.6514599323272705, "logits/rejected": -2.693413257598877, "logps/chosen": -33.573265075683594, "logps/rejected": -89.96334838867188, "loss": 0.5255, "rewards/accuracies": 0.96875, "rewards/chosen": 2.448035717010498, "rewards/margins": 4.524176597595215, "rewards/rejected": -2.0761406421661377, "step": 834 }, { "epoch": 2.5756060023085805, "grad_norm": 6.431707382202148, "learning_rate": 7.425925925925925e-07, "logits/chosen": -2.7014589309692383, "logits/rejected": -2.7605671882629395, "logps/chosen": -50.66900634765625, "logps/rejected": -103.74775695800781, "loss": 0.6035, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0190849304199219, "rewards/margins": 3.95558500289917, "rewards/rejected": -2.936500310897827, "step": 835 }, { "epoch": 2.578684109272797, "grad_norm": 5.523716926574707, "learning_rate": 7.422839506172839e-07, "logits/chosen": -2.698415517807007, "logits/rejected": -2.6802656650543213, "logps/chosen": -43.647003173828125, "logps/rejected": -110.13800048828125, "loss": 0.5888, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3818175792694092, "rewards/margins": 4.603945255279541, "rewards/rejected": -3.222127914428711, "step": 836 }, { "epoch": 2.5817622162370144, "grad_norm": 4.98521614074707, "learning_rate": 7.419753086419753e-07, "logits/chosen": -2.7321345806121826, "logits/rejected": -2.6474170684814453, "logps/chosen": -31.600370407104492, "logps/rejected": -109.57699584960938, "loss": 0.4703, "rewards/accuracies": 0.9375, "rewards/chosen": 2.276163101196289, "rewards/margins": 5.437215328216553, "rewards/rejected": -3.1610522270202637, "step": 837 }, { "epoch": 2.5848403232012314, "grad_norm": 5.330788612365723, "learning_rate": 7.416666666666666e-07, "logits/chosen": -2.7704319953918457, "logits/rejected": -2.7236328125, "logps/chosen": -55.2291259765625, "logps/rejected": -114.19551086425781, "loss": 0.5623, "rewards/accuracies": 0.78125, "rewards/chosen": 0.664188027381897, "rewards/margins": 4.131922721862793, "rewards/rejected": -3.4677343368530273, "step": 838 }, { "epoch": 2.5879184301654483, "grad_norm": 5.176431179046631, "learning_rate": 7.41358024691358e-07, "logits/chosen": -2.641087055206299, "logits/rejected": -2.673008680343628, "logps/chosen": -39.29281997680664, "logps/rejected": -113.70809173583984, "loss": 0.4971, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6507385969161987, "rewards/margins": 4.99709415435791, "rewards/rejected": -3.3463551998138428, "step": 839 }, { "epoch": 2.5909965371296653, "grad_norm": 4.557927131652832, "learning_rate": 7.410493827160493e-07, "logits/chosen": -2.6855037212371826, "logits/rejected": -2.7070703506469727, "logps/chosen": -37.35820770263672, "logps/rejected": -118.67713165283203, "loss": 0.4269, "rewards/accuracies": 0.96875, "rewards/chosen": 2.161813974380493, "rewards/margins": 5.871424674987793, "rewards/rejected": -3.7096107006073, "step": 840 }, { "epoch": 2.594074644093882, "grad_norm": 9.008949279785156, "learning_rate": 7.407407407407406e-07, "logits/chosen": -2.7390379905700684, "logits/rejected": -2.703618049621582, "logps/chosen": -42.664188385009766, "logps/rejected": -110.91249084472656, "loss": 0.5342, "rewards/accuracies": 1.0, "rewards/chosen": 1.7788617610931396, "rewards/margins": 4.899935245513916, "rewards/rejected": -3.1210737228393555, "step": 841 }, { "epoch": 2.597152751058099, "grad_norm": 7.14028263092041, "learning_rate": 7.404320987654321e-07, "logits/chosen": -2.8154690265655518, "logits/rejected": -2.758004903793335, "logps/chosen": -53.39942932128906, "logps/rejected": -144.31117248535156, "loss": 0.466, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9883124828338623, "rewards/margins": 6.459924221038818, "rewards/rejected": -5.471611976623535, "step": 842 }, { "epoch": 2.600230858022316, "grad_norm": 4.503868579864502, "learning_rate": 7.401234567901234e-07, "logits/chosen": -2.7487876415252686, "logits/rejected": -2.7635841369628906, "logps/chosen": -44.51216506958008, "logps/rejected": -127.73009490966797, "loss": 0.4732, "rewards/accuracies": 0.9375, "rewards/chosen": 1.546547293663025, "rewards/margins": 6.044504642486572, "rewards/rejected": -4.497957229614258, "step": 843 }, { "epoch": 2.6033089649865335, "grad_norm": 6.028653621673584, "learning_rate": 7.398148148148148e-07, "logits/chosen": -2.656043529510498, "logits/rejected": -2.651959180831909, "logps/chosen": -29.618846893310547, "logps/rejected": -106.98807525634766, "loss": 0.4156, "rewards/accuracies": 0.96875, "rewards/chosen": 2.414419412612915, "rewards/margins": 5.804570198059082, "rewards/rejected": -3.390150547027588, "step": 844 }, { "epoch": 2.60638707195075, "grad_norm": 4.875899791717529, "learning_rate": 7.395061728395061e-07, "logits/chosen": -2.6975841522216797, "logits/rejected": -2.723205327987671, "logps/chosen": -35.31017303466797, "logps/rejected": -115.18816375732422, "loss": 0.4689, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1987950801849365, "rewards/margins": 5.816195487976074, "rewards/rejected": -3.617400646209717, "step": 845 }, { "epoch": 2.60638707195075, "eval_logits/chosen": -2.6657285690307617, "eval_logits/rejected": -2.7027597427368164, "eval_logps/chosen": -44.64780044555664, "eval_logps/rejected": -116.08868408203125, "eval_loss": 0.5757032036781311, "eval_rewards/accuracies": 0.8988439440727234, "eval_rewards/chosen": 1.362599492073059, "eval_rewards/margins": 4.954884052276611, "eval_rewards/rejected": -3.592284917831421, "eval_runtime": 639.1158, "eval_samples_per_second": 0.541, "eval_steps_per_second": 0.271, "step": 845 } ], "logging_steps": 1.0, "max_steps": 3240, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 65, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }