{ "best_global_step": 792, "best_metric": 2.6713390350341797, "best_model_checkpoint": "/tmp/svadugur/39815/informativity_and_cost_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=False-contexts=medium-39815/checkpoint-792", "epoch": 1.8135204812374677, "eval_steps": 88, "global_step": 792, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.3045759201049805, "eval_logits/rejected": -2.296954870223999, "eval_logps/chosen": -57.06235122680664, "eval_logps/rejected": -61.049434661865234, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 988.2659, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.268, "step": 0 }, { "epoch": 0.0022916069894013177, "grad_norm": 1.8024064302444458, "learning_rate": 1e-06, "logits/chosen": -2.285168170928955, "logits/rejected": -2.2504661083221436, "logps/chosen": -53.30577087402344, "logps/rejected": -58.43346405029297, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004583213978802635, "grad_norm": 1.8952271938323975, "learning_rate": 9.997706422018348e-07, "logits/chosen": -2.196585178375244, "logits/rejected": -2.195831537246704, "logps/chosen": -55.260353088378906, "logps/rejected": -61.919925689697266, "loss": 0.9987, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005814695730805397, "rewards/margins": 0.005041887983679771, "rewards/rejected": 0.0007728112395852804, "step": 2 }, { "epoch": 0.006874820968203953, "grad_norm": 2.278347969055176, "learning_rate": 9.995412844036696e-07, "logits/chosen": -2.309056282043457, "logits/rejected": -2.2635951042175293, "logps/chosen": -56.7116584777832, "logps/rejected": -65.10358428955078, "loss": 0.9983, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0007827281951904297, "rewards/margins": 0.0066553824581205845, "rewards/rejected": -0.005872656591236591, "step": 3 }, { "epoch": 0.00916642795760527, "grad_norm": 2.57346248626709, "learning_rate": 9.993119266055046e-07, "logits/chosen": -2.300158739089966, "logits/rejected": -2.2687935829162598, "logps/chosen": -58.14372634887695, "logps/rejected": -64.14485931396484, "loss": 0.9938, "rewards/accuracies": 0.59375, "rewards/chosen": 0.020578062161803246, "rewards/margins": 0.024701913818717003, "rewards/rejected": -0.004123854450881481, "step": 4 }, { "epoch": 0.011458034947006588, "grad_norm": 2.265455722808838, "learning_rate": 9.990825688073395e-07, "logits/chosen": -2.3228437900543213, "logits/rejected": -2.2767603397369385, "logps/chosen": -53.53734588623047, "logps/rejected": -59.63623809814453, "loss": 0.9999, "rewards/accuracies": 0.5, "rewards/chosen": -0.03766823932528496, "rewards/margins": 0.0003072400577366352, "rewards/rejected": -0.037975478917360306, "step": 5 }, { "epoch": 0.013749641936407906, "grad_norm": 1.883252739906311, "learning_rate": 9.988532110091743e-07, "logits/chosen": -2.2673239707946777, "logits/rejected": -2.218733787536621, "logps/chosen": -56.61028289794922, "logps/rejected": -61.524776458740234, "loss": 0.9971, "rewards/accuracies": 0.46875, "rewards/chosen": -0.007082799449563026, "rewards/margins": 0.011671637184917927, "rewards/rejected": -0.01875443384051323, "step": 6 }, { "epoch": 0.016041248925809225, "grad_norm": 1.9135973453521729, "learning_rate": 9.986238532110091e-07, "logits/chosen": -2.2647297382354736, "logits/rejected": -2.2506093978881836, "logps/chosen": -54.260223388671875, "logps/rejected": -59.16828918457031, "loss": 0.9986, "rewards/accuracies": 0.59375, "rewards/chosen": 0.013435423374176025, "rewards/margins": 0.0054061533883214, "rewards/rejected": 0.008029269054532051, "step": 7 }, { "epoch": 0.01833285591521054, "grad_norm": 2.5636866092681885, "learning_rate": 9.98394495412844e-07, "logits/chosen": -2.3067684173583984, "logits/rejected": -2.2797176837921143, "logps/chosen": -55.44165802001953, "logps/rejected": -61.53620529174805, "loss": 1.0064, "rewards/accuracies": 0.40625, "rewards/chosen": -0.013596583157777786, "rewards/margins": -0.025478601455688477, "rewards/rejected": 0.01188202016055584, "step": 8 }, { "epoch": 0.02062446290461186, "grad_norm": 1.9106698036193848, "learning_rate": 9.98165137614679e-07, "logits/chosen": -2.2075631618499756, "logits/rejected": -2.207092046737671, "logps/chosen": -53.72904586791992, "logps/rejected": -57.451419830322266, "loss": 0.9939, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0047488100826740265, "rewards/margins": 0.024647153913974762, "rewards/rejected": -0.019898343831300735, "step": 9 }, { "epoch": 0.022916069894013177, "grad_norm": 2.434770345687866, "learning_rate": 9.979357798165136e-07, "logits/chosen": -2.2556893825531006, "logits/rejected": -2.2159547805786133, "logps/chosen": -56.22480773925781, "logps/rejected": -65.14265441894531, "loss": 1.0002, "rewards/accuracies": 0.40625, "rewards/chosen": -0.028405608609318733, "rewards/margins": -0.0007333874236792326, "rewards/rejected": -0.027672218158841133, "step": 10 }, { "epoch": 0.025207676883414496, "grad_norm": 2.1098973751068115, "learning_rate": 9.977064220183486e-07, "logits/chosen": -2.2611212730407715, "logits/rejected": -2.2667183876037598, "logps/chosen": -54.75017547607422, "logps/rejected": -62.018775939941406, "loss": 0.9931, "rewards/accuracies": 0.625, "rewards/chosen": 0.004906916059553623, "rewards/margins": 0.027534784749150276, "rewards/rejected": -0.02262786589562893, "step": 11 }, { "epoch": 0.027499283872815812, "grad_norm": 2.0023891925811768, "learning_rate": 9.974770642201835e-07, "logits/chosen": -2.314368963241577, "logits/rejected": -2.2963554859161377, "logps/chosen": -53.500999450683594, "logps/rejected": -56.134342193603516, "loss": 0.9997, "rewards/accuracies": 0.53125, "rewards/chosen": 0.006466103717684746, "rewards/margins": 0.0013528335839509964, "rewards/rejected": 0.0051132682710886, "step": 12 }, { "epoch": 0.02979089086221713, "grad_norm": 2.220789909362793, "learning_rate": 9.972477064220183e-07, "logits/chosen": -2.2553319931030273, "logits/rejected": -2.1970489025115967, "logps/chosen": -54.49557876586914, "logps/rejected": -58.923824310302734, "loss": 1.0051, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03764292970299721, "rewards/margins": -0.020627712830901146, "rewards/rejected": -0.01701522059738636, "step": 13 }, { "epoch": 0.03208249785161845, "grad_norm": 1.9915388822555542, "learning_rate": 9.970183486238531e-07, "logits/chosen": -2.137967586517334, "logits/rejected": -2.176685333251953, "logps/chosen": -55.43719482421875, "logps/rejected": -57.449745178222656, "loss": 0.994, "rewards/accuracies": 0.59375, "rewards/chosen": 0.013510335236787796, "rewards/margins": 0.024136627092957497, "rewards/rejected": -0.01062629371881485, "step": 14 }, { "epoch": 0.03437410484101976, "grad_norm": 2.3387460708618164, "learning_rate": 9.96788990825688e-07, "logits/chosen": -2.2430882453918457, "logits/rejected": -2.2347681522369385, "logps/chosen": -53.544219970703125, "logps/rejected": -56.54861831665039, "loss": 1.0036, "rewards/accuracies": 0.46875, "rewards/chosen": -0.026296043768525124, "rewards/margins": -0.014452947303652763, "rewards/rejected": -0.01184309832751751, "step": 15 }, { "epoch": 0.03666571183042108, "grad_norm": 3.0936973094940186, "learning_rate": 9.96559633027523e-07, "logits/chosen": -2.30712628364563, "logits/rejected": -2.305339813232422, "logps/chosen": -56.89485549926758, "logps/rejected": -58.54441833496094, "loss": 1.002, "rewards/accuracies": 0.46875, "rewards/chosen": 0.011517953127622604, "rewards/margins": -0.0081613315269351, "rewards/rejected": 0.01967928186058998, "step": 16 }, { "epoch": 0.0389573188198224, "grad_norm": 9.135102272033691, "learning_rate": 9.963302752293578e-07, "logits/chosen": -2.250602960586548, "logits/rejected": -2.236987590789795, "logps/chosen": -53.6083984375, "logps/rejected": -58.617008209228516, "loss": 0.9997, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007143700495362282, "rewards/margins": 0.0011186245828866959, "rewards/rejected": -0.008262323215603828, "step": 17 }, { "epoch": 0.04124892580922372, "grad_norm": 2.2526419162750244, "learning_rate": 9.961009174311926e-07, "logits/chosen": -2.2329225540161133, "logits/rejected": -2.228003978729248, "logps/chosen": -53.2277946472168, "logps/rejected": -57.913631439208984, "loss": 1.0079, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04973185062408447, "rewards/margins": -0.03167439624667168, "rewards/rejected": -0.018057454377412796, "step": 18 }, { "epoch": 0.043540532798625034, "grad_norm": 1.8606770038604736, "learning_rate": 9.958715596330275e-07, "logits/chosen": -2.263852596282959, "logits/rejected": -2.249187707901001, "logps/chosen": -52.30138397216797, "logps/rejected": -58.83649444580078, "loss": 0.9979, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00028444500640034676, "rewards/margins": 0.008253371343016624, "rewards/rejected": -0.008537816815078259, "step": 19 }, { "epoch": 0.04583213978802635, "grad_norm": 1.8901959657669067, "learning_rate": 9.956422018348623e-07, "logits/chosen": -2.2570266723632812, "logits/rejected": -2.2203757762908936, "logps/chosen": -55.44905090332031, "logps/rejected": -59.46892166137695, "loss": 0.9986, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025874221697449684, "rewards/margins": 0.005647291429340839, "rewards/rejected": -0.0315215140581131, "step": 20 }, { "epoch": 0.04812374677742767, "grad_norm": 2.2244114875793457, "learning_rate": 9.954128440366973e-07, "logits/chosen": -2.233423948287964, "logits/rejected": -2.2288975715637207, "logps/chosen": -54.666194915771484, "logps/rejected": -58.96556091308594, "loss": 1.0009, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008564346935600042, "rewards/margins": -0.003786743152886629, "rewards/rejected": 0.002930307760834694, "step": 21 }, { "epoch": 0.05041535376682899, "grad_norm": 2.136143922805786, "learning_rate": 9.951834862385321e-07, "logits/chosen": -2.2710747718811035, "logits/rejected": -2.2629053592681885, "logps/chosen": -53.27805709838867, "logps/rejected": -60.3846549987793, "loss": 1.0003, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005496214143931866, "rewards/margins": -0.0010509847197681665, "rewards/rejected": 0.006547200493514538, "step": 22 }, { "epoch": 0.052706960756230305, "grad_norm": 1.8552672863006592, "learning_rate": 9.94954128440367e-07, "logits/chosen": -2.288865327835083, "logits/rejected": -2.2382264137268066, "logps/chosen": -55.17938995361328, "logps/rejected": -59.79063415527344, "loss": 0.9965, "rewards/accuracies": 0.53125, "rewards/chosen": -0.013250279240310192, "rewards/margins": 0.014188027009367943, "rewards/rejected": -0.02743830531835556, "step": 23 }, { "epoch": 0.054998567745631624, "grad_norm": 1.8770885467529297, "learning_rate": 9.947247706422018e-07, "logits/chosen": -2.235234260559082, "logits/rejected": -2.235291004180908, "logps/chosen": -53.20248031616211, "logps/rejected": -58.49292755126953, "loss": 0.9956, "rewards/accuracies": 0.625, "rewards/chosen": 0.02307293191552162, "rewards/margins": 0.017775021493434906, "rewards/rejected": 0.005297910422086716, "step": 24 }, { "epoch": 0.057290174735032943, "grad_norm": 1.9043933153152466, "learning_rate": 9.944954128440366e-07, "logits/chosen": -2.2330260276794434, "logits/rejected": -2.2068495750427246, "logps/chosen": -53.2531623840332, "logps/rejected": -57.93571472167969, "loss": 0.9993, "rewards/accuracies": 0.4375, "rewards/chosen": -0.017097163945436478, "rewards/margins": 0.002617561724036932, "rewards/rejected": -0.019714724272489548, "step": 25 }, { "epoch": 0.05958178172443426, "grad_norm": 2.322286367416382, "learning_rate": 9.942660550458714e-07, "logits/chosen": -2.267261505126953, "logits/rejected": -2.2791459560394287, "logps/chosen": -56.48197555541992, "logps/rejected": -59.37757873535156, "loss": 1.0017, "rewards/accuracies": 0.5, "rewards/chosen": -0.03400798141956329, "rewards/margins": -0.006736147683113813, "rewards/rejected": -0.027271831408143044, "step": 26 }, { "epoch": 0.061873388713835575, "grad_norm": 2.3322501182556152, "learning_rate": 9.940366972477063e-07, "logits/chosen": -2.271859645843506, "logits/rejected": -2.2530832290649414, "logps/chosen": -52.76985168457031, "logps/rejected": -60.496803283691406, "loss": 0.9977, "rewards/accuracies": 0.59375, "rewards/chosen": 0.000554169062525034, "rewards/margins": 0.009377172216773033, "rewards/rejected": -0.008823004551231861, "step": 27 }, { "epoch": 0.0641649957032369, "grad_norm": 2.21805739402771, "learning_rate": 9.938073394495413e-07, "logits/chosen": -2.275367259979248, "logits/rejected": -2.2342512607574463, "logps/chosen": -55.538963317871094, "logps/rejected": -60.56772232055664, "loss": 0.999, "rewards/accuracies": 0.5, "rewards/chosen": -0.00583039503544569, "rewards/margins": 0.004256522748619318, "rewards/rejected": -0.010086918249726295, "step": 28 }, { "epoch": 0.06645660269263821, "grad_norm": 2.0768306255340576, "learning_rate": 9.935779816513761e-07, "logits/chosen": -2.2528700828552246, "logits/rejected": -2.2281408309936523, "logps/chosen": -54.27984619140625, "logps/rejected": -60.424903869628906, "loss": 0.997, "rewards/accuracies": 0.46875, "rewards/chosen": 0.004791749641299248, "rewards/margins": 0.011931621469557285, "rewards/rejected": -0.007139873690903187, "step": 29 }, { "epoch": 0.06874820968203953, "grad_norm": 2.2483839988708496, "learning_rate": 9.93348623853211e-07, "logits/chosen": -2.215796709060669, "logits/rejected": -2.2318663597106934, "logps/chosen": -54.9439697265625, "logps/rejected": -58.76552963256836, "loss": 0.9965, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025863900780677795, "rewards/margins": 0.014031359925866127, "rewards/rejected": -0.03989525884389877, "step": 30 }, { "epoch": 0.07103981667144085, "grad_norm": 2.0699055194854736, "learning_rate": 9.931192660550458e-07, "logits/chosen": -2.2324671745300293, "logits/rejected": -2.2182414531707764, "logps/chosen": -54.74161148071289, "logps/rejected": -59.10639572143555, "loss": 1.0005, "rewards/accuracies": 0.5, "rewards/chosen": -0.03378240019083023, "rewards/margins": -0.001909839455038309, "rewards/rejected": -0.03187255933880806, "step": 31 }, { "epoch": 0.07333142366084217, "grad_norm": 2.6088125705718994, "learning_rate": 9.928899082568806e-07, "logits/chosen": -2.2357447147369385, "logits/rejected": -2.260582685470581, "logps/chosen": -55.40456008911133, "logps/rejected": -62.48126220703125, "loss": 1.0088, "rewards/accuracies": 0.375, "rewards/chosen": -0.02981417253613472, "rewards/margins": -0.03527139499783516, "rewards/rejected": 0.005457223393023014, "step": 32 }, { "epoch": 0.07562303065024348, "grad_norm": 2.0270261764526367, "learning_rate": 9.926605504587156e-07, "logits/chosen": -2.2753398418426514, "logits/rejected": -2.2473573684692383, "logps/chosen": -55.85089874267578, "logps/rejected": -59.650856018066406, "loss": 1.0061, "rewards/accuracies": 0.5, "rewards/chosen": -0.03790983930230141, "rewards/margins": -0.02469799667596817, "rewards/rejected": -0.013211844488978386, "step": 33 }, { "epoch": 0.0779146376396448, "grad_norm": 2.077585458755493, "learning_rate": 9.924311926605505e-07, "logits/chosen": -2.275383234024048, "logits/rejected": -2.2418479919433594, "logps/chosen": -52.34581756591797, "logps/rejected": -57.48112869262695, "loss": 1.0004, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03780067712068558, "rewards/margins": -0.0017274495912715793, "rewards/rejected": -0.0360732339322567, "step": 34 }, { "epoch": 0.08020624462904612, "grad_norm": 1.818581223487854, "learning_rate": 9.922018348623853e-07, "logits/chosen": -2.2651479244232178, "logits/rejected": -2.2322773933410645, "logps/chosen": -55.70407485961914, "logps/rejected": -60.33871841430664, "loss": 1.0008, "rewards/accuracies": 0.40625, "rewards/chosen": -0.030547212809324265, "rewards/margins": -0.0031949649564921856, "rewards/rejected": -0.02735225111246109, "step": 35 }, { "epoch": 0.08249785161844744, "grad_norm": 10.914041519165039, "learning_rate": 9.919724770642201e-07, "logits/chosen": -2.2224671840667725, "logits/rejected": -2.1825098991394043, "logps/chosen": -57.76067352294922, "logps/rejected": -65.59249877929688, "loss": 0.9964, "rewards/accuracies": 0.59375, "rewards/chosen": -0.016748882830142975, "rewards/margins": 0.014551901258528233, "rewards/rejected": -0.03130078315734863, "step": 36 }, { "epoch": 0.08478945860784876, "grad_norm": 2.055598735809326, "learning_rate": 9.91743119266055e-07, "logits/chosen": -2.281888008117676, "logits/rejected": -2.274064302444458, "logps/chosen": -54.43236541748047, "logps/rejected": -60.04231262207031, "loss": 0.992, "rewards/accuracies": 0.625, "rewards/chosen": 0.007791102398186922, "rewards/margins": 0.03193409740924835, "rewards/rejected": -0.02414299175143242, "step": 37 }, { "epoch": 0.08708106559725007, "grad_norm": 2.067560911178589, "learning_rate": 9.9151376146789e-07, "logits/chosen": -2.2964019775390625, "logits/rejected": -2.242004156112671, "logps/chosen": -57.737525939941406, "logps/rejected": -60.441307067871094, "loss": 0.9997, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03331613540649414, "rewards/margins": 0.0013259155675768852, "rewards/rejected": -0.0346420556306839, "step": 38 }, { "epoch": 0.0893726725866514, "grad_norm": 2.1999645233154297, "learning_rate": 9.912844036697248e-07, "logits/chosen": -2.2578392028808594, "logits/rejected": -2.2330121994018555, "logps/chosen": -56.91482162475586, "logps/rejected": -62.65378952026367, "loss": 0.9955, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05973862484097481, "rewards/margins": 0.017925594002008438, "rewards/rejected": -0.07766421884298325, "step": 39 }, { "epoch": 0.0916642795760527, "grad_norm": 2.396113157272339, "learning_rate": 9.910550458715596e-07, "logits/chosen": -2.2728302478790283, "logits/rejected": -2.3092923164367676, "logps/chosen": -57.491092681884766, "logps/rejected": -62.87043380737305, "loss": 0.9927, "rewards/accuracies": 0.59375, "rewards/chosen": 0.01683647744357586, "rewards/margins": 0.029263485223054886, "rewards/rejected": -0.012427008710801601, "step": 40 }, { "epoch": 0.09395588656545402, "grad_norm": 5.573437690734863, "learning_rate": 9.908256880733945e-07, "logits/chosen": -2.2910966873168945, "logits/rejected": -2.2753102779388428, "logps/chosen": -52.649723052978516, "logps/rejected": -59.624454498291016, "loss": 0.996, "rewards/accuracies": 0.59375, "rewards/chosen": -0.028833989053964615, "rewards/margins": 0.0163214560598135, "rewards/rejected": -0.045155443251132965, "step": 41 }, { "epoch": 0.09624749355485535, "grad_norm": 2.008760690689087, "learning_rate": 9.905963302752293e-07, "logits/chosen": -2.285581111907959, "logits/rejected": -2.2513580322265625, "logps/chosen": -57.3240966796875, "logps/rejected": -61.17287826538086, "loss": 1.0013, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05286711826920509, "rewards/margins": -0.005198289640247822, "rewards/rejected": -0.04766882583498955, "step": 42 }, { "epoch": 0.09853910054425666, "grad_norm": 1.8647111654281616, "learning_rate": 9.903669724770641e-07, "logits/chosen": -2.2672765254974365, "logits/rejected": -2.2616517543792725, "logps/chosen": -55.3670768737793, "logps/rejected": -58.69886779785156, "loss": 0.9945, "rewards/accuracies": 0.625, "rewards/chosen": -0.04020695760846138, "rewards/margins": 0.0222028736025095, "rewards/rejected": -0.06240983307361603, "step": 43 }, { "epoch": 0.10083070753365798, "grad_norm": 2.1342997550964355, "learning_rate": 9.90137614678899e-07, "logits/chosen": -2.3298559188842773, "logits/rejected": -2.273150682449341, "logps/chosen": -56.305179595947266, "logps/rejected": -61.02680206298828, "loss": 0.9843, "rewards/accuracies": 0.75, "rewards/chosen": -0.017465054988861084, "rewards/margins": 0.06294895708560944, "rewards/rejected": -0.08041401207447052, "step": 44 }, { "epoch": 0.1031223145230593, "grad_norm": 2.2883505821228027, "learning_rate": 9.899082568807338e-07, "logits/chosen": -2.2614359855651855, "logits/rejected": -2.264427423477173, "logps/chosen": -59.764610290527344, "logps/rejected": -62.20191192626953, "loss": 1.0013, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06823592633008957, "rewards/margins": -0.005112671293318272, "rewards/rejected": -0.06312324851751328, "step": 45 }, { "epoch": 0.10541392151246061, "grad_norm": 3.397052049636841, "learning_rate": 9.896788990825688e-07, "logits/chosen": -2.2086803913116455, "logits/rejected": -2.2015292644500732, "logps/chosen": -59.29709243774414, "logps/rejected": -64.59185028076172, "loss": 0.9883, "rewards/accuracies": 0.75, "rewards/chosen": -0.005200793035328388, "rewards/margins": 0.04707469046115875, "rewards/rejected": -0.05227547883987427, "step": 46 }, { "epoch": 0.10770552850186194, "grad_norm": 2.153496742248535, "learning_rate": 9.894495412844036e-07, "logits/chosen": -2.2699146270751953, "logits/rejected": -2.246856689453125, "logps/chosen": -57.28041076660156, "logps/rejected": -60.047393798828125, "loss": 0.9996, "rewards/accuracies": 0.59375, "rewards/chosen": -0.050542913377285004, "rewards/margins": 0.0014779437333345413, "rewards/rejected": -0.05202086269855499, "step": 47 }, { "epoch": 0.10999713549126325, "grad_norm": 2.6306941509246826, "learning_rate": 9.892201834862385e-07, "logits/chosen": -2.240090847015381, "logits/rejected": -2.2835235595703125, "logps/chosen": -60.00048065185547, "logps/rejected": -63.929283142089844, "loss": 0.9962, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03799328953027725, "rewards/margins": 0.01508406549692154, "rewards/rejected": -0.05307735130190849, "step": 48 }, { "epoch": 0.11228874248066456, "grad_norm": 2.4552886486053467, "learning_rate": 9.889908256880733e-07, "logits/chosen": -2.3317036628723145, "logits/rejected": -2.310199737548828, "logps/chosen": -57.99382781982422, "logps/rejected": -61.555572509765625, "loss": 0.9951, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0393819734454155, "rewards/margins": 0.019750498235225677, "rewards/rejected": -0.05913246423006058, "step": 49 }, { "epoch": 0.11458034947006589, "grad_norm": 2.1514551639556885, "learning_rate": 9.887614678899083e-07, "logits/chosen": -2.292430877685547, "logits/rejected": -2.2612314224243164, "logps/chosen": -54.63652420043945, "logps/rejected": -59.8076171875, "loss": 1.0014, "rewards/accuracies": 0.5, "rewards/chosen": -0.06560905277729034, "rewards/margins": -0.005637171678245068, "rewards/rejected": -0.0599718801677227, "step": 50 }, { "epoch": 0.1168719564594672, "grad_norm": 2.579472303390503, "learning_rate": 9.885321100917431e-07, "logits/chosen": -2.2558107376098633, "logits/rejected": -2.245762825012207, "logps/chosen": -54.02759552001953, "logps/rejected": -57.50906753540039, "loss": 1.0011, "rewards/accuracies": 0.46875, "rewards/chosen": -0.06275400519371033, "rewards/margins": -0.004358362406492233, "rewards/rejected": -0.058395639061927795, "step": 51 }, { "epoch": 0.11916356344886853, "grad_norm": 2.3518378734588623, "learning_rate": 9.88302752293578e-07, "logits/chosen": -2.2736799716949463, "logits/rejected": -2.270439624786377, "logps/chosen": -55.8138427734375, "logps/rejected": -60.52402877807617, "loss": 0.9907, "rewards/accuracies": 0.625, "rewards/chosen": -0.010474801063537598, "rewards/margins": 0.03737213462591171, "rewards/rejected": -0.04784693568944931, "step": 52 }, { "epoch": 0.12145517043826984, "grad_norm": 2.1405029296875, "learning_rate": 9.880733944954128e-07, "logits/chosen": -2.260054588317871, "logits/rejected": -2.224705696105957, "logps/chosen": -53.51076889038086, "logps/rejected": -60.53538131713867, "loss": 0.9925, "rewards/accuracies": 0.625, "rewards/chosen": -0.04563383013010025, "rewards/margins": 0.03017866611480713, "rewards/rejected": -0.07581249624490738, "step": 53 }, { "epoch": 0.12374677742767115, "grad_norm": 1.9388738870620728, "learning_rate": 9.878440366972476e-07, "logits/chosen": -2.249469518661499, "logits/rejected": -2.215237617492676, "logps/chosen": -49.69164276123047, "logps/rejected": -55.095218658447266, "loss": 0.995, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03565865382552147, "rewards/margins": 0.019951146095991135, "rewards/rejected": -0.055609799921512604, "step": 54 }, { "epoch": 0.12603838441707246, "grad_norm": 2.7804760932922363, "learning_rate": 9.876146788990827e-07, "logits/chosen": -2.2454943656921387, "logits/rejected": -2.2659945487976074, "logps/chosen": -53.94697570800781, "logps/rejected": -60.86894226074219, "loss": 0.9937, "rewards/accuracies": 0.625, "rewards/chosen": -0.06320160627365112, "rewards/margins": 0.025501955300569534, "rewards/rejected": -0.08870355784893036, "step": 55 }, { "epoch": 0.1283299914064738, "grad_norm": 2.805026054382324, "learning_rate": 9.873853211009175e-07, "logits/chosen": -2.3921737670898438, "logits/rejected": -2.3297243118286133, "logps/chosen": -52.860748291015625, "logps/rejected": -59.73773956298828, "loss": 0.9956, "rewards/accuracies": 0.53125, "rewards/chosen": -0.040753867477178574, "rewards/margins": 0.017520785331726074, "rewards/rejected": -0.058274656534194946, "step": 56 }, { "epoch": 0.13062159839587512, "grad_norm": 2.1799428462982178, "learning_rate": 9.871559633027523e-07, "logits/chosen": -2.2534308433532715, "logits/rejected": -2.251300811767578, "logps/chosen": -56.19977569580078, "logps/rejected": -60.67996597290039, "loss": 0.9951, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03774009644985199, "rewards/margins": 0.019863545894622803, "rewards/rejected": -0.05760364979505539, "step": 57 }, { "epoch": 0.13291320538527643, "grad_norm": 2.1490306854248047, "learning_rate": 9.869266055045871e-07, "logits/chosen": -2.289924144744873, "logits/rejected": -2.2294580936431885, "logps/chosen": -55.69793701171875, "logps/rejected": -60.37866973876953, "loss": 0.9945, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07516489923000336, "rewards/margins": 0.02211831882596016, "rewards/rejected": -0.09728322178125381, "step": 58 }, { "epoch": 0.13520481237467774, "grad_norm": 2.451678514480591, "learning_rate": 9.86697247706422e-07, "logits/chosen": -2.332660436630249, "logits/rejected": -2.2858948707580566, "logps/chosen": -53.19508743286133, "logps/rejected": -60.0675048828125, "loss": 0.9897, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0655772015452385, "rewards/margins": 0.041423942893743515, "rewards/rejected": -0.10700114816427231, "step": 59 }, { "epoch": 0.13749641936407905, "grad_norm": 2.3335928916931152, "learning_rate": 9.864678899082568e-07, "logits/chosen": -2.2452619075775146, "logits/rejected": -2.25412654876709, "logps/chosen": -55.34220886230469, "logps/rejected": -60.79353332519531, "loss": 0.9943, "rewards/accuracies": 0.625, "rewards/chosen": -0.06351224333047867, "rewards/margins": 0.022858524695038795, "rewards/rejected": -0.08637076616287231, "step": 60 }, { "epoch": 0.13978802635348037, "grad_norm": 2.2631778717041016, "learning_rate": 9.862385321100916e-07, "logits/chosen": -2.2377359867095947, "logits/rejected": -2.2334861755371094, "logps/chosen": -54.73091125488281, "logps/rejected": -60.9425048828125, "loss": 0.9902, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04806937277317047, "rewards/margins": 0.03958786651492119, "rewards/rejected": -0.08765724301338196, "step": 61 }, { "epoch": 0.1420796333428817, "grad_norm": 2.334578514099121, "learning_rate": 9.860091743119264e-07, "logits/chosen": -2.2960023880004883, "logits/rejected": -2.2840945720672607, "logps/chosen": -54.15938186645508, "logps/rejected": -60.615692138671875, "loss": 0.9886, "rewards/accuracies": 0.75, "rewards/chosen": -0.04662933573126793, "rewards/margins": 0.045687153935432434, "rewards/rejected": -0.09231648594141006, "step": 62 }, { "epoch": 0.14437124033228302, "grad_norm": 2.154158592224121, "learning_rate": 9.857798165137615e-07, "logits/chosen": -2.2865383625030518, "logits/rejected": -2.234926223754883, "logps/chosen": -57.11757278442383, "logps/rejected": -61.27959442138672, "loss": 0.9926, "rewards/accuracies": 0.625, "rewards/chosen": -0.08515572547912598, "rewards/margins": 0.02974555641412735, "rewards/rejected": -0.11490127444267273, "step": 63 }, { "epoch": 0.14666284732168433, "grad_norm": 2.4766898155212402, "learning_rate": 9.855504587155963e-07, "logits/chosen": -2.251610279083252, "logits/rejected": -2.2395830154418945, "logps/chosen": -54.79426574707031, "logps/rejected": -60.02269744873047, "loss": 0.9859, "rewards/accuracies": 0.65625, "rewards/chosen": -0.062424689531326294, "rewards/margins": 0.05642938241362572, "rewards/rejected": -0.11885407567024231, "step": 64 }, { "epoch": 0.14895445431108564, "grad_norm": 2.228755235671997, "learning_rate": 9.853211009174311e-07, "logits/chosen": -2.294185161590576, "logits/rejected": -2.2923717498779297, "logps/chosen": -55.0169792175293, "logps/rejected": -59.29924392700195, "loss": 0.9871, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06690752506256104, "rewards/margins": 0.05181329324841499, "rewards/rejected": -0.11872082948684692, "step": 65 }, { "epoch": 0.15124606130048696, "grad_norm": 2.0844147205352783, "learning_rate": 9.85091743119266e-07, "logits/chosen": -2.2627367973327637, "logits/rejected": -2.222179412841797, "logps/chosen": -53.63114929199219, "logps/rejected": -57.460941314697266, "loss": 0.9885, "rewards/accuracies": 0.65625, "rewards/chosen": -0.056211523711681366, "rewards/margins": 0.04631359502673149, "rewards/rejected": -0.10252512246370316, "step": 66 }, { "epoch": 0.1535376682898883, "grad_norm": 2.6145360469818115, "learning_rate": 9.848623853211008e-07, "logits/chosen": -2.257087230682373, "logits/rejected": -2.2397308349609375, "logps/chosen": -57.15543746948242, "logps/rejected": -68.40916442871094, "loss": 0.9853, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08271430432796478, "rewards/margins": 0.05925467610359192, "rewards/rejected": -0.1419689655303955, "step": 67 }, { "epoch": 0.1558292752792896, "grad_norm": 2.3879969120025635, "learning_rate": 9.846330275229358e-07, "logits/chosen": -2.2821526527404785, "logits/rejected": -2.2584385871887207, "logps/chosen": -58.35992431640625, "logps/rejected": -61.25897216796875, "loss": 0.9886, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07838606834411621, "rewards/margins": 0.045829154551029205, "rewards/rejected": -0.12421522289514542, "step": 68 }, { "epoch": 0.15812088226869092, "grad_norm": 2.482217311859131, "learning_rate": 9.844036697247706e-07, "logits/chosen": -2.274644613265991, "logits/rejected": -2.28141713142395, "logps/chosen": -52.480506896972656, "logps/rejected": -60.71365737915039, "loss": 0.9901, "rewards/accuracies": 0.75, "rewards/chosen": -0.06663210690021515, "rewards/margins": 0.039463870227336884, "rewards/rejected": -0.10609596967697144, "step": 69 }, { "epoch": 0.16041248925809223, "grad_norm": 2.5124387741088867, "learning_rate": 9.841743119266055e-07, "logits/chosen": -2.2977380752563477, "logits/rejected": -2.2571487426757812, "logps/chosen": -52.11460876464844, "logps/rejected": -57.28373718261719, "loss": 0.9868, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08710253983736038, "rewards/margins": 0.05331660807132721, "rewards/rejected": -0.1404191553592682, "step": 70 }, { "epoch": 0.16270409624749355, "grad_norm": 2.3412137031555176, "learning_rate": 9.839449541284403e-07, "logits/chosen": -2.272664785385132, "logits/rejected": -2.259523391723633, "logps/chosen": -56.7895393371582, "logps/rejected": -62.2685546875, "loss": 0.9973, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09073349833488464, "rewards/margins": 0.011010587215423584, "rewards/rejected": -0.10174407809972763, "step": 71 }, { "epoch": 0.1649957032368949, "grad_norm": 2.405036211013794, "learning_rate": 9.837155963302753e-07, "logits/chosen": -2.257375955581665, "logits/rejected": -2.2576324939727783, "logps/chosen": -57.95681381225586, "logps/rejected": -64.62413024902344, "loss": 0.9847, "rewards/accuracies": 0.75, "rewards/chosen": -0.0924774631857872, "rewards/margins": 0.061439916491508484, "rewards/rejected": -0.15391740202903748, "step": 72 }, { "epoch": 0.1672873102262962, "grad_norm": 2.3043112754821777, "learning_rate": 9.834862385321102e-07, "logits/chosen": -2.2793467044830322, "logits/rejected": -2.2597718238830566, "logps/chosen": -53.25971221923828, "logps/rejected": -55.248191833496094, "loss": 0.9916, "rewards/accuracies": 0.625, "rewards/chosen": -0.09948988258838654, "rewards/margins": 0.033824436366558075, "rewards/rejected": -0.133314311504364, "step": 73 }, { "epoch": 0.1695789172156975, "grad_norm": 2.510230302810669, "learning_rate": 9.832568807339448e-07, "logits/chosen": -2.211651086807251, "logits/rejected": -2.241121292114258, "logps/chosen": -58.20675277709961, "logps/rejected": -60.62789535522461, "loss": 0.987, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1000632494688034, "rewards/margins": 0.05218367278575897, "rewards/rejected": -0.15224692225456238, "step": 74 }, { "epoch": 0.17187052420509882, "grad_norm": 2.345466136932373, "learning_rate": 9.830275229357798e-07, "logits/chosen": -2.2876572608947754, "logits/rejected": -2.310811996459961, "logps/chosen": -55.20918655395508, "logps/rejected": -60.6778678894043, "loss": 0.984, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1037675142288208, "rewards/margins": 0.06451713293790817, "rewards/rejected": -0.16828463971614838, "step": 75 }, { "epoch": 0.17416213119450014, "grad_norm": 2.541722536087036, "learning_rate": 9.827981651376146e-07, "logits/chosen": -2.268829345703125, "logits/rejected": -2.2162439823150635, "logps/chosen": -55.1480598449707, "logps/rejected": -61.11545181274414, "loss": 0.982, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08479016274213791, "rewards/margins": 0.07275549322366714, "rewards/rejected": -0.15754567086696625, "step": 76 }, { "epoch": 0.17645373818390145, "grad_norm": 2.657405138015747, "learning_rate": 9.825688073394495e-07, "logits/chosen": -2.2917282581329346, "logits/rejected": -2.2735159397125244, "logps/chosen": -53.42503356933594, "logps/rejected": -58.89674377441406, "loss": 0.9859, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11327068507671356, "rewards/margins": 0.0566793717443943, "rewards/rejected": -0.16995003819465637, "step": 77 }, { "epoch": 0.1787453451733028, "grad_norm": 2.262871265411377, "learning_rate": 9.823394495412843e-07, "logits/chosen": -2.242443561553955, "logits/rejected": -2.206453800201416, "logps/chosen": -54.086158752441406, "logps/rejected": -62.059547424316406, "loss": 0.9831, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10366638004779816, "rewards/margins": 0.06831050664186478, "rewards/rejected": -0.17197689414024353, "step": 78 }, { "epoch": 0.1810369521627041, "grad_norm": 2.523561716079712, "learning_rate": 9.821100917431191e-07, "logits/chosen": -2.2702584266662598, "logits/rejected": -2.259735107421875, "logps/chosen": -59.44007873535156, "logps/rejected": -63.11256408691406, "loss": 0.9832, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12610691785812378, "rewards/margins": 0.06775548309087753, "rewards/rejected": -0.1938624083995819, "step": 79 }, { "epoch": 0.1833285591521054, "grad_norm": 2.7787630558013916, "learning_rate": 9.818807339449542e-07, "logits/chosen": -2.2637736797332764, "logits/rejected": -2.2998738288879395, "logps/chosen": -55.702667236328125, "logps/rejected": -62.367488861083984, "loss": 0.986, "rewards/accuracies": 0.625, "rewards/chosen": -0.10687491297721863, "rewards/margins": 0.05691622942686081, "rewards/rejected": -0.16379114985466003, "step": 80 }, { "epoch": 0.18562016614150673, "grad_norm": 2.4939489364624023, "learning_rate": 9.81651376146789e-07, "logits/chosen": -2.2633509635925293, "logits/rejected": -2.27091908454895, "logps/chosen": -57.29564666748047, "logps/rejected": -62.47737121582031, "loss": 0.984, "rewards/accuracies": 0.75, "rewards/chosen": -0.17566820979118347, "rewards/margins": 0.06468132883310318, "rewards/rejected": -0.24034951627254486, "step": 81 }, { "epoch": 0.18791177313090804, "grad_norm": 2.9198966026306152, "learning_rate": 9.814220183486238e-07, "logits/chosen": -2.265245199203491, "logits/rejected": -2.2692694664001465, "logps/chosen": -55.06877517700195, "logps/rejected": -59.56055450439453, "loss": 0.9805, "rewards/accuracies": 0.75, "rewards/chosen": -0.12424929440021515, "rewards/margins": 0.07917994260787964, "rewards/rejected": -0.20342925190925598, "step": 82 }, { "epoch": 0.19020338012030938, "grad_norm": 2.356034517288208, "learning_rate": 9.811926605504586e-07, "logits/chosen": -2.2577004432678223, "logits/rejected": -2.2313284873962402, "logps/chosen": -54.69230651855469, "logps/rejected": -60.89252853393555, "loss": 0.9886, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13729578256607056, "rewards/margins": 0.04584010690450668, "rewards/rejected": -0.18313589692115784, "step": 83 }, { "epoch": 0.1924949871097107, "grad_norm": 4.034863471984863, "learning_rate": 9.809633027522935e-07, "logits/chosen": -2.240403652191162, "logits/rejected": -2.266171932220459, "logps/chosen": -57.64067840576172, "logps/rejected": -62.647499084472656, "loss": 0.9864, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16169242560863495, "rewards/margins": 0.054852474480867386, "rewards/rejected": -0.21654488146305084, "step": 84 }, { "epoch": 0.194786594099112, "grad_norm": 2.364189386367798, "learning_rate": 9.807339449541285e-07, "logits/chosen": -2.2918381690979004, "logits/rejected": -2.2672126293182373, "logps/chosen": -53.396419525146484, "logps/rejected": -58.83460998535156, "loss": 0.9747, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11851036548614502, "rewards/margins": 0.10237519443035126, "rewards/rejected": -0.22088555991649628, "step": 85 }, { "epoch": 0.19707820108851332, "grad_norm": 3.3050003051757812, "learning_rate": 9.805045871559633e-07, "logits/chosen": -2.2627077102661133, "logits/rejected": -2.214418411254883, "logps/chosen": -54.76317596435547, "logps/rejected": -62.99654006958008, "loss": 0.9766, "rewards/accuracies": 0.78125, "rewards/chosen": -0.15194562077522278, "rewards/margins": 0.09501925110816956, "rewards/rejected": -0.24696487188339233, "step": 86 }, { "epoch": 0.19936980807791463, "grad_norm": 2.6096816062927246, "learning_rate": 9.802752293577981e-07, "logits/chosen": -2.3231041431427, "logits/rejected": -2.245713949203491, "logps/chosen": -57.5986328125, "logps/rejected": -63.917701721191406, "loss": 0.9711, "rewards/accuracies": 0.875, "rewards/chosen": -0.16507181525230408, "rewards/margins": 0.11751526594161987, "rewards/rejected": -0.28258708119392395, "step": 87 }, { "epoch": 0.20166141506731597, "grad_norm": 2.435739755630493, "learning_rate": 9.80045871559633e-07, "logits/chosen": -2.301229238510132, "logits/rejected": -2.267078399658203, "logps/chosen": -55.666751861572266, "logps/rejected": -59.10593795776367, "loss": 0.9834, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14861387014389038, "rewards/margins": 0.06763258576393127, "rewards/rejected": -0.21624645590782166, "step": 88 }, { "epoch": 0.20166141506731597, "eval_logits/chosen": -2.3328747749328613, "eval_logits/rejected": -2.326422929763794, "eval_logps/chosen": -58.89457321166992, "eval_logps/rejected": -63.57447052001953, "eval_loss": 0.9829249382019043, "eval_rewards/accuracies": 0.6811320781707764, "eval_rewards/chosen": -0.18322210013866425, "eval_rewards/margins": 0.06928078830242157, "eval_rewards/rejected": -0.2525028884410858, "eval_runtime": 964.9038, "eval_samples_per_second": 0.548, "eval_steps_per_second": 0.275, "step": 88 }, { "epoch": 0.20395302205671728, "grad_norm": 2.2782914638519287, "learning_rate": 9.798165137614678e-07, "logits/chosen": -2.3232297897338867, "logits/rejected": -2.2718794345855713, "logps/chosen": -56.503883361816406, "logps/rejected": -61.422821044921875, "loss": 0.9713, "rewards/accuracies": 0.875, "rewards/chosen": -0.13381344079971313, "rewards/margins": 0.116395004093647, "rewards/rejected": -0.25020843744277954, "step": 89 }, { "epoch": 0.2062446290461186, "grad_norm": 2.9069197177886963, "learning_rate": 9.795871559633026e-07, "logits/chosen": -2.2789194583892822, "logits/rejected": -2.2761588096618652, "logps/chosen": -58.9554557800293, "logps/rejected": -65.06226348876953, "loss": 0.9859, "rewards/accuracies": 0.625, "rewards/chosen": -0.20881929993629456, "rewards/margins": 0.05833371728658676, "rewards/rejected": -0.2671529948711395, "step": 90 }, { "epoch": 0.2085362360355199, "grad_norm": 2.5379087924957275, "learning_rate": 9.793577981651374e-07, "logits/chosen": -2.3214282989501953, "logits/rejected": -2.310032844543457, "logps/chosen": -55.60780715942383, "logps/rejected": -61.245506286621094, "loss": 0.9806, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1992567777633667, "rewards/margins": 0.07930031418800354, "rewards/rejected": -0.27855706214904785, "step": 91 }, { "epoch": 0.21082784302492122, "grad_norm": 2.1756627559661865, "learning_rate": 9.791284403669725e-07, "logits/chosen": -2.2539119720458984, "logits/rejected": -2.2597763538360596, "logps/chosen": -58.86365509033203, "logps/rejected": -64.42219543457031, "loss": 0.9885, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22267985343933105, "rewards/margins": 0.04669353738427162, "rewards/rejected": -0.26937341690063477, "step": 92 }, { "epoch": 0.21311945001432253, "grad_norm": 2.547180652618408, "learning_rate": 9.788990825688073e-07, "logits/chosen": -2.268528461456299, "logits/rejected": -2.1997556686401367, "logps/chosen": -56.28026580810547, "logps/rejected": -66.01068115234375, "loss": 0.965, "rewards/accuracies": 0.9375, "rewards/chosen": -0.194570392370224, "rewards/margins": 0.14385806024074554, "rewards/rejected": -0.33842846751213074, "step": 93 }, { "epoch": 0.21541105700372387, "grad_norm": 2.790682554244995, "learning_rate": 9.786697247706421e-07, "logits/chosen": -2.259021282196045, "logits/rejected": -2.263533592224121, "logps/chosen": -57.73589324951172, "logps/rejected": -64.8173599243164, "loss": 0.9775, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23520641028881073, "rewards/margins": 0.09329140186309814, "rewards/rejected": -0.32849782705307007, "step": 94 }, { "epoch": 0.21770266399312518, "grad_norm": 2.753235101699829, "learning_rate": 9.78440366972477e-07, "logits/chosen": -2.270951986312866, "logits/rejected": -2.2369284629821777, "logps/chosen": -55.203773498535156, "logps/rejected": -62.66721725463867, "loss": 0.9696, "rewards/accuracies": 0.75, "rewards/chosen": -0.2138325572013855, "rewards/margins": 0.12463142722845078, "rewards/rejected": -0.3384639322757721, "step": 95 }, { "epoch": 0.2199942709825265, "grad_norm": 2.3224689960479736, "learning_rate": 9.782110091743118e-07, "logits/chosen": -2.263597249984741, "logits/rejected": -2.2310967445373535, "logps/chosen": -56.703548431396484, "logps/rejected": -61.92481231689453, "loss": 0.9895, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2751515507698059, "rewards/margins": 0.041873641312122345, "rewards/rejected": -0.31702518463134766, "step": 96 }, { "epoch": 0.2222858779719278, "grad_norm": 2.65478777885437, "learning_rate": 9.779816513761468e-07, "logits/chosen": -2.314836025238037, "logits/rejected": -2.2868833541870117, "logps/chosen": -56.218017578125, "logps/rejected": -63.3751220703125, "loss": 0.9823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.26177680492401123, "rewards/margins": 0.07332320511341095, "rewards/rejected": -0.3351000249385834, "step": 97 }, { "epoch": 0.22457748496132912, "grad_norm": 2.91223406791687, "learning_rate": 9.777522935779817e-07, "logits/chosen": -2.3135969638824463, "logits/rejected": -2.31720232963562, "logps/chosen": -60.46677017211914, "logps/rejected": -63.87541961669922, "loss": 0.976, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2126462459564209, "rewards/margins": 0.0981181412935257, "rewards/rejected": -0.3107644319534302, "step": 98 }, { "epoch": 0.22686909195073046, "grad_norm": 2.2924790382385254, "learning_rate": 9.775229357798165e-07, "logits/chosen": -2.2454757690429688, "logits/rejected": -2.261589288711548, "logps/chosen": -56.13756561279297, "logps/rejected": -60.21965026855469, "loss": 0.9696, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2055058777332306, "rewards/margins": 0.12363177537918091, "rewards/rejected": -0.3291376531124115, "step": 99 }, { "epoch": 0.22916069894013177, "grad_norm": 2.4295101165771484, "learning_rate": 9.772935779816513e-07, "logits/chosen": -2.2938642501831055, "logits/rejected": -2.3304121494293213, "logps/chosen": -57.234771728515625, "logps/rejected": -65.30625915527344, "loss": 0.9649, "rewards/accuracies": 0.875, "rewards/chosen": -0.2431465983390808, "rewards/margins": 0.14561186730861664, "rewards/rejected": -0.38875848054885864, "step": 100 }, { "epoch": 0.2314523059295331, "grad_norm": 2.205414295196533, "learning_rate": 9.770642201834861e-07, "logits/chosen": -2.280993700027466, "logits/rejected": -2.290090560913086, "logps/chosen": -53.7105827331543, "logps/rejected": -62.399940490722656, "loss": 0.9749, "rewards/accuracies": 0.75, "rewards/chosen": -0.2814340591430664, "rewards/margins": 0.10453303903341293, "rewards/rejected": -0.38596707582473755, "step": 101 }, { "epoch": 0.2337439129189344, "grad_norm": 6.934295654296875, "learning_rate": 9.768348623853212e-07, "logits/chosen": -2.2606074810028076, "logits/rejected": -2.2848429679870605, "logps/chosen": -54.08441925048828, "logps/rejected": -60.7122802734375, "loss": 0.9698, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2022152990102768, "rewards/margins": 0.12547726929187775, "rewards/rejected": -0.32769256830215454, "step": 102 }, { "epoch": 0.2360355199083357, "grad_norm": 2.8541529178619385, "learning_rate": 9.76605504587156e-07, "logits/chosen": -2.3569295406341553, "logits/rejected": -2.314295530319214, "logps/chosen": -57.70225524902344, "logps/rejected": -63.26179504394531, "loss": 0.9737, "rewards/accuracies": 0.75, "rewards/chosen": -0.25551801919937134, "rewards/margins": 0.10915958136320114, "rewards/rejected": -0.3646776080131531, "step": 103 }, { "epoch": 0.23832712689773705, "grad_norm": 2.5258994102478027, "learning_rate": 9.763761467889908e-07, "logits/chosen": -2.2411537170410156, "logits/rejected": -2.2225334644317627, "logps/chosen": -59.02644729614258, "logps/rejected": -66.61002349853516, "loss": 0.9687, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24060533940792084, "rewards/margins": 0.13153359293937683, "rewards/rejected": -0.37213897705078125, "step": 104 }, { "epoch": 0.24061873388713836, "grad_norm": 2.182965040206909, "learning_rate": 9.761467889908256e-07, "logits/chosen": -2.285491704940796, "logits/rejected": -2.237971544265747, "logps/chosen": -55.4235954284668, "logps/rejected": -60.7089958190918, "loss": 0.9689, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22821295261383057, "rewards/margins": 0.12794974446296692, "rewards/rejected": -0.3561626672744751, "step": 105 }, { "epoch": 0.24291034087653968, "grad_norm": 2.4593470096588135, "learning_rate": 9.759174311926605e-07, "logits/chosen": -2.322093963623047, "logits/rejected": -2.3115692138671875, "logps/chosen": -52.9210090637207, "logps/rejected": -57.57618713378906, "loss": 0.9773, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26033174991607666, "rewards/margins": 0.09338255226612091, "rewards/rejected": -0.35371431708335876, "step": 106 }, { "epoch": 0.245201947865941, "grad_norm": 2.8422036170959473, "learning_rate": 9.756880733944953e-07, "logits/chosen": -2.318422317504883, "logits/rejected": -2.317319393157959, "logps/chosen": -51.254913330078125, "logps/rejected": -61.765010833740234, "loss": 0.9569, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15156146883964539, "rewards/margins": 0.1777164489030838, "rewards/rejected": -0.3292779326438904, "step": 107 }, { "epoch": 0.2474935548553423, "grad_norm": 2.2588369846343994, "learning_rate": 9.754587155963301e-07, "logits/chosen": -2.3171396255493164, "logits/rejected": -2.314578056335449, "logps/chosen": -56.40192794799805, "logps/rejected": -63.46107482910156, "loss": 0.9578, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2537056505680084, "rewards/margins": 0.17777621746063232, "rewards/rejected": -0.43148186802864075, "step": 108 }, { "epoch": 0.24978516184474361, "grad_norm": 2.4533426761627197, "learning_rate": 9.752293577981652e-07, "logits/chosen": -2.345057964324951, "logits/rejected": -2.3152074813842773, "logps/chosen": -60.97263717651367, "logps/rejected": -65.57259368896484, "loss": 0.9597, "rewards/accuracies": 0.84375, "rewards/chosen": -0.32138463854789734, "rewards/margins": 0.1716977059841156, "rewards/rejected": -0.49308234453201294, "step": 109 }, { "epoch": 0.2520767688341449, "grad_norm": 2.503978729248047, "learning_rate": 9.75e-07, "logits/chosen": -2.3283333778381348, "logits/rejected": -2.331357955932617, "logps/chosen": -59.7822380065918, "logps/rejected": -63.97633361816406, "loss": 0.9625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3263043463230133, "rewards/margins": 0.1578531414270401, "rewards/rejected": -0.4841575026512146, "step": 110 }, { "epoch": 0.25436837582354627, "grad_norm": 2.551347255706787, "learning_rate": 9.747706422018348e-07, "logits/chosen": -2.281740665435791, "logits/rejected": -2.3090171813964844, "logps/chosen": -62.886268615722656, "logps/rejected": -68.31816101074219, "loss": 0.9632, "rewards/accuracies": 0.75, "rewards/chosen": -0.35534143447875977, "rewards/margins": 0.1521342694759369, "rewards/rejected": -0.5074756741523743, "step": 111 }, { "epoch": 0.2566599828129476, "grad_norm": 2.665898561477661, "learning_rate": 9.745412844036696e-07, "logits/chosen": -2.2753782272338867, "logits/rejected": -2.2714250087738037, "logps/chosen": -60.426090240478516, "logps/rejected": -66.99775695800781, "loss": 0.9603, "rewards/accuracies": 0.84375, "rewards/chosen": -0.32670220732688904, "rewards/margins": 0.1678197979927063, "rewards/rejected": -0.4945220351219177, "step": 112 }, { "epoch": 0.2589515898023489, "grad_norm": 2.420654296875, "learning_rate": 9.743119266055045e-07, "logits/chosen": -2.283048391342163, "logits/rejected": -2.296807050704956, "logps/chosen": -61.98945236206055, "logps/rejected": -67.46572875976562, "loss": 0.9687, "rewards/accuracies": 0.75, "rewards/chosen": -0.39902836084365845, "rewards/margins": 0.13316091895103455, "rewards/rejected": -0.5321892499923706, "step": 113 }, { "epoch": 0.26124319679175023, "grad_norm": 2.2228686809539795, "learning_rate": 9.740825688073395e-07, "logits/chosen": -2.3490214347839355, "logits/rejected": -2.296261787414551, "logps/chosen": -61.318695068359375, "logps/rejected": -66.0455322265625, "loss": 0.9737, "rewards/accuracies": 0.75, "rewards/chosen": -0.4011589288711548, "rewards/margins": 0.11222012341022491, "rewards/rejected": -0.5133790373802185, "step": 114 }, { "epoch": 0.2635348037811515, "grad_norm": 2.5851333141326904, "learning_rate": 9.738532110091743e-07, "logits/chosen": -2.338334560394287, "logits/rejected": -2.341705322265625, "logps/chosen": -59.2625846862793, "logps/rejected": -65.3931884765625, "loss": 0.9561, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3841384947299957, "rewards/margins": 0.1859096884727478, "rewards/rejected": -0.5700481534004211, "step": 115 }, { "epoch": 0.26582641077055286, "grad_norm": 2.7571005821228027, "learning_rate": 9.736238532110091e-07, "logits/chosen": -2.3140485286712646, "logits/rejected": -2.3013992309570312, "logps/chosen": -60.9134635925293, "logps/rejected": -67.15079498291016, "loss": 0.9611, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33000558614730835, "rewards/margins": 0.16947826743125916, "rewards/rejected": -0.4994838535785675, "step": 116 }, { "epoch": 0.26811801775995414, "grad_norm": 2.749811887741089, "learning_rate": 9.73394495412844e-07, "logits/chosen": -2.3410451412200928, "logits/rejected": -2.3143107891082764, "logps/chosen": -56.14763641357422, "logps/rejected": -66.46581268310547, "loss": 0.9457, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33828163146972656, "rewards/margins": 0.2335522323846817, "rewards/rejected": -0.5718339085578918, "step": 117 }, { "epoch": 0.2704096247493555, "grad_norm": 2.6634786128997803, "learning_rate": 9.731651376146788e-07, "logits/chosen": -2.258547782897949, "logits/rejected": -2.2193846702575684, "logps/chosen": -59.04573059082031, "logps/rejected": -63.87548065185547, "loss": 0.9588, "rewards/accuracies": 0.875, "rewards/chosen": -0.33736857771873474, "rewards/margins": 0.17330996692180634, "rewards/rejected": -0.5106785297393799, "step": 118 }, { "epoch": 0.2727012317387568, "grad_norm": 2.172290086746216, "learning_rate": 9.729357798165138e-07, "logits/chosen": -2.3353590965270996, "logits/rejected": -2.3206100463867188, "logps/chosen": -59.65561294555664, "logps/rejected": -64.64927673339844, "loss": 0.9573, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3320775628089905, "rewards/margins": 0.17930178344249725, "rewards/rejected": -0.5113793611526489, "step": 119 }, { "epoch": 0.2749928387281581, "grad_norm": 2.5488128662109375, "learning_rate": 9.727064220183487e-07, "logits/chosen": -2.3051865100860596, "logits/rejected": -2.2847652435302734, "logps/chosen": -56.583343505859375, "logps/rejected": -65.47535705566406, "loss": 0.9412, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3109131455421448, "rewards/margins": 0.25249454379081726, "rewards/rejected": -0.5634077191352844, "step": 120 }, { "epoch": 0.27728444571755945, "grad_norm": 3.365874767303467, "learning_rate": 9.724770642201835e-07, "logits/chosen": -2.3478312492370605, "logits/rejected": -2.3240575790405273, "logps/chosen": -61.26841735839844, "logps/rejected": -69.84873962402344, "loss": 0.943, "rewards/accuracies": 0.96875, "rewards/chosen": -0.345612496137619, "rewards/margins": 0.24423855543136597, "rewards/rejected": -0.5898510217666626, "step": 121 }, { "epoch": 0.27957605270696073, "grad_norm": 3.0100488662719727, "learning_rate": 9.722477064220183e-07, "logits/chosen": -2.3225178718566895, "logits/rejected": -2.2879703044891357, "logps/chosen": -59.399600982666016, "logps/rejected": -65.4563217163086, "loss": 0.9703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3885825276374817, "rewards/margins": 0.12965300679206848, "rewards/rejected": -0.5182355046272278, "step": 122 }, { "epoch": 0.28186765969636207, "grad_norm": 2.6057262420654297, "learning_rate": 9.720183486238531e-07, "logits/chosen": -2.290009021759033, "logits/rejected": -2.302854299545288, "logps/chosen": -57.624629974365234, "logps/rejected": -62.268375396728516, "loss": 0.9561, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3846766948699951, "rewards/margins": 0.19008468091487885, "rewards/rejected": -0.5747613906860352, "step": 123 }, { "epoch": 0.2841592666857634, "grad_norm": 2.5313079357147217, "learning_rate": 9.71788990825688e-07, "logits/chosen": -2.2885336875915527, "logits/rejected": -2.2962894439697266, "logps/chosen": -57.78489685058594, "logps/rejected": -65.10771179199219, "loss": 0.9543, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4137738049030304, "rewards/margins": 0.20018909871578217, "rewards/rejected": -0.6139628887176514, "step": 124 }, { "epoch": 0.2864508736751647, "grad_norm": 2.6213932037353516, "learning_rate": 9.715596330275228e-07, "logits/chosen": -2.3783557415008545, "logits/rejected": -2.3652312755584717, "logps/chosen": -59.199676513671875, "logps/rejected": -67.52406311035156, "loss": 0.9398, "rewards/accuracies": 0.875, "rewards/chosen": -0.3999214470386505, "rewards/margins": 0.2657517194747925, "rewards/rejected": -0.6656730771064758, "step": 125 }, { "epoch": 0.28874248066456604, "grad_norm": 2.791961431503296, "learning_rate": 9.713302752293578e-07, "logits/chosen": -2.339372158050537, "logits/rejected": -2.335164785385132, "logps/chosen": -60.58339309692383, "logps/rejected": -67.60011291503906, "loss": 0.948, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39226290583610535, "rewards/margins": 0.2329002320766449, "rewards/rejected": -0.6251631379127502, "step": 126 }, { "epoch": 0.2910340876539673, "grad_norm": 2.932570695877075, "learning_rate": 9.711009174311927e-07, "logits/chosen": -2.333326578140259, "logits/rejected": -2.294379234313965, "logps/chosen": -59.54530334472656, "logps/rejected": -64.37084197998047, "loss": 0.9529, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3954862058162689, "rewards/margins": 0.20781229436397552, "rewards/rejected": -0.6032984852790833, "step": 127 }, { "epoch": 0.29332569464336866, "grad_norm": 2.861504316329956, "learning_rate": 9.708715596330275e-07, "logits/chosen": -2.3131091594696045, "logits/rejected": -2.32389497756958, "logps/chosen": -56.25912857055664, "logps/rejected": -64.0858154296875, "loss": 0.9349, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29767948389053345, "rewards/margins": 0.2813127934932709, "rewards/rejected": -0.5789922475814819, "step": 128 }, { "epoch": 0.29561730163277, "grad_norm": 2.67392897605896, "learning_rate": 9.706422018348623e-07, "logits/chosen": -2.293217897415161, "logits/rejected": -2.300659656524658, "logps/chosen": -59.12329864501953, "logps/rejected": -69.96846008300781, "loss": 0.9278, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3839358389377594, "rewards/margins": 0.31733426451683044, "rewards/rejected": -0.7012699842453003, "step": 129 }, { "epoch": 0.2979089086221713, "grad_norm": 2.5723860263824463, "learning_rate": 9.704128440366971e-07, "logits/chosen": -2.2847373485565186, "logits/rejected": -2.249488115310669, "logps/chosen": -57.844749450683594, "logps/rejected": -64.11847686767578, "loss": 0.956, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3942826986312866, "rewards/margins": 0.18589362502098083, "rewards/rejected": -0.5801763534545898, "step": 130 }, { "epoch": 0.3002005156115726, "grad_norm": 2.8100152015686035, "learning_rate": 9.701834862385322e-07, "logits/chosen": -2.30733585357666, "logits/rejected": -2.2819647789001465, "logps/chosen": -57.906742095947266, "logps/rejected": -67.02531433105469, "loss": 0.9356, "rewards/accuracies": 0.75, "rewards/chosen": -0.30838829278945923, "rewards/margins": 0.27783697843551636, "rewards/rejected": -0.5862252712249756, "step": 131 }, { "epoch": 0.3024921226009739, "grad_norm": 2.8554635047912598, "learning_rate": 9.69954128440367e-07, "logits/chosen": -2.335334062576294, "logits/rejected": -2.2865688800811768, "logps/chosen": -55.86989212036133, "logps/rejected": -65.69154357910156, "loss": 0.9267, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3768165707588196, "rewards/margins": 0.3249545097351074, "rewards/rejected": -0.701771080493927, "step": 132 }, { "epoch": 0.30478372959037525, "grad_norm": 10.98526382446289, "learning_rate": 9.697247706422018e-07, "logits/chosen": -2.2793710231781006, "logits/rejected": -2.28108549118042, "logps/chosen": -59.882530212402344, "logps/rejected": -64.8948745727539, "loss": 0.948, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4505596160888672, "rewards/margins": 0.24176305532455444, "rewards/rejected": -0.6923226118087769, "step": 133 }, { "epoch": 0.3070753365797766, "grad_norm": 2.978496551513672, "learning_rate": 9.694954128440366e-07, "logits/chosen": -2.3483455181121826, "logits/rejected": -2.3254730701446533, "logps/chosen": -53.70109939575195, "logps/rejected": -62.57904815673828, "loss": 0.9242, "rewards/accuracies": 0.875, "rewards/chosen": -0.2549404799938202, "rewards/margins": 0.3289993107318878, "rewards/rejected": -0.5839398503303528, "step": 134 }, { "epoch": 0.3093669435691779, "grad_norm": 2.9038355350494385, "learning_rate": 9.692660550458715e-07, "logits/chosen": -2.300981283187866, "logits/rejected": -2.2643611431121826, "logps/chosen": -58.28428268432617, "logps/rejected": -64.02203369140625, "loss": 0.9419, "rewards/accuracies": 0.75, "rewards/chosen": -0.3523592948913574, "rewards/margins": 0.24889647960662842, "rewards/rejected": -0.6012558341026306, "step": 135 }, { "epoch": 0.3116585505585792, "grad_norm": 2.6926727294921875, "learning_rate": 9.690366972477065e-07, "logits/chosen": -2.3217477798461914, "logits/rejected": -2.3188860416412354, "logps/chosen": -57.27831268310547, "logps/rejected": -64.87081146240234, "loss": 0.924, "rewards/accuracies": 0.875, "rewards/chosen": -0.3591987192630768, "rewards/margins": 0.3350643515586853, "rewards/rejected": -0.6942630410194397, "step": 136 }, { "epoch": 0.3139501575479805, "grad_norm": 4.860139846801758, "learning_rate": 9.688073394495413e-07, "logits/chosen": -2.322327136993408, "logits/rejected": -2.324695110321045, "logps/chosen": -61.51036834716797, "logps/rejected": -64.70442199707031, "loss": 0.9579, "rewards/accuracies": 0.625, "rewards/chosen": -0.48019373416900635, "rewards/margins": 0.18848447501659393, "rewards/rejected": -0.6686781644821167, "step": 137 }, { "epoch": 0.31624176453738184, "grad_norm": 3.337664842605591, "learning_rate": 9.68577981651376e-07, "logits/chosen": -2.303896188735962, "logits/rejected": -2.2764859199523926, "logps/chosen": -57.51160430908203, "logps/rejected": -67.33910369873047, "loss": 0.9252, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3576929271221161, "rewards/margins": 0.3328256607055664, "rewards/rejected": -0.6905185580253601, "step": 138 }, { "epoch": 0.3185333715267832, "grad_norm": 2.893420457839966, "learning_rate": 9.68348623853211e-07, "logits/chosen": -2.3568949699401855, "logits/rejected": -2.3327829837799072, "logps/chosen": -59.07436752319336, "logps/rejected": -66.80510711669922, "loss": 0.9255, "rewards/accuracies": 0.75, "rewards/chosen": -0.43387317657470703, "rewards/margins": 0.35720404982566833, "rewards/rejected": -0.7910772562026978, "step": 139 }, { "epoch": 0.32082497851618447, "grad_norm": 2.936436891555786, "learning_rate": 9.681192660550458e-07, "logits/chosen": -2.4139363765716553, "logits/rejected": -2.3869638442993164, "logps/chosen": -64.03744506835938, "logps/rejected": -69.01074981689453, "loss": 0.9434, "rewards/accuracies": 0.84375, "rewards/chosen": -0.456216037273407, "rewards/margins": 0.24695757031440735, "rewards/rejected": -0.7031736373901367, "step": 140 }, { "epoch": 0.3231165855055858, "grad_norm": 2.5924532413482666, "learning_rate": 9.678899082568806e-07, "logits/chosen": -2.3602375984191895, "logits/rejected": -2.3752758502960205, "logps/chosen": -59.08089828491211, "logps/rejected": -65.04644012451172, "loss": 0.9306, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4092082977294922, "rewards/margins": 0.3034776747226715, "rewards/rejected": -0.7126859426498413, "step": 141 }, { "epoch": 0.3254081924949871, "grad_norm": 2.6289005279541016, "learning_rate": 9.676605504587155e-07, "logits/chosen": -2.297403335571289, "logits/rejected": -2.340635299682617, "logps/chosen": -61.39676284790039, "logps/rejected": -65.7353286743164, "loss": 0.9308, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3769003450870514, "rewards/margins": 0.3092087209224701, "rewards/rejected": -0.6861090660095215, "step": 142 }, { "epoch": 0.32769979948438843, "grad_norm": 2.6742773056030273, "learning_rate": 9.674311926605505e-07, "logits/chosen": -2.353869915008545, "logits/rejected": -2.3500144481658936, "logps/chosen": -55.48981475830078, "logps/rejected": -63.39218521118164, "loss": 0.9343, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30264827609062195, "rewards/margins": 0.2939738929271698, "rewards/rejected": -0.5966222286224365, "step": 143 }, { "epoch": 0.3299914064737898, "grad_norm": 5.597414970397949, "learning_rate": 9.672018348623853e-07, "logits/chosen": -2.301363945007324, "logits/rejected": -2.318812131881714, "logps/chosen": -56.98631286621094, "logps/rejected": -67.50762939453125, "loss": 0.9139, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2931143045425415, "rewards/margins": 0.3711920380592346, "rewards/rejected": -0.6643063426017761, "step": 144 }, { "epoch": 0.33228301346319106, "grad_norm": 2.692392110824585, "learning_rate": 9.669724770642202e-07, "logits/chosen": -2.3799755573272705, "logits/rejected": -2.362351417541504, "logps/chosen": -60.30418014526367, "logps/rejected": -66.30126953125, "loss": 0.9245, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4348638653755188, "rewards/margins": 0.34771230816841125, "rewards/rejected": -0.7825762033462524, "step": 145 }, { "epoch": 0.3345746204525924, "grad_norm": 3.1196413040161133, "learning_rate": 9.66743119266055e-07, "logits/chosen": -2.3648629188537598, "logits/rejected": -2.3465304374694824, "logps/chosen": -57.7672233581543, "logps/rejected": -64.3330307006836, "loss": 0.9126, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2917521595954895, "rewards/margins": 0.3931223154067993, "rewards/rejected": -0.684874415397644, "step": 146 }, { "epoch": 0.3368662274419937, "grad_norm": 2.4261958599090576, "learning_rate": 9.665137614678898e-07, "logits/chosen": -2.393589735031128, "logits/rejected": -2.3812084197998047, "logps/chosen": -58.6768913269043, "logps/rejected": -65.685546875, "loss": 0.924, "rewards/accuracies": 0.875, "rewards/chosen": -0.3760756254196167, "rewards/margins": 0.35012534260749817, "rewards/rejected": -0.7262009382247925, "step": 147 }, { "epoch": 0.339157834431395, "grad_norm": 2.763627290725708, "learning_rate": 9.662844036697248e-07, "logits/chosen": -2.4049832820892334, "logits/rejected": -2.402682304382324, "logps/chosen": -59.08803939819336, "logps/rejected": -64.72093200683594, "loss": 0.9248, "rewards/accuracies": 0.84375, "rewards/chosen": -0.45434874296188354, "rewards/margins": 0.33508527278900146, "rewards/rejected": -0.789434015750885, "step": 148 }, { "epoch": 0.3414494414207963, "grad_norm": 2.736406087875366, "learning_rate": 9.660550458715597e-07, "logits/chosen": -2.3501508235931396, "logits/rejected": -2.345503807067871, "logps/chosen": -60.17226028442383, "logps/rejected": -70.50970458984375, "loss": 0.9351, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5172921419143677, "rewards/margins": 0.2955937385559082, "rewards/rejected": -0.8128858804702759, "step": 149 }, { "epoch": 0.34374104841019765, "grad_norm": 2.6858859062194824, "learning_rate": 9.658256880733945e-07, "logits/chosen": -2.4237096309661865, "logits/rejected": -2.3774831295013428, "logps/chosen": -56.5577392578125, "logps/rejected": -66.68565368652344, "loss": 0.9358, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3372434377670288, "rewards/margins": 0.29074451327323914, "rewards/rejected": -0.6279879212379456, "step": 150 }, { "epoch": 0.346032655399599, "grad_norm": 2.4924495220184326, "learning_rate": 9.655963302752293e-07, "logits/chosen": -2.325406312942505, "logits/rejected": -2.344850540161133, "logps/chosen": -57.68502426147461, "logps/rejected": -64.82011413574219, "loss": 0.9316, "rewards/accuracies": 0.84375, "rewards/chosen": -0.40265947580337524, "rewards/margins": 0.3245387673377991, "rewards/rejected": -0.7271982431411743, "step": 151 }, { "epoch": 0.34832426238900027, "grad_norm": 3.6187684535980225, "learning_rate": 9.653669724770641e-07, "logits/chosen": -2.3858883380889893, "logits/rejected": -2.401646852493286, "logps/chosen": -60.90057373046875, "logps/rejected": -72.72177124023438, "loss": 0.9305, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5747073888778687, "rewards/margins": 0.3326442539691925, "rewards/rejected": -0.9073516130447388, "step": 152 }, { "epoch": 0.3506158693784016, "grad_norm": 2.5169930458068848, "learning_rate": 9.651376146788992e-07, "logits/chosen": -2.3501014709472656, "logits/rejected": -2.32460880279541, "logps/chosen": -56.97672653198242, "logps/rejected": -66.26618957519531, "loss": 0.9225, "rewards/accuracies": 0.75, "rewards/chosen": -0.45351147651672363, "rewards/margins": 0.34666377305984497, "rewards/rejected": -0.8001752495765686, "step": 153 }, { "epoch": 0.3529074763678029, "grad_norm": 2.9333760738372803, "learning_rate": 9.649082568807338e-07, "logits/chosen": -2.3453824520111084, "logits/rejected": -2.3627405166625977, "logps/chosen": -59.19622039794922, "logps/rejected": -68.20867919921875, "loss": 0.8941, "rewards/accuracies": 0.875, "rewards/chosen": -0.28208619356155396, "rewards/margins": 0.46039292216300964, "rewards/rejected": -0.742479145526886, "step": 154 }, { "epoch": 0.35519908335720424, "grad_norm": 3.4071621894836426, "learning_rate": 9.646788990825686e-07, "logits/chosen": -2.3962619304656982, "logits/rejected": -2.36580228805542, "logps/chosen": -62.85544967651367, "logps/rejected": -68.5031509399414, "loss": 0.9172, "rewards/accuracies": 0.78125, "rewards/chosen": -0.45719650387763977, "rewards/margins": 0.3902134299278259, "rewards/rejected": -0.8474100828170776, "step": 155 }, { "epoch": 0.3574906903466056, "grad_norm": 2.7445192337036133, "learning_rate": 9.644495412844037e-07, "logits/chosen": -2.3469440937042236, "logits/rejected": -2.365939140319824, "logps/chosen": -56.69953155517578, "logps/rejected": -66.53624725341797, "loss": 0.8991, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29074937105178833, "rewards/margins": 0.4558735489845276, "rewards/rejected": -0.7466228604316711, "step": 156 }, { "epoch": 0.35978229733600686, "grad_norm": 3.624547243118286, "learning_rate": 9.642201834862385e-07, "logits/chosen": -2.3895161151885986, "logits/rejected": -2.3473572731018066, "logps/chosen": -57.66722106933594, "logps/rejected": -65.16983032226562, "loss": 0.9164, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2421422153711319, "rewards/margins": 0.36030012369155884, "rewards/rejected": -0.6024423837661743, "step": 157 }, { "epoch": 0.3620739043254082, "grad_norm": 2.9103429317474365, "learning_rate": 9.639908256880733e-07, "logits/chosen": -2.4216506481170654, "logits/rejected": -2.3779537677764893, "logps/chosen": -55.94016647338867, "logps/rejected": -66.4172592163086, "loss": 0.9175, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23236477375030518, "rewards/margins": 0.3624899685382843, "rewards/rejected": -0.5948547124862671, "step": 158 }, { "epoch": 0.3643655113148095, "grad_norm": 2.7289209365844727, "learning_rate": 9.637614678899081e-07, "logits/chosen": -2.334482192993164, "logits/rejected": -2.3165884017944336, "logps/chosen": -55.42976379394531, "logps/rejected": -67.19985961914062, "loss": 0.8922, "rewards/accuracies": 0.90625, "rewards/chosen": -0.14156436920166016, "rewards/margins": 0.46748021245002747, "rewards/rejected": -0.6090445518493652, "step": 159 }, { "epoch": 0.3666571183042108, "grad_norm": 3.1522819995880127, "learning_rate": 9.63532110091743e-07, "logits/chosen": -2.3969151973724365, "logits/rejected": -2.368656635284424, "logps/chosen": -56.80965042114258, "logps/rejected": -63.15319061279297, "loss": 0.9235, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2224026471376419, "rewards/margins": 0.33078598976135254, "rewards/rejected": -0.5531886219978333, "step": 160 }, { "epoch": 0.36894872529361217, "grad_norm": 2.757612943649292, "learning_rate": 9.63302752293578e-07, "logits/chosen": -2.376579761505127, "logits/rejected": -2.3502423763275146, "logps/chosen": -58.9051628112793, "logps/rejected": -65.75262451171875, "loss": 0.9473, "rewards/accuracies": 0.75, "rewards/chosen": -0.35930708050727844, "rewards/margins": 0.2638401985168457, "rewards/rejected": -0.6231473088264465, "step": 161 }, { "epoch": 0.37124033228301345, "grad_norm": 3.2465107440948486, "learning_rate": 9.630733944954128e-07, "logits/chosen": -2.334193229675293, "logits/rejected": -2.337095260620117, "logps/chosen": -56.509986877441406, "logps/rejected": -64.60763549804688, "loss": 0.9042, "rewards/accuracies": 0.90625, "rewards/chosen": -0.17785128951072693, "rewards/margins": 0.4254097044467926, "rewards/rejected": -0.6032609939575195, "step": 162 }, { "epoch": 0.3735319392724148, "grad_norm": 2.8342130184173584, "learning_rate": 9.628440366972477e-07, "logits/chosen": -2.3685083389282227, "logits/rejected": -2.287564992904663, "logps/chosen": -57.68168258666992, "logps/rejected": -69.21521759033203, "loss": 0.8915, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3147605359554291, "rewards/margins": 0.5015648603439331, "rewards/rejected": -0.8163253664970398, "step": 163 }, { "epoch": 0.3758235462618161, "grad_norm": 2.8167266845703125, "learning_rate": 9.626146788990825e-07, "logits/chosen": -2.3585760593414307, "logits/rejected": -2.33943772315979, "logps/chosen": -58.86089324951172, "logps/rejected": -68.49765014648438, "loss": 0.9156, "rewards/accuracies": 0.78125, "rewards/chosen": -0.33879873156547546, "rewards/margins": 0.3839074373245239, "rewards/rejected": -0.722706139087677, "step": 164 }, { "epoch": 0.3781151532512174, "grad_norm": 2.997729778289795, "learning_rate": 9.623853211009175e-07, "logits/chosen": -2.4253158569335938, "logits/rejected": -2.3740487098693848, "logps/chosen": -52.18218231201172, "logps/rejected": -63.324066162109375, "loss": 0.8983, "rewards/accuracies": 0.90625, "rewards/chosen": -0.13000650703907013, "rewards/margins": 0.453350305557251, "rewards/rejected": -0.5833567976951599, "step": 165 }, { "epoch": 0.38040676024061876, "grad_norm": 2.881781816482544, "learning_rate": 9.621559633027523e-07, "logits/chosen": -2.3807032108306885, "logits/rejected": -2.383831739425659, "logps/chosen": -60.660064697265625, "logps/rejected": -67.13481903076172, "loss": 0.9314, "rewards/accuracies": 0.875, "rewards/chosen": -0.3663901388645172, "rewards/margins": 0.31964942812919617, "rewards/rejected": -0.6860395669937134, "step": 166 }, { "epoch": 0.38269836723002004, "grad_norm": 2.8734471797943115, "learning_rate": 9.619266055045872e-07, "logits/chosen": -2.3752079010009766, "logits/rejected": -2.3418686389923096, "logps/chosen": -57.633480072021484, "logps/rejected": -65.43158721923828, "loss": 0.9115, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2913338541984558, "rewards/margins": 0.39275360107421875, "rewards/rejected": -0.6840873956680298, "step": 167 }, { "epoch": 0.3849899742194214, "grad_norm": 3.062058925628662, "learning_rate": 9.61697247706422e-07, "logits/chosen": -2.4193673133850098, "logits/rejected": -2.3335494995117188, "logps/chosen": -53.18678283691406, "logps/rejected": -61.352272033691406, "loss": 0.8896, "rewards/accuracies": 0.875, "rewards/chosen": -0.16135273873806, "rewards/margins": 0.48708751797676086, "rewards/rejected": -0.6484402418136597, "step": 168 }, { "epoch": 0.38728158120882267, "grad_norm": 2.9127702713012695, "learning_rate": 9.614678899082568e-07, "logits/chosen": -2.4355123043060303, "logits/rejected": -2.41461181640625, "logps/chosen": -60.54923629760742, "logps/rejected": -72.06566619873047, "loss": 0.8853, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3808773159980774, "rewards/margins": 0.5378756523132324, "rewards/rejected": -0.9187529683113098, "step": 169 }, { "epoch": 0.389573188198224, "grad_norm": 3.08127760887146, "learning_rate": 9.612385321100916e-07, "logits/chosen": -2.3852994441986084, "logits/rejected": -2.3289170265197754, "logps/chosen": -57.45722961425781, "logps/rejected": -70.59864807128906, "loss": 0.8916, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2821565866470337, "rewards/margins": 0.5006932616233826, "rewards/rejected": -0.782849907875061, "step": 170 }, { "epoch": 0.39186479518762535, "grad_norm": 3.3056344985961914, "learning_rate": 9.610091743119265e-07, "logits/chosen": -2.43369460105896, "logits/rejected": -2.406106472015381, "logps/chosen": -56.40676498413086, "logps/rejected": -68.90959167480469, "loss": 0.8488, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18312682211399078, "rewards/margins": 0.6596843600273132, "rewards/rejected": -0.8428112268447876, "step": 171 }, { "epoch": 0.39415640217702663, "grad_norm": 3.0166399478912354, "learning_rate": 9.607798165137613e-07, "logits/chosen": -2.3782973289489746, "logits/rejected": -2.3720765113830566, "logps/chosen": -56.66511154174805, "logps/rejected": -65.57038879394531, "loss": 0.8984, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2324577420949936, "rewards/margins": 0.46634021401405334, "rewards/rejected": -0.6987979412078857, "step": 172 }, { "epoch": 0.396448009166428, "grad_norm": 2.8755714893341064, "learning_rate": 9.605504587155963e-07, "logits/chosen": -2.4090347290039062, "logits/rejected": -2.403980016708374, "logps/chosen": -54.9586067199707, "logps/rejected": -63.881378173828125, "loss": 0.9198, "rewards/accuracies": 0.90625, "rewards/chosen": -0.22099775075912476, "rewards/margins": 0.3621407151222229, "rewards/rejected": -0.5831384658813477, "step": 173 }, { "epoch": 0.39873961615582926, "grad_norm": 3.004509925842285, "learning_rate": 9.603211009174312e-07, "logits/chosen": -2.377800941467285, "logits/rejected": -2.4310803413391113, "logps/chosen": -58.655757904052734, "logps/rejected": -66.72102355957031, "loss": 0.8776, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23728018999099731, "rewards/margins": 0.5616479516029358, "rewards/rejected": -0.7989281415939331, "step": 174 }, { "epoch": 0.4010312231452306, "grad_norm": 6.163204193115234, "learning_rate": 9.60091743119266e-07, "logits/chosen": -2.386035919189453, "logits/rejected": -2.401392698287964, "logps/chosen": -57.374080657958984, "logps/rejected": -67.16217803955078, "loss": 0.8939, "rewards/accuracies": 0.875, "rewards/chosen": -0.2481478899717331, "rewards/margins": 0.4936680793762207, "rewards/rejected": -0.7418159246444702, "step": 175 }, { "epoch": 0.40332283013463194, "grad_norm": 3.105839729309082, "learning_rate": 9.598623853211008e-07, "logits/chosen": -2.351590633392334, "logits/rejected": -2.357968807220459, "logps/chosen": -54.72328186035156, "logps/rejected": -67.47663116455078, "loss": 0.9032, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3684324622154236, "rewards/margins": 0.4394669234752655, "rewards/rejected": -0.8078994750976562, "step": 176 }, { "epoch": 0.40332283013463194, "eval_logits/chosen": -2.4645488262176514, "eval_logits/rejected": -2.4611871242523193, "eval_logps/chosen": -60.865779876708984, "eval_logps/rejected": -68.7463150024414, "eval_loss": 0.9169811606407166, "eval_rewards/accuracies": 0.7641509175300598, "eval_rewards/chosen": -0.3803427219390869, "eval_rewards/margins": 0.38934457302093506, "eval_rewards/rejected": -0.769687294960022, "eval_runtime": 957.2871, "eval_samples_per_second": 0.553, "eval_steps_per_second": 0.277, "step": 176 }, { "epoch": 0.4056144371240332, "grad_norm": 2.7472636699676514, "learning_rate": 9.596330275229356e-07, "logits/chosen": -2.411644458770752, "logits/rejected": -2.3707993030548096, "logps/chosen": -56.812583923339844, "logps/rejected": -65.00780487060547, "loss": 0.9238, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2779198884963989, "rewards/margins": 0.3684301972389221, "rewards/rejected": -0.646350085735321, "step": 177 }, { "epoch": 0.40790604411343456, "grad_norm": 13.00869369506836, "learning_rate": 9.594036697247707e-07, "logits/chosen": -2.4342522621154785, "logits/rejected": -2.3901753425598145, "logps/chosen": -54.193634033203125, "logps/rejected": -66.29923248291016, "loss": 0.8955, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2097783386707306, "rewards/margins": 0.4973820149898529, "rewards/rejected": -0.7071604132652283, "step": 178 }, { "epoch": 0.41019765110283585, "grad_norm": 3.1467926502227783, "learning_rate": 9.591743119266055e-07, "logits/chosen": -2.4123992919921875, "logits/rejected": -2.3830385208129883, "logps/chosen": -60.441551208496094, "logps/rejected": -70.1729507446289, "loss": 0.8937, "rewards/accuracies": 0.84375, "rewards/chosen": -0.34745457768440247, "rewards/margins": 0.49277985095977783, "rewards/rejected": -0.8402344584465027, "step": 179 }, { "epoch": 0.4124892580922372, "grad_norm": 3.0355377197265625, "learning_rate": 9.589449541284403e-07, "logits/chosen": -2.4315853118896484, "logits/rejected": -2.4202492237091064, "logps/chosen": -58.0799560546875, "logps/rejected": -69.51080322265625, "loss": 0.8674, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2250761091709137, "rewards/margins": 0.6066556572914124, "rewards/rejected": -0.8317316770553589, "step": 180 }, { "epoch": 0.4147808650816385, "grad_norm": 2.826247453689575, "learning_rate": 9.587155963302752e-07, "logits/chosen": -2.3385605812072754, "logits/rejected": -2.3473215103149414, "logps/chosen": -59.494834899902344, "logps/rejected": -69.8537368774414, "loss": 0.8993, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3696363568305969, "rewards/margins": 0.5148459076881409, "rewards/rejected": -0.8844822645187378, "step": 181 }, { "epoch": 0.4170724720710398, "grad_norm": 3.477809429168701, "learning_rate": 9.5848623853211e-07, "logits/chosen": -2.388392448425293, "logits/rejected": -2.344538450241089, "logps/chosen": -61.04086685180664, "logps/rejected": -71.88777923583984, "loss": 0.9025, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3753671944141388, "rewards/margins": 0.4511111080646515, "rewards/rejected": -0.8264783024787903, "step": 182 }, { "epoch": 0.41936407906044115, "grad_norm": 3.6538054943084717, "learning_rate": 9.58256880733945e-07, "logits/chosen": -2.465456485748291, "logits/rejected": -2.4436264038085938, "logps/chosen": -56.844642639160156, "logps/rejected": -64.38996124267578, "loss": 0.9052, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18693800270557404, "rewards/margins": 0.4241493046283722, "rewards/rejected": -0.6110873222351074, "step": 183 }, { "epoch": 0.42165568604984244, "grad_norm": 3.4791958332061768, "learning_rate": 9.580275229357798e-07, "logits/chosen": -2.3749191761016846, "logits/rejected": -2.3872463703155518, "logps/chosen": -58.33497619628906, "logps/rejected": -67.18158721923828, "loss": 0.8689, "rewards/accuracies": 1.0, "rewards/chosen": -0.21829326450824738, "rewards/margins": 0.6333922743797302, "rewards/rejected": -0.8516855239868164, "step": 184 }, { "epoch": 0.4239472930392438, "grad_norm": 3.198455572128296, "learning_rate": 9.577981651376147e-07, "logits/chosen": -2.400951385498047, "logits/rejected": -2.3514180183410645, "logps/chosen": -56.29878234863281, "logps/rejected": -69.96709442138672, "loss": 0.8704, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2753079831600189, "rewards/margins": 0.6277791261672974, "rewards/rejected": -0.9030871391296387, "step": 185 }, { "epoch": 0.42623890002864506, "grad_norm": 3.7085108757019043, "learning_rate": 9.575688073394495e-07, "logits/chosen": -2.481287956237793, "logits/rejected": -2.4262630939483643, "logps/chosen": -57.80772018432617, "logps/rejected": -64.74309539794922, "loss": 0.9052, "rewards/accuracies": 0.84375, "rewards/chosen": -0.22444754838943481, "rewards/margins": 0.4593219757080078, "rewards/rejected": -0.6837695240974426, "step": 186 }, { "epoch": 0.4285305070180464, "grad_norm": 3.164928674697876, "learning_rate": 9.573394495412843e-07, "logits/chosen": -2.42494797706604, "logits/rejected": -2.3851988315582275, "logps/chosen": -57.505672454833984, "logps/rejected": -64.87091827392578, "loss": 0.9203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1430569589138031, "rewards/margins": 0.3525164723396301, "rewards/rejected": -0.49557337164878845, "step": 187 }, { "epoch": 0.43082211400744774, "grad_norm": 3.2351372241973877, "learning_rate": 9.571100917431191e-07, "logits/chosen": -2.451615810394287, "logits/rejected": -2.471215009689331, "logps/chosen": -54.08177947998047, "logps/rejected": -63.930686950683594, "loss": 0.8844, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08061020076274872, "rewards/margins": 0.5383530259132385, "rewards/rejected": -0.6189632415771484, "step": 188 }, { "epoch": 0.433113720996849, "grad_norm": 3.2325923442840576, "learning_rate": 9.56880733944954e-07, "logits/chosen": -2.4296350479125977, "logits/rejected": -2.4152915477752686, "logps/chosen": -56.71369934082031, "logps/rejected": -65.10672760009766, "loss": 0.8746, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1600496768951416, "rewards/margins": 0.5461959838867188, "rewards/rejected": -0.7062456607818604, "step": 189 }, { "epoch": 0.43540532798625037, "grad_norm": 3.7529714107513428, "learning_rate": 9.56651376146789e-07, "logits/chosen": -2.4869205951690674, "logits/rejected": -2.466566562652588, "logps/chosen": -55.91749954223633, "logps/rejected": -63.36970901489258, "loss": 0.9126, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39225471019744873, "rewards/margins": 0.40042367577552795, "rewards/rejected": -0.7926784157752991, "step": 190 }, { "epoch": 0.43769693497565165, "grad_norm": 3.4099924564361572, "learning_rate": 9.564220183486238e-07, "logits/chosen": -2.4354517459869385, "logits/rejected": -2.406160831451416, "logps/chosen": -57.15814971923828, "logps/rejected": -67.46202087402344, "loss": 0.8734, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07059422880411148, "rewards/margins": 0.5992038249969482, "rewards/rejected": -0.6697980165481567, "step": 191 }, { "epoch": 0.439988541965053, "grad_norm": 3.3287672996520996, "learning_rate": 9.561926605504587e-07, "logits/chosen": -2.380380630493164, "logits/rejected": -2.391918659210205, "logps/chosen": -59.76410675048828, "logps/rejected": -70.63294982910156, "loss": 0.8559, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0981329157948494, "rewards/margins": 0.6754910945892334, "rewards/rejected": -0.7736240029335022, "step": 192 }, { "epoch": 0.44228014895445433, "grad_norm": 3.950752019882202, "learning_rate": 9.559633027522935e-07, "logits/chosen": -2.4251952171325684, "logits/rejected": -2.413726329803467, "logps/chosen": -58.05948257446289, "logps/rejected": -69.9758529663086, "loss": 0.8791, "rewards/accuracies": 0.875, "rewards/chosen": -0.2606387138366699, "rewards/margins": 0.6196160912513733, "rewards/rejected": -0.8802547454833984, "step": 193 }, { "epoch": 0.4445717559438556, "grad_norm": 3.5758440494537354, "learning_rate": 9.557339449541283e-07, "logits/chosen": -2.395589590072632, "logits/rejected": -2.3785271644592285, "logps/chosen": -60.62250518798828, "logps/rejected": -71.46736907958984, "loss": 0.88, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1681176722049713, "rewards/margins": 0.5596777200698853, "rewards/rejected": -0.7277954816818237, "step": 194 }, { "epoch": 0.44686336293325696, "grad_norm": 3.8050196170806885, "learning_rate": 9.555045871559633e-07, "logits/chosen": -2.4293086528778076, "logits/rejected": -2.4571878910064697, "logps/chosen": -61.05373001098633, "logps/rejected": -69.84813690185547, "loss": 0.8676, "rewards/accuracies": 0.875, "rewards/chosen": -0.09983000159263611, "rewards/margins": 0.6014201641082764, "rewards/rejected": -0.7012501955032349, "step": 195 }, { "epoch": 0.44915496992265824, "grad_norm": 3.5706732273101807, "learning_rate": 9.552752293577982e-07, "logits/chosen": -2.3675365447998047, "logits/rejected": -2.3588106632232666, "logps/chosen": -51.94842529296875, "logps/rejected": -65.9637680053711, "loss": 0.8233, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13071033358573914, "rewards/margins": 0.8347146511077881, "rewards/rejected": -0.7040042281150818, "step": 196 }, { "epoch": 0.4514465769120596, "grad_norm": 3.0327813625335693, "learning_rate": 9.55045871559633e-07, "logits/chosen": -2.431762218475342, "logits/rejected": -2.4073774814605713, "logps/chosen": -60.124759674072266, "logps/rejected": -66.83148956298828, "loss": 0.9054, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2393801212310791, "rewards/margins": 0.4406622052192688, "rewards/rejected": -0.6800423860549927, "step": 197 }, { "epoch": 0.4537381839014609, "grad_norm": 3.264798641204834, "learning_rate": 9.548165137614678e-07, "logits/chosen": -2.4065873622894287, "logits/rejected": -2.410881996154785, "logps/chosen": -55.302757263183594, "logps/rejected": -63.74501037597656, "loss": 0.8666, "rewards/accuracies": 0.90625, "rewards/chosen": 0.013195313513278961, "rewards/margins": 0.6081701517105103, "rewards/rejected": -0.5949748158454895, "step": 198 }, { "epoch": 0.4560297908908622, "grad_norm": 3.9610280990600586, "learning_rate": 9.545871559633026e-07, "logits/chosen": -2.454116106033325, "logits/rejected": -2.4142792224884033, "logps/chosen": -50.07085037231445, "logps/rejected": -57.58184051513672, "loss": 0.9021, "rewards/accuracies": 0.75, "rewards/chosen": 0.23348617553710938, "rewards/margins": 0.42640119791030884, "rewards/rejected": -0.19291502237319946, "step": 199 }, { "epoch": 0.45832139788026355, "grad_norm": 3.7920033931732178, "learning_rate": 9.543577981651377e-07, "logits/chosen": -2.4391391277313232, "logits/rejected": -2.4358556270599365, "logps/chosen": -56.35243225097656, "logps/rejected": -68.54472351074219, "loss": 0.8667, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010675342753529549, "rewards/margins": 0.60451740026474, "rewards/rejected": -0.5938420295715332, "step": 200 }, { "epoch": 0.46061300486966483, "grad_norm": 3.6598854064941406, "learning_rate": 9.541284403669725e-07, "logits/chosen": -2.398434638977051, "logits/rejected": -2.3763012886047363, "logps/chosen": -55.33749008178711, "logps/rejected": -64.80366516113281, "loss": 0.8768, "rewards/accuracies": 0.8125, "rewards/chosen": -0.026613561436533928, "rewards/margins": 0.5852590203285217, "rewards/rejected": -0.6118725538253784, "step": 201 }, { "epoch": 0.4629046118590662, "grad_norm": 3.6104085445404053, "learning_rate": 9.538990825688073e-07, "logits/chosen": -2.398533344268799, "logits/rejected": -2.4576497077941895, "logps/chosen": -58.63932800292969, "logps/rejected": -70.28650665283203, "loss": 0.849, "rewards/accuracies": 0.90625, "rewards/chosen": -0.11513733118772507, "rewards/margins": 0.7422495484352112, "rewards/rejected": -0.8573867678642273, "step": 202 }, { "epoch": 0.4651962188484675, "grad_norm": 4.092774391174316, "learning_rate": 9.536697247706422e-07, "logits/chosen": -2.4570159912109375, "logits/rejected": -2.4544947147369385, "logps/chosen": -50.08770751953125, "logps/rejected": -66.9119873046875, "loss": 0.7841, "rewards/accuracies": 0.96875, "rewards/chosen": 0.24991829693317413, "rewards/margins": 1.011635661125183, "rewards/rejected": -0.7617173790931702, "step": 203 }, { "epoch": 0.4674878258378688, "grad_norm": 4.088425636291504, "learning_rate": 9.53440366972477e-07, "logits/chosen": -2.4743456840515137, "logits/rejected": -2.4268786907196045, "logps/chosen": -55.01140213012695, "logps/rejected": -65.04462432861328, "loss": 0.8557, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05091837793588638, "rewards/margins": 0.6854045391082764, "rewards/rejected": -0.7363229393959045, "step": 204 }, { "epoch": 0.46977943282727014, "grad_norm": 3.6108877658843994, "learning_rate": 9.532110091743118e-07, "logits/chosen": -2.404358148574829, "logits/rejected": -2.407741069793701, "logps/chosen": -58.463836669921875, "logps/rejected": -72.46856689453125, "loss": 0.8306, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08719196915626526, "rewards/margins": 0.839323878288269, "rewards/rejected": -0.9265157580375671, "step": 205 }, { "epoch": 0.4720710398166714, "grad_norm": 3.2420287132263184, "learning_rate": 9.529816513761467e-07, "logits/chosen": -2.4705567359924316, "logits/rejected": -2.4900944232940674, "logps/chosen": -56.77485656738281, "logps/rejected": -65.9498291015625, "loss": 0.8624, "rewards/accuracies": 0.90625, "rewards/chosen": -0.170283704996109, "rewards/margins": 0.6322447061538696, "rewards/rejected": -0.8025283813476562, "step": 206 }, { "epoch": 0.47436264680607276, "grad_norm": 5.002233982086182, "learning_rate": 9.527522935779816e-07, "logits/chosen": -2.4967916011810303, "logits/rejected": -2.4627277851104736, "logps/chosen": -54.638553619384766, "logps/rejected": -68.61199951171875, "loss": 0.8022, "rewards/accuracies": 0.96875, "rewards/chosen": 0.06492935121059418, "rewards/margins": 0.9725061655044556, "rewards/rejected": -0.9075769186019897, "step": 207 }, { "epoch": 0.4766542537954741, "grad_norm": 3.639686107635498, "learning_rate": 9.525229357798165e-07, "logits/chosen": -2.455521583557129, "logits/rejected": -2.4587972164154053, "logps/chosen": -53.613441467285156, "logps/rejected": -63.83517074584961, "loss": 0.8553, "rewards/accuracies": 0.875, "rewards/chosen": 0.1397363245487213, "rewards/margins": 0.6882168054580688, "rewards/rejected": -0.5484804511070251, "step": 208 }, { "epoch": 0.4789458607848754, "grad_norm": 4.224642753601074, "learning_rate": 9.522935779816513e-07, "logits/chosen": -2.516932964324951, "logits/rejected": -2.505331039428711, "logps/chosen": -51.16203308105469, "logps/rejected": -64.42747497558594, "loss": 0.8086, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2924925684928894, "rewards/margins": 0.9225843548774719, "rewards/rejected": -0.6300918459892273, "step": 209 }, { "epoch": 0.48123746777427673, "grad_norm": 4.20077657699585, "learning_rate": 9.520642201834862e-07, "logits/chosen": -2.5012383460998535, "logits/rejected": -2.449305534362793, "logps/chosen": -53.01974105834961, "logps/rejected": -64.1409912109375, "loss": 0.8381, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1074962466955185, "rewards/margins": 0.7304190397262573, "rewards/rejected": -0.8379152417182922, "step": 210 }, { "epoch": 0.483529074763678, "grad_norm": 4.125888824462891, "learning_rate": 9.518348623853211e-07, "logits/chosen": -2.464386463165283, "logits/rejected": -2.462472915649414, "logps/chosen": -49.8414192199707, "logps/rejected": -65.26338195800781, "loss": 0.8279, "rewards/accuracies": 0.90625, "rewards/chosen": 0.19226977229118347, "rewards/margins": 0.8360263109207153, "rewards/rejected": -0.6437565684318542, "step": 211 }, { "epoch": 0.48582068175307935, "grad_norm": 4.200509071350098, "learning_rate": 9.516055045871559e-07, "logits/chosen": -2.4728283882141113, "logits/rejected": -2.4226038455963135, "logps/chosen": -54.959716796875, "logps/rejected": -65.17616271972656, "loss": 0.8682, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16011874377727509, "rewards/margins": 0.6439104080200195, "rewards/rejected": -0.8040291666984558, "step": 212 }, { "epoch": 0.48811228874248064, "grad_norm": 6.586763858795166, "learning_rate": 9.513761467889908e-07, "logits/chosen": -2.495201349258423, "logits/rejected": -2.4663448333740234, "logps/chosen": -55.02480697631836, "logps/rejected": -67.38420867919922, "loss": 0.8263, "rewards/accuracies": 0.78125, "rewards/chosen": -0.008366349153220654, "rewards/margins": 0.8043149709701538, "rewards/rejected": -0.8126813173294067, "step": 213 }, { "epoch": 0.490403895731882, "grad_norm": 3.656954526901245, "learning_rate": 9.511467889908257e-07, "logits/chosen": -2.442364454269409, "logits/rejected": -2.4403228759765625, "logps/chosen": -56.113807678222656, "logps/rejected": -66.005615234375, "loss": 0.8869, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2912103235721588, "rewards/margins": 0.5619010925292969, "rewards/rejected": -0.8531113266944885, "step": 214 }, { "epoch": 0.4926955027212833, "grad_norm": 4.0487961769104, "learning_rate": 9.509174311926605e-07, "logits/chosen": -2.5042405128479004, "logits/rejected": -2.4494335651397705, "logps/chosen": -62.38850402832031, "logps/rejected": -74.84709167480469, "loss": 0.8594, "rewards/accuracies": 0.875, "rewards/chosen": -0.28546643257141113, "rewards/margins": 0.7016189098358154, "rewards/rejected": -0.9870854020118713, "step": 215 }, { "epoch": 0.4949871097106846, "grad_norm": 3.5712223052978516, "learning_rate": 9.506880733944954e-07, "logits/chosen": -2.4708304405212402, "logits/rejected": -2.4595885276794434, "logps/chosen": -58.180763244628906, "logps/rejected": -71.5160140991211, "loss": 0.8553, "rewards/accuracies": 0.90625, "rewards/chosen": -0.22875815629959106, "rewards/margins": 0.7776964902877808, "rewards/rejected": -1.0064547061920166, "step": 216 }, { "epoch": 0.49727871670008594, "grad_norm": 4.662532329559326, "learning_rate": 9.504587155963303e-07, "logits/chosen": -2.4554460048675537, "logits/rejected": -2.4210517406463623, "logps/chosen": -55.954158782958984, "logps/rejected": -71.23664855957031, "loss": 0.8267, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2627810537815094, "rewards/margins": 0.9391190409660339, "rewards/rejected": -1.2019001245498657, "step": 217 }, { "epoch": 0.49957032368948723, "grad_norm": 4.14913272857666, "learning_rate": 9.502293577981651e-07, "logits/chosen": -2.455706834793091, "logits/rejected": -2.477999210357666, "logps/chosen": -53.24388885498047, "logps/rejected": -67.75436401367188, "loss": 0.7998, "rewards/accuracies": 0.9375, "rewards/chosen": 0.028823163360357285, "rewards/margins": 0.9959439039230347, "rewards/rejected": -0.9671207666397095, "step": 218 }, { "epoch": 0.5018619306788886, "grad_norm": 4.287445068359375, "learning_rate": 9.499999999999999e-07, "logits/chosen": -2.5265257358551025, "logits/rejected": -2.5306522846221924, "logps/chosen": -57.54804229736328, "logps/rejected": -67.61871337890625, "loss": 0.8527, "rewards/accuracies": 0.9375, "rewards/chosen": -0.27311035990715027, "rewards/margins": 0.738780677318573, "rewards/rejected": -1.0118910074234009, "step": 219 }, { "epoch": 0.5041535376682899, "grad_norm": 3.858490467071533, "learning_rate": 9.497706422018347e-07, "logits/chosen": -2.455972194671631, "logits/rejected": -2.4568004608154297, "logps/chosen": -54.73013687133789, "logps/rejected": -70.75605010986328, "loss": 0.7996, "rewards/accuracies": 0.875, "rewards/chosen": -0.12280157208442688, "rewards/margins": 1.0136010646820068, "rewards/rejected": -1.1364026069641113, "step": 220 }, { "epoch": 0.5064451446576912, "grad_norm": 4.535824775695801, "learning_rate": 9.495412844036697e-07, "logits/chosen": -2.4724440574645996, "logits/rejected": -2.4642062187194824, "logps/chosen": -55.98501205444336, "logps/rejected": -65.50963592529297, "loss": 0.831, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05462131276726723, "rewards/margins": 0.8382307291030884, "rewards/rejected": -0.8928520679473877, "step": 221 }, { "epoch": 0.5087367516470925, "grad_norm": 3.912445068359375, "learning_rate": 9.493119266055045e-07, "logits/chosen": -2.5042061805725098, "logits/rejected": -2.4946858882904053, "logps/chosen": -52.64169692993164, "logps/rejected": -66.14314270019531, "loss": 0.8183, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0847882628440857, "rewards/margins": 0.8993375897407532, "rewards/rejected": -0.9841258525848389, "step": 222 }, { "epoch": 0.5110283586364939, "grad_norm": 4.728421688079834, "learning_rate": 9.490825688073394e-07, "logits/chosen": -2.472116231918335, "logits/rejected": -2.4255800247192383, "logps/chosen": -53.73482131958008, "logps/rejected": -71.0102310180664, "loss": 0.8227, "rewards/accuracies": 0.875, "rewards/chosen": -0.06730340421199799, "rewards/margins": 0.9610850811004639, "rewards/rejected": -1.028388500213623, "step": 223 }, { "epoch": 0.5133199656258952, "grad_norm": 4.142309665679932, "learning_rate": 9.488532110091742e-07, "logits/chosen": -2.4800398349761963, "logits/rejected": -2.481369972229004, "logps/chosen": -59.615177154541016, "logps/rejected": -73.05657958984375, "loss": 0.8177, "rewards/accuracies": 0.90625, "rewards/chosen": -0.25287115573883057, "rewards/margins": 0.9416797757148743, "rewards/rejected": -1.1945509910583496, "step": 224 }, { "epoch": 0.5156115726152964, "grad_norm": 4.452775478363037, "learning_rate": 9.486238532110092e-07, "logits/chosen": -2.478522777557373, "logits/rejected": -2.482492208480835, "logps/chosen": -56.71668243408203, "logps/rejected": -68.0482406616211, "loss": 0.9009, "rewards/accuracies": 0.75, "rewards/chosen": -0.3258417844772339, "rewards/margins": 0.5518010854721069, "rewards/rejected": -0.8776429295539856, "step": 225 }, { "epoch": 0.5179031796046978, "grad_norm": 5.01326847076416, "learning_rate": 9.48394495412844e-07, "logits/chosen": -2.47304368019104, "logits/rejected": -2.461528778076172, "logps/chosen": -50.9107666015625, "logps/rejected": -66.55201721191406, "loss": 0.7771, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10622110962867737, "rewards/margins": 1.077745795249939, "rewards/rejected": -0.9715246558189392, "step": 226 }, { "epoch": 0.5201947865940991, "grad_norm": 4.262879371643066, "learning_rate": 9.481651376146788e-07, "logits/chosen": -2.4802160263061523, "logits/rejected": -2.442028760910034, "logps/chosen": -51.3328742980957, "logps/rejected": -65.1541976928711, "loss": 0.7951, "rewards/accuracies": 0.875, "rewards/chosen": 0.09634541720151901, "rewards/margins": 0.9572333693504333, "rewards/rejected": -0.860887885093689, "step": 227 }, { "epoch": 0.5224863935835005, "grad_norm": 3.930107831954956, "learning_rate": 9.479357798165138e-07, "logits/chosen": -2.541877031326294, "logits/rejected": -2.5623369216918945, "logps/chosen": -57.34138107299805, "logps/rejected": -71.46298217773438, "loss": 0.8245, "rewards/accuracies": 0.84375, "rewards/chosen": -0.19679507613182068, "rewards/margins": 0.9233821034431458, "rewards/rejected": -1.120177149772644, "step": 228 }, { "epoch": 0.5247780005729017, "grad_norm": 4.453365325927734, "learning_rate": 9.477064220183486e-07, "logits/chosen": -2.4438538551330566, "logits/rejected": -2.4312431812286377, "logps/chosen": -60.362422943115234, "logps/rejected": -72.31782531738281, "loss": 0.8501, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23742519319057465, "rewards/margins": 0.8202124834060669, "rewards/rejected": -1.0576375722885132, "step": 229 }, { "epoch": 0.527069607562303, "grad_norm": 4.10828161239624, "learning_rate": 9.474770642201835e-07, "logits/chosen": -2.5478363037109375, "logits/rejected": -2.507852077484131, "logps/chosen": -58.882408142089844, "logps/rejected": -72.45860290527344, "loss": 0.8229, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1297636181116104, "rewards/margins": 0.8763121366500854, "rewards/rejected": -1.0060756206512451, "step": 230 }, { "epoch": 0.5293612145517044, "grad_norm": 4.122023582458496, "learning_rate": 9.472477064220183e-07, "logits/chosen": -2.563418388366699, "logits/rejected": -2.546837329864502, "logps/chosen": -50.53895568847656, "logps/rejected": -63.7515983581543, "loss": 0.8348, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04373221844434738, "rewards/margins": 0.8095434308052063, "rewards/rejected": -0.7658112049102783, "step": 231 }, { "epoch": 0.5316528215411057, "grad_norm": 4.374768257141113, "learning_rate": 9.470183486238532e-07, "logits/chosen": -2.507256507873535, "logits/rejected": -2.4951682090759277, "logps/chosen": -59.873512268066406, "logps/rejected": -75.61546325683594, "loss": 0.7963, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2501155734062195, "rewards/margins": 1.0022144317626953, "rewards/rejected": -1.25232994556427, "step": 232 }, { "epoch": 0.533944428530507, "grad_norm": 4.810795783996582, "learning_rate": 9.467889908256881e-07, "logits/chosen": -2.5458192825317383, "logits/rejected": -2.551541566848755, "logps/chosen": -55.907657623291016, "logps/rejected": -65.49179077148438, "loss": 0.8486, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04389909282326698, "rewards/margins": 0.7389214634895325, "rewards/rejected": -0.7828205823898315, "step": 233 }, { "epoch": 0.5362360355199083, "grad_norm": 4.799188613891602, "learning_rate": 9.465596330275228e-07, "logits/chosen": -2.482546329498291, "logits/rejected": -2.53066086769104, "logps/chosen": -52.63505935668945, "logps/rejected": -69.68272399902344, "loss": 0.8079, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12260840833187103, "rewards/margins": 0.9557756185531616, "rewards/rejected": -0.8331671953201294, "step": 234 }, { "epoch": 0.5385276425093096, "grad_norm": 4.834675312042236, "learning_rate": 9.463302752293578e-07, "logits/chosen": -2.459481716156006, "logits/rejected": -2.4439003467559814, "logps/chosen": -53.92341613769531, "logps/rejected": -70.33899688720703, "loss": 0.8088, "rewards/accuracies": 0.875, "rewards/chosen": 0.08241058886051178, "rewards/margins": 0.9592021107673645, "rewards/rejected": -0.8767914772033691, "step": 235 }, { "epoch": 0.540819249498711, "grad_norm": 4.75538969039917, "learning_rate": 9.461009174311926e-07, "logits/chosen": -2.506141185760498, "logits/rejected": -2.470888614654541, "logps/chosen": -50.75487518310547, "logps/rejected": -69.25782012939453, "loss": 0.7569, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3269006013870239, "rewards/margins": 1.258165955543518, "rewards/rejected": -0.9312653541564941, "step": 236 }, { "epoch": 0.5431108564881123, "grad_norm": 5.99112606048584, "learning_rate": 9.458715596330274e-07, "logits/chosen": -2.482410192489624, "logits/rejected": -2.4875433444976807, "logps/chosen": -49.16838073730469, "logps/rejected": -64.8904800415039, "loss": 0.803, "rewards/accuracies": 0.875, "rewards/chosen": 0.37879446148872375, "rewards/margins": 0.9806692004203796, "rewards/rejected": -0.6018746495246887, "step": 237 }, { "epoch": 0.5454024634775136, "grad_norm": 4.897491455078125, "learning_rate": 9.456422018348623e-07, "logits/chosen": -2.5410852432250977, "logits/rejected": -2.520782947540283, "logps/chosen": -52.49319076538086, "logps/rejected": -67.89915466308594, "loss": 0.7741, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2729357182979584, "rewards/margins": 1.117650032043457, "rewards/rejected": -0.8447144627571106, "step": 238 }, { "epoch": 0.5476940704669149, "grad_norm": 4.781443119049072, "learning_rate": 9.454128440366972e-07, "logits/chosen": -2.516477584838867, "logits/rejected": -2.503204822540283, "logps/chosen": -50.0324592590332, "logps/rejected": -63.01984405517578, "loss": 0.8116, "rewards/accuracies": 0.875, "rewards/chosen": 0.3201492428779602, "rewards/margins": 0.9173401594161987, "rewards/rejected": -0.5971909761428833, "step": 239 }, { "epoch": 0.5499856774563162, "grad_norm": 3.9313557147979736, "learning_rate": 9.451834862385321e-07, "logits/chosen": -2.5582776069641113, "logits/rejected": -2.5420632362365723, "logps/chosen": -54.69719696044922, "logps/rejected": -66.83018493652344, "loss": 0.8421, "rewards/accuracies": 0.71875, "rewards/chosen": 0.009750787168741226, "rewards/margins": 0.7871648669242859, "rewards/rejected": -0.7774141430854797, "step": 240 }, { "epoch": 0.5522772844457176, "grad_norm": 4.5426506996154785, "learning_rate": 9.449541284403669e-07, "logits/chosen": -2.5041444301605225, "logits/rejected": -2.484734058380127, "logps/chosen": -52.8242301940918, "logps/rejected": -65.35581970214844, "loss": 0.8276, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3517111539840698, "rewards/margins": 0.8296982049942017, "rewards/rejected": -0.47798705101013184, "step": 241 }, { "epoch": 0.5545688914351189, "grad_norm": 4.1401591300964355, "learning_rate": 9.447247706422018e-07, "logits/chosen": -2.5341720581054688, "logits/rejected": -2.5354256629943848, "logps/chosen": -52.82135772705078, "logps/rejected": -67.38308715820312, "loss": 0.7927, "rewards/accuracies": 0.96875, "rewards/chosen": 0.29671207070350647, "rewards/margins": 1.07285737991333, "rewards/rejected": -0.776145339012146, "step": 242 }, { "epoch": 0.5568604984245202, "grad_norm": 4.4837965965271, "learning_rate": 9.444954128440367e-07, "logits/chosen": -2.5805583000183105, "logits/rejected": -2.578395366668701, "logps/chosen": -56.904205322265625, "logps/rejected": -69.67080688476562, "loss": 0.8286, "rewards/accuracies": 0.875, "rewards/chosen": -0.059186339378356934, "rewards/margins": 0.865915060043335, "rewards/rejected": -0.9251014590263367, "step": 243 }, { "epoch": 0.5591521054139215, "grad_norm": 6.60350227355957, "learning_rate": 9.442660550458715e-07, "logits/chosen": -2.477644920349121, "logits/rejected": -2.512723684310913, "logps/chosen": -52.90184783935547, "logps/rejected": -73.2394027709961, "loss": 0.7313, "rewards/accuracies": 0.90625, "rewards/chosen": 0.20968982577323914, "rewards/margins": 1.45701265335083, "rewards/rejected": -1.2473227977752686, "step": 244 }, { "epoch": 0.5614437124033228, "grad_norm": 4.233705997467041, "learning_rate": 9.440366972477064e-07, "logits/chosen": -2.530106782913208, "logits/rejected": -2.505053758621216, "logps/chosen": -51.763465881347656, "logps/rejected": -69.07044219970703, "loss": 0.8009, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17525047063827515, "rewards/margins": 1.0599465370178223, "rewards/rejected": -0.8846961259841919, "step": 245 }, { "epoch": 0.5637353193927241, "grad_norm": 4.426912307739258, "learning_rate": 9.438073394495413e-07, "logits/chosen": -2.479046583175659, "logits/rejected": -2.4794342517852783, "logps/chosen": -48.54230499267578, "logps/rejected": -67.55069732666016, "loss": 0.7903, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2930450439453125, "rewards/margins": 1.1042670011520386, "rewards/rejected": -0.8112220764160156, "step": 246 }, { "epoch": 0.5660269263821255, "grad_norm": 6.389403820037842, "learning_rate": 9.435779816513762e-07, "logits/chosen": -2.4416518211364746, "logits/rejected": -2.470052480697632, "logps/chosen": -53.09716033935547, "logps/rejected": -72.37268829345703, "loss": 0.7514, "rewards/accuracies": 0.9375, "rewards/chosen": 0.36954832077026367, "rewards/margins": 1.2807557582855225, "rewards/rejected": -0.9112074375152588, "step": 247 }, { "epoch": 0.5683185333715268, "grad_norm": 4.860208511352539, "learning_rate": 9.43348623853211e-07, "logits/chosen": -2.586580276489258, "logits/rejected": -2.581444263458252, "logps/chosen": -52.776851654052734, "logps/rejected": -69.40792083740234, "loss": 0.7482, "rewards/accuracies": 1.0, "rewards/chosen": 0.32207614183425903, "rewards/margins": 1.2758305072784424, "rewards/rejected": -0.9537543654441833, "step": 248 }, { "epoch": 0.570610140360928, "grad_norm": 4.360997676849365, "learning_rate": 9.431192660550458e-07, "logits/chosen": -2.506876230239868, "logits/rejected": -2.437983512878418, "logps/chosen": -56.644107818603516, "logps/rejected": -73.10363006591797, "loss": 0.8256, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08643624186515808, "rewards/margins": 0.9440158605575562, "rewards/rejected": -1.0304521322250366, "step": 249 }, { "epoch": 0.5729017473503294, "grad_norm": 4.886284351348877, "learning_rate": 9.428899082568807e-07, "logits/chosen": -2.563849449157715, "logits/rejected": -2.566481113433838, "logps/chosen": -51.05428695678711, "logps/rejected": -65.70259094238281, "loss": 0.8491, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2568379342556, "rewards/margins": 0.7650701999664307, "rewards/rejected": -0.5082322359085083, "step": 250 }, { "epoch": 0.5751933543397307, "grad_norm": 4.670022487640381, "learning_rate": 9.426605504587155e-07, "logits/chosen": -2.5721395015716553, "logits/rejected": -2.5770516395568848, "logps/chosen": -51.468719482421875, "logps/rejected": -62.81729507446289, "loss": 0.8647, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34786269068717957, "rewards/margins": 0.6560289859771729, "rewards/rejected": -0.30816638469696045, "step": 251 }, { "epoch": 0.5774849613291321, "grad_norm": 4.80432653427124, "learning_rate": 9.424311926605504e-07, "logits/chosen": -2.5859780311584473, "logits/rejected": -2.523981809616089, "logps/chosen": -53.522586822509766, "logps/rejected": -64.51270294189453, "loss": 0.7952, "rewards/accuracies": 0.78125, "rewards/chosen": 0.16835278272628784, "rewards/margins": 1.0091253519058228, "rewards/rejected": -0.8407726883888245, "step": 252 }, { "epoch": 0.5797765683185334, "grad_norm": 4.301229000091553, "learning_rate": 9.422018348623852e-07, "logits/chosen": -2.5787744522094727, "logits/rejected": -2.565483331680298, "logps/chosen": -52.606849670410156, "logps/rejected": -65.7780532836914, "loss": 0.8202, "rewards/accuracies": 0.84375, "rewards/chosen": 0.19995513558387756, "rewards/margins": 0.9372264742851257, "rewards/rejected": -0.7372714281082153, "step": 253 }, { "epoch": 0.5820681753079346, "grad_norm": 4.984973430633545, "learning_rate": 9.419724770642201e-07, "logits/chosen": -2.57393741607666, "logits/rejected": -2.559917449951172, "logps/chosen": -56.1127815246582, "logps/rejected": -73.21530151367188, "loss": 0.7917, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18606415390968323, "rewards/margins": 1.2665667533874512, "rewards/rejected": -1.4526309967041016, "step": 254 }, { "epoch": 0.584359782297336, "grad_norm": 4.987429141998291, "learning_rate": 9.41743119266055e-07, "logits/chosen": -2.5621285438537598, "logits/rejected": -2.569967746734619, "logps/chosen": -50.81196975708008, "logps/rejected": -64.10604095458984, "loss": 0.8138, "rewards/accuracies": 0.84375, "rewards/chosen": 0.45022791624069214, "rewards/margins": 0.9736542701721191, "rewards/rejected": -0.5234264135360718, "step": 255 }, { "epoch": 0.5866513892867373, "grad_norm": 4.939540386199951, "learning_rate": 9.415137614678898e-07, "logits/chosen": -2.468020439147949, "logits/rejected": -2.509620189666748, "logps/chosen": -56.00531005859375, "logps/rejected": -70.92655944824219, "loss": 0.8146, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09673285484313965, "rewards/margins": 0.976108193397522, "rewards/rejected": -0.8793753981590271, "step": 256 }, { "epoch": 0.5889429962761387, "grad_norm": 5.509927749633789, "learning_rate": 9.412844036697248e-07, "logits/chosen": -2.5535364151000977, "logits/rejected": -2.545215368270874, "logps/chosen": -60.93865966796875, "logps/rejected": -77.36051940917969, "loss": 0.8047, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45049628615379333, "rewards/margins": 1.0874661207199097, "rewards/rejected": -1.5379623174667358, "step": 257 }, { "epoch": 0.59123460326554, "grad_norm": 4.569234371185303, "learning_rate": 9.410550458715596e-07, "logits/chosen": -2.607616424560547, "logits/rejected": -2.567002296447754, "logps/chosen": -55.241127014160156, "logps/rejected": -72.70723724365234, "loss": 0.7876, "rewards/accuracies": 0.875, "rewards/chosen": 0.111435666680336, "rewards/margins": 1.1915665864944458, "rewards/rejected": -1.0801310539245605, "step": 258 }, { "epoch": 0.5935262102549412, "grad_norm": 5.940906047821045, "learning_rate": 9.408256880733944e-07, "logits/chosen": -2.5899758338928223, "logits/rejected": -2.5807394981384277, "logps/chosen": -58.882808685302734, "logps/rejected": -77.61549377441406, "loss": 0.7847, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0741177424788475, "rewards/margins": 1.2599637508392334, "rewards/rejected": -1.3340815305709839, "step": 259 }, { "epoch": 0.5958178172443426, "grad_norm": 5.48697566986084, "learning_rate": 9.405963302752293e-07, "logits/chosen": -2.5721161365509033, "logits/rejected": -2.571422815322876, "logps/chosen": -61.42055892944336, "logps/rejected": -74.38735961914062, "loss": 0.8371, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2978840470314026, "rewards/margins": 0.8594042658805847, "rewards/rejected": -1.1572883129119873, "step": 260 }, { "epoch": 0.5981094242337439, "grad_norm": 5.689541339874268, "learning_rate": 9.403669724770642e-07, "logits/chosen": -2.583285093307495, "logits/rejected": -2.5634336471557617, "logps/chosen": -56.00029754638672, "logps/rejected": -72.32232666015625, "loss": 0.7859, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12951195240020752, "rewards/margins": 1.1061701774597168, "rewards/rejected": -0.9766581058502197, "step": 261 }, { "epoch": 0.6004010312231453, "grad_norm": 4.663910388946533, "learning_rate": 9.401376146788991e-07, "logits/chosen": -2.6218361854553223, "logits/rejected": -2.594586133956909, "logps/chosen": -54.930076599121094, "logps/rejected": -71.74039459228516, "loss": 0.7759, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1461702585220337, "rewards/margins": 1.2083263397216797, "rewards/rejected": -1.062156081199646, "step": 262 }, { "epoch": 0.6026926382125466, "grad_norm": 5.314773082733154, "learning_rate": 9.399082568807339e-07, "logits/chosen": -2.6391241550445557, "logits/rejected": -2.588806629180908, "logps/chosen": -48.3875617980957, "logps/rejected": -67.3711166381836, "loss": 0.7534, "rewards/accuracies": 0.8125, "rewards/chosen": 0.31267040967941284, "rewards/margins": 1.2229702472686768, "rewards/rejected": -0.9102997779846191, "step": 263 }, { "epoch": 0.6049842452019478, "grad_norm": 6.44840669631958, "learning_rate": 9.396788990825689e-07, "logits/chosen": -2.5542197227478027, "logits/rejected": -2.524921417236328, "logps/chosen": -53.910831451416016, "logps/rejected": -69.93946838378906, "loss": 0.7775, "rewards/accuracies": 0.875, "rewards/chosen": 0.28614890575408936, "rewards/margins": 1.1548264026641846, "rewards/rejected": -0.8686776161193848, "step": 264 }, { "epoch": 0.6049842452019478, "eval_logits/chosen": -2.656860113143921, "eval_logits/rejected": -2.6592459678649902, "eval_logps/chosen": -57.99863815307617, "eval_logps/rejected": -70.9998779296875, "eval_loss": 0.8356152772903442, "eval_rewards/accuracies": 0.798113226890564, "eval_rewards/chosen": -0.09362836182117462, "eval_rewards/margins": 0.9014152884483337, "eval_rewards/rejected": -0.9950434565544128, "eval_runtime": 958.4917, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.276, "step": 264 }, { "epoch": 0.6072758521913492, "grad_norm": 5.809870719909668, "learning_rate": 9.394495412844037e-07, "logits/chosen": -2.6072936058044434, "logits/rejected": -2.597188711166382, "logps/chosen": -49.62434387207031, "logps/rejected": -67.47427368164062, "loss": 0.7981, "rewards/accuracies": 0.875, "rewards/chosen": 0.26291701197624207, "rewards/margins": 1.1313602924346924, "rewards/rejected": -0.8684433698654175, "step": 265 }, { "epoch": 0.6095674591807505, "grad_norm": 5.136531829833984, "learning_rate": 9.392201834862384e-07, "logits/chosen": -2.670398235321045, "logits/rejected": -2.6142539978027344, "logps/chosen": -46.74652099609375, "logps/rejected": -64.24138641357422, "loss": 0.7694, "rewards/accuracies": 0.90625, "rewards/chosen": 0.659913182258606, "rewards/margins": 1.2456408739089966, "rewards/rejected": -0.5857276916503906, "step": 266 }, { "epoch": 0.6118590661701518, "grad_norm": 5.652091979980469, "learning_rate": 9.389908256880733e-07, "logits/chosen": -2.611121892929077, "logits/rejected": -2.5876107215881348, "logps/chosen": -50.81434631347656, "logps/rejected": -69.69065856933594, "loss": 0.7981, "rewards/accuracies": 0.875, "rewards/chosen": 0.3608332872390747, "rewards/margins": 1.0940465927124023, "rewards/rejected": -0.7332133054733276, "step": 267 }, { "epoch": 0.6141506731595532, "grad_norm": 4.819527626037598, "learning_rate": 9.387614678899082e-07, "logits/chosen": -2.615774154663086, "logits/rejected": -2.6027591228485107, "logps/chosen": -51.60170364379883, "logps/rejected": -67.3653564453125, "loss": 0.8016, "rewards/accuracies": 0.875, "rewards/chosen": 0.201298788189888, "rewards/margins": 1.0478854179382324, "rewards/rejected": -0.8465868234634399, "step": 268 }, { "epoch": 0.6164422801489544, "grad_norm": 5.613049507141113, "learning_rate": 9.385321100917431e-07, "logits/chosen": -2.50502347946167, "logits/rejected": -2.510028600692749, "logps/chosen": -52.402549743652344, "logps/rejected": -74.4714584350586, "loss": 0.7327, "rewards/accuracies": 0.875, "rewards/chosen": 0.33794018626213074, "rewards/margins": 1.467504858970642, "rewards/rejected": -1.1295647621154785, "step": 269 }, { "epoch": 0.6187338871383558, "grad_norm": 6.716032028198242, "learning_rate": 9.383027522935779e-07, "logits/chosen": -2.6396477222442627, "logits/rejected": -2.595288038253784, "logps/chosen": -50.10356521606445, "logps/rejected": -73.49261474609375, "loss": 0.6933, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40660417079925537, "rewards/margins": 1.672985553741455, "rewards/rejected": -1.2663815021514893, "step": 270 }, { "epoch": 0.6210254941277571, "grad_norm": 5.059470176696777, "learning_rate": 9.380733944954127e-07, "logits/chosen": -2.578867197036743, "logits/rejected": -2.561290740966797, "logps/chosen": -54.5650634765625, "logps/rejected": -73.50376892089844, "loss": 0.7374, "rewards/accuracies": 0.875, "rewards/chosen": 0.1863970160484314, "rewards/margins": 1.391034483909607, "rewards/rejected": -1.2046375274658203, "step": 271 }, { "epoch": 0.6233171011171584, "grad_norm": 6.020323753356934, "learning_rate": 9.378440366972477e-07, "logits/chosen": -2.640394926071167, "logits/rejected": -2.6124484539031982, "logps/chosen": -59.13202667236328, "logps/rejected": -77.35587310791016, "loss": 0.7852, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1838321089744568, "rewards/margins": 1.2872577905654907, "rewards/rejected": -1.4710900783538818, "step": 272 }, { "epoch": 0.6256087081065598, "grad_norm": 6.345904350280762, "learning_rate": 9.376146788990825e-07, "logits/chosen": -2.554239273071289, "logits/rejected": -2.5158333778381348, "logps/chosen": -54.18724060058594, "logps/rejected": -75.58413696289062, "loss": 0.7323, "rewards/accuracies": 0.90625, "rewards/chosen": 0.17531529068946838, "rewards/margins": 1.5790802240371704, "rewards/rejected": -1.4037647247314453, "step": 273 }, { "epoch": 0.627900315095961, "grad_norm": 5.552344799041748, "learning_rate": 9.373853211009174e-07, "logits/chosen": -2.706061363220215, "logits/rejected": -2.6642019748687744, "logps/chosen": -47.51987838745117, "logps/rejected": -61.22273635864258, "loss": 0.8049, "rewards/accuracies": 0.90625, "rewards/chosen": 0.46485090255737305, "rewards/margins": 1.031338095664978, "rewards/rejected": -0.5664872527122498, "step": 274 }, { "epoch": 0.6301919220853623, "grad_norm": 7.282151222229004, "learning_rate": 9.371559633027523e-07, "logits/chosen": -2.6025028228759766, "logits/rejected": -2.5865325927734375, "logps/chosen": -42.28664779663086, "logps/rejected": -58.16633987426758, "loss": 0.7863, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8746553063392639, "rewards/margins": 1.1522959470748901, "rewards/rejected": -0.277640700340271, "step": 275 }, { "epoch": 0.6324835290747637, "grad_norm": 7.1024298667907715, "learning_rate": 9.369266055045871e-07, "logits/chosen": -2.6530532836914062, "logits/rejected": -2.5849080085754395, "logps/chosen": -51.22486114501953, "logps/rejected": -66.8668212890625, "loss": 0.7746, "rewards/accuracies": 0.875, "rewards/chosen": 0.3620362877845764, "rewards/margins": 1.1012239456176758, "rewards/rejected": -0.7391877174377441, "step": 276 }, { "epoch": 0.634775136064165, "grad_norm": 6.691248893737793, "learning_rate": 9.36697247706422e-07, "logits/chosen": -2.676159381866455, "logits/rejected": -2.659632682800293, "logps/chosen": -41.73007583618164, "logps/rejected": -62.09906005859375, "loss": 0.7365, "rewards/accuracies": 0.875, "rewards/chosen": 0.9520897269248962, "rewards/margins": 1.3979865312576294, "rewards/rejected": -0.4458969533443451, "step": 277 }, { "epoch": 0.6370667430535664, "grad_norm": 6.227463722229004, "learning_rate": 9.364678899082568e-07, "logits/chosen": -2.6297316551208496, "logits/rejected": -2.596214532852173, "logps/chosen": -49.57692337036133, "logps/rejected": -67.51412963867188, "loss": 0.763, "rewards/accuracies": 0.875, "rewards/chosen": 0.2052653729915619, "rewards/margins": 1.213535189628601, "rewards/rejected": -1.0082696676254272, "step": 278 }, { "epoch": 0.6393583500429676, "grad_norm": 5.2158122062683105, "learning_rate": 9.362385321100918e-07, "logits/chosen": -2.6696290969848633, "logits/rejected": -2.654937744140625, "logps/chosen": -52.50606918334961, "logps/rejected": -69.38017272949219, "loss": 0.7731, "rewards/accuracies": 0.84375, "rewards/chosen": 0.30537864565849304, "rewards/margins": 1.2560110092163086, "rewards/rejected": -0.9506323337554932, "step": 279 }, { "epoch": 0.6416499570323689, "grad_norm": 6.939571380615234, "learning_rate": 9.360091743119266e-07, "logits/chosen": -2.578003406524658, "logits/rejected": -2.6032016277313232, "logps/chosen": -51.49696731567383, "logps/rejected": -70.39667510986328, "loss": 0.7648, "rewards/accuracies": 0.90625, "rewards/chosen": 0.19547368586063385, "rewards/margins": 1.2970017194747925, "rewards/rejected": -1.1015281677246094, "step": 280 }, { "epoch": 0.6439415640217703, "grad_norm": 4.928709506988525, "learning_rate": 9.357798165137614e-07, "logits/chosen": -2.53792142868042, "logits/rejected": -2.585638999938965, "logps/chosen": -55.302825927734375, "logps/rejected": -71.33003997802734, "loss": 0.7698, "rewards/accuracies": 0.875, "rewards/chosen": 0.0804656594991684, "rewards/margins": 1.195522665977478, "rewards/rejected": -1.1150569915771484, "step": 281 }, { "epoch": 0.6462331710111716, "grad_norm": 4.868886947631836, "learning_rate": 9.355504587155963e-07, "logits/chosen": -2.6425163745880127, "logits/rejected": -2.601503372192383, "logps/chosen": -52.085044860839844, "logps/rejected": -65.79268646240234, "loss": 0.7995, "rewards/accuracies": 0.78125, "rewards/chosen": -0.00041419267654418945, "rewards/margins": 1.0482542514801025, "rewards/rejected": -1.048668384552002, "step": 282 }, { "epoch": 0.648524778000573, "grad_norm": 5.751716136932373, "learning_rate": 9.353211009174311e-07, "logits/chosen": -2.596512794494629, "logits/rejected": -2.5802786350250244, "logps/chosen": -53.748023986816406, "logps/rejected": -66.87416076660156, "loss": 0.8272, "rewards/accuracies": 0.75, "rewards/chosen": 0.03924783319234848, "rewards/margins": 1.0199480056762695, "rewards/rejected": -0.9807001352310181, "step": 283 }, { "epoch": 0.6508163849899742, "grad_norm": 7.477166652679443, "learning_rate": 9.35091743119266e-07, "logits/chosen": -2.6995344161987305, "logits/rejected": -2.7104532718658447, "logps/chosen": -57.158531188964844, "logps/rejected": -74.41187286376953, "loss": 0.7675, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11782519519329071, "rewards/margins": 1.3638321161270142, "rewards/rejected": -1.4816572666168213, "step": 284 }, { "epoch": 0.6531079919793755, "grad_norm": 5.952273845672607, "learning_rate": 9.348623853211008e-07, "logits/chosen": -2.5883615016937256, "logits/rejected": -2.577089786529541, "logps/chosen": -45.05295181274414, "logps/rejected": -63.59604263305664, "loss": 0.8046, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5733873248100281, "rewards/margins": 1.2045843601226807, "rewards/rejected": -0.6311971545219421, "step": 285 }, { "epoch": 0.6553995989687769, "grad_norm": 6.733729839324951, "learning_rate": 9.346330275229357e-07, "logits/chosen": -2.6017258167266846, "logits/rejected": -2.6379878520965576, "logps/chosen": -56.74882125854492, "logps/rejected": -75.54713439941406, "loss": 0.707, "rewards/accuracies": 0.9375, "rewards/chosen": 0.004292171448469162, "rewards/margins": 1.540117859840393, "rewards/rejected": -1.5358257293701172, "step": 286 }, { "epoch": 0.6576912059581782, "grad_norm": 5.41800594329834, "learning_rate": 9.344036697247706e-07, "logits/chosen": -2.671621799468994, "logits/rejected": -2.603969097137451, "logps/chosen": -51.273258209228516, "logps/rejected": -60.235679626464844, "loss": 0.8652, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12753713130950928, "rewards/margins": 0.669442892074585, "rewards/rejected": -0.5419057607650757, "step": 287 }, { "epoch": 0.6599828129475795, "grad_norm": 5.355335712432861, "learning_rate": 9.341743119266054e-07, "logits/chosen": -2.5545077323913574, "logits/rejected": -2.5319716930389404, "logps/chosen": -57.31746292114258, "logps/rejected": -70.46867370605469, "loss": 0.8356, "rewards/accuracies": 0.84375, "rewards/chosen": -0.29096898436546326, "rewards/margins": 1.0216161012649536, "rewards/rejected": -1.3125849962234497, "step": 288 }, { "epoch": 0.6622744199369808, "grad_norm": 5.946140289306641, "learning_rate": 9.339449541284404e-07, "logits/chosen": -2.672013998031616, "logits/rejected": -2.6469383239746094, "logps/chosen": -50.98649597167969, "logps/rejected": -70.35344696044922, "loss": 0.7212, "rewards/accuracies": 0.90625, "rewards/chosen": 0.37567514181137085, "rewards/margins": 1.453900694847107, "rewards/rejected": -1.0782256126403809, "step": 289 }, { "epoch": 0.6645660269263821, "grad_norm": 6.038196563720703, "learning_rate": 9.337155963302752e-07, "logits/chosen": -2.6178736686706543, "logits/rejected": -2.6595280170440674, "logps/chosen": -49.894737243652344, "logps/rejected": -72.41776275634766, "loss": 0.6871, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6327916383743286, "rewards/margins": 1.7547415494918823, "rewards/rejected": -1.1219500303268433, "step": 290 }, { "epoch": 0.6668576339157835, "grad_norm": 7.865609645843506, "learning_rate": 9.334862385321101e-07, "logits/chosen": -2.634046792984009, "logits/rejected": -2.6217257976531982, "logps/chosen": -53.0368537902832, "logps/rejected": -67.96994018554688, "loss": 0.8077, "rewards/accuracies": 0.90625, "rewards/chosen": 0.07848355919122696, "rewards/margins": 1.067720890045166, "rewards/rejected": -0.9892373085021973, "step": 291 }, { "epoch": 0.6691492409051848, "grad_norm": 5.627008438110352, "learning_rate": 9.332568807339449e-07, "logits/chosen": -2.5544214248657227, "logits/rejected": -2.5966532230377197, "logps/chosen": -52.08951187133789, "logps/rejected": -69.9273681640625, "loss": 0.7688, "rewards/accuracies": 0.875, "rewards/chosen": 0.35277774930000305, "rewards/margins": 1.2723475694656372, "rewards/rejected": -0.9195697903633118, "step": 292 }, { "epoch": 0.671440847894586, "grad_norm": 6.527369976043701, "learning_rate": 9.330275229357798e-07, "logits/chosen": -2.6188035011291504, "logits/rejected": -2.625260353088379, "logps/chosen": -48.85770034790039, "logps/rejected": -71.65483856201172, "loss": 0.7453, "rewards/accuracies": 0.84375, "rewards/chosen": 0.05243232101202011, "rewards/margins": 1.5192617177963257, "rewards/rejected": -1.4668292999267578, "step": 293 }, { "epoch": 0.6737324548839874, "grad_norm": 10.573755264282227, "learning_rate": 9.327981651376147e-07, "logits/chosen": -2.7246363162994385, "logits/rejected": -2.678586959838867, "logps/chosen": -50.02998352050781, "logps/rejected": -72.75901794433594, "loss": 0.723, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3509270250797272, "rewards/margins": 1.7294293642044067, "rewards/rejected": -1.378502368927002, "step": 294 }, { "epoch": 0.6760240618733887, "grad_norm": 6.383469104766846, "learning_rate": 9.325688073394495e-07, "logits/chosen": -2.6547415256500244, "logits/rejected": -2.6589231491088867, "logps/chosen": -52.00181579589844, "logps/rejected": -66.2138671875, "loss": 0.8376, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09474818408489227, "rewards/margins": 0.9257742762565613, "rewards/rejected": -0.8310259580612183, "step": 295 }, { "epoch": 0.67831566886279, "grad_norm": 6.458634853363037, "learning_rate": 9.323394495412844e-07, "logits/chosen": -2.6273341178894043, "logits/rejected": -2.6544532775878906, "logps/chosen": -55.30955505371094, "logps/rejected": -80.00634765625, "loss": 0.702, "rewards/accuracies": 0.96875, "rewards/chosen": 0.08071079850196838, "rewards/margins": 1.8012720346450806, "rewards/rejected": -1.7205613851547241, "step": 296 }, { "epoch": 0.6806072758521914, "grad_norm": 4.929348945617676, "learning_rate": 9.321100917431193e-07, "logits/chosen": -2.6669936180114746, "logits/rejected": -2.6472830772399902, "logps/chosen": -48.784934997558594, "logps/rejected": -65.0811996459961, "loss": 0.8148, "rewards/accuracies": 0.90625, "rewards/chosen": 0.38942229747772217, "rewards/margins": 1.0871679782867432, "rewards/rejected": -0.697745680809021, "step": 297 }, { "epoch": 0.6828988828415926, "grad_norm": 6.713610649108887, "learning_rate": 9.31880733944954e-07, "logits/chosen": -2.6908345222473145, "logits/rejected": -2.6245124340057373, "logps/chosen": -50.011268615722656, "logps/rejected": -68.74568939208984, "loss": 0.7621, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6102133989334106, "rewards/margins": 1.3339869976043701, "rewards/rejected": -0.7237736582756042, "step": 298 }, { "epoch": 0.685190489830994, "grad_norm": 6.276400089263916, "learning_rate": 9.316513761467889e-07, "logits/chosen": -2.642648696899414, "logits/rejected": -2.6079342365264893, "logps/chosen": -52.96281433105469, "logps/rejected": -75.7122802734375, "loss": 0.7179, "rewards/accuracies": 0.875, "rewards/chosen": 0.3693735599517822, "rewards/margins": 1.5340803861618042, "rewards/rejected": -1.164706826210022, "step": 299 }, { "epoch": 0.6874820968203953, "grad_norm": 8.628591537475586, "learning_rate": 9.314220183486238e-07, "logits/chosen": -2.7051188945770264, "logits/rejected": -2.671801805496216, "logps/chosen": -46.1861572265625, "logps/rejected": -66.36266326904297, "loss": 0.6907, "rewards/accuracies": 0.875, "rewards/chosen": 0.8216037154197693, "rewards/margins": 1.6061707735061646, "rewards/rejected": -0.7845670580863953, "step": 300 }, { "epoch": 0.6897737038097966, "grad_norm": 6.440505504608154, "learning_rate": 9.311926605504587e-07, "logits/chosen": -2.6650826930999756, "logits/rejected": -2.652815818786621, "logps/chosen": -48.08549499511719, "logps/rejected": -70.15719604492188, "loss": 0.7238, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5941813588142395, "rewards/margins": 1.3859161138534546, "rewards/rejected": -0.7917348742485046, "step": 301 }, { "epoch": 0.692065310799198, "grad_norm": 5.359339714050293, "learning_rate": 9.309633027522935e-07, "logits/chosen": -2.680539608001709, "logits/rejected": -2.635784387588501, "logps/chosen": -48.940433502197266, "logps/rejected": -71.78356170654297, "loss": 0.7179, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5751877427101135, "rewards/margins": 1.692313551902771, "rewards/rejected": -1.1171256303787231, "step": 302 }, { "epoch": 0.6943569177885992, "grad_norm": 5.057033061981201, "learning_rate": 9.307339449541283e-07, "logits/chosen": -2.694389820098877, "logits/rejected": -2.660111665725708, "logps/chosen": -44.83932113647461, "logps/rejected": -61.25117492675781, "loss": 0.803, "rewards/accuracies": 0.875, "rewards/chosen": 0.7223673462867737, "rewards/margins": 1.0620067119598389, "rewards/rejected": -0.3396393656730652, "step": 303 }, { "epoch": 0.6966485247780005, "grad_norm": 5.974701881408691, "learning_rate": 9.305045871559633e-07, "logits/chosen": -2.6814043521881104, "logits/rejected": -2.689110040664673, "logps/chosen": -49.956295013427734, "logps/rejected": -69.78269958496094, "loss": 0.7223, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5884120464324951, "rewards/margins": 1.596127986907959, "rewards/rejected": -1.0077160596847534, "step": 304 }, { "epoch": 0.6989401317674019, "grad_norm": 6.726781368255615, "learning_rate": 9.302752293577981e-07, "logits/chosen": -2.581974983215332, "logits/rejected": -2.5883753299713135, "logps/chosen": -58.570030212402344, "logps/rejected": -78.57171630859375, "loss": 0.7664, "rewards/accuracies": 0.84375, "rewards/chosen": 0.05419183149933815, "rewards/margins": 1.3637399673461914, "rewards/rejected": -1.3095481395721436, "step": 305 }, { "epoch": 0.7012317387568032, "grad_norm": 5.47599458694458, "learning_rate": 9.30045871559633e-07, "logits/chosen": -2.5920591354370117, "logits/rejected": -2.6165637969970703, "logps/chosen": -47.71095275878906, "logps/rejected": -71.86416625976562, "loss": 0.6296, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7202261686325073, "rewards/margins": 1.9868664741516113, "rewards/rejected": -1.2666401863098145, "step": 306 }, { "epoch": 0.7035233457462046, "grad_norm": 5.777373313903809, "learning_rate": 9.298165137614679e-07, "logits/chosen": -2.7216954231262207, "logits/rejected": -2.74579119682312, "logps/chosen": -49.71026611328125, "logps/rejected": -66.83014678955078, "loss": 0.782, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4418894648551941, "rewards/margins": 1.2097687721252441, "rewards/rejected": -0.7678793668746948, "step": 307 }, { "epoch": 0.7058149527356058, "grad_norm": 6.148016929626465, "learning_rate": 9.295871559633027e-07, "logits/chosen": -2.6593868732452393, "logits/rejected": -2.71828293800354, "logps/chosen": -50.899932861328125, "logps/rejected": -66.67103576660156, "loss": 0.7484, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5115935802459717, "rewards/margins": 1.2868036031723022, "rewards/rejected": -0.7752099633216858, "step": 308 }, { "epoch": 0.7081065597250071, "grad_norm": 6.613726615905762, "learning_rate": 9.293577981651376e-07, "logits/chosen": -2.708317756652832, "logits/rejected": -2.6726083755493164, "logps/chosen": -45.8685302734375, "logps/rejected": -67.62336730957031, "loss": 0.7084, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7502938508987427, "rewards/margins": 1.7017261981964111, "rewards/rejected": -0.9514322280883789, "step": 309 }, { "epoch": 0.7103981667144085, "grad_norm": 8.766711235046387, "learning_rate": 9.291284403669724e-07, "logits/chosen": -2.63211727142334, "logits/rejected": -2.605884313583374, "logps/chosen": -57.19648742675781, "logps/rejected": -80.3425064086914, "loss": 0.7463, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0028791576623916626, "rewards/margins": 1.7937946319580078, "rewards/rejected": -1.7966736555099487, "step": 310 }, { "epoch": 0.7126897737038098, "grad_norm": 8.371105194091797, "learning_rate": 9.288990825688074e-07, "logits/chosen": -2.714887857437134, "logits/rejected": -2.7049827575683594, "logps/chosen": -49.44217300415039, "logps/rejected": -68.60822296142578, "loss": 0.7315, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8249673843383789, "rewards/margins": 1.4924650192260742, "rewards/rejected": -0.6674976348876953, "step": 311 }, { "epoch": 0.7149813806932112, "grad_norm": 5.895245552062988, "learning_rate": 9.286697247706422e-07, "logits/chosen": -2.673557758331299, "logits/rejected": -2.6713364124298096, "logps/chosen": -56.734657287597656, "logps/rejected": -76.59254455566406, "loss": 0.7725, "rewards/accuracies": 0.875, "rewards/chosen": 0.0355062298476696, "rewards/margins": 1.3535679578781128, "rewards/rejected": -1.3180615901947021, "step": 312 }, { "epoch": 0.7172729876826124, "grad_norm": 7.580226898193359, "learning_rate": 9.284403669724771e-07, "logits/chosen": -2.6923775672912598, "logits/rejected": -2.686574697494507, "logps/chosen": -49.36705780029297, "logps/rejected": -68.06683349609375, "loss": 0.6884, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8183351159095764, "rewards/margins": 1.7346729040145874, "rewards/rejected": -0.9163378477096558, "step": 313 }, { "epoch": 0.7195645946720137, "grad_norm": 5.531320571899414, "learning_rate": 9.282110091743118e-07, "logits/chosen": -2.734508991241455, "logits/rejected": -2.7189762592315674, "logps/chosen": -53.46392822265625, "logps/rejected": -72.79191589355469, "loss": 0.7619, "rewards/accuracies": 0.875, "rewards/chosen": 0.222720205783844, "rewards/margins": 1.4941738843917847, "rewards/rejected": -1.2714537382125854, "step": 314 }, { "epoch": 0.7218562016614151, "grad_norm": 6.360165596008301, "learning_rate": 9.279816513761467e-07, "logits/chosen": -2.6527347564697266, "logits/rejected": -2.6479294300079346, "logps/chosen": -52.28504180908203, "logps/rejected": -70.52548217773438, "loss": 0.7571, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39201033115386963, "rewards/margins": 1.4794327020645142, "rewards/rejected": -1.0874223709106445, "step": 315 }, { "epoch": 0.7241478086508164, "grad_norm": 9.471627235412598, "learning_rate": 9.277522935779816e-07, "logits/chosen": -2.606841564178467, "logits/rejected": -2.60524320602417, "logps/chosen": -53.827510833740234, "logps/rejected": -71.78755187988281, "loss": 0.7635, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2885501980781555, "rewards/margins": 1.455904245376587, "rewards/rejected": -1.1673541069030762, "step": 316 }, { "epoch": 0.7264394156402177, "grad_norm": 6.177737236022949, "learning_rate": 9.275229357798164e-07, "logits/chosen": -2.6941277980804443, "logits/rejected": -2.6611075401306152, "logps/chosen": -52.64148712158203, "logps/rejected": -69.6147689819336, "loss": 0.8089, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0041885897517204285, "rewards/margins": 1.2433408498764038, "rewards/rejected": -1.2475292682647705, "step": 317 }, { "epoch": 0.728731022629619, "grad_norm": 6.5022807121276855, "learning_rate": 9.272935779816514e-07, "logits/chosen": -2.6911423206329346, "logits/rejected": -2.666844129562378, "logps/chosen": -61.06907653808594, "logps/rejected": -77.8279037475586, "loss": 0.7914, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2706552743911743, "rewards/margins": 1.3436059951782227, "rewards/rejected": -1.614261269569397, "step": 318 }, { "epoch": 0.7310226296190203, "grad_norm": 6.9370293617248535, "learning_rate": 9.270642201834862e-07, "logits/chosen": -2.7461681365966797, "logits/rejected": -2.7205517292022705, "logps/chosen": -52.981956481933594, "logps/rejected": -68.80435180664062, "loss": 0.7256, "rewards/accuracies": 0.90625, "rewards/chosen": 0.20379656553268433, "rewards/margins": 1.4949713945388794, "rewards/rejected": -1.2911748886108398, "step": 319 }, { "epoch": 0.7333142366084217, "grad_norm": 11.275660514831543, "learning_rate": 9.26834862385321e-07, "logits/chosen": -2.6384973526000977, "logits/rejected": -2.594599723815918, "logps/chosen": -61.062870025634766, "logps/rejected": -80.94255065917969, "loss": 0.7876, "rewards/accuracies": 0.9375, "rewards/chosen": -0.35588786005973816, "rewards/margins": 1.52791166305542, "rewards/rejected": -1.88379967212677, "step": 320 }, { "epoch": 0.735605843597823, "grad_norm": 6.553607940673828, "learning_rate": 9.266055045871559e-07, "logits/chosen": -2.675363063812256, "logits/rejected": -2.6580865383148193, "logps/chosen": -52.617347717285156, "logps/rejected": -78.3368911743164, "loss": 0.6759, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21649259328842163, "rewards/margins": 1.8958333730697632, "rewards/rejected": -1.6793408393859863, "step": 321 }, { "epoch": 0.7378974505872243, "grad_norm": 9.806528091430664, "learning_rate": 9.263761467889908e-07, "logits/chosen": -2.746356964111328, "logits/rejected": -2.7125871181488037, "logps/chosen": -49.06481170654297, "logps/rejected": -78.5463638305664, "loss": 0.586, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5616695880889893, "rewards/margins": 2.306489944458008, "rewards/rejected": -1.744820237159729, "step": 322 }, { "epoch": 0.7401890575766256, "grad_norm": 10.282638549804688, "learning_rate": 9.261467889908257e-07, "logits/chosen": -2.7156314849853516, "logits/rejected": -2.709059000015259, "logps/chosen": -52.86640167236328, "logps/rejected": -72.64443969726562, "loss": 0.761, "rewards/accuracies": 0.875, "rewards/chosen": 0.1512645035982132, "rewards/margins": 1.5428390502929688, "rewards/rejected": -1.3915746212005615, "step": 323 }, { "epoch": 0.7424806645660269, "grad_norm": 6.0150041580200195, "learning_rate": 9.259174311926605e-07, "logits/chosen": -2.6571106910705566, "logits/rejected": -2.6465437412261963, "logps/chosen": -50.8936653137207, "logps/rejected": -67.60090637207031, "loss": 0.7589, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3987787961959839, "rewards/margins": 1.4057258367538452, "rewards/rejected": -1.0069470405578613, "step": 324 }, { "epoch": 0.7447722715554282, "grad_norm": 7.966578483581543, "learning_rate": 9.256880733944953e-07, "logits/chosen": -2.7668673992156982, "logits/rejected": -2.698885440826416, "logps/chosen": -51.876365661621094, "logps/rejected": -74.59330749511719, "loss": 0.7299, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25302693247795105, "rewards/margins": 1.5850459337234497, "rewards/rejected": -1.3320189714431763, "step": 325 }, { "epoch": 0.7470638785448296, "grad_norm": 5.686113357543945, "learning_rate": 9.254587155963303e-07, "logits/chosen": -2.7058019638061523, "logits/rejected": -2.6789238452911377, "logps/chosen": -51.090126037597656, "logps/rejected": -71.60540008544922, "loss": 0.7287, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1853671818971634, "rewards/margins": 1.5494506359100342, "rewards/rejected": -1.3640835285186768, "step": 326 }, { "epoch": 0.7493554855342309, "grad_norm": 5.694385051727295, "learning_rate": 9.252293577981651e-07, "logits/chosen": -2.7034542560577393, "logits/rejected": -2.6984992027282715, "logps/chosen": -51.324302673339844, "logps/rejected": -73.80966186523438, "loss": 0.7013, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5552006959915161, "rewards/margins": 1.6896777153015137, "rewards/rejected": -1.134477138519287, "step": 327 }, { "epoch": 0.7516470925236322, "grad_norm": 6.710911273956299, "learning_rate": 9.25e-07, "logits/chosen": -2.644695997238159, "logits/rejected": -2.651027202606201, "logps/chosen": -51.499908447265625, "logps/rejected": -79.66869354248047, "loss": 0.6584, "rewards/accuracies": 0.84375, "rewards/chosen": 0.25387534499168396, "rewards/margins": 2.117008924484253, "rewards/rejected": -1.863133430480957, "step": 328 }, { "epoch": 0.7539386995130335, "grad_norm": 7.869955539703369, "learning_rate": 9.247706422018349e-07, "logits/chosen": -2.750239849090576, "logits/rejected": -2.729773998260498, "logps/chosen": -47.71949005126953, "logps/rejected": -65.39254760742188, "loss": 0.7764, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9699089527130127, "rewards/margins": 1.3600369691848755, "rewards/rejected": -0.3901280462741852, "step": 329 }, { "epoch": 0.7562303065024348, "grad_norm": 9.118949890136719, "learning_rate": 9.245412844036696e-07, "logits/chosen": -2.691195249557495, "logits/rejected": -2.6899123191833496, "logps/chosen": -45.49633026123047, "logps/rejected": -65.2481918334961, "loss": 0.7552, "rewards/accuracies": 0.875, "rewards/chosen": 0.5062370300292969, "rewards/margins": 1.6250838041305542, "rewards/rejected": -1.1188467741012573, "step": 330 }, { "epoch": 0.7585219134918362, "grad_norm": 5.984745025634766, "learning_rate": 9.243119266055045e-07, "logits/chosen": -2.670910358428955, "logits/rejected": -2.6605072021484375, "logps/chosen": -49.36393737792969, "logps/rejected": -67.04859924316406, "loss": 0.7905, "rewards/accuracies": 0.875, "rewards/chosen": 0.3787236511707306, "rewards/margins": 1.3121864795684814, "rewards/rejected": -0.9334626197814941, "step": 331 }, { "epoch": 0.7608135204812375, "grad_norm": 6.127556324005127, "learning_rate": 9.240825688073393e-07, "logits/chosen": -2.667285203933716, "logits/rejected": -2.6617653369903564, "logps/chosen": -49.60015106201172, "logps/rejected": -73.46753692626953, "loss": 0.7257, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5686452984809875, "rewards/margins": 1.7799302339553833, "rewards/rejected": -1.21128511428833, "step": 332 }, { "epoch": 0.7631051274706387, "grad_norm": 5.67525053024292, "learning_rate": 9.238532110091743e-07, "logits/chosen": -2.7993550300598145, "logits/rejected": -2.788600444793701, "logps/chosen": -56.53118896484375, "logps/rejected": -75.06880187988281, "loss": 0.7661, "rewards/accuracies": 0.9375, "rewards/chosen": 0.002457279711961746, "rewards/margins": 1.502623200416565, "rewards/rejected": -1.5001659393310547, "step": 333 }, { "epoch": 0.7653967344600401, "grad_norm": 7.050647735595703, "learning_rate": 9.236238532110091e-07, "logits/chosen": -2.6880977153778076, "logits/rejected": -2.6890392303466797, "logps/chosen": -39.51174545288086, "logps/rejected": -58.6702766418457, "loss": 0.7303, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9226584434509277, "rewards/margins": 1.5107495784759521, "rewards/rejected": -0.588091254234314, "step": 334 }, { "epoch": 0.7676883414494414, "grad_norm": 6.249875545501709, "learning_rate": 9.23394495412844e-07, "logits/chosen": -2.684173583984375, "logits/rejected": -2.653200149536133, "logps/chosen": -52.36058044433594, "logps/rejected": -75.16995239257812, "loss": 0.7294, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4985213577747345, "rewards/margins": 1.5970282554626465, "rewards/rejected": -1.0985068082809448, "step": 335 }, { "epoch": 0.7699799484388428, "grad_norm": 5.407040596008301, "learning_rate": 9.231651376146789e-07, "logits/chosen": -2.725461483001709, "logits/rejected": -2.7293035984039307, "logps/chosen": -56.0599365234375, "logps/rejected": -77.62919616699219, "loss": 0.7672, "rewards/accuracies": 0.96875, "rewards/chosen": -0.08116450905799866, "rewards/margins": 1.3824056386947632, "rewards/rejected": -1.4635698795318604, "step": 336 }, { "epoch": 0.7722715554282441, "grad_norm": 5.937873363494873, "learning_rate": 9.229357798165137e-07, "logits/chosen": -2.676939010620117, "logits/rejected": -2.64969539642334, "logps/chosen": -47.42722702026367, "logps/rejected": -74.38787841796875, "loss": 0.6592, "rewards/accuracies": 0.96875, "rewards/chosen": 0.762557864189148, "rewards/margins": 2.041743755340576, "rewards/rejected": -1.2791857719421387, "step": 337 }, { "epoch": 0.7745631624176453, "grad_norm": 7.122337818145752, "learning_rate": 9.227064220183486e-07, "logits/chosen": -2.659635066986084, "logits/rejected": -2.6673386096954346, "logps/chosen": -49.19886016845703, "logps/rejected": -70.43307495117188, "loss": 0.7201, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8896642327308655, "rewards/margins": 1.7097458839416504, "rewards/rejected": -0.8200818300247192, "step": 338 }, { "epoch": 0.7768547694070467, "grad_norm": 6.155177116394043, "learning_rate": 9.224770642201834e-07, "logits/chosen": -2.7392756938934326, "logits/rejected": -2.7355427742004395, "logps/chosen": -52.09617233276367, "logps/rejected": -72.2508544921875, "loss": 0.7573, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5507375597953796, "rewards/margins": 1.3817261457443237, "rewards/rejected": -0.8309885859489441, "step": 339 }, { "epoch": 0.779146376396448, "grad_norm": 7.9037275314331055, "learning_rate": 9.222477064220184e-07, "logits/chosen": -2.660238265991211, "logits/rejected": -2.693035125732422, "logps/chosen": -48.126991271972656, "logps/rejected": -70.56439971923828, "loss": 0.7173, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6557871699333191, "rewards/margins": 1.7812167406082153, "rewards/rejected": -1.1254295110702515, "step": 340 }, { "epoch": 0.7814379833858494, "grad_norm": 6.254977703094482, "learning_rate": 9.220183486238532e-07, "logits/chosen": -2.6745996475219727, "logits/rejected": -2.684039831161499, "logps/chosen": -49.61480712890625, "logps/rejected": -70.01897430419922, "loss": 0.76, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6180250644683838, "rewards/margins": 1.4971866607666016, "rewards/rejected": -0.8791614770889282, "step": 341 }, { "epoch": 0.7837295903752507, "grad_norm": 8.495152473449707, "learning_rate": 9.21788990825688e-07, "logits/chosen": -2.751105308532715, "logits/rejected": -2.721966028213501, "logps/chosen": -52.70249557495117, "logps/rejected": -72.50753784179688, "loss": 0.756, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4760459065437317, "rewards/margins": 1.4121276140213013, "rewards/rejected": -0.9360816478729248, "step": 342 }, { "epoch": 0.7860211973646519, "grad_norm": 9.72978687286377, "learning_rate": 9.21559633027523e-07, "logits/chosen": -2.797729969024658, "logits/rejected": -2.7807631492614746, "logps/chosen": -43.547325134277344, "logps/rejected": -65.4448471069336, "loss": 0.7101, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9657273292541504, "rewards/margins": 1.8096628189086914, "rewards/rejected": -0.8439353704452515, "step": 343 }, { "epoch": 0.7883128043540533, "grad_norm": 9.404946327209473, "learning_rate": 9.213302752293578e-07, "logits/chosen": -2.7083261013031006, "logits/rejected": -2.6577870845794678, "logps/chosen": -49.94548416137695, "logps/rejected": -73.49323272705078, "loss": 0.6989, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44170865416526794, "rewards/margins": 1.8282010555267334, "rewards/rejected": -1.3864924907684326, "step": 344 }, { "epoch": 0.7906044113434546, "grad_norm": 9.420591354370117, "learning_rate": 9.211009174311927e-07, "logits/chosen": -2.73184871673584, "logits/rejected": -2.6851718425750732, "logps/chosen": -48.38861846923828, "logps/rejected": -71.2001724243164, "loss": 0.7444, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7398673892021179, "rewards/margins": 1.773229956626892, "rewards/rejected": -1.0333627462387085, "step": 345 }, { "epoch": 0.792896018332856, "grad_norm": 5.599563121795654, "learning_rate": 9.208715596330274e-07, "logits/chosen": -2.7206966876983643, "logits/rejected": -2.681046962738037, "logps/chosen": -51.26990509033203, "logps/rejected": -69.45085906982422, "loss": 0.7501, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6259634494781494, "rewards/margins": 1.5185999870300293, "rewards/rejected": -0.8926364183425903, "step": 346 }, { "epoch": 0.7951876253222573, "grad_norm": 13.253547668457031, "learning_rate": 9.206422018348623e-07, "logits/chosen": -2.7350080013275146, "logits/rejected": -2.7599873542785645, "logps/chosen": -48.524253845214844, "logps/rejected": -63.56654357910156, "loss": 0.7746, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6749916076660156, "rewards/margins": 1.213783860206604, "rewards/rejected": -0.5387921929359436, "step": 347 }, { "epoch": 0.7974792323116585, "grad_norm": 6.624053001403809, "learning_rate": 9.204128440366972e-07, "logits/chosen": -2.6695497035980225, "logits/rejected": -2.6587042808532715, "logps/chosen": -51.68325424194336, "logps/rejected": -74.76049041748047, "loss": 0.7357, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30837446451187134, "rewards/margins": 1.6974338293075562, "rewards/rejected": -1.38905930519104, "step": 348 }, { "epoch": 0.7997708393010599, "grad_norm": 6.998566627502441, "learning_rate": 9.20183486238532e-07, "logits/chosen": -2.685335397720337, "logits/rejected": -2.7223892211914062, "logps/chosen": -53.66202163696289, "logps/rejected": -74.52205657958984, "loss": 0.7518, "rewards/accuracies": 0.875, "rewards/chosen": 0.09694499522447586, "rewards/margins": 1.4861277341842651, "rewards/rejected": -1.3891825675964355, "step": 349 }, { "epoch": 0.8020624462904612, "grad_norm": 8.686656951904297, "learning_rate": 9.199541284403669e-07, "logits/chosen": -2.7125816345214844, "logits/rejected": -2.7319021224975586, "logps/chosen": -53.31010818481445, "logps/rejected": -74.87728118896484, "loss": 0.7376, "rewards/accuracies": 0.96875, "rewards/chosen": 0.13180851936340332, "rewards/margins": 1.728675127029419, "rewards/rejected": -1.5968666076660156, "step": 350 }, { "epoch": 0.8043540532798625, "grad_norm": 7.3089823722839355, "learning_rate": 9.197247706422018e-07, "logits/chosen": -2.718557357788086, "logits/rejected": -2.6681008338928223, "logps/chosen": -44.00445556640625, "logps/rejected": -70.46133422851562, "loss": 0.6554, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7913710474967957, "rewards/margins": 2.1079463958740234, "rewards/rejected": -1.3165754079818726, "step": 351 }, { "epoch": 0.8066456602692639, "grad_norm": 8.469682693481445, "learning_rate": 9.194954128440366e-07, "logits/chosen": -2.7123734951019287, "logits/rejected": -2.691831350326538, "logps/chosen": -41.43022918701172, "logps/rejected": -67.84603881835938, "loss": 0.6382, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2834818363189697, "rewards/margins": 2.128718852996826, "rewards/rejected": -0.8452369570732117, "step": 352 }, { "epoch": 0.8066456602692639, "eval_logits/chosen": -2.7932679653167725, "eval_logits/rejected": -2.7990829944610596, "eval_logps/chosen": -57.56865310668945, "eval_logps/rejected": -75.40155029296875, "eval_loss": 0.7857644557952881, "eval_rewards/accuracies": 0.8150943517684937, "eval_rewards/chosen": -0.050629764795303345, "eval_rewards/margins": 1.3845810890197754, "eval_rewards/rejected": -1.435210943222046, "eval_runtime": 957.5846, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.277, "step": 352 }, { "epoch": 0.8089372672586651, "grad_norm": 7.2574567794799805, "learning_rate": 9.192660550458715e-07, "logits/chosen": -2.668030261993408, "logits/rejected": -2.686202049255371, "logps/chosen": -47.91956329345703, "logps/rejected": -78.14730834960938, "loss": 0.6482, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7440254092216492, "rewards/margins": 2.4334239959716797, "rewards/rejected": -1.6893982887268066, "step": 353 }, { "epoch": 0.8112288742480664, "grad_norm": 8.897721290588379, "learning_rate": 9.190366972477064e-07, "logits/chosen": -2.803189516067505, "logits/rejected": -2.7375807762145996, "logps/chosen": -56.82188415527344, "logps/rejected": -72.90160369873047, "loss": 0.7831, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0398235097527504, "rewards/margins": 1.338247299194336, "rewards/rejected": -1.2984237670898438, "step": 354 }, { "epoch": 0.8135204812374678, "grad_norm": 5.680558204650879, "learning_rate": 9.188073394495413e-07, "logits/chosen": -2.7160332202911377, "logits/rejected": -2.6720950603485107, "logps/chosen": -55.084999084472656, "logps/rejected": -70.38575744628906, "loss": 0.7976, "rewards/accuracies": 0.75, "rewards/chosen": 0.2539314329624176, "rewards/margins": 1.110538363456726, "rewards/rejected": -0.8566069602966309, "step": 355 }, { "epoch": 0.8158120882268691, "grad_norm": 6.845278263092041, "learning_rate": 9.185779816513761e-07, "logits/chosen": -2.6607816219329834, "logits/rejected": -2.6966776847839355, "logps/chosen": -55.808509826660156, "logps/rejected": -72.27617645263672, "loss": 0.7324, "rewards/accuracies": 0.84375, "rewards/chosen": 0.09458237886428833, "rewards/margins": 1.4214969873428345, "rewards/rejected": -1.3269145488739014, "step": 356 }, { "epoch": 0.8181036952162704, "grad_norm": 6.313134670257568, "learning_rate": 9.18348623853211e-07, "logits/chosen": -2.730578660964966, "logits/rejected": -2.6816303730010986, "logps/chosen": -48.31555938720703, "logps/rejected": -72.46643829345703, "loss": 0.703, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5653499364852905, "rewards/margins": 1.757791519165039, "rewards/rejected": -1.1924415826797485, "step": 357 }, { "epoch": 0.8203953022056717, "grad_norm": 6.582780361175537, "learning_rate": 9.181192660550459e-07, "logits/chosen": -2.7206318378448486, "logits/rejected": -2.7228012084960938, "logps/chosen": -55.04595184326172, "logps/rejected": -79.31980895996094, "loss": 0.7238, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12026151269674301, "rewards/margins": 1.9304206371307373, "rewards/rejected": -1.8101592063903809, "step": 358 }, { "epoch": 0.822686909195073, "grad_norm": 8.447746276855469, "learning_rate": 9.178899082568807e-07, "logits/chosen": -2.685009479522705, "logits/rejected": -2.712524652481079, "logps/chosen": -48.597652435302734, "logps/rejected": -77.32186889648438, "loss": 0.6596, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6789374947547913, "rewards/margins": 2.3104665279388428, "rewards/rejected": -1.6315289735794067, "step": 359 }, { "epoch": 0.8249785161844744, "grad_norm": 6.757567405700684, "learning_rate": 9.176605504587156e-07, "logits/chosen": -2.7262799739837646, "logits/rejected": -2.685901165008545, "logps/chosen": -52.86701583862305, "logps/rejected": -79.45989990234375, "loss": 0.7282, "rewards/accuracies": 0.84375, "rewards/chosen": 0.19711196422576904, "rewards/margins": 1.6702362298965454, "rewards/rejected": -1.4731241464614868, "step": 360 }, { "epoch": 0.8272701231738757, "grad_norm": 7.344590663909912, "learning_rate": 9.174311926605505e-07, "logits/chosen": -2.759305238723755, "logits/rejected": -2.7448604106903076, "logps/chosen": -43.700958251953125, "logps/rejected": -66.41059875488281, "loss": 0.7162, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0962235927581787, "rewards/margins": 1.890195369720459, "rewards/rejected": -0.7939717769622803, "step": 361 }, { "epoch": 0.829561730163277, "grad_norm": 9.360381126403809, "learning_rate": 9.172018348623853e-07, "logits/chosen": -2.6549224853515625, "logits/rejected": -2.6840455532073975, "logps/chosen": -54.72574234008789, "logps/rejected": -77.7623519897461, "loss": 0.7164, "rewards/accuracies": 0.875, "rewards/chosen": 0.18659526109695435, "rewards/margins": 1.862777829170227, "rewards/rejected": -1.676182746887207, "step": 362 }, { "epoch": 0.8318533371526783, "grad_norm": 6.127354621887207, "learning_rate": 9.169724770642201e-07, "logits/chosen": -2.7229561805725098, "logits/rejected": -2.740668296813965, "logps/chosen": -49.17110061645508, "logps/rejected": -68.8419189453125, "loss": 0.7333, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6764799356460571, "rewards/margins": 1.5988519191741943, "rewards/rejected": -0.922372043132782, "step": 363 }, { "epoch": 0.8341449441420796, "grad_norm": 9.340347290039062, "learning_rate": 9.167431192660549e-07, "logits/chosen": -2.6694066524505615, "logits/rejected": -2.696141242980957, "logps/chosen": -52.657188415527344, "logps/rejected": -76.66195678710938, "loss": 0.7001, "rewards/accuracies": 0.90625, "rewards/chosen": 0.39189016819000244, "rewards/margins": 1.9583412408828735, "rewards/rejected": -1.566451072692871, "step": 364 }, { "epoch": 0.836436551131481, "grad_norm": 8.92735481262207, "learning_rate": 9.165137614678899e-07, "logits/chosen": -2.734232187271118, "logits/rejected": -2.6938891410827637, "logps/chosen": -42.76641082763672, "logps/rejected": -63.954444885253906, "loss": 0.7228, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7601646184921265, "rewards/margins": 1.606305718421936, "rewards/rejected": -0.84614098072052, "step": 365 }, { "epoch": 0.8387281581208823, "grad_norm": 9.562631607055664, "learning_rate": 9.162844036697247e-07, "logits/chosen": -2.6998603343963623, "logits/rejected": -2.688021659851074, "logps/chosen": -42.06915283203125, "logps/rejected": -61.441444396972656, "loss": 0.7533, "rewards/accuracies": 0.8125, "rewards/chosen": 1.166413426399231, "rewards/margins": 1.5734925270080566, "rewards/rejected": -0.4070790112018585, "step": 366 }, { "epoch": 0.8410197651102835, "grad_norm": 8.997941970825195, "learning_rate": 9.160550458715596e-07, "logits/chosen": -2.741565465927124, "logits/rejected": -2.713042736053467, "logps/chosen": -52.415523529052734, "logps/rejected": -74.59005737304688, "loss": 0.7068, "rewards/accuracies": 0.875, "rewards/chosen": 0.1739051192998886, "rewards/margins": 1.6801170110702515, "rewards/rejected": -1.5062119960784912, "step": 367 }, { "epoch": 0.8433113720996849, "grad_norm": 7.388655185699463, "learning_rate": 9.158256880733944e-07, "logits/chosen": -2.7375974655151367, "logits/rejected": -2.7255115509033203, "logps/chosen": -49.58521270751953, "logps/rejected": -73.490478515625, "loss": 0.6901, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7375156283378601, "rewards/margins": 1.998033046722412, "rewards/rejected": -1.2605173587799072, "step": 368 }, { "epoch": 0.8456029790890862, "grad_norm": 7.6949896812438965, "learning_rate": 9.155963302752293e-07, "logits/chosen": -2.6015336513519287, "logits/rejected": -2.623850107192993, "logps/chosen": -47.436153411865234, "logps/rejected": -74.66218566894531, "loss": 0.6227, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7764937877655029, "rewards/margins": 2.2302443981170654, "rewards/rejected": -1.453750729560852, "step": 369 }, { "epoch": 0.8478945860784876, "grad_norm": 8.802088737487793, "learning_rate": 9.153669724770642e-07, "logits/chosen": -2.6845812797546387, "logits/rejected": -2.64755916595459, "logps/chosen": -49.626625061035156, "logps/rejected": -78.63785552978516, "loss": 0.6679, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6046723127365112, "rewards/margins": 2.265115261077881, "rewards/rejected": -1.6604427099227905, "step": 370 }, { "epoch": 0.8501861930678889, "grad_norm": 6.960660457611084, "learning_rate": 9.15137614678899e-07, "logits/chosen": -2.6999173164367676, "logits/rejected": -2.7031309604644775, "logps/chosen": -40.50932693481445, "logps/rejected": -73.15096282958984, "loss": 0.6416, "rewards/accuracies": 0.96875, "rewards/chosen": 1.323624849319458, "rewards/margins": 2.4642977714538574, "rewards/rejected": -1.1406726837158203, "step": 371 }, { "epoch": 0.8524778000572901, "grad_norm": 9.183350563049316, "learning_rate": 9.14908256880734e-07, "logits/chosen": -2.7419021129608154, "logits/rejected": -2.7724928855895996, "logps/chosen": -41.6130485534668, "logps/rejected": -68.57496643066406, "loss": 0.6578, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1312973499298096, "rewards/margins": 2.0781896114349365, "rewards/rejected": -0.9468921422958374, "step": 372 }, { "epoch": 0.8547694070466915, "grad_norm": 8.321191787719727, "learning_rate": 9.146788990825688e-07, "logits/chosen": -2.7454886436462402, "logits/rejected": -2.75478196144104, "logps/chosen": -47.054054260253906, "logps/rejected": -69.8761215209961, "loss": 0.7255, "rewards/accuracies": 0.875, "rewards/chosen": 0.7174318432807922, "rewards/margins": 1.8497741222381592, "rewards/rejected": -1.1323423385620117, "step": 373 }, { "epoch": 0.8570610140360928, "grad_norm": 10.52584457397461, "learning_rate": 9.144495412844036e-07, "logits/chosen": -2.7431302070617676, "logits/rejected": -2.664504051208496, "logps/chosen": -51.599342346191406, "logps/rejected": -75.71045684814453, "loss": 0.7318, "rewards/accuracies": 0.84375, "rewards/chosen": 0.31080853939056396, "rewards/margins": 1.8349978923797607, "rewards/rejected": -1.5241892337799072, "step": 374 }, { "epoch": 0.8593526210254941, "grad_norm": 7.824237823486328, "learning_rate": 9.142201834862385e-07, "logits/chosen": -2.7518112659454346, "logits/rejected": -2.7789382934570312, "logps/chosen": -44.5767822265625, "logps/rejected": -68.94972229003906, "loss": 0.6896, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1076383590698242, "rewards/margins": 1.9776732921600342, "rewards/rejected": -0.8700348734855652, "step": 375 }, { "epoch": 0.8616442280148955, "grad_norm": 7.343204498291016, "learning_rate": 9.139908256880734e-07, "logits/chosen": -2.787119150161743, "logits/rejected": -2.7941062450408936, "logps/chosen": -46.42622375488281, "logps/rejected": -68.45121765136719, "loss": 0.7236, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9050140380859375, "rewards/margins": 1.687866449356079, "rewards/rejected": -0.7828524112701416, "step": 376 }, { "epoch": 0.8639358350042967, "grad_norm": 8.300143241882324, "learning_rate": 9.137614678899083e-07, "logits/chosen": -2.6554503440856934, "logits/rejected": -2.682154417037964, "logps/chosen": -48.37464141845703, "logps/rejected": -69.5357666015625, "loss": 0.717, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6498701572418213, "rewards/margins": 1.831261396408081, "rewards/rejected": -1.1813912391662598, "step": 377 }, { "epoch": 0.866227441993698, "grad_norm": 6.6996331214904785, "learning_rate": 9.13532110091743e-07, "logits/chosen": -2.7388298511505127, "logits/rejected": -2.6919984817504883, "logps/chosen": -48.89787292480469, "logps/rejected": -71.62584686279297, "loss": 0.7519, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5845589637756348, "rewards/margins": 1.758622407913208, "rewards/rejected": -1.1740634441375732, "step": 378 }, { "epoch": 0.8685190489830994, "grad_norm": 9.526315689086914, "learning_rate": 9.133027522935778e-07, "logits/chosen": -2.715291976928711, "logits/rejected": -2.7339487075805664, "logps/chosen": -48.31828689575195, "logps/rejected": -77.3013687133789, "loss": 0.6707, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6302147507667542, "rewards/margins": 2.243560791015625, "rewards/rejected": -1.6133463382720947, "step": 379 }, { "epoch": 0.8708106559725007, "grad_norm": 7.82873010635376, "learning_rate": 9.130733944954128e-07, "logits/chosen": -2.7596418857574463, "logits/rejected": -2.7541518211364746, "logps/chosen": -51.27600860595703, "logps/rejected": -70.28108215332031, "loss": 0.7777, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5115796327590942, "rewards/margins": 1.4998983144760132, "rewards/rejected": -0.9883187413215637, "step": 380 }, { "epoch": 0.8731022629619021, "grad_norm": 7.141968250274658, "learning_rate": 9.128440366972476e-07, "logits/chosen": -2.713554859161377, "logits/rejected": -2.6824042797088623, "logps/chosen": -47.313602447509766, "logps/rejected": -73.84039306640625, "loss": 0.6792, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7145927548408508, "rewards/margins": 2.0076632499694824, "rewards/rejected": -1.2930705547332764, "step": 381 }, { "epoch": 0.8753938699513033, "grad_norm": 6.397091388702393, "learning_rate": 9.126146788990825e-07, "logits/chosen": -2.725660800933838, "logits/rejected": -2.738344669342041, "logps/chosen": -47.78562927246094, "logps/rejected": -72.53717041015625, "loss": 0.6784, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7496761679649353, "rewards/margins": 2.043175220489502, "rewards/rejected": -1.293499231338501, "step": 382 }, { "epoch": 0.8776854769407046, "grad_norm": 6.927574634552002, "learning_rate": 9.123853211009174e-07, "logits/chosen": -2.7376368045806885, "logits/rejected": -2.727186441421509, "logps/chosen": -44.64984893798828, "logps/rejected": -66.76777648925781, "loss": 0.7094, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8037688136100769, "rewards/margins": 1.7768819332122803, "rewards/rejected": -0.9731131196022034, "step": 383 }, { "epoch": 0.879977083930106, "grad_norm": 6.42455530166626, "learning_rate": 9.121559633027523e-07, "logits/chosen": -2.6778953075408936, "logits/rejected": -2.6864612102508545, "logps/chosen": -48.1298828125, "logps/rejected": -73.22445678710938, "loss": 0.6758, "rewards/accuracies": 0.9375, "rewards/chosen": 0.74419766664505, "rewards/margins": 2.060321092605591, "rewards/rejected": -1.316123366355896, "step": 384 }, { "epoch": 0.8822686909195073, "grad_norm": 7.070319175720215, "learning_rate": 9.119266055045871e-07, "logits/chosen": -2.7775821685791016, "logits/rejected": -2.713993549346924, "logps/chosen": -49.413082122802734, "logps/rejected": -75.1528549194336, "loss": 0.7464, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5297504663467407, "rewards/margins": 1.8554396629333496, "rewards/rejected": -1.3256891965866089, "step": 385 }, { "epoch": 0.8845602979089087, "grad_norm": 7.951888561248779, "learning_rate": 9.116972477064219e-07, "logits/chosen": -2.7151899337768555, "logits/rejected": -2.7251014709472656, "logps/chosen": -49.07512664794922, "logps/rejected": -78.25099182128906, "loss": 0.6142, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6270815134048462, "rewards/margins": 2.369112491607666, "rewards/rejected": -1.7420309782028198, "step": 386 }, { "epoch": 0.8868519048983099, "grad_norm": 10.433921813964844, "learning_rate": 9.114678899082569e-07, "logits/chosen": -2.7843897342681885, "logits/rejected": -2.8217124938964844, "logps/chosen": -44.02106857299805, "logps/rejected": -65.03958129882812, "loss": 0.708, "rewards/accuracies": 0.75, "rewards/chosen": 1.1765029430389404, "rewards/margins": 1.7078680992126465, "rewards/rejected": -0.531365156173706, "step": 387 }, { "epoch": 0.8891435118877112, "grad_norm": 6.811026096343994, "learning_rate": 9.112385321100917e-07, "logits/chosen": -2.746073007583618, "logits/rejected": -2.733973741531372, "logps/chosen": -48.632904052734375, "logps/rejected": -67.22705841064453, "loss": 0.7093, "rewards/accuracies": 0.84375, "rewards/chosen": 0.715310275554657, "rewards/margins": 1.706075668334961, "rewards/rejected": -0.9907654523849487, "step": 388 }, { "epoch": 0.8914351188771126, "grad_norm": 7.673537731170654, "learning_rate": 9.110091743119266e-07, "logits/chosen": -2.698808431625366, "logits/rejected": -2.6730027198791504, "logps/chosen": -37.934940338134766, "logps/rejected": -60.221107482910156, "loss": 0.735, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3039029836654663, "rewards/margins": 1.701127290725708, "rewards/rejected": -0.3972243666648865, "step": 389 }, { "epoch": 0.8937267258665139, "grad_norm": 8.509215354919434, "learning_rate": 9.107798165137615e-07, "logits/chosen": -2.7202014923095703, "logits/rejected": -2.6901228427886963, "logps/chosen": -38.27848434448242, "logps/rejected": -68.07093811035156, "loss": 0.6473, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2349307537078857, "rewards/margins": 2.23230242729187, "rewards/rejected": -0.9973715543746948, "step": 390 }, { "epoch": 0.8960183328559153, "grad_norm": 10.64453125, "learning_rate": 9.105504587155963e-07, "logits/chosen": -2.786675453186035, "logits/rejected": -2.739593744277954, "logps/chosen": -42.709991455078125, "logps/rejected": -65.29847717285156, "loss": 0.7152, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1788134574890137, "rewards/margins": 1.838475227355957, "rewards/rejected": -0.6596617698669434, "step": 391 }, { "epoch": 0.8983099398453165, "grad_norm": 6.689558506011963, "learning_rate": 9.103211009174312e-07, "logits/chosen": -2.741288900375366, "logits/rejected": -2.7669010162353516, "logps/chosen": -42.72196578979492, "logps/rejected": -72.70289611816406, "loss": 0.6205, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1496365070343018, "rewards/margins": 2.2246205806732178, "rewards/rejected": -1.074984073638916, "step": 392 }, { "epoch": 0.9006015468347178, "grad_norm": 8.898541450500488, "learning_rate": 9.10091743119266e-07, "logits/chosen": -2.7004952430725098, "logits/rejected": -2.68522310256958, "logps/chosen": -49.23902893066406, "logps/rejected": -74.6203842163086, "loss": 0.6926, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8465957641601562, "rewards/margins": 2.164196252822876, "rewards/rejected": -1.3176007270812988, "step": 393 }, { "epoch": 0.9028931538241192, "grad_norm": 11.650423049926758, "learning_rate": 9.098623853211009e-07, "logits/chosen": -2.762080669403076, "logits/rejected": -2.766106128692627, "logps/chosen": -47.61907196044922, "logps/rejected": -70.38665771484375, "loss": 0.6795, "rewards/accuracies": 0.875, "rewards/chosen": 0.6862860918045044, "rewards/margins": 1.9908080101013184, "rewards/rejected": -1.3045220375061035, "step": 394 }, { "epoch": 0.9051847608135205, "grad_norm": 10.215896606445312, "learning_rate": 9.096330275229357e-07, "logits/chosen": -2.7408132553100586, "logits/rejected": -2.769883394241333, "logps/chosen": -45.617706298828125, "logps/rejected": -72.0541763305664, "loss": 0.6562, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6136986017227173, "rewards/margins": 2.1411256790161133, "rewards/rejected": -1.527427077293396, "step": 395 }, { "epoch": 0.9074763678029218, "grad_norm": 6.757777690887451, "learning_rate": 9.094036697247705e-07, "logits/chosen": -2.7366316318511963, "logits/rejected": -2.7400412559509277, "logps/chosen": -46.461177825927734, "logps/rejected": -77.24394989013672, "loss": 0.5735, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9502944350242615, "rewards/margins": 2.3198318481445312, "rewards/rejected": -1.369537591934204, "step": 396 }, { "epoch": 0.9097679747923231, "grad_norm": 9.470986366271973, "learning_rate": 9.091743119266054e-07, "logits/chosen": -2.8032546043395996, "logits/rejected": -2.7584452629089355, "logps/chosen": -54.965110778808594, "logps/rejected": -79.34161376953125, "loss": 0.7274, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07048926502466202, "rewards/margins": 2.024664878845215, "rewards/rejected": -1.9541754722595215, "step": 397 }, { "epoch": 0.9120595817817244, "grad_norm": 6.374336242675781, "learning_rate": 9.089449541284403e-07, "logits/chosen": -2.771104335784912, "logits/rejected": -2.7764086723327637, "logps/chosen": -49.976646423339844, "logps/rejected": -70.14351654052734, "loss": 0.7411, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4555395245552063, "rewards/margins": 1.6077816486358643, "rewards/rejected": -1.1522421836853027, "step": 398 }, { "epoch": 0.9143511887711258, "grad_norm": 7.049314022064209, "learning_rate": 9.087155963302752e-07, "logits/chosen": -2.7915401458740234, "logits/rejected": -2.7880077362060547, "logps/chosen": -55.66562271118164, "logps/rejected": -80.74363708496094, "loss": 0.7246, "rewards/accuracies": 0.90625, "rewards/chosen": 0.025149889290332794, "rewards/margins": 1.974671483039856, "rewards/rejected": -1.949521541595459, "step": 399 }, { "epoch": 0.9166427957605271, "grad_norm": 6.613648414611816, "learning_rate": 9.0848623853211e-07, "logits/chosen": -2.759977340698242, "logits/rejected": -2.726731061935425, "logps/chosen": -47.65987777709961, "logps/rejected": -72.19667053222656, "loss": 0.6505, "rewards/accuracies": 0.875, "rewards/chosen": 0.47656917572021484, "rewards/margins": 2.077627420425415, "rewards/rejected": -1.6010583639144897, "step": 400 }, { "epoch": 0.9189344027499284, "grad_norm": 5.279773235321045, "learning_rate": 9.082568807339449e-07, "logits/chosen": -2.7542343139648438, "logits/rejected": -2.7158610820770264, "logps/chosen": -52.690879821777344, "logps/rejected": -79.5162582397461, "loss": 0.714, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3964727818965912, "rewards/margins": 2.1565561294555664, "rewards/rejected": -1.7600836753845215, "step": 401 }, { "epoch": 0.9212260097393297, "grad_norm": 8.119961738586426, "learning_rate": 9.080275229357798e-07, "logits/chosen": -2.748384475708008, "logits/rejected": -2.691221237182617, "logps/chosen": -55.79926300048828, "logps/rejected": -78.38001251220703, "loss": 0.7576, "rewards/accuracies": 0.90625, "rewards/chosen": -0.04471273720264435, "rewards/margins": 1.8300073146820068, "rewards/rejected": -1.8747200965881348, "step": 402 }, { "epoch": 0.923517616728731, "grad_norm": 9.65650749206543, "learning_rate": 9.077981651376146e-07, "logits/chosen": -2.7143876552581787, "logits/rejected": -2.7239134311676025, "logps/chosen": -52.016380310058594, "logps/rejected": -75.1856918334961, "loss": 0.7064, "rewards/accuracies": 0.84375, "rewards/chosen": 0.46576008200645447, "rewards/margins": 1.8336331844329834, "rewards/rejected": -1.367873191833496, "step": 403 }, { "epoch": 0.9258092237181323, "grad_norm": 13.937490463256836, "learning_rate": 9.075688073394495e-07, "logits/chosen": -2.7145700454711914, "logits/rejected": -2.7676570415496826, "logps/chosen": -50.92052459716797, "logps/rejected": -83.29857635498047, "loss": 0.6407, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4339560270309448, "rewards/margins": 2.5425217151641846, "rewards/rejected": -2.1085658073425293, "step": 404 }, { "epoch": 0.9281008307075337, "grad_norm": 8.299779891967773, "learning_rate": 9.073394495412844e-07, "logits/chosen": -2.777062177658081, "logits/rejected": -2.7531580924987793, "logps/chosen": -52.042720794677734, "logps/rejected": -80.04518127441406, "loss": 0.7056, "rewards/accuracies": 0.875, "rewards/chosen": 0.3999495506286621, "rewards/margins": 2.168945074081421, "rewards/rejected": -1.768995761871338, "step": 405 }, { "epoch": 0.930392437696935, "grad_norm": 12.54997730255127, "learning_rate": 9.071100917431193e-07, "logits/chosen": -2.693125009536743, "logits/rejected": -2.7492265701293945, "logps/chosen": -50.45193862915039, "logps/rejected": -81.33460998535156, "loss": 0.6226, "rewards/accuracies": 0.90625, "rewards/chosen": 0.29338011145591736, "rewards/margins": 2.458252429962158, "rewards/rejected": -2.164872407913208, "step": 406 }, { "epoch": 0.9326840446863363, "grad_norm": 8.076930046081543, "learning_rate": 9.068807339449541e-07, "logits/chosen": -2.7934231758117676, "logits/rejected": -2.7428112030029297, "logps/chosen": -50.38451385498047, "logps/rejected": -70.9013671875, "loss": 0.7399, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5677551031112671, "rewards/margins": 1.6101031303405762, "rewards/rejected": -1.042348027229309, "step": 407 }, { "epoch": 0.9349756516757376, "grad_norm": 7.546566009521484, "learning_rate": 9.06651376146789e-07, "logits/chosen": -2.729219913482666, "logits/rejected": -2.7432408332824707, "logps/chosen": -50.57685089111328, "logps/rejected": -79.65044403076172, "loss": 0.6826, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4754323959350586, "rewards/margins": 2.0390522480010986, "rewards/rejected": -1.56361985206604, "step": 408 }, { "epoch": 0.9372672586651389, "grad_norm": 8.631101608276367, "learning_rate": 9.064220183486239e-07, "logits/chosen": -2.714142084121704, "logits/rejected": -2.683824300765991, "logps/chosen": -57.4214973449707, "logps/rejected": -82.50225830078125, "loss": 0.7337, "rewards/accuracies": 0.78125, "rewards/chosen": -0.03717140853404999, "rewards/margins": 1.915252923965454, "rewards/rejected": -1.95242440700531, "step": 409 }, { "epoch": 0.9395588656545403, "grad_norm": 9.689010620117188, "learning_rate": 9.061926605504586e-07, "logits/chosen": -2.702781915664673, "logits/rejected": -2.6919710636138916, "logps/chosen": -48.63930892944336, "logps/rejected": -74.49269104003906, "loss": 0.6731, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9475463032722473, "rewards/margins": 1.950898289680481, "rewards/rejected": -1.0033518075942993, "step": 410 }, { "epoch": 0.9418504726439416, "grad_norm": 8.039535522460938, "learning_rate": 9.059633027522935e-07, "logits/chosen": -2.7331111431121826, "logits/rejected": -2.737781524658203, "logps/chosen": -56.155216217041016, "logps/rejected": -73.90144348144531, "loss": 0.757, "rewards/accuracies": 0.78125, "rewards/chosen": 0.28696250915527344, "rewards/margins": 1.606863021850586, "rewards/rejected": -1.3199007511138916, "step": 411 }, { "epoch": 0.9441420796333428, "grad_norm": 11.112536430358887, "learning_rate": 9.057339449541284e-07, "logits/chosen": -2.742048740386963, "logits/rejected": -2.806425094604492, "logps/chosen": -44.66618347167969, "logps/rejected": -74.10813903808594, "loss": 0.6547, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1838195323944092, "rewards/margins": 2.2881133556365967, "rewards/rejected": -1.1042940616607666, "step": 412 }, { "epoch": 0.9464336866227442, "grad_norm": 11.426857948303223, "learning_rate": 9.055045871559632e-07, "logits/chosen": -2.793147563934326, "logits/rejected": -2.7973897457122803, "logps/chosen": -39.95058059692383, "logps/rejected": -64.71569061279297, "loss": 0.7219, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3672065734863281, "rewards/margins": 1.9127519130706787, "rewards/rejected": -0.5455451011657715, "step": 413 }, { "epoch": 0.9487252936121455, "grad_norm": 7.178503036499023, "learning_rate": 9.052752293577981e-07, "logits/chosen": -2.7146921157836914, "logits/rejected": -2.7224597930908203, "logps/chosen": -43.43101501464844, "logps/rejected": -67.8965835571289, "loss": 0.6957, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0599372386932373, "rewards/margins": 1.8743391036987305, "rewards/rejected": -0.8144019246101379, "step": 414 }, { "epoch": 0.9510169006015469, "grad_norm": 8.94846248626709, "learning_rate": 9.050458715596329e-07, "logits/chosen": -2.8503212928771973, "logits/rejected": -2.9097695350646973, "logps/chosen": -45.8375244140625, "logps/rejected": -71.54638671875, "loss": 0.6606, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8143361806869507, "rewards/margins": 2.2810161113739014, "rewards/rejected": -1.4666800498962402, "step": 415 }, { "epoch": 0.9533085075909482, "grad_norm": 8.648880004882812, "learning_rate": 9.048165137614679e-07, "logits/chosen": -2.70906925201416, "logits/rejected": -2.720691442489624, "logps/chosen": -42.78130340576172, "logps/rejected": -65.80652618408203, "loss": 0.7106, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0684142112731934, "rewards/margins": 1.873241662979126, "rewards/rejected": -0.8048274517059326, "step": 416 }, { "epoch": 0.9556001145803494, "grad_norm": 8.596169471740723, "learning_rate": 9.045871559633027e-07, "logits/chosen": -2.7704873085021973, "logits/rejected": -2.783348321914673, "logps/chosen": -48.29999923706055, "logps/rejected": -78.34453582763672, "loss": 0.6301, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8111220598220825, "rewards/margins": 2.4723384380340576, "rewards/rejected": -1.6612164974212646, "step": 417 }, { "epoch": 0.9578917215697508, "grad_norm": 7.927072048187256, "learning_rate": 9.043577981651375e-07, "logits/chosen": -2.7368218898773193, "logits/rejected": -2.7844035625457764, "logps/chosen": -43.12287139892578, "logps/rejected": -72.55984497070312, "loss": 0.6294, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1935267448425293, "rewards/margins": 2.3740363121032715, "rewards/rejected": -1.1805095672607422, "step": 418 }, { "epoch": 0.9601833285591521, "grad_norm": 11.54346752166748, "learning_rate": 9.041284403669725e-07, "logits/chosen": -2.800002098083496, "logits/rejected": -2.7253103256225586, "logps/chosen": -47.71730041503906, "logps/rejected": -78.07161712646484, "loss": 0.6217, "rewards/accuracies": 0.9375, "rewards/chosen": 0.733596920967102, "rewards/margins": 2.6399056911468506, "rewards/rejected": -1.906308889389038, "step": 419 }, { "epoch": 0.9624749355485535, "grad_norm": 7.409751892089844, "learning_rate": 9.038990825688073e-07, "logits/chosen": -2.7423105239868164, "logits/rejected": -2.771604299545288, "logps/chosen": -47.09276580810547, "logps/rejected": -76.11859893798828, "loss": 0.6115, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6714968681335449, "rewards/margins": 2.5065853595733643, "rewards/rejected": -1.8350884914398193, "step": 420 }, { "epoch": 0.9647665425379547, "grad_norm": 7.634303569793701, "learning_rate": 9.036697247706422e-07, "logits/chosen": -2.7336645126342773, "logits/rejected": -2.7634127140045166, "logps/chosen": -49.20062255859375, "logps/rejected": -69.2663803100586, "loss": 0.796, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5704700946807861, "rewards/margins": 1.7008341550827026, "rewards/rejected": -1.1303640604019165, "step": 421 }, { "epoch": 0.967058149527356, "grad_norm": 8.875053405761719, "learning_rate": 9.03440366972477e-07, "logits/chosen": -2.708442211151123, "logits/rejected": -2.7503669261932373, "logps/chosen": -41.55755615234375, "logps/rejected": -68.66255187988281, "loss": 0.6974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.104544758796692, "rewards/margins": 2.0328073501586914, "rewards/rejected": -0.9282625913619995, "step": 422 }, { "epoch": 0.9693497565167574, "grad_norm": 5.845530033111572, "learning_rate": 9.032110091743119e-07, "logits/chosen": -2.725614547729492, "logits/rejected": -2.74137544631958, "logps/chosen": -48.41815185546875, "logps/rejected": -74.57381439208984, "loss": 0.7109, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4564259946346283, "rewards/margins": 1.9103673696517944, "rewards/rejected": -1.4539413452148438, "step": 423 }, { "epoch": 0.9716413635061587, "grad_norm": 8.816184997558594, "learning_rate": 9.029816513761468e-07, "logits/chosen": -2.799814224243164, "logits/rejected": -2.8424906730651855, "logps/chosen": -44.89625549316406, "logps/rejected": -72.70471954345703, "loss": 0.6289, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9585426449775696, "rewards/margins": 2.3457560539245605, "rewards/rejected": -1.3872135877609253, "step": 424 }, { "epoch": 0.97393297049556, "grad_norm": 7.78678560256958, "learning_rate": 9.027522935779816e-07, "logits/chosen": -2.795078754425049, "logits/rejected": -2.8196489810943604, "logps/chosen": -48.46710968017578, "logps/rejected": -72.2926025390625, "loss": 0.7091, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5791400074958801, "rewards/margins": 2.0177855491638184, "rewards/rejected": -1.438645362854004, "step": 425 }, { "epoch": 0.9762245774849613, "grad_norm": 6.533586025238037, "learning_rate": 9.025229357798165e-07, "logits/chosen": -2.7906417846679688, "logits/rejected": -2.7955217361450195, "logps/chosen": -48.69097137451172, "logps/rejected": -78.9268569946289, "loss": 0.6395, "rewards/accuracies": 0.90625, "rewards/chosen": 0.42373284697532654, "rewards/margins": 2.344146966934204, "rewards/rejected": -1.9204140901565552, "step": 426 }, { "epoch": 0.9785161844743626, "grad_norm": 7.144700050354004, "learning_rate": 9.022935779816513e-07, "logits/chosen": -2.7437682151794434, "logits/rejected": -2.7830138206481934, "logps/chosen": -40.468910217285156, "logps/rejected": -67.41027069091797, "loss": 0.7175, "rewards/accuracies": 0.875, "rewards/chosen": 1.161961555480957, "rewards/margins": 2.131009340286255, "rewards/rejected": -0.969048023223877, "step": 427 }, { "epoch": 0.980807791463764, "grad_norm": 10.583667755126953, "learning_rate": 9.020642201834862e-07, "logits/chosen": -2.7499661445617676, "logits/rejected": -2.733613967895508, "logps/chosen": -46.74424743652344, "logps/rejected": -66.19168090820312, "loss": 0.6958, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8908998370170593, "rewards/margins": 1.921811819076538, "rewards/rejected": -1.030911922454834, "step": 428 }, { "epoch": 0.9830993984531653, "grad_norm": 8.09971809387207, "learning_rate": 9.01834862385321e-07, "logits/chosen": -2.7389049530029297, "logits/rejected": -2.733347177505493, "logps/chosen": -41.22541046142578, "logps/rejected": -62.48960494995117, "loss": 0.7303, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2534213066101074, "rewards/margins": 1.6630052328109741, "rewards/rejected": -0.40958383679389954, "step": 429 }, { "epoch": 0.9853910054425666, "grad_norm": 14.506853103637695, "learning_rate": 9.016055045871559e-07, "logits/chosen": -2.831915855407715, "logits/rejected": -2.8483974933624268, "logps/chosen": -37.78094482421875, "logps/rejected": -70.79369354248047, "loss": 0.5764, "rewards/accuracies": 0.9375, "rewards/chosen": 1.691823959350586, "rewards/margins": 2.7666265964508057, "rewards/rejected": -1.0748029947280884, "step": 430 }, { "epoch": 0.9876826124319679, "grad_norm": 9.374785423278809, "learning_rate": 9.013761467889908e-07, "logits/chosen": -2.8057284355163574, "logits/rejected": -2.773116111755371, "logps/chosen": -43.53971862792969, "logps/rejected": -75.99028778076172, "loss": 0.6415, "rewards/accuracies": 0.9375, "rewards/chosen": 1.059090256690979, "rewards/margins": 2.5460832118988037, "rewards/rejected": -1.4869928359985352, "step": 431 }, { "epoch": 0.9899742194213692, "grad_norm": 8.892009735107422, "learning_rate": 9.011467889908256e-07, "logits/chosen": -2.892439365386963, "logits/rejected": -2.8205831050872803, "logps/chosen": -51.72599792480469, "logps/rejected": -74.96505737304688, "loss": 0.7029, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35989266633987427, "rewards/margins": 2.048736572265625, "rewards/rejected": -1.688843846321106, "step": 432 }, { "epoch": 0.9922658264107705, "grad_norm": 8.312777519226074, "learning_rate": 9.009174311926606e-07, "logits/chosen": -2.749232292175293, "logits/rejected": -2.757455587387085, "logps/chosen": -50.287940979003906, "logps/rejected": -78.397705078125, "loss": 0.6488, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6706288456916809, "rewards/margins": 2.4196841716766357, "rewards/rejected": -1.7490553855895996, "step": 433 }, { "epoch": 0.9945574334001719, "grad_norm": 8.194364547729492, "learning_rate": 9.006880733944954e-07, "logits/chosen": -2.82466983795166, "logits/rejected": -2.7689361572265625, "logps/chosen": -48.50038146972656, "logps/rejected": -71.37777709960938, "loss": 0.707, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8125506639480591, "rewards/margins": 1.9992581605911255, "rewards/rejected": -1.1867077350616455, "step": 434 }, { "epoch": 0.9968490403895732, "grad_norm": 7.903297424316406, "learning_rate": 9.004587155963302e-07, "logits/chosen": -2.754586935043335, "logits/rejected": -2.7767300605773926, "logps/chosen": -56.19023132324219, "logps/rejected": -80.89680480957031, "loss": 0.678, "rewards/accuracies": 0.875, "rewards/chosen": 0.03153020143508911, "rewards/margins": 2.0751821994781494, "rewards/rejected": -2.043652296066284, "step": 435 }, { "epoch": 0.9991406473789745, "grad_norm": 6.758319854736328, "learning_rate": 9.002293577981651e-07, "logits/chosen": -2.750351905822754, "logits/rejected": -2.7173047065734863, "logps/chosen": -51.40251541137695, "logps/rejected": -68.26502227783203, "loss": 0.785, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3332269489765167, "rewards/margins": 1.4711549282073975, "rewards/rejected": -1.1379280090332031, "step": 436 }, { "epoch": 1.0, "grad_norm": 6.514362812042236, "learning_rate": 9e-07, "logits/chosen": -2.886584997177124, "logits/rejected": -2.894085168838501, "logps/chosen": -44.31527328491211, "logps/rejected": -78.81527709960938, "loss": 0.222, "rewards/accuracies": 1.0, "rewards/chosen": 0.7636635899543762, "rewards/margins": 2.6828525066375732, "rewards/rejected": -1.9191888570785522, "step": 437 }, { "epoch": 1.0022916069894012, "grad_norm": 8.84343433380127, "learning_rate": 8.997706422018349e-07, "logits/chosen": -2.8203022480010986, "logits/rejected": -2.8167500495910645, "logps/chosen": -39.745304107666016, "logps/rejected": -71.2107162475586, "loss": 0.6529, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0669636726379395, "rewards/margins": 2.372133255004883, "rewards/rejected": -1.305169701576233, "step": 438 }, { "epoch": 1.0045832139788027, "grad_norm": 12.805978775024414, "learning_rate": 8.995412844036697e-07, "logits/chosen": -2.763704299926758, "logits/rejected": -2.804750919342041, "logps/chosen": -44.254112243652344, "logps/rejected": -69.78892517089844, "loss": 0.6524, "rewards/accuracies": 0.875, "rewards/chosen": 0.9020769596099854, "rewards/margins": 2.4430882930755615, "rewards/rejected": -1.5410114526748657, "step": 439 }, { "epoch": 1.006874820968204, "grad_norm": 11.017248153686523, "learning_rate": 8.993119266055045e-07, "logits/chosen": -2.8185837268829346, "logits/rejected": -2.809269666671753, "logps/chosen": -51.373958587646484, "logps/rejected": -87.77285766601562, "loss": 0.636, "rewards/accuracies": 1.0, "rewards/chosen": 0.34791797399520874, "rewards/margins": 2.9507548809051514, "rewards/rejected": -2.602837085723877, "step": 440 }, { "epoch": 1.006874820968204, "eval_logits/chosen": -2.8833000659942627, "eval_logits/rejected": -2.8941314220428467, "eval_logps/chosen": -60.151790618896484, "eval_logps/rejected": -82.09523010253906, "eval_loss": 0.7669705748558044, "eval_rewards/accuracies": 0.8169811367988586, "eval_rewards/chosen": -0.30894359946250916, "eval_rewards/margins": 1.79563570022583, "eval_rewards/rejected": -2.104579210281372, "eval_runtime": 960.5031, "eval_samples_per_second": 0.551, "eval_steps_per_second": 0.276, "step": 440 }, { "epoch": 1.0091664279576054, "grad_norm": 10.22191333770752, "learning_rate": 8.990825688073395e-07, "logits/chosen": -2.844771146774292, "logits/rejected": -2.8321378231048584, "logps/chosen": -49.0184326171875, "logps/rejected": -74.07817077636719, "loss": 0.698, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31309181451797485, "rewards/margins": 1.9963730573654175, "rewards/rejected": -1.683281421661377, "step": 441 }, { "epoch": 1.0114580349470066, "grad_norm": 7.532583236694336, "learning_rate": 8.988532110091742e-07, "logits/chosen": -2.8089723587036133, "logits/rejected": -2.7707576751708984, "logps/chosen": -46.991722106933594, "logps/rejected": -79.9345932006836, "loss": 0.664, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8038578033447266, "rewards/margins": 2.5324623584747314, "rewards/rejected": -1.7286044359207153, "step": 442 }, { "epoch": 1.0137496419364078, "grad_norm": 8.309694290161133, "learning_rate": 8.986238532110091e-07, "logits/chosen": -2.7237539291381836, "logits/rejected": -2.787379026412964, "logps/chosen": -50.422996520996094, "logps/rejected": -87.18121337890625, "loss": 0.63, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3918379247188568, "rewards/margins": 2.8707785606384277, "rewards/rejected": -2.478940486907959, "step": 443 }, { "epoch": 1.0160412489258093, "grad_norm": 8.300362586975098, "learning_rate": 8.98394495412844e-07, "logits/chosen": -2.8439385890960693, "logits/rejected": -2.800447463989258, "logps/chosen": -42.81344223022461, "logps/rejected": -75.71028900146484, "loss": 0.6377, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1054922342300415, "rewards/margins": 2.5470855236053467, "rewards/rejected": -1.4415932893753052, "step": 444 }, { "epoch": 1.0183328559152105, "grad_norm": 7.564792156219482, "learning_rate": 8.981651376146788e-07, "logits/chosen": -2.8907241821289062, "logits/rejected": -2.8643343448638916, "logps/chosen": -47.94707107543945, "logps/rejected": -75.95658874511719, "loss": 0.6547, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6862834692001343, "rewards/margins": 2.4329540729522705, "rewards/rejected": -1.7466707229614258, "step": 445 }, { "epoch": 1.020624462904612, "grad_norm": 9.834857940673828, "learning_rate": 8.979357798165137e-07, "logits/chosen": -2.8066935539245605, "logits/rejected": -2.7749223709106445, "logps/chosen": -53.6800537109375, "logps/rejected": -82.76081848144531, "loss": 0.649, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3008660674095154, "rewards/margins": 2.2683188915252686, "rewards/rejected": -1.9674530029296875, "step": 446 }, { "epoch": 1.0229160698940132, "grad_norm": 8.2918701171875, "learning_rate": 8.977064220183485e-07, "logits/chosen": -2.8121068477630615, "logits/rejected": -2.772590398788452, "logps/chosen": -51.232566833496094, "logps/rejected": -72.18952178955078, "loss": 0.7812, "rewards/accuracies": 0.75, "rewards/chosen": 0.393094539642334, "rewards/margins": 1.646592617034912, "rewards/rejected": -1.253497838973999, "step": 447 }, { "epoch": 1.0252076768834144, "grad_norm": 10.609406471252441, "learning_rate": 8.974770642201835e-07, "logits/chosen": -2.769726514816284, "logits/rejected": -2.769613027572632, "logps/chosen": -40.45103073120117, "logps/rejected": -68.64353942871094, "loss": 0.6991, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1942975521087646, "rewards/margins": 2.1922452449798584, "rewards/rejected": -0.9979475736618042, "step": 448 }, { "epoch": 1.0274992838728159, "grad_norm": 6.89447021484375, "learning_rate": 8.972477064220183e-07, "logits/chosen": -2.828684091567993, "logits/rejected": -2.836768388748169, "logps/chosen": -45.327308654785156, "logps/rejected": -78.8598403930664, "loss": 0.624, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8639203906059265, "rewards/margins": 2.7598071098327637, "rewards/rejected": -1.895886778831482, "step": 449 }, { "epoch": 1.029790890862217, "grad_norm": 8.9567232131958, "learning_rate": 8.970183486238532e-07, "logits/chosen": -2.793862819671631, "logits/rejected": -2.804239511489868, "logps/chosen": -43.326995849609375, "logps/rejected": -76.90676879882812, "loss": 0.6052, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9383904337882996, "rewards/margins": 2.5949392318725586, "rewards/rejected": -1.6565488576889038, "step": 450 }, { "epoch": 1.0320824978516185, "grad_norm": 9.043886184692383, "learning_rate": 8.96788990825688e-07, "logits/chosen": -2.777462959289551, "logits/rejected": -2.8296263217926025, "logps/chosen": -39.20549011230469, "logps/rejected": -74.012939453125, "loss": 0.6125, "rewards/accuracies": 0.875, "rewards/chosen": 1.3943498134613037, "rewards/margins": 2.7172489166259766, "rewards/rejected": -1.3228991031646729, "step": 451 }, { "epoch": 1.0343741048410198, "grad_norm": 8.76190185546875, "learning_rate": 8.965596330275229e-07, "logits/chosen": -2.8077988624572754, "logits/rejected": -2.815592050552368, "logps/chosen": -52.941070556640625, "logps/rejected": -75.7795181274414, "loss": 0.6853, "rewards/accuracies": 0.90625, "rewards/chosen": 0.26810532808303833, "rewards/margins": 2.1129202842712402, "rewards/rejected": -1.8448148965835571, "step": 452 }, { "epoch": 1.036665711830421, "grad_norm": 7.038501739501953, "learning_rate": 8.963302752293578e-07, "logits/chosen": -2.7688400745391846, "logits/rejected": -2.776233434677124, "logps/chosen": -49.878883361816406, "logps/rejected": -80.35889434814453, "loss": 0.6491, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6442499160766602, "rewards/margins": 2.5554957389831543, "rewards/rejected": -1.9112457036972046, "step": 453 }, { "epoch": 1.0389573188198225, "grad_norm": 7.132302284240723, "learning_rate": 8.961009174311926e-07, "logits/chosen": -2.8057854175567627, "logits/rejected": -2.817300796508789, "logps/chosen": -41.792606353759766, "logps/rejected": -74.14578247070312, "loss": 0.618, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2740156650543213, "rewards/margins": 2.754734754562378, "rewards/rejected": -1.480718970298767, "step": 454 }, { "epoch": 1.0412489258092237, "grad_norm": 8.340088844299316, "learning_rate": 8.958715596330276e-07, "logits/chosen": -2.8366432189941406, "logits/rejected": -2.8439364433288574, "logps/chosen": -46.368221282958984, "logps/rejected": -71.62423706054688, "loss": 0.7175, "rewards/accuracies": 0.875, "rewards/chosen": 1.0166372060775757, "rewards/margins": 2.1766035556793213, "rewards/rejected": -1.1599663496017456, "step": 455 }, { "epoch": 1.0435405327986251, "grad_norm": 7.4487738609313965, "learning_rate": 8.956422018348624e-07, "logits/chosen": -2.8366012573242188, "logits/rejected": -2.830146312713623, "logps/chosen": -43.56428909301758, "logps/rejected": -72.23257446289062, "loss": 0.6938, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8031867742538452, "rewards/margins": 2.1863622665405273, "rewards/rejected": -1.3831754922866821, "step": 456 }, { "epoch": 1.0458321397880264, "grad_norm": 9.834014892578125, "learning_rate": 8.954128440366972e-07, "logits/chosen": -2.7667810916900635, "logits/rejected": -2.7377734184265137, "logps/chosen": -42.631622314453125, "logps/rejected": -70.1426773071289, "loss": 0.6617, "rewards/accuracies": 0.78125, "rewards/chosen": 1.404090404510498, "rewards/margins": 2.3283324241638184, "rewards/rejected": -0.924241840839386, "step": 457 }, { "epoch": 1.0481237467774276, "grad_norm": 8.110539436340332, "learning_rate": 8.95183486238532e-07, "logits/chosen": -2.7756967544555664, "logits/rejected": -2.7934839725494385, "logps/chosen": -47.69640350341797, "logps/rejected": -68.3932876586914, "loss": 0.7775, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6265788674354553, "rewards/margins": 1.5983203649520874, "rewards/rejected": -0.9717414379119873, "step": 458 }, { "epoch": 1.050415353766829, "grad_norm": 7.45346736907959, "learning_rate": 8.949541284403669e-07, "logits/chosen": -2.6968345642089844, "logits/rejected": -2.697496175765991, "logps/chosen": -45.6706657409668, "logps/rejected": -75.88806915283203, "loss": 0.6432, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9136272072792053, "rewards/margins": 2.368720531463623, "rewards/rejected": -1.4550933837890625, "step": 459 }, { "epoch": 1.0527069607562303, "grad_norm": 17.004560470581055, "learning_rate": 8.947247706422018e-07, "logits/chosen": -2.770517587661743, "logits/rejected": -2.739473342895508, "logps/chosen": -41.483184814453125, "logps/rejected": -70.99545288085938, "loss": 0.6367, "rewards/accuracies": 0.875, "rewards/chosen": 1.2333921194076538, "rewards/margins": 2.4232828617095947, "rewards/rejected": -1.1898908615112305, "step": 460 }, { "epoch": 1.0549985677456317, "grad_norm": 13.120827674865723, "learning_rate": 8.944954128440366e-07, "logits/chosen": -2.804180860519409, "logits/rejected": -2.7794270515441895, "logps/chosen": -40.174903869628906, "logps/rejected": -73.34671020507812, "loss": 0.5938, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4173123836517334, "rewards/margins": 2.768610954284668, "rewards/rejected": -1.351298451423645, "step": 461 }, { "epoch": 1.057290174735033, "grad_norm": 6.838485240936279, "learning_rate": 8.942660550458714e-07, "logits/chosen": -2.753584384918213, "logits/rejected": -2.771488904953003, "logps/chosen": -39.026206970214844, "logps/rejected": -67.5219497680664, "loss": 0.6519, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3675246238708496, "rewards/margins": 2.447758913040161, "rewards/rejected": -1.080234408378601, "step": 462 }, { "epoch": 1.0595817817244342, "grad_norm": 6.329094409942627, "learning_rate": 8.940366972477064e-07, "logits/chosen": -2.771798610687256, "logits/rejected": -2.7272679805755615, "logps/chosen": -57.00338363647461, "logps/rejected": -85.99834442138672, "loss": 0.6943, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2941244840621948, "rewards/margins": 2.3838772773742676, "rewards/rejected": -2.0897529125213623, "step": 463 }, { "epoch": 1.0618733887138356, "grad_norm": 7.811226844787598, "learning_rate": 8.938073394495412e-07, "logits/chosen": -2.7761504650115967, "logits/rejected": -2.8349995613098145, "logps/chosen": -46.26371765136719, "logps/rejected": -73.21125030517578, "loss": 0.6528, "rewards/accuracies": 0.9375, "rewards/chosen": 0.69087153673172, "rewards/margins": 2.2521443367004395, "rewards/rejected": -1.5612727403640747, "step": 464 }, { "epoch": 1.0641649957032369, "grad_norm": 7.911195278167725, "learning_rate": 8.935779816513761e-07, "logits/chosen": -2.7878384590148926, "logits/rejected": -2.785799980163574, "logps/chosen": -43.57362365722656, "logps/rejected": -75.55854034423828, "loss": 0.5904, "rewards/accuracies": 0.90625, "rewards/chosen": 1.027486801147461, "rewards/margins": 2.7770090103149414, "rewards/rejected": -1.7495219707489014, "step": 465 }, { "epoch": 1.0664566026926383, "grad_norm": 10.240463256835938, "learning_rate": 8.93348623853211e-07, "logits/chosen": -2.851325511932373, "logits/rejected": -2.8410708904266357, "logps/chosen": -46.910675048828125, "logps/rejected": -75.26744842529297, "loss": 0.6432, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7681266069412231, "rewards/margins": 2.459961414337158, "rewards/rejected": -1.6918349266052246, "step": 466 }, { "epoch": 1.0687482096820395, "grad_norm": 7.898848056793213, "learning_rate": 8.931192660550458e-07, "logits/chosen": -2.7893412113189697, "logits/rejected": -2.8135159015655518, "logps/chosen": -41.89862823486328, "logps/rejected": -79.29637145996094, "loss": 0.5724, "rewards/accuracies": 0.875, "rewards/chosen": 1.3487550020217896, "rewards/margins": 3.112854242324829, "rewards/rejected": -1.764099359512329, "step": 467 }, { "epoch": 1.0710398166714408, "grad_norm": 15.152456283569336, "learning_rate": 8.928899082568807e-07, "logits/chosen": -2.7548770904541016, "logits/rejected": -2.783970355987549, "logps/chosen": -46.076202392578125, "logps/rejected": -75.59078979492188, "loss": 0.647, "rewards/accuracies": 1.0, "rewards/chosen": 0.903243899345398, "rewards/margins": 2.4145212173461914, "rewards/rejected": -1.511277198791504, "step": 468 }, { "epoch": 1.0733314236608422, "grad_norm": 9.154630661010742, "learning_rate": 8.926605504587155e-07, "logits/chosen": -2.7823708057403564, "logits/rejected": -2.7971911430358887, "logps/chosen": -52.26249694824219, "logps/rejected": -79.72274780273438, "loss": 0.6648, "rewards/accuracies": 0.90625, "rewards/chosen": 0.38032227754592896, "rewards/margins": 2.3884456157684326, "rewards/rejected": -2.0081231594085693, "step": 469 }, { "epoch": 1.0756230306502435, "grad_norm": 20.803369522094727, "learning_rate": 8.924311926605505e-07, "logits/chosen": -2.7873244285583496, "logits/rejected": -2.78398060798645, "logps/chosen": -52.01904296875, "logps/rejected": -82.55859375, "loss": 0.6164, "rewards/accuracies": 0.84375, "rewards/chosen": 0.617662250995636, "rewards/margins": 2.7668514251708984, "rewards/rejected": -2.1491892337799072, "step": 470 }, { "epoch": 1.077914637639645, "grad_norm": 9.757161140441895, "learning_rate": 8.922018348623853e-07, "logits/chosen": -2.845318555831909, "logits/rejected": -2.8029141426086426, "logps/chosen": -50.16367721557617, "logps/rejected": -83.20915985107422, "loss": 0.6167, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5335133075714111, "rewards/margins": 2.8379502296447754, "rewards/rejected": -2.304436683654785, "step": 471 }, { "epoch": 1.0802062446290461, "grad_norm": 8.495870590209961, "learning_rate": 8.919724770642202e-07, "logits/chosen": -2.9099314212799072, "logits/rejected": -2.9141483306884766, "logps/chosen": -48.37239074707031, "logps/rejected": -79.82489013671875, "loss": 0.6556, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6955418586730957, "rewards/margins": 2.5942530632019043, "rewards/rejected": -1.8987112045288086, "step": 472 }, { "epoch": 1.0824978516184474, "grad_norm": 7.336570739746094, "learning_rate": 8.917431192660551e-07, "logits/chosen": -2.8313989639282227, "logits/rejected": -2.735469341278076, "logps/chosen": -54.174346923828125, "logps/rejected": -80.20145416259766, "loss": 0.7045, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3062875270843506, "rewards/margins": 2.1578428745269775, "rewards/rejected": -1.851555347442627, "step": 473 }, { "epoch": 1.0847894586078488, "grad_norm": 11.338467597961426, "learning_rate": 8.915137614678898e-07, "logits/chosen": -2.792173147201538, "logits/rejected": -2.7895660400390625, "logps/chosen": -42.700904846191406, "logps/rejected": -70.7333755493164, "loss": 0.6779, "rewards/accuracies": 0.9375, "rewards/chosen": 1.086087703704834, "rewards/margins": 2.242410898208618, "rewards/rejected": -1.1563231945037842, "step": 474 }, { "epoch": 1.08708106559725, "grad_norm": 9.568601608276367, "learning_rate": 8.912844036697247e-07, "logits/chosen": -2.835120439529419, "logits/rejected": -2.8701751232147217, "logps/chosen": -43.93636703491211, "logps/rejected": -85.43431854248047, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 1.083711862564087, "rewards/margins": 3.3357088565826416, "rewards/rejected": -2.2519969940185547, "step": 475 }, { "epoch": 1.0893726725866515, "grad_norm": 6.615148544311523, "learning_rate": 8.910550458715595e-07, "logits/chosen": -2.8237392902374268, "logits/rejected": -2.7969841957092285, "logps/chosen": -56.269500732421875, "logps/rejected": -83.34374237060547, "loss": 0.6973, "rewards/accuracies": 0.875, "rewards/chosen": -0.04543560370802879, "rewards/margins": 2.351191282272339, "rewards/rejected": -2.3966267108917236, "step": 476 }, { "epoch": 1.0916642795760527, "grad_norm": 8.708864212036133, "learning_rate": 8.908256880733945e-07, "logits/chosen": -2.8154656887054443, "logits/rejected": -2.7734663486480713, "logps/chosen": -55.96074295043945, "logps/rejected": -79.2961654663086, "loss": 0.7616, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2487991750240326, "rewards/margins": 1.9365441799163818, "rewards/rejected": -2.1853432655334473, "step": 477 }, { "epoch": 1.093955886565454, "grad_norm": 8.834033012390137, "learning_rate": 8.905963302752293e-07, "logits/chosen": -2.7728111743927, "logits/rejected": -2.765341281890869, "logps/chosen": -47.900413513183594, "logps/rejected": -84.6274642944336, "loss": 0.5721, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4942557215690613, "rewards/margins": 2.970231533050537, "rewards/rejected": -2.475975751876831, "step": 478 }, { "epoch": 1.0962474935548554, "grad_norm": 9.011329650878906, "learning_rate": 8.903669724770641e-07, "logits/chosen": -2.766447067260742, "logits/rejected": -2.7550668716430664, "logps/chosen": -44.326053619384766, "logps/rejected": -71.64938354492188, "loss": 0.661, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1495956182479858, "rewards/margins": 2.3851351737976074, "rewards/rejected": -1.2355395555496216, "step": 479 }, { "epoch": 1.0985391005442566, "grad_norm": 10.14436149597168, "learning_rate": 8.901376146788991e-07, "logits/chosen": -2.8444125652313232, "logits/rejected": -2.819899320602417, "logps/chosen": -50.46723175048828, "logps/rejected": -85.18377685546875, "loss": 0.5951, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5120551586151123, "rewards/margins": 2.9782137870788574, "rewards/rejected": -2.466158628463745, "step": 480 }, { "epoch": 1.100830707533658, "grad_norm": 7.310146808624268, "learning_rate": 8.899082568807339e-07, "logits/chosen": -2.791428565979004, "logits/rejected": -2.8220643997192383, "logps/chosen": -54.44218826293945, "logps/rejected": -85.884521484375, "loss": 0.6541, "rewards/accuracies": 0.875, "rewards/chosen": 0.14546652138233185, "rewards/margins": 2.487020254135132, "rewards/rejected": -2.3415536880493164, "step": 481 }, { "epoch": 1.1031223145230593, "grad_norm": 8.923394203186035, "learning_rate": 8.896788990825688e-07, "logits/chosen": -2.837825298309326, "logits/rejected": -2.82777738571167, "logps/chosen": -48.24268341064453, "logps/rejected": -78.75312042236328, "loss": 0.6371, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7314746379852295, "rewards/margins": 2.597072124481201, "rewards/rejected": -1.8655977249145508, "step": 482 }, { "epoch": 1.1054139215124605, "grad_norm": 9.32580852508545, "learning_rate": 8.894495412844036e-07, "logits/chosen": -2.8827359676361084, "logits/rejected": -2.860692024230957, "logps/chosen": -47.26598358154297, "logps/rejected": -76.32130432128906, "loss": 0.6784, "rewards/accuracies": 0.84375, "rewards/chosen": 0.683820366859436, "rewards/margins": 2.0006027221679688, "rewards/rejected": -1.3167823553085327, "step": 483 }, { "epoch": 1.107705528501862, "grad_norm": 9.467401504516602, "learning_rate": 8.892201834862385e-07, "logits/chosen": -2.7600014209747314, "logits/rejected": -2.7591333389282227, "logps/chosen": -56.430450439453125, "logps/rejected": -80.57537841796875, "loss": 0.7654, "rewards/accuracies": 0.875, "rewards/chosen": -0.23018872737884521, "rewards/margins": 2.0441808700561523, "rewards/rejected": -2.274369478225708, "step": 484 }, { "epoch": 1.1099971354912632, "grad_norm": 9.314543724060059, "learning_rate": 8.889908256880734e-07, "logits/chosen": -2.787613868713379, "logits/rejected": -2.8333470821380615, "logps/chosen": -46.745826721191406, "logps/rejected": -77.63737487792969, "loss": 0.6497, "rewards/accuracies": 0.96875, "rewards/chosen": 0.867034912109375, "rewards/margins": 2.450507402420044, "rewards/rejected": -1.583472728729248, "step": 485 }, { "epoch": 1.1122887424806644, "grad_norm": 7.779623985290527, "learning_rate": 8.887614678899082e-07, "logits/chosen": -2.8411638736724854, "logits/rejected": -2.824366807937622, "logps/chosen": -42.67913055419922, "logps/rejected": -63.16246795654297, "loss": 0.7592, "rewards/accuracies": 0.875, "rewards/chosen": 1.179063081741333, "rewards/margins": 1.7720046043395996, "rewards/rejected": -0.592941403388977, "step": 486 }, { "epoch": 1.114580349470066, "grad_norm": 9.723306655883789, "learning_rate": 8.885321100917432e-07, "logits/chosen": -2.85115647315979, "logits/rejected": -2.801940441131592, "logps/chosen": -48.04238510131836, "logps/rejected": -77.60843658447266, "loss": 0.6136, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7688889503479004, "rewards/margins": 2.5753633975982666, "rewards/rejected": -1.8064743280410767, "step": 487 }, { "epoch": 1.1168719564594671, "grad_norm": 8.20394515991211, "learning_rate": 8.88302752293578e-07, "logits/chosen": -2.765845537185669, "logits/rejected": -2.7680015563964844, "logps/chosen": -44.897254943847656, "logps/rejected": -69.21036529541016, "loss": 0.6678, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1509498357772827, "rewards/margins": 2.096869945526123, "rewards/rejected": -0.9459200501441956, "step": 488 }, { "epoch": 1.1191635634488686, "grad_norm": 7.629037380218506, "learning_rate": 8.880733944954128e-07, "logits/chosen": -2.921414852142334, "logits/rejected": -2.921579360961914, "logps/chosen": -60.385562896728516, "logps/rejected": -82.1639404296875, "loss": 0.7596, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21509087085723877, "rewards/margins": 1.8477493524551392, "rewards/rejected": -2.062840223312378, "step": 489 }, { "epoch": 1.1214551704382698, "grad_norm": 8.727511405944824, "learning_rate": 8.878440366972476e-07, "logits/chosen": -2.7945995330810547, "logits/rejected": -2.7995846271514893, "logps/chosen": -51.67963409423828, "logps/rejected": -78.66658020019531, "loss": 0.7152, "rewards/accuracies": 0.875, "rewards/chosen": 0.5281322598457336, "rewards/margins": 2.2744991779327393, "rewards/rejected": -1.7463669776916504, "step": 490 }, { "epoch": 1.1237467774276713, "grad_norm": 11.544198989868164, "learning_rate": 8.876146788990825e-07, "logits/chosen": -2.8866732120513916, "logits/rejected": -2.8185062408447266, "logps/chosen": -43.28451919555664, "logps/rejected": -72.0943374633789, "loss": 0.6587, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1358368396759033, "rewards/margins": 2.447028398513794, "rewards/rejected": -1.311191439628601, "step": 491 }, { "epoch": 1.1260383844170725, "grad_norm": 9.808731079101562, "learning_rate": 8.873853211009174e-07, "logits/chosen": -2.825570583343506, "logits/rejected": -2.8270559310913086, "logps/chosen": -43.80091094970703, "logps/rejected": -68.18866729736328, "loss": 0.6325, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2177056074142456, "rewards/margins": 2.1979997158050537, "rewards/rejected": -0.980293869972229, "step": 492 }, { "epoch": 1.1283299914064737, "grad_norm": 6.557858943939209, "learning_rate": 8.871559633027522e-07, "logits/chosen": -2.7979369163513184, "logits/rejected": -2.78967547416687, "logps/chosen": -49.08049011230469, "logps/rejected": -72.29840850830078, "loss": 0.7038, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6577196717262268, "rewards/margins": 2.0156707763671875, "rewards/rejected": -1.357951045036316, "step": 493 }, { "epoch": 1.1306215983958752, "grad_norm": 8.197505950927734, "learning_rate": 8.86926605504587e-07, "logits/chosen": -2.8326525688171387, "logits/rejected": -2.8104183673858643, "logps/chosen": -46.685184478759766, "logps/rejected": -75.15505981445312, "loss": 0.6088, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9355431795120239, "rewards/margins": 2.5652148723602295, "rewards/rejected": -1.6296714544296265, "step": 494 }, { "epoch": 1.1329132053852764, "grad_norm": 10.387160301208496, "learning_rate": 8.86697247706422e-07, "logits/chosen": -2.7611992359161377, "logits/rejected": -2.8150858879089355, "logps/chosen": -47.689300537109375, "logps/rejected": -78.02574157714844, "loss": 0.6264, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8047102689743042, "rewards/margins": 2.5678529739379883, "rewards/rejected": -1.7631428241729736, "step": 495 }, { "epoch": 1.1352048123746776, "grad_norm": 8.439704895019531, "learning_rate": 8.864678899082568e-07, "logits/chosen": -2.87153697013855, "logits/rejected": -2.8366851806640625, "logps/chosen": -46.149322509765625, "logps/rejected": -75.38741302490234, "loss": 0.6067, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1169753074645996, "rewards/margins": 2.582570791244507, "rewards/rejected": -1.4655954837799072, "step": 496 }, { "epoch": 1.137496419364079, "grad_norm": 8.312873840332031, "learning_rate": 8.862385321100917e-07, "logits/chosen": -2.811992883682251, "logits/rejected": -2.7797422409057617, "logps/chosen": -44.171905517578125, "logps/rejected": -73.05905151367188, "loss": 0.659, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1909102201461792, "rewards/margins": 2.5020902156829834, "rewards/rejected": -1.3111799955368042, "step": 497 }, { "epoch": 1.1397880263534803, "grad_norm": 8.871757507324219, "learning_rate": 8.860091743119266e-07, "logits/chosen": -2.887608051300049, "logits/rejected": -2.87978458404541, "logps/chosen": -36.63849639892578, "logps/rejected": -70.48076629638672, "loss": 0.6305, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3886826038360596, "rewards/margins": 2.740110397338867, "rewards/rejected": -1.3514277935028076, "step": 498 }, { "epoch": 1.1420796333428818, "grad_norm": 7.036399841308594, "learning_rate": 8.857798165137615e-07, "logits/chosen": -2.8329367637634277, "logits/rejected": -2.7876007556915283, "logps/chosen": -58.18490219116211, "logps/rejected": -77.2568359375, "loss": 0.842, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06481222808361053, "rewards/margins": 1.358675241470337, "rewards/rejected": -1.4234874248504639, "step": 499 }, { "epoch": 1.144371240332283, "grad_norm": 8.744758605957031, "learning_rate": 8.855504587155963e-07, "logits/chosen": -2.7916412353515625, "logits/rejected": -2.822768211364746, "logps/chosen": -49.796730041503906, "logps/rejected": -77.68354797363281, "loss": 0.6579, "rewards/accuracies": 0.875, "rewards/chosen": 0.7519443035125732, "rewards/margins": 2.389812469482422, "rewards/rejected": -1.637868046760559, "step": 500 }, { "epoch": 1.1466628473216844, "grad_norm": 10.065033912658691, "learning_rate": 8.853211009174311e-07, "logits/chosen": -2.7705483436584473, "logits/rejected": -2.7800629138946533, "logps/chosen": -41.99199295043945, "logps/rejected": -78.61676788330078, "loss": 0.5853, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1187645196914673, "rewards/margins": 3.0756678581237793, "rewards/rejected": -1.9569036960601807, "step": 501 }, { "epoch": 1.1489544543110857, "grad_norm": 7.429214000701904, "learning_rate": 8.850917431192661e-07, "logits/chosen": -2.812910795211792, "logits/rejected": -2.8200156688690186, "logps/chosen": -47.49641799926758, "logps/rejected": -75.87346649169922, "loss": 0.6748, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5849164724349976, "rewards/margins": 2.426058530807495, "rewards/rejected": -1.841141700744629, "step": 502 }, { "epoch": 1.151246061300487, "grad_norm": 7.031627655029297, "learning_rate": 8.848623853211009e-07, "logits/chosen": -2.781658411026001, "logits/rejected": -2.8009912967681885, "logps/chosen": -48.87895202636719, "logps/rejected": -83.30921173095703, "loss": 0.6485, "rewards/accuracies": 0.90625, "rewards/chosen": 0.768886387348175, "rewards/margins": 2.6797516345977783, "rewards/rejected": -1.9108651876449585, "step": 503 }, { "epoch": 1.1535376682898884, "grad_norm": 9.495769500732422, "learning_rate": 8.846330275229358e-07, "logits/chosen": -2.8416149616241455, "logits/rejected": -2.8494925498962402, "logps/chosen": -46.19395065307617, "logps/rejected": -78.41265869140625, "loss": 0.5987, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8271572589874268, "rewards/margins": 2.7514562606811523, "rewards/rejected": -1.9242987632751465, "step": 504 }, { "epoch": 1.1558292752792896, "grad_norm": 11.767537117004395, "learning_rate": 8.844036697247706e-07, "logits/chosen": -2.8314638137817383, "logits/rejected": -2.8830723762512207, "logps/chosen": -53.01689910888672, "logps/rejected": -78.11873626708984, "loss": 0.7047, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5665979981422424, "rewards/margins": 2.3859827518463135, "rewards/rejected": -1.8193845748901367, "step": 505 }, { "epoch": 1.1581208822686908, "grad_norm": 8.43411636352539, "learning_rate": 8.841743119266054e-07, "logits/chosen": -2.784492015838623, "logits/rejected": -2.817876100540161, "logps/chosen": -53.56269073486328, "logps/rejected": -78.92054748535156, "loss": 0.7091, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3766944706439972, "rewards/margins": 2.39461088180542, "rewards/rejected": -2.017916202545166, "step": 506 }, { "epoch": 1.1604124892580923, "grad_norm": 8.61422348022461, "learning_rate": 8.839449541284403e-07, "logits/chosen": -2.9661903381347656, "logits/rejected": -2.9973983764648438, "logps/chosen": -47.11008834838867, "logps/rejected": -78.4466781616211, "loss": 0.6352, "rewards/accuracies": 0.96875, "rewards/chosen": 0.968479573726654, "rewards/margins": 2.6068432331085205, "rewards/rejected": -1.6383638381958008, "step": 507 }, { "epoch": 1.1627040962474935, "grad_norm": 7.583284854888916, "learning_rate": 8.837155963302751e-07, "logits/chosen": -2.74472713470459, "logits/rejected": -2.7677624225616455, "logps/chosen": -53.72803497314453, "logps/rejected": -82.6325912475586, "loss": 0.6992, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21904414892196655, "rewards/margins": 2.3357975482940674, "rewards/rejected": -2.116753339767456, "step": 508 }, { "epoch": 1.164995703236895, "grad_norm": 9.186519622802734, "learning_rate": 8.834862385321101e-07, "logits/chosen": -2.8877549171447754, "logits/rejected": -2.9023778438568115, "logps/chosen": -44.621456146240234, "logps/rejected": -72.65818786621094, "loss": 0.6511, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8135183453559875, "rewards/margins": 2.4237618446350098, "rewards/rejected": -1.610243558883667, "step": 509 }, { "epoch": 1.1672873102262962, "grad_norm": 19.359115600585938, "learning_rate": 8.832568807339449e-07, "logits/chosen": -2.755357027053833, "logits/rejected": -2.8233094215393066, "logps/chosen": -46.42686080932617, "logps/rejected": -77.21424865722656, "loss": 0.6449, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7481995820999146, "rewards/margins": 2.6336281299591064, "rewards/rejected": -1.885428786277771, "step": 510 }, { "epoch": 1.1695789172156976, "grad_norm": 9.489893913269043, "learning_rate": 8.830275229357797e-07, "logits/chosen": -2.8561179637908936, "logits/rejected": -2.837515354156494, "logps/chosen": -45.499080657958984, "logps/rejected": -84.23675537109375, "loss": 0.5467, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9690145254135132, "rewards/margins": 3.3156001567840576, "rewards/rejected": -2.346585750579834, "step": 511 }, { "epoch": 1.1718705242050989, "grad_norm": 12.489190101623535, "learning_rate": 8.827981651376146e-07, "logits/chosen": -2.9033045768737793, "logits/rejected": -2.851199150085449, "logps/chosen": -40.044281005859375, "logps/rejected": -80.53630828857422, "loss": 0.5632, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2654296159744263, "rewards/margins": 3.2382376194000244, "rewards/rejected": -1.9728078842163086, "step": 512 }, { "epoch": 1.1741621311945, "grad_norm": 8.513059616088867, "learning_rate": 8.825688073394495e-07, "logits/chosen": -2.878225564956665, "logits/rejected": -2.7834534645080566, "logps/chosen": -43.99922180175781, "logps/rejected": -83.22400665283203, "loss": 0.5897, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0973131656646729, "rewards/margins": 3.3720154762268066, "rewards/rejected": -2.274702310562134, "step": 513 }, { "epoch": 1.1764537381839015, "grad_norm": 6.261756420135498, "learning_rate": 8.823394495412844e-07, "logits/chosen": -2.8042635917663574, "logits/rejected": -2.7917017936706543, "logps/chosen": -56.18501281738281, "logps/rejected": -86.84632873535156, "loss": 0.7842, "rewards/accuracies": 0.84375, "rewards/chosen": -0.024031490087509155, "rewards/margins": 2.4162404537200928, "rewards/rejected": -2.4402718544006348, "step": 514 }, { "epoch": 1.1787453451733028, "grad_norm": 9.470754623413086, "learning_rate": 8.821100917431192e-07, "logits/chosen": -2.835249185562134, "logits/rejected": -2.8564043045043945, "logps/chosen": -51.65571975708008, "logps/rejected": -79.90592193603516, "loss": 0.66, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5326328277587891, "rewards/margins": 2.4365968704223633, "rewards/rejected": -1.9039642810821533, "step": 515 }, { "epoch": 1.181036952162704, "grad_norm": 8.472245216369629, "learning_rate": 8.81880733944954e-07, "logits/chosen": -2.92226505279541, "logits/rejected": -2.929980516433716, "logps/chosen": -40.78834915161133, "logps/rejected": -75.64636993408203, "loss": 0.6082, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1248716115951538, "rewards/margins": 3.0397329330444336, "rewards/rejected": -1.9148614406585693, "step": 516 }, { "epoch": 1.1833285591521054, "grad_norm": 10.108110427856445, "learning_rate": 8.81651376146789e-07, "logits/chosen": -2.855621814727783, "logits/rejected": -2.865412712097168, "logps/chosen": -52.490325927734375, "logps/rejected": -77.04518127441406, "loss": 0.674, "rewards/accuracies": 0.84375, "rewards/chosen": 0.520874559879303, "rewards/margins": 2.277918815612793, "rewards/rejected": -1.7570441961288452, "step": 517 }, { "epoch": 1.1856201661415067, "grad_norm": 6.66834020614624, "learning_rate": 8.814220183486238e-07, "logits/chosen": -2.843787670135498, "logits/rejected": -2.8881914615631104, "logps/chosen": -43.5952033996582, "logps/rejected": -78.75225830078125, "loss": 0.6148, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8792945146560669, "rewards/margins": 2.8217740058898926, "rewards/rejected": -1.9424794912338257, "step": 518 }, { "epoch": 1.1879117731309081, "grad_norm": 8.319950103759766, "learning_rate": 8.811926605504587e-07, "logits/chosen": -2.9535679817199707, "logits/rejected": -2.9139788150787354, "logps/chosen": -47.63362121582031, "logps/rejected": -78.03401947021484, "loss": 0.6775, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6432821750640869, "rewards/margins": 2.55669903755188, "rewards/rejected": -1.9134169816970825, "step": 519 }, { "epoch": 1.1902033801203094, "grad_norm": 8.138769149780273, "learning_rate": 8.809633027522936e-07, "logits/chosen": -2.821465253829956, "logits/rejected": -2.809708595275879, "logps/chosen": -51.99226760864258, "logps/rejected": -78.65205383300781, "loss": 0.68, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3606048822402954, "rewards/margins": 2.1235485076904297, "rewards/rejected": -1.7629435062408447, "step": 520 }, { "epoch": 1.1924949871097108, "grad_norm": 9.84209156036377, "learning_rate": 8.807339449541285e-07, "logits/chosen": -2.778191566467285, "logits/rejected": -2.8118696212768555, "logps/chosen": -43.40102767944336, "logps/rejected": -79.203857421875, "loss": 0.5852, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1073389053344727, "rewards/margins": 2.959643840789795, "rewards/rejected": -1.8523049354553223, "step": 521 }, { "epoch": 1.194786594099112, "grad_norm": 14.019742012023926, "learning_rate": 8.805045871559632e-07, "logits/chosen": -2.836995840072632, "logits/rejected": -2.831186294555664, "logps/chosen": -37.7659797668457, "logps/rejected": -75.21417236328125, "loss": 0.5695, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6719955205917358, "rewards/margins": 2.946208953857422, "rewards/rejected": -1.2742133140563965, "step": 522 }, { "epoch": 1.1970782010885133, "grad_norm": 9.766317367553711, "learning_rate": 8.80275229357798e-07, "logits/chosen": -2.860100269317627, "logits/rejected": -2.9108638763427734, "logps/chosen": -45.199737548828125, "logps/rejected": -74.43733215332031, "loss": 0.6274, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7725635766983032, "rewards/margins": 2.515044927597046, "rewards/rejected": -1.7424812316894531, "step": 523 }, { "epoch": 1.1993698080779147, "grad_norm": 9.98034381866455, "learning_rate": 8.80045871559633e-07, "logits/chosen": -2.8149449825286865, "logits/rejected": -2.778597593307495, "logps/chosen": -45.20096969604492, "logps/rejected": -73.62825775146484, "loss": 0.649, "rewards/accuracies": 0.875, "rewards/chosen": 0.8707245588302612, "rewards/margins": 2.2757110595703125, "rewards/rejected": -1.4049866199493408, "step": 524 }, { "epoch": 1.201661415067316, "grad_norm": 9.654521942138672, "learning_rate": 8.798165137614678e-07, "logits/chosen": -2.8796305656433105, "logits/rejected": -2.790041923522949, "logps/chosen": -48.34125900268555, "logps/rejected": -77.21903228759766, "loss": 0.6381, "rewards/accuracies": 0.875, "rewards/chosen": 0.8235557079315186, "rewards/margins": 2.4849371910095215, "rewards/rejected": -1.6613816022872925, "step": 525 }, { "epoch": 1.2039530220567172, "grad_norm": 8.114791870117188, "learning_rate": 8.795871559633027e-07, "logits/chosen": -2.7305257320404053, "logits/rejected": -2.735158920288086, "logps/chosen": -52.3680534362793, "logps/rejected": -82.18898010253906, "loss": 0.688, "rewards/accuracies": 0.84375, "rewards/chosen": 0.47582465410232544, "rewards/margins": 2.2486157417297363, "rewards/rejected": -1.7727911472320557, "step": 526 }, { "epoch": 1.2062446290461186, "grad_norm": 7.050064563751221, "learning_rate": 8.793577981651376e-07, "logits/chosen": -2.9094104766845703, "logits/rejected": -2.8761959075927734, "logps/chosen": -57.10205078125, "logps/rejected": -79.46136474609375, "loss": 0.7526, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12228300422430038, "rewards/margins": 1.9041732549667358, "rewards/rejected": -2.026456117630005, "step": 527 }, { "epoch": 1.2085362360355199, "grad_norm": 8.538702964782715, "learning_rate": 8.791284403669724e-07, "logits/chosen": -2.7415785789489746, "logits/rejected": -2.7736282348632812, "logps/chosen": -42.343101501464844, "logps/rejected": -70.67878723144531, "loss": 0.6509, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8318321108818054, "rewards/margins": 2.2812724113464355, "rewards/rejected": -1.4494404792785645, "step": 528 }, { "epoch": 1.2085362360355199, "eval_logits/chosen": -2.9452784061431885, "eval_logits/rejected": -2.959803581237793, "eval_logps/chosen": -60.283992767333984, "eval_logps/rejected": -85.79029083251953, "eval_loss": 0.7477946281433105, "eval_rewards/accuracies": 0.8415094614028931, "eval_rewards/chosen": -0.3221636116504669, "eval_rewards/margins": 2.1519217491149902, "eval_rewards/rejected": -2.4740853309631348, "eval_runtime": 957.4805, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.277, "step": 528 }, { "epoch": 1.2108278430249213, "grad_norm": 9.94054889678955, "learning_rate": 8.788990825688073e-07, "logits/chosen": -2.834383964538574, "logits/rejected": -2.8788902759552, "logps/chosen": -45.3509521484375, "logps/rejected": -78.10248565673828, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": 0.7989250421524048, "rewards/margins": 2.8190293312072754, "rewards/rejected": -2.0201046466827393, "step": 529 }, { "epoch": 1.2131194500143225, "grad_norm": 9.77259635925293, "learning_rate": 8.786697247706421e-07, "logits/chosen": -2.8934786319732666, "logits/rejected": -2.85754656791687, "logps/chosen": -46.23880386352539, "logps/rejected": -69.8426284790039, "loss": 0.7347, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5448623895645142, "rewards/margins": 1.9481812715530396, "rewards/rejected": -1.4033188819885254, "step": 530 }, { "epoch": 1.215411057003724, "grad_norm": 13.658133506774902, "learning_rate": 8.784403669724771e-07, "logits/chosen": -2.924872398376465, "logits/rejected": -2.938371419906616, "logps/chosen": -52.62801742553711, "logps/rejected": -82.37652587890625, "loss": 0.6598, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3988291025161743, "rewards/margins": 2.706669569015503, "rewards/rejected": -2.307840347290039, "step": 531 }, { "epoch": 1.2177026639931252, "grad_norm": 22.531095504760742, "learning_rate": 8.782110091743119e-07, "logits/chosen": -2.896693706512451, "logits/rejected": -2.9225003719329834, "logps/chosen": -52.964515686035156, "logps/rejected": -89.05535125732422, "loss": 0.6087, "rewards/accuracies": 0.9375, "rewards/chosen": 0.036447957158088684, "rewards/margins": 3.0484795570373535, "rewards/rejected": -3.0120317935943604, "step": 532 }, { "epoch": 1.2199942709825264, "grad_norm": 11.700085639953613, "learning_rate": 8.779816513761467e-07, "logits/chosen": -2.7974514961242676, "logits/rejected": -2.799893617630005, "logps/chosen": -43.738162994384766, "logps/rejected": -85.30303955078125, "loss": 0.5617, "rewards/accuracies": 0.90625, "rewards/chosen": 1.105710744857788, "rewards/margins": 3.611987590789795, "rewards/rejected": -2.506277322769165, "step": 533 }, { "epoch": 1.222285877971928, "grad_norm": 11.270364761352539, "learning_rate": 8.777522935779817e-07, "logits/chosen": -2.9212281703948975, "logits/rejected": -2.8941376209259033, "logps/chosen": -58.13933563232422, "logps/rejected": -91.44303894042969, "loss": 0.6662, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009910687804222107, "rewards/margins": 2.774477005004883, "rewards/rejected": -2.7843878269195557, "step": 534 }, { "epoch": 1.2245774849613291, "grad_norm": 10.38998794555664, "learning_rate": 8.775229357798165e-07, "logits/chosen": -2.783146858215332, "logits/rejected": -2.8265182971954346, "logps/chosen": -56.69249725341797, "logps/rejected": -82.68264770507812, "loss": 0.7034, "rewards/accuracies": 0.84375, "rewards/chosen": -0.02274191379547119, "rewards/margins": 2.4063756465911865, "rewards/rejected": -2.4291176795959473, "step": 535 }, { "epoch": 1.2268690919507304, "grad_norm": 7.327376842498779, "learning_rate": 8.772935779816514e-07, "logits/chosen": -2.8432228565216064, "logits/rejected": -2.846529722213745, "logps/chosen": -56.13378143310547, "logps/rejected": -81.08843231201172, "loss": 0.774, "rewards/accuracies": 0.875, "rewards/chosen": 0.06464999169111252, "rewards/margins": 1.9047868251800537, "rewards/rejected": -1.8401368856430054, "step": 536 }, { "epoch": 1.2291606989401318, "grad_norm": 10.40078353881836, "learning_rate": 8.770642201834862e-07, "logits/chosen": -2.882009983062744, "logits/rejected": -2.8829801082611084, "logps/chosen": -42.879146575927734, "logps/rejected": -77.3544692993164, "loss": 0.6213, "rewards/accuracies": 0.875, "rewards/chosen": 0.9387556314468384, "rewards/margins": 2.863189458847046, "rewards/rejected": -1.924433708190918, "step": 537 }, { "epoch": 1.231452305929533, "grad_norm": 12.907600402832031, "learning_rate": 8.76834862385321e-07, "logits/chosen": -2.877762794494629, "logits/rejected": -2.8588550090789795, "logps/chosen": -48.224029541015625, "logps/rejected": -84.45616912841797, "loss": 0.6111, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6863912343978882, "rewards/margins": 3.0937788486480713, "rewards/rejected": -2.4073877334594727, "step": 538 }, { "epoch": 1.2337439129189345, "grad_norm": 8.594748497009277, "learning_rate": 8.766055045871559e-07, "logits/chosen": -2.8483123779296875, "logits/rejected": -2.8464908599853516, "logps/chosen": -45.7006950378418, "logps/rejected": -75.71488189697266, "loss": 0.6009, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0484929084777832, "rewards/margins": 2.698578119277954, "rewards/rejected": -1.6500850915908813, "step": 539 }, { "epoch": 1.2360355199083357, "grad_norm": 9.0687837600708, "learning_rate": 8.763761467889907e-07, "logits/chosen": -2.8342559337615967, "logits/rejected": -2.794114589691162, "logps/chosen": -53.88392639160156, "logps/rejected": -81.90982055664062, "loss": 0.6727, "rewards/accuracies": 0.875, "rewards/chosen": 0.26274484395980835, "rewards/margins": 2.3051211833953857, "rewards/rejected": -2.0423762798309326, "step": 540 }, { "epoch": 1.2383271268977372, "grad_norm": 9.754436492919922, "learning_rate": 8.761467889908256e-07, "logits/chosen": -2.8414201736450195, "logits/rejected": -2.8163466453552246, "logps/chosen": -41.95368957519531, "logps/rejected": -82.05743408203125, "loss": 0.5839, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0412540435791016, "rewards/margins": 3.340578556060791, "rewards/rejected": -2.2993242740631104, "step": 541 }, { "epoch": 1.2406187338871384, "grad_norm": 10.556746482849121, "learning_rate": 8.759174311926605e-07, "logits/chosen": -2.853269577026367, "logits/rejected": -2.8273239135742188, "logps/chosen": -34.53715515136719, "logps/rejected": -77.18058776855469, "loss": 0.5125, "rewards/accuracies": 1.0, "rewards/chosen": 1.7388081550598145, "rewards/margins": 3.5661637783050537, "rewards/rejected": -1.8273558616638184, "step": 542 }, { "epoch": 1.2429103408765396, "grad_norm": 9.344738960266113, "learning_rate": 8.756880733944954e-07, "logits/chosen": -2.923680305480957, "logits/rejected": -2.9030590057373047, "logps/chosen": -57.95429229736328, "logps/rejected": -76.94894409179688, "loss": 0.7415, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2692614793777466, "rewards/margins": 1.6945136785507202, "rewards/rejected": -1.9637751579284668, "step": 543 }, { "epoch": 1.245201947865941, "grad_norm": 8.576349258422852, "learning_rate": 8.754587155963302e-07, "logits/chosen": -2.947906255722046, "logits/rejected": -2.911543369293213, "logps/chosen": -39.17530822753906, "logps/rejected": -73.54513549804688, "loss": 0.5964, "rewards/accuracies": 1.0, "rewards/chosen": 1.4739855527877808, "rewards/margins": 3.0148444175720215, "rewards/rejected": -1.5408588647842407, "step": 544 }, { "epoch": 1.2474935548553423, "grad_norm": 11.396215438842773, "learning_rate": 8.752293577981651e-07, "logits/chosen": -2.8931050300598145, "logits/rejected": -2.843968152999878, "logps/chosen": -40.74802017211914, "logps/rejected": -71.18618774414062, "loss": 0.6183, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5652309656143188, "rewards/margins": 2.739598512649536, "rewards/rejected": -1.1743677854537964, "step": 545 }, { "epoch": 1.2497851618447435, "grad_norm": 9.851797103881836, "learning_rate": 8.75e-07, "logits/chosen": -2.8012709617614746, "logits/rejected": -2.8121025562286377, "logps/chosen": -45.44166946411133, "logps/rejected": -77.52839660644531, "loss": 0.6263, "rewards/accuracies": 1.0, "rewards/chosen": 1.1016939878463745, "rewards/margins": 2.7634201049804688, "rewards/rejected": -1.6617262363433838, "step": 546 }, { "epoch": 1.252076768834145, "grad_norm": 11.584747314453125, "learning_rate": 8.747706422018348e-07, "logits/chosen": -2.857253074645996, "logits/rejected": -2.8883185386657715, "logps/chosen": -49.28782272338867, "logps/rejected": -70.36834716796875, "loss": 0.7045, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8978559374809265, "rewards/margins": 1.926164984703064, "rewards/rejected": -1.0283092260360718, "step": 547 }, { "epoch": 1.2543683758235462, "grad_norm": 9.01067066192627, "learning_rate": 8.745412844036697e-07, "logits/chosen": -2.91092848777771, "logits/rejected": -2.9010815620422363, "logps/chosen": -38.99641036987305, "logps/rejected": -71.71940612792969, "loss": 0.571, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5123380422592163, "rewards/margins": 2.713761806488037, "rewards/rejected": -1.2014236450195312, "step": 548 }, { "epoch": 1.2566599828129477, "grad_norm": 9.477349281311035, "learning_rate": 8.743119266055046e-07, "logits/chosen": -2.801892042160034, "logits/rejected": -2.8252029418945312, "logps/chosen": -47.51951599121094, "logps/rejected": -66.72613525390625, "loss": 0.7599, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8258910179138184, "rewards/margins": 1.6375584602355957, "rewards/rejected": -0.8116676211357117, "step": 549 }, { "epoch": 1.258951589802349, "grad_norm": 10.932429313659668, "learning_rate": 8.740825688073394e-07, "logits/chosen": -2.8261117935180664, "logits/rejected": -2.870548725128174, "logps/chosen": -38.009647369384766, "logps/rejected": -65.97522735595703, "loss": 0.6935, "rewards/accuracies": 0.875, "rewards/chosen": 1.4623109102249146, "rewards/margins": 2.388566255569458, "rewards/rejected": -0.9262553453445435, "step": 550 }, { "epoch": 1.2612431967917503, "grad_norm": 9.433074951171875, "learning_rate": 8.738532110091743e-07, "logits/chosen": -2.797213077545166, "logits/rejected": -2.8321800231933594, "logps/chosen": -37.69807434082031, "logps/rejected": -66.62163543701172, "loss": 0.6645, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7779470682144165, "rewards/margins": 2.5184502601623535, "rewards/rejected": -0.7405034303665161, "step": 551 }, { "epoch": 1.2635348037811516, "grad_norm": 10.092733383178711, "learning_rate": 8.736238532110092e-07, "logits/chosen": -2.799562931060791, "logits/rejected": -2.764765739440918, "logps/chosen": -41.18036651611328, "logps/rejected": -72.33412170410156, "loss": 0.6289, "rewards/accuracies": 0.875, "rewards/chosen": 1.308057427406311, "rewards/margins": 2.722689628601074, "rewards/rejected": -1.4146320819854736, "step": 552 }, { "epoch": 1.2658264107705528, "grad_norm": 8.546154022216797, "learning_rate": 8.733944954128441e-07, "logits/chosen": -2.811645746231079, "logits/rejected": -2.7800917625427246, "logps/chosen": -40.89820098876953, "logps/rejected": -77.6470718383789, "loss": 0.6077, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4559457302093506, "rewards/margins": 3.0193300247192383, "rewards/rejected": -1.5633844137191772, "step": 553 }, { "epoch": 1.2681180177599543, "grad_norm": 12.425297737121582, "learning_rate": 8.731651376146789e-07, "logits/chosen": -2.819553852081299, "logits/rejected": -2.8352231979370117, "logps/chosen": -51.31978225708008, "logps/rejected": -79.35218811035156, "loss": 0.7334, "rewards/accuracies": 0.9375, "rewards/chosen": 0.46964746713638306, "rewards/margins": 2.407325029373169, "rewards/rejected": -1.9376775026321411, "step": 554 }, { "epoch": 1.2704096247493555, "grad_norm": 9.140946388244629, "learning_rate": 8.729357798165136e-07, "logits/chosen": -2.8525941371917725, "logits/rejected": -2.8486194610595703, "logps/chosen": -39.11985397338867, "logps/rejected": -67.15181732177734, "loss": 0.6704, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6701538562774658, "rewards/margins": 2.437070846557617, "rewards/rejected": -0.7669171094894409, "step": 555 }, { "epoch": 1.2727012317387567, "grad_norm": 7.29126501083374, "learning_rate": 8.727064220183486e-07, "logits/chosen": -2.8552870750427246, "logits/rejected": -2.8751144409179688, "logps/chosen": -50.078155517578125, "logps/rejected": -79.71354675292969, "loss": 0.6851, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0450986623764038, "rewards/margins": 2.6130943298339844, "rewards/rejected": -1.567995548248291, "step": 556 }, { "epoch": 1.2749928387281582, "grad_norm": 15.938874244689941, "learning_rate": 8.724770642201834e-07, "logits/chosen": -2.785756826400757, "logits/rejected": -2.814128875732422, "logps/chosen": -43.16875457763672, "logps/rejected": -75.09980010986328, "loss": 0.6481, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2122794389724731, "rewards/margins": 2.7654550075531006, "rewards/rejected": -1.553175449371338, "step": 557 }, { "epoch": 1.2772844457175594, "grad_norm": 10.418244361877441, "learning_rate": 8.722477064220183e-07, "logits/chosen": -2.8080477714538574, "logits/rejected": -2.845545530319214, "logps/chosen": -58.749107360839844, "logps/rejected": -92.38622283935547, "loss": 0.6215, "rewards/accuracies": 0.90625, "rewards/chosen": 0.23140527307987213, "rewards/margins": 2.908557176589966, "rewards/rejected": -2.677151679992676, "step": 558 }, { "epoch": 1.2795760527069606, "grad_norm": 7.338142395019531, "learning_rate": 8.720183486238531e-07, "logits/chosen": -2.827505588531494, "logits/rejected": -2.824096918106079, "logps/chosen": -46.342323303222656, "logps/rejected": -90.28483581542969, "loss": 0.5604, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7275605797767639, "rewards/margins": 3.4110002517700195, "rewards/rejected": -2.6834397315979004, "step": 559 }, { "epoch": 1.281867659696362, "grad_norm": 9.172104835510254, "learning_rate": 8.71788990825688e-07, "logits/chosen": -2.773042917251587, "logits/rejected": -2.796581983566284, "logps/chosen": -44.84857177734375, "logps/rejected": -70.59574890136719, "loss": 0.7058, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8384576439857483, "rewards/margins": 2.051481246948242, "rewards/rejected": -1.2130236625671387, "step": 560 }, { "epoch": 1.2841592666857635, "grad_norm": 7.713747501373291, "learning_rate": 8.715596330275229e-07, "logits/chosen": -2.8063931465148926, "logits/rejected": -2.866486072540283, "logps/chosen": -42.521934509277344, "logps/rejected": -86.42282104492188, "loss": 0.5758, "rewards/accuracies": 1.0, "rewards/chosen": 1.0491660833358765, "rewards/margins": 3.492807626724243, "rewards/rejected": -2.443641424179077, "step": 561 }, { "epoch": 1.2864508736751648, "grad_norm": 13.447182655334473, "learning_rate": 8.713302752293577e-07, "logits/chosen": -2.9015982151031494, "logits/rejected": -2.878803253173828, "logps/chosen": -36.365577697753906, "logps/rejected": -65.22566223144531, "loss": 0.6673, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6302498579025269, "rewards/margins": 2.3940956592559814, "rewards/rejected": -0.7638460397720337, "step": 562 }, { "epoch": 1.288742480664566, "grad_norm": 6.86068058013916, "learning_rate": 8.711009174311927e-07, "logits/chosen": -2.8084678649902344, "logits/rejected": -2.8713889122009277, "logps/chosen": -44.243370056152344, "logps/rejected": -81.20457458496094, "loss": 0.5674, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9039268493652344, "rewards/margins": 3.0387072563171387, "rewards/rejected": -2.1347806453704834, "step": 563 }, { "epoch": 1.2910340876539674, "grad_norm": 9.427541732788086, "learning_rate": 8.708715596330275e-07, "logits/chosen": -2.857194185256958, "logits/rejected": -2.8837029933929443, "logps/chosen": -40.914207458496094, "logps/rejected": -78.6574478149414, "loss": 0.5757, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1838371753692627, "rewards/margins": 3.276099443435669, "rewards/rejected": -2.0922625064849854, "step": 564 }, { "epoch": 1.2933256946433687, "grad_norm": 10.746400833129883, "learning_rate": 8.706422018348624e-07, "logits/chosen": -2.789039373397827, "logits/rejected": -2.8230061531066895, "logps/chosen": -45.455379486083984, "logps/rejected": -78.38447570800781, "loss": 0.5942, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3118195533752441, "rewards/margins": 2.8852994441986084, "rewards/rejected": -1.5734798908233643, "step": 565 }, { "epoch": 1.29561730163277, "grad_norm": 10.072774887084961, "learning_rate": 8.704128440366972e-07, "logits/chosen": -2.8112432956695557, "logits/rejected": -2.797260046005249, "logps/chosen": -45.61674118041992, "logps/rejected": -68.68563079833984, "loss": 0.7425, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8745553493499756, "rewards/margins": 1.9340100288391113, "rewards/rejected": -1.0594546794891357, "step": 566 }, { "epoch": 1.2979089086221713, "grad_norm": 8.8301362991333, "learning_rate": 8.701834862385321e-07, "logits/chosen": -2.809237480163574, "logits/rejected": -2.820695161819458, "logps/chosen": -39.31450653076172, "logps/rejected": -62.07235336303711, "loss": 0.7688, "rewards/accuracies": 0.75, "rewards/chosen": 1.1237316131591797, "rewards/margins": 1.8377768993377686, "rewards/rejected": -0.7140451669692993, "step": 567 }, { "epoch": 1.3002005156115726, "grad_norm": 8.11194896697998, "learning_rate": 8.69954128440367e-07, "logits/chosen": -2.787999391555786, "logits/rejected": -2.7430896759033203, "logps/chosen": -54.39020538330078, "logps/rejected": -81.93588256835938, "loss": 0.6797, "rewards/accuracies": 0.90625, "rewards/chosen": 0.32864680886268616, "rewards/margins": 2.350273847579956, "rewards/rejected": -2.0216269493103027, "step": 568 }, { "epoch": 1.3024921226009738, "grad_norm": 12.816180229187012, "learning_rate": 8.697247706422018e-07, "logits/chosen": -2.8487346172332764, "logits/rejected": -2.841752767562866, "logps/chosen": -40.40589141845703, "logps/rejected": -77.11871337890625, "loss": 0.5658, "rewards/accuracies": 0.875, "rewards/chosen": 1.4377683401107788, "rewards/margins": 3.007173776626587, "rewards/rejected": -1.5694057941436768, "step": 569 }, { "epoch": 1.3047837295903753, "grad_norm": 9.607053756713867, "learning_rate": 8.694954128440368e-07, "logits/chosen": -2.827343702316284, "logits/rejected": -2.8474221229553223, "logps/chosen": -41.079193115234375, "logps/rejected": -70.50853729248047, "loss": 0.688, "rewards/accuracies": 0.875, "rewards/chosen": 1.0311014652252197, "rewards/margins": 2.4400761127471924, "rewards/rejected": -1.4089748859405518, "step": 570 }, { "epoch": 1.3070753365797767, "grad_norm": 7.870156764984131, "learning_rate": 8.692660550458715e-07, "logits/chosen": -2.9147462844848633, "logits/rejected": -2.93019962310791, "logps/chosen": -34.340782165527344, "logps/rejected": -75.16285705566406, "loss": 0.5638, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6471126079559326, "rewards/margins": 3.4431686401367188, "rewards/rejected": -1.7960560321807861, "step": 571 }, { "epoch": 1.309366943569178, "grad_norm": 11.205430030822754, "learning_rate": 8.690366972477063e-07, "logits/chosen": -2.913628578186035, "logits/rejected": -2.8708252906799316, "logps/chosen": -51.462890625, "logps/rejected": -86.59075164794922, "loss": 0.6216, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2843170166015625, "rewards/margins": 2.9924705028533936, "rewards/rejected": -2.70815372467041, "step": 572 }, { "epoch": 1.3116585505585792, "grad_norm": 16.24692726135254, "learning_rate": 8.688073394495412e-07, "logits/chosen": -2.894204616546631, "logits/rejected": -2.89656925201416, "logps/chosen": -47.29188537597656, "logps/rejected": -73.69001007080078, "loss": 0.6536, "rewards/accuracies": 0.875, "rewards/chosen": 0.9431883096694946, "rewards/margins": 2.428267240524292, "rewards/rejected": -1.485079050064087, "step": 573 }, { "epoch": 1.3139501575479806, "grad_norm": 13.286011695861816, "learning_rate": 8.685779816513761e-07, "logits/chosen": -2.8518922328948975, "logits/rejected": -2.9064431190490723, "logps/chosen": -45.75670623779297, "logps/rejected": -87.2391357421875, "loss": 0.5456, "rewards/accuracies": 1.0, "rewards/chosen": 0.9603822827339172, "rewards/margins": 3.670750617980957, "rewards/rejected": -2.7103683948516846, "step": 574 }, { "epoch": 1.3162417645373818, "grad_norm": 12.267091751098633, "learning_rate": 8.68348623853211e-07, "logits/chosen": -2.8057403564453125, "logits/rejected": -2.8426101207733154, "logps/chosen": -53.65542984008789, "logps/rejected": -85.6519775390625, "loss": 0.6677, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2014506310224533, "rewards/margins": 2.8603429794311523, "rewards/rejected": -2.6588921546936035, "step": 575 }, { "epoch": 1.318533371526783, "grad_norm": 10.422828674316406, "learning_rate": 8.681192660550458e-07, "logits/chosen": -2.8448078632354736, "logits/rejected": -2.908798933029175, "logps/chosen": -45.05570602416992, "logps/rejected": -82.97484588623047, "loss": 0.6068, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7501072883605957, "rewards/margins": 3.1807055473327637, "rewards/rejected": -2.430598258972168, "step": 576 }, { "epoch": 1.3208249785161845, "grad_norm": 11.43398380279541, "learning_rate": 8.678899082568806e-07, "logits/chosen": -2.838489055633545, "logits/rejected": -2.83953857421875, "logps/chosen": -48.52384948730469, "logps/rejected": -82.9916763305664, "loss": 0.6052, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7365225553512573, "rewards/margins": 2.8290514945983887, "rewards/rejected": -2.092528820037842, "step": 577 }, { "epoch": 1.3231165855055858, "grad_norm": 9.140427589416504, "learning_rate": 8.676605504587156e-07, "logits/chosen": -2.840648889541626, "logits/rejected": -2.8229153156280518, "logps/chosen": -41.373722076416016, "logps/rejected": -76.52259826660156, "loss": 0.6704, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1483502388000488, "rewards/margins": 2.8313684463500977, "rewards/rejected": -1.6830183267593384, "step": 578 }, { "epoch": 1.325408192494987, "grad_norm": 12.473570823669434, "learning_rate": 8.674311926605504e-07, "logits/chosen": -2.91243314743042, "logits/rejected": -2.8717684745788574, "logps/chosen": -45.25349044799805, "logps/rejected": -76.19703674316406, "loss": 0.6639, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9704650640487671, "rewards/margins": 2.7009129524230957, "rewards/rejected": -1.730447769165039, "step": 579 }, { "epoch": 1.3276997994843884, "grad_norm": 9.178184509277344, "learning_rate": 8.672018348623853e-07, "logits/chosen": -2.822669267654419, "logits/rejected": -2.9013984203338623, "logps/chosen": -40.95546340942383, "logps/rejected": -81.31473541259766, "loss": 0.5435, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3233922719955444, "rewards/margins": 3.530151844024658, "rewards/rejected": -2.2067596912384033, "step": 580 }, { "epoch": 1.3299914064737899, "grad_norm": 14.315692901611328, "learning_rate": 8.669724770642202e-07, "logits/chosen": -2.9330382347106934, "logits/rejected": -2.910496711730957, "logps/chosen": -51.736202239990234, "logps/rejected": -88.00117492675781, "loss": 0.6294, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5312568545341492, "rewards/margins": 3.1598410606384277, "rewards/rejected": -2.6285839080810547, "step": 581 }, { "epoch": 1.3322830134631911, "grad_norm": 8.52137279510498, "learning_rate": 8.66743119266055e-07, "logits/chosen": -2.8465914726257324, "logits/rejected": -2.8418965339660645, "logps/chosen": -48.25731658935547, "logps/rejected": -84.71162414550781, "loss": 0.6014, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4301421046257019, "rewards/margins": 2.908217668533325, "rewards/rejected": -2.4780755043029785, "step": 582 }, { "epoch": 1.3345746204525923, "grad_norm": 17.726083755493164, "learning_rate": 8.665137614678899e-07, "logits/chosen": -2.906060218811035, "logits/rejected": -2.89936900138855, "logps/chosen": -43.30660629272461, "logps/rejected": -78.59895324707031, "loss": 0.6855, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9575456380844116, "rewards/margins": 2.776015281677246, "rewards/rejected": -1.818469524383545, "step": 583 }, { "epoch": 1.3368662274419938, "grad_norm": 13.657035827636719, "learning_rate": 8.662844036697247e-07, "logits/chosen": -2.9069223403930664, "logits/rejected": -2.877152919769287, "logps/chosen": -41.6921272277832, "logps/rejected": -78.39703369140625, "loss": 0.6123, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4115400314331055, "rewards/margins": 2.9589598178863525, "rewards/rejected": -1.547419548034668, "step": 584 }, { "epoch": 1.339157834431395, "grad_norm": 12.135503768920898, "learning_rate": 8.660550458715597e-07, "logits/chosen": -2.8500237464904785, "logits/rejected": -2.843696117401123, "logps/chosen": -43.5531120300293, "logps/rejected": -75.79480743408203, "loss": 0.619, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3739252090454102, "rewards/margins": 2.711348533630371, "rewards/rejected": -1.3374232053756714, "step": 585 }, { "epoch": 1.3414494414207963, "grad_norm": 9.461726188659668, "learning_rate": 8.658256880733945e-07, "logits/chosen": -2.881432294845581, "logits/rejected": -2.8408756256103516, "logps/chosen": -43.284629821777344, "logps/rejected": -76.70954895019531, "loss": 0.5897, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1651889085769653, "rewards/margins": 2.8674306869506836, "rewards/rejected": -1.7022420167922974, "step": 586 }, { "epoch": 1.3437410484101977, "grad_norm": 12.076260566711426, "learning_rate": 8.655963302752292e-07, "logits/chosen": -2.919689655303955, "logits/rejected": -2.8901844024658203, "logps/chosen": -33.941497802734375, "logps/rejected": -70.36898803710938, "loss": 0.6322, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9852490425109863, "rewards/margins": 3.1336264610290527, "rewards/rejected": -1.148377537727356, "step": 587 }, { "epoch": 1.346032655399599, "grad_norm": 7.451469898223877, "learning_rate": 8.653669724770641e-07, "logits/chosen": -2.9352493286132812, "logits/rejected": -2.9435834884643555, "logps/chosen": -41.50102996826172, "logps/rejected": -75.0322265625, "loss": 0.6488, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4007039070129395, "rewards/margins": 2.6167173385620117, "rewards/rejected": -1.2160133123397827, "step": 588 }, { "epoch": 1.3483242623890002, "grad_norm": 11.872488021850586, "learning_rate": 8.65137614678899e-07, "logits/chosen": -2.7952210903167725, "logits/rejected": -2.8344218730926514, "logps/chosen": -48.95122528076172, "logps/rejected": -81.84514617919922, "loss": 0.6183, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8005148768424988, "rewards/margins": 2.7444663047790527, "rewards/rejected": -1.9439512491226196, "step": 589 }, { "epoch": 1.3506158693784016, "grad_norm": 8.305583000183105, "learning_rate": 8.649082568807339e-07, "logits/chosen": -2.886251211166382, "logits/rejected": -2.8987491130828857, "logps/chosen": -48.623191833496094, "logps/rejected": -83.60182189941406, "loss": 0.6385, "rewards/accuracies": 1.0, "rewards/chosen": 0.7900115251541138, "rewards/margins": 2.979316473007202, "rewards/rejected": -2.189305305480957, "step": 590 }, { "epoch": 1.3529074763678028, "grad_norm": 10.404364585876465, "learning_rate": 8.646788990825687e-07, "logits/chosen": -2.9165754318237305, "logits/rejected": -2.88008189201355, "logps/chosen": -44.28811264038086, "logps/rejected": -71.9471664428711, "loss": 0.7131, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0405724048614502, "rewards/margins": 2.070927381515503, "rewards/rejected": -1.0303548574447632, "step": 591 }, { "epoch": 1.3551990833572043, "grad_norm": 9.365159034729004, "learning_rate": 8.644495412844037e-07, "logits/chosen": -2.7522692680358887, "logits/rejected": -2.766407012939453, "logps/chosen": -39.13132095336914, "logps/rejected": -81.35939025878906, "loss": 0.5477, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4052820205688477, "rewards/margins": 3.533940076828003, "rewards/rejected": -2.128657579421997, "step": 592 }, { "epoch": 1.3574906903466055, "grad_norm": 8.711198806762695, "learning_rate": 8.642201834862385e-07, "logits/chosen": -2.9170081615448, "logits/rejected": -2.9377336502075195, "logps/chosen": -41.84513854980469, "logps/rejected": -79.85565185546875, "loss": 0.5985, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2538074254989624, "rewards/margins": 3.195112466812134, "rewards/rejected": -1.941305160522461, "step": 593 }, { "epoch": 1.359782297336007, "grad_norm": 13.722671508789062, "learning_rate": 8.639908256880733e-07, "logits/chosen": -2.8282034397125244, "logits/rejected": -2.8375234603881836, "logps/chosen": -47.02557373046875, "logps/rejected": -80.39691162109375, "loss": 0.6402, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7952014207839966, "rewards/margins": 2.693359375, "rewards/rejected": -1.8981578350067139, "step": 594 }, { "epoch": 1.3620739043254082, "grad_norm": 14.523504257202148, "learning_rate": 8.637614678899082e-07, "logits/chosen": -2.9510574340820312, "logits/rejected": -2.928339719772339, "logps/chosen": -45.07023620605469, "logps/rejected": -66.05694580078125, "loss": 0.7522, "rewards/accuracies": 0.875, "rewards/chosen": 1.1506048440933228, "rewards/margins": 1.9260857105255127, "rewards/rejected": -0.7754809856414795, "step": 595 }, { "epoch": 1.3643655113148094, "grad_norm": 10.585907936096191, "learning_rate": 8.635321100917431e-07, "logits/chosen": -2.8499655723571777, "logits/rejected": -2.799696683883667, "logps/chosen": -46.254600524902344, "logps/rejected": -75.81117248535156, "loss": 0.7259, "rewards/accuracies": 0.875, "rewards/chosen": 0.8211585283279419, "rewards/margins": 2.253655195236206, "rewards/rejected": -1.4324966669082642, "step": 596 }, { "epoch": 1.3666571183042109, "grad_norm": 13.518196105957031, "learning_rate": 8.63302752293578e-07, "logits/chosen": -2.8832290172576904, "logits/rejected": -2.8488705158233643, "logps/chosen": -40.67107391357422, "logps/rejected": -64.78826904296875, "loss": 0.6862, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6255571842193604, "rewards/margins": 2.2842860221862793, "rewards/rejected": -0.6587291359901428, "step": 597 }, { "epoch": 1.3689487252936121, "grad_norm": 13.708663940429688, "learning_rate": 8.630733944954128e-07, "logits/chosen": -2.912433624267578, "logits/rejected": -2.9163496494293213, "logps/chosen": -35.63734436035156, "logps/rejected": -69.54252624511719, "loss": 0.6632, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6841166019439697, "rewards/margins": 2.72694730758667, "rewards/rejected": -1.0428307056427002, "step": 598 }, { "epoch": 1.3712403322830133, "grad_norm": 10.164531707763672, "learning_rate": 8.628440366972477e-07, "logits/chosen": -2.8452036380767822, "logits/rejected": -2.853346824645996, "logps/chosen": -38.72636032104492, "logps/rejected": -72.49779510498047, "loss": 0.6226, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7594654560089111, "rewards/margins": 2.8279120922088623, "rewards/rejected": -1.0684466361999512, "step": 599 }, { "epoch": 1.3735319392724148, "grad_norm": 7.652174472808838, "learning_rate": 8.626146788990826e-07, "logits/chosen": -2.876345157623291, "logits/rejected": -2.8534231185913086, "logps/chosen": -48.872379302978516, "logps/rejected": -73.39599609375, "loss": 0.6965, "rewards/accuracies": 0.84375, "rewards/chosen": 1.095994234085083, "rewards/margins": 2.124889612197876, "rewards/rejected": -1.028895378112793, "step": 600 }, { "epoch": 1.375823546261816, "grad_norm": 14.901665687561035, "learning_rate": 8.623853211009174e-07, "logits/chosen": -2.8461523056030273, "logits/rejected": -2.7840561866760254, "logps/chosen": -43.32239532470703, "logps/rejected": -71.77783203125, "loss": 0.6853, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1518932580947876, "rewards/margins": 2.321770668029785, "rewards/rejected": -1.1698776483535767, "step": 601 }, { "epoch": 1.3781151532512175, "grad_norm": 11.075509071350098, "learning_rate": 8.621559633027523e-07, "logits/chosen": -2.855404853820801, "logits/rejected": -2.894664764404297, "logps/chosen": -42.38547134399414, "logps/rejected": -70.95701599121094, "loss": 0.6506, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4999374151229858, "rewards/margins": 2.4527578353881836, "rewards/rejected": -0.9528204798698425, "step": 602 }, { "epoch": 1.3804067602406187, "grad_norm": 7.150477409362793, "learning_rate": 8.619266055045871e-07, "logits/chosen": -2.9003219604492188, "logits/rejected": -2.9543867111206055, "logps/chosen": -41.37987518310547, "logps/rejected": -70.01649475097656, "loss": 0.6495, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2313685417175293, "rewards/margins": 2.5954413414001465, "rewards/rejected": -1.3640730381011963, "step": 603 }, { "epoch": 1.3826983672300202, "grad_norm": 8.067315101623535, "learning_rate": 8.616972477064219e-07, "logits/chosen": -2.948819875717163, "logits/rejected": -2.9611656665802, "logps/chosen": -40.691490173339844, "logps/rejected": -69.02455139160156, "loss": 0.7418, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2489900588989258, "rewards/margins": 2.3590219020843506, "rewards/rejected": -1.1100317239761353, "step": 604 }, { "epoch": 1.3849899742194214, "grad_norm": 7.433182716369629, "learning_rate": 8.614678899082568e-07, "logits/chosen": -2.882449150085449, "logits/rejected": -2.888458728790283, "logps/chosen": -42.14861297607422, "logps/rejected": -75.28173828125, "loss": 0.5992, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2466710805892944, "rewards/margins": 3.04638934135437, "rewards/rejected": -1.7997183799743652, "step": 605 }, { "epoch": 1.3872815812088226, "grad_norm": 11.271148681640625, "learning_rate": 8.612385321100916e-07, "logits/chosen": -2.9644436836242676, "logits/rejected": -2.9482033252716064, "logps/chosen": -42.420963287353516, "logps/rejected": -75.2066650390625, "loss": 0.5819, "rewards/accuracies": 0.90625, "rewards/chosen": 1.058891773223877, "rewards/margins": 2.7276597023010254, "rewards/rejected": -1.6687679290771484, "step": 606 }, { "epoch": 1.389573188198224, "grad_norm": 9.357904434204102, "learning_rate": 8.610091743119266e-07, "logits/chosen": -2.9211158752441406, "logits/rejected": -2.915501832962036, "logps/chosen": -49.67837905883789, "logps/rejected": -81.26661682128906, "loss": 0.6376, "rewards/accuracies": 0.875, "rewards/chosen": 0.6344574689865112, "rewards/margins": 2.760791063308716, "rewards/rejected": -2.126333236694336, "step": 607 }, { "epoch": 1.3918647951876253, "grad_norm": 12.836385726928711, "learning_rate": 8.607798165137614e-07, "logits/chosen": -2.9050216674804688, "logits/rejected": -2.8699874877929688, "logps/chosen": -42.58396911621094, "logps/rejected": -77.01746368408203, "loss": 0.5764, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8838998079299927, "rewards/margins": 2.9684462547302246, "rewards/rejected": -2.0845465660095215, "step": 608 }, { "epoch": 1.3941564021770265, "grad_norm": 8.80113410949707, "learning_rate": 8.605504587155962e-07, "logits/chosen": -2.792681932449341, "logits/rejected": -2.8532960414886475, "logps/chosen": -52.98316192626953, "logps/rejected": -77.21434783935547, "loss": 0.7481, "rewards/accuracies": 0.75, "rewards/chosen": 0.25816458463668823, "rewards/margins": 2.094980001449585, "rewards/rejected": -1.836815595626831, "step": 609 }, { "epoch": 1.396448009166428, "grad_norm": 8.216728210449219, "learning_rate": 8.603211009174312e-07, "logits/chosen": -2.8916008472442627, "logits/rejected": -2.8382389545440674, "logps/chosen": -39.735836029052734, "logps/rejected": -81.77629852294922, "loss": 0.6234, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2735061645507812, "rewards/margins": 3.5250840187072754, "rewards/rejected": -2.251577615737915, "step": 610 }, { "epoch": 1.3987396161558292, "grad_norm": 7.436587333679199, "learning_rate": 8.60091743119266e-07, "logits/chosen": -2.930312156677246, "logits/rejected": -2.934128761291504, "logps/chosen": -35.30278778076172, "logps/rejected": -66.5207290649414, "loss": 0.6383, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5452461242675781, "rewards/margins": 2.680253744125366, "rewards/rejected": -1.135007619857788, "step": 611 }, { "epoch": 1.4010312231452307, "grad_norm": 8.653244018554688, "learning_rate": 8.598623853211009e-07, "logits/chosen": -2.813070297241211, "logits/rejected": -2.8515892028808594, "logps/chosen": -39.92707061767578, "logps/rejected": -77.07654571533203, "loss": 0.5697, "rewards/accuracies": 0.875, "rewards/chosen": 1.5412923097610474, "rewards/margins": 3.2486062049865723, "rewards/rejected": -1.707313895225525, "step": 612 }, { "epoch": 1.4033228301346319, "grad_norm": 10.808942794799805, "learning_rate": 8.596330275229357e-07, "logits/chosen": -2.8140816688537598, "logits/rejected": -2.838475227355957, "logps/chosen": -57.91594314575195, "logps/rejected": -81.1737289428711, "loss": 0.7576, "rewards/accuracies": 0.84375, "rewards/chosen": 0.003972157835960388, "rewards/margins": 2.060753583908081, "rewards/rejected": -2.05678129196167, "step": 613 }, { "epoch": 1.4056144371240333, "grad_norm": 8.314009666442871, "learning_rate": 8.594036697247707e-07, "logits/chosen": -2.847810745239258, "logits/rejected": -2.8622548580169678, "logps/chosen": -43.63530731201172, "logps/rejected": -81.65909576416016, "loss": 0.5739, "rewards/accuracies": 0.875, "rewards/chosen": 1.0303195714950562, "rewards/margins": 3.1968836784362793, "rewards/rejected": -2.1665639877319336, "step": 614 }, { "epoch": 1.4079060441134346, "grad_norm": 9.584829330444336, "learning_rate": 8.591743119266055e-07, "logits/chosen": -2.819286584854126, "logits/rejected": -2.8516993522644043, "logps/chosen": -49.14889144897461, "logps/rejected": -87.61512756347656, "loss": 0.5957, "rewards/accuracies": 0.90625, "rewards/chosen": 0.556250274181366, "rewards/margins": 3.1452267169952393, "rewards/rejected": -2.5889763832092285, "step": 615 }, { "epoch": 1.4101976511028358, "grad_norm": 12.810009956359863, "learning_rate": 8.589449541284403e-07, "logits/chosen": -2.9264636039733887, "logits/rejected": -2.8908326625823975, "logps/chosen": -52.65846252441406, "logps/rejected": -87.58707427978516, "loss": 0.6916, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3847886025905609, "rewards/margins": 2.9259774684906006, "rewards/rejected": -2.5411887168884277, "step": 616 }, { "epoch": 1.4101976511028358, "eval_logits/chosen": -2.952559232711792, "eval_logits/rejected": -2.9702305793762207, "eval_logps/chosen": -54.68046569824219, "eval_logps/rejected": -81.79021453857422, "eval_loss": 0.715264081954956, "eval_rewards/accuracies": 0.850943386554718, "eval_rewards/chosen": 0.23818878829479218, "eval_rewards/margins": 2.3122661113739014, "eval_rewards/rejected": -2.0740773677825928, "eval_runtime": 958.2217, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.277, "step": 616 }, { "epoch": 1.4124892580922372, "grad_norm": 9.928647994995117, "learning_rate": 8.587155963302753e-07, "logits/chosen": -2.9278523921966553, "logits/rejected": -2.955611228942871, "logps/chosen": -40.395896911621094, "logps/rejected": -79.88941192626953, "loss": 0.5759, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2621445655822754, "rewards/margins": 3.482513666152954, "rewards/rejected": -2.2203691005706787, "step": 617 }, { "epoch": 1.4147808650816385, "grad_norm": 9.075573921203613, "learning_rate": 8.584862385321101e-07, "logits/chosen": -2.8244311809539795, "logits/rejected": -2.849541664123535, "logps/chosen": -37.362483978271484, "logps/rejected": -74.02214050292969, "loss": 0.6324, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5691890716552734, "rewards/margins": 3.267742872238159, "rewards/rejected": -1.6985535621643066, "step": 618 }, { "epoch": 1.4170724720710397, "grad_norm": 8.212471008300781, "learning_rate": 8.582568807339449e-07, "logits/chosen": -2.8175673484802246, "logits/rejected": -2.820033550262451, "logps/chosen": -47.03791046142578, "logps/rejected": -79.6719741821289, "loss": 0.6698, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7067160606384277, "rewards/margins": 2.7980082035064697, "rewards/rejected": -2.091292142868042, "step": 619 }, { "epoch": 1.4193640790604412, "grad_norm": 11.878388404846191, "learning_rate": 8.580275229357797e-07, "logits/chosen": -2.8408846855163574, "logits/rejected": -2.812953472137451, "logps/chosen": -39.83729934692383, "logps/rejected": -71.3825912475586, "loss": 0.6088, "rewards/accuracies": 0.875, "rewards/chosen": 1.4759650230407715, "rewards/margins": 2.7259023189544678, "rewards/rejected": -1.2499371767044067, "step": 620 }, { "epoch": 1.4216556860498424, "grad_norm": 8.7399320602417, "learning_rate": 8.577981651376146e-07, "logits/chosen": -2.856846332550049, "logits/rejected": -2.841735363006592, "logps/chosen": -53.4182014465332, "logps/rejected": -82.55443572998047, "loss": 0.6902, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5496328473091125, "rewards/margins": 2.4926626682281494, "rewards/rejected": -1.9430298805236816, "step": 621 }, { "epoch": 1.4239472930392438, "grad_norm": 11.040484428405762, "learning_rate": 8.575688073394495e-07, "logits/chosen": -2.8068854808807373, "logits/rejected": -2.8828437328338623, "logps/chosen": -40.49285125732422, "logps/rejected": -72.60716247558594, "loss": 0.6359, "rewards/accuracies": 0.875, "rewards/chosen": 1.1114157438278198, "rewards/margins": 2.662277936935425, "rewards/rejected": -1.5508626699447632, "step": 622 }, { "epoch": 1.426238900028645, "grad_norm": 9.411283493041992, "learning_rate": 8.573394495412843e-07, "logits/chosen": -2.8755249977111816, "logits/rejected": -2.8758320808410645, "logps/chosen": -48.3846435546875, "logps/rejected": -74.28535461425781, "loss": 0.7258, "rewards/accuracies": 0.84375, "rewards/chosen": 1.088705062866211, "rewards/margins": 2.396620750427246, "rewards/rejected": -1.3079155683517456, "step": 623 }, { "epoch": 1.4285305070180465, "grad_norm": 13.242238998413086, "learning_rate": 8.571100917431193e-07, "logits/chosen": -2.8979175090789795, "logits/rejected": -2.859593629837036, "logps/chosen": -39.539302825927734, "logps/rejected": -69.07128143310547, "loss": 0.6586, "rewards/accuracies": 0.84375, "rewards/chosen": 1.64224374294281, "rewards/margins": 2.6290998458862305, "rewards/rejected": -0.9868561029434204, "step": 624 }, { "epoch": 1.4308221140074477, "grad_norm": 13.903903007507324, "learning_rate": 8.568807339449541e-07, "logits/chosen": -2.8000710010528564, "logits/rejected": -2.844153881072998, "logps/chosen": -37.08573913574219, "logps/rejected": -63.38592529296875, "loss": 0.7267, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7144157886505127, "rewards/margins": 2.270879030227661, "rewards/rejected": -0.5564632415771484, "step": 625 }, { "epoch": 1.433113720996849, "grad_norm": 10.213741302490234, "learning_rate": 8.566513761467889e-07, "logits/chosen": -2.8882858753204346, "logits/rejected": -2.8824105262756348, "logps/chosen": -44.55916976928711, "logps/rejected": -79.88733673095703, "loss": 0.6852, "rewards/accuracies": 0.9375, "rewards/chosen": 1.195430874824524, "rewards/margins": 2.913625717163086, "rewards/rejected": -1.718194603919983, "step": 626 }, { "epoch": 1.4354053279862504, "grad_norm": 9.8517427444458, "learning_rate": 8.564220183486238e-07, "logits/chosen": -2.9259724617004395, "logits/rejected": -2.911530017852783, "logps/chosen": -42.86736297607422, "logps/rejected": -77.16443634033203, "loss": 0.672, "rewards/accuracies": 0.875, "rewards/chosen": 1.0624048709869385, "rewards/margins": 2.6847524642944336, "rewards/rejected": -1.6223475933074951, "step": 627 }, { "epoch": 1.4376969349756517, "grad_norm": 10.56619644165039, "learning_rate": 8.561926605504587e-07, "logits/chosen": -2.867980718612671, "logits/rejected": -2.8512821197509766, "logps/chosen": -46.20037841796875, "logps/rejected": -79.48617553710938, "loss": 0.6031, "rewards/accuracies": 0.875, "rewards/chosen": 0.8297080397605896, "rewards/margins": 2.8533945083618164, "rewards/rejected": -2.023686408996582, "step": 628 }, { "epoch": 1.4399885419650529, "grad_norm": 20.46930503845215, "learning_rate": 8.559633027522936e-07, "logits/chosen": -2.808973789215088, "logits/rejected": -2.8104798793792725, "logps/chosen": -46.65483856201172, "logps/rejected": -78.6059799194336, "loss": 0.6513, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9579139351844788, "rewards/margins": 2.4699220657348633, "rewards/rejected": -1.5120081901550293, "step": 629 }, { "epoch": 1.4422801489544543, "grad_norm": 7.506832599639893, "learning_rate": 8.557339449541284e-07, "logits/chosen": -2.887295961380005, "logits/rejected": -2.8773674964904785, "logps/chosen": -36.037864685058594, "logps/rejected": -69.67924499511719, "loss": 0.6182, "rewards/accuracies": 0.9375, "rewards/chosen": 1.617605447769165, "rewards/margins": 2.8373894691467285, "rewards/rejected": -1.219784140586853, "step": 630 }, { "epoch": 1.4445717559438556, "grad_norm": 7.9775214195251465, "learning_rate": 8.555045871559634e-07, "logits/chosen": -2.916804790496826, "logits/rejected": -2.8767330646514893, "logps/chosen": -38.8465461730957, "logps/rejected": -72.0118637084961, "loss": 0.6231, "rewards/accuracies": 0.875, "rewards/chosen": 1.6153271198272705, "rewards/margins": 3.001117706298828, "rewards/rejected": -1.3857905864715576, "step": 631 }, { "epoch": 1.446863362933257, "grad_norm": 10.351363182067871, "learning_rate": 8.552752293577982e-07, "logits/chosen": -2.8857240676879883, "logits/rejected": -2.8707313537597656, "logps/chosen": -38.78240203857422, "logps/rejected": -68.77133178710938, "loss": 0.6333, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3102912902832031, "rewards/margins": 2.7243270874023438, "rewards/rejected": -1.4140357971191406, "step": 632 }, { "epoch": 1.4491549699226582, "grad_norm": 8.843343734741211, "learning_rate": 8.55045871559633e-07, "logits/chosen": -2.84979248046875, "logits/rejected": -2.83492112159729, "logps/chosen": -48.517547607421875, "logps/rejected": -76.95475769042969, "loss": 0.683, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6454142928123474, "rewards/margins": 2.4394736289978027, "rewards/rejected": -1.7940593957901, "step": 633 }, { "epoch": 1.4514465769120597, "grad_norm": 15.761726379394531, "learning_rate": 8.548165137614679e-07, "logits/chosen": -2.8652358055114746, "logits/rejected": -2.889213800430298, "logps/chosen": -40.928688049316406, "logps/rejected": -77.83663940429688, "loss": 0.5547, "rewards/accuracies": 0.875, "rewards/chosen": 1.328755259513855, "rewards/margins": 3.4346539974212646, "rewards/rejected": -2.105898141860962, "step": 634 }, { "epoch": 1.453738183901461, "grad_norm": 15.78619384765625, "learning_rate": 8.545871559633027e-07, "logits/chosen": -2.881309986114502, "logits/rejected": -2.911236047744751, "logps/chosen": -48.48576736450195, "logps/rejected": -87.89336395263672, "loss": 0.668, "rewards/accuracies": 0.875, "rewards/chosen": 0.770766019821167, "rewards/margins": 3.469512939453125, "rewards/rejected": -2.698747158050537, "step": 635 }, { "epoch": 1.4560297908908622, "grad_norm": 7.58271598815918, "learning_rate": 8.543577981651376e-07, "logits/chosen": -2.814908027648926, "logits/rejected": -2.8481674194335938, "logps/chosen": -52.19138717651367, "logps/rejected": -80.4373779296875, "loss": 0.7274, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4150330424308777, "rewards/margins": 2.46997332572937, "rewards/rejected": -2.0549399852752686, "step": 636 }, { "epoch": 1.4583213978802636, "grad_norm": 9.59171199798584, "learning_rate": 8.541284403669724e-07, "logits/chosen": -2.9179418087005615, "logits/rejected": -2.896697998046875, "logps/chosen": -39.71175765991211, "logps/rejected": -78.67866516113281, "loss": 0.6163, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2562167644500732, "rewards/margins": 3.1108028888702393, "rewards/rejected": -1.8545863628387451, "step": 637 }, { "epoch": 1.4606130048696648, "grad_norm": 7.381679058074951, "learning_rate": 8.538990825688072e-07, "logits/chosen": -2.759326934814453, "logits/rejected": -2.7768819332122803, "logps/chosen": -51.062530517578125, "logps/rejected": -75.21879577636719, "loss": 0.7417, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6039450764656067, "rewards/margins": 2.023893117904663, "rewards/rejected": -1.4199479818344116, "step": 638 }, { "epoch": 1.462904611859066, "grad_norm": 13.041356086730957, "learning_rate": 8.536697247706422e-07, "logits/chosen": -2.8348212242126465, "logits/rejected": -2.9116251468658447, "logps/chosen": -46.93351745605469, "logps/rejected": -78.0733642578125, "loss": 0.6686, "rewards/accuracies": 0.875, "rewards/chosen": 0.8006098866462708, "rewards/margins": 2.6246402263641357, "rewards/rejected": -1.8240303993225098, "step": 639 }, { "epoch": 1.4651962188484675, "grad_norm": 8.775309562683105, "learning_rate": 8.53440366972477e-07, "logits/chosen": -2.811218500137329, "logits/rejected": -2.8622679710388184, "logps/chosen": -41.430442810058594, "logps/rejected": -79.81686401367188, "loss": 0.5627, "rewards/accuracies": 0.9375, "rewards/chosen": 1.453287124633789, "rewards/margins": 3.4994659423828125, "rewards/rejected": -2.0461785793304443, "step": 640 }, { "epoch": 1.4674878258378687, "grad_norm": 7.97914457321167, "learning_rate": 8.532110091743119e-07, "logits/chosen": -2.8320488929748535, "logits/rejected": -2.868112087249756, "logps/chosen": -48.94407272338867, "logps/rejected": -76.65811157226562, "loss": 0.6687, "rewards/accuracies": 0.84375, "rewards/chosen": 0.760679304599762, "rewards/margins": 2.5605993270874023, "rewards/rejected": -1.799919843673706, "step": 641 }, { "epoch": 1.4697794328272702, "grad_norm": 12.7265625, "learning_rate": 8.529816513761468e-07, "logits/chosen": -2.7988603115081787, "logits/rejected": -2.8734326362609863, "logps/chosen": -45.485801696777344, "logps/rejected": -75.558837890625, "loss": 0.6588, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9026385545730591, "rewards/margins": 2.540165662765503, "rewards/rejected": -1.6375268697738647, "step": 642 }, { "epoch": 1.4720710398166714, "grad_norm": 9.386987686157227, "learning_rate": 8.527522935779816e-07, "logits/chosen": -2.9192590713500977, "logits/rejected": -2.929123640060425, "logps/chosen": -42.362789154052734, "logps/rejected": -72.30436706542969, "loss": 0.6765, "rewards/accuracies": 0.875, "rewards/chosen": 1.1608822345733643, "rewards/margins": 2.604875087738037, "rewards/rejected": -1.4439926147460938, "step": 643 }, { "epoch": 1.4743626468060729, "grad_norm": 8.719470977783203, "learning_rate": 8.525229357798165e-07, "logits/chosen": -2.8398704528808594, "logits/rejected": -2.851740598678589, "logps/chosen": -39.10048294067383, "logps/rejected": -67.44645690917969, "loss": 0.6707, "rewards/accuracies": 0.875, "rewards/chosen": 1.388844609260559, "rewards/margins": 2.3371403217315674, "rewards/rejected": -0.9482958316802979, "step": 644 }, { "epoch": 1.476654253795474, "grad_norm": 12.23056411743164, "learning_rate": 8.522935779816513e-07, "logits/chosen": -2.8667893409729004, "logits/rejected": -2.8736984729766846, "logps/chosen": -47.19331359863281, "logps/rejected": -90.29883575439453, "loss": 0.5337, "rewards/accuracies": 0.875, "rewards/chosen": 0.8695092797279358, "rewards/margins": 3.698547601699829, "rewards/rejected": -2.829038143157959, "step": 645 }, { "epoch": 1.4789458607848753, "grad_norm": 8.939044952392578, "learning_rate": 8.520642201834863e-07, "logits/chosen": -2.8542492389678955, "logits/rejected": -2.840627908706665, "logps/chosen": -41.324771881103516, "logps/rejected": -85.0751953125, "loss": 0.5025, "rewards/accuracies": 0.9375, "rewards/chosen": 1.242061972618103, "rewards/margins": 3.7728822231292725, "rewards/rejected": -2.530820369720459, "step": 646 }, { "epoch": 1.4812374677742768, "grad_norm": 9.621044158935547, "learning_rate": 8.518348623853211e-07, "logits/chosen": -2.7866756916046143, "logits/rejected": -2.8216440677642822, "logps/chosen": -45.769981384277344, "logps/rejected": -84.96842956542969, "loss": 0.5748, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0143550634384155, "rewards/margins": 3.258186101913452, "rewards/rejected": -2.243830919265747, "step": 647 }, { "epoch": 1.483529074763678, "grad_norm": 10.42885684967041, "learning_rate": 8.516055045871559e-07, "logits/chosen": -2.85745906829834, "logits/rejected": -2.8789732456207275, "logps/chosen": -42.41094207763672, "logps/rejected": -74.0433349609375, "loss": 0.6779, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3828656673431396, "rewards/margins": 2.8537275791168213, "rewards/rejected": -1.4708619117736816, "step": 648 }, { "epoch": 1.4858206817530792, "grad_norm": 7.14238166809082, "learning_rate": 8.513761467889908e-07, "logits/chosen": -2.801781177520752, "logits/rejected": -2.8532228469848633, "logps/chosen": -41.826576232910156, "logps/rejected": -83.31965637207031, "loss": 0.5694, "rewards/accuracies": 0.9375, "rewards/chosen": 1.19049072265625, "rewards/margins": 3.6057515144348145, "rewards/rejected": -2.4152612686157227, "step": 649 }, { "epoch": 1.4881122887424807, "grad_norm": 9.733274459838867, "learning_rate": 8.511467889908257e-07, "logits/chosen": -2.8341736793518066, "logits/rejected": -2.870314836502075, "logps/chosen": -42.12228775024414, "logps/rejected": -86.74010467529297, "loss": 0.4962, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2421455383300781, "rewards/margins": 3.823962926864624, "rewards/rejected": -2.581817150115967, "step": 650 }, { "epoch": 1.490403895731882, "grad_norm": 10.71160888671875, "learning_rate": 8.509174311926605e-07, "logits/chosen": -2.8147711753845215, "logits/rejected": -2.806711196899414, "logps/chosen": -42.00555419921875, "logps/rejected": -75.6572036743164, "loss": 0.6002, "rewards/accuracies": 0.96875, "rewards/chosen": 1.306767225265503, "rewards/margins": 2.948071241378784, "rewards/rejected": -1.6413040161132812, "step": 651 }, { "epoch": 1.4926955027212834, "grad_norm": 8.326332092285156, "learning_rate": 8.506880733944953e-07, "logits/chosen": -2.8649933338165283, "logits/rejected": -2.856771230697632, "logps/chosen": -45.855796813964844, "logps/rejected": -78.6924819946289, "loss": 0.6363, "rewards/accuracies": 0.75, "rewards/chosen": 0.9129177927970886, "rewards/margins": 2.821903705596924, "rewards/rejected": -1.9089860916137695, "step": 652 }, { "epoch": 1.4949871097106846, "grad_norm": 20.583724975585938, "learning_rate": 8.504587155963302e-07, "logits/chosen": -2.904496669769287, "logits/rejected": -2.9028329849243164, "logps/chosen": -46.10087966918945, "logps/rejected": -88.77313232421875, "loss": 0.552, "rewards/accuracies": 0.875, "rewards/chosen": 0.8871656656265259, "rewards/margins": 3.6344027519226074, "rewards/rejected": -2.747237205505371, "step": 653 }, { "epoch": 1.497278716700086, "grad_norm": 12.387661933898926, "learning_rate": 8.502293577981651e-07, "logits/chosen": -2.9128549098968506, "logits/rejected": -2.859240770339966, "logps/chosen": -45.2047119140625, "logps/rejected": -89.76052856445312, "loss": 0.6009, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6719737648963928, "rewards/margins": 3.6692566871643066, "rewards/rejected": -2.9972829818725586, "step": 654 }, { "epoch": 1.4995703236894873, "grad_norm": 7.692575454711914, "learning_rate": 8.499999999999999e-07, "logits/chosen": -2.8689863681793213, "logits/rejected": -2.8566665649414062, "logps/chosen": -43.483943939208984, "logps/rejected": -84.9565658569336, "loss": 0.5683, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0576555728912354, "rewards/margins": 3.33828067779541, "rewards/rejected": -2.280625343322754, "step": 655 }, { "epoch": 1.5018619306788885, "grad_norm": 12.441452026367188, "learning_rate": 8.497706422018348e-07, "logits/chosen": -2.9000492095947266, "logits/rejected": -2.9471724033355713, "logps/chosen": -46.383262634277344, "logps/rejected": -84.62969970703125, "loss": 0.5617, "rewards/accuracies": 0.9375, "rewards/chosen": 0.781667172908783, "rewards/margins": 3.3581132888793945, "rewards/rejected": -2.576446056365967, "step": 656 }, { "epoch": 1.50415353766829, "grad_norm": 11.70147705078125, "learning_rate": 8.495412844036697e-07, "logits/chosen": -2.8797383308410645, "logits/rejected": -2.921459436416626, "logps/chosen": -36.3912353515625, "logps/rejected": -71.44070434570312, "loss": 0.5751, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7171618938446045, "rewards/margins": 2.926177740097046, "rewards/rejected": -1.2090160846710205, "step": 657 }, { "epoch": 1.5064451446576912, "grad_norm": 9.182726860046387, "learning_rate": 8.493119266055046e-07, "logits/chosen": -2.8854551315307617, "logits/rejected": -2.9233360290527344, "logps/chosen": -42.15067672729492, "logps/rejected": -80.06953430175781, "loss": 0.6155, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1754579544067383, "rewards/margins": 3.1489505767822266, "rewards/rejected": -1.9734928607940674, "step": 658 }, { "epoch": 1.5087367516470924, "grad_norm": 10.549168586730957, "learning_rate": 8.490825688073394e-07, "logits/chosen": -2.8714611530303955, "logits/rejected": -2.8668203353881836, "logps/chosen": -39.899024963378906, "logps/rejected": -68.51654052734375, "loss": 0.6807, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4041790962219238, "rewards/margins": 2.4997005462646484, "rewards/rejected": -1.0955214500427246, "step": 659 }, { "epoch": 1.5110283586364939, "grad_norm": 13.112060546875, "learning_rate": 8.488532110091742e-07, "logits/chosen": -2.949186086654663, "logits/rejected": -3.028864622116089, "logps/chosen": -38.16391372680664, "logps/rejected": -74.20191955566406, "loss": 0.5969, "rewards/accuracies": 0.875, "rewards/chosen": 1.4858521223068237, "rewards/margins": 2.993687629699707, "rewards/rejected": -1.507835865020752, "step": 660 }, { "epoch": 1.5133199656258953, "grad_norm": 12.447436332702637, "learning_rate": 8.486238532110092e-07, "logits/chosen": -2.845137119293213, "logits/rejected": -2.8622865676879883, "logps/chosen": -32.26686477661133, "logps/rejected": -74.69081115722656, "loss": 0.5278, "rewards/accuracies": 0.9375, "rewards/chosen": 2.123319625854492, "rewards/margins": 3.5802559852600098, "rewards/rejected": -1.456936240196228, "step": 661 }, { "epoch": 1.5156115726152963, "grad_norm": 15.039712905883789, "learning_rate": 8.48394495412844e-07, "logits/chosen": -2.8884057998657227, "logits/rejected": -2.877969264984131, "logps/chosen": -38.789180755615234, "logps/rejected": -73.2402114868164, "loss": 0.5908, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4062228202819824, "rewards/margins": 2.8179969787597656, "rewards/rejected": -1.4117741584777832, "step": 662 }, { "epoch": 1.5179031796046978, "grad_norm": 8.439352035522461, "learning_rate": 8.481651376146789e-07, "logits/chosen": -2.951524496078491, "logits/rejected": -2.9352035522460938, "logps/chosen": -48.25758361816406, "logps/rejected": -84.85565185546875, "loss": 0.6691, "rewards/accuracies": 0.84375, "rewards/chosen": 0.657880961894989, "rewards/margins": 2.9545533657073975, "rewards/rejected": -2.2966723442077637, "step": 663 }, { "epoch": 1.5201947865940992, "grad_norm": 10.082240104675293, "learning_rate": 8.479357798165138e-07, "logits/chosen": -2.793243408203125, "logits/rejected": -2.8050718307495117, "logps/chosen": -47.175941467285156, "logps/rejected": -82.60867309570312, "loss": 0.6833, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7520779371261597, "rewards/margins": 3.045328140258789, "rewards/rejected": -2.293250322341919, "step": 664 }, { "epoch": 1.5224863935835005, "grad_norm": 9.2249116897583, "learning_rate": 8.477064220183486e-07, "logits/chosen": -2.8130459785461426, "logits/rejected": -2.862004041671753, "logps/chosen": -48.71665954589844, "logps/rejected": -78.87068176269531, "loss": 0.6794, "rewards/accuracies": 0.84375, "rewards/chosen": 0.713622510433197, "rewards/margins": 2.4717535972595215, "rewards/rejected": -1.7581312656402588, "step": 665 }, { "epoch": 1.5247780005729017, "grad_norm": 9.677994728088379, "learning_rate": 8.474770642201835e-07, "logits/chosen": -2.885803699493408, "logits/rejected": -2.926318645477295, "logps/chosen": -52.571292877197266, "logps/rejected": -83.09210205078125, "loss": 0.6619, "rewards/accuracies": 0.875, "rewards/chosen": 0.23484408855438232, "rewards/margins": 2.558126211166382, "rewards/rejected": -2.323282241821289, "step": 666 }, { "epoch": 1.5270696075623031, "grad_norm": 10.353286743164062, "learning_rate": 8.472477064220182e-07, "logits/chosen": -2.8817880153656006, "logits/rejected": -2.8948001861572266, "logps/chosen": -44.624229431152344, "logps/rejected": -75.25517272949219, "loss": 0.6936, "rewards/accuracies": 1.0, "rewards/chosen": 1.1744695901870728, "rewards/margins": 2.5120351314544678, "rewards/rejected": -1.3375654220581055, "step": 667 }, { "epoch": 1.5293612145517044, "grad_norm": 9.712587356567383, "learning_rate": 8.470183486238532e-07, "logits/chosen": -2.8958661556243896, "logits/rejected": -2.911832094192505, "logps/chosen": -52.57347869873047, "logps/rejected": -82.3548583984375, "loss": 0.7456, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1005287617444992, "rewards/margins": 2.528218984603882, "rewards/rejected": -2.4276905059814453, "step": 668 }, { "epoch": 1.5316528215411056, "grad_norm": 9.267205238342285, "learning_rate": 8.46788990825688e-07, "logits/chosen": -2.8241708278656006, "logits/rejected": -2.869570732116699, "logps/chosen": -35.0711669921875, "logps/rejected": -66.66667175292969, "loss": 0.6452, "rewards/accuracies": 0.9375, "rewards/chosen": 1.680396318435669, "rewards/margins": 2.7979400157928467, "rewards/rejected": -1.1175434589385986, "step": 669 }, { "epoch": 1.533944428530507, "grad_norm": 9.696305274963379, "learning_rate": 8.465596330275228e-07, "logits/chosen": -2.991651773452759, "logits/rejected": -2.997159481048584, "logps/chosen": -38.80712890625, "logps/rejected": -77.19237518310547, "loss": 0.5732, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4427982568740845, "rewards/margins": 3.391249179840088, "rewards/rejected": -1.9484509229660034, "step": 670 }, { "epoch": 1.5362360355199083, "grad_norm": 14.296496391296387, "learning_rate": 8.463302752293578e-07, "logits/chosen": -2.8705129623413086, "logits/rejected": -2.85448956489563, "logps/chosen": -40.71278762817383, "logps/rejected": -84.68449401855469, "loss": 0.6109, "rewards/accuracies": 0.90625, "rewards/chosen": 1.395904779434204, "rewards/margins": 3.70338773727417, "rewards/rejected": -2.307482957839966, "step": 671 }, { "epoch": 1.5385276425093095, "grad_norm": 12.56131362915039, "learning_rate": 8.461009174311926e-07, "logits/chosen": -2.9165024757385254, "logits/rejected": -2.907533645629883, "logps/chosen": -44.88904571533203, "logps/rejected": -88.06713104248047, "loss": 0.5277, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0844552516937256, "rewards/margins": 3.8598384857177734, "rewards/rejected": -2.775383472442627, "step": 672 }, { "epoch": 1.540819249498711, "grad_norm": 10.057723045349121, "learning_rate": 8.458715596330275e-07, "logits/chosen": -2.8927693367004395, "logits/rejected": -2.930654525756836, "logps/chosen": -41.874393463134766, "logps/rejected": -76.42192077636719, "loss": 0.6117, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9873716831207275, "rewards/margins": 3.1013903617858887, "rewards/rejected": -2.1140189170837402, "step": 673 }, { "epoch": 1.5431108564881124, "grad_norm": 8.979331970214844, "learning_rate": 8.456422018348623e-07, "logits/chosen": -2.9157562255859375, "logits/rejected": -2.9199576377868652, "logps/chosen": -49.44228744506836, "logps/rejected": -85.78240203857422, "loss": 0.5892, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6188192367553711, "rewards/margins": 3.155687093734741, "rewards/rejected": -2.53686785697937, "step": 674 }, { "epoch": 1.5454024634775136, "grad_norm": 11.762375831604004, "learning_rate": 8.454128440366972e-07, "logits/chosen": -2.9312937259674072, "logits/rejected": -2.933810234069824, "logps/chosen": -42.383304595947266, "logps/rejected": -77.70027923583984, "loss": 0.6319, "rewards/accuracies": 0.90625, "rewards/chosen": 1.404364824295044, "rewards/margins": 3.109525680541992, "rewards/rejected": -1.7051607370376587, "step": 675 }, { "epoch": 1.5476940704669149, "grad_norm": 13.40396785736084, "learning_rate": 8.451834862385321e-07, "logits/chosen": -2.9065558910369873, "logits/rejected": -2.9121482372283936, "logps/chosen": -38.08701705932617, "logps/rejected": -82.93744659423828, "loss": 0.499, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4024333953857422, "rewards/margins": 3.915031909942627, "rewards/rejected": -2.512598991394043, "step": 676 }, { "epoch": 1.5499856774563163, "grad_norm": 20.498889923095703, "learning_rate": 8.449541284403669e-07, "logits/chosen": -3.0073983669281006, "logits/rejected": -2.988931894302368, "logps/chosen": -40.33797836303711, "logps/rejected": -69.21318817138672, "loss": 0.7186, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2987688779830933, "rewards/margins": 2.241544008255005, "rewards/rejected": -0.9427750110626221, "step": 677 }, { "epoch": 1.5522772844457176, "grad_norm": 7.481748580932617, "learning_rate": 8.447247706422019e-07, "logits/chosen": -2.8969991207122803, "logits/rejected": -2.942237615585327, "logps/chosen": -40.11172103881836, "logps/rejected": -79.87982177734375, "loss": 0.5574, "rewards/accuracies": 0.875, "rewards/chosen": 1.4393460750579834, "rewards/margins": 3.576186418533325, "rewards/rejected": -2.136840581893921, "step": 678 }, { "epoch": 1.5545688914351188, "grad_norm": 8.785699844360352, "learning_rate": 8.444954128440367e-07, "logits/chosen": -2.8585305213928223, "logits/rejected": -2.898942470550537, "logps/chosen": -48.993465423583984, "logps/rejected": -84.59280395507812, "loss": 0.606, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8113174438476562, "rewards/margins": 3.1497445106506348, "rewards/rejected": -2.3384273052215576, "step": 679 }, { "epoch": 1.5568604984245202, "grad_norm": 10.853344917297363, "learning_rate": 8.442660550458716e-07, "logits/chosen": -2.8086183071136475, "logits/rejected": -2.9051740169525146, "logps/chosen": -35.38317108154297, "logps/rejected": -77.18121337890625, "loss": 0.5021, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8125231266021729, "rewards/margins": 3.6744794845581055, "rewards/rejected": -1.8619564771652222, "step": 680 }, { "epoch": 1.5591521054139215, "grad_norm": 11.821959495544434, "learning_rate": 8.440366972477064e-07, "logits/chosen": -2.9350674152374268, "logits/rejected": -2.922410011291504, "logps/chosen": -46.37991714477539, "logps/rejected": -81.42961883544922, "loss": 0.6342, "rewards/accuracies": 0.875, "rewards/chosen": 0.7729088068008423, "rewards/margins": 2.999004602432251, "rewards/rejected": -2.2260959148406982, "step": 681 }, { "epoch": 1.5614437124033227, "grad_norm": 8.880218505859375, "learning_rate": 8.438073394495413e-07, "logits/chosen": -2.844217300415039, "logits/rejected": -2.8937950134277344, "logps/chosen": -46.35896301269531, "logps/rejected": -82.3383560180664, "loss": 0.6624, "rewards/accuracies": 0.875, "rewards/chosen": 0.9298986196517944, "rewards/margins": 2.9517452716827393, "rewards/rejected": -2.021846294403076, "step": 682 }, { "epoch": 1.5637353193927241, "grad_norm": 10.137269973754883, "learning_rate": 8.435779816513761e-07, "logits/chosen": -2.871608257293701, "logits/rejected": -2.9431729316711426, "logps/chosen": -47.24868392944336, "logps/rejected": -74.43451690673828, "loss": 0.6523, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8643269538879395, "rewards/margins": 2.511256217956543, "rewards/rejected": -1.6469292640686035, "step": 683 }, { "epoch": 1.5660269263821256, "grad_norm": 11.351330757141113, "learning_rate": 8.433486238532109e-07, "logits/chosen": -2.890495538711548, "logits/rejected": -2.920161724090576, "logps/chosen": -41.50606918334961, "logps/rejected": -76.74214172363281, "loss": 0.6628, "rewards/accuracies": 0.9375, "rewards/chosen": 1.159774661064148, "rewards/margins": 2.7252681255340576, "rewards/rejected": -1.5654933452606201, "step": 684 }, { "epoch": 1.5683185333715268, "grad_norm": 9.097590446472168, "learning_rate": 8.431192660550458e-07, "logits/chosen": -2.994385242462158, "logits/rejected": -2.981905698776245, "logps/chosen": -39.47800064086914, "logps/rejected": -80.77880096435547, "loss": 0.5675, "rewards/accuracies": 0.9375, "rewards/chosen": 1.481347918510437, "rewards/margins": 3.633753776550293, "rewards/rejected": -2.1524057388305664, "step": 685 }, { "epoch": 1.570610140360928, "grad_norm": 16.78045082092285, "learning_rate": 8.428899082568807e-07, "logits/chosen": -2.9269461631774902, "logits/rejected": -2.9275598526000977, "logps/chosen": -41.21281814575195, "logps/rejected": -81.23219299316406, "loss": 0.5673, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4183502197265625, "rewards/margins": 3.435523271560669, "rewards/rejected": -2.0171728134155273, "step": 686 }, { "epoch": 1.5729017473503295, "grad_norm": 13.696126937866211, "learning_rate": 8.426605504587155e-07, "logits/chosen": -2.872276544570923, "logits/rejected": -2.844601631164551, "logps/chosen": -34.0093994140625, "logps/rejected": -73.85432434082031, "loss": 0.5656, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9019908905029297, "rewards/margins": 3.3044333457946777, "rewards/rejected": -1.4024425745010376, "step": 687 }, { "epoch": 1.5751933543397307, "grad_norm": 9.20456600189209, "learning_rate": 8.424311926605504e-07, "logits/chosen": -2.9082653522491455, "logits/rejected": -2.878298759460449, "logps/chosen": -44.955806732177734, "logps/rejected": -88.29766845703125, "loss": 0.5561, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0456064939498901, "rewards/margins": 3.662642002105713, "rewards/rejected": -2.6170356273651123, "step": 688 }, { "epoch": 1.577484961329132, "grad_norm": 12.366024017333984, "learning_rate": 8.422018348623853e-07, "logits/chosen": -2.871135950088501, "logits/rejected": -2.8815948963165283, "logps/chosen": -53.98761749267578, "logps/rejected": -85.16609954833984, "loss": 0.6745, "rewards/accuracies": 0.96875, "rewards/chosen": 0.23123839497566223, "rewards/margins": 2.637178421020508, "rewards/rejected": -2.405940055847168, "step": 689 }, { "epoch": 1.5797765683185334, "grad_norm": 12.576412200927734, "learning_rate": 8.419724770642202e-07, "logits/chosen": -2.8009870052337646, "logits/rejected": -2.785128355026245, "logps/chosen": -54.44746398925781, "logps/rejected": -79.07220458984375, "loss": 0.768, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1757453978061676, "rewards/margins": 2.1848063468933105, "rewards/rejected": -2.009061098098755, "step": 690 }, { "epoch": 1.5820681753079346, "grad_norm": 8.971152305603027, "learning_rate": 8.41743119266055e-07, "logits/chosen": -2.9353370666503906, "logits/rejected": -2.936262607574463, "logps/chosen": -47.24331283569336, "logps/rejected": -76.80866241455078, "loss": 0.704, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6453623175621033, "rewards/margins": 2.401785373687744, "rewards/rejected": -1.756422996520996, "step": 691 }, { "epoch": 1.5843597822973359, "grad_norm": 12.17548942565918, "learning_rate": 8.415137614678898e-07, "logits/chosen": -2.891108512878418, "logits/rejected": -2.883145332336426, "logps/chosen": -38.90196990966797, "logps/rejected": -72.99725341796875, "loss": 0.6565, "rewards/accuracies": 0.84375, "rewards/chosen": 1.525840163230896, "rewards/margins": 2.931427001953125, "rewards/rejected": -1.4055869579315186, "step": 692 }, { "epoch": 1.5866513892867373, "grad_norm": 11.675238609313965, "learning_rate": 8.412844036697248e-07, "logits/chosen": -2.9463207721710205, "logits/rejected": -2.8583929538726807, "logps/chosen": -42.17227554321289, "logps/rejected": -70.59815979003906, "loss": 0.7044, "rewards/accuracies": 0.875, "rewards/chosen": 1.3549765348434448, "rewards/margins": 2.7456181049346924, "rewards/rejected": -1.390641689300537, "step": 693 }, { "epoch": 1.5889429962761388, "grad_norm": 9.6685152053833, "learning_rate": 8.410550458715596e-07, "logits/chosen": -2.9116644859313965, "logits/rejected": -2.9152450561523438, "logps/chosen": -40.482540130615234, "logps/rejected": -81.20694732666016, "loss": 0.5602, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6275233030319214, "rewards/margins": 3.558229923248291, "rewards/rejected": -1.9307063817977905, "step": 694 }, { "epoch": 1.59123460326554, "grad_norm": 20.36893081665039, "learning_rate": 8.408256880733945e-07, "logits/chosen": -2.9716038703918457, "logits/rejected": -2.97558331489563, "logps/chosen": -48.837345123291016, "logps/rejected": -80.85639953613281, "loss": 0.7044, "rewards/accuracies": 0.90625, "rewards/chosen": 0.47689545154571533, "rewards/margins": 2.810865640640259, "rewards/rejected": -2.333970308303833, "step": 695 }, { "epoch": 1.5935262102549412, "grad_norm": 12.986745834350586, "learning_rate": 8.405963302752294e-07, "logits/chosen": -2.9259471893310547, "logits/rejected": -2.989006996154785, "logps/chosen": -43.64246368408203, "logps/rejected": -76.85150146484375, "loss": 0.5811, "rewards/accuracies": 0.90625, "rewards/chosen": 1.348888635635376, "rewards/margins": 2.9181578159332275, "rewards/rejected": -1.5692692995071411, "step": 696 }, { "epoch": 1.5958178172443427, "grad_norm": 8.944034576416016, "learning_rate": 8.403669724770642e-07, "logits/chosen": -2.8576598167419434, "logits/rejected": -2.901155471801758, "logps/chosen": -43.227210998535156, "logps/rejected": -78.7515640258789, "loss": 0.6676, "rewards/accuracies": 0.875, "rewards/chosen": 1.0668349266052246, "rewards/margins": 2.8318896293640137, "rewards/rejected": -1.765054702758789, "step": 697 }, { "epoch": 1.598109424233744, "grad_norm": 9.101919174194336, "learning_rate": 8.401376146788991e-07, "logits/chosen": -2.9358561038970947, "logits/rejected": -2.9434196949005127, "logps/chosen": -39.587642669677734, "logps/rejected": -77.72422790527344, "loss": 0.6123, "rewards/accuracies": 0.9375, "rewards/chosen": 1.421708106994629, "rewards/margins": 3.2252755165100098, "rewards/rejected": -1.8035674095153809, "step": 698 }, { "epoch": 1.6004010312231451, "grad_norm": 10.202115058898926, "learning_rate": 8.399082568807338e-07, "logits/chosen": -2.9398396015167236, "logits/rejected": -2.906552791595459, "logps/chosen": -46.09099578857422, "logps/rejected": -73.56280517578125, "loss": 0.6364, "rewards/accuracies": 0.90625, "rewards/chosen": 0.733620285987854, "rewards/margins": 2.657053232192993, "rewards/rejected": -1.9234329462051392, "step": 699 }, { "epoch": 1.6026926382125466, "grad_norm": 11.112313270568848, "learning_rate": 8.396788990825688e-07, "logits/chosen": -2.8817830085754395, "logits/rejected": -2.9019200801849365, "logps/chosen": -46.40791320800781, "logps/rejected": -83.54972839355469, "loss": 0.5636, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9126853346824646, "rewards/margins": 3.2825536727905273, "rewards/rejected": -2.369868278503418, "step": 700 }, { "epoch": 1.6049842452019478, "grad_norm": 12.237630844116211, "learning_rate": 8.394495412844036e-07, "logits/chosen": -2.9297313690185547, "logits/rejected": -2.923285484313965, "logps/chosen": -43.803985595703125, "logps/rejected": -80.49295043945312, "loss": 0.5599, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1319347620010376, "rewards/margins": 3.2303102016448975, "rewards/rejected": -2.0983753204345703, "step": 701 }, { "epoch": 1.607275852191349, "grad_norm": 6.720021724700928, "learning_rate": 8.392201834862384e-07, "logits/chosen": -2.838008165359497, "logits/rejected": -2.835430860519409, "logps/chosen": -49.406150817871094, "logps/rejected": -75.64675903320312, "loss": 0.6879, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5398799180984497, "rewards/margins": 2.4038631916046143, "rewards/rejected": -1.863983154296875, "step": 702 }, { "epoch": 1.6095674591807505, "grad_norm": 11.426499366760254, "learning_rate": 8.389908256880733e-07, "logits/chosen": -2.9741814136505127, "logits/rejected": -2.9429984092712402, "logps/chosen": -46.67096710205078, "logps/rejected": -88.95132446289062, "loss": 0.6089, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7966777086257935, "rewards/margins": 3.5030596256256104, "rewards/rejected": -2.7063820362091064, "step": 703 }, { "epoch": 1.611859066170152, "grad_norm": 10.501007080078125, "learning_rate": 8.387614678899082e-07, "logits/chosen": -2.908534049987793, "logits/rejected": -2.893742561340332, "logps/chosen": -39.02824020385742, "logps/rejected": -74.93032836914062, "loss": 0.6205, "rewards/accuracies": 0.875, "rewards/chosen": 1.4911885261535645, "rewards/margins": 2.9721384048461914, "rewards/rejected": -1.4809494018554688, "step": 704 }, { "epoch": 1.611859066170152, "eval_logits/chosen": -2.974435806274414, "eval_logits/rejected": -2.9938924312591553, "eval_logps/chosen": -54.73336410522461, "eval_logps/rejected": -83.59025573730469, "eval_loss": 0.7050727605819702, "eval_rewards/accuracies": 0.8584905862808228, "eval_rewards/chosen": 0.23289886116981506, "eval_rewards/margins": 2.4869801998138428, "eval_rewards/rejected": -2.2540814876556396, "eval_runtime": 962.3967, "eval_samples_per_second": 0.55, "eval_steps_per_second": 0.275, "step": 704 }, { "epoch": 1.6141506731595532, "grad_norm": 11.90103816986084, "learning_rate": 8.385321100917431e-07, "logits/chosen": -2.933846950531006, "logits/rejected": -2.9139394760131836, "logps/chosen": -40.88996505737305, "logps/rejected": -70.38346862792969, "loss": 0.7216, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2637696266174316, "rewards/margins": 2.5530216693878174, "rewards/rejected": -1.2892521619796753, "step": 705 }, { "epoch": 1.6164422801489544, "grad_norm": 13.87745189666748, "learning_rate": 8.383027522935779e-07, "logits/chosen": -2.877833366394043, "logits/rejected": -2.8519883155822754, "logps/chosen": -37.10343933105469, "logps/rejected": -76.92779541015625, "loss": 0.5594, "rewards/accuracies": 0.875, "rewards/chosen": 1.5604263544082642, "rewards/margins": 3.368715524673462, "rewards/rejected": -1.808288812637329, "step": 706 }, { "epoch": 1.6187338871383559, "grad_norm": 13.818916320800781, "learning_rate": 8.380733944954129e-07, "logits/chosen": -2.930422306060791, "logits/rejected": -2.9382615089416504, "logps/chosen": -34.139892578125, "logps/rejected": -68.33879852294922, "loss": 0.6397, "rewards/accuracies": 0.84375, "rewards/chosen": 1.717195987701416, "rewards/margins": 2.9953367710113525, "rewards/rejected": -1.278140664100647, "step": 707 }, { "epoch": 1.621025494127757, "grad_norm": 12.67674446105957, "learning_rate": 8.378440366972477e-07, "logits/chosen": -2.903012752532959, "logits/rejected": -2.9118146896362305, "logps/chosen": -40.873252868652344, "logps/rejected": -72.3603515625, "loss": 0.6541, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2818514108657837, "rewards/margins": 2.6667721271514893, "rewards/rejected": -1.3849204778671265, "step": 708 }, { "epoch": 1.6233171011171583, "grad_norm": 9.648687362670898, "learning_rate": 8.376146788990825e-07, "logits/chosen": -2.9426469802856445, "logits/rejected": -2.9180665016174316, "logps/chosen": -42.714847564697266, "logps/rejected": -79.5299301147461, "loss": 0.6103, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0642471313476562, "rewards/margins": 2.9673657417297363, "rewards/rejected": -1.90311861038208, "step": 709 }, { "epoch": 1.6256087081065598, "grad_norm": 14.224353790283203, "learning_rate": 8.373853211009174e-07, "logits/chosen": -2.8553738594055176, "logits/rejected": -2.8619637489318848, "logps/chosen": -37.39741134643555, "logps/rejected": -74.18315887451172, "loss": 0.6163, "rewards/accuracies": 0.875, "rewards/chosen": 1.8193206787109375, "rewards/margins": 3.195861339569092, "rewards/rejected": -1.3765406608581543, "step": 710 }, { "epoch": 1.627900315095961, "grad_norm": 9.953266143798828, "learning_rate": 8.371559633027523e-07, "logits/chosen": -2.8466501235961914, "logits/rejected": -2.868197441101074, "logps/chosen": -36.55986022949219, "logps/rejected": -84.34591674804688, "loss": 0.5222, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5440547466278076, "rewards/margins": 4.117010116577148, "rewards/rejected": -2.57295560836792, "step": 711 }, { "epoch": 1.6301919220853622, "grad_norm": 5.85775089263916, "learning_rate": 8.369266055045872e-07, "logits/chosen": -2.9359683990478516, "logits/rejected": -2.923319101333618, "logps/chosen": -41.18251037597656, "logps/rejected": -80.51973724365234, "loss": 0.6279, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3865267038345337, "rewards/margins": 3.4184727668762207, "rewards/rejected": -2.0319459438323975, "step": 712 }, { "epoch": 1.6324835290747637, "grad_norm": 12.10578441619873, "learning_rate": 8.36697247706422e-07, "logits/chosen": -2.942570209503174, "logits/rejected": -2.9477477073669434, "logps/chosen": -57.74616241455078, "logps/rejected": -88.94952392578125, "loss": 0.6284, "rewards/accuracies": 0.875, "rewards/chosen": -0.12051592767238617, "rewards/margins": 2.8416285514831543, "rewards/rejected": -2.962144374847412, "step": 713 }, { "epoch": 1.6347751360641651, "grad_norm": 17.292177200317383, "learning_rate": 8.364678899082568e-07, "logits/chosen": -2.8996825218200684, "logits/rejected": -2.954008102416992, "logps/chosen": -47.7946891784668, "logps/rejected": -84.24409484863281, "loss": 0.652, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6319940090179443, "rewards/margins": 3.229738712310791, "rewards/rejected": -2.5977447032928467, "step": 714 }, { "epoch": 1.6370667430535664, "grad_norm": 10.9000883102417, "learning_rate": 8.362385321100917e-07, "logits/chosen": -2.9196155071258545, "logits/rejected": -2.872051954269409, "logps/chosen": -54.04612731933594, "logps/rejected": -94.01805114746094, "loss": 0.6154, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2948659658432007, "rewards/margins": 3.3959617614746094, "rewards/rejected": -3.10109543800354, "step": 715 }, { "epoch": 1.6393583500429676, "grad_norm": 10.11577033996582, "learning_rate": 8.360091743119265e-07, "logits/chosen": -2.852400064468384, "logits/rejected": -2.8588974475860596, "logps/chosen": -46.70075225830078, "logps/rejected": -80.97901153564453, "loss": 0.6666, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7197129726409912, "rewards/margins": 2.887910842895508, "rewards/rejected": -2.1681976318359375, "step": 716 }, { "epoch": 1.641649957032369, "grad_norm": 15.969891548156738, "learning_rate": 8.357798165137614e-07, "logits/chosen": -2.8857438564300537, "logits/rejected": -2.8988986015319824, "logps/chosen": -43.30284118652344, "logps/rejected": -82.94864654541016, "loss": 0.6229, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0762128829956055, "rewards/margins": 3.489987373352051, "rewards/rejected": -2.413774013519287, "step": 717 }, { "epoch": 1.6439415640217703, "grad_norm": 7.355677604675293, "learning_rate": 8.355504587155963e-07, "logits/chosen": -2.8730201721191406, "logits/rejected": -2.8921031951904297, "logps/chosen": -54.47871780395508, "logps/rejected": -84.37093353271484, "loss": 0.7273, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2066042721271515, "rewards/margins": 2.775240421295166, "rewards/rejected": -2.568635940551758, "step": 718 }, { "epoch": 1.6462331710111715, "grad_norm": 9.174332618713379, "learning_rate": 8.353211009174311e-07, "logits/chosen": -2.8493010997772217, "logits/rejected": -2.8384034633636475, "logps/chosen": -55.4113883972168, "logps/rejected": -96.4441909790039, "loss": 0.6294, "rewards/accuracies": 0.96875, "rewards/chosen": 0.07241660356521606, "rewards/margins": 3.7041549682617188, "rewards/rejected": -3.6317379474639893, "step": 719 }, { "epoch": 1.648524778000573, "grad_norm": 9.36205768585205, "learning_rate": 8.35091743119266e-07, "logits/chosen": -2.900303602218628, "logits/rejected": -2.920509099960327, "logps/chosen": -53.74786376953125, "logps/rejected": -89.228759765625, "loss": 0.6333, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5662643909454346, "rewards/margins": 3.218395948410034, "rewards/rejected": -2.6521315574645996, "step": 720 }, { "epoch": 1.6508163849899742, "grad_norm": 12.319586753845215, "learning_rate": 8.348623853211008e-07, "logits/chosen": -2.9497737884521484, "logits/rejected": -2.869701385498047, "logps/chosen": -48.04917526245117, "logps/rejected": -74.61832427978516, "loss": 0.7497, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7514667510986328, "rewards/margins": 2.2492430210113525, "rewards/rejected": -1.4977763891220093, "step": 721 }, { "epoch": 1.6531079919793754, "grad_norm": 9.99917984008789, "learning_rate": 8.346330275229358e-07, "logits/chosen": -2.8727126121520996, "logits/rejected": -2.868814468383789, "logps/chosen": -40.57344436645508, "logps/rejected": -75.21702575683594, "loss": 0.5955, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5076615810394287, "rewards/margins": 3.2065932750701904, "rewards/rejected": -1.6989314556121826, "step": 722 }, { "epoch": 1.6553995989687769, "grad_norm": 12.381864547729492, "learning_rate": 8.344036697247706e-07, "logits/chosen": -2.8025288581848145, "logits/rejected": -2.8636369705200195, "logps/chosen": -46.558441162109375, "logps/rejected": -87.04994201660156, "loss": 0.5663, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8081802129745483, "rewards/margins": 3.4059293270111084, "rewards/rejected": -2.5977489948272705, "step": 723 }, { "epoch": 1.6576912059581783, "grad_norm": 13.478499412536621, "learning_rate": 8.341743119266055e-07, "logits/chosen": -2.866220712661743, "logits/rejected": -2.865783214569092, "logps/chosen": -58.82593536376953, "logps/rejected": -86.59798431396484, "loss": 0.6618, "rewards/accuracies": 0.84375, "rewards/chosen": 0.20431600511074066, "rewards/margins": 2.6152596473693848, "rewards/rejected": -2.4109437465667725, "step": 724 }, { "epoch": 1.6599828129475795, "grad_norm": 15.870199203491211, "learning_rate": 8.339449541284404e-07, "logits/chosen": -2.9153687953948975, "logits/rejected": -2.9097907543182373, "logps/chosen": -48.193992614746094, "logps/rejected": -75.57749938964844, "loss": 0.7433, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7597990036010742, "rewards/margins": 2.4540205001831055, "rewards/rejected": -1.6942213773727417, "step": 725 }, { "epoch": 1.6622744199369808, "grad_norm": 9.187024116516113, "learning_rate": 8.337155963302752e-07, "logits/chosen": -2.902306079864502, "logits/rejected": -2.935553550720215, "logps/chosen": -48.716148376464844, "logps/rejected": -79.21375274658203, "loss": 0.664, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5958702564239502, "rewards/margins": 2.731088876724243, "rewards/rejected": -2.135218620300293, "step": 726 }, { "epoch": 1.6645660269263822, "grad_norm": 13.752784729003906, "learning_rate": 8.334862385321101e-07, "logits/chosen": -2.893463134765625, "logits/rejected": -2.8898637294769287, "logps/chosen": -37.43321990966797, "logps/rejected": -73.85372924804688, "loss": 0.5809, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7598062753677368, "rewards/margins": 3.3259012699127197, "rewards/rejected": -1.5660951137542725, "step": 727 }, { "epoch": 1.6668576339157835, "grad_norm": 9.335478782653809, "learning_rate": 8.332568807339449e-07, "logits/chosen": -2.8595616817474365, "logits/rejected": -2.8471949100494385, "logps/chosen": -34.175968170166016, "logps/rejected": -77.02806091308594, "loss": 0.5196, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7366344928741455, "rewards/margins": 3.611443042755127, "rewards/rejected": -1.8748083114624023, "step": 728 }, { "epoch": 1.6691492409051847, "grad_norm": 9.805477142333984, "learning_rate": 8.330275229357799e-07, "logits/chosen": -2.8981449604034424, "logits/rejected": -2.911592483520508, "logps/chosen": -50.0900764465332, "logps/rejected": -79.31938171386719, "loss": 0.71, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9060300588607788, "rewards/margins": 2.6747469902038574, "rewards/rejected": -1.768716812133789, "step": 729 }, { "epoch": 1.6714408478945861, "grad_norm": 9.228120803833008, "learning_rate": 8.327981651376147e-07, "logits/chosen": -2.858212471008301, "logits/rejected": -2.844465494155884, "logps/chosen": -45.35531997680664, "logps/rejected": -78.5113525390625, "loss": 0.6089, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2593374252319336, "rewards/margins": 3.0616729259490967, "rewards/rejected": -1.802335262298584, "step": 730 }, { "epoch": 1.6737324548839874, "grad_norm": 10.068015098571777, "learning_rate": 8.325688073394494e-07, "logits/chosen": -2.949392318725586, "logits/rejected": -2.9152441024780273, "logps/chosen": -36.8303337097168, "logps/rejected": -76.29580688476562, "loss": 0.6218, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5820891857147217, "rewards/margins": 3.4222750663757324, "rewards/rejected": -1.8401858806610107, "step": 731 }, { "epoch": 1.6760240618733886, "grad_norm": 10.951092720031738, "learning_rate": 8.323394495412843e-07, "logits/chosen": -3.002379894256592, "logits/rejected": -2.9824981689453125, "logps/chosen": -41.828609466552734, "logps/rejected": -71.67854309082031, "loss": 0.6686, "rewards/accuracies": 0.875, "rewards/chosen": 1.592469334602356, "rewards/margins": 2.7564735412597656, "rewards/rejected": -1.1640042066574097, "step": 732 }, { "epoch": 1.67831566886279, "grad_norm": 15.451473236083984, "learning_rate": 8.321100917431192e-07, "logits/chosen": -2.955111503601074, "logits/rejected": -2.9463229179382324, "logps/chosen": -31.924129486083984, "logps/rejected": -70.5605239868164, "loss": 0.6194, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1233444213867188, "rewards/margins": 3.3620378971099854, "rewards/rejected": -1.2386932373046875, "step": 733 }, { "epoch": 1.6806072758521915, "grad_norm": 11.067788124084473, "learning_rate": 8.318807339449541e-07, "logits/chosen": -2.8911848068237305, "logits/rejected": -2.904428720474243, "logps/chosen": -36.90828323364258, "logps/rejected": -74.15050506591797, "loss": 0.6538, "rewards/accuracies": 0.875, "rewards/chosen": 1.8572800159454346, "rewards/margins": 3.0503368377685547, "rewards/rejected": -1.1930568218231201, "step": 734 }, { "epoch": 1.6828988828415925, "grad_norm": 11.122968673706055, "learning_rate": 8.316513761467889e-07, "logits/chosen": -2.9078867435455322, "logits/rejected": -2.8913605213165283, "logps/chosen": -44.32851791381836, "logps/rejected": -86.71803283691406, "loss": 0.591, "rewards/accuracies": 0.875, "rewards/chosen": 1.149466872215271, "rewards/margins": 3.7555642127990723, "rewards/rejected": -2.606097459793091, "step": 735 }, { "epoch": 1.685190489830994, "grad_norm": 11.797262191772461, "learning_rate": 8.314220183486238e-07, "logits/chosen": -2.877758264541626, "logits/rejected": -2.856771230697632, "logps/chosen": -46.319156646728516, "logps/rejected": -82.9205093383789, "loss": 0.6027, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9872258901596069, "rewards/margins": 2.9553701877593994, "rewards/rejected": -1.9681442975997925, "step": 736 }, { "epoch": 1.6874820968203954, "grad_norm": 20.587120056152344, "learning_rate": 8.311926605504587e-07, "logits/chosen": -2.9069247245788574, "logits/rejected": -2.863119125366211, "logps/chosen": -38.14771270751953, "logps/rejected": -74.85073852539062, "loss": 0.567, "rewards/accuracies": 0.875, "rewards/chosen": 1.609839916229248, "rewards/margins": 3.1544861793518066, "rewards/rejected": -1.5446462631225586, "step": 737 }, { "epoch": 1.6897737038097966, "grad_norm": 10.443931579589844, "learning_rate": 8.309633027522935e-07, "logits/chosen": -2.8190271854400635, "logits/rejected": -2.8043930530548096, "logps/chosen": -51.22873306274414, "logps/rejected": -84.82232666015625, "loss": 0.6648, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4903714954853058, "rewards/margins": 2.7946791648864746, "rewards/rejected": -2.3043079376220703, "step": 738 }, { "epoch": 1.6920653107991979, "grad_norm": 12.191550254821777, "learning_rate": 8.307339449541284e-07, "logits/chosen": -2.8725874423980713, "logits/rejected": -2.842073917388916, "logps/chosen": -37.000877380371094, "logps/rejected": -81.23178100585938, "loss": 0.5346, "rewards/accuracies": 0.96875, "rewards/chosen": 1.604166865348816, "rewards/margins": 3.882657766342163, "rewards/rejected": -2.2784907817840576, "step": 739 }, { "epoch": 1.6943569177885993, "grad_norm": 9.366484642028809, "learning_rate": 8.305045871559633e-07, "logits/chosen": -2.8758857250213623, "logits/rejected": -2.8879857063293457, "logps/chosen": -33.36528015136719, "logps/rejected": -79.17382049560547, "loss": 0.4974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5071295499801636, "rewards/margins": 3.7514257431030273, "rewards/rejected": -2.244296073913574, "step": 740 }, { "epoch": 1.6966485247780005, "grad_norm": 10.045736312866211, "learning_rate": 8.302752293577981e-07, "logits/chosen": -2.919564962387085, "logits/rejected": -2.888326644897461, "logps/chosen": -43.51104736328125, "logps/rejected": -78.13343048095703, "loss": 0.5941, "rewards/accuracies": 0.875, "rewards/chosen": 1.3636788129806519, "rewards/margins": 3.1141796112060547, "rewards/rejected": -1.7505009174346924, "step": 741 }, { "epoch": 1.6989401317674018, "grad_norm": 11.39289379119873, "learning_rate": 8.30045871559633e-07, "logits/chosen": -2.913677930831909, "logits/rejected": -2.89784574508667, "logps/chosen": -41.90134811401367, "logps/rejected": -79.97918701171875, "loss": 0.6009, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3217408657073975, "rewards/margins": 3.4346203804016113, "rewards/rejected": -2.112879753112793, "step": 742 }, { "epoch": 1.7012317387568032, "grad_norm": 14.861824035644531, "learning_rate": 8.298165137614679e-07, "logits/chosen": -2.8890578746795654, "logits/rejected": -2.8793389797210693, "logps/chosen": -46.79189682006836, "logps/rejected": -87.27267456054688, "loss": 0.5841, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2183728218078613, "rewards/margins": 3.422715663909912, "rewards/rejected": -2.204342842102051, "step": 743 }, { "epoch": 1.7035233457462047, "grad_norm": 11.59348201751709, "learning_rate": 8.295871559633028e-07, "logits/chosen": -2.8888938426971436, "logits/rejected": -2.8655006885528564, "logps/chosen": -44.1806526184082, "logps/rejected": -81.77677154541016, "loss": 0.571, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1532493829727173, "rewards/margins": 3.333224058151245, "rewards/rejected": -2.1799745559692383, "step": 744 }, { "epoch": 1.7058149527356057, "grad_norm": 9.981575012207031, "learning_rate": 8.293577981651376e-07, "logits/chosen": -2.9688568115234375, "logits/rejected": -2.96799898147583, "logps/chosen": -47.14961624145508, "logps/rejected": -76.57380676269531, "loss": 0.6484, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0352789163589478, "rewards/margins": 2.6445047855377197, "rewards/rejected": -1.6092259883880615, "step": 745 }, { "epoch": 1.7081065597250071, "grad_norm": 9.505598068237305, "learning_rate": 8.291284403669725e-07, "logits/chosen": -2.913580894470215, "logits/rejected": -2.8831095695495605, "logps/chosen": -54.309471130371094, "logps/rejected": -90.451171875, "loss": 0.6568, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12465700507164001, "rewards/margins": 3.1423258781433105, "rewards/rejected": -3.0176689624786377, "step": 746 }, { "epoch": 1.7103981667144086, "grad_norm": 9.992000579833984, "learning_rate": 8.288990825688073e-07, "logits/chosen": -2.897538185119629, "logits/rejected": -2.8622143268585205, "logps/chosen": -41.76513671875, "logps/rejected": -92.06129455566406, "loss": 0.4651, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4348845481872559, "rewards/margins": 4.406283378601074, "rewards/rejected": -2.9713988304138184, "step": 747 }, { "epoch": 1.7126897737038098, "grad_norm": 11.67225170135498, "learning_rate": 8.286697247706421e-07, "logits/chosen": -2.9322283267974854, "logits/rejected": -2.8992648124694824, "logps/chosen": -44.799964904785156, "logps/rejected": -75.6418685913086, "loss": 0.722, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9709102511405945, "rewards/margins": 2.769106149673462, "rewards/rejected": -1.7981958389282227, "step": 748 }, { "epoch": 1.714981380693211, "grad_norm": 14.715551376342773, "learning_rate": 8.28440366972477e-07, "logits/chosen": -2.9648430347442627, "logits/rejected": -2.927380323410034, "logps/chosen": -37.24271011352539, "logps/rejected": -67.74237060546875, "loss": 0.6413, "rewards/accuracies": 0.84375, "rewards/chosen": 1.8306118249893188, "rewards/margins": 2.607652187347412, "rewards/rejected": -0.7770400047302246, "step": 749 }, { "epoch": 1.7172729876826125, "grad_norm": 14.263936042785645, "learning_rate": 8.282110091743118e-07, "logits/chosen": -2.837583541870117, "logits/rejected": -2.852644681930542, "logps/chosen": -35.504722595214844, "logps/rejected": -71.09093475341797, "loss": 0.6027, "rewards/accuracies": 0.9375, "rewards/chosen": 1.857092261314392, "rewards/margins": 3.160335063934326, "rewards/rejected": -1.3032426834106445, "step": 750 }, { "epoch": 1.7195645946720137, "grad_norm": 8.621002197265625, "learning_rate": 8.279816513761468e-07, "logits/chosen": -2.9156973361968994, "logits/rejected": -2.9213993549346924, "logps/chosen": -53.99327850341797, "logps/rejected": -91.71331787109375, "loss": 0.6205, "rewards/accuracies": 0.90625, "rewards/chosen": 0.16920074820518494, "rewards/margins": 3.190380573272705, "rewards/rejected": -3.021179676055908, "step": 751 }, { "epoch": 1.721856201661415, "grad_norm": 7.293160915374756, "learning_rate": 8.277522935779816e-07, "logits/chosen": -2.894730806350708, "logits/rejected": -2.9497172832489014, "logps/chosen": -39.63792419433594, "logps/rejected": -79.01145935058594, "loss": 0.5877, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4338414669036865, "rewards/margins": 3.4663848876953125, "rewards/rejected": -2.032543659210205, "step": 752 }, { "epoch": 1.7241478086508164, "grad_norm": 9.854781150817871, "learning_rate": 8.275229357798164e-07, "logits/chosen": -2.8383595943450928, "logits/rejected": -2.839324474334717, "logps/chosen": -42.631412506103516, "logps/rejected": -78.60292053222656, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 1.3975083827972412, "rewards/margins": 3.263559341430664, "rewards/rejected": -1.8660508394241333, "step": 753 }, { "epoch": 1.7264394156402179, "grad_norm": 9.298770904541016, "learning_rate": 8.272935779816514e-07, "logits/chosen": -2.880213975906372, "logits/rejected": -2.899625778198242, "logps/chosen": -41.74782180786133, "logps/rejected": -78.61473846435547, "loss": 0.6409, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3714170455932617, "rewards/margins": 3.2051143646240234, "rewards/rejected": -1.8336974382400513, "step": 754 }, { "epoch": 1.7287310226296189, "grad_norm": 10.024859428405762, "learning_rate": 8.270642201834862e-07, "logits/chosen": -2.8974571228027344, "logits/rejected": -2.9067130088806152, "logps/chosen": -39.47825241088867, "logps/rejected": -81.24853515625, "loss": 0.5754, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2829596996307373, "rewards/margins": 3.506955623626709, "rewards/rejected": -2.2239959239959717, "step": 755 }, { "epoch": 1.7310226296190203, "grad_norm": 18.341794967651367, "learning_rate": 8.268348623853211e-07, "logits/chosen": -2.8696725368499756, "logits/rejected": -2.852341651916504, "logps/chosen": -42.42005920410156, "logps/rejected": -75.41828155517578, "loss": 0.6312, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2455101013183594, "rewards/margins": 3.070885181427002, "rewards/rejected": -1.8253751993179321, "step": 756 }, { "epoch": 1.7333142366084218, "grad_norm": 14.56188678741455, "learning_rate": 8.266055045871559e-07, "logits/chosen": -2.865640640258789, "logits/rejected": -2.834245443344116, "logps/chosen": -41.35041046142578, "logps/rejected": -72.95501708984375, "loss": 0.6308, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4484095573425293, "rewards/margins": 2.904169797897339, "rewards/rejected": -1.4557602405548096, "step": 757 }, { "epoch": 1.735605843597823, "grad_norm": 15.782402992248535, "learning_rate": 8.263761467889908e-07, "logits/chosen": -2.8254148960113525, "logits/rejected": -2.803708076477051, "logps/chosen": -43.8831901550293, "logps/rejected": -90.03782653808594, "loss": 0.4949, "rewards/accuracies": 0.875, "rewards/chosen": 1.2616708278656006, "rewards/margins": 4.1758928298950195, "rewards/rejected": -2.914222002029419, "step": 758 }, { "epoch": 1.7378974505872242, "grad_norm": 10.224038124084473, "learning_rate": 8.261467889908257e-07, "logits/chosen": -2.902859926223755, "logits/rejected": -2.9397454261779785, "logps/chosen": -45.789363861083984, "logps/rejected": -82.3103256225586, "loss": 0.6462, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8627012968063354, "rewards/margins": 3.1672377586364746, "rewards/rejected": -2.3045363426208496, "step": 759 }, { "epoch": 1.7401890575766257, "grad_norm": 9.865615844726562, "learning_rate": 8.259174311926605e-07, "logits/chosen": -2.8785769939422607, "logits/rejected": -2.853688955307007, "logps/chosen": -37.715335845947266, "logps/rejected": -74.4703140258789, "loss": 0.6239, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6251728534698486, "rewards/margins": 3.1350016593933105, "rewards/rejected": -1.5098291635513306, "step": 760 }, { "epoch": 1.742480664566027, "grad_norm": 7.8303070068359375, "learning_rate": 8.256880733944955e-07, "logits/chosen": -2.874582290649414, "logits/rejected": -2.8534278869628906, "logps/chosen": -41.30553436279297, "logps/rejected": -84.71517181396484, "loss": 0.5675, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2293566465377808, "rewards/margins": 3.609668016433716, "rewards/rejected": -2.3803114891052246, "step": 761 }, { "epoch": 1.7447722715554281, "grad_norm": 10.44961166381836, "learning_rate": 8.254587155963303e-07, "logits/chosen": -2.873250722885132, "logits/rejected": -2.8568291664123535, "logps/chosen": -43.92038345336914, "logps/rejected": -77.70698547363281, "loss": 0.5883, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2816317081451416, "rewards/margins": 3.102020740509033, "rewards/rejected": -1.820388913154602, "step": 762 }, { "epoch": 1.7470638785448296, "grad_norm": 15.555195808410645, "learning_rate": 8.25229357798165e-07, "logits/chosen": -2.9342217445373535, "logits/rejected": -2.9464492797851562, "logps/chosen": -34.21287536621094, "logps/rejected": -79.72958374023438, "loss": 0.5054, "rewards/accuracies": 0.96875, "rewards/chosen": 2.053072929382324, "rewards/margins": 3.926144599914551, "rewards/rejected": -1.873071551322937, "step": 763 }, { "epoch": 1.749355485534231, "grad_norm": 11.878745079040527, "learning_rate": 8.249999999999999e-07, "logits/chosen": -2.8821840286254883, "logits/rejected": -2.867624521255493, "logps/chosen": -38.717769622802734, "logps/rejected": -78.58610534667969, "loss": 0.559, "rewards/accuracies": 0.875, "rewards/chosen": 1.5781608819961548, "rewards/margins": 3.4628233909606934, "rewards/rejected": -1.8846625089645386, "step": 764 }, { "epoch": 1.751647092523632, "grad_norm": 17.1141357421875, "learning_rate": 8.247706422018348e-07, "logits/chosen": -2.94296932220459, "logits/rejected": -2.9260566234588623, "logps/chosen": -40.298614501953125, "logps/rejected": -86.62203979492188, "loss": 0.5165, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2732932567596436, "rewards/margins": 4.083120822906494, "rewards/rejected": -2.8098275661468506, "step": 765 }, { "epoch": 1.7539386995130335, "grad_norm": 13.264410972595215, "learning_rate": 8.245412844036697e-07, "logits/chosen": -2.926096200942993, "logits/rejected": -2.939941644668579, "logps/chosen": -38.64235305786133, "logps/rejected": -66.46871185302734, "loss": 0.6737, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4684351682662964, "rewards/margins": 2.4644079208374023, "rewards/rejected": -0.9959726333618164, "step": 766 }, { "epoch": 1.756230306502435, "grad_norm": 15.303181648254395, "learning_rate": 8.243119266055045e-07, "logits/chosen": -2.8933281898498535, "logits/rejected": -2.868638515472412, "logps/chosen": -48.93446350097656, "logps/rejected": -81.85924530029297, "loss": 0.6354, "rewards/accuracies": 0.875, "rewards/chosen": 0.8834459185600281, "rewards/margins": 2.813539981842041, "rewards/rejected": -1.9300941228866577, "step": 767 }, { "epoch": 1.7585219134918362, "grad_norm": 10.525945663452148, "learning_rate": 8.240825688073393e-07, "logits/chosen": -2.9868783950805664, "logits/rejected": -2.9370641708374023, "logps/chosen": -45.60233688354492, "logps/rejected": -82.99354553222656, "loss": 0.5763, "rewards/accuracies": 0.875, "rewards/chosen": 1.0386899709701538, "rewards/margins": 3.11466121673584, "rewards/rejected": -2.0759708881378174, "step": 768 }, { "epoch": 1.7608135204812374, "grad_norm": 9.884099960327148, "learning_rate": 8.238532110091743e-07, "logits/chosen": -2.877599000930786, "logits/rejected": -2.895101308822632, "logps/chosen": -38.016910552978516, "logps/rejected": -70.10406494140625, "loss": 0.6692, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6382863521575928, "rewards/margins": 2.7530436515808105, "rewards/rejected": -1.1147571802139282, "step": 769 }, { "epoch": 1.7631051274706389, "grad_norm": 11.545266151428223, "learning_rate": 8.236238532110091e-07, "logits/chosen": -2.8860597610473633, "logits/rejected": -2.9445698261260986, "logps/chosen": -48.93947982788086, "logps/rejected": -83.0103759765625, "loss": 0.6609, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4660838842391968, "rewards/margins": 2.9084692001342773, "rewards/rejected": -2.442385196685791, "step": 770 }, { "epoch": 1.76539673446004, "grad_norm": 11.456825256347656, "learning_rate": 8.23394495412844e-07, "logits/chosen": -2.8958659172058105, "logits/rejected": -2.9017722606658936, "logps/chosen": -43.17325973510742, "logps/rejected": -81.04286193847656, "loss": 0.5925, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2823618650436401, "rewards/margins": 3.5242607593536377, "rewards/rejected": -2.241899013519287, "step": 771 }, { "epoch": 1.7676883414494413, "grad_norm": 11.915953636169434, "learning_rate": 8.231651376146789e-07, "logits/chosen": -2.826777935028076, "logits/rejected": -2.8364858627319336, "logps/chosen": -47.432132720947266, "logps/rejected": -80.01506805419922, "loss": 0.6888, "rewards/accuracies": 0.875, "rewards/chosen": 0.5032607913017273, "rewards/margins": 2.7821033000946045, "rewards/rejected": -2.2788429260253906, "step": 772 }, { "epoch": 1.7699799484388428, "grad_norm": 11.844916343688965, "learning_rate": 8.229357798165138e-07, "logits/chosen": -2.854806900024414, "logits/rejected": -2.8825109004974365, "logps/chosen": -45.380584716796875, "logps/rejected": -80.22589874267578, "loss": 0.6229, "rewards/accuracies": 0.84375, "rewards/chosen": 0.932317316532135, "rewards/margins": 3.015904664993286, "rewards/rejected": -2.083587646484375, "step": 773 }, { "epoch": 1.7722715554282442, "grad_norm": 10.996774673461914, "learning_rate": 8.227064220183486e-07, "logits/chosen": -2.8733012676239014, "logits/rejected": -2.8639674186706543, "logps/chosen": -47.3084716796875, "logps/rejected": -85.59430694580078, "loss": 0.6448, "rewards/accuracies": 0.875, "rewards/chosen": 0.7703449726104736, "rewards/margins": 3.2954611778259277, "rewards/rejected": -2.525116205215454, "step": 774 }, { "epoch": 1.7745631624176452, "grad_norm": 7.952910900115967, "learning_rate": 8.224770642201834e-07, "logits/chosen": -2.889728307723999, "logits/rejected": -2.9158034324645996, "logps/chosen": -37.81418228149414, "logps/rejected": -79.07296752929688, "loss": 0.547, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7564036846160889, "rewards/margins": 3.5402324199676514, "rewards/rejected": -1.7838287353515625, "step": 775 }, { "epoch": 1.7768547694070467, "grad_norm": 12.404465675354004, "learning_rate": 8.222477064220184e-07, "logits/chosen": -2.931483268737793, "logits/rejected": -2.9390995502471924, "logps/chosen": -40.869754791259766, "logps/rejected": -77.59195709228516, "loss": 0.5565, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5175129175186157, "rewards/margins": 3.206946849822998, "rewards/rejected": -1.6894338130950928, "step": 776 }, { "epoch": 1.7791463763964481, "grad_norm": 20.39431381225586, "learning_rate": 8.220183486238532e-07, "logits/chosen": -2.9431018829345703, "logits/rejected": -2.9574813842773438, "logps/chosen": -55.09475326538086, "logps/rejected": -95.7016372680664, "loss": 0.6282, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2750326097011566, "rewards/margins": 3.555096387863159, "rewards/rejected": -3.2800636291503906, "step": 777 }, { "epoch": 1.7814379833858494, "grad_norm": 9.46557331085205, "learning_rate": 8.217889908256881e-07, "logits/chosen": -2.931535243988037, "logits/rejected": -2.915329933166504, "logps/chosen": -45.48323440551758, "logps/rejected": -76.99571228027344, "loss": 0.7063, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9226804971694946, "rewards/margins": 2.725539207458496, "rewards/rejected": -1.8028587102890015, "step": 778 }, { "epoch": 1.7837295903752506, "grad_norm": 17.305782318115234, "learning_rate": 8.215596330275229e-07, "logits/chosen": -2.9389138221740723, "logits/rejected": -2.9375195503234863, "logps/chosen": -41.198326110839844, "logps/rejected": -76.04153442382812, "loss": 0.6112, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4751795530319214, "rewards/margins": 2.9906296730041504, "rewards/rejected": -1.5154497623443604, "step": 779 }, { "epoch": 1.786021197364652, "grad_norm": 19.298398971557617, "learning_rate": 8.213302752293577e-07, "logits/chosen": -2.8967247009277344, "logits/rejected": -2.8859524726867676, "logps/chosen": -51.649688720703125, "logps/rejected": -89.01837158203125, "loss": 0.6591, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4787759482860565, "rewards/margins": 3.2806954383850098, "rewards/rejected": -2.80191969871521, "step": 780 }, { "epoch": 1.7883128043540533, "grad_norm": 11.817749977111816, "learning_rate": 8.211009174311926e-07, "logits/chosen": -2.933652877807617, "logits/rejected": -2.9114558696746826, "logps/chosen": -45.127532958984375, "logps/rejected": -82.51288604736328, "loss": 0.6659, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9903493523597717, "rewards/margins": 3.0219497680664062, "rewards/rejected": -2.0316004753112793, "step": 781 }, { "epoch": 1.7906044113434545, "grad_norm": 12.037189483642578, "learning_rate": 8.208715596330274e-07, "logits/chosen": -2.8476462364196777, "logits/rejected": -2.8554282188415527, "logps/chosen": -41.581298828125, "logps/rejected": -85.61280822753906, "loss": 0.5024, "rewards/accuracies": 0.875, "rewards/chosen": 1.505171775817871, "rewards/margins": 3.9295990467071533, "rewards/rejected": -2.4244275093078613, "step": 782 }, { "epoch": 1.792896018332856, "grad_norm": 9.682612419128418, "learning_rate": 8.206422018348624e-07, "logits/chosen": -2.8531317710876465, "logits/rejected": -2.8038222789764404, "logps/chosen": -62.07020568847656, "logps/rejected": -90.73014831542969, "loss": 0.6891, "rewards/accuracies": 0.875, "rewards/chosen": -0.23635300993919373, "rewards/margins": 2.390868902206421, "rewards/rejected": -2.6272220611572266, "step": 783 }, { "epoch": 1.7951876253222574, "grad_norm": 15.323958396911621, "learning_rate": 8.204128440366972e-07, "logits/chosen": -2.867286443710327, "logits/rejected": -2.9052393436431885, "logps/chosen": -39.425052642822266, "logps/rejected": -84.19662475585938, "loss": 0.5071, "rewards/accuracies": 0.96875, "rewards/chosen": 1.459191918373108, "rewards/margins": 3.8970484733581543, "rewards/rejected": -2.437856674194336, "step": 784 }, { "epoch": 1.7974792323116584, "grad_norm": 18.61156463623047, "learning_rate": 8.20183486238532e-07, "logits/chosen": -2.975440502166748, "logits/rejected": -2.9377193450927734, "logps/chosen": -51.38750457763672, "logps/rejected": -90.15067291259766, "loss": 0.582, "rewards/accuracies": 0.875, "rewards/chosen": 0.43525561690330505, "rewards/margins": 3.5851688385009766, "rewards/rejected": -3.1499133110046387, "step": 785 }, { "epoch": 1.7997708393010599, "grad_norm": 13.084996223449707, "learning_rate": 8.19954128440367e-07, "logits/chosen": -2.9390223026275635, "logits/rejected": -3.0326993465423584, "logps/chosen": -50.09013366699219, "logps/rejected": -75.11155700683594, "loss": 0.7363, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3699650764465332, "rewards/margins": 2.120762586593628, "rewards/rejected": -1.7507975101470947, "step": 786 }, { "epoch": 1.8020624462904613, "grad_norm": 10.50637149810791, "learning_rate": 8.197247706422018e-07, "logits/chosen": -2.9772698879241943, "logits/rejected": -2.91322660446167, "logps/chosen": -54.03983688354492, "logps/rejected": -93.1724624633789, "loss": 0.6285, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45736148953437805, "rewards/margins": 3.416944742202759, "rewards/rejected": -2.9595835208892822, "step": 787 }, { "epoch": 1.8043540532798625, "grad_norm": 12.767847061157227, "learning_rate": 8.194954128440367e-07, "logits/chosen": -2.8533895015716553, "logits/rejected": -2.8709888458251953, "logps/chosen": -45.639739990234375, "logps/rejected": -84.94903564453125, "loss": 0.5748, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9416825771331787, "rewards/margins": 3.478113889694214, "rewards/rejected": -2.536431074142456, "step": 788 }, { "epoch": 1.8066456602692638, "grad_norm": 10.467527389526367, "learning_rate": 8.192660550458715e-07, "logits/chosen": -2.89713191986084, "logits/rejected": -2.8785977363586426, "logps/chosen": -44.7719841003418, "logps/rejected": -85.2029800415039, "loss": 0.5666, "rewards/accuracies": 0.9375, "rewards/chosen": 1.329332947731018, "rewards/margins": 3.6750035285949707, "rewards/rejected": -2.345670700073242, "step": 789 }, { "epoch": 1.8089372672586652, "grad_norm": 11.506053924560547, "learning_rate": 8.190366972477064e-07, "logits/chosen": -2.8693466186523438, "logits/rejected": -2.9406232833862305, "logps/chosen": -39.00740432739258, "logps/rejected": -74.36056518554688, "loss": 0.6153, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5030896663665771, "rewards/margins": 3.2338736057281494, "rewards/rejected": -1.7307837009429932, "step": 790 }, { "epoch": 1.8112288742480664, "grad_norm": 9.65473461151123, "learning_rate": 8.188073394495413e-07, "logits/chosen": -2.924645185470581, "logits/rejected": -2.885420322418213, "logps/chosen": -55.41948699951172, "logps/rejected": -93.23300170898438, "loss": 0.684, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07544653862714767, "rewards/margins": 3.283374309539795, "rewards/rejected": -3.358820676803589, "step": 791 }, { "epoch": 1.8135204812374677, "grad_norm": 11.441035270690918, "learning_rate": 8.185779816513761e-07, "logits/chosen": -2.895298480987549, "logits/rejected": -2.860877513885498, "logps/chosen": -41.05875778198242, "logps/rejected": -77.22750854492188, "loss": 0.5881, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4055814743041992, "rewards/margins": 3.1463632583618164, "rewards/rejected": -1.7407820224761963, "step": 792 }, { "epoch": 1.8135204812374677, "eval_logits/chosen": -2.9918644428253174, "eval_logits/rejected": -3.008038282394409, "eval_logps/chosen": -57.0305061340332, "eval_logps/rejected": -87.7309799194336, "eval_loss": 0.7080715894699097, "eval_rewards/accuracies": 0.8641509413719177, "eval_rewards/chosen": 0.0031847935169935226, "eval_rewards/margins": 2.6713390350341797, "eval_rewards/rejected": -2.66815447807312, "eval_runtime": 960.1835, "eval_samples_per_second": 0.551, "eval_steps_per_second": 0.276, "step": 792 } ], "logging_steps": 1.0, "max_steps": 4360, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 88, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }