{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9880609304240429, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 1.194186806678772, "learning_rate": 6.25e-08, "logits/chosen": 9.990612030029297, "logits/rejected": 10.698101997375488, "logps/chosen": -102.88545989990234, "logps/ref_chosen": -102.88545989990234, "logps/ref_rejected": -121.84871673583984, "logps/rejected": -121.84871673583984, "loss": 0.3675, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "sft_loss": 0.36753880977630615, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.5353251099586487, "learning_rate": 1.25e-07, "logits/chosen": 10.211905479431152, "logits/rejected": 11.06594467163086, "logps/chosen": -107.70349884033203, "logps/ref_chosen": -107.70349884033203, "logps/ref_rejected": -121.89966583251953, "logps/rejected": -121.89966583251953, "loss": 0.4101, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "sft_loss": 0.41013145446777344, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.7126303315162659, "learning_rate": 1.875e-07, "logits/chosen": 10.032384872436523, "logits/rejected": 11.023520469665527, "logps/chosen": -108.3123779296875, "logps/ref_chosen": -107.98188781738281, "logps/ref_rejected": -124.51527404785156, "logps/rejected": -124.87130737304688, "loss": 0.412, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.003304910147562623, "rewards/margins": 0.0002554532838985324, "rewards/rejected": -0.003560363780707121, "sft_loss": 0.41195932030677795, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.2344533205032349, "learning_rate": 2.5e-07, "logits/chosen": 9.836658477783203, "logits/rejected": 10.855621337890625, "logps/chosen": -109.55919647216797, "logps/ref_chosen": -109.20836639404297, "logps/ref_rejected": -119.23908996582031, "logps/rejected": -119.48279571533203, "loss": 0.4039, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.003508324269205332, "rewards/margins": -0.0010712125804275274, "rewards/rejected": -0.0024371116887778044, "sft_loss": 0.4038863480091095, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 1.426048994064331, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 10.212320327758789, "logits/rejected": 10.966379165649414, "logps/chosen": -103.76991271972656, "logps/ref_chosen": -103.87680053710938, "logps/ref_rejected": -118.41618347167969, "logps/rejected": -118.23270416259766, "loss": 0.3697, "rewards/accuracies": 0.453125, "rewards/chosen": 0.0010687037138268352, "rewards/margins": -0.000766113749705255, "rewards/rejected": 0.0018348174635320902, "sft_loss": 0.3697226345539093, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 1.413549780845642, "learning_rate": 3.75e-07, "logits/chosen": 10.700042724609375, "logits/rejected": 11.478326797485352, "logps/chosen": -107.56877899169922, "logps/ref_chosen": -107.58968353271484, "logps/ref_rejected": -122.07303619384766, "logps/rejected": -121.85940551757812, "loss": 0.3909, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.0002090137859340757, "rewards/margins": -0.0019273017533123493, "rewards/rejected": 0.00213631521910429, "sft_loss": 0.390906423330307, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 1.2342580556869507, "learning_rate": 4.375e-07, "logits/chosen": 10.01632308959961, "logits/rejected": 10.7178955078125, "logps/chosen": -107.01339721679688, "logps/ref_chosen": -107.42727661132812, "logps/ref_rejected": -116.87063598632812, "logps/rejected": -116.37357330322266, "loss": 0.3747, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.00413867924362421, "rewards/margins": -0.0008318667532876134, "rewards/rejected": 0.004970546346157789, "sft_loss": 0.3746669888496399, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.6644937992095947, "learning_rate": 5e-07, "logits/chosen": 10.211028099060059, "logits/rejected": 11.11027717590332, "logps/chosen": -104.41184997558594, "logps/ref_chosen": -105.60282135009766, "logps/ref_rejected": -119.53916931152344, "logps/rejected": -118.27430725097656, "loss": 0.3773, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.01190974935889244, "rewards/margins": -0.0007388982339762151, "rewards/rejected": 0.012648648582398891, "sft_loss": 0.37729793787002563, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.9437576532363892, "learning_rate": 4.997252228714278e-07, "logits/chosen": 10.179821014404297, "logits/rejected": 11.147579193115234, "logps/chosen": -104.13174438476562, "logps/ref_chosen": -105.46086120605469, "logps/ref_rejected": -119.00373840332031, "logps/rejected": -117.734130859375, "loss": 0.3807, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.013291322626173496, "rewards/margins": 0.0005952615174464881, "rewards/rejected": 0.012696062214672565, "sft_loss": 0.38070446252822876, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 0.700039803981781, "learning_rate": 4.989014955054745e-07, "logits/chosen": 10.076737403869629, "logits/rejected": 10.897785186767578, "logps/chosen": -100.81087493896484, "logps/ref_chosen": -104.21009826660156, "logps/ref_rejected": -118.9209213256836, "logps/rejected": -115.75495910644531, "loss": 0.3367, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.033992186188697815, "rewards/margins": 0.0023326175287365913, "rewards/rejected": 0.0316595658659935, "sft_loss": 0.33672136068344116, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 0.9160856008529663, "learning_rate": 4.975306286336627e-07, "logits/chosen": 9.973880767822266, "logits/rejected": 11.158487319946289, "logps/chosen": -101.3505630493164, "logps/ref_chosen": -105.94319152832031, "logps/ref_rejected": -122.76007843017578, "logps/rejected": -118.6338119506836, "loss": 0.3851, "rewards/accuracies": 0.578125, "rewards/chosen": 0.045926500111818314, "rewards/margins": 0.004663803614675999, "rewards/rejected": 0.04126270115375519, "sft_loss": 0.3850533962249756, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.9421964883804321, "learning_rate": 4.956156357188939e-07, "logits/chosen": 9.908226013183594, "logits/rejected": 10.598045349121094, "logps/chosen": -103.32762908935547, "logps/ref_chosen": -109.08442687988281, "logps/ref_rejected": -121.41947174072266, "logps/rejected": -115.84996795654297, "loss": 0.3532, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.05756799131631851, "rewards/margins": 0.0018730255542322993, "rewards/rejected": 0.05569496005773544, "sft_loss": 0.3532242476940155, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 0.3328304886817932, "learning_rate": 4.931607263312032e-07, "logits/chosen": 9.964012145996094, "logits/rejected": 11.03992748260498, "logps/chosen": -98.97601318359375, "logps/ref_chosen": -104.62150573730469, "logps/ref_rejected": -119.55384063720703, "logps/rejected": -114.12371826171875, "loss": 0.3686, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.056454867124557495, "rewards/margins": 0.0021536569111049175, "rewards/rejected": 0.05430121719837189, "sft_loss": 0.36859023571014404, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 0.30642038583755493, "learning_rate": 4.9017129689421e-07, "logits/chosen": 10.519927978515625, "logits/rejected": 11.649580001831055, "logps/chosen": -96.5634765625, "logps/ref_chosen": -106.179443359375, "logps/ref_rejected": -120.73036193847656, "logps/rejected": -110.86133575439453, "loss": 0.3385, "rewards/accuracies": 0.5, "rewards/chosen": 0.09615952521562576, "rewards/margins": -0.002530643017962575, "rewards/rejected": 0.09869016706943512, "sft_loss": 0.338548868894577, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.301073282957077, "learning_rate": 4.866539188226085e-07, "logits/chosen": 9.891039848327637, "logits/rejected": 10.824172973632812, "logps/chosen": -95.14861297607422, "logps/ref_chosen": -105.70547485351562, "logps/ref_rejected": -118.89997863769531, "logps/rejected": -108.2326889038086, "loss": 0.3305, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.10556865483522415, "rewards/margins": -0.0011043368140235543, "rewards/rejected": 0.1066729873418808, "sft_loss": 0.3305360674858093, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 0.36065343022346497, "learning_rate": 4.826163240767716e-07, "logits/chosen": 10.682470321655273, "logits/rejected": 11.299846649169922, "logps/chosen": -96.53520202636719, "logps/ref_chosen": -108.86376953125, "logps/ref_rejected": -122.1635513305664, "logps/rejected": -110.50537872314453, "loss": 0.3484, "rewards/accuracies": 0.578125, "rewards/chosen": 0.12328556925058365, "rewards/margins": 0.006703883409500122, "rewards/rejected": 0.11658168584108353, "sft_loss": 0.3484281003475189, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.5325565934181213, "learning_rate": 4.780673881662242e-07, "logits/chosen": 10.187503814697266, "logits/rejected": 10.843408584594727, "logps/chosen": -90.1707992553711, "logps/ref_chosen": -102.93986511230469, "logps/ref_rejected": -119.43718719482422, "logps/rejected": -106.76301574707031, "loss": 0.359, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.12769076228141785, "rewards/margins": 0.0009490540251135826, "rewards/rejected": 0.1267417073249817, "sft_loss": 0.3589847683906555, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 0.4098409116268158, "learning_rate": 4.730171106393466e-07, "logits/chosen": 10.4215669631958, "logits/rejected": 11.216498374938965, "logps/chosen": -90.09894561767578, "logps/ref_chosen": -103.81341552734375, "logps/ref_rejected": -117.45123291015625, "logps/rejected": -104.58552551269531, "loss": 0.3368, "rewards/accuracies": 0.515625, "rewards/chosen": 0.13714462518692017, "rewards/margins": 0.00848748255521059, "rewards/rejected": 0.12865713238716125, "sft_loss": 0.33678534626960754, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.3302735984325409, "learning_rate": 4.6747659310219757e-07, "logits/chosen": 10.332744598388672, "logits/rejected": 11.005766868591309, "logps/chosen": -94.52428436279297, "logps/ref_chosen": -107.85797119140625, "logps/ref_rejected": -121.88042449951172, "logps/rejected": -108.09265899658203, "loss": 0.3222, "rewards/accuracies": 0.4296875, "rewards/chosen": 0.1333368420600891, "rewards/margins": -0.004540742840617895, "rewards/rejected": 0.13787758350372314, "sft_loss": 0.3221552073955536, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 0.40531161427497864, "learning_rate": 4.6145801481477433e-07, "logits/chosen": 10.747330665588379, "logits/rejected": 11.561124801635742, "logps/chosen": -89.97228240966797, "logps/ref_chosen": -103.42721557617188, "logps/ref_rejected": -116.7796630859375, "logps/rejected": -103.99850463867188, "loss": 0.3157, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.13454943895339966, "rewards/margins": 0.006737923249602318, "rewards/rejected": 0.1278115212917328, "sft_loss": 0.3156886100769043, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 0.2686282992362976, "learning_rate": 4.549746059183561e-07, "logits/chosen": 9.720458984375, "logits/rejected": 10.846506118774414, "logps/chosen": -92.48249816894531, "logps/ref_chosen": -106.60163879394531, "logps/ref_rejected": -124.56562805175781, "logps/rejected": -109.58876037597656, "loss": 0.3106, "rewards/accuracies": 0.46875, "rewards/chosen": 0.14119136333465576, "rewards/margins": -0.00857722107321024, "rewards/rejected": 0.14976857602596283, "sft_loss": 0.31064528226852417, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 0.5989738702774048, "learning_rate": 4.480406183527823e-07, "logits/chosen": 10.225810050964355, "logits/rejected": 11.099544525146484, "logps/chosen": -88.04141998291016, "logps/ref_chosen": -103.77696228027344, "logps/ref_rejected": -118.73616027832031, "logps/rejected": -104.40451049804688, "loss": 0.3321, "rewards/accuracies": 0.5625, "rewards/chosen": 0.157355397939682, "rewards/margins": 0.014038847759366035, "rewards/rejected": 0.14331655204296112, "sft_loss": 0.3321138620376587, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.1970965415239334, "learning_rate": 4.4067129452759546e-07, "logits/chosen": 10.115339279174805, "logits/rejected": 11.140266418457031, "logps/chosen": -87.26233673095703, "logps/ref_chosen": -104.72956085205078, "logps/ref_rejected": -121.35556030273438, "logps/rejected": -104.43501281738281, "loss": 0.3228, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.17467224597930908, "rewards/margins": 0.005466699134558439, "rewards/rejected": 0.16920553147792816, "sft_loss": 0.3228015899658203, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 0.2894323468208313, "learning_rate": 4.3288283381591725e-07, "logits/chosen": 10.147160530090332, "logits/rejected": 10.98647689819336, "logps/chosen": -86.99087524414062, "logps/ref_chosen": -105.88758087158203, "logps/ref_rejected": -125.69054412841797, "logps/rejected": -106.15878295898438, "loss": 0.3069, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.1889670193195343, "rewards/margins": -0.0063507393933832645, "rewards/rejected": 0.19531774520874023, "sft_loss": 0.30694928765296936, "step": 24 }, { "epoch": 0.32935364347468093, "grad_norm": 0.31463876366615295, "learning_rate": 4.246923569447104e-07, "logits/chosen": 10.327369689941406, "logits/rejected": 11.063910484313965, "logps/chosen": -87.99880981445312, "logps/ref_chosen": -110.0761489868164, "logps/ref_rejected": -129.10540771484375, "logps/rejected": -107.19017028808594, "loss": 0.2993, "rewards/accuracies": 0.53125, "rewards/chosen": 0.22077329456806183, "rewards/margins": 0.001621072180569172, "rewards/rejected": 0.21915221214294434, "sft_loss": 0.2992705702781677, "step": 25 }, { "epoch": 0.34252778921366817, "grad_norm": 0.25855836272239685, "learning_rate": 4.161178683597054e-07, "logits/chosen": 10.388958930969238, "logits/rejected": 11.489179611206055, "logps/chosen": -81.3349609375, "logps/ref_chosen": -103.74571990966797, "logps/ref_rejected": -120.73832702636719, "logps/rejected": -98.57904052734375, "loss": 0.2909, "rewards/accuracies": 0.5, "rewards/chosen": 0.22410757839679718, "rewards/margins": 0.0025147469714283943, "rewards/rejected": 0.22159285843372345, "sft_loss": 0.2909452021121979, "step": 26 }, { "epoch": 0.3557019349526554, "grad_norm": 0.5696946382522583, "learning_rate": 4.0717821664772124e-07, "logits/chosen": 10.086296081542969, "logits/rejected": 11.336379051208496, "logps/chosen": -81.64080810546875, "logps/ref_chosen": -105.47428131103516, "logps/ref_rejected": -120.5193099975586, "logps/rejected": -97.64772033691406, "loss": 0.3163, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.23833464086055756, "rewards/margins": 0.00961877591907978, "rewards/rejected": 0.22871585190296173, "sft_loss": 0.316275417804718, "step": 27 }, { "epoch": 0.3688760806916426, "grad_norm": 0.14858409762382507, "learning_rate": 3.978930531033806e-07, "logits/chosen": 9.710855484008789, "logits/rejected": 10.872222900390625, "logps/chosen": -80.8717269897461, "logps/ref_chosen": -103.72540283203125, "logps/ref_rejected": -119.79557800292969, "logps/rejected": -96.69274139404297, "loss": 0.2766, "rewards/accuracies": 0.5, "rewards/chosen": 0.22853665053844452, "rewards/margins": -0.0024917693808674812, "rewards/rejected": 0.23102842271327972, "sft_loss": 0.2765931785106659, "step": 28 }, { "epoch": 0.3820502264306299, "grad_norm": 0.2741422951221466, "learning_rate": 3.882827885312998e-07, "logits/chosen": 10.16092586517334, "logits/rejected": 11.23297119140625, "logps/chosen": -85.11812591552734, "logps/ref_chosen": -108.65434265136719, "logps/ref_rejected": -121.46784973144531, "logps/rejected": -98.77241516113281, "loss": 0.2799, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.23536208271980286, "rewards/margins": 0.008407761342823505, "rewards/rejected": 0.22695434093475342, "sft_loss": 0.27991783618927, "step": 29 }, { "epoch": 0.39522437216961714, "grad_norm": 0.16848187148571014, "learning_rate": 3.7836854837871044e-07, "logits/chosen": 10.2907133102417, "logits/rejected": 11.690597534179688, "logps/chosen": -78.23504638671875, "logps/ref_chosen": -103.62174224853516, "logps/ref_rejected": -126.73807525634766, "logps/rejected": -102.43669128417969, "loss": 0.2962, "rewards/accuracies": 0.546875, "rewards/chosen": 0.2538670301437378, "rewards/margins": 0.010853251442313194, "rewards/rejected": 0.24301378428936005, "sft_loss": 0.2962155342102051, "step": 30 }, { "epoch": 0.4083985179086044, "grad_norm": 0.1987890601158142, "learning_rate": 3.681721262971413e-07, "logits/chosen": 9.929094314575195, "logits/rejected": 10.946361541748047, "logps/chosen": -80.73751831054688, "logps/ref_chosen": -106.10479736328125, "logps/ref_rejected": -120.6382827758789, "logps/rejected": -96.38467407226562, "loss": 0.2982, "rewards/accuracies": 0.5, "rewards/chosen": 0.2536728084087372, "rewards/margins": 0.011136716231703758, "rewards/rejected": 0.24253609776496887, "sft_loss": 0.2981662452220917, "step": 31 }, { "epoch": 0.4215726636475916, "grad_norm": 0.1829695999622345, "learning_rate": 3.577159362352426e-07, "logits/chosen": 10.097947120666504, "logits/rejected": 11.477932929992676, "logps/chosen": -82.30887603759766, "logps/ref_chosen": -105.99569702148438, "logps/ref_rejected": -128.34303283691406, "logps/rejected": -104.14814758300781, "loss": 0.2848, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.2368682324886322, "rewards/margins": -0.0050805676728487015, "rewards/rejected": 0.24194881319999695, "sft_loss": 0.28480714559555054, "step": 32 }, { "epoch": 0.43474680938657884, "grad_norm": 0.22964715957641602, "learning_rate": 3.470229631680624e-07, "logits/chosen": 10.105993270874023, "logits/rejected": 10.923880577087402, "logps/chosen": -81.49787902832031, "logps/ref_chosen": -105.72196197509766, "logps/ref_rejected": -121.59507751464844, "logps/rejected": -97.2413101196289, "loss": 0.2686, "rewards/accuracies": 0.5, "rewards/chosen": 0.242240771651268, "rewards/margins": -0.0012968010269105434, "rewards/rejected": 0.24353757500648499, "sft_loss": 0.2686034142971039, "step": 33 }, { "epoch": 0.44792095512556607, "grad_norm": 0.2910502254962921, "learning_rate": 3.361167125710832e-07, "logits/chosen": 10.264101028442383, "logits/rejected": 11.107752799987793, "logps/chosen": -85.68121337890625, "logps/ref_chosen": -111.4834976196289, "logps/ref_rejected": -130.48089599609375, "logps/rejected": -104.08442687988281, "loss": 0.3039, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.2580227851867676, "rewards/margins": -0.005941788665950298, "rewards/rejected": 0.26396459341049194, "sft_loss": 0.3038797080516815, "step": 34 }, { "epoch": 0.4610951008645533, "grad_norm": 0.2656664550304413, "learning_rate": 3.2502115875008516e-07, "logits/chosen": 10.529006958007812, "logits/rejected": 11.506540298461914, "logps/chosen": -82.62615966796875, "logps/ref_chosen": -108.9183349609375, "logps/ref_rejected": -121.32493591308594, "logps/rejected": -95.86014556884766, "loss": 0.2751, "rewards/accuracies": 0.5625, "rewards/chosen": 0.26292186975479126, "rewards/margins": 0.008274020627140999, "rewards/rejected": 0.2546478509902954, "sft_loss": 0.2751036286354065, "step": 35 }, { "epoch": 0.47426924660354053, "grad_norm": 0.4242345690727234, "learning_rate": 3.137606921404191e-07, "logits/chosen": 10.204312324523926, "logits/rejected": 10.856239318847656, "logps/chosen": -81.047607421875, "logps/ref_chosen": -107.1411361694336, "logps/ref_rejected": -118.66165161132812, "logps/rejected": -92.72647094726562, "loss": 0.2871, "rewards/accuracies": 0.5, "rewards/chosen": 0.26093533635139465, "rewards/margins": 0.0015834786463528872, "rewards/rejected": 0.25935184955596924, "sft_loss": 0.28712767362594604, "step": 36 }, { "epoch": 0.4874433923425278, "grad_norm": 0.30854037404060364, "learning_rate": 3.0236006569153616e-07, "logits/chosen": 10.416954040527344, "logits/rejected": 11.237515449523926, "logps/chosen": -80.80354309082031, "logps/ref_chosen": -106.6348876953125, "logps/ref_rejected": -121.37834167480469, "logps/rejected": -94.95623779296875, "loss": 0.2868, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.2583135664463043, "rewards/margins": -0.0059075187891721725, "rewards/rejected": 0.26422107219696045, "sft_loss": 0.2867960035800934, "step": 37 }, { "epoch": 0.500617538081515, "grad_norm": 0.38461658358573914, "learning_rate": 2.9084434045463254e-07, "logits/chosen": 9.907350540161133, "logits/rejected": 10.992050170898438, "logps/chosen": -76.08082580566406, "logps/ref_chosen": -104.01033782958984, "logps/ref_rejected": -119.02666473388672, "logps/rejected": -90.15015411376953, "loss": 0.2865, "rewards/accuracies": 0.5, "rewards/chosen": 0.2792952358722687, "rewards/margins": -0.009469768032431602, "rewards/rejected": 0.28876498341560364, "sft_loss": 0.28654617071151733, "step": 38 }, { "epoch": 0.5137916838205022, "grad_norm": 0.32149407267570496, "learning_rate": 2.7923883049302066e-07, "logits/chosen": 10.409524917602539, "logits/rejected": 11.193167686462402, "logps/chosen": -82.37915802001953, "logps/ref_chosen": -109.76485443115234, "logps/ref_rejected": -122.25163269042969, "logps/rejected": -96.5480728149414, "loss": 0.2919, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2738569676876068, "rewards/margins": 0.016821369528770447, "rewards/rejected": 0.25703561305999756, "sft_loss": 0.2919383943080902, "step": 39 }, { "epoch": 0.5269658295594895, "grad_norm": 0.48350030183792114, "learning_rate": 2.6756904723632324e-07, "logits/chosen": 10.301675796508789, "logits/rejected": 11.467926025390625, "logps/chosen": -79.19075012207031, "logps/ref_chosen": -107.18782806396484, "logps/ref_rejected": -124.24542236328125, "logps/rejected": -96.9850082397461, "loss": 0.2805, "rewards/accuracies": 0.5, "rewards/chosen": 0.2799707055091858, "rewards/margins": 0.007366571109741926, "rewards/rejected": 0.27260416746139526, "sft_loss": 0.2805452048778534, "step": 40 }, { "epoch": 0.5401399752984768, "grad_norm": 0.27216726541519165, "learning_rate": 2.5586064340081516e-07, "logits/chosen": 10.603267669677734, "logits/rejected": 11.226158142089844, "logps/chosen": -78.87698364257812, "logps/ref_chosen": -106.42051696777344, "logps/ref_rejected": -122.25247192382812, "logps/rejected": -94.32707977294922, "loss": 0.2862, "rewards/accuracies": 0.515625, "rewards/chosen": 0.275435209274292, "rewards/margins": -0.003818750847131014, "rewards/rejected": 0.2792539596557617, "sft_loss": 0.28622862696647644, "step": 41 }, { "epoch": 0.553314121037464, "grad_norm": 0.33358198404312134, "learning_rate": 2.4413935659918487e-07, "logits/chosen": 9.588984489440918, "logits/rejected": 10.6452054977417, "logps/chosen": -74.2744369506836, "logps/ref_chosen": -103.1148452758789, "logps/ref_rejected": -116.55464935302734, "logps/rejected": -88.39811706542969, "loss": 0.2641, "rewards/accuracies": 0.546875, "rewards/chosen": 0.28840407729148865, "rewards/margins": 0.0068388087674975395, "rewards/rejected": 0.28156527876853943, "sft_loss": 0.2640990614891052, "step": 42 }, { "epoch": 0.5664882667764513, "grad_norm": 0.47038576006889343, "learning_rate": 2.3243095276367684e-07, "logits/chosen": 9.663254737854004, "logits/rejected": 10.8170747756958, "logps/chosen": -76.7999267578125, "logps/ref_chosen": -104.21064758300781, "logps/ref_rejected": -118.7614974975586, "logps/rejected": -91.26052856445312, "loss": 0.2744, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.2741071879863739, "rewards/margins": -0.0009024296887218952, "rewards/rejected": 0.2750096321105957, "sft_loss": 0.2744351625442505, "step": 43 }, { "epoch": 0.5796624125154385, "grad_norm": 0.30214762687683105, "learning_rate": 2.2076116950697937e-07, "logits/chosen": 9.781536102294922, "logits/rejected": 10.607444763183594, "logps/chosen": -72.30926513671875, "logps/ref_chosen": -100.59449005126953, "logps/ref_rejected": -115.95166778564453, "logps/rejected": -87.6107177734375, "loss": 0.2461, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.28285229206085205, "rewards/margins": -0.0005572582595050335, "rewards/rejected": 0.2834095358848572, "sft_loss": 0.24606820940971375, "step": 44 }, { "epoch": 0.5928365582544257, "grad_norm": 0.26689764857292175, "learning_rate": 2.091556595453674e-07, "logits/chosen": 9.98751449584961, "logits/rejected": 10.849931716918945, "logps/chosen": -78.55535888671875, "logps/ref_chosen": -106.96060943603516, "logps/ref_rejected": -125.49449157714844, "logps/rejected": -98.11154174804688, "loss": 0.2706, "rewards/accuracies": 0.53125, "rewards/chosen": 0.2840524911880493, "rewards/margins": 0.010223127901554108, "rewards/rejected": 0.2738293409347534, "sft_loss": 0.2706489562988281, "step": 45 }, { "epoch": 0.606010703993413, "grad_norm": 0.24426043033599854, "learning_rate": 1.9763993430846392e-07, "logits/chosen": 10.033075332641602, "logits/rejected": 10.728992462158203, "logps/chosen": -76.60491180419922, "logps/ref_chosen": -107.08544158935547, "logps/ref_rejected": -120.38542175292969, "logps/rejected": -89.13396453857422, "loss": 0.2738, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.30480533838272095, "rewards/margins": -0.00770913390442729, "rewards/rejected": 0.31251445412635803, "sft_loss": 0.2737652063369751, "step": 46 }, { "epoch": 0.6191848497324002, "grad_norm": 0.22300882637500763, "learning_rate": 1.862393078595809e-07, "logits/chosen": 9.950118064880371, "logits/rejected": 11.169652938842773, "logps/chosen": -77.8197021484375, "logps/ref_chosen": -105.74787902832031, "logps/ref_rejected": -122.93606567382812, "logps/rejected": -96.1205062866211, "loss": 0.2652, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.2792818248271942, "rewards/margins": 0.011126276105642319, "rewards/rejected": 0.2681555151939392, "sft_loss": 0.26522505283355713, "step": 47 }, { "epoch": 0.6323589954713874, "grad_norm": 0.2568327784538269, "learning_rate": 1.7497884124991485e-07, "logits/chosen": 10.451184272766113, "logits/rejected": 11.392836570739746, "logps/chosen": -76.05364990234375, "logps/ref_chosen": -105.3005599975586, "logps/ref_rejected": -123.93569946289062, "logps/rejected": -93.72474670410156, "loss": 0.2697, "rewards/accuracies": 0.5, "rewards/chosen": 0.29246923327445984, "rewards/margins": -0.009640296921133995, "rewards/rejected": 0.3021095395088196, "sft_loss": 0.26966598629951477, "step": 48 }, { "epoch": 0.6455331412103746, "grad_norm": 0.2787468433380127, "learning_rate": 1.6388328742891678e-07, "logits/chosen": 10.48294734954834, "logits/rejected": 11.362519264221191, "logps/chosen": -74.37510681152344, "logps/ref_chosen": -104.30430603027344, "logps/ref_rejected": -115.85497283935547, "logps/rejected": -86.97918701171875, "loss": 0.2744, "rewards/accuracies": 0.46875, "rewards/chosen": 0.2992919683456421, "rewards/margins": 0.010534043423831463, "rewards/rejected": 0.28875789046287537, "sft_loss": 0.2743627727031708, "step": 49 }, { "epoch": 0.6587072869493619, "grad_norm": 0.3125001788139343, "learning_rate": 1.5297703683193753e-07, "logits/chosen": 10.078466415405273, "logits/rejected": 10.955018997192383, "logps/chosen": -75.85308837890625, "logps/ref_chosen": -104.65946960449219, "logps/ref_rejected": -118.84170532226562, "logps/rejected": -90.01569366455078, "loss": 0.2577, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.28806373476982117, "rewards/margins": -0.00019637378863990307, "rewards/rejected": 0.2882601022720337, "sft_loss": 0.25767678022384644, "step": 50 }, { "epoch": 0.6718814326883491, "grad_norm": 0.15405406057834625, "learning_rate": 1.422840637647574e-07, "logits/chosen": 10.179081916809082, "logits/rejected": 10.733241081237793, "logps/chosen": -74.97821807861328, "logps/ref_chosen": -104.4243392944336, "logps/ref_rejected": -117.16233825683594, "logps/rejected": -88.18524932861328, "loss": 0.2584, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.294461190700531, "rewards/margins": 0.004690280184149742, "rewards/rejected": 0.2897709012031555, "sft_loss": 0.25844162702560425, "step": 51 }, { "epoch": 0.6850555784273363, "grad_norm": 0.16120746731758118, "learning_rate": 1.3182787370285865e-07, "logits/chosen": 9.57602596282959, "logits/rejected": 10.736429214477539, "logps/chosen": -72.85411071777344, "logps/ref_chosen": -101.99165344238281, "logps/ref_rejected": -123.20516204833984, "logps/rejected": -93.20382690429688, "loss": 0.2552, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.29137539863586426, "rewards/margins": -0.008637862280011177, "rewards/rejected": 0.3000132739543915, "sft_loss": 0.2551620900630951, "step": 52 }, { "epoch": 0.6982297241663236, "grad_norm": 0.19364570081233978, "learning_rate": 1.2163145162128946e-07, "logits/chosen": 10.08875846862793, "logits/rejected": 10.98922348022461, "logps/chosen": -78.81430053710938, "logps/ref_chosen": -108.26175689697266, "logps/ref_rejected": -118.12374114990234, "logps/rejected": -89.11669921875, "loss": 0.2577, "rewards/accuracies": 0.53125, "rewards/chosen": 0.2944745421409607, "rewards/margins": 0.004404103849083185, "rewards/rejected": 0.29007044434547424, "sft_loss": 0.2576565444469452, "step": 53 }, { "epoch": 0.7114038699053108, "grad_norm": 0.28979793190956116, "learning_rate": 1.1171721146870014e-07, "logits/chosen": 10.206223487854004, "logits/rejected": 11.192782402038574, "logps/chosen": -76.93553924560547, "logps/ref_chosen": -108.5864028930664, "logps/ref_rejected": -130.25155639648438, "logps/rejected": -96.52012634277344, "loss": 0.2871, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.31650859117507935, "rewards/margins": -0.020805664360523224, "rewards/rejected": 0.337314248085022, "sft_loss": 0.2871078848838806, "step": 54 }, { "epoch": 0.724578015644298, "grad_norm": 0.1910555362701416, "learning_rate": 1.0210694689661939e-07, "logits/chosen": 10.210641860961914, "logits/rejected": 11.032613754272461, "logps/chosen": -74.83497619628906, "logps/ref_chosen": -105.69741821289062, "logps/ref_rejected": -122.07044219970703, "logps/rejected": -90.28074645996094, "loss": 0.2573, "rewards/accuracies": 0.46875, "rewards/chosen": 0.308624267578125, "rewards/margins": -0.009272638708353043, "rewards/rejected": 0.3178969621658325, "sft_loss": 0.25734585523605347, "step": 55 }, { "epoch": 0.7377521613832853, "grad_norm": 0.27673250436782837, "learning_rate": 9.282178335227883e-08, "logits/chosen": 9.825605392456055, "logits/rejected": 11.014497756958008, "logps/chosen": -76.0024642944336, "logps/ref_chosen": -106.5007095336914, "logps/ref_rejected": -123.01736450195312, "logps/rejected": -92.10081481933594, "loss": 0.2743, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.3049824833869934, "rewards/margins": -0.004183035343885422, "rewards/rejected": 0.30916550755500793, "sft_loss": 0.27425283193588257, "step": 56 }, { "epoch": 0.7509263071222725, "grad_norm": 0.18975986540317535, "learning_rate": 8.388213164029459e-08, "logits/chosen": 10.642633438110352, "logits/rejected": 11.387500762939453, "logps/chosen": -79.39006805419922, "logps/ref_chosen": -109.18460083007812, "logps/ref_rejected": -124.3697280883789, "logps/rejected": -92.89251708984375, "loss": 0.2729, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2979453504085541, "rewards/margins": -0.016826828941702843, "rewards/rejected": 0.3147721290588379, "sft_loss": 0.2728780210018158, "step": 57 }, { "epoch": 0.7641004528612598, "grad_norm": 0.41853123903274536, "learning_rate": 7.530764305528958e-08, "logits/chosen": 9.80534839630127, "logits/rejected": 10.46203327178955, "logps/chosen": -73.13560485839844, "logps/ref_chosen": -104.43944549560547, "logps/ref_rejected": -118.44985961914062, "logps/rejected": -85.20870208740234, "loss": 0.2597, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.3130384385585785, "rewards/margins": -0.01937328279018402, "rewards/rejected": 0.33241167664527893, "sft_loss": 0.2597176730632782, "step": 58 }, { "epoch": 0.7772745986002471, "grad_norm": 0.2700183391571045, "learning_rate": 6.711716618408281e-08, "logits/chosen": 10.213776588439941, "logits/rejected": 11.164811134338379, "logps/chosen": -73.02882385253906, "logps/ref_chosen": -103.32658386230469, "logps/ref_rejected": -121.63726806640625, "logps/rejected": -90.34732818603516, "loss": 0.2687, "rewards/accuracies": 0.4296875, "rewards/chosen": 0.3029775619506836, "rewards/margins": -0.009921873919665813, "rewards/rejected": 0.3128994107246399, "sft_loss": 0.26870667934417725, "step": 59 }, { "epoch": 0.7904487443392343, "grad_norm": 0.14984337985515594, "learning_rate": 5.932870547240454e-08, "logits/chosen": 9.960476875305176, "logits/rejected": 11.033708572387695, "logps/chosen": -72.57840728759766, "logps/ref_chosen": -102.98921966552734, "logps/ref_rejected": -124.47185516357422, "logps/rejected": -92.38633728027344, "loss": 0.2563, "rewards/accuracies": 0.46875, "rewards/chosen": 0.3041080832481384, "rewards/margins": -0.01674714870750904, "rewards/rejected": 0.32085520029067993, "sft_loss": 0.2563033699989319, "step": 60 }, { "epoch": 0.8036228900782215, "grad_norm": 0.31175732612609863, "learning_rate": 5.1959381647217665e-08, "logits/chosen": 9.977319717407227, "logits/rejected": 10.946649551391602, "logps/chosen": -73.95118713378906, "logps/ref_chosen": -106.28311157226562, "logps/ref_rejected": -121.47750854492188, "logps/rejected": -88.92882537841797, "loss": 0.2666, "rewards/accuracies": 0.5, "rewards/chosen": 0.3233192563056946, "rewards/margins": -0.0021677182521671057, "rewards/rejected": 0.3254869878292084, "sft_loss": 0.26657047867774963, "step": 61 }, { "epoch": 0.8167970358172087, "grad_norm": 0.3104262351989746, "learning_rate": 4.502539408164385e-08, "logits/chosen": 10.092552185058594, "logits/rejected": 10.915599822998047, "logps/chosen": -77.04930114746094, "logps/ref_chosen": -109.67979431152344, "logps/ref_rejected": -120.36711120605469, "logps/rejected": -87.77507781982422, "loss": 0.2549, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.32630497217178345, "rewards/margins": 0.00038449978455901146, "rewards/rejected": 0.3259204030036926, "sft_loss": 0.2549050450325012, "step": 62 }, { "epoch": 0.829971181556196, "grad_norm": 1.811981201171875, "learning_rate": 3.854198518522564e-08, "logits/chosen": 10.26008415222168, "logits/rejected": 11.338315963745117, "logps/chosen": -75.18155670166016, "logps/ref_chosen": -106.88896942138672, "logps/ref_rejected": -122.57796478271484, "logps/rejected": -91.81503295898438, "loss": 0.2708, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.3170740604400635, "rewards/margins": 0.009444723837077618, "rewards/rejected": 0.3076293170452118, "sft_loss": 0.270816445350647, "step": 63 }, { "epoch": 0.8431453272951832, "grad_norm": 0.19537799060344696, "learning_rate": 3.2523406897802444e-08, "logits/chosen": 10.241344451904297, "logits/rejected": 10.94163990020752, "logps/chosen": -78.00033569335938, "logps/ref_chosen": -109.29510498046875, "logps/ref_rejected": -121.51821899414062, "logps/rejected": -88.49795532226562, "loss": 0.2733, "rewards/accuracies": 0.40625, "rewards/chosen": 0.3129475712776184, "rewards/margins": -0.017255008220672607, "rewards/rejected": 0.33020254969596863, "sft_loss": 0.2733091115951538, "step": 64 }, { "epoch": 0.8563194730341704, "grad_norm": 0.3109639286994934, "learning_rate": 2.6982889360653376e-08, "logits/chosen": 9.754749298095703, "logits/rejected": 11.062653541564941, "logps/chosen": -76.51459503173828, "logps/ref_chosen": -110.15232849121094, "logps/ref_rejected": -127.54279327392578, "logps/rejected": -96.31084442138672, "loss": 0.271, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3363773226737976, "rewards/margins": 0.024057911708950996, "rewards/rejected": 0.31231939792633057, "sft_loss": 0.27095291018486023, "step": 65 }, { "epoch": 0.8694936187731577, "grad_norm": 0.22148017585277557, "learning_rate": 2.1932611833775843e-08, "logits/chosen": 9.787257194519043, "logits/rejected": 10.842453956604004, "logps/chosen": -72.53459167480469, "logps/ref_chosen": -104.0207748413086, "logps/ref_rejected": -126.93211364746094, "logps/rejected": -92.82463073730469, "loss": 0.2518, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.31486180424690247, "rewards/margins": -0.026213109493255615, "rewards/rejected": 0.3410749137401581, "sft_loss": 0.25176769495010376, "step": 66 }, { "epoch": 0.8826677645121449, "grad_norm": 0.23339781165122986, "learning_rate": 1.738367592322837e-08, "logits/chosen": 10.151799201965332, "logits/rejected": 11.080910682678223, "logps/chosen": -72.54059600830078, "logps/ref_chosen": -104.55751037597656, "logps/ref_rejected": -119.71514892578125, "logps/rejected": -87.53738403320312, "loss": 0.2572, "rewards/accuracies": 0.484375, "rewards/chosen": 0.32016903162002563, "rewards/margins": -0.001608673483133316, "rewards/rejected": 0.3217777609825134, "sft_loss": 0.25715309381484985, "step": 67 }, { "epoch": 0.8958419102511321, "grad_norm": 0.3316951394081116, "learning_rate": 1.3346081177391472e-08, "logits/chosen": 10.458593368530273, "logits/rejected": 10.821167945861816, "logps/chosen": -76.2717056274414, "logps/ref_chosen": -107.26033020019531, "logps/ref_rejected": -115.8590087890625, "logps/rejected": -83.77940368652344, "loss": 0.2695, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.3098861873149872, "rewards/margins": -0.010909780859947205, "rewards/rejected": 0.3207959830760956, "sft_loss": 0.26949527859687805, "step": 68 }, { "epoch": 0.9090160559901194, "grad_norm": 0.5555047392845154, "learning_rate": 9.828703105789981e-09, "logits/chosen": 10.413060188293457, "logits/rejected": 11.321882247924805, "logps/chosen": -73.33889770507812, "logps/ref_chosen": -106.8610610961914, "logps/ref_rejected": -122.44428253173828, "logps/rejected": -89.23860931396484, "loss": 0.2688, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.3352215886116028, "rewards/margins": 0.003164912573993206, "rewards/rejected": 0.3320567011833191, "sft_loss": 0.26880374550819397, "step": 69 }, { "epoch": 0.9221902017291066, "grad_norm": 0.31241339445114136, "learning_rate": 6.839273668796747e-09, "logits/chosen": 10.117363929748535, "logits/rejected": 10.65896224975586, "logps/chosen": -76.24555969238281, "logps/ref_chosen": -106.58778381347656, "logps/ref_rejected": -124.77790832519531, "logps/rejected": -93.02002716064453, "loss": 0.2603, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.3034222424030304, "rewards/margins": -0.014156593009829521, "rewards/rejected": 0.3175787925720215, "sft_loss": 0.2603415846824646, "step": 70 }, { "epoch": 0.9353643474680938, "grad_norm": 0.3154677748680115, "learning_rate": 4.384364281105973e-09, "logits/chosen": 10.104023933410645, "logits/rejected": 11.347376823425293, "logps/chosen": -70.88841247558594, "logps/ref_chosen": -104.39148712158203, "logps/ref_rejected": -120.59461212158203, "logps/rejected": -87.24669647216797, "loss": 0.2746, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.33503076434135437, "rewards/margins": 0.001551617868244648, "rewards/rejected": 0.3334791362285614, "sft_loss": 0.2746419608592987, "step": 71 }, { "epoch": 0.9485384932070811, "grad_norm": 0.27243080735206604, "learning_rate": 2.469371366337264e-09, "logits/chosen": 10.178709030151367, "logits/rejected": 11.207265853881836, "logps/chosen": -77.24491882324219, "logps/ref_chosen": -108.53898620605469, "logps/ref_rejected": -123.26167297363281, "logps/rejected": -91.20188903808594, "loss": 0.2649, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.31294065713882446, "rewards/margins": -0.007657179608941078, "rewards/rejected": 0.3205978274345398, "sft_loss": 0.26487135887145996, "step": 72 }, { "epoch": 0.9617126389460683, "grad_norm": 0.29381245374679565, "learning_rate": 1.0985044945254762e-09, "logits/chosen": 10.011255264282227, "logits/rejected": 10.857406616210938, "logps/chosen": -79.84062957763672, "logps/ref_chosen": -110.50595092773438, "logps/ref_rejected": -124.92510223388672, "logps/rejected": -94.2270736694336, "loss": 0.2583, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.3066532015800476, "rewards/margins": -0.0003270732704550028, "rewards/rejected": 0.30698028206825256, "sft_loss": 0.258339524269104, "step": 73 }, { "epoch": 0.9748867846850556, "grad_norm": 0.4308127462863922, "learning_rate": 2.7477712857215675e-10, "logits/chosen": 10.271313667297363, "logits/rejected": 11.117928504943848, "logps/chosen": -75.68081665039062, "logps/ref_chosen": -107.1200942993164, "logps/ref_rejected": -120.22421264648438, "logps/rejected": -89.6799545288086, "loss": 0.2564, "rewards/accuracies": 0.546875, "rewards/chosen": 0.3143928647041321, "rewards/margins": 0.008950251154601574, "rewards/rejected": 0.3054426312446594, "sft_loss": 0.2563988268375397, "step": 74 }, { "epoch": 0.9880609304240429, "grad_norm": 0.1982087343931198, "learning_rate": 0.0, "logits/chosen": 10.055291175842285, "logits/rejected": 10.858556747436523, "logps/chosen": -75.02184295654297, "logps/ref_chosen": -107.40764617919922, "logps/ref_rejected": -120.6578369140625, "logps/rejected": -89.03117370605469, "loss": 0.2485, "rewards/accuracies": 0.53125, "rewards/chosen": 0.32385802268981934, "rewards/margins": 0.0075914738699793816, "rewards/rejected": 0.3162665367126465, "sft_loss": 0.24852335453033447, "step": 75 }, { "epoch": 0.9880609304240429, "step": 75, "total_flos": 0.0, "train_loss": 0.29937881012757617, "train_runtime": 8120.0447, "train_samples_per_second": 1.196, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }