lzc0525's picture
Upload folder using huggingface_hub
82e400f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9880609304240429,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013174145738987238,
"grad_norm": 1.194186806678772,
"learning_rate": 6.25e-08,
"logits/chosen": 9.990612030029297,
"logits/rejected": 10.698101997375488,
"logps/chosen": -102.88545989990234,
"logps/ref_chosen": -102.88545989990234,
"logps/ref_rejected": -121.84871673583984,
"logps/rejected": -121.84871673583984,
"loss": 0.3675,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"sft_loss": 0.36753880977630615,
"step": 1
},
{
"epoch": 0.026348291477974475,
"grad_norm": 0.5353251099586487,
"learning_rate": 1.25e-07,
"logits/chosen": 10.211905479431152,
"logits/rejected": 11.06594467163086,
"logps/chosen": -107.70349884033203,
"logps/ref_chosen": -107.70349884033203,
"logps/ref_rejected": -121.89966583251953,
"logps/rejected": -121.89966583251953,
"loss": 0.4101,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"sft_loss": 0.41013145446777344,
"step": 2
},
{
"epoch": 0.03952243721696171,
"grad_norm": 0.7126303315162659,
"learning_rate": 1.875e-07,
"logits/chosen": 10.032384872436523,
"logits/rejected": 11.023520469665527,
"logps/chosen": -108.3123779296875,
"logps/ref_chosen": -107.98188781738281,
"logps/ref_rejected": -124.51527404785156,
"logps/rejected": -124.87130737304688,
"loss": 0.412,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -0.003304910147562623,
"rewards/margins": 0.0002554532838985324,
"rewards/rejected": -0.003560363780707121,
"sft_loss": 0.41195932030677795,
"step": 3
},
{
"epoch": 0.05269658295594895,
"grad_norm": 1.2344533205032349,
"learning_rate": 2.5e-07,
"logits/chosen": 9.836658477783203,
"logits/rejected": 10.855621337890625,
"logps/chosen": -109.55919647216797,
"logps/ref_chosen": -109.20836639404297,
"logps/ref_rejected": -119.23908996582031,
"logps/rejected": -119.48279571533203,
"loss": 0.4039,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -0.003508324269205332,
"rewards/margins": -0.0010712125804275274,
"rewards/rejected": -0.0024371116887778044,
"sft_loss": 0.4038863480091095,
"step": 4
},
{
"epoch": 0.06587072869493618,
"grad_norm": 1.426048994064331,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": 10.212320327758789,
"logits/rejected": 10.966379165649414,
"logps/chosen": -103.76991271972656,
"logps/ref_chosen": -103.87680053710938,
"logps/ref_rejected": -118.41618347167969,
"logps/rejected": -118.23270416259766,
"loss": 0.3697,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.0010687037138268352,
"rewards/margins": -0.000766113749705255,
"rewards/rejected": 0.0018348174635320902,
"sft_loss": 0.3697226345539093,
"step": 5
},
{
"epoch": 0.07904487443392343,
"grad_norm": 1.413549780845642,
"learning_rate": 3.75e-07,
"logits/chosen": 10.700042724609375,
"logits/rejected": 11.478326797485352,
"logps/chosen": -107.56877899169922,
"logps/ref_chosen": -107.58968353271484,
"logps/ref_rejected": -122.07303619384766,
"logps/rejected": -121.85940551757812,
"loss": 0.3909,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.0002090137859340757,
"rewards/margins": -0.0019273017533123493,
"rewards/rejected": 0.00213631521910429,
"sft_loss": 0.390906423330307,
"step": 6
},
{
"epoch": 0.09221902017291066,
"grad_norm": 1.2342580556869507,
"learning_rate": 4.375e-07,
"logits/chosen": 10.01632308959961,
"logits/rejected": 10.7178955078125,
"logps/chosen": -107.01339721679688,
"logps/ref_chosen": -107.42727661132812,
"logps/ref_rejected": -116.87063598632812,
"logps/rejected": -116.37357330322266,
"loss": 0.3747,
"rewards/accuracies": 0.4453125,
"rewards/chosen": 0.00413867924362421,
"rewards/margins": -0.0008318667532876134,
"rewards/rejected": 0.004970546346157789,
"sft_loss": 0.3746669888496399,
"step": 7
},
{
"epoch": 0.1053931659118979,
"grad_norm": 0.6644937992095947,
"learning_rate": 5e-07,
"logits/chosen": 10.211028099060059,
"logits/rejected": 11.11027717590332,
"logps/chosen": -104.41184997558594,
"logps/ref_chosen": -105.60282135009766,
"logps/ref_rejected": -119.53916931152344,
"logps/rejected": -118.27430725097656,
"loss": 0.3773,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.01190974935889244,
"rewards/margins": -0.0007388982339762151,
"rewards/rejected": 0.012648648582398891,
"sft_loss": 0.37729793787002563,
"step": 8
},
{
"epoch": 0.11856731165088513,
"grad_norm": 0.9437576532363892,
"learning_rate": 4.997252228714278e-07,
"logits/chosen": 10.179821014404297,
"logits/rejected": 11.147579193115234,
"logps/chosen": -104.13174438476562,
"logps/ref_chosen": -105.46086120605469,
"logps/ref_rejected": -119.00373840332031,
"logps/rejected": -117.734130859375,
"loss": 0.3807,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.013291322626173496,
"rewards/margins": 0.0005952615174464881,
"rewards/rejected": 0.012696062214672565,
"sft_loss": 0.38070446252822876,
"step": 9
},
{
"epoch": 0.13174145738987236,
"grad_norm": 0.700039803981781,
"learning_rate": 4.989014955054745e-07,
"logits/chosen": 10.076737403869629,
"logits/rejected": 10.897785186767578,
"logps/chosen": -100.81087493896484,
"logps/ref_chosen": -104.21009826660156,
"logps/ref_rejected": -118.9209213256836,
"logps/rejected": -115.75495910644531,
"loss": 0.3367,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.033992186188697815,
"rewards/margins": 0.0023326175287365913,
"rewards/rejected": 0.0316595658659935,
"sft_loss": 0.33672136068344116,
"step": 10
},
{
"epoch": 0.14491560312885962,
"grad_norm": 0.9160856008529663,
"learning_rate": 4.975306286336627e-07,
"logits/chosen": 9.973880767822266,
"logits/rejected": 11.158487319946289,
"logps/chosen": -101.3505630493164,
"logps/ref_chosen": -105.94319152832031,
"logps/ref_rejected": -122.76007843017578,
"logps/rejected": -118.6338119506836,
"loss": 0.3851,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.045926500111818314,
"rewards/margins": 0.004663803614675999,
"rewards/rejected": 0.04126270115375519,
"sft_loss": 0.3850533962249756,
"step": 11
},
{
"epoch": 0.15808974886784685,
"grad_norm": 0.9421964883804321,
"learning_rate": 4.956156357188939e-07,
"logits/chosen": 9.908226013183594,
"logits/rejected": 10.598045349121094,
"logps/chosen": -103.32762908935547,
"logps/ref_chosen": -109.08442687988281,
"logps/ref_rejected": -121.41947174072266,
"logps/rejected": -115.84996795654297,
"loss": 0.3532,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.05756799131631851,
"rewards/margins": 0.0018730255542322993,
"rewards/rejected": 0.05569496005773544,
"sft_loss": 0.3532242476940155,
"step": 12
},
{
"epoch": 0.17126389460683408,
"grad_norm": 0.3328304886817932,
"learning_rate": 4.931607263312032e-07,
"logits/chosen": 9.964012145996094,
"logits/rejected": 11.03992748260498,
"logps/chosen": -98.97601318359375,
"logps/ref_chosen": -104.62150573730469,
"logps/ref_rejected": -119.55384063720703,
"logps/rejected": -114.12371826171875,
"loss": 0.3686,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.056454867124557495,
"rewards/margins": 0.0021536569111049175,
"rewards/rejected": 0.05430121719837189,
"sft_loss": 0.36859023571014404,
"step": 13
},
{
"epoch": 0.1844380403458213,
"grad_norm": 0.30642038583755493,
"learning_rate": 4.9017129689421e-07,
"logits/chosen": 10.519927978515625,
"logits/rejected": 11.649580001831055,
"logps/chosen": -96.5634765625,
"logps/ref_chosen": -106.179443359375,
"logps/ref_rejected": -120.73036193847656,
"logps/rejected": -110.86133575439453,
"loss": 0.3385,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.09615952521562576,
"rewards/margins": -0.002530643017962575,
"rewards/rejected": 0.09869016706943512,
"sft_loss": 0.338548868894577,
"step": 14
},
{
"epoch": 0.19761218608480857,
"grad_norm": 0.301073282957077,
"learning_rate": 4.866539188226085e-07,
"logits/chosen": 9.891039848327637,
"logits/rejected": 10.824172973632812,
"logps/chosen": -95.14861297607422,
"logps/ref_chosen": -105.70547485351562,
"logps/ref_rejected": -118.89997863769531,
"logps/rejected": -108.2326889038086,
"loss": 0.3305,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.10556865483522415,
"rewards/margins": -0.0011043368140235543,
"rewards/rejected": 0.1066729873418808,
"sft_loss": 0.3305360674858093,
"step": 15
},
{
"epoch": 0.2107863318237958,
"grad_norm": 0.36065343022346497,
"learning_rate": 4.826163240767716e-07,
"logits/chosen": 10.682470321655273,
"logits/rejected": 11.299846649169922,
"logps/chosen": -96.53520202636719,
"logps/ref_chosen": -108.86376953125,
"logps/ref_rejected": -122.1635513305664,
"logps/rejected": -110.50537872314453,
"loss": 0.3484,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.12328556925058365,
"rewards/margins": 0.006703883409500122,
"rewards/rejected": 0.11658168584108353,
"sft_loss": 0.3484281003475189,
"step": 16
},
{
"epoch": 0.22396047756278303,
"grad_norm": 0.5325565934181213,
"learning_rate": 4.780673881662242e-07,
"logits/chosen": 10.187503814697266,
"logits/rejected": 10.843408584594727,
"logps/chosen": -90.1707992553711,
"logps/ref_chosen": -102.93986511230469,
"logps/ref_rejected": -119.43718719482422,
"logps/rejected": -106.76301574707031,
"loss": 0.359,
"rewards/accuracies": 0.4453125,
"rewards/chosen": 0.12769076228141785,
"rewards/margins": 0.0009490540251135826,
"rewards/rejected": 0.1267417073249817,
"sft_loss": 0.3589847683906555,
"step": 17
},
{
"epoch": 0.23713462330177026,
"grad_norm": 0.4098409116268158,
"learning_rate": 4.730171106393466e-07,
"logits/chosen": 10.4215669631958,
"logits/rejected": 11.216498374938965,
"logps/chosen": -90.09894561767578,
"logps/ref_chosen": -103.81341552734375,
"logps/ref_rejected": -117.45123291015625,
"logps/rejected": -104.58552551269531,
"loss": 0.3368,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.13714462518692017,
"rewards/margins": 0.00848748255521059,
"rewards/rejected": 0.12865713238716125,
"sft_loss": 0.33678534626960754,
"step": 18
},
{
"epoch": 0.2503087690407575,
"grad_norm": 0.3302735984325409,
"learning_rate": 4.6747659310219757e-07,
"logits/chosen": 10.332744598388672,
"logits/rejected": 11.005766868591309,
"logps/chosen": -94.52428436279297,
"logps/ref_chosen": -107.85797119140625,
"logps/ref_rejected": -121.88042449951172,
"logps/rejected": -108.09265899658203,
"loss": 0.3222,
"rewards/accuracies": 0.4296875,
"rewards/chosen": 0.1333368420600891,
"rewards/margins": -0.004540742840617895,
"rewards/rejected": 0.13787758350372314,
"sft_loss": 0.3221552073955536,
"step": 19
},
{
"epoch": 0.2634829147797447,
"grad_norm": 0.40531161427497864,
"learning_rate": 4.6145801481477433e-07,
"logits/chosen": 10.747330665588379,
"logits/rejected": 11.561124801635742,
"logps/chosen": -89.97228240966797,
"logps/ref_chosen": -103.42721557617188,
"logps/ref_rejected": -116.7796630859375,
"logps/rejected": -103.99850463867188,
"loss": 0.3157,
"rewards/accuracies": 0.5703125,
"rewards/chosen": 0.13454943895339966,
"rewards/margins": 0.006737923249602318,
"rewards/rejected": 0.1278115212917328,
"sft_loss": 0.3156886100769043,
"step": 20
},
{
"epoch": 0.276657060518732,
"grad_norm": 0.2686282992362976,
"learning_rate": 4.549746059183561e-07,
"logits/chosen": 9.720458984375,
"logits/rejected": 10.846506118774414,
"logps/chosen": -92.48249816894531,
"logps/ref_chosen": -106.60163879394531,
"logps/ref_rejected": -124.56562805175781,
"logps/rejected": -109.58876037597656,
"loss": 0.3106,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.14119136333465576,
"rewards/margins": -0.00857722107321024,
"rewards/rejected": 0.14976857602596283,
"sft_loss": 0.31064528226852417,
"step": 21
},
{
"epoch": 0.28983120625771924,
"grad_norm": 0.5989738702774048,
"learning_rate": 4.480406183527823e-07,
"logits/chosen": 10.225810050964355,
"logits/rejected": 11.099544525146484,
"logps/chosen": -88.04141998291016,
"logps/ref_chosen": -103.77696228027344,
"logps/ref_rejected": -118.73616027832031,
"logps/rejected": -104.40451049804688,
"loss": 0.3321,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.157355397939682,
"rewards/margins": 0.014038847759366035,
"rewards/rejected": 0.14331655204296112,
"sft_loss": 0.3321138620376587,
"step": 22
},
{
"epoch": 0.3030053519967065,
"grad_norm": 0.1970965415239334,
"learning_rate": 4.4067129452759546e-07,
"logits/chosen": 10.115339279174805,
"logits/rejected": 11.140266418457031,
"logps/chosen": -87.26233673095703,
"logps/ref_chosen": -104.72956085205078,
"logps/ref_rejected": -121.35556030273438,
"logps/rejected": -104.43501281738281,
"loss": 0.3228,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.17467224597930908,
"rewards/margins": 0.005466699134558439,
"rewards/rejected": 0.16920553147792816,
"sft_loss": 0.3228015899658203,
"step": 23
},
{
"epoch": 0.3161794977356937,
"grad_norm": 0.2894323468208313,
"learning_rate": 4.3288283381591725e-07,
"logits/chosen": 10.147160530090332,
"logits/rejected": 10.98647689819336,
"logps/chosen": -86.99087524414062,
"logps/ref_chosen": -105.88758087158203,
"logps/ref_rejected": -125.69054412841797,
"logps/rejected": -106.15878295898438,
"loss": 0.3069,
"rewards/accuracies": 0.4765625,
"rewards/chosen": 0.1889670193195343,
"rewards/margins": -0.0063507393933832645,
"rewards/rejected": 0.19531774520874023,
"sft_loss": 0.30694928765296936,
"step": 24
},
{
"epoch": 0.32935364347468093,
"grad_norm": 0.31463876366615295,
"learning_rate": 4.246923569447104e-07,
"logits/chosen": 10.327369689941406,
"logits/rejected": 11.063910484313965,
"logps/chosen": -87.99880981445312,
"logps/ref_chosen": -110.0761489868164,
"logps/ref_rejected": -129.10540771484375,
"logps/rejected": -107.19017028808594,
"loss": 0.2993,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.22077329456806183,
"rewards/margins": 0.001621072180569172,
"rewards/rejected": 0.21915221214294434,
"sft_loss": 0.2992705702781677,
"step": 25
},
{
"epoch": 0.34252778921366817,
"grad_norm": 0.25855836272239685,
"learning_rate": 4.161178683597054e-07,
"logits/chosen": 10.388958930969238,
"logits/rejected": 11.489179611206055,
"logps/chosen": -81.3349609375,
"logps/ref_chosen": -103.74571990966797,
"logps/ref_rejected": -120.73832702636719,
"logps/rejected": -98.57904052734375,
"loss": 0.2909,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.22410757839679718,
"rewards/margins": 0.0025147469714283943,
"rewards/rejected": 0.22159285843372345,
"sft_loss": 0.2909452021121979,
"step": 26
},
{
"epoch": 0.3557019349526554,
"grad_norm": 0.5696946382522583,
"learning_rate": 4.0717821664772124e-07,
"logits/chosen": 10.086296081542969,
"logits/rejected": 11.336379051208496,
"logps/chosen": -81.64080810546875,
"logps/ref_chosen": -105.47428131103516,
"logps/ref_rejected": -120.5193099975586,
"logps/rejected": -97.64772033691406,
"loss": 0.3163,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.23833464086055756,
"rewards/margins": 0.00961877591907978,
"rewards/rejected": 0.22871585190296173,
"sft_loss": 0.316275417804718,
"step": 27
},
{
"epoch": 0.3688760806916426,
"grad_norm": 0.14858409762382507,
"learning_rate": 3.978930531033806e-07,
"logits/chosen": 9.710855484008789,
"logits/rejected": 10.872222900390625,
"logps/chosen": -80.8717269897461,
"logps/ref_chosen": -103.72540283203125,
"logps/ref_rejected": -119.79557800292969,
"logps/rejected": -96.69274139404297,
"loss": 0.2766,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.22853665053844452,
"rewards/margins": -0.0024917693808674812,
"rewards/rejected": 0.23102842271327972,
"sft_loss": 0.2765931785106659,
"step": 28
},
{
"epoch": 0.3820502264306299,
"grad_norm": 0.2741422951221466,
"learning_rate": 3.882827885312998e-07,
"logits/chosen": 10.16092586517334,
"logits/rejected": 11.23297119140625,
"logps/chosen": -85.11812591552734,
"logps/ref_chosen": -108.65434265136719,
"logps/ref_rejected": -121.46784973144531,
"logps/rejected": -98.77241516113281,
"loss": 0.2799,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.23536208271980286,
"rewards/margins": 0.008407761342823505,
"rewards/rejected": 0.22695434093475342,
"sft_loss": 0.27991783618927,
"step": 29
},
{
"epoch": 0.39522437216961714,
"grad_norm": 0.16848187148571014,
"learning_rate": 3.7836854837871044e-07,
"logits/chosen": 10.2907133102417,
"logits/rejected": 11.690597534179688,
"logps/chosen": -78.23504638671875,
"logps/ref_chosen": -103.62174224853516,
"logps/ref_rejected": -126.73807525634766,
"logps/rejected": -102.43669128417969,
"loss": 0.2962,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.2538670301437378,
"rewards/margins": 0.010853251442313194,
"rewards/rejected": 0.24301378428936005,
"sft_loss": 0.2962155342102051,
"step": 30
},
{
"epoch": 0.4083985179086044,
"grad_norm": 0.1987890601158142,
"learning_rate": 3.681721262971413e-07,
"logits/chosen": 9.929094314575195,
"logits/rejected": 10.946361541748047,
"logps/chosen": -80.73751831054688,
"logps/ref_chosen": -106.10479736328125,
"logps/ref_rejected": -120.6382827758789,
"logps/rejected": -96.38467407226562,
"loss": 0.2982,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.2536728084087372,
"rewards/margins": 0.011136716231703758,
"rewards/rejected": 0.24253609776496887,
"sft_loss": 0.2981662452220917,
"step": 31
},
{
"epoch": 0.4215726636475916,
"grad_norm": 0.1829695999622345,
"learning_rate": 3.577159362352426e-07,
"logits/chosen": 10.097947120666504,
"logits/rejected": 11.477932929992676,
"logps/chosen": -82.30887603759766,
"logps/ref_chosen": -105.99569702148438,
"logps/ref_rejected": -128.34303283691406,
"logps/rejected": -104.14814758300781,
"loss": 0.2848,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.2368682324886322,
"rewards/margins": -0.0050805676728487015,
"rewards/rejected": 0.24194881319999695,
"sft_loss": 0.28480714559555054,
"step": 32
},
{
"epoch": 0.43474680938657884,
"grad_norm": 0.22964715957641602,
"learning_rate": 3.470229631680624e-07,
"logits/chosen": 10.105993270874023,
"logits/rejected": 10.923880577087402,
"logps/chosen": -81.49787902832031,
"logps/ref_chosen": -105.72196197509766,
"logps/ref_rejected": -121.59507751464844,
"logps/rejected": -97.2413101196289,
"loss": 0.2686,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.242240771651268,
"rewards/margins": -0.0012968010269105434,
"rewards/rejected": 0.24353757500648499,
"sft_loss": 0.2686034142971039,
"step": 33
},
{
"epoch": 0.44792095512556607,
"grad_norm": 0.2910502254962921,
"learning_rate": 3.361167125710832e-07,
"logits/chosen": 10.264101028442383,
"logits/rejected": 11.107752799987793,
"logps/chosen": -85.68121337890625,
"logps/ref_chosen": -111.4834976196289,
"logps/ref_rejected": -130.48089599609375,
"logps/rejected": -104.08442687988281,
"loss": 0.3039,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.2580227851867676,
"rewards/margins": -0.005941788665950298,
"rewards/rejected": 0.26396459341049194,
"sft_loss": 0.3038797080516815,
"step": 34
},
{
"epoch": 0.4610951008645533,
"grad_norm": 0.2656664550304413,
"learning_rate": 3.2502115875008516e-07,
"logits/chosen": 10.529006958007812,
"logits/rejected": 11.506540298461914,
"logps/chosen": -82.62615966796875,
"logps/ref_chosen": -108.9183349609375,
"logps/ref_rejected": -121.32493591308594,
"logps/rejected": -95.86014556884766,
"loss": 0.2751,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.26292186975479126,
"rewards/margins": 0.008274020627140999,
"rewards/rejected": 0.2546478509902954,
"sft_loss": 0.2751036286354065,
"step": 35
},
{
"epoch": 0.47426924660354053,
"grad_norm": 0.4242345690727234,
"learning_rate": 3.137606921404191e-07,
"logits/chosen": 10.204312324523926,
"logits/rejected": 10.856239318847656,
"logps/chosen": -81.047607421875,
"logps/ref_chosen": -107.1411361694336,
"logps/ref_rejected": -118.66165161132812,
"logps/rejected": -92.72647094726562,
"loss": 0.2871,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.26093533635139465,
"rewards/margins": 0.0015834786463528872,
"rewards/rejected": 0.25935184955596924,
"sft_loss": 0.28712767362594604,
"step": 36
},
{
"epoch": 0.4874433923425278,
"grad_norm": 0.30854037404060364,
"learning_rate": 3.0236006569153616e-07,
"logits/chosen": 10.416954040527344,
"logits/rejected": 11.237515449523926,
"logps/chosen": -80.80354309082031,
"logps/ref_chosen": -106.6348876953125,
"logps/ref_rejected": -121.37834167480469,
"logps/rejected": -94.95623779296875,
"loss": 0.2868,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.2583135664463043,
"rewards/margins": -0.0059075187891721725,
"rewards/rejected": 0.26422107219696045,
"sft_loss": 0.2867960035800934,
"step": 37
},
{
"epoch": 0.500617538081515,
"grad_norm": 0.38461658358573914,
"learning_rate": 2.9084434045463254e-07,
"logits/chosen": 9.907350540161133,
"logits/rejected": 10.992050170898438,
"logps/chosen": -76.08082580566406,
"logps/ref_chosen": -104.01033782958984,
"logps/ref_rejected": -119.02666473388672,
"logps/rejected": -90.15015411376953,
"loss": 0.2865,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.2792952358722687,
"rewards/margins": -0.009469768032431602,
"rewards/rejected": 0.28876498341560364,
"sft_loss": 0.28654617071151733,
"step": 38
},
{
"epoch": 0.5137916838205022,
"grad_norm": 0.32149407267570496,
"learning_rate": 2.7923883049302066e-07,
"logits/chosen": 10.409524917602539,
"logits/rejected": 11.193167686462402,
"logps/chosen": -82.37915802001953,
"logps/ref_chosen": -109.76485443115234,
"logps/ref_rejected": -122.25163269042969,
"logps/rejected": -96.5480728149414,
"loss": 0.2919,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.2738569676876068,
"rewards/margins": 0.016821369528770447,
"rewards/rejected": 0.25703561305999756,
"sft_loss": 0.2919383943080902,
"step": 39
},
{
"epoch": 0.5269658295594895,
"grad_norm": 0.48350030183792114,
"learning_rate": 2.6756904723632324e-07,
"logits/chosen": 10.301675796508789,
"logits/rejected": 11.467926025390625,
"logps/chosen": -79.19075012207031,
"logps/ref_chosen": -107.18782806396484,
"logps/ref_rejected": -124.24542236328125,
"logps/rejected": -96.9850082397461,
"loss": 0.2805,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.2799707055091858,
"rewards/margins": 0.007366571109741926,
"rewards/rejected": 0.27260416746139526,
"sft_loss": 0.2805452048778534,
"step": 40
},
{
"epoch": 0.5401399752984768,
"grad_norm": 0.27216726541519165,
"learning_rate": 2.5586064340081516e-07,
"logits/chosen": 10.603267669677734,
"logits/rejected": 11.226158142089844,
"logps/chosen": -78.87698364257812,
"logps/ref_chosen": -106.42051696777344,
"logps/ref_rejected": -122.25247192382812,
"logps/rejected": -94.32707977294922,
"loss": 0.2862,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.275435209274292,
"rewards/margins": -0.003818750847131014,
"rewards/rejected": 0.2792539596557617,
"sft_loss": 0.28622862696647644,
"step": 41
},
{
"epoch": 0.553314121037464,
"grad_norm": 0.33358198404312134,
"learning_rate": 2.4413935659918487e-07,
"logits/chosen": 9.588984489440918,
"logits/rejected": 10.6452054977417,
"logps/chosen": -74.2744369506836,
"logps/ref_chosen": -103.1148452758789,
"logps/ref_rejected": -116.55464935302734,
"logps/rejected": -88.39811706542969,
"loss": 0.2641,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.28840407729148865,
"rewards/margins": 0.0068388087674975395,
"rewards/rejected": 0.28156527876853943,
"sft_loss": 0.2640990614891052,
"step": 42
},
{
"epoch": 0.5664882667764513,
"grad_norm": 0.47038576006889343,
"learning_rate": 2.3243095276367684e-07,
"logits/chosen": 9.663254737854004,
"logits/rejected": 10.8170747756958,
"logps/chosen": -76.7999267578125,
"logps/ref_chosen": -104.21064758300781,
"logps/ref_rejected": -118.7614974975586,
"logps/rejected": -91.26052856445312,
"loss": 0.2744,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.2741071879863739,
"rewards/margins": -0.0009024296887218952,
"rewards/rejected": 0.2750096321105957,
"sft_loss": 0.2744351625442505,
"step": 43
},
{
"epoch": 0.5796624125154385,
"grad_norm": 0.30214762687683105,
"learning_rate": 2.2076116950697937e-07,
"logits/chosen": 9.781536102294922,
"logits/rejected": 10.607444763183594,
"logps/chosen": -72.30926513671875,
"logps/ref_chosen": -100.59449005126953,
"logps/ref_rejected": -115.95166778564453,
"logps/rejected": -87.6107177734375,
"loss": 0.2461,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.28285229206085205,
"rewards/margins": -0.0005572582595050335,
"rewards/rejected": 0.2834095358848572,
"sft_loss": 0.24606820940971375,
"step": 44
},
{
"epoch": 0.5928365582544257,
"grad_norm": 0.26689764857292175,
"learning_rate": 2.091556595453674e-07,
"logits/chosen": 9.98751449584961,
"logits/rejected": 10.849931716918945,
"logps/chosen": -78.55535888671875,
"logps/ref_chosen": -106.96060943603516,
"logps/ref_rejected": -125.49449157714844,
"logps/rejected": -98.11154174804688,
"loss": 0.2706,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.2840524911880493,
"rewards/margins": 0.010223127901554108,
"rewards/rejected": 0.2738293409347534,
"sft_loss": 0.2706489562988281,
"step": 45
},
{
"epoch": 0.606010703993413,
"grad_norm": 0.24426043033599854,
"learning_rate": 1.9763993430846392e-07,
"logits/chosen": 10.033075332641602,
"logits/rejected": 10.728992462158203,
"logps/chosen": -76.60491180419922,
"logps/ref_chosen": -107.08544158935547,
"logps/ref_rejected": -120.38542175292969,
"logps/rejected": -89.13396453857422,
"loss": 0.2738,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.30480533838272095,
"rewards/margins": -0.00770913390442729,
"rewards/rejected": 0.31251445412635803,
"sft_loss": 0.2737652063369751,
"step": 46
},
{
"epoch": 0.6191848497324002,
"grad_norm": 0.22300882637500763,
"learning_rate": 1.862393078595809e-07,
"logits/chosen": 9.950118064880371,
"logits/rejected": 11.169652938842773,
"logps/chosen": -77.8197021484375,
"logps/ref_chosen": -105.74787902832031,
"logps/ref_rejected": -122.93606567382812,
"logps/rejected": -96.1205062866211,
"loss": 0.2652,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.2792818248271942,
"rewards/margins": 0.011126276105642319,
"rewards/rejected": 0.2681555151939392,
"sft_loss": 0.26522505283355713,
"step": 47
},
{
"epoch": 0.6323589954713874,
"grad_norm": 0.2568327784538269,
"learning_rate": 1.7497884124991485e-07,
"logits/chosen": 10.451184272766113,
"logits/rejected": 11.392836570739746,
"logps/chosen": -76.05364990234375,
"logps/ref_chosen": -105.3005599975586,
"logps/ref_rejected": -123.93569946289062,
"logps/rejected": -93.72474670410156,
"loss": 0.2697,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.29246923327445984,
"rewards/margins": -0.009640296921133995,
"rewards/rejected": 0.3021095395088196,
"sft_loss": 0.26966598629951477,
"step": 48
},
{
"epoch": 0.6455331412103746,
"grad_norm": 0.2787468433380127,
"learning_rate": 1.6388328742891678e-07,
"logits/chosen": 10.48294734954834,
"logits/rejected": 11.362519264221191,
"logps/chosen": -74.37510681152344,
"logps/ref_chosen": -104.30430603027344,
"logps/ref_rejected": -115.85497283935547,
"logps/rejected": -86.97918701171875,
"loss": 0.2744,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.2992919683456421,
"rewards/margins": 0.010534043423831463,
"rewards/rejected": 0.28875789046287537,
"sft_loss": 0.2743627727031708,
"step": 49
},
{
"epoch": 0.6587072869493619,
"grad_norm": 0.3125001788139343,
"learning_rate": 1.5297703683193753e-07,
"logits/chosen": 10.078466415405273,
"logits/rejected": 10.955018997192383,
"logps/chosen": -75.85308837890625,
"logps/ref_chosen": -104.65946960449219,
"logps/ref_rejected": -118.84170532226562,
"logps/rejected": -90.01569366455078,
"loss": 0.2577,
"rewards/accuracies": 0.4765625,
"rewards/chosen": 0.28806373476982117,
"rewards/margins": -0.00019637378863990307,
"rewards/rejected": 0.2882601022720337,
"sft_loss": 0.25767678022384644,
"step": 50
},
{
"epoch": 0.6718814326883491,
"grad_norm": 0.15405406057834625,
"learning_rate": 1.422840637647574e-07,
"logits/chosen": 10.179081916809082,
"logits/rejected": 10.733241081237793,
"logps/chosen": -74.97821807861328,
"logps/ref_chosen": -104.4243392944336,
"logps/ref_rejected": -117.16233825683594,
"logps/rejected": -88.18524932861328,
"loss": 0.2584,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.294461190700531,
"rewards/margins": 0.004690280184149742,
"rewards/rejected": 0.2897709012031555,
"sft_loss": 0.25844162702560425,
"step": 51
},
{
"epoch": 0.6850555784273363,
"grad_norm": 0.16120746731758118,
"learning_rate": 1.3182787370285865e-07,
"logits/chosen": 9.57602596282959,
"logits/rejected": 10.736429214477539,
"logps/chosen": -72.85411071777344,
"logps/ref_chosen": -101.99165344238281,
"logps/ref_rejected": -123.20516204833984,
"logps/rejected": -93.20382690429688,
"loss": 0.2552,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.29137539863586426,
"rewards/margins": -0.008637862280011177,
"rewards/rejected": 0.3000132739543915,
"sft_loss": 0.2551620900630951,
"step": 52
},
{
"epoch": 0.6982297241663236,
"grad_norm": 0.19364570081233978,
"learning_rate": 1.2163145162128946e-07,
"logits/chosen": 10.08875846862793,
"logits/rejected": 10.98922348022461,
"logps/chosen": -78.81430053710938,
"logps/ref_chosen": -108.26175689697266,
"logps/ref_rejected": -118.12374114990234,
"logps/rejected": -89.11669921875,
"loss": 0.2577,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.2944745421409607,
"rewards/margins": 0.004404103849083185,
"rewards/rejected": 0.29007044434547424,
"sft_loss": 0.2576565444469452,
"step": 53
},
{
"epoch": 0.7114038699053108,
"grad_norm": 0.28979793190956116,
"learning_rate": 1.1171721146870014e-07,
"logits/chosen": 10.206223487854004,
"logits/rejected": 11.192782402038574,
"logps/chosen": -76.93553924560547,
"logps/ref_chosen": -108.5864028930664,
"logps/ref_rejected": -130.25155639648438,
"logps/rejected": -96.52012634277344,
"loss": 0.2871,
"rewards/accuracies": 0.3984375,
"rewards/chosen": 0.31650859117507935,
"rewards/margins": -0.020805664360523224,
"rewards/rejected": 0.337314248085022,
"sft_loss": 0.2871078848838806,
"step": 54
},
{
"epoch": 0.724578015644298,
"grad_norm": 0.1910555362701416,
"learning_rate": 1.0210694689661939e-07,
"logits/chosen": 10.210641860961914,
"logits/rejected": 11.032613754272461,
"logps/chosen": -74.83497619628906,
"logps/ref_chosen": -105.69741821289062,
"logps/ref_rejected": -122.07044219970703,
"logps/rejected": -90.28074645996094,
"loss": 0.2573,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.308624267578125,
"rewards/margins": -0.009272638708353043,
"rewards/rejected": 0.3178969621658325,
"sft_loss": 0.25734585523605347,
"step": 55
},
{
"epoch": 0.7377521613832853,
"grad_norm": 0.27673250436782837,
"learning_rate": 9.282178335227883e-08,
"logits/chosen": 9.825605392456055,
"logits/rejected": 11.014497756958008,
"logps/chosen": -76.0024642944336,
"logps/ref_chosen": -106.5007095336914,
"logps/ref_rejected": -123.01736450195312,
"logps/rejected": -92.10081481933594,
"loss": 0.2743,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.3049824833869934,
"rewards/margins": -0.004183035343885422,
"rewards/rejected": 0.30916550755500793,
"sft_loss": 0.27425283193588257,
"step": 56
},
{
"epoch": 0.7509263071222725,
"grad_norm": 0.18975986540317535,
"learning_rate": 8.388213164029459e-08,
"logits/chosen": 10.642633438110352,
"logits/rejected": 11.387500762939453,
"logps/chosen": -79.39006805419922,
"logps/ref_chosen": -109.18460083007812,
"logps/ref_rejected": -124.3697280883789,
"logps/rejected": -92.89251708984375,
"loss": 0.2729,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.2979453504085541,
"rewards/margins": -0.016826828941702843,
"rewards/rejected": 0.3147721290588379,
"sft_loss": 0.2728780210018158,
"step": 57
},
{
"epoch": 0.7641004528612598,
"grad_norm": 0.41853123903274536,
"learning_rate": 7.530764305528958e-08,
"logits/chosen": 9.80534839630127,
"logits/rejected": 10.46203327178955,
"logps/chosen": -73.13560485839844,
"logps/ref_chosen": -104.43944549560547,
"logps/ref_rejected": -118.44985961914062,
"logps/rejected": -85.20870208740234,
"loss": 0.2597,
"rewards/accuracies": 0.4765625,
"rewards/chosen": 0.3130384385585785,
"rewards/margins": -0.01937328279018402,
"rewards/rejected": 0.33241167664527893,
"sft_loss": 0.2597176730632782,
"step": 58
},
{
"epoch": 0.7772745986002471,
"grad_norm": 0.2700183391571045,
"learning_rate": 6.711716618408281e-08,
"logits/chosen": 10.213776588439941,
"logits/rejected": 11.164811134338379,
"logps/chosen": -73.02882385253906,
"logps/ref_chosen": -103.32658386230469,
"logps/ref_rejected": -121.63726806640625,
"logps/rejected": -90.34732818603516,
"loss": 0.2687,
"rewards/accuracies": 0.4296875,
"rewards/chosen": 0.3029775619506836,
"rewards/margins": -0.009921873919665813,
"rewards/rejected": 0.3128994107246399,
"sft_loss": 0.26870667934417725,
"step": 59
},
{
"epoch": 0.7904487443392343,
"grad_norm": 0.14984337985515594,
"learning_rate": 5.932870547240454e-08,
"logits/chosen": 9.960476875305176,
"logits/rejected": 11.033708572387695,
"logps/chosen": -72.57840728759766,
"logps/ref_chosen": -102.98921966552734,
"logps/ref_rejected": -124.47185516357422,
"logps/rejected": -92.38633728027344,
"loss": 0.2563,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.3041080832481384,
"rewards/margins": -0.01674714870750904,
"rewards/rejected": 0.32085520029067993,
"sft_loss": 0.2563033699989319,
"step": 60
},
{
"epoch": 0.8036228900782215,
"grad_norm": 0.31175732612609863,
"learning_rate": 5.1959381647217665e-08,
"logits/chosen": 9.977319717407227,
"logits/rejected": 10.946649551391602,
"logps/chosen": -73.95118713378906,
"logps/ref_chosen": -106.28311157226562,
"logps/ref_rejected": -121.47750854492188,
"logps/rejected": -88.92882537841797,
"loss": 0.2666,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.3233192563056946,
"rewards/margins": -0.0021677182521671057,
"rewards/rejected": 0.3254869878292084,
"sft_loss": 0.26657047867774963,
"step": 61
},
{
"epoch": 0.8167970358172087,
"grad_norm": 0.3104262351989746,
"learning_rate": 4.502539408164385e-08,
"logits/chosen": 10.092552185058594,
"logits/rejected": 10.915599822998047,
"logps/chosen": -77.04930114746094,
"logps/ref_chosen": -109.67979431152344,
"logps/ref_rejected": -120.36711120605469,
"logps/rejected": -87.77507781982422,
"loss": 0.2549,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.32630497217178345,
"rewards/margins": 0.00038449978455901146,
"rewards/rejected": 0.3259204030036926,
"sft_loss": 0.2549050450325012,
"step": 62
},
{
"epoch": 0.829971181556196,
"grad_norm": 1.811981201171875,
"learning_rate": 3.854198518522564e-08,
"logits/chosen": 10.26008415222168,
"logits/rejected": 11.338315963745117,
"logps/chosen": -75.18155670166016,
"logps/ref_chosen": -106.88896942138672,
"logps/ref_rejected": -122.57796478271484,
"logps/rejected": -91.81503295898438,
"loss": 0.2708,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.3170740604400635,
"rewards/margins": 0.009444723837077618,
"rewards/rejected": 0.3076293170452118,
"sft_loss": 0.270816445350647,
"step": 63
},
{
"epoch": 0.8431453272951832,
"grad_norm": 0.19537799060344696,
"learning_rate": 3.2523406897802444e-08,
"logits/chosen": 10.241344451904297,
"logits/rejected": 10.94163990020752,
"logps/chosen": -78.00033569335938,
"logps/ref_chosen": -109.29510498046875,
"logps/ref_rejected": -121.51821899414062,
"logps/rejected": -88.49795532226562,
"loss": 0.2733,
"rewards/accuracies": 0.40625,
"rewards/chosen": 0.3129475712776184,
"rewards/margins": -0.017255008220672607,
"rewards/rejected": 0.33020254969596863,
"sft_loss": 0.2733091115951538,
"step": 64
},
{
"epoch": 0.8563194730341704,
"grad_norm": 0.3109639286994934,
"learning_rate": 2.6982889360653376e-08,
"logits/chosen": 9.754749298095703,
"logits/rejected": 11.062653541564941,
"logps/chosen": -76.51459503173828,
"logps/ref_chosen": -110.15232849121094,
"logps/ref_rejected": -127.54279327392578,
"logps/rejected": -96.31084442138672,
"loss": 0.271,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.3363773226737976,
"rewards/margins": 0.024057911708950996,
"rewards/rejected": 0.31231939792633057,
"sft_loss": 0.27095291018486023,
"step": 65
},
{
"epoch": 0.8694936187731577,
"grad_norm": 0.22148017585277557,
"learning_rate": 2.1932611833775843e-08,
"logits/chosen": 9.787257194519043,
"logits/rejected": 10.842453956604004,
"logps/chosen": -72.53459167480469,
"logps/ref_chosen": -104.0207748413086,
"logps/ref_rejected": -126.93211364746094,
"logps/rejected": -92.82463073730469,
"loss": 0.2518,
"rewards/accuracies": 0.4453125,
"rewards/chosen": 0.31486180424690247,
"rewards/margins": -0.026213109493255615,
"rewards/rejected": 0.3410749137401581,
"sft_loss": 0.25176769495010376,
"step": 66
},
{
"epoch": 0.8826677645121449,
"grad_norm": 0.23339781165122986,
"learning_rate": 1.738367592322837e-08,
"logits/chosen": 10.151799201965332,
"logits/rejected": 11.080910682678223,
"logps/chosen": -72.54059600830078,
"logps/ref_chosen": -104.55751037597656,
"logps/ref_rejected": -119.71514892578125,
"logps/rejected": -87.53738403320312,
"loss": 0.2572,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.32016903162002563,
"rewards/margins": -0.001608673483133316,
"rewards/rejected": 0.3217777609825134,
"sft_loss": 0.25715309381484985,
"step": 67
},
{
"epoch": 0.8958419102511321,
"grad_norm": 0.3316951394081116,
"learning_rate": 1.3346081177391472e-08,
"logits/chosen": 10.458593368530273,
"logits/rejected": 10.821167945861816,
"logps/chosen": -76.2717056274414,
"logps/ref_chosen": -107.26033020019531,
"logps/ref_rejected": -115.8590087890625,
"logps/rejected": -83.77940368652344,
"loss": 0.2695,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.3098861873149872,
"rewards/margins": -0.010909780859947205,
"rewards/rejected": 0.3207959830760956,
"sft_loss": 0.26949527859687805,
"step": 68
},
{
"epoch": 0.9090160559901194,
"grad_norm": 0.5555047392845154,
"learning_rate": 9.828703105789981e-09,
"logits/chosen": 10.413060188293457,
"logits/rejected": 11.321882247924805,
"logps/chosen": -73.33889770507812,
"logps/ref_chosen": -106.8610610961914,
"logps/ref_rejected": -122.44428253173828,
"logps/rejected": -89.23860931396484,
"loss": 0.2688,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.3352215886116028,
"rewards/margins": 0.003164912573993206,
"rewards/rejected": 0.3320567011833191,
"sft_loss": 0.26880374550819397,
"step": 69
},
{
"epoch": 0.9221902017291066,
"grad_norm": 0.31241339445114136,
"learning_rate": 6.839273668796747e-09,
"logits/chosen": 10.117363929748535,
"logits/rejected": 10.65896224975586,
"logps/chosen": -76.24555969238281,
"logps/ref_chosen": -106.58778381347656,
"logps/ref_rejected": -124.77790832519531,
"logps/rejected": -93.02002716064453,
"loss": 0.2603,
"rewards/accuracies": 0.4453125,
"rewards/chosen": 0.3034222424030304,
"rewards/margins": -0.014156593009829521,
"rewards/rejected": 0.3175787925720215,
"sft_loss": 0.2603415846824646,
"step": 70
},
{
"epoch": 0.9353643474680938,
"grad_norm": 0.3154677748680115,
"learning_rate": 4.384364281105973e-09,
"logits/chosen": 10.104023933410645,
"logits/rejected": 11.347376823425293,
"logps/chosen": -70.88841247558594,
"logps/ref_chosen": -104.39148712158203,
"logps/ref_rejected": -120.59461212158203,
"logps/rejected": -87.24669647216797,
"loss": 0.2746,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.33503076434135437,
"rewards/margins": 0.001551617868244648,
"rewards/rejected": 0.3334791362285614,
"sft_loss": 0.2746419608592987,
"step": 71
},
{
"epoch": 0.9485384932070811,
"grad_norm": 0.27243080735206604,
"learning_rate": 2.469371366337264e-09,
"logits/chosen": 10.178709030151367,
"logits/rejected": 11.207265853881836,
"logps/chosen": -77.24491882324219,
"logps/ref_chosen": -108.53898620605469,
"logps/ref_rejected": -123.26167297363281,
"logps/rejected": -91.20188903808594,
"loss": 0.2649,
"rewards/accuracies": 0.4765625,
"rewards/chosen": 0.31294065713882446,
"rewards/margins": -0.007657179608941078,
"rewards/rejected": 0.3205978274345398,
"sft_loss": 0.26487135887145996,
"step": 72
},
{
"epoch": 0.9617126389460683,
"grad_norm": 0.29381245374679565,
"learning_rate": 1.0985044945254762e-09,
"logits/chosen": 10.011255264282227,
"logits/rejected": 10.857406616210938,
"logps/chosen": -79.84062957763672,
"logps/ref_chosen": -110.50595092773438,
"logps/ref_rejected": -124.92510223388672,
"logps/rejected": -94.2270736694336,
"loss": 0.2583,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.3066532015800476,
"rewards/margins": -0.0003270732704550028,
"rewards/rejected": 0.30698028206825256,
"sft_loss": 0.258339524269104,
"step": 73
},
{
"epoch": 0.9748867846850556,
"grad_norm": 0.4308127462863922,
"learning_rate": 2.7477712857215675e-10,
"logits/chosen": 10.271313667297363,
"logits/rejected": 11.117928504943848,
"logps/chosen": -75.68081665039062,
"logps/ref_chosen": -107.1200942993164,
"logps/ref_rejected": -120.22421264648438,
"logps/rejected": -89.6799545288086,
"loss": 0.2564,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.3143928647041321,
"rewards/margins": 0.008950251154601574,
"rewards/rejected": 0.3054426312446594,
"sft_loss": 0.2563988268375397,
"step": 74
},
{
"epoch": 0.9880609304240429,
"grad_norm": 0.1982087343931198,
"learning_rate": 0.0,
"logits/chosen": 10.055291175842285,
"logits/rejected": 10.858556747436523,
"logps/chosen": -75.02184295654297,
"logps/ref_chosen": -107.40764617919922,
"logps/ref_rejected": -120.6578369140625,
"logps/rejected": -89.03117370605469,
"loss": 0.2485,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.32385802268981934,
"rewards/margins": 0.0075914738699793816,
"rewards/rejected": 0.3162665367126465,
"sft_loss": 0.24852335453033447,
"step": 75
}
],
"logging_steps": 1,
"max_steps": 75,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 12,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}