{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9880609304240429, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 1.0125823020935059, "learning_rate": 6.25e-08, "logits/chosen": 9.990612030029297, "logits/rejected": 10.698101997375488, "logps/chosen": -102.88545989990234, "logps/ref_chosen": -102.88545989990234, "logps/ref_rejected": -121.84871673583984, "logps/rejected": -121.84871673583984, "loss": 0.4327, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "sft_loss": 0.36753880977630615, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.3579196035861969, "learning_rate": 1.25e-07, "logits/chosen": 10.211905479431152, "logits/rejected": 11.06594467163086, "logps/chosen": -107.70349884033203, "logps/ref_chosen": -107.70349884033203, "logps/ref_rejected": -121.89966583251953, "logps/rejected": -121.89966583251953, "loss": 0.4667, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "sft_loss": 0.41013145446777344, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.49040451645851135, "learning_rate": 1.875e-07, "logits/chosen": 10.035531044006348, "logits/rejected": 11.027185440063477, "logps/chosen": -108.23310852050781, "logps/ref_chosen": -107.98188781738281, "logps/ref_rejected": -124.51527404785156, "logps/rejected": -124.64785766601562, "loss": 0.4683, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0025122263468801975, "rewards/margins": -0.0011863748077303171, "rewards/rejected": -0.0013258515391498804, "sft_loss": 0.41194257140159607, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 0.8740162253379822, "learning_rate": 2.5e-07, "logits/chosen": 9.860024452209473, "logits/rejected": 10.876106262207031, "logps/chosen": -109.94369506835938, "logps/ref_chosen": -109.20836639404297, "logps/ref_rejected": -119.23908996582031, "logps/rejected": -119.73454284667969, "loss": 0.4633, "rewards/accuracies": 0.3828125, "rewards/chosen": -0.007353362161666155, "rewards/margins": -0.002398767275735736, "rewards/rejected": -0.004954595118761063, "sft_loss": 0.40552011132240295, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 1.1980141401290894, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 10.19467830657959, "logits/rejected": 10.95050048828125, "logps/chosen": -104.02793884277344, "logps/ref_chosen": -103.87680053710938, "logps/ref_rejected": -118.41618347167969, "logps/rejected": -118.46170806884766, "loss": 0.4351, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.001511452835984528, "rewards/margins": -0.001056289067491889, "rewards/rejected": -0.0004551640013232827, "sft_loss": 0.3704559803009033, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.928102433681488, "learning_rate": 3.75e-07, "logits/chosen": 10.701957702636719, "logits/rejected": 11.477033615112305, "logps/chosen": -107.61714935302734, "logps/ref_chosen": -107.58968353271484, "logps/ref_rejected": -122.07303619384766, "logps/rejected": -122.0443115234375, "loss": 0.4515, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.0002746534300968051, "rewards/margins": -0.0005618570139631629, "rewards/rejected": 0.0002872035256586969, "sft_loss": 0.3909577429294586, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 1.0250380039215088, "learning_rate": 4.375e-07, "logits/chosen": 10.025421142578125, "logits/rejected": 10.72871208190918, "logps/chosen": -107.13175201416016, "logps/ref_chosen": -107.42727661132812, "logps/ref_rejected": -116.87063598632812, "logps/rejected": -116.28421020507812, "loss": 0.4392, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.002955180360004306, "rewards/margins": -0.0029091311153024435, "rewards/rejected": 0.005864311475306749, "sft_loss": 0.3753029406070709, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.5661666393280029, "learning_rate": 5e-07, "logits/chosen": 10.203546524047852, "logits/rejected": 11.103278160095215, "logps/chosen": -104.93194580078125, "logps/ref_chosen": -105.60282135009766, "logps/ref_rejected": -119.53916931152344, "logps/rejected": -118.93331909179688, "loss": 0.4416, "rewards/accuracies": 0.515625, "rewards/chosen": 0.006708861328661442, "rewards/margins": 0.0006504050688818097, "rewards/rejected": 0.006058456376194954, "sft_loss": 0.3787955939769745, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.820360541343689, "learning_rate": 4.997252228714278e-07, "logits/chosen": 10.184520721435547, "logits/rejected": 11.154094696044922, "logps/chosen": -104.26238250732422, "logps/ref_chosen": -105.46086120605469, "logps/ref_rejected": -119.00373840332031, "logps/rejected": -117.88744354248047, "loss": 0.4437, "rewards/accuracies": 0.53125, "rewards/chosen": 0.011985024437308311, "rewards/margins": 0.0008220230811275542, "rewards/rejected": 0.011163001880049706, "sft_loss": 0.38146448135375977, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 0.4781506061553955, "learning_rate": 4.989014955054745e-07, "logits/chosen": 10.042634963989258, "logits/rejected": 10.866905212402344, "logps/chosen": -101.11405944824219, "logps/ref_chosen": -104.21009826660156, "logps/ref_rejected": -118.9209213256836, "logps/rejected": -115.99314880371094, "loss": 0.4088, "rewards/accuracies": 0.46875, "rewards/chosen": 0.030960241332650185, "rewards/margins": 0.0016824830090627074, "rewards/rejected": 0.029277760535478592, "sft_loss": 0.3378788232803345, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 0.8178320527076721, "learning_rate": 4.975306286336627e-07, "logits/chosen": 9.987105369567871, "logits/rejected": 11.181533813476562, "logps/chosen": -101.77717590332031, "logps/ref_chosen": -105.94319152832031, "logps/ref_rejected": -122.76007843017578, "logps/rejected": -119.00365447998047, "loss": 0.4478, "rewards/accuracies": 0.578125, "rewards/chosen": 0.04166024178266525, "rewards/margins": 0.0040960111655294895, "rewards/rejected": 0.0375642292201519, "sft_loss": 0.3869646191596985, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.7931886315345764, "learning_rate": 4.956156357188939e-07, "logits/chosen": 9.913724899291992, "logits/rejected": 10.605714797973633, "logps/chosen": -104.08253479003906, "logps/ref_chosen": -109.08442687988281, "logps/ref_rejected": -121.41947174072266, "logps/rejected": -116.61964416503906, "loss": 0.4233, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.05001899227499962, "rewards/margins": 0.002020882908254862, "rewards/rejected": 0.04799811542034149, "sft_loss": 0.3560585379600525, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 0.21988148987293243, "learning_rate": 4.931607263312032e-07, "logits/chosen": 9.934953689575195, "logits/rejected": 11.010736465454102, "logps/chosen": -99.40065002441406, "logps/ref_chosen": -104.62150573730469, "logps/ref_rejected": -119.55384063720703, "logps/rejected": -114.51910400390625, "loss": 0.4347, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.05220862105488777, "rewards/margins": 0.0018612804124131799, "rewards/rejected": 0.05034734308719635, "sft_loss": 0.3703528344631195, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 0.25260093808174133, "learning_rate": 4.9017129689421e-07, "logits/chosen": 10.499589920043945, "logits/rejected": 11.620351791381836, "logps/chosen": -98.26732635498047, "logps/ref_chosen": -106.179443359375, "logps/ref_rejected": -120.73036193847656, "logps/rejected": -112.42403411865234, "loss": 0.415, "rewards/accuracies": 0.53125, "rewards/chosen": 0.07912100851535797, "rewards/margins": -0.00394221069291234, "rewards/rejected": 0.08306321501731873, "sft_loss": 0.3448493182659149, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.2699073553085327, "learning_rate": 4.866539188226085e-07, "logits/chosen": 9.888190269470215, "logits/rejected": 10.820487976074219, "logps/chosen": -96.35069274902344, "logps/ref_chosen": -105.70547485351562, "logps/ref_rejected": -118.89997863769531, "logps/rejected": -109.4480209350586, "loss": 0.4071, "rewards/accuracies": 0.515625, "rewards/chosen": 0.09354789555072784, "rewards/margins": -0.0009717608336359262, "rewards/rejected": 0.09451965987682343, "sft_loss": 0.3353506624698639, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 0.3605174422264099, "learning_rate": 4.826163240767716e-07, "logits/chosen": 10.686586380004883, "logits/rejected": 11.310283660888672, "logps/chosen": -97.58367919921875, "logps/ref_chosen": -108.86376953125, "logps/ref_rejected": -122.1635513305664, "logps/rejected": -111.56184387207031, "loss": 0.4203, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.11280079185962677, "rewards/margins": 0.006783789023756981, "rewards/rejected": 0.10601700842380524, "sft_loss": 0.35289227962493896, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.41622957587242126, "learning_rate": 4.780673881662242e-07, "logits/chosen": 10.20689582824707, "logits/rejected": 10.86001205444336, "logps/chosen": -91.06839752197266, "logps/ref_chosen": -102.93986511230469, "logps/ref_rejected": -119.43718719482422, "logps/rejected": -107.62686920166016, "loss": 0.4289, "rewards/accuracies": 0.4296875, "rewards/chosen": 0.11871471256017685, "rewards/margins": 0.0006115150172263384, "rewards/rejected": 0.11810319125652313, "sft_loss": 0.3628237247467041, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 0.36325603723526, "learning_rate": 4.730171106393466e-07, "logits/chosen": 10.428996086120605, "logits/rejected": 11.2207612991333, "logps/chosen": -91.17449951171875, "logps/ref_chosen": -103.81341552734375, "logps/ref_rejected": -117.45123291015625, "logps/rejected": -105.54415893554688, "loss": 0.4109, "rewards/accuracies": 0.5, "rewards/chosen": 0.1263890266418457, "rewards/margins": 0.007318255491554737, "rewards/rejected": 0.11907076835632324, "sft_loss": 0.3411993384361267, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.3004375994205475, "learning_rate": 4.6747659310219757e-07, "logits/chosen": 10.341066360473633, "logits/rejected": 11.011281967163086, "logps/chosen": -95.462158203125, "logps/ref_chosen": -107.85797119140625, "logps/ref_rejected": -121.88042449951172, "logps/rejected": -109.14384460449219, "loss": 0.3995, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.12395807355642319, "rewards/margins": -0.0034077779855579138, "rewards/rejected": 0.1273658722639084, "sft_loss": 0.3255612552165985, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 0.3459661304950714, "learning_rate": 4.6145801481477433e-07, "logits/chosen": 10.744415283203125, "logits/rejected": 11.557382583618164, "logps/chosen": -90.79095458984375, "logps/ref_chosen": -103.42721557617188, "logps/ref_rejected": -116.7796630859375, "logps/rejected": -104.72395324707031, "loss": 0.3933, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.1263626366853714, "rewards/margins": 0.005805579479783773, "rewards/rejected": 0.12055706232786179, "sft_loss": 0.31898409128189087, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 0.23334962129592896, "learning_rate": 4.549746059183561e-07, "logits/chosen": 9.809565544128418, "logits/rejected": 10.91222095489502, "logps/chosen": -93.41632843017578, "logps/ref_chosen": -106.60163879394531, "logps/ref_rejected": -124.56562805175781, "logps/rejected": -110.83106994628906, "loss": 0.3907, "rewards/accuracies": 0.484375, "rewards/chosen": 0.13185299932956696, "rewards/margins": -0.0054924385622143745, "rewards/rejected": 0.13734543323516846, "sft_loss": 0.3142167329788208, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 0.5167871713638306, "learning_rate": 4.480406183527823e-07, "logits/chosen": 10.247149467468262, "logits/rejected": 11.123760223388672, "logps/chosen": -89.27349090576172, "logps/ref_chosen": -103.77696228027344, "logps/ref_rejected": -118.73616027832031, "logps/rejected": -105.45307922363281, "loss": 0.4072, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.14503462612628937, "rewards/margins": 0.012203728780150414, "rewards/rejected": 0.1328309029340744, "sft_loss": 0.3371087312698364, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.18110381066799164, "learning_rate": 4.4067129452759546e-07, "logits/chosen": 10.112556457519531, "logits/rejected": 11.132116317749023, "logps/chosen": -89.66647338867188, "logps/ref_chosen": -104.72956085205078, "logps/ref_rejected": -121.35556030273438, "logps/rejected": -106.93888092041016, "loss": 0.4041, "rewards/accuracies": 0.546875, "rewards/chosen": 0.15063081681728363, "rewards/margins": 0.006464053876698017, "rewards/rejected": 0.1441667526960373, "sft_loss": 0.3324388861656189, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 0.2767745554447174, "learning_rate": 4.3288283381591725e-07, "logits/chosen": 10.138566017150879, "logits/rejected": 10.986135482788086, "logps/chosen": -90.00975799560547, "logps/ref_chosen": -105.88758087158203, "logps/ref_rejected": -125.69054412841797, "logps/rejected": -109.54483795166016, "loss": 0.3939, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.15877819061279297, "rewards/margins": -0.002678929828107357, "rewards/rejected": 0.1614571213722229, "sft_loss": 0.3185826539993286, "step": 24 }, { "epoch": 0.32935364347468093, "grad_norm": 0.2805613875389099, "learning_rate": 4.246923569447104e-07, "logits/chosen": 10.334699630737305, "logits/rejected": 11.066987037658691, "logps/chosen": -91.0733642578125, "logps/ref_chosen": -110.0761489868164, "logps/ref_rejected": -129.10540771484375, "logps/rejected": -110.36031341552734, "loss": 0.3872, "rewards/accuracies": 0.53125, "rewards/chosen": 0.1900278627872467, "rewards/margins": 0.0025771353393793106, "rewards/rejected": 0.18745073676109314, "sft_loss": 0.3108268082141876, "step": 25 }, { "epoch": 0.34252778921366817, "grad_norm": 0.26198408007621765, "learning_rate": 4.161178683597054e-07, "logits/chosen": 10.408025741577148, "logits/rejected": 11.505026817321777, "logps/chosen": -83.76263427734375, "logps/ref_chosen": -103.74571990966797, "logps/ref_rejected": -120.73832702636719, "logps/rejected": -101.1318588256836, "loss": 0.3785, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.19983090460300446, "rewards/margins": 0.003766256384551525, "rewards/rejected": 0.19606465101242065, "sft_loss": 0.30009642243385315, "step": 26 }, { "epoch": 0.3557019349526554, "grad_norm": 0.5107954144477844, "learning_rate": 4.0717821664772124e-07, "logits/chosen": 10.090536117553711, "logits/rejected": 11.338558197021484, "logps/chosen": -84.10513305664062, "logps/ref_chosen": -105.47428131103516, "logps/ref_rejected": -120.5193099975586, "logps/rejected": -99.99079895019531, "loss": 0.399, "rewards/accuracies": 0.546875, "rewards/chosen": 0.21369142830371857, "rewards/margins": 0.008406372740864754, "rewards/rejected": 0.20528505742549896, "sft_loss": 0.32625776529312134, "step": 27 }, { "epoch": 0.3688760806916426, "grad_norm": 0.14193640649318695, "learning_rate": 3.978930531033806e-07, "logits/chosen": 9.690971374511719, "logits/rejected": 10.854898452758789, "logps/chosen": -82.89115905761719, "logps/ref_chosen": -103.72540283203125, "logps/ref_rejected": -119.79557800292969, "logps/rejected": -98.77869415283203, "loss": 0.3661, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.20834232866764069, "rewards/margins": -0.0018264743266627192, "rewards/rejected": 0.2101687788963318, "sft_loss": 0.2838048040866852, "step": 28 }, { "epoch": 0.3820502264306299, "grad_norm": 0.22989627718925476, "learning_rate": 3.882827885312998e-07, "logits/chosen": 10.156278610229492, "logits/rejected": 11.227837562561035, "logps/chosen": -87.06941223144531, "logps/ref_chosen": -108.65434265136719, "logps/ref_rejected": -121.46784973144531, "logps/rejected": -100.8239974975586, "loss": 0.3674, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.2158493846654892, "rewards/margins": 0.00941087119281292, "rewards/rejected": 0.20643851161003113, "sft_loss": 0.28684642910957336, "step": 29 }, { "epoch": 0.39522437216961714, "grad_norm": 0.1578030288219452, "learning_rate": 3.7836854837871044e-07, "logits/chosen": 10.308172225952148, "logits/rejected": 11.710267066955566, "logps/chosen": -80.15604400634766, "logps/ref_chosen": -103.62174224853516, "logps/ref_rejected": -126.73807525634766, "logps/rejected": -104.45430755615234, "loss": 0.381, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.23465700447559357, "rewards/margins": 0.011819422245025635, "rewards/rejected": 0.22283759713172913, "sft_loss": 0.30422142148017883, "step": 30 }, { "epoch": 0.4083985179086044, "grad_norm": 0.19974292814731598, "learning_rate": 3.681721262971413e-07, "logits/chosen": 9.940515518188477, "logits/rejected": 10.964177131652832, "logps/chosen": -82.63470458984375, "logps/ref_chosen": -106.10479736328125, "logps/ref_rejected": -120.6382827758789, "logps/rejected": -98.44656372070312, "loss": 0.382, "rewards/accuracies": 0.5, "rewards/chosen": 0.2347009927034378, "rewards/margins": 0.012783760204911232, "rewards/rejected": 0.22191724181175232, "sft_loss": 0.30557647347450256, "step": 31 }, { "epoch": 0.4215726636475916, "grad_norm": 0.1747150868177414, "learning_rate": 3.577159362352426e-07, "logits/chosen": 10.107948303222656, "logits/rejected": 11.489829063415527, "logps/chosen": -83.92710876464844, "logps/ref_chosen": -105.99569702148438, "logps/ref_rejected": -128.34303283691406, "logps/rejected": -106.02722930908203, "loss": 0.3718, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.22068597376346588, "rewards/margins": -0.0024720439687371254, "rewards/rejected": 0.22315803170204163, "sft_loss": 0.2908666133880615, "step": 32 }, { "epoch": 0.43474680938657884, "grad_norm": 0.2229217141866684, "learning_rate": 3.470229631680624e-07, "logits/chosen": 10.121207237243652, "logits/rejected": 10.942390441894531, "logps/chosen": -83.12457275390625, "logps/ref_chosen": -105.72196197509766, "logps/ref_rejected": -121.59507751464844, "logps/rejected": -98.94662475585938, "loss": 0.3581, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.22597381472587585, "rewards/margins": -0.0005105836316943169, "rewards/rejected": 0.22648443281650543, "sft_loss": 0.27406632900238037, "step": 33 }, { "epoch": 0.44792095512556607, "grad_norm": 0.2622174620628357, "learning_rate": 3.361167125710832e-07, "logits/chosen": 10.276717185974121, "logits/rejected": 11.122960090637207, "logps/chosen": -87.3719253540039, "logps/ref_chosen": -111.4834976196289, "logps/ref_rejected": -130.48089599609375, "logps/rejected": -105.93075561523438, "loss": 0.3875, "rewards/accuracies": 0.46875, "rewards/chosen": 0.24111570417881012, "rewards/margins": -0.004385640844702721, "rewards/rejected": 0.24550136923789978, "sft_loss": 0.3102506101131439, "step": 34 }, { "epoch": 0.4610951008645533, "grad_norm": 0.2475651353597641, "learning_rate": 3.2502115875008516e-07, "logits/chosen": 10.531122207641602, "logits/rejected": 11.501781463623047, "logps/chosen": -84.39804077148438, "logps/ref_chosen": -108.9183349609375, "logps/ref_rejected": -121.32493591308594, "logps/rejected": -97.67329406738281, "loss": 0.363, "rewards/accuracies": 0.546875, "rewards/chosen": 0.24520304799079895, "rewards/margins": 0.008686644956469536, "rewards/rejected": 0.23651635646820068, "sft_loss": 0.28123030066490173, "step": 35 }, { "epoch": 0.47426924660354053, "grad_norm": 0.37637782096862793, "learning_rate": 3.137606921404191e-07, "logits/chosen": 10.115397453308105, "logits/rejected": 10.798471450805664, "logps/chosen": -82.94326782226562, "logps/ref_chosen": -107.1411361694336, "logps/ref_rejected": -118.66165161132812, "logps/rejected": -94.65679931640625, "loss": 0.374, "rewards/accuracies": 0.515625, "rewards/chosen": 0.24197861552238464, "rewards/margins": 0.0019301516003906727, "rewards/rejected": 0.24004849791526794, "sft_loss": 0.2941214442253113, "step": 36 }, { "epoch": 0.4874433923425278, "grad_norm": 0.2612260580062866, "learning_rate": 3.0236006569153616e-07, "logits/chosen": 10.420869827270508, "logits/rejected": 11.245797157287598, "logps/chosen": -82.58905029296875, "logps/ref_chosen": -106.6348876953125, "logps/ref_rejected": -121.37834167480469, "logps/rejected": -96.91046142578125, "loss": 0.3743, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.24045847356319427, "rewards/margins": -0.004220380447804928, "rewards/rejected": 0.24467885494232178, "sft_loss": 0.29366734623908997, "step": 37 }, { "epoch": 0.500617538081515, "grad_norm": 0.3258395791053772, "learning_rate": 2.9084434045463254e-07, "logits/chosen": 9.917516708374023, "logits/rejected": 11.002355575561523, "logps/chosen": -77.93925476074219, "logps/ref_chosen": -104.01033782958984, "logps/ref_rejected": -119.02666473388672, "logps/rejected": -92.11087799072266, "loss": 0.3751, "rewards/accuracies": 0.46875, "rewards/chosen": 0.26071101427078247, "rewards/margins": -0.00844690389931202, "rewards/rejected": 0.26915794610977173, "sft_loss": 0.2940705418586731, "step": 38 }, { "epoch": 0.5137916838205022, "grad_norm": 0.5550091862678528, "learning_rate": 2.7923883049302066e-07, "logits/chosen": 10.355965614318848, "logits/rejected": 11.166629791259766, "logps/chosen": -84.12164306640625, "logps/ref_chosen": -109.76485443115234, "logps/ref_rejected": -122.25163269042969, "logps/rejected": -98.2071762084961, "loss": 0.3762, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.25643208622932434, "rewards/margins": 0.015987541526556015, "rewards/rejected": 0.24044457077980042, "sft_loss": 0.29857873916625977, "step": 39 }, { "epoch": 0.5269658295594895, "grad_norm": 0.45951950550079346, "learning_rate": 2.6756904723632324e-07, "logits/chosen": 10.308568000793457, "logits/rejected": 11.47468376159668, "logps/chosen": -81.06179809570312, "logps/ref_chosen": -107.18782806396484, "logps/ref_rejected": -124.24542236328125, "logps/rejected": -99.07140350341797, "loss": 0.3682, "rewards/accuracies": 0.515625, "rewards/chosen": 0.2612603008747101, "rewards/margins": 0.00952002964913845, "rewards/rejected": 0.2517402768135071, "sft_loss": 0.2877885401248932, "step": 40 }, { "epoch": 0.5401399752984768, "grad_norm": 0.24050092697143555, "learning_rate": 2.5586064340081516e-07, "logits/chosen": 10.630128860473633, "logits/rejected": 11.253531455993652, "logps/chosen": -80.80494689941406, "logps/ref_chosen": -106.42051696777344, "logps/ref_rejected": -122.25247192382812, "logps/rejected": -96.38570404052734, "loss": 0.3738, "rewards/accuracies": 0.546875, "rewards/chosen": 0.25615566968917847, "rewards/margins": -0.002511979779228568, "rewards/rejected": 0.25866764783859253, "sft_loss": 0.2933884859085083, "step": 41 }, { "epoch": 0.553314121037464, "grad_norm": 0.29919344186782837, "learning_rate": 2.4413935659918487e-07, "logits/chosen": 9.607657432556152, "logits/rejected": 10.671792984008789, "logps/chosen": -76.3399429321289, "logps/ref_chosen": -103.1148452758789, "logps/ref_rejected": -116.55464935302734, "logps/rejected": -90.74406433105469, "loss": 0.3554, "rewards/accuracies": 0.5625, "rewards/chosen": 0.26774901151657104, "rewards/margins": 0.00964320357888937, "rewards/rejected": 0.2581058144569397, "sft_loss": 0.27179476618766785, "step": 42 }, { "epoch": 0.5664882667764513, "grad_norm": 0.36268603801727295, "learning_rate": 2.3243095276367684e-07, "logits/chosen": 9.684992790222168, "logits/rejected": 10.834800720214844, "logps/chosen": -78.83952331542969, "logps/ref_chosen": -104.21064758300781, "logps/ref_rejected": -118.7614974975586, "logps/rejected": -93.35822296142578, "loss": 0.3646, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.2537112534046173, "rewards/margins": -0.000321386381983757, "rewards/rejected": 0.2540326416492462, "sft_loss": 0.2820621728897095, "step": 43 }, { "epoch": 0.5796624125154385, "grad_norm": 0.24246186017990112, "learning_rate": 2.2076116950697937e-07, "logits/chosen": 9.82433032989502, "logits/rejected": 10.648906707763672, "logps/chosen": -74.00592041015625, "logps/ref_chosen": -100.59449005126953, "logps/ref_rejected": -115.95166778564453, "logps/rejected": -89.70950317382812, "loss": 0.3403, "rewards/accuracies": 0.53125, "rewards/chosen": 0.2658856511116028, "rewards/margins": 0.003463895060122013, "rewards/rejected": 0.26242175698280334, "sft_loss": 0.25220543146133423, "step": 44 }, { "epoch": 0.5928365582544257, "grad_norm": 0.22262080013751984, "learning_rate": 2.091556595453674e-07, "logits/chosen": 10.002988815307617, "logits/rejected": 10.865106582641602, "logps/chosen": -80.3853530883789, "logps/ref_chosen": -106.96060943603516, "logps/ref_rejected": -125.49449157714844, "logps/rejected": -100.078369140625, "loss": 0.3598, "rewards/accuracies": 0.546875, "rewards/chosen": 0.26575252413749695, "rewards/margins": 0.011591208167374134, "rewards/rejected": 0.2541612982749939, "sft_loss": 0.2775415778160095, "step": 45 }, { "epoch": 0.606010703993413, "grad_norm": 0.22258229553699493, "learning_rate": 1.9763993430846392e-07, "logits/chosen": 10.098237037658691, "logits/rejected": 10.78830337524414, "logps/chosen": -78.46400451660156, "logps/ref_chosen": -107.08544158935547, "logps/ref_rejected": -120.38542175292969, "logps/rejected": -91.22920989990234, "loss": 0.3642, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.28621435165405273, "rewards/margins": -0.005347713828086853, "rewards/rejected": 0.2915620803833008, "sft_loss": 0.2810046374797821, "step": 46 }, { "epoch": 0.6191848497324002, "grad_norm": 0.18880513310432434, "learning_rate": 1.862393078595809e-07, "logits/chosen": 9.973186492919922, "logits/rejected": 11.202853202819824, "logps/chosen": -79.4144287109375, "logps/ref_chosen": -105.74787902832031, "logps/ref_rejected": -122.93606567382812, "logps/rejected": -97.95023345947266, "loss": 0.3543, "rewards/accuracies": 0.515625, "rewards/chosen": 0.2633345425128937, "rewards/margins": 0.013476208783686161, "rewards/rejected": 0.2498583048582077, "sft_loss": 0.2710249125957489, "step": 47 }, { "epoch": 0.6323589954713874, "grad_norm": 0.23856320977210999, "learning_rate": 1.7497884124991485e-07, "logits/chosen": 10.487789154052734, "logits/rejected": 11.431607246398926, "logps/chosen": -77.9845199584961, "logps/ref_chosen": -105.3005599975586, "logps/ref_rejected": -123.93569946289062, "logps/rejected": -95.95220184326172, "loss": 0.3612, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.2731604278087616, "rewards/margins": -0.006674529053270817, "rewards/rejected": 0.27983492612838745, "sft_loss": 0.2770739197731018, "step": 48 }, { "epoch": 0.6455331412103746, "grad_norm": 0.26441866159439087, "learning_rate": 1.6388328742891678e-07, "logits/chosen": 10.524215698242188, "logits/rejected": 11.406645774841309, "logps/chosen": -76.19466400146484, "logps/ref_chosen": -104.30430603027344, "logps/ref_rejected": -115.85497283935547, "logps/rejected": -88.74186706542969, "loss": 0.3633, "rewards/accuracies": 0.46875, "rewards/chosen": 0.28109651803970337, "rewards/margins": 0.009965332224965096, "rewards/rejected": 0.27113115787506104, "sft_loss": 0.28159454464912415, "step": 49 }, { "epoch": 0.6587072869493619, "grad_norm": 0.22752611339092255, "learning_rate": 1.5297703683193753e-07, "logits/chosen": 10.119732856750488, "logits/rejected": 10.99254035949707, "logps/chosen": -77.70559692382812, "logps/ref_chosen": -104.65946960449219, "logps/ref_rejected": -118.84170532226562, "logps/rejected": -91.9924087524414, "loss": 0.3505, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.26953861117362976, "rewards/margins": 0.001045763841830194, "rewards/rejected": 0.2684928774833679, "sft_loss": 0.264681875705719, "step": 50 }, { "epoch": 0.6718814326883491, "grad_norm": 0.1437837928533554, "learning_rate": 1.422840637647574e-07, "logits/chosen": 10.233728408813477, "logits/rejected": 10.782451629638672, "logps/chosen": -76.79963684082031, "logps/ref_chosen": -104.4243392944336, "logps/ref_rejected": -117.16233825683594, "logps/rejected": -90.30931091308594, "loss": 0.3504, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.2762470245361328, "rewards/margins": 0.007716692518442869, "rewards/rejected": 0.26853030920028687, "sft_loss": 0.2653406262397766, "step": 51 }, { "epoch": 0.6850555784273363, "grad_norm": 0.13265302777290344, "learning_rate": 1.3182787370285865e-07, "logits/chosen": 9.633808135986328, "logits/rejected": 10.798635482788086, "logps/chosen": -74.69645690917969, "logps/ref_chosen": -101.99165344238281, "logps/ref_rejected": -123.20516204833984, "logps/rejected": -95.3323745727539, "loss": 0.3492, "rewards/accuracies": 0.5, "rewards/chosen": 0.2729518711566925, "rewards/margins": -0.005775850266218185, "rewards/rejected": 0.2787277400493622, "sft_loss": 0.2620721757411957, "step": 52 }, { "epoch": 0.6982297241663236, "grad_norm": 0.18237581849098206, "learning_rate": 1.2163145162128946e-07, "logits/chosen": 10.135986328125, "logits/rejected": 11.036626815795898, "logps/chosen": -80.85763549804688, "logps/ref_chosen": -108.26175689697266, "logps/ref_rejected": -118.12374114990234, "logps/rejected": -91.23506164550781, "loss": 0.3501, "rewards/accuracies": 0.53125, "rewards/chosen": 0.27404123544692993, "rewards/margins": 0.005154372192919254, "rewards/rejected": 0.26888686418533325, "sft_loss": 0.26453250646591187, "step": 53 }, { "epoch": 0.7114038699053108, "grad_norm": 0.24895808100700378, "learning_rate": 1.1171721146870014e-07, "logits/chosen": 10.246241569519043, "logits/rejected": 11.237100601196289, "logps/chosen": -78.87728881835938, "logps/ref_chosen": -108.5864028930664, "logps/ref_rejected": -130.25155639648438, "logps/rejected": -99.01264953613281, "loss": 0.3761, "rewards/accuracies": 0.40625, "rewards/chosen": 0.29709118604660034, "rewards/margins": -0.015297865495085716, "rewards/rejected": 0.3123890459537506, "sft_loss": 0.29449015855789185, "step": 54 }, { "epoch": 0.724578015644298, "grad_norm": 0.16947855055332184, "learning_rate": 1.0210694689661939e-07, "logits/chosen": 10.253119468688965, "logits/rejected": 11.077024459838867, "logps/chosen": -77.06118774414062, "logps/ref_chosen": -105.69741821289062, "logps/ref_rejected": -122.07044219970703, "logps/rejected": -92.63545227050781, "loss": 0.3522, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2863622307777405, "rewards/margins": -0.0079876147210598, "rewards/rejected": 0.2943498492240906, "sft_loss": 0.26553767919540405, "step": 55 }, { "epoch": 0.7377521613832853, "grad_norm": 0.2588522434234619, "learning_rate": 9.282178335227883e-08, "logits/chosen": 9.869044303894043, "logits/rejected": 11.061833381652832, "logps/chosen": -78.13841247558594, "logps/ref_chosen": -106.5007095336914, "logps/ref_rejected": -123.01736450195312, "logps/rejected": -94.39738464355469, "loss": 0.365, "rewards/accuracies": 0.53125, "rewards/chosen": 0.28362295031547546, "rewards/margins": -0.0025769099593162537, "rewards/rejected": 0.2861998379230499, "sft_loss": 0.28222140669822693, "step": 56 }, { "epoch": 0.7509263071222725, "grad_norm": 0.16184063255786896, "learning_rate": 8.388213164029459e-08, "logits/chosen": 10.650407791137695, "logits/rejected": 11.406877517700195, "logps/chosen": -81.62381744384766, "logps/ref_chosen": -109.18460083007812, "logps/ref_rejected": -124.3697280883789, "logps/rejected": -95.28343200683594, "loss": 0.3652, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.2756078243255615, "rewards/margins": -0.015255214646458626, "rewards/rejected": 0.290863037109375, "sft_loss": 0.28094765543937683, "step": 57 }, { "epoch": 0.7641004528612598, "grad_norm": 0.36946606636047363, "learning_rate": 7.530764305528958e-08, "logits/chosen": 10.020023345947266, "logits/rejected": 10.644125938415527, "logps/chosen": -75.29403686523438, "logps/ref_chosen": -104.43944549560547, "logps/ref_rejected": -118.44985961914062, "logps/rejected": -87.67448425292969, "loss": 0.355, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.29145413637161255, "rewards/margins": -0.01629968360066414, "rewards/rejected": 0.3077538013458252, "sft_loss": 0.26799270510673523, "step": 58 }, { "epoch": 0.7772745986002471, "grad_norm": 0.2249249815940857, "learning_rate": 6.711716618408281e-08, "logits/chosen": 10.270488739013672, "logits/rejected": 11.220294952392578, "logps/chosen": -75.23960876464844, "logps/ref_chosen": -103.32658386230469, "logps/ref_rejected": -121.63726806640625, "logps/rejected": -92.84001922607422, "loss": 0.3614, "rewards/accuracies": 0.4140625, "rewards/chosen": 0.2808696925640106, "rewards/margins": -0.007102856878191233, "rewards/rejected": 0.2879725694656372, "sft_loss": 0.2771528363227844, "step": 59 }, { "epoch": 0.7904487443392343, "grad_norm": 0.14675669372081757, "learning_rate": 5.932870547240454e-08, "logits/chosen": 10.010223388671875, "logits/rejected": 11.097480773925781, "logps/chosen": -74.62510681152344, "logps/ref_chosen": -102.98921966552734, "logps/ref_rejected": -124.47185516357422, "logps/rejected": -94.94497680664062, "loss": 0.3514, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.2836410105228424, "rewards/margins": -0.011627687141299248, "rewards/rejected": 0.295268714427948, "sft_loss": 0.26409104466438293, "step": 60 }, { "epoch": 0.8036228900782215, "grad_norm": 0.26351556181907654, "learning_rate": 5.1959381647217665e-08, "logits/chosen": 10.073440551757812, "logits/rejected": 11.036574363708496, "logps/chosen": -76.29356384277344, "logps/ref_chosen": -106.28311157226562, "logps/ref_rejected": -121.47750854492188, "logps/rejected": -91.61712646484375, "loss": 0.3592, "rewards/accuracies": 0.5, "rewards/chosen": 0.29989543557167053, "rewards/margins": 0.0012914962135255337, "rewards/rejected": 0.29860392212867737, "sft_loss": 0.27550363540649414, "step": 61 }, { "epoch": 0.8167970358172087, "grad_norm": 0.24507783353328705, "learning_rate": 4.502539408164385e-08, "logits/chosen": 10.17712688446045, "logits/rejected": 10.999006271362305, "logps/chosen": -79.46083068847656, "logps/ref_chosen": -109.67979431152344, "logps/ref_rejected": -120.36711120605469, "logps/rejected": -90.30607604980469, "loss": 0.3496, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.3021896481513977, "rewards/margins": 0.0015792613849043846, "rewards/rejected": 0.30061036348342896, "sft_loss": 0.2635141909122467, "step": 62 }, { "epoch": 0.829971181556196, "grad_norm": 1.0491613149642944, "learning_rate": 3.854198518522564e-08, "logits/chosen": 10.305301666259766, "logits/rejected": 11.398462295532227, "logps/chosen": -77.59590911865234, "logps/ref_chosen": -106.88896942138672, "logps/ref_rejected": -122.57796478271484, "logps/rejected": -94.59490203857422, "loss": 0.3623, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.2929304838180542, "rewards/margins": 0.013099929317831993, "rewards/rejected": 0.27983057498931885, "sft_loss": 0.28068116307258606, "step": 63 }, { "epoch": 0.8431453272951832, "grad_norm": 0.2025328278541565, "learning_rate": 3.2523406897802444e-08, "logits/chosen": 10.281243324279785, "logits/rejected": 10.981318473815918, "logps/chosen": -80.57833862304688, "logps/ref_chosen": -109.29510498046875, "logps/ref_rejected": -121.51821899414062, "logps/rejected": -91.47090911865234, "loss": 0.3666, "rewards/accuracies": 0.4140625, "rewards/chosen": 0.2871675491333008, "rewards/margins": -0.013305490836501122, "rewards/rejected": 0.30047303438186646, "sft_loss": 0.2829879820346832, "step": 64 }, { "epoch": 0.8563194730341704, "grad_norm": 0.2945575416088104, "learning_rate": 2.6982889360653376e-08, "logits/chosen": 9.850225448608398, "logits/rejected": 11.152016639709473, "logps/chosen": -79.17605590820312, "logps/ref_chosen": -110.15232849121094, "logps/ref_rejected": -127.54279327392578, "logps/rejected": -98.93862915039062, "loss": 0.3614, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.30976271629333496, "rewards/margins": 0.023721186444163322, "rewards/rejected": 0.2860415279865265, "sft_loss": 0.2810002267360687, "step": 65 }, { "epoch": 0.8694936187731577, "grad_norm": 0.24030965566635132, "learning_rate": 2.1932611833775843e-08, "logits/chosen": 9.87226390838623, "logits/rejected": 10.928711891174316, "logps/chosen": -75.15364837646484, "logps/ref_chosen": -104.0207748413086, "logps/ref_rejected": -126.93211364746094, "logps/rejected": -96.0455093383789, "loss": 0.3502, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.28867122530937195, "rewards/margins": -0.020194828510284424, "rewards/rejected": 0.308866024017334, "sft_loss": 0.2615111172199249, "step": 66 }, { "epoch": 0.8826677645121449, "grad_norm": 0.20665204524993896, "learning_rate": 1.738367592322837e-08, "logits/chosen": 10.203917503356934, "logits/rejected": 11.145586013793945, "logps/chosen": -74.83247375488281, "logps/ref_chosen": -104.55751037597656, "logps/ref_rejected": -119.71514892578125, "logps/rejected": -90.24601745605469, "loss": 0.3513, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.2972503900527954, "rewards/margins": 0.0025589880533516407, "rewards/rejected": 0.29469138383865356, "sft_loss": 0.26581910252571106, "step": 67 }, { "epoch": 0.8958419102511321, "grad_norm": 0.2984723150730133, "learning_rate": 1.3346081177391472e-08, "logits/chosen": 10.52553939819336, "logits/rejected": 10.891569137573242, "logps/chosen": -78.711181640625, "logps/ref_chosen": -107.26033020019531, "logps/ref_rejected": -115.8590087890625, "logps/rejected": -86.45365905761719, "loss": 0.3625, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.285491406917572, "rewards/margins": -0.008561921305954456, "rewards/rejected": 0.2940533459186554, "sft_loss": 0.2784934639930725, "step": 68 }, { "epoch": 0.9090160559901194, "grad_norm": 0.46800193190574646, "learning_rate": 9.828703105789981e-09, "logits/chosen": 10.491955757141113, "logits/rejected": 11.398428916931152, "logps/chosen": -75.86448669433594, "logps/ref_chosen": -106.8610610961914, "logps/ref_rejected": -122.44428253173828, "logps/rejected": -92.12076568603516, "loss": 0.3613, "rewards/accuracies": 0.53125, "rewards/chosen": 0.30996572971343994, "rewards/margins": 0.006730497814714909, "rewards/rejected": 0.3032352328300476, "sft_loss": 0.2787122130393982, "step": 69 }, { "epoch": 0.9221902017291066, "grad_norm": 0.2754024565219879, "learning_rate": 6.839273668796747e-09, "logits/chosen": 10.182811737060547, "logits/rejected": 10.726500511169434, "logps/chosen": -78.67579650878906, "logps/ref_chosen": -106.58778381347656, "logps/ref_rejected": -124.77790832519531, "logps/rejected": -95.84029388427734, "loss": 0.3553, "rewards/accuracies": 0.453125, "rewards/chosen": 0.27911996841430664, "rewards/margins": -0.010256174020469189, "rewards/rejected": 0.2893761098384857, "sft_loss": 0.2690708339214325, "step": 70 }, { "epoch": 0.9353643474680938, "grad_norm": 0.26821038126945496, "learning_rate": 4.384364281105973e-09, "logits/chosen": 10.197190284729004, "logits/rejected": 11.432995796203613, "logps/chosen": -73.4859390258789, "logps/ref_chosen": -104.39148712158203, "logps/ref_rejected": -120.59461212158203, "logps/rejected": -90.0641860961914, "loss": 0.3671, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.3090555667877197, "rewards/margins": 0.0037512234412133694, "rewards/rejected": 0.3053043484687805, "sft_loss": 0.28556591272354126, "step": 71 }, { "epoch": 0.9485384932070811, "grad_norm": 0.227573961019516, "learning_rate": 2.469371366337264e-09, "logits/chosen": 10.291793823242188, "logits/rejected": 11.314682006835938, "logps/chosen": -79.75538635253906, "logps/ref_chosen": -108.53898620605469, "logps/ref_rejected": -123.26167297363281, "logps/rejected": -93.88778686523438, "loss": 0.3587, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.28783589601516724, "rewards/margins": -0.005902983248233795, "rewards/rejected": 0.29373887181282043, "sft_loss": 0.2740861177444458, "step": 72 }, { "epoch": 0.9617126389460683, "grad_norm": 0.2832602262496948, "learning_rate": 1.0985044945254762e-09, "logits/chosen": 10.094602584838867, "logits/rejected": 10.940149307250977, "logps/chosen": -82.2466049194336, "logps/ref_chosen": -110.50595092773438, "logps/ref_rejected": -124.92510223388672, "logps/rejected": -97.02880859375, "loss": 0.3518, "rewards/accuracies": 0.515625, "rewards/chosen": 0.28259342908859253, "rewards/margins": 0.003630502847954631, "rewards/rejected": 0.2789629101753235, "sft_loss": 0.2665908932685852, "step": 73 }, { "epoch": 0.9748867846850556, "grad_norm": 0.37028247117996216, "learning_rate": 2.7477712857215675e-10, "logits/chosen": 10.341303825378418, "logits/rejected": 11.190547943115234, "logps/chosen": -78.12569427490234, "logps/ref_chosen": -107.1200942993164, "logps/ref_rejected": -120.22421264648438, "logps/rejected": -92.23339080810547, "loss": 0.3501, "rewards/accuracies": 0.546875, "rewards/chosen": 0.289944052696228, "rewards/margins": 0.010035790503025055, "rewards/rejected": 0.27990826964378357, "sft_loss": 0.2652747631072998, "step": 74 }, { "epoch": 0.9880609304240429, "grad_norm": 0.18064919114112854, "learning_rate": 0.0, "logits/chosen": 10.112634658813477, "logits/rejected": 10.916732788085938, "logps/chosen": -77.67636108398438, "logps/ref_chosen": -107.40764617919922, "logps/ref_rejected": -120.6578369140625, "logps/rejected": -91.93303680419922, "loss": 0.3442, "rewards/accuracies": 0.53125, "rewards/chosen": 0.2973128855228424, "rewards/margins": 0.01006484404206276, "rewards/rejected": 0.28724804520606995, "sft_loss": 0.2577492296695709, "step": 75 }, { "epoch": 0.9880609304240429, "step": 75, "total_flos": 0.0, "train_loss": 0.3833660825093587, "train_runtime": 8038.2327, "train_samples_per_second": 1.209, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }