{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7904487443392343, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.4815484285354614, "learning_rate": 6.25e-08, "logits/chosen": 10.088521957397461, "logits/rejected": 10.263787269592285, "logps/chosen": -163.12940979003906, "logps/ref_chosen": -163.12940979003906, "logps/ref_rejected": -171.48428344726562, "logps/rejected": -171.48428344726562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.627070426940918, "learning_rate": 1.25e-07, "logits/chosen": 10.592972755432129, "logits/rejected": 10.720216751098633, "logps/chosen": -155.91574096679688, "logps/ref_chosen": -155.91574096679688, "logps/ref_rejected": -161.34078979492188, "logps/rejected": -161.34078979492188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.4148138165473938, "learning_rate": 1.875e-07, "logits/chosen": 10.043272972106934, "logits/rejected": 10.398024559020996, "logps/chosen": -158.3568115234375, "logps/ref_chosen": -157.65640258789062, "logps/ref_rejected": -168.5882110595703, "logps/rejected": -168.91085815429688, "loss": 0.6951, "rewards/accuracies": 0.3828125, "rewards/chosen": -0.007004000246524811, "rewards/margins": -0.0037774655502289534, "rewards/rejected": -0.003226534929126501, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 0.7029770612716675, "learning_rate": 2.5e-07, "logits/chosen": 10.250253677368164, "logits/rejected": 10.45008659362793, "logps/chosen": -164.01119995117188, "logps/ref_chosen": -162.89878845214844, "logps/ref_rejected": -168.30462646484375, "logps/rejected": -169.1818389892578, "loss": 0.6944, "rewards/accuracies": 0.453125, "rewards/chosen": -0.01112416572868824, "rewards/margins": -0.0023521997500211, "rewards/rejected": -0.008771965280175209, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.4063253104686737, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 10.442557334899902, "logits/rejected": 10.740192413330078, "logps/chosen": -156.1859130859375, "logps/ref_chosen": -156.03257751464844, "logps/ref_rejected": -165.37911987304688, "logps/rejected": -165.6518096923828, "loss": 0.6926, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.001533512957394123, "rewards/margins": 0.001193464733660221, "rewards/rejected": -0.002726977691054344, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.4845049977302551, "learning_rate": 3.75e-07, "logits/chosen": 10.906261444091797, "logits/rejected": 11.201122283935547, "logps/chosen": -162.45692443847656, "logps/ref_chosen": -161.98570251464844, "logps/ref_rejected": -169.72560119628906, "logps/rejected": -170.18275451660156, "loss": 0.6932, "rewards/accuracies": 0.515625, "rewards/chosen": -0.004712029360234737, "rewards/margins": -0.00014030117017682642, "rewards/rejected": -0.004571728408336639, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 0.8172655701637268, "learning_rate": 4.375e-07, "logits/chosen": 9.883949279785156, "logits/rejected": 10.030972480773926, "logps/chosen": -157.43295288085938, "logps/ref_chosen": -157.26968383789062, "logps/ref_rejected": -167.37155151367188, "logps/rejected": -167.53939819335938, "loss": 0.6931, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0016327811172232032, "rewards/margins": 4.5756096369586885e-05, "rewards/rejected": -0.0016785369953140616, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.588524341583252, "learning_rate": 5e-07, "logits/chosen": 10.633930206298828, "logits/rejected": 10.81590747833252, "logps/chosen": -162.8237762451172, "logps/ref_chosen": -162.2948455810547, "logps/ref_rejected": -172.98866271972656, "logps/rejected": -173.56680297851562, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.005289244465529919, "rewards/margins": 0.0004922347725369036, "rewards/rejected": -0.005781479645520449, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.46077635884284973, "learning_rate": 4.997252228714278e-07, "logits/chosen": 10.326555252075195, "logits/rejected": 10.736672401428223, "logps/chosen": -164.5288543701172, "logps/ref_chosen": -163.37091064453125, "logps/ref_rejected": -173.1500701904297, "logps/rejected": -174.08392333984375, "loss": 0.6943, "rewards/accuracies": 0.4375, "rewards/chosen": -0.011579334735870361, "rewards/margins": -0.0022407739888876677, "rewards/rejected": -0.00933856051415205, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 0.673312783241272, "learning_rate": 4.989014955054745e-07, "logits/chosen": 10.325155258178711, "logits/rejected": 10.473593711853027, "logps/chosen": -157.8944091796875, "logps/ref_chosen": -156.87838745117188, "logps/ref_rejected": -165.17373657226562, "logps/rejected": -166.20751953125, "loss": 0.6931, "rewards/accuracies": 0.546875, "rewards/chosen": -0.010160216130316257, "rewards/margins": 0.00017760891932994127, "rewards/rejected": -0.010337824933230877, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 0.6500194668769836, "learning_rate": 4.975306286336627e-07, "logits/chosen": 10.476134300231934, "logits/rejected": 10.66375732421875, "logps/chosen": -161.99935913085938, "logps/ref_chosen": -160.73855590820312, "logps/ref_rejected": -173.1862030029297, "logps/rejected": -174.4076385498047, "loss": 0.6934, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0126079972833395, "rewards/margins": -0.00039388981531374156, "rewards/rejected": -0.012214107438921928, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.5539909601211548, "learning_rate": 4.956156357188939e-07, "logits/chosen": 10.318845748901367, "logits/rejected": 10.355680465698242, "logps/chosen": -167.43121337890625, "logps/ref_chosen": -165.21177673339844, "logps/ref_rejected": -170.47381591796875, "logps/rejected": -172.76483154296875, "loss": 0.6928, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.022194450721144676, "rewards/margins": 0.0007156741339713335, "rewards/rejected": -0.02291012369096279, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 0.5150694251060486, "learning_rate": 4.931607263312032e-07, "logits/chosen": 9.89578914642334, "logits/rejected": 10.236948013305664, "logps/chosen": -161.29905700683594, "logps/ref_chosen": -158.68667602539062, "logps/ref_rejected": -168.57968139648438, "logps/rejected": -171.51979064941406, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": -0.026123855262994766, "rewards/margins": 0.0032772955019026995, "rewards/rejected": -0.029401153326034546, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 0.6596489548683167, "learning_rate": 4.9017129689421e-07, "logits/chosen": 10.484810829162598, "logits/rejected": 10.99763298034668, "logps/chosen": -157.44769287109375, "logps/ref_chosen": -153.92340087890625, "logps/ref_rejected": -167.03564453125, "logps/rejected": -170.4557647705078, "loss": 0.6937, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.03524318337440491, "rewards/margins": -0.0010417333105579019, "rewards/rejected": -0.034201446920633316, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.39550018310546875, "learning_rate": 4.866539188226085e-07, "logits/chosen": 10.189282417297363, "logits/rejected": 10.43722152709961, "logps/chosen": -166.56544494628906, "logps/ref_chosen": -162.66110229492188, "logps/ref_rejected": -168.7485809326172, "logps/rejected": -172.78038024902344, "loss": 0.6926, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03904342278838158, "rewards/margins": 0.0012746157590299845, "rewards/rejected": -0.04031803831458092, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 0.6276482939720154, "learning_rate": 4.826163240767716e-07, "logits/chosen": 10.743437767028809, "logits/rejected": 11.031370162963867, "logps/chosen": -166.45135498046875, "logps/ref_chosen": -163.39239501953125, "logps/ref_rejected": -172.29183959960938, "logps/rejected": -175.4534912109375, "loss": 0.6927, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.03058951534330845, "rewards/margins": 0.0010271857026964426, "rewards/rejected": -0.0316167026758194, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.516729474067688, "learning_rate": 4.780673881662242e-07, "logits/chosen": 10.332087516784668, "logits/rejected": 10.48865032196045, "logps/chosen": -157.08522033691406, "logps/ref_chosen": -153.6072540283203, "logps/ref_rejected": -161.9541473388672, "logps/rejected": -165.6874542236328, "loss": 0.6919, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.03477972373366356, "rewards/margins": 0.0025533493608236313, "rewards/rejected": -0.03733307495713234, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 0.70009446144104, "learning_rate": 4.730171106393466e-07, "logits/chosen": 10.40684986114502, "logits/rejected": 10.725347518920898, "logps/chosen": -158.2038116455078, "logps/ref_chosen": -154.3197021484375, "logps/ref_rejected": -161.81753540039062, "logps/rejected": -165.58631896972656, "loss": 0.6938, "rewards/accuracies": 0.484375, "rewards/chosen": -0.03884127736091614, "rewards/margins": -0.0011533537181094289, "rewards/rejected": -0.03768792748451233, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.47613224387168884, "learning_rate": 4.6747659310219757e-07, "logits/chosen": 10.489011764526367, "logits/rejected": 10.455073356628418, "logps/chosen": -171.99160766601562, "logps/ref_chosen": -167.8755340576172, "logps/ref_rejected": -175.09603881835938, "logps/rejected": -179.593994140625, "loss": 0.6913, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.04116089642047882, "rewards/margins": 0.003818710334599018, "rewards/rejected": -0.04497961327433586, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 0.6483292579650879, "learning_rate": 4.6145801481477433e-07, "logits/chosen": 10.415058135986328, "logits/rejected": 10.774059295654297, "logps/chosen": -163.5430450439453, "logps/ref_chosen": -159.07583618164062, "logps/ref_rejected": -169.23069763183594, "logps/rejected": -173.73776245117188, "loss": 0.693, "rewards/accuracies": 0.515625, "rewards/chosen": -0.04467229172587395, "rewards/margins": 0.00039826278225518763, "rewards/rejected": -0.04507055878639221, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 0.5634174942970276, "learning_rate": 4.549746059183561e-07, "logits/chosen": 10.342830657958984, "logits/rejected": 10.680377960205078, "logps/chosen": -163.2490997314453, "logps/ref_chosen": -159.25521850585938, "logps/ref_rejected": -169.57681274414062, "logps/rejected": -173.69276428222656, "loss": 0.6926, "rewards/accuracies": 0.578125, "rewards/chosen": -0.03993882238864899, "rewards/margins": 0.0012204290833324194, "rewards/rejected": -0.0411592535674572, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 0.3794897198677063, "learning_rate": 4.480406183527823e-07, "logits/chosen": 10.29517936706543, "logits/rejected": 10.647847175598145, "logps/chosen": -161.54783630371094, "logps/ref_chosen": -157.0568084716797, "logps/ref_rejected": -163.96209716796875, "logps/rejected": -168.51736450195312, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04491012915968895, "rewards/margins": 0.0006425387691706419, "rewards/rejected": -0.04555266723036766, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.4016757607460022, "learning_rate": 4.4067129452759546e-07, "logits/chosen": 10.357274055480957, "logits/rejected": 10.63122844696045, "logps/chosen": -162.94578552246094, "logps/ref_chosen": -158.10250854492188, "logps/ref_rejected": -169.85250854492188, "logps/rejected": -174.79525756835938, "loss": 0.6927, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.04843292012810707, "rewards/margins": 0.0009945080382749438, "rewards/rejected": -0.04942742735147476, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 0.46131113171577454, "learning_rate": 4.3288283381591725e-07, "logits/chosen": 10.260627746582031, "logits/rejected": 10.424566268920898, "logps/chosen": -163.2139129638672, "logps/ref_chosen": -158.93540954589844, "logps/ref_rejected": -168.12344360351562, "logps/rejected": -172.52456665039062, "loss": 0.6926, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.042785100638866425, "rewards/margins": 0.0012263581156730652, "rewards/rejected": -0.04401145875453949, "step": 24 }, { "epoch": 0.32935364347468093, "grad_norm": 0.4610799252986908, "learning_rate": 4.246923569447104e-07, "logits/chosen": 10.461551666259766, "logits/rejected": 10.855925559997559, "logps/chosen": -165.60084533691406, "logps/ref_chosen": -161.0833740234375, "logps/ref_rejected": -174.85760498046875, "logps/rejected": -179.20965576171875, "loss": 0.694, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0451747290790081, "rewards/margins": -0.0016541833756491542, "rewards/rejected": -0.043520547449588776, "step": 25 }, { "epoch": 0.34252778921366817, "grad_norm": 0.41953545808792114, "learning_rate": 4.161178683597054e-07, "logits/chosen": 10.611435890197754, "logits/rejected": 10.745625495910645, "logps/chosen": -160.7465057373047, "logps/ref_chosen": -156.07315063476562, "logps/ref_rejected": -161.84292602539062, "logps/rejected": -166.85279846191406, "loss": 0.6915, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.046733610332012177, "rewards/margins": 0.0033648861572146416, "rewards/rejected": -0.05009850114583969, "step": 26 }, { "epoch": 0.3557019349526554, "grad_norm": 0.3880956470966339, "learning_rate": 4.0717821664772124e-07, "logits/chosen": 10.590215682983398, "logits/rejected": 10.893061637878418, "logps/chosen": -168.67279052734375, "logps/ref_chosen": -163.2271728515625, "logps/ref_rejected": -171.53738403320312, "logps/rejected": -176.9310302734375, "loss": 0.6935, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.05445636808872223, "rewards/margins": -0.00051975465612486, "rewards/rejected": -0.05393661558628082, "step": 27 }, { "epoch": 0.3688760806916426, "grad_norm": 0.5345169901847839, "learning_rate": 3.978930531033806e-07, "logits/chosen": 9.953861236572266, "logits/rejected": 10.416406631469727, "logps/chosen": -162.322021484375, "logps/ref_chosen": -157.08795166015625, "logps/ref_rejected": -167.1195068359375, "logps/rejected": -172.4853973388672, "loss": 0.6925, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.0523407980799675, "rewards/margins": 0.001318173250183463, "rewards/rejected": -0.053658969700336456, "step": 28 }, { "epoch": 0.3820502264306299, "grad_norm": 0.5297831296920776, "learning_rate": 3.882827885312998e-07, "logits/chosen": 10.323142051696777, "logits/rejected": 10.501938819885254, "logps/chosen": -168.2061004638672, "logps/ref_chosen": -163.59707641601562, "logps/ref_rejected": -171.89508056640625, "logps/rejected": -176.55738830566406, "loss": 0.6929, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.04609035328030586, "rewards/margins": 0.0005328265833668411, "rewards/rejected": -0.04662318155169487, "step": 29 }, { "epoch": 0.39522437216961714, "grad_norm": 0.35810208320617676, "learning_rate": 3.7836854837871044e-07, "logits/chosen": 10.40945053100586, "logits/rejected": 10.931025505065918, "logps/chosen": -169.71910095214844, "logps/ref_chosen": -164.91160583496094, "logps/ref_rejected": -176.66453552246094, "logps/rejected": -181.69285583496094, "loss": 0.6921, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.04807499051094055, "rewards/margins": 0.0022084712982177734, "rewards/rejected": -0.050283461809158325, "step": 30 }, { "epoch": 0.4083985179086044, "grad_norm": 0.44152358174324036, "learning_rate": 3.681721262971413e-07, "logits/chosen": 10.004611015319824, "logits/rejected": 10.491472244262695, "logps/chosen": -161.24798583984375, "logps/ref_chosen": -155.95877075195312, "logps/ref_rejected": -166.5852508544922, "logps/rejected": -172.22703552246094, "loss": 0.6915, "rewards/accuracies": 0.578125, "rewards/chosen": -0.052892111241817474, "rewards/margins": 0.003525771899148822, "rewards/rejected": -0.05641787871718407, "step": 31 }, { "epoch": 0.4215726636475916, "grad_norm": 0.5185390114784241, "learning_rate": 3.577159362352426e-07, "logits/chosen": 10.27377986907959, "logits/rejected": 10.56481647491455, "logps/chosen": -167.19921875, "logps/ref_chosen": -161.83575439453125, "logps/ref_rejected": -169.53759765625, "logps/rejected": -174.91342163085938, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.05363469570875168, "rewards/margins": 0.00012344191782176495, "rewards/rejected": -0.05375813692808151, "step": 32 }, { "epoch": 0.43474680938657884, "grad_norm": 1.1196942329406738, "learning_rate": 3.470229631680624e-07, "logits/chosen": 10.063702583312988, "logits/rejected": 10.693009376525879, "logps/chosen": -164.40225219726562, "logps/ref_chosen": -158.7517547607422, "logps/ref_rejected": -168.51002502441406, "logps/rejected": -174.00901794433594, "loss": 0.694, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.056504976004362106, "rewards/margins": -0.0015149968676269054, "rewards/rejected": -0.054989978671073914, "step": 33 }, { "epoch": 0.44792095512556607, "grad_norm": 0.593383252620697, "learning_rate": 3.361167125710832e-07, "logits/chosen": 9.993782043457031, "logits/rejected": 10.380085945129395, "logps/chosen": -170.81832885742188, "logps/ref_chosen": -165.12754821777344, "logps/ref_rejected": -177.654296875, "logps/rejected": -183.54998779296875, "loss": 0.6922, "rewards/accuracies": 0.53125, "rewards/chosen": -0.05690779164433479, "rewards/margins": 0.0020490488968789577, "rewards/rejected": -0.05895683914422989, "step": 34 }, { "epoch": 0.4610951008645533, "grad_norm": 0.4870688319206238, "learning_rate": 3.2502115875008516e-07, "logits/chosen": 10.418455123901367, "logits/rejected": 10.655467987060547, "logps/chosen": -165.74551391601562, "logps/ref_chosen": -159.895751953125, "logps/ref_rejected": -167.39785766601562, "logps/rejected": -173.5540008544922, "loss": 0.6917, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.058497510850429535, "rewards/margins": 0.0030638885218650103, "rewards/rejected": -0.06156139820814133, "step": 35 }, { "epoch": 0.47426924660354053, "grad_norm": 0.9538622498512268, "learning_rate": 3.137606921404191e-07, "logits/chosen": 10.286083221435547, "logits/rejected": 10.6614408493042, "logps/chosen": -170.4355926513672, "logps/ref_chosen": -165.02023315429688, "logps/ref_rejected": -172.28997802734375, "logps/rejected": -177.64358520507812, "loss": 0.6935, "rewards/accuracies": 0.546875, "rewards/chosen": -0.05415371432900429, "rewards/margins": -0.0006177356699481606, "rewards/rejected": -0.053535979241132736, "step": 36 }, { "epoch": 0.4874433923425278, "grad_norm": 0.6481872200965881, "learning_rate": 3.0236006569153616e-07, "logits/chosen": 10.281628608703613, "logits/rejected": 10.485464096069336, "logps/chosen": -167.83177185058594, "logps/ref_chosen": -162.57997131347656, "logps/ref_rejected": -174.94447326660156, "logps/rejected": -180.6204833984375, "loss": 0.6911, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.05251805856823921, "rewards/margins": 0.004242150578647852, "rewards/rejected": -0.056760210543870926, "step": 37 }, { "epoch": 0.500617538081515, "grad_norm": 0.6258410215377808, "learning_rate": 2.9084434045463254e-07, "logits/chosen": 10.348075866699219, "logits/rejected": 10.585733413696289, "logps/chosen": -170.00047302246094, "logps/ref_chosen": -164.2234649658203, "logps/ref_rejected": -170.92135620117188, "logps/rejected": -177.09783935546875, "loss": 0.6912, "rewards/accuracies": 0.546875, "rewards/chosen": -0.05776997655630112, "rewards/margins": 0.003995058126747608, "rewards/rejected": -0.06176503747701645, "step": 38 }, { "epoch": 0.5137916838205022, "grad_norm": 0.45227017998695374, "learning_rate": 2.7923883049302066e-07, "logits/chosen": 10.290374755859375, "logits/rejected": 10.424775123596191, "logps/chosen": -172.98834228515625, "logps/ref_chosen": -166.5240478515625, "logps/ref_rejected": -174.45970153808594, "logps/rejected": -180.691162109375, "loss": 0.6944, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0646430104970932, "rewards/margins": -0.002328395377844572, "rewards/rejected": -0.06231461465358734, "step": 39 }, { "epoch": 0.5269658295594895, "grad_norm": 0.7223658561706543, "learning_rate": 2.6756904723632324e-07, "logits/chosen": 10.33895492553711, "logits/rejected": 10.744306564331055, "logps/chosen": -162.80517578125, "logps/ref_chosen": -156.39364624023438, "logps/ref_rejected": -172.6676483154297, "logps/rejected": -178.97576904296875, "loss": 0.6938, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.06411512196063995, "rewards/margins": -0.0010338453575968742, "rewards/rejected": -0.0630812719464302, "step": 40 }, { "epoch": 0.5401399752984768, "grad_norm": 0.9710007905960083, "learning_rate": 2.5586064340081516e-07, "logits/chosen": 10.650144577026367, "logits/rejected": 10.64175796508789, "logps/chosen": -164.384765625, "logps/ref_chosen": -158.60899353027344, "logps/ref_rejected": -167.36000061035156, "logps/rejected": -173.48138427734375, "loss": 0.6915, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.057757824659347534, "rewards/margins": 0.0034558887127786875, "rewards/rejected": -0.061213716864585876, "step": 41 }, { "epoch": 0.553314121037464, "grad_norm": 1.060659408569336, "learning_rate": 2.4413935659918487e-07, "logits/chosen": 9.889198303222656, "logits/rejected": 10.20853042602539, "logps/chosen": -161.29913330078125, "logps/ref_chosen": -155.86929321289062, "logps/ref_rejected": -164.31619262695312, "logps/rejected": -170.2007598876953, "loss": 0.691, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.05429830774664879, "rewards/margins": 0.004547302611172199, "rewards/rejected": -0.05884561315178871, "step": 42 }, { "epoch": 0.5664882667764513, "grad_norm": 0.5962560176849365, "learning_rate": 2.3243095276367684e-07, "logits/chosen": 9.809310913085938, "logits/rejected": 10.134492874145508, "logps/chosen": -155.75411987304688, "logps/ref_chosen": -149.69866943359375, "logps/ref_rejected": -163.860107421875, "logps/rejected": -170.56063842773438, "loss": 0.69, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.06055447459220886, "rewards/margins": 0.006450990214943886, "rewards/rejected": -0.0670054703950882, "step": 43 }, { "epoch": 0.5796624125154385, "grad_norm": 0.38942375779151917, "learning_rate": 2.2076116950697937e-07, "logits/chosen": 10.249410629272461, "logits/rejected": 10.454606056213379, "logps/chosen": -161.172119140625, "logps/ref_chosen": -155.06076049804688, "logps/ref_rejected": -161.446044921875, "logps/rejected": -167.66583251953125, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.06111355125904083, "rewards/margins": 0.0010843212949112058, "rewards/rejected": -0.06219786778092384, "step": 44 }, { "epoch": 0.5928365582544257, "grad_norm": 0.3398829996585846, "learning_rate": 2.091556595453674e-07, "logits/chosen": 10.017583847045898, "logits/rejected": 10.271501541137695, "logps/chosen": -169.4973907470703, "logps/ref_chosen": -163.5751190185547, "logps/ref_rejected": -176.65078735351562, "logps/rejected": -183.13449096679688, "loss": 0.6904, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.059222809970378876, "rewards/margins": 0.005614194553345442, "rewards/rejected": -0.0648370012640953, "step": 45 }, { "epoch": 0.606010703993413, "grad_norm": 0.5492353439331055, "learning_rate": 1.9763993430846392e-07, "logits/chosen": 10.156986236572266, "logits/rejected": 10.36589241027832, "logps/chosen": -168.26568603515625, "logps/ref_chosen": -162.17233276367188, "logps/ref_rejected": -169.48728942871094, "logps/rejected": -176.28375244140625, "loss": 0.6897, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.06093353033065796, "rewards/margins": 0.007031145039945841, "rewards/rejected": -0.06796467304229736, "step": 46 }, { "epoch": 0.6191848497324002, "grad_norm": 0.361573725938797, "learning_rate": 1.862393078595809e-07, "logits/chosen": 10.36301040649414, "logits/rejected": 10.790814399719238, "logps/chosen": -164.67152404785156, "logps/ref_chosen": -158.01217651367188, "logps/ref_rejected": -170.02401733398438, "logps/rejected": -177.1544189453125, "loss": 0.6909, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.06659334897994995, "rewards/margins": 0.004710891284048557, "rewards/rejected": -0.07130423933267593, "step": 47 }, { "epoch": 0.6323589954713874, "grad_norm": 0.49494439363479614, "learning_rate": 1.7497884124991485e-07, "logits/chosen": 10.640326499938965, "logits/rejected": 10.784747123718262, "logps/chosen": -170.2487335205078, "logps/ref_chosen": -163.1754608154297, "logps/ref_rejected": -171.04629516601562, "logps/rejected": -178.30601501464844, "loss": 0.6923, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.07073251903057098, "rewards/margins": 0.0018647168762981892, "rewards/rejected": -0.07259723544120789, "step": 48 }, { "epoch": 0.6455331412103746, "grad_norm": 0.5513429045677185, "learning_rate": 1.6388328742891678e-07, "logits/chosen": 10.51612663269043, "logits/rejected": 10.971571922302246, "logps/chosen": -173.3045654296875, "logps/ref_chosen": -166.46066284179688, "logps/ref_rejected": -176.8931884765625, "logps/rejected": -183.9400634765625, "loss": 0.6922, "rewards/accuracies": 0.546875, "rewards/chosen": -0.06843903660774231, "rewards/margins": 0.002029917435720563, "rewards/rejected": -0.0704689472913742, "step": 49 }, { "epoch": 0.6587072869493619, "grad_norm": 0.4830932319164276, "learning_rate": 1.5297703683193753e-07, "logits/chosen": 10.228598594665527, "logits/rejected": 10.446922302246094, "logps/chosen": -160.33726501464844, "logps/ref_chosen": -154.5497589111328, "logps/ref_rejected": -163.0555419921875, "logps/rejected": -169.6492156982422, "loss": 0.6892, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.057875264436006546, "rewards/margins": 0.008061729371547699, "rewards/rejected": -0.06593699753284454, "step": 50 }, { "epoch": 0.6718814326883491, "grad_norm": 0.3345705568790436, "learning_rate": 1.422840637647574e-07, "logits/chosen": 9.768302917480469, "logits/rejected": 10.109855651855469, "logps/chosen": -163.43936157226562, "logps/ref_chosen": -156.50027465820312, "logps/ref_rejected": -165.34817504882812, "logps/rejected": -172.6494140625, "loss": 0.6914, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.06939102709293365, "rewards/margins": 0.003621481591835618, "rewards/rejected": -0.07301251590251923, "step": 51 }, { "epoch": 0.6850555784273363, "grad_norm": 0.4939371347427368, "learning_rate": 1.3182787370285865e-07, "logits/chosen": 10.192159652709961, "logits/rejected": 10.675037384033203, "logps/chosen": -166.86045837402344, "logps/ref_chosen": -159.9600830078125, "logps/ref_rejected": -173.38265991210938, "logps/rejected": -180.2958526611328, "loss": 0.6932, "rewards/accuracies": 0.546875, "rewards/chosen": -0.06900367140769958, "rewards/margins": 0.00012808499741367996, "rewards/rejected": -0.0691317617893219, "step": 52 }, { "epoch": 0.6982297241663236, "grad_norm": 0.34416159987449646, "learning_rate": 1.2163145162128946e-07, "logits/chosen": 10.153275489807129, "logits/rejected": 10.229888916015625, "logps/chosen": -163.69931030273438, "logps/ref_chosen": -156.9239501953125, "logps/ref_rejected": -164.6415557861328, "logps/rejected": -171.66647338867188, "loss": 0.692, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06775350123643875, "rewards/margins": 0.002495494903996587, "rewards/rejected": -0.07024899125099182, "step": 53 }, { "epoch": 0.7114038699053108, "grad_norm": 0.6642581820487976, "learning_rate": 1.1171721146870014e-07, "logits/chosen": 10.387247085571289, "logits/rejected": 10.734329223632812, "logps/chosen": -165.91758728027344, "logps/ref_chosen": -158.57778930664062, "logps/ref_rejected": -167.73760986328125, "logps/rejected": -174.94659423828125, "loss": 0.6939, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.07339778542518616, "rewards/margins": -0.0013078839983791113, "rewards/rejected": -0.07208990305662155, "step": 54 }, { "epoch": 0.724578015644298, "grad_norm": 0.4084431529045105, "learning_rate": 1.0210694689661939e-07, "logits/chosen": 10.050434112548828, "logits/rejected": 10.472386360168457, "logps/chosen": -165.36961364746094, "logps/ref_chosen": -157.52191162109375, "logps/ref_rejected": -169.84109497070312, "logps/rejected": -177.8925018310547, "loss": 0.6922, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07847694307565689, "rewards/margins": 0.0020371756982058287, "rewards/rejected": -0.08051411807537079, "step": 55 }, { "epoch": 0.7377521613832853, "grad_norm": 0.4536319971084595, "learning_rate": 9.282178335227883e-08, "logits/chosen": 10.111780166625977, "logits/rejected": 10.440515518188477, "logps/chosen": -165.53118896484375, "logps/ref_chosen": -157.51145935058594, "logps/ref_rejected": -171.0047607421875, "logps/rejected": -179.44081115722656, "loss": 0.6912, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.08019725978374481, "rewards/margins": 0.004163055680692196, "rewards/rejected": -0.08436032384634018, "step": 56 }, { "epoch": 0.7509263071222725, "grad_norm": 0.7599093914031982, "learning_rate": 8.388213164029459e-08, "logits/chosen": 10.463480949401855, "logits/rejected": 10.61793041229248, "logps/chosen": -163.78585815429688, "logps/ref_chosen": -155.50022888183594, "logps/ref_rejected": -163.588623046875, "logps/rejected": -171.88169860839844, "loss": 0.6932, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.08285625278949738, "rewards/margins": 7.443735376000404e-05, "rewards/rejected": -0.08293069899082184, "step": 57 }, { "epoch": 0.7641004528612598, "grad_norm": 1.0936464071273804, "learning_rate": 7.530764305528958e-08, "logits/chosen": 10.068573951721191, "logits/rejected": 10.497234344482422, "logps/chosen": -168.2652587890625, "logps/ref_chosen": -160.71681213378906, "logps/ref_rejected": -171.31556701660156, "logps/rejected": -179.1130828857422, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.07548440247774124, "rewards/margins": 0.0024907987099140882, "rewards/rejected": -0.07797519862651825, "step": 58 }, { "epoch": 0.7772745986002471, "grad_norm": 0.49573731422424316, "learning_rate": 6.711716618408281e-08, "logits/chosen": 10.322929382324219, "logits/rejected": 10.415237426757812, "logps/chosen": -176.02772521972656, "logps/ref_chosen": -168.35157775878906, "logps/ref_rejected": -173.06912231445312, "logps/rejected": -181.29351806640625, "loss": 0.6905, "rewards/accuracies": 0.578125, "rewards/chosen": -0.07676173746585846, "rewards/margins": 0.005482470151036978, "rewards/rejected": -0.08224420994520187, "step": 59 }, { "epoch": 0.7904487443392343, "grad_norm": 0.6191822290420532, "learning_rate": 5.932870547240454e-08, "logits/chosen": 10.167850494384766, "logits/rejected": 10.512453079223633, "logps/chosen": -162.01792907714844, "logps/ref_chosen": -154.5463409423828, "logps/ref_rejected": -164.9440155029297, "logps/rejected": -172.77783203125, "loss": 0.6915, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.07471606135368347, "rewards/margins": 0.0036221330519765615, "rewards/rejected": -0.07833817601203918, "step": 60 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }