{ "best_global_step": 1330, "best_metric": 6.090111255645752, "best_model_checkpoint": "/mnt/localssd/svadugur/36464/preference_pairs-speaker=gemma-listener=pixtral_ft-length_conditioned=False-contexts=hard-36464/checkpoint-1330", "epoch": 3.8349584687612857, "eval_steps": 70, "global_step": 1330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.2975962162017822, "eval_logits/rejected": -2.248411178588867, "eval_logps/chosen": -46.31641387939453, "eval_logps/rejected": -46.95017623901367, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 233.5439, "eval_samples_per_second": 0.527, "eval_steps_per_second": 0.265, "step": 0 }, { "epoch": 0.00288912964969303, "grad_norm": 2.8075449466705322, "learning_rate": 1e-06, "logits/chosen": -2.3175277709960938, "logits/rejected": -2.276055335998535, "logps/chosen": -47.75232696533203, "logps/rejected": -48.14625549316406, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00577825929938606, "grad_norm": 3.2962496280670166, "learning_rate": 9.997109826589594e-07, "logits/chosen": -2.3267276287078857, "logits/rejected": -2.285989761352539, "logps/chosen": -45.59309768676758, "logps/rejected": -51.92250061035156, "loss": 1.0011, "rewards/accuracies": 0.5, "rewards/chosen": -0.00838401447981596, "rewards/margins": -0.004439950454980135, "rewards/rejected": -0.003944064024835825, "step": 2 }, { "epoch": 0.00866738894907909, "grad_norm": 2.6075656414031982, "learning_rate": 9.99421965317919e-07, "logits/chosen": -2.2306013107299805, "logits/rejected": -2.2051498889923096, "logps/chosen": -44.349578857421875, "logps/rejected": -49.07940673828125, "loss": 1.0082, "rewards/accuracies": 0.375, "rewards/chosen": -0.03237631171941757, "rewards/margins": -0.032960619777441025, "rewards/rejected": 0.0005843047983944416, "step": 3 }, { "epoch": 0.01155651859877212, "grad_norm": 2.7039880752563477, "learning_rate": 9.991329479768785e-07, "logits/chosen": -2.2700114250183105, "logits/rejected": -2.219162940979004, "logps/chosen": -39.92502212524414, "logps/rejected": -44.78451919555664, "loss": 1.001, "rewards/accuracies": 0.5, "rewards/chosen": -0.0012441696599125862, "rewards/margins": -0.004121714271605015, "rewards/rejected": 0.002877545077353716, "step": 4 }, { "epoch": 0.01444564824846515, "grad_norm": 2.570429563522339, "learning_rate": 9.98843930635838e-07, "logits/chosen": -2.3640296459198, "logits/rejected": -2.3220138549804688, "logps/chosen": -48.91164779663086, "logps/rejected": -48.63970184326172, "loss": 1.009, "rewards/accuracies": 0.34375, "rewards/chosen": -0.03972390294075012, "rewards/margins": -0.03604067116975784, "rewards/rejected": -0.0036832334008067846, "step": 5 }, { "epoch": 0.01733477789815818, "grad_norm": 2.29892897605896, "learning_rate": 9.985549132947976e-07, "logits/chosen": -2.3818764686584473, "logits/rejected": -2.2983953952789307, "logps/chosen": -43.221649169921875, "logps/rejected": -46.041236877441406, "loss": 1.0013, "rewards/accuracies": 0.5, "rewards/chosen": -0.007948458194732666, "rewards/margins": -0.0050258943811059, "rewards/rejected": -0.002922563813626766, "step": 6 }, { "epoch": 0.020223907547851208, "grad_norm": 2.379246473312378, "learning_rate": 9.982658959537573e-07, "logits/chosen": -2.2162768840789795, "logits/rejected": -2.1959571838378906, "logps/chosen": -46.38663864135742, "logps/rejected": -47.75700378417969, "loss": 0.9963, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0022822804749011993, "rewards/margins": 0.014888251200318336, "rewards/rejected": -0.017170531675219536, "step": 7 }, { "epoch": 0.02311303719754424, "grad_norm": 2.6945853233337402, "learning_rate": 9.979768786127167e-07, "logits/chosen": -2.296346664428711, "logits/rejected": -2.1753225326538086, "logps/chosen": -46.964115142822266, "logps/rejected": -49.2270393371582, "loss": 0.9971, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013342492748051882, "rewards/margins": 0.011715519241988659, "rewards/rejected": -0.013049770146608353, "step": 8 }, { "epoch": 0.02600216684723727, "grad_norm": 2.6182069778442383, "learning_rate": 9.976878612716762e-07, "logits/chosen": -2.3194963932037354, "logits/rejected": -2.3071932792663574, "logps/chosen": -47.32005310058594, "logps/rejected": -49.789939880371094, "loss": 0.9965, "rewards/accuracies": 0.53125, "rewards/chosen": -0.011104549281299114, "rewards/margins": 0.013879906386137009, "rewards/rejected": -0.024984456598758698, "step": 9 }, { "epoch": 0.0288912964969303, "grad_norm": 2.9282901287078857, "learning_rate": 9.973988439306358e-07, "logits/chosen": -2.384018898010254, "logits/rejected": -2.3609778881073, "logps/chosen": -45.86537551879883, "logps/rejected": -48.12544631958008, "loss": 0.9998, "rewards/accuracies": 0.53125, "rewards/chosen": -0.011187266558408737, "rewards/margins": 0.0007130773738026619, "rewards/rejected": -0.01190034206956625, "step": 10 }, { "epoch": 0.03178042614662333, "grad_norm": 2.779256820678711, "learning_rate": 9.971098265895953e-07, "logits/chosen": -2.3103959560394287, "logits/rejected": -2.264667272567749, "logps/chosen": -45.628074645996094, "logps/rejected": -51.19231414794922, "loss": 0.9965, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004043484106659889, "rewards/margins": 0.013859058730304241, "rewards/rejected": -0.009815573692321777, "step": 11 }, { "epoch": 0.03466955579631636, "grad_norm": 2.4561803340911865, "learning_rate": 9.968208092485547e-07, "logits/chosen": -2.250896692276001, "logits/rejected": -2.27902889251709, "logps/chosen": -48.17356872558594, "logps/rejected": -49.72551345825195, "loss": 0.9985, "rewards/accuracies": 0.5625, "rewards/chosen": 0.017966926097869873, "rewards/margins": 0.005964946001768112, "rewards/rejected": 0.012001979164779186, "step": 12 }, { "epoch": 0.03755868544600939, "grad_norm": 3.2101974487304688, "learning_rate": 9.965317919075144e-07, "logits/chosen": -2.2668163776397705, "logits/rejected": -2.2156577110290527, "logps/chosen": -46.649906158447266, "logps/rejected": -47.03781509399414, "loss": 0.9983, "rewards/accuracies": 0.59375, "rewards/chosen": 0.011064433492720127, "rewards/margins": 0.006706194952130318, "rewards/rejected": 0.004358238540589809, "step": 13 }, { "epoch": 0.040447815095702416, "grad_norm": 2.3819403648376465, "learning_rate": 9.96242774566474e-07, "logits/chosen": -2.2456724643707275, "logits/rejected": -2.2415871620178223, "logps/chosen": -45.100318908691406, "logps/rejected": -49.309783935546875, "loss": 0.994, "rewards/accuracies": 0.625, "rewards/chosen": 0.02066171169281006, "rewards/margins": 0.023844540119171143, "rewards/rejected": -0.0031828279606997967, "step": 14 }, { "epoch": 0.04333694474539545, "grad_norm": 2.548259973526001, "learning_rate": 9.959537572254335e-07, "logits/chosen": -2.1900925636291504, "logits/rejected": -2.191520929336548, "logps/chosen": -44.79486083984375, "logps/rejected": -46.691734313964844, "loss": 1.0029, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00569757679477334, "rewards/margins": -0.01161092147231102, "rewards/rejected": 0.017308499664068222, "step": 15 }, { "epoch": 0.04622607439508848, "grad_norm": 2.271209955215454, "learning_rate": 9.95664739884393e-07, "logits/chosen": -2.2223944664001465, "logits/rejected": -2.202390670776367, "logps/chosen": -45.696937561035156, "logps/rejected": -46.02082443237305, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": -0.008063561283051968, "rewards/margins": 0.005261479876935482, "rewards/rejected": -0.01332504115998745, "step": 16 }, { "epoch": 0.04911520404478151, "grad_norm": 2.795358180999756, "learning_rate": 9.953757225433526e-07, "logits/chosen": -2.2889254093170166, "logits/rejected": -2.2782371044158936, "logps/chosen": -45.277645111083984, "logps/rejected": -47.96349334716797, "loss": 0.9992, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01219995692372322, "rewards/margins": 0.003160005435347557, "rewards/rejected": -0.015359962359070778, "step": 17 }, { "epoch": 0.05200433369447454, "grad_norm": 2.477508544921875, "learning_rate": 9.95086705202312e-07, "logits/chosen": -2.3498191833496094, "logits/rejected": -2.2757749557495117, "logps/chosen": -43.86589813232422, "logps/rejected": -47.35736846923828, "loss": 1.0024, "rewards/accuracies": 0.46875, "rewards/chosen": -0.038532111793756485, "rewards/margins": -0.009746897965669632, "rewards/rejected": -0.02878521755337715, "step": 18 }, { "epoch": 0.05489346334416757, "grad_norm": 4.256105899810791, "learning_rate": 9.947976878612715e-07, "logits/chosen": -2.2930595874786377, "logits/rejected": -2.2665107250213623, "logps/chosen": -45.19348907470703, "logps/rejected": -49.08800506591797, "loss": 0.9944, "rewards/accuracies": 0.5625, "rewards/chosen": -0.026201710104942322, "rewards/margins": 0.022400666028261185, "rewards/rejected": -0.04860237240791321, "step": 19 }, { "epoch": 0.0577825929938606, "grad_norm": 2.7938969135284424, "learning_rate": 9.945086705202312e-07, "logits/chosen": -2.343785047531128, "logits/rejected": -2.317864418029785, "logps/chosen": -48.817501068115234, "logps/rejected": -50.040794372558594, "loss": 0.9976, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014661613386124372, "rewards/margins": 0.009426183067262173, "rewards/rejected": -0.010892343707382679, "step": 20 }, { "epoch": 0.06067172264355363, "grad_norm": 3.319253921508789, "learning_rate": 9.942196531791906e-07, "logits/chosen": -2.327411413192749, "logits/rejected": -2.276134490966797, "logps/chosen": -46.324459075927734, "logps/rejected": -47.910125732421875, "loss": 0.9932, "rewards/accuracies": 0.75, "rewards/chosen": 0.023560144007205963, "rewards/margins": 0.0272036325186491, "rewards/rejected": -0.003643488045781851, "step": 21 }, { "epoch": 0.06356085229324666, "grad_norm": 2.5584089756011963, "learning_rate": 9.939306358381503e-07, "logits/chosen": -2.2838125228881836, "logits/rejected": -2.260859727859497, "logps/chosen": -44.116668701171875, "logps/rejected": -46.03076934814453, "loss": 0.9955, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006487350910902023, "rewards/margins": 0.0178131815046072, "rewards/rejected": -0.018461918458342552, "step": 22 }, { "epoch": 0.06644998194293969, "grad_norm": 3.210329532623291, "learning_rate": 9.936416184971098e-07, "logits/chosen": -2.256563663482666, "logits/rejected": -2.2027249336242676, "logps/chosen": -43.3868293762207, "logps/rejected": -44.468902587890625, "loss": 0.9971, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002043062588199973, "rewards/margins": 0.011789566837251186, "rewards/rejected": -0.013832628726959229, "step": 23 }, { "epoch": 0.06933911159263272, "grad_norm": 2.5849857330322266, "learning_rate": 9.933526011560694e-07, "logits/chosen": -2.2836403846740723, "logits/rejected": -2.2356815338134766, "logps/chosen": -47.08574676513672, "logps/rejected": -46.028846740722656, "loss": 0.9996, "rewards/accuracies": 0.5, "rewards/chosen": -0.024665724486112595, "rewards/margins": 0.001551418798044324, "rewards/rejected": -0.026217145845294, "step": 24 }, { "epoch": 0.07222824124232576, "grad_norm": 2.604755401611328, "learning_rate": 9.930635838150289e-07, "logits/chosen": -2.2649636268615723, "logits/rejected": -2.2321090698242188, "logps/chosen": -46.79102325439453, "logps/rejected": -48.006805419921875, "loss": 0.9964, "rewards/accuracies": 0.5, "rewards/chosen": -0.015872038900852203, "rewards/margins": 0.014624142087996006, "rewards/rejected": -0.030496181920170784, "step": 25 }, { "epoch": 0.07511737089201878, "grad_norm": 2.5735223293304443, "learning_rate": 9.927745664739883e-07, "logits/chosen": -2.26102352142334, "logits/rejected": -2.2093541622161865, "logps/chosen": -47.01689910888672, "logps/rejected": -47.01516342163086, "loss": 0.9922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.007432263810187578, "rewards/margins": 0.031172877177596092, "rewards/rejected": -0.03860514611005783, "step": 26 }, { "epoch": 0.0780065005417118, "grad_norm": 2.566977024078369, "learning_rate": 9.92485549132948e-07, "logits/chosen": -2.2537708282470703, "logits/rejected": -2.2898244857788086, "logps/chosen": -45.33720397949219, "logps/rejected": -47.98091125488281, "loss": 0.9985, "rewards/accuracies": 0.5, "rewards/chosen": -0.03447738289833069, "rewards/margins": 0.0060107288882136345, "rewards/rejected": -0.0404881089925766, "step": 27 }, { "epoch": 0.08089563019140483, "grad_norm": 2.583399534225464, "learning_rate": 9.921965317919074e-07, "logits/chosen": -2.2943978309631348, "logits/rejected": -2.256737232208252, "logps/chosen": -44.241424560546875, "logps/rejected": -47.46023941040039, "loss": 0.9975, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01808394119143486, "rewards/margins": 0.009940600953996181, "rewards/rejected": -0.028024546802043915, "step": 28 }, { "epoch": 0.08378475984109787, "grad_norm": 2.5404512882232666, "learning_rate": 9.919075144508669e-07, "logits/chosen": -2.2348742485046387, "logits/rejected": -2.228454113006592, "logps/chosen": -43.323028564453125, "logps/rejected": -49.72521209716797, "loss": 0.9953, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01807316765189171, "rewards/margins": 0.018755722790956497, "rewards/rejected": -0.0006825551390647888, "step": 29 }, { "epoch": 0.0866738894907909, "grad_norm": 3.0930638313293457, "learning_rate": 9.916184971098265e-07, "logits/chosen": -2.364201068878174, "logits/rejected": -2.3086743354797363, "logps/chosen": -47.379703521728516, "logps/rejected": -46.97401428222656, "loss": 0.9904, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0015128243248909712, "rewards/margins": 0.038638047873973846, "rewards/rejected": -0.040150873363018036, "step": 30 }, { "epoch": 0.08956301914048392, "grad_norm": 2.8373796939849854, "learning_rate": 9.913294797687862e-07, "logits/chosen": -2.3252105712890625, "logits/rejected": -2.2393128871917725, "logps/chosen": -50.974849700927734, "logps/rejected": -50.64263916015625, "loss": 0.9881, "rewards/accuracies": 0.6875, "rewards/chosen": -0.034422945231199265, "rewards/margins": 0.047944869846105576, "rewards/rejected": -0.08236781507730484, "step": 31 }, { "epoch": 0.09245214879017696, "grad_norm": 2.9654507637023926, "learning_rate": 9.910404624277457e-07, "logits/chosen": -2.3276240825653076, "logits/rejected": -2.2649993896484375, "logps/chosen": -46.466129302978516, "logps/rejected": -50.508644104003906, "loss": 0.9956, "rewards/accuracies": 0.625, "rewards/chosen": -0.01797034777700901, "rewards/margins": 0.017593324184417725, "rewards/rejected": -0.035563670098781586, "step": 32 }, { "epoch": 0.09534127843986999, "grad_norm": 4.158166408538818, "learning_rate": 9.90751445086705e-07, "logits/chosen": -2.233234405517578, "logits/rejected": -2.2040791511535645, "logps/chosen": -48.05862045288086, "logps/rejected": -45.87712860107422, "loss": 0.9944, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03222387656569481, "rewards/margins": 0.022392649203538895, "rewards/rejected": -0.0546165332198143, "step": 33 }, { "epoch": 0.09823040808956301, "grad_norm": 2.475355863571167, "learning_rate": 9.904624277456648e-07, "logits/chosen": -2.336580991744995, "logits/rejected": -2.3329074382781982, "logps/chosen": -46.53982162475586, "logps/rejected": -50.89897537231445, "loss": 0.9918, "rewards/accuracies": 0.625, "rewards/chosen": -0.003178446553647518, "rewards/margins": 0.032921887934207916, "rewards/rejected": -0.03610033541917801, "step": 34 }, { "epoch": 0.10111953773925605, "grad_norm": 3.352144718170166, "learning_rate": 9.901734104046242e-07, "logits/chosen": -2.266047239303589, "logits/rejected": -2.255530834197998, "logps/chosen": -44.91516876220703, "logps/rejected": -49.15069580078125, "loss": 0.9922, "rewards/accuracies": 0.625, "rewards/chosen": -0.018226830288767815, "rewards/margins": 0.03153911232948303, "rewards/rejected": -0.049765948206186295, "step": 35 }, { "epoch": 0.10400866738894908, "grad_norm": 2.782752275466919, "learning_rate": 9.898843930635837e-07, "logits/chosen": -2.268705368041992, "logits/rejected": -2.2047626972198486, "logps/chosen": -45.6903076171875, "logps/rejected": -50.264381408691406, "loss": 0.9825, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01599123515188694, "rewards/margins": 0.07028627395629883, "rewards/rejected": -0.05429503694176674, "step": 36 }, { "epoch": 0.1068977970386421, "grad_norm": 2.620342969894409, "learning_rate": 9.895953757225433e-07, "logits/chosen": -2.295877456665039, "logits/rejected": -2.2715611457824707, "logps/chosen": -44.10856628417969, "logps/rejected": -47.521820068359375, "loss": 0.9891, "rewards/accuracies": 0.75, "rewards/chosen": -0.02145606093108654, "rewards/margins": 0.04387184605002403, "rewards/rejected": -0.06532790511846542, "step": 37 }, { "epoch": 0.10978692668833515, "grad_norm": 2.4181268215179443, "learning_rate": 9.89306358381503e-07, "logits/chosen": -2.2570910453796387, "logits/rejected": -2.2162439823150635, "logps/chosen": -44.422916412353516, "logps/rejected": -45.88962173461914, "loss": 0.987, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02386242151260376, "rewards/margins": 0.05209946259856224, "rewards/rejected": -0.0759618803858757, "step": 38 }, { "epoch": 0.11267605633802817, "grad_norm": 2.8675639629364014, "learning_rate": 9.890173410404624e-07, "logits/chosen": -2.284738779067993, "logits/rejected": -2.2369132041931152, "logps/chosen": -44.61423110961914, "logps/rejected": -49.159942626953125, "loss": 0.9914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.009722869843244553, "rewards/margins": 0.034449003636837006, "rewards/rejected": -0.04417186975479126, "step": 39 }, { "epoch": 0.1155651859877212, "grad_norm": 2.3255810737609863, "learning_rate": 9.887283236994219e-07, "logits/chosen": -2.283201217651367, "logits/rejected": -2.288475513458252, "logps/chosen": -43.80707550048828, "logps/rejected": -47.49113082885742, "loss": 0.9877, "rewards/accuracies": 0.75, "rewards/chosen": 0.0072800163179636, "rewards/margins": 0.04947707802057266, "rewards/rejected": -0.04219706356525421, "step": 40 }, { "epoch": 0.11845431563741422, "grad_norm": 3.4773473739624023, "learning_rate": 9.884393063583815e-07, "logits/chosen": -2.296431064605713, "logits/rejected": -2.2959694862365723, "logps/chosen": -46.476619720458984, "logps/rejected": -49.464813232421875, "loss": 0.9851, "rewards/accuracies": 0.75, "rewards/chosen": -0.01301040593534708, "rewards/margins": 0.05961364135146141, "rewards/rejected": -0.07262405008077621, "step": 41 }, { "epoch": 0.12134344528710726, "grad_norm": 2.582150936126709, "learning_rate": 9.88150289017341e-07, "logits/chosen": -2.3093783855438232, "logits/rejected": -2.281991958618164, "logps/chosen": -44.1807975769043, "logps/rejected": -48.594276428222656, "loss": 0.9869, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0009051505476236343, "rewards/margins": 0.052667662501335144, "rewards/rejected": -0.05176251754164696, "step": 42 }, { "epoch": 0.12423257493680029, "grad_norm": 2.570236921310425, "learning_rate": 9.878612716763004e-07, "logits/chosen": -2.2545926570892334, "logits/rejected": -2.2202649116516113, "logps/chosen": -49.61861038208008, "logps/rejected": -47.759761810302734, "loss": 0.9894, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03398113325238228, "rewards/margins": 0.04263947159051895, "rewards/rejected": -0.07662060856819153, "step": 43 }, { "epoch": 0.12712170458649333, "grad_norm": 3.0656914710998535, "learning_rate": 9.875722543352601e-07, "logits/chosen": -2.2744696140289307, "logits/rejected": -2.2654876708984375, "logps/chosen": -45.46676254272461, "logps/rejected": -48.63860321044922, "loss": 0.9813, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0059118326753377914, "rewards/margins": 0.07511584460735321, "rewards/rejected": -0.06920401751995087, "step": 44 }, { "epoch": 0.13001083423618634, "grad_norm": 2.3484809398651123, "learning_rate": 9.872832369942196e-07, "logits/chosen": -2.2918381690979004, "logits/rejected": -2.275817394256592, "logps/chosen": -46.429229736328125, "logps/rejected": -47.27028274536133, "loss": 0.9848, "rewards/accuracies": 0.75, "rewards/chosen": -0.04199044778943062, "rewards/margins": 0.061182651668787, "rewards/rejected": -0.10317309200763702, "step": 45 }, { "epoch": 0.13289996388587938, "grad_norm": 2.777702569961548, "learning_rate": 9.869942196531792e-07, "logits/chosen": -2.2852606773376465, "logits/rejected": -2.2880396842956543, "logps/chosen": -44.83600616455078, "logps/rejected": -49.17924499511719, "loss": 0.9785, "rewards/accuracies": 0.75, "rewards/chosen": -0.02687942236661911, "rewards/margins": 0.08664777129888535, "rewards/rejected": -0.11352720111608505, "step": 46 }, { "epoch": 0.13578909353557242, "grad_norm": 3.208728075027466, "learning_rate": 9.867052023121387e-07, "logits/chosen": -2.3025739192962646, "logits/rejected": -2.2718827724456787, "logps/chosen": -49.72447204589844, "logps/rejected": -52.94569396972656, "loss": 0.9722, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03683493286371231, "rewards/margins": 0.11167352646589279, "rewards/rejected": -0.1485084593296051, "step": 47 }, { "epoch": 0.13867822318526543, "grad_norm": 2.6031839847564697, "learning_rate": 9.864161849710983e-07, "logits/chosen": -2.2706146240234375, "logits/rejected": -2.24666428565979, "logps/chosen": -47.76182556152344, "logps/rejected": -48.919288635253906, "loss": 0.978, "rewards/accuracies": 0.65625, "rewards/chosen": -0.053675539791584015, "rewards/margins": 0.08843037486076355, "rewards/rejected": -0.14210590720176697, "step": 48 }, { "epoch": 0.14156735283495847, "grad_norm": 2.9281086921691895, "learning_rate": 9.861271676300578e-07, "logits/chosen": -2.339927911758423, "logits/rejected": -2.32120418548584, "logps/chosen": -45.86162185668945, "logps/rejected": -50.570743560791016, "loss": 0.9876, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0440777949988842, "rewards/margins": 0.0497584342956543, "rewards/rejected": -0.0938362255692482, "step": 49 }, { "epoch": 0.1444564824846515, "grad_norm": 2.851905345916748, "learning_rate": 9.858381502890172e-07, "logits/chosen": -2.3085618019104004, "logits/rejected": -2.2658281326293945, "logps/chosen": -46.89045333862305, "logps/rejected": -50.59040451049805, "loss": 0.9749, "rewards/accuracies": 0.75, "rewards/chosen": -0.001536625437438488, "rewards/margins": 0.10088874399662018, "rewards/rejected": -0.10242536664009094, "step": 50 }, { "epoch": 0.14734561213434452, "grad_norm": 2.6556220054626465, "learning_rate": 9.855491329479769e-07, "logits/chosen": -2.2781286239624023, "logits/rejected": -2.2676360607147217, "logps/chosen": -45.85409164428711, "logps/rejected": -47.48622131347656, "loss": 0.9805, "rewards/accuracies": 0.75, "rewards/chosen": -0.05080262944102287, "rewards/margins": 0.07841161638498306, "rewards/rejected": -0.12921422719955444, "step": 51 }, { "epoch": 0.15023474178403756, "grad_norm": 2.669827461242676, "learning_rate": 9.852601156069363e-07, "logits/chosen": -2.293513774871826, "logits/rejected": -2.2106375694274902, "logps/chosen": -45.423118591308594, "logps/rejected": -48.59331512451172, "loss": 0.979, "rewards/accuracies": 0.65625, "rewards/chosen": -0.013767587020993233, "rewards/margins": 0.08427857607603073, "rewards/rejected": -0.09804615378379822, "step": 52 }, { "epoch": 0.1531238714337306, "grad_norm": 2.712538480758667, "learning_rate": 9.849710982658958e-07, "logits/chosen": -2.282120943069458, "logits/rejected": -2.2388973236083984, "logps/chosen": -45.62527084350586, "logps/rejected": -47.03450012207031, "loss": 0.9801, "rewards/accuracies": 0.84375, "rewards/chosen": -0.031670790165662766, "rewards/margins": 0.07997889071702957, "rewards/rejected": -0.11164967715740204, "step": 53 }, { "epoch": 0.1560130010834236, "grad_norm": 2.694411277770996, "learning_rate": 9.846820809248555e-07, "logits/chosen": -2.2926647663116455, "logits/rejected": -2.2434275150299072, "logps/chosen": -46.5322151184082, "logps/rejected": -49.94254684448242, "loss": 0.973, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06836049258708954, "rewards/margins": 0.10836561024188995, "rewards/rejected": -0.1767261028289795, "step": 54 }, { "epoch": 0.15890213073311665, "grad_norm": 2.566436290740967, "learning_rate": 9.843930635838151e-07, "logits/chosen": -2.2875001430511475, "logits/rejected": -2.239607095718384, "logps/chosen": -45.26957321166992, "logps/rejected": -48.66908264160156, "loss": 0.9728, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019489873200654984, "rewards/margins": 0.10923899710178375, "rewards/rejected": -0.12872888147830963, "step": 55 }, { "epoch": 0.16179126038280967, "grad_norm": 2.879613161087036, "learning_rate": 9.841040462427746e-07, "logits/chosen": -2.2845618724823, "logits/rejected": -2.284653425216675, "logps/chosen": -46.61941146850586, "logps/rejected": -50.952274322509766, "loss": 0.9689, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02038157358765602, "rewards/margins": 0.12514248490333557, "rewards/rejected": -0.1455240696668625, "step": 56 }, { "epoch": 0.1646803900325027, "grad_norm": 12.369844436645508, "learning_rate": 9.83815028901734e-07, "logits/chosen": -2.227327346801758, "logits/rejected": -2.254575490951538, "logps/chosen": -47.10674285888672, "logps/rejected": -52.392372131347656, "loss": 0.9696, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011986589059233665, "rewards/margins": 0.12231485545635223, "rewards/rejected": -0.13430143892765045, "step": 57 }, { "epoch": 0.16756951968219574, "grad_norm": 2.676523208618164, "learning_rate": 9.835260115606937e-07, "logits/chosen": -2.251145362854004, "logits/rejected": -2.2389659881591797, "logps/chosen": -43.23997497558594, "logps/rejected": -46.34592056274414, "loss": 0.9746, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08189845085144043, "rewards/margins": 0.10253530740737915, "rewards/rejected": -0.18443375825881958, "step": 58 }, { "epoch": 0.17045864933188876, "grad_norm": 2.6484291553497314, "learning_rate": 9.832369942196531e-07, "logits/chosen": -2.3022546768188477, "logits/rejected": -2.2444005012512207, "logps/chosen": -43.91075897216797, "logps/rejected": -50.577049255371094, "loss": 0.9962, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1366237998008728, "rewards/margins": 0.015846164897084236, "rewards/rejected": -0.1524699479341507, "step": 59 }, { "epoch": 0.1733477789815818, "grad_norm": 2.888946771621704, "learning_rate": 9.829479768786126e-07, "logits/chosen": -2.3209028244018555, "logits/rejected": -2.2555549144744873, "logps/chosen": -47.39657974243164, "logps/rejected": -50.446693420410156, "loss": 0.9789, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08920682221651077, "rewards/margins": 0.08598750084638596, "rewards/rejected": -0.17519432306289673, "step": 60 }, { "epoch": 0.17623690863127484, "grad_norm": 2.5530812740325928, "learning_rate": 9.826589595375722e-07, "logits/chosen": -2.270442247390747, "logits/rejected": -2.2388758659362793, "logps/chosen": -42.512901306152344, "logps/rejected": -51.3780632019043, "loss": 0.9779, "rewards/accuracies": 0.78125, "rewards/chosen": -0.055780354887247086, "rewards/margins": 0.08908979594707489, "rewards/rejected": -0.14487016201019287, "step": 61 }, { "epoch": 0.17912603828096785, "grad_norm": 2.566180467605591, "learning_rate": 9.82369942196532e-07, "logits/chosen": -2.28181791305542, "logits/rejected": -2.249518632888794, "logps/chosen": -46.51033401489258, "logps/rejected": -49.537662506103516, "loss": 0.9694, "rewards/accuracies": 0.78125, "rewards/chosen": -0.03622724860906601, "rewards/margins": 0.12354650348424911, "rewards/rejected": -0.15977376699447632, "step": 62 }, { "epoch": 0.1820151679306609, "grad_norm": 3.616037607192993, "learning_rate": 9.820809248554913e-07, "logits/chosen": -2.231435537338257, "logits/rejected": -2.235358238220215, "logps/chosen": -48.959861755371094, "logps/rejected": -49.414825439453125, "loss": 0.9486, "rewards/accuracies": 0.875, "rewards/chosen": -0.036105044186115265, "rewards/margins": 0.20781224966049194, "rewards/rejected": -0.2439172863960266, "step": 63 }, { "epoch": 0.18490429758035393, "grad_norm": 3.160766124725342, "learning_rate": 9.817919075144508e-07, "logits/chosen": -2.2667908668518066, "logits/rejected": -2.2644991874694824, "logps/chosen": -47.29449462890625, "logps/rejected": -50.08894348144531, "loss": 0.9633, "rewards/accuracies": 0.8125, "rewards/chosen": 0.00962911732494831, "rewards/margins": 0.14762884378433228, "rewards/rejected": -0.13799969851970673, "step": 64 }, { "epoch": 0.18779342723004694, "grad_norm": 3.2306325435638428, "learning_rate": 9.815028901734105e-07, "logits/chosen": -2.282207727432251, "logits/rejected": -2.2431702613830566, "logps/chosen": -47.2364501953125, "logps/rejected": -53.247737884521484, "loss": 0.956, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07210152596235275, "rewards/margins": 0.17883284389972687, "rewards/rejected": -0.250934362411499, "step": 65 }, { "epoch": 0.19068255687973998, "grad_norm": 2.714399576187134, "learning_rate": 9.8121387283237e-07, "logits/chosen": -2.31252121925354, "logits/rejected": -2.3363687992095947, "logps/chosen": -44.12818908691406, "logps/rejected": -49.0745735168457, "loss": 0.9547, "rewards/accuracies": 0.875, "rewards/chosen": -0.0644940435886383, "rewards/margins": 0.1830126941204071, "rewards/rejected": -0.24750672280788422, "step": 66 }, { "epoch": 0.19357168652943302, "grad_norm": 3.167468786239624, "learning_rate": 9.809248554913294e-07, "logits/chosen": -2.3152239322662354, "logits/rejected": -2.3467743396759033, "logps/chosen": -46.754032135009766, "logps/rejected": -50.095436096191406, "loss": 0.9521, "rewards/accuracies": 0.84375, "rewards/chosen": 0.01586472988128662, "rewards/margins": 0.19368241727352142, "rewards/rejected": -0.1778176873922348, "step": 67 }, { "epoch": 0.19646081617912603, "grad_norm": 2.832882881164551, "learning_rate": 9.80635838150289e-07, "logits/chosen": -2.349510431289673, "logits/rejected": -2.301586866378784, "logps/chosen": -46.56528854370117, "logps/rejected": -51.07681655883789, "loss": 0.9568, "rewards/accuracies": 0.75, "rewards/chosen": -0.10664601624011993, "rewards/margins": 0.17626953125, "rewards/rejected": -0.28291553258895874, "step": 68 }, { "epoch": 0.19934994582881907, "grad_norm": 4.608213424682617, "learning_rate": 9.803468208092485e-07, "logits/chosen": -2.3525147438049316, "logits/rejected": -2.336215019226074, "logps/chosen": -47.91854476928711, "logps/rejected": -50.124549865722656, "loss": 0.9669, "rewards/accuracies": 0.75, "rewards/chosen": -0.11560079455375671, "rewards/margins": 0.1341520994901657, "rewards/rejected": -0.24975289404392242, "step": 69 }, { "epoch": 0.2022390754785121, "grad_norm": 3.1157894134521484, "learning_rate": 9.800578034682081e-07, "logits/chosen": -2.315807580947876, "logits/rejected": -2.253732442855835, "logps/chosen": -49.6510124206543, "logps/rejected": -52.322776794433594, "loss": 0.9578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10318784415721893, "rewards/margins": 0.17081809043884277, "rewards/rejected": -0.2740059494972229, "step": 70 }, { "epoch": 0.2022390754785121, "eval_logits/chosen": -2.300962209701538, "eval_logits/rejected": -2.2598252296447754, "eval_logps/chosen": -46.50691604614258, "eval_logps/rejected": -49.15951919555664, "eval_loss": 0.9500161409378052, "eval_rewards/accuracies": 0.9032257795333862, "eval_rewards/chosen": -0.01905054785311222, "eval_rewards/margins": 0.20188362896442413, "eval_rewards/rejected": -0.2209341675043106, "eval_runtime": 223.9678, "eval_samples_per_second": 0.549, "eval_steps_per_second": 0.277, "step": 70 }, { "epoch": 0.20512820512820512, "grad_norm": 2.493720769882202, "learning_rate": 9.797687861271676e-07, "logits/chosen": -2.288285732269287, "logits/rejected": -2.2501378059387207, "logps/chosen": -45.0571403503418, "logps/rejected": -49.26872634887695, "loss": 0.9688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09671956300735474, "rewards/margins": 0.1267600655555725, "rewards/rejected": -0.22347962856292725, "step": 71 }, { "epoch": 0.20801733477789816, "grad_norm": 2.669626474380493, "learning_rate": 9.794797687861272e-07, "logits/chosen": -2.289896011352539, "logits/rejected": -2.2698183059692383, "logps/chosen": -45.88307571411133, "logps/rejected": -52.062286376953125, "loss": 0.9762, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1288202553987503, "rewards/margins": 0.09532952308654785, "rewards/rejected": -0.22414979338645935, "step": 72 }, { "epoch": 0.2109064644275912, "grad_norm": 3.215147018432617, "learning_rate": 9.791907514450867e-07, "logits/chosen": -2.2705891132354736, "logits/rejected": -2.2824974060058594, "logps/chosen": -48.54786682128906, "logps/rejected": -52.85610580444336, "loss": 0.949, "rewards/accuracies": 0.84375, "rewards/chosen": -0.15248575806617737, "rewards/margins": 0.20824581384658813, "rewards/rejected": -0.3607315719127655, "step": 73 }, { "epoch": 0.2137955940772842, "grad_norm": 3.1103947162628174, "learning_rate": 9.789017341040461e-07, "logits/chosen": -2.301731586456299, "logits/rejected": -2.2555320262908936, "logps/chosen": -47.282073974609375, "logps/rejected": -50.781349182128906, "loss": 0.9531, "rewards/accuracies": 0.75, "rewards/chosen": -0.08309256285429001, "rewards/margins": 0.19135484099388123, "rewards/rejected": -0.27444738149642944, "step": 74 }, { "epoch": 0.21668472372697725, "grad_norm": 2.8817436695098877, "learning_rate": 9.786127167630058e-07, "logits/chosen": -2.3043715953826904, "logits/rejected": -2.2508349418640137, "logps/chosen": -47.059024810791016, "logps/rejected": -50.260780334472656, "loss": 0.9494, "rewards/accuracies": 0.90625, "rewards/chosen": -0.11742958426475525, "rewards/margins": 0.20674584805965424, "rewards/rejected": -0.3241754174232483, "step": 75 }, { "epoch": 0.2195738533766703, "grad_norm": 3.0267608165740967, "learning_rate": 9.783236994219653e-07, "logits/chosen": -2.285388946533203, "logits/rejected": -2.30926513671875, "logps/chosen": -49.096988677978516, "logps/rejected": -52.603946685791016, "loss": 0.9362, "rewards/accuracies": 0.875, "rewards/chosen": -0.041618913412094116, "rewards/margins": 0.26182615756988525, "rewards/rejected": -0.30344507098197937, "step": 76 }, { "epoch": 0.2224629830263633, "grad_norm": 3.032710313796997, "learning_rate": 9.780346820809247e-07, "logits/chosen": -2.3650362491607666, "logits/rejected": -2.280951738357544, "logps/chosen": -45.961551666259766, "logps/rejected": -53.615760803222656, "loss": 0.9454, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01917697675526142, "rewards/margins": 0.22113394737243652, "rewards/rejected": -0.2403109222650528, "step": 77 }, { "epoch": 0.22535211267605634, "grad_norm": 2.690061092376709, "learning_rate": 9.777456647398844e-07, "logits/chosen": -2.23842453956604, "logits/rejected": -2.2224130630493164, "logps/chosen": -47.41105270385742, "logps/rejected": -52.246978759765625, "loss": 0.9596, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19264782965183258, "rewards/margins": 0.1677761971950531, "rewards/rejected": -0.3604240119457245, "step": 78 }, { "epoch": 0.22824124232574936, "grad_norm": 3.0419321060180664, "learning_rate": 9.77456647398844e-07, "logits/chosen": -2.344222068786621, "logits/rejected": -2.292158842086792, "logps/chosen": -48.342960357666016, "logps/rejected": -52.0751953125, "loss": 0.9435, "rewards/accuracies": 0.875, "rewards/chosen": -0.11126381158828735, "rewards/margins": 0.23090822994709015, "rewards/rejected": -0.3421720266342163, "step": 79 }, { "epoch": 0.2311303719754424, "grad_norm": 2.889025926589966, "learning_rate": 9.771676300578035e-07, "logits/chosen": -2.3186588287353516, "logits/rejected": -2.2908332347869873, "logps/chosen": -47.47981643676758, "logps/rejected": -51.05913543701172, "loss": 0.9315, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05798480659723282, "rewards/margins": 0.2803359925746918, "rewards/rejected": -0.338320791721344, "step": 80 }, { "epoch": 0.23401950162513543, "grad_norm": 2.9459567070007324, "learning_rate": 9.76878612716763e-07, "logits/chosen": -2.2692413330078125, "logits/rejected": -2.251190662384033, "logps/chosen": -46.03938674926758, "logps/rejected": -49.480796813964844, "loss": 0.9459, "rewards/accuracies": 0.75, "rewards/chosen": -0.1792960911989212, "rewards/margins": 0.22361961007118225, "rewards/rejected": -0.40291571617126465, "step": 81 }, { "epoch": 0.23690863127482845, "grad_norm": 6.810198783874512, "learning_rate": 9.765895953757226e-07, "logits/chosen": -2.313509225845337, "logits/rejected": -2.217852830886841, "logps/chosen": -48.95508575439453, "logps/rejected": -51.63447570800781, "loss": 0.945, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1272617131471634, "rewards/margins": 0.22542613744735718, "rewards/rejected": -0.35268789529800415, "step": 82 }, { "epoch": 0.23979776092452149, "grad_norm": 2.844003200531006, "learning_rate": 9.76300578034682e-07, "logits/chosen": -2.357997179031372, "logits/rejected": -2.279188871383667, "logps/chosen": -45.44438552856445, "logps/rejected": -47.394935607910156, "loss": 0.9445, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06597324460744858, "rewards/margins": 0.2276672124862671, "rewards/rejected": -0.29364046454429626, "step": 83 }, { "epoch": 0.24268689057421453, "grad_norm": 14.226211547851562, "learning_rate": 9.760115606936415e-07, "logits/chosen": -2.334857702255249, "logits/rejected": -2.3090269565582275, "logps/chosen": -47.508766174316406, "logps/rejected": -54.94401931762695, "loss": 0.9338, "rewards/accuracies": 0.875, "rewards/chosen": -0.0756339579820633, "rewards/margins": 0.274461567401886, "rewards/rejected": -0.3500955402851105, "step": 84 }, { "epoch": 0.24557602022390754, "grad_norm": 3.0864241123199463, "learning_rate": 9.757225433526011e-07, "logits/chosen": -2.2955594062805176, "logits/rejected": -2.254427433013916, "logps/chosen": -45.880043029785156, "logps/rejected": -52.78728103637695, "loss": 0.9239, "rewards/accuracies": 0.875, "rewards/chosen": -0.03306727856397629, "rewards/margins": 0.3127743601799011, "rewards/rejected": -0.3458416163921356, "step": 85 }, { "epoch": 0.24846514987360058, "grad_norm": 3.2545931339263916, "learning_rate": 9.754335260115608e-07, "logits/chosen": -2.2423698902130127, "logits/rejected": -2.194934844970703, "logps/chosen": -45.584346771240234, "logps/rejected": -53.056671142578125, "loss": 0.9258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06997516006231308, "rewards/margins": 0.30728572607040405, "rewards/rejected": -0.37726089358329773, "step": 86 }, { "epoch": 0.2513542795232936, "grad_norm": 3.3431127071380615, "learning_rate": 9.751445086705203e-07, "logits/chosen": -2.2570557594299316, "logits/rejected": -2.286752700805664, "logps/chosen": -44.26374053955078, "logps/rejected": -52.50733947753906, "loss": 0.9392, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10946682840585709, "rewards/margins": 0.24984873831272125, "rewards/rejected": -0.3593156039714813, "step": 87 }, { "epoch": 0.25424340917298666, "grad_norm": 3.3322982788085938, "learning_rate": 9.748554913294797e-07, "logits/chosen": -2.340938091278076, "logits/rejected": -2.262331485748291, "logps/chosen": -50.1177978515625, "logps/rejected": -52.47159957885742, "loss": 0.9273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11067531257867813, "rewards/margins": 0.305046021938324, "rewards/rejected": -0.4157213270664215, "step": 88 }, { "epoch": 0.25713253882267967, "grad_norm": 3.0395636558532715, "learning_rate": 9.745664739884394e-07, "logits/chosen": -2.3276443481445312, "logits/rejected": -2.270548105239868, "logps/chosen": -48.72882843017578, "logps/rejected": -53.95570373535156, "loss": 0.9359, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12068435549736023, "rewards/margins": 0.26502496004104614, "rewards/rejected": -0.385709285736084, "step": 89 }, { "epoch": 0.2600216684723727, "grad_norm": 2.628474712371826, "learning_rate": 9.742774566473988e-07, "logits/chosen": -2.276275396347046, "logits/rejected": -2.265517234802246, "logps/chosen": -46.186222076416016, "logps/rejected": -54.174522399902344, "loss": 0.9345, "rewards/accuracies": 0.84375, "rewards/chosen": -0.10530070215463638, "rewards/margins": 0.2738332152366638, "rewards/rejected": -0.3791339099407196, "step": 90 }, { "epoch": 0.26291079812206575, "grad_norm": 3.761831760406494, "learning_rate": 9.739884393063583e-07, "logits/chosen": -2.288990020751953, "logits/rejected": -2.2979166507720947, "logps/chosen": -45.146018981933594, "logps/rejected": -50.666404724121094, "loss": 0.9057, "rewards/accuracies": 0.875, "rewards/chosen": 0.030712779611349106, "rewards/margins": 0.3899651765823364, "rewards/rejected": -0.359252393245697, "step": 91 }, { "epoch": 0.26579992777175876, "grad_norm": 2.8075027465820312, "learning_rate": 9.73699421965318e-07, "logits/chosen": -2.3509256839752197, "logits/rejected": -2.326643228530884, "logps/chosen": -48.01289749145508, "logps/rejected": -55.8663444519043, "loss": 0.9364, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23900941014289856, "rewards/margins": 0.26733824610710144, "rewards/rejected": -0.5063477158546448, "step": 92 }, { "epoch": 0.26868905742145177, "grad_norm": 3.1415743827819824, "learning_rate": 9.734104046242774e-07, "logits/chosen": -2.2719314098358154, "logits/rejected": -2.2735722064971924, "logps/chosen": -47.5028190612793, "logps/rejected": -51.72547912597656, "loss": 0.8967, "rewards/accuracies": 0.875, "rewards/chosen": -0.09682731330394745, "rewards/margins": 0.4346972703933716, "rewards/rejected": -0.531524658203125, "step": 93 }, { "epoch": 0.27157818707114484, "grad_norm": 3.763679265975952, "learning_rate": 9.73121387283237e-07, "logits/chosen": -2.2680459022521973, "logits/rejected": -2.25370717048645, "logps/chosen": -46.5444450378418, "logps/rejected": -53.215614318847656, "loss": 0.8852, "rewards/accuracies": 0.84375, "rewards/chosen": 0.04018138349056244, "rewards/margins": 0.47995519638061523, "rewards/rejected": -0.4397737979888916, "step": 94 }, { "epoch": 0.27446731672083785, "grad_norm": 3.3367152214050293, "learning_rate": 9.728323699421965e-07, "logits/chosen": -2.2629644870758057, "logits/rejected": -2.266263484954834, "logps/chosen": -52.274593353271484, "logps/rejected": -56.912479400634766, "loss": 0.9276, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25155460834503174, "rewards/margins": 0.3155984878540039, "rewards/rejected": -0.5671530365943909, "step": 95 }, { "epoch": 0.27735644637053086, "grad_norm": 19.03712272644043, "learning_rate": 9.725433526011562e-07, "logits/chosen": -2.3371129035949707, "logits/rejected": -2.333841323852539, "logps/chosen": -46.46038818359375, "logps/rejected": -53.949676513671875, "loss": 0.9046, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13699927926063538, "rewards/margins": 0.39715397357940674, "rewards/rejected": -0.5341532230377197, "step": 96 }, { "epoch": 0.28024557602022393, "grad_norm": 2.8175182342529297, "learning_rate": 9.722543352601156e-07, "logits/chosen": -2.328862428665161, "logits/rejected": -2.3321826457977295, "logps/chosen": -47.078102111816406, "logps/rejected": -52.308013916015625, "loss": 0.9331, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2634473443031311, "rewards/margins": 0.2826656103134155, "rewards/rejected": -0.5461129546165466, "step": 97 }, { "epoch": 0.28313470566991694, "grad_norm": 3.0097403526306152, "learning_rate": 9.71965317919075e-07, "logits/chosen": -2.2560582160949707, "logits/rejected": -2.2519190311431885, "logps/chosen": -47.75956726074219, "logps/rejected": -57.27588653564453, "loss": 0.8865, "rewards/accuracies": 0.875, "rewards/chosen": -0.11432754993438721, "rewards/margins": 0.49641290307044983, "rewards/rejected": -0.6107404232025146, "step": 98 }, { "epoch": 0.28602383531960995, "grad_norm": 2.6421005725860596, "learning_rate": 9.716763005780347e-07, "logits/chosen": -2.332425832748413, "logits/rejected": -2.2943625450134277, "logps/chosen": -45.564701080322266, "logps/rejected": -51.01380920410156, "loss": 0.9042, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2510656416416168, "rewards/margins": 0.4070891737937927, "rewards/rejected": -0.6581548452377319, "step": 99 }, { "epoch": 0.288912964969303, "grad_norm": 2.7961349487304688, "learning_rate": 9.713872832369942e-07, "logits/chosen": -2.28564190864563, "logits/rejected": -2.233611583709717, "logps/chosen": -45.948856353759766, "logps/rejected": -49.938140869140625, "loss": 0.9223, "rewards/accuracies": 0.875, "rewards/chosen": -0.15864676237106323, "rewards/margins": 0.3363339900970459, "rewards/rejected": -0.49498075246810913, "step": 100 }, { "epoch": 0.29180209461899603, "grad_norm": 2.8495142459869385, "learning_rate": 9.710982658959536e-07, "logits/chosen": -2.294275999069214, "logits/rejected": -2.2477400302886963, "logps/chosen": -50.33110427856445, "logps/rejected": -56.04043197631836, "loss": 0.9219, "rewards/accuracies": 0.84375, "rewards/chosen": -0.387870192527771, "rewards/margins": 0.34480151534080505, "rewards/rejected": -0.7326717376708984, "step": 101 }, { "epoch": 0.29469122426868904, "grad_norm": 3.891122579574585, "learning_rate": 9.708092485549133e-07, "logits/chosen": -2.397738456726074, "logits/rejected": -2.311356544494629, "logps/chosen": -48.49738311767578, "logps/rejected": -57.731937408447266, "loss": 0.8922, "rewards/accuracies": 0.875, "rewards/chosen": -0.2558045983314514, "rewards/margins": 0.4750725328922272, "rewards/rejected": -0.7308771014213562, "step": 102 }, { "epoch": 0.2975803539183821, "grad_norm": 2.606268882751465, "learning_rate": 9.70520231213873e-07, "logits/chosen": -2.308202028274536, "logits/rejected": -2.232607841491699, "logps/chosen": -49.22990417480469, "logps/rejected": -55.20797348022461, "loss": 0.9312, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3126336336135864, "rewards/margins": 0.29822391271591187, "rewards/rejected": -0.6108576059341431, "step": 103 }, { "epoch": 0.3004694835680751, "grad_norm": 3.464611768722534, "learning_rate": 9.702312138728324e-07, "logits/chosen": -2.33969783782959, "logits/rejected": -2.3266992568969727, "logps/chosen": -47.32627487182617, "logps/rejected": -55.30508804321289, "loss": 0.8483, "rewards/accuracies": 0.84375, "rewards/chosen": -0.03852356970310211, "rewards/margins": 0.6640288829803467, "rewards/rejected": -0.7025524377822876, "step": 104 }, { "epoch": 0.30335861321776814, "grad_norm": 3.3541245460510254, "learning_rate": 9.699421965317918e-07, "logits/chosen": -2.294907808303833, "logits/rejected": -2.230393171310425, "logps/chosen": -47.048789978027344, "logps/rejected": -56.042964935302734, "loss": 0.8575, "rewards/accuracies": 0.875, "rewards/chosen": -0.15433521568775177, "rewards/margins": 0.6271101832389832, "rewards/rejected": -0.7814454436302185, "step": 105 }, { "epoch": 0.3062477428674612, "grad_norm": 3.174867868423462, "learning_rate": 9.696531791907515e-07, "logits/chosen": -2.362940549850464, "logits/rejected": -2.34169864654541, "logps/chosen": -49.634029388427734, "logps/rejected": -55.43955993652344, "loss": 0.8587, "rewards/accuracies": 0.90625, "rewards/chosen": -0.09253744035959244, "rewards/margins": 0.611237108707428, "rewards/rejected": -0.7037745714187622, "step": 106 }, { "epoch": 0.3091368725171542, "grad_norm": 3.3843846321105957, "learning_rate": 9.69364161849711e-07, "logits/chosen": -2.3430352210998535, "logits/rejected": -2.3064732551574707, "logps/chosen": -45.9991340637207, "logps/rejected": -51.150123596191406, "loss": 0.8667, "rewards/accuracies": 0.875, "rewards/chosen": -0.13945092260837555, "rewards/margins": 0.5716868042945862, "rewards/rejected": -0.7111377120018005, "step": 107 }, { "epoch": 0.3120260021668472, "grad_norm": 2.7547271251678467, "learning_rate": 9.690751445086704e-07, "logits/chosen": -2.329497814178467, "logits/rejected": -2.2797775268554688, "logps/chosen": -47.02820587158203, "logps/rejected": -54.673858642578125, "loss": 0.8739, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2221914827823639, "rewards/margins": 0.583368718624115, "rewards/rejected": -0.8055601119995117, "step": 108 }, { "epoch": 0.3149151318165403, "grad_norm": 2.7593111991882324, "learning_rate": 9.6878612716763e-07, "logits/chosen": -2.336287498474121, "logits/rejected": -2.2785396575927734, "logps/chosen": -47.497501373291016, "logps/rejected": -53.990478515625, "loss": 0.906, "rewards/accuracies": 0.75, "rewards/chosen": -0.2104789912700653, "rewards/margins": 0.41137224435806274, "rewards/rejected": -0.6218512058258057, "step": 109 }, { "epoch": 0.3178042614662333, "grad_norm": 2.9446537494659424, "learning_rate": 9.684971098265895e-07, "logits/chosen": -2.353287696838379, "logits/rejected": -2.327430248260498, "logps/chosen": -48.23634338378906, "logps/rejected": -54.227516174316406, "loss": 0.9245, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27513745427131653, "rewards/margins": 0.3277024030685425, "rewards/rejected": -0.6028398275375366, "step": 110 }, { "epoch": 0.3206933911159263, "grad_norm": 3.14666748046875, "learning_rate": 9.682080924855492e-07, "logits/chosen": -2.3392720222473145, "logits/rejected": -2.4010963439941406, "logps/chosen": -48.01555633544922, "logps/rejected": -54.412052154541016, "loss": 0.8651, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08314169943332672, "rewards/margins": 0.5773841142654419, "rewards/rejected": -0.6605258584022522, "step": 111 }, { "epoch": 0.32358252076561933, "grad_norm": 2.971306324005127, "learning_rate": 9.679190751445086e-07, "logits/chosen": -2.324148654937744, "logits/rejected": -2.3573672771453857, "logps/chosen": -49.55748748779297, "logps/rejected": -56.810394287109375, "loss": 0.9181, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18836934864521027, "rewards/margins": 0.35117048025131226, "rewards/rejected": -0.5395397543907166, "step": 112 }, { "epoch": 0.3264716504153124, "grad_norm": 3.070141077041626, "learning_rate": 9.676300578034683e-07, "logits/chosen": -2.3226184844970703, "logits/rejected": -2.2912485599517822, "logps/chosen": -48.23741912841797, "logps/rejected": -53.66437911987305, "loss": 0.8647, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18249152600765228, "rewards/margins": 0.6078415513038635, "rewards/rejected": -0.790333092212677, "step": 113 }, { "epoch": 0.3293607800650054, "grad_norm": 3.170484781265259, "learning_rate": 9.673410404624277e-07, "logits/chosen": -2.357276201248169, "logits/rejected": -2.320917844772339, "logps/chosen": -46.845035552978516, "logps/rejected": -55.01700210571289, "loss": 0.8505, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17951740324497223, "rewards/margins": 0.6673025488853455, "rewards/rejected": -0.8468199968338013, "step": 114 }, { "epoch": 0.3322499097146984, "grad_norm": 2.967478036880493, "learning_rate": 9.670520231213872e-07, "logits/chosen": -2.3304812908172607, "logits/rejected": -2.3041210174560547, "logps/chosen": -45.898136138916016, "logps/rejected": -55.1304817199707, "loss": 0.8879, "rewards/accuracies": 0.78125, "rewards/chosen": -0.27858901023864746, "rewards/margins": 0.5239519476890564, "rewards/rejected": -0.8025408983230591, "step": 115 }, { "epoch": 0.3351390393643915, "grad_norm": 2.910109281539917, "learning_rate": 9.667630057803468e-07, "logits/chosen": -2.3351857662200928, "logits/rejected": -2.313999891281128, "logps/chosen": -47.812625885009766, "logps/rejected": -52.07948303222656, "loss": 0.875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18821363151073456, "rewards/margins": 0.5494893789291382, "rewards/rejected": -0.7377029657363892, "step": 116 }, { "epoch": 0.3380281690140845, "grad_norm": 2.730300188064575, "learning_rate": 9.664739884393063e-07, "logits/chosen": -2.3181583881378174, "logits/rejected": -2.3551371097564697, "logps/chosen": -47.8523063659668, "logps/rejected": -54.51359939575195, "loss": 0.8756, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28951752185821533, "rewards/margins": 0.5754128098487854, "rewards/rejected": -0.864930272102356, "step": 117 }, { "epoch": 0.3409172986637775, "grad_norm": 3.0325069427490234, "learning_rate": 9.661849710982657e-07, "logits/chosen": -2.3413450717926025, "logits/rejected": -2.3676180839538574, "logps/chosen": -48.42097854614258, "logps/rejected": -55.7701530456543, "loss": 0.8651, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16341598331928253, "rewards/margins": 0.6106769442558289, "rewards/rejected": -0.7740930318832397, "step": 118 }, { "epoch": 0.3438064283134706, "grad_norm": 2.6863348484039307, "learning_rate": 9.658959537572254e-07, "logits/chosen": -2.344531774520874, "logits/rejected": -2.313162326812744, "logps/chosen": -46.715721130371094, "logps/rejected": -55.56859588623047, "loss": 0.8861, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3766912817955017, "rewards/margins": 0.5813136696815491, "rewards/rejected": -0.9580049514770508, "step": 119 }, { "epoch": 0.3466955579631636, "grad_norm": 2.6382954120635986, "learning_rate": 9.65606936416185e-07, "logits/chosen": -2.3315603733062744, "logits/rejected": -2.3309741020202637, "logps/chosen": -46.216064453125, "logps/rejected": -55.7353515625, "loss": 0.8702, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14833883941173553, "rewards/margins": 0.599740743637085, "rewards/rejected": -0.7480796575546265, "step": 120 }, { "epoch": 0.3495846876128566, "grad_norm": 2.628342866897583, "learning_rate": 9.653179190751445e-07, "logits/chosen": -2.282151699066162, "logits/rejected": -2.3005192279815674, "logps/chosen": -47.57792663574219, "logps/rejected": -56.578800201416016, "loss": 0.8581, "rewards/accuracies": 0.875, "rewards/chosen": -0.2624375820159912, "rewards/margins": 0.656284511089325, "rewards/rejected": -0.9187220931053162, "step": 121 }, { "epoch": 0.35247381726254967, "grad_norm": 3.1212923526763916, "learning_rate": 9.65028901734104e-07, "logits/chosen": -2.349888801574707, "logits/rejected": -2.366293430328369, "logps/chosen": -45.99599075317383, "logps/rejected": -57.90758514404297, "loss": 0.8031, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2075478434562683, "rewards/margins": 0.9114113450050354, "rewards/rejected": -1.1189591884613037, "step": 122 }, { "epoch": 0.3553629469122427, "grad_norm": 3.0340378284454346, "learning_rate": 9.647398843930636e-07, "logits/chosen": -2.3201851844787598, "logits/rejected": -2.349195718765259, "logps/chosen": -48.09332275390625, "logps/rejected": -59.15227508544922, "loss": 0.8278, "rewards/accuracies": 0.84375, "rewards/chosen": 0.00597868487238884, "rewards/margins": 0.7992280721664429, "rewards/rejected": -0.7932493090629578, "step": 123 }, { "epoch": 0.3582520765619357, "grad_norm": 2.784259796142578, "learning_rate": 9.64450867052023e-07, "logits/chosen": -2.30971097946167, "logits/rejected": -2.3382463455200195, "logps/chosen": -44.945777893066406, "logps/rejected": -55.29536056518555, "loss": 0.8433, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07108014822006226, "rewards/margins": 0.7588818669319153, "rewards/rejected": -0.8299620151519775, "step": 124 }, { "epoch": 0.36114120621162876, "grad_norm": 2.725497245788574, "learning_rate": 9.641618497109825e-07, "logits/chosen": -2.3675315380096436, "logits/rejected": -2.3600988388061523, "logps/chosen": -52.46882629394531, "logps/rejected": -59.211273193359375, "loss": 0.9118, "rewards/accuracies": 0.8125, "rewards/chosen": -0.48697429895401, "rewards/margins": 0.41217708587646484, "rewards/rejected": -0.8991513252258301, "step": 125 }, { "epoch": 0.3640303358613218, "grad_norm": 2.7192118167877197, "learning_rate": 9.638728323699422e-07, "logits/chosen": -2.3457491397857666, "logits/rejected": -2.3016483783721924, "logps/chosen": -46.0822639465332, "logps/rejected": -57.34514617919922, "loss": 0.8858, "rewards/accuracies": 0.875, "rewards/chosen": -0.1916232407093048, "rewards/margins": 0.5331659913063049, "rewards/rejected": -0.7247892022132874, "step": 126 }, { "epoch": 0.3669194655110148, "grad_norm": 2.9863195419311523, "learning_rate": 9.635838150289016e-07, "logits/chosen": -2.3197648525238037, "logits/rejected": -2.3219985961914062, "logps/chosen": -48.30518341064453, "logps/rejected": -60.116416931152344, "loss": 0.8009, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1585208922624588, "rewards/margins": 0.9725704789161682, "rewards/rejected": -1.1310913562774658, "step": 127 }, { "epoch": 0.36980859516070785, "grad_norm": 2.6185131072998047, "learning_rate": 9.632947976878613e-07, "logits/chosen": -2.342125177383423, "logits/rejected": -2.282788038253784, "logps/chosen": -49.31147003173828, "logps/rejected": -57.40409469604492, "loss": 0.8729, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30596208572387695, "rewards/margins": 0.6069289445877075, "rewards/rejected": -0.9128910303115845, "step": 128 }, { "epoch": 0.37269772481040087, "grad_norm": 2.6975183486938477, "learning_rate": 9.630057803468207e-07, "logits/chosen": -2.307365894317627, "logits/rejected": -2.306612253189087, "logps/chosen": -44.51679611206055, "logps/rejected": -55.858909606933594, "loss": 0.8425, "rewards/accuracies": 0.875, "rewards/chosen": -0.16821949183940887, "rewards/margins": 0.7343431711196899, "rewards/rejected": -0.9025627374649048, "step": 129 }, { "epoch": 0.3755868544600939, "grad_norm": 2.6189517974853516, "learning_rate": 9.627167630057804e-07, "logits/chosen": -2.3479316234588623, "logits/rejected": -2.323646068572998, "logps/chosen": -45.01519012451172, "logps/rejected": -56.215843200683594, "loss": 0.8531, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22826676070690155, "rewards/margins": 0.6993081569671631, "rewards/rejected": -0.9275749325752258, "step": 130 }, { "epoch": 0.37847598410978694, "grad_norm": 2.8579814434051514, "learning_rate": 9.624277456647399e-07, "logits/chosen": -2.337200164794922, "logits/rejected": -2.298640012741089, "logps/chosen": -48.59123611450195, "logps/rejected": -58.37284469604492, "loss": 0.8862, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36919236183166504, "rewards/margins": 0.5075418949127197, "rewards/rejected": -0.8767342567443848, "step": 131 }, { "epoch": 0.38136511375947996, "grad_norm": 2.8516643047332764, "learning_rate": 9.621387283236993e-07, "logits/chosen": -2.295905590057373, "logits/rejected": -2.311826705932617, "logps/chosen": -48.264198303222656, "logps/rejected": -57.98565673828125, "loss": 0.8417, "rewards/accuracies": 0.75, "rewards/chosen": -0.1565767228603363, "rewards/margins": 0.7841172218322754, "rewards/rejected": -0.9406939148902893, "step": 132 }, { "epoch": 0.38425424340917297, "grad_norm": 3.2288613319396973, "learning_rate": 9.61849710982659e-07, "logits/chosen": -2.3385813236236572, "logits/rejected": -2.2313692569732666, "logps/chosen": -52.11156463623047, "logps/rejected": -63.84651565551758, "loss": 0.8173, "rewards/accuracies": 0.90625, "rewards/chosen": -0.11779576539993286, "rewards/margins": 0.8698699474334717, "rewards/rejected": -0.987665593624115, "step": 133 }, { "epoch": 0.38714337305886604, "grad_norm": 3.710510730743408, "learning_rate": 9.615606936416184e-07, "logits/chosen": -2.323657751083374, "logits/rejected": -2.299999713897705, "logps/chosen": -48.035911560058594, "logps/rejected": -61.3480110168457, "loss": 0.8572, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4226877987384796, "rewards/margins": 0.6703556776046753, "rewards/rejected": -1.093043565750122, "step": 134 }, { "epoch": 0.39003250270855905, "grad_norm": 3.341756820678711, "learning_rate": 9.61271676300578e-07, "logits/chosen": -2.2897157669067383, "logits/rejected": -2.364151954650879, "logps/chosen": -47.34947967529297, "logps/rejected": -56.29930877685547, "loss": 0.8359, "rewards/accuracies": 0.84375, "rewards/chosen": -0.27182891964912415, "rewards/margins": 0.7803487181663513, "rewards/rejected": -1.0521775484085083, "step": 135 }, { "epoch": 0.39292163235825206, "grad_norm": 3.105022430419922, "learning_rate": 9.609826589595375e-07, "logits/chosen": -2.3292622566223145, "logits/rejected": -2.2848198413848877, "logps/chosen": -48.70988082885742, "logps/rejected": -56.176815032958984, "loss": 0.8359, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08492550998926163, "rewards/margins": 0.7672684788703918, "rewards/rejected": -0.8521939516067505, "step": 136 }, { "epoch": 0.3958107620079451, "grad_norm": 3.2864484786987305, "learning_rate": 9.60693641618497e-07, "logits/chosen": -2.3043460845947266, "logits/rejected": -2.3414254188537598, "logps/chosen": -48.69913864135742, "logps/rejected": -60.6285285949707, "loss": 0.8025, "rewards/accuracies": 0.875, "rewards/chosen": -0.08016210794448853, "rewards/margins": 0.9953374266624451, "rewards/rejected": -1.0754995346069336, "step": 137 }, { "epoch": 0.39869989165763814, "grad_norm": 7.9277424812316895, "learning_rate": 9.604046242774566e-07, "logits/chosen": -2.2833728790283203, "logits/rejected": -2.303652286529541, "logps/chosen": -45.769718170166016, "logps/rejected": -59.46901321411133, "loss": 0.7781, "rewards/accuracies": 0.96875, "rewards/chosen": -0.06253030151128769, "rewards/margins": 1.1042126417160034, "rewards/rejected": -1.1667429208755493, "step": 138 }, { "epoch": 0.40158902130733115, "grad_norm": 2.8860299587249756, "learning_rate": 9.60115606936416e-07, "logits/chosen": -2.340679407119751, "logits/rejected": -2.3381094932556152, "logps/chosen": -45.73509979248047, "logps/rejected": -58.113304138183594, "loss": 0.7538, "rewards/accuracies": 0.875, "rewards/chosen": 0.23133789002895355, "rewards/margins": 1.133486032485962, "rewards/rejected": -0.9021479487419128, "step": 139 }, { "epoch": 0.4044781509570242, "grad_norm": 2.58992600440979, "learning_rate": 9.598265895953758e-07, "logits/chosen": -2.295712947845459, "logits/rejected": -2.2972474098205566, "logps/chosen": -47.55681610107422, "logps/rejected": -58.00616455078125, "loss": 0.8447, "rewards/accuracies": 0.84375, "rewards/chosen": -0.17562296986579895, "rewards/margins": 0.7606106996536255, "rewards/rejected": -0.9362335801124573, "step": 140 }, { "epoch": 0.4044781509570242, "eval_logits/chosen": -2.315685749053955, "eval_logits/rejected": -2.305417537689209, "eval_logps/chosen": -46.05936813354492, "eval_logps/rejected": -56.86530685424805, "eval_loss": 0.7908146381378174, "eval_rewards/accuracies": 0.9435483813285828, "eval_rewards/chosen": 0.02570480667054653, "eval_rewards/margins": 1.0172181129455566, "eval_rewards/rejected": -0.9915134310722351, "eval_runtime": 223.7977, "eval_samples_per_second": 0.55, "eval_steps_per_second": 0.277, "step": 140 }, { "epoch": 0.40736728060671723, "grad_norm": 2.6497061252593994, "learning_rate": 9.595375722543352e-07, "logits/chosen": -2.3302996158599854, "logits/rejected": -2.2793173789978027, "logps/chosen": -48.59764099121094, "logps/rejected": -60.43309783935547, "loss": 0.7449, "rewards/accuracies": 0.84375, "rewards/chosen": 0.059712812304496765, "rewards/margins": 1.2090269327163696, "rewards/rejected": -1.1493141651153564, "step": 141 }, { "epoch": 0.41025641025641024, "grad_norm": 3.0939226150512695, "learning_rate": 9.592485549132947e-07, "logits/chosen": -2.346724510192871, "logits/rejected": -2.3845267295837402, "logps/chosen": -48.53195571899414, "logps/rejected": -59.198936462402344, "loss": 0.8274, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30369433760643005, "rewards/margins": 0.9035637974739075, "rewards/rejected": -1.2072582244873047, "step": 142 }, { "epoch": 0.4131455399061033, "grad_norm": 2.7848033905029297, "learning_rate": 9.589595375722543e-07, "logits/chosen": -2.3483619689941406, "logits/rejected": -2.296332597732544, "logps/chosen": -48.95331573486328, "logps/rejected": -59.01502227783203, "loss": 0.8202, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18521979451179504, "rewards/margins": 0.8769732713699341, "rewards/rejected": -1.0621930360794067, "step": 143 }, { "epoch": 0.4160346695557963, "grad_norm": 3.074005603790283, "learning_rate": 9.586705202312138e-07, "logits/chosen": -2.3119964599609375, "logits/rejected": -2.3102855682373047, "logps/chosen": -43.03636932373047, "logps/rejected": -59.10661315917969, "loss": 0.7396, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20055551826953888, "rewards/margins": 1.3068310022354126, "rewards/rejected": -1.1062755584716797, "step": 144 }, { "epoch": 0.41892379920548933, "grad_norm": 2.81425404548645, "learning_rate": 9.583815028901734e-07, "logits/chosen": -2.3123860359191895, "logits/rejected": -2.359079122543335, "logps/chosen": -47.516231536865234, "logps/rejected": -57.55647277832031, "loss": 0.7762, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1585501879453659, "rewards/margins": 1.0757954120635986, "rewards/rejected": -1.234345555305481, "step": 145 }, { "epoch": 0.4218129288551824, "grad_norm": 2.546170949935913, "learning_rate": 9.580924855491329e-07, "logits/chosen": -2.3715782165527344, "logits/rejected": -2.346294403076172, "logps/chosen": -44.10591506958008, "logps/rejected": -57.175689697265625, "loss": 0.7782, "rewards/accuracies": 0.84375, "rewards/chosen": 0.07550734281539917, "rewards/margins": 1.1298648118972778, "rewards/rejected": -1.0543574094772339, "step": 146 }, { "epoch": 0.4247020585048754, "grad_norm": 2.923238754272461, "learning_rate": 9.578034682080925e-07, "logits/chosen": -2.2776002883911133, "logits/rejected": -2.3353734016418457, "logps/chosen": -49.32353591918945, "logps/rejected": -61.320743560791016, "loss": 0.814, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2229001224040985, "rewards/margins": 0.9807959794998169, "rewards/rejected": -1.2036961317062378, "step": 147 }, { "epoch": 0.4275911881545684, "grad_norm": 3.1544923782348633, "learning_rate": 9.57514450867052e-07, "logits/chosen": -2.285320997238159, "logits/rejected": -2.2907814979553223, "logps/chosen": -43.48589324951172, "logps/rejected": -64.26268005371094, "loss": 0.7326, "rewards/accuracies": 0.96875, "rewards/chosen": 0.0946674644947052, "rewards/margins": 1.3594310283660889, "rewards/rejected": -1.264763593673706, "step": 148 }, { "epoch": 0.4304803178042615, "grad_norm": 2.719599723815918, "learning_rate": 9.572254335260114e-07, "logits/chosen": -2.2725346088409424, "logits/rejected": -2.324857234954834, "logps/chosen": -47.105506896972656, "logps/rejected": -59.95610046386719, "loss": 0.7686, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07974597811698914, "rewards/margins": 1.1624045372009277, "rewards/rejected": -1.0826585292816162, "step": 149 }, { "epoch": 0.4333694474539545, "grad_norm": 3.1477670669555664, "learning_rate": 9.56936416184971e-07, "logits/chosen": -2.3324151039123535, "logits/rejected": -2.3037030696868896, "logps/chosen": -45.5635871887207, "logps/rejected": -57.47909164428711, "loss": 0.7835, "rewards/accuracies": 0.84375, "rewards/chosen": 0.011844396591186523, "rewards/margins": 1.1185591220855713, "rewards/rejected": -1.1067147254943848, "step": 150 }, { "epoch": 0.4362585771036475, "grad_norm": 2.7705297470092773, "learning_rate": 9.566473988439305e-07, "logits/chosen": -2.3420984745025635, "logits/rejected": -2.4012842178344727, "logps/chosen": -43.958858489990234, "logps/rejected": -57.274818420410156, "loss": 0.7837, "rewards/accuracies": 0.875, "rewards/chosen": 0.15141266584396362, "rewards/margins": 1.0975890159606934, "rewards/rejected": -0.946176290512085, "step": 151 }, { "epoch": 0.4391477067533406, "grad_norm": 2.683974027633667, "learning_rate": 9.563583815028902e-07, "logits/chosen": -2.362520933151245, "logits/rejected": -2.36594295501709, "logps/chosen": -51.139305114746094, "logps/rejected": -64.2470474243164, "loss": 0.8019, "rewards/accuracies": 0.875, "rewards/chosen": -0.2687056064605713, "rewards/margins": 1.117750883102417, "rewards/rejected": -1.3864564895629883, "step": 152 }, { "epoch": 0.4420368364030336, "grad_norm": 2.7962563037872314, "learning_rate": 9.560693641618497e-07, "logits/chosen": -2.3986620903015137, "logits/rejected": -2.3897457122802734, "logps/chosen": -48.83110809326172, "logps/rejected": -60.110687255859375, "loss": 0.7943, "rewards/accuracies": 0.90625, "rewards/chosen": -0.245039165019989, "rewards/margins": 1.0435243844985962, "rewards/rejected": -1.28856360912323, "step": 153 }, { "epoch": 0.4449259660527266, "grad_norm": 2.947244167327881, "learning_rate": 9.557803468208091e-07, "logits/chosen": -2.321613311767578, "logits/rejected": -2.3357694149017334, "logps/chosen": -45.53520202636719, "logps/rejected": -57.97107696533203, "loss": 0.824, "rewards/accuracies": 0.84375, "rewards/chosen": -0.10804518312215805, "rewards/margins": 0.8921381235122681, "rewards/rejected": -1.000183343887329, "step": 154 }, { "epoch": 0.4478150957024196, "grad_norm": 3.3747689723968506, "learning_rate": 9.554913294797688e-07, "logits/chosen": -2.3252596855163574, "logits/rejected": -2.366668701171875, "logps/chosen": -42.887237548828125, "logps/rejected": -57.86045455932617, "loss": 0.7555, "rewards/accuracies": 0.84375, "rewards/chosen": 0.19275924563407898, "rewards/margins": 1.2686245441436768, "rewards/rejected": -1.0758652687072754, "step": 155 }, { "epoch": 0.4507042253521127, "grad_norm": 2.49139142036438, "learning_rate": 9.552023121387282e-07, "logits/chosen": -2.3715429306030273, "logits/rejected": -2.3945794105529785, "logps/chosen": -47.71132278442383, "logps/rejected": -60.60277557373047, "loss": 0.801, "rewards/accuracies": 0.875, "rewards/chosen": -0.16272470355033875, "rewards/margins": 1.080264925956726, "rewards/rejected": -1.2429895401000977, "step": 156 }, { "epoch": 0.4535933550018057, "grad_norm": 2.8690037727355957, "learning_rate": 9.549132947976879e-07, "logits/chosen": -2.3640246391296387, "logits/rejected": -2.3844990730285645, "logps/chosen": -50.11227798461914, "logps/rejected": -64.35725402832031, "loss": 0.7617, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22508317232131958, "rewards/margins": 1.2165261507034302, "rewards/rejected": -1.4416093826293945, "step": 157 }, { "epoch": 0.4564824846514987, "grad_norm": 5.4745306968688965, "learning_rate": 9.546242774566473e-07, "logits/chosen": -2.3419406414031982, "logits/rejected": -2.335655927658081, "logps/chosen": -46.24637985229492, "logps/rejected": -61.15616226196289, "loss": 0.7387, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13853719830513, "rewards/margins": 1.355780839920044, "rewards/rejected": -1.2172436714172363, "step": 158 }, { "epoch": 0.4593716143011918, "grad_norm": 2.7619171142578125, "learning_rate": 9.54335260115607e-07, "logits/chosen": -2.3224844932556152, "logits/rejected": -2.344984531402588, "logps/chosen": -41.23829650878906, "logps/rejected": -53.192596435546875, "loss": 0.7951, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2941339313983917, "rewards/margins": 1.0402990579605103, "rewards/rejected": -0.7461652159690857, "step": 159 }, { "epoch": 0.4622607439508848, "grad_norm": 2.923457145690918, "learning_rate": 9.540462427745664e-07, "logits/chosen": -2.4111733436584473, "logits/rejected": -2.3094773292541504, "logps/chosen": -45.61140441894531, "logps/rejected": -59.48064422607422, "loss": 0.7335, "rewards/accuracies": 0.875, "rewards/chosen": 0.15507729351520538, "rewards/margins": 1.3403875827789307, "rewards/rejected": -1.1853102445602417, "step": 160 }, { "epoch": 0.4651498736005778, "grad_norm": 2.62205171585083, "learning_rate": 9.537572254335259e-07, "logits/chosen": -2.356109857559204, "logits/rejected": -2.3282909393310547, "logps/chosen": -48.528297424316406, "logps/rejected": -52.70984649658203, "loss": 0.9023, "rewards/accuracies": 0.75, "rewards/chosen": -0.3466912508010864, "rewards/margins": 0.43278515338897705, "rewards/rejected": -0.7794764637947083, "step": 161 }, { "epoch": 0.46803900325027087, "grad_norm": 60.407894134521484, "learning_rate": 9.534682080924856e-07, "logits/chosen": -2.345142364501953, "logits/rejected": -2.387071371078491, "logps/chosen": -41.33418273925781, "logps/rejected": -59.42607498168945, "loss": 0.6892, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5663004517555237, "rewards/margins": 1.6625601053237915, "rewards/rejected": -1.0962597131729126, "step": 162 }, { "epoch": 0.4709281328999639, "grad_norm": 3.25727915763855, "learning_rate": 9.53179190751445e-07, "logits/chosen": -2.376967668533325, "logits/rejected": -2.370056390762329, "logps/chosen": -48.34507751464844, "logps/rejected": -60.89734649658203, "loss": 0.7641, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10363277047872543, "rewards/margins": 1.2723913192749023, "rewards/rejected": -1.168758511543274, "step": 163 }, { "epoch": 0.4738172625496569, "grad_norm": 2.612868547439575, "learning_rate": 9.528901734104046e-07, "logits/chosen": -2.314535617828369, "logits/rejected": -2.3643898963928223, "logps/chosen": -43.96488952636719, "logps/rejected": -58.207908630371094, "loss": 0.8027, "rewards/accuracies": 0.875, "rewards/chosen": 0.21549297869205475, "rewards/margins": 1.1217070817947388, "rewards/rejected": -0.9062140583992004, "step": 164 }, { "epoch": 0.47670639219934996, "grad_norm": 3.1391210556030273, "learning_rate": 9.526011560693641e-07, "logits/chosen": -2.277660608291626, "logits/rejected": -2.3520591259002686, "logps/chosen": -45.33197784423828, "logps/rejected": -52.75275421142578, "loss": 0.7898, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30621424317359924, "rewards/margins": 1.0861114263534546, "rewards/rejected": -0.7798972129821777, "step": 165 }, { "epoch": 0.47959552184904297, "grad_norm": 3.395692825317383, "learning_rate": 9.523121387283236e-07, "logits/chosen": -2.387721061706543, "logits/rejected": -2.351414680480957, "logps/chosen": -53.330543518066406, "logps/rejected": -68.43891143798828, "loss": 0.7674, "rewards/accuracies": 0.875, "rewards/chosen": -0.4690203070640564, "rewards/margins": 1.3445286750793457, "rewards/rejected": -1.8135490417480469, "step": 166 }, { "epoch": 0.482484651498736, "grad_norm": 2.976444959640503, "learning_rate": 9.520231213872832e-07, "logits/chosen": -2.277439832687378, "logits/rejected": -2.318681478500366, "logps/chosen": -45.160892486572266, "logps/rejected": -55.695499420166016, "loss": 0.8095, "rewards/accuracies": 0.875, "rewards/chosen": 0.221432164311409, "rewards/margins": 1.0894769430160522, "rewards/rejected": -0.8680447936058044, "step": 167 }, { "epoch": 0.48537378114842905, "grad_norm": 3.3816096782684326, "learning_rate": 9.517341040462428e-07, "logits/chosen": -2.3622045516967773, "logits/rejected": -2.344548463821411, "logps/chosen": -44.29157257080078, "logps/rejected": -64.99209594726562, "loss": 0.7155, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12375026941299438, "rewards/margins": 1.4671835899353027, "rewards/rejected": -1.3434333801269531, "step": 168 }, { "epoch": 0.48826291079812206, "grad_norm": 2.93595552444458, "learning_rate": 9.514450867052023e-07, "logits/chosen": -2.379096508026123, "logits/rejected": -2.366532564163208, "logps/chosen": -43.95456314086914, "logps/rejected": -59.0626220703125, "loss": 0.7627, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10310675203800201, "rewards/margins": 1.2869701385498047, "rewards/rejected": -1.1838634014129639, "step": 169 }, { "epoch": 0.4911520404478151, "grad_norm": 2.3508830070495605, "learning_rate": 9.511560693641618e-07, "logits/chosen": -2.418426513671875, "logits/rejected": -2.4082648754119873, "logps/chosen": -49.32514953613281, "logps/rejected": -63.44178009033203, "loss": 0.8068, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19795437157154083, "rewards/margins": 1.0844449996948242, "rewards/rejected": -1.2823994159698486, "step": 170 }, { "epoch": 0.49404117009750814, "grad_norm": 2.744736909866333, "learning_rate": 9.508670520231213e-07, "logits/chosen": -2.3351967334747314, "logits/rejected": -2.3089871406555176, "logps/chosen": -41.13566589355469, "logps/rejected": -56.82026672363281, "loss": 0.7634, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3358127772808075, "rewards/margins": 1.2992078065872192, "rewards/rejected": -0.9633951187133789, "step": 171 }, { "epoch": 0.49693029974720115, "grad_norm": 3.2524611949920654, "learning_rate": 9.505780346820809e-07, "logits/chosen": -2.4261269569396973, "logits/rejected": -2.4698245525360107, "logps/chosen": -47.841224670410156, "logps/rejected": -62.869293212890625, "loss": 0.7239, "rewards/accuracies": 0.875, "rewards/chosen": -0.03150162100791931, "rewards/margins": 1.5400289297103882, "rewards/rejected": -1.5715304613113403, "step": 172 }, { "epoch": 0.49981942939689417, "grad_norm": 3.106243848800659, "learning_rate": 9.502890173410403e-07, "logits/chosen": -2.3285441398620605, "logits/rejected": -2.3787131309509277, "logps/chosen": -50.55501937866211, "logps/rejected": -65.04655456542969, "loss": 0.7787, "rewards/accuracies": 0.875, "rewards/chosen": -0.18158307671546936, "rewards/margins": 1.206680417060852, "rewards/rejected": -1.3882635831832886, "step": 173 }, { "epoch": 0.5027085590465872, "grad_norm": 5.861273288726807, "learning_rate": 9.499999999999999e-07, "logits/chosen": -2.3346846103668213, "logits/rejected": -2.3453452587127686, "logps/chosen": -43.25203323364258, "logps/rejected": -59.57757568359375, "loss": 0.7831, "rewards/accuracies": 0.8125, "rewards/chosen": 0.15670084953308105, "rewards/margins": 1.22382652759552, "rewards/rejected": -1.067125678062439, "step": 174 }, { "epoch": 0.5055976886962803, "grad_norm": 3.029984474182129, "learning_rate": 9.497109826589596e-07, "logits/chosen": -2.408653736114502, "logits/rejected": -2.3959949016571045, "logps/chosen": -45.758113861083984, "logps/rejected": -63.659019470214844, "loss": 0.7583, "rewards/accuracies": 0.90625, "rewards/chosen": 0.09060897678136826, "rewards/margins": 1.3969378471374512, "rewards/rejected": -1.3063290119171143, "step": 175 }, { "epoch": 0.5084868183459733, "grad_norm": 2.8444571495056152, "learning_rate": 9.494219653179191e-07, "logits/chosen": -2.353229522705078, "logits/rejected": -2.366319179534912, "logps/chosen": -43.88920211791992, "logps/rejected": -58.94062805175781, "loss": 0.7683, "rewards/accuracies": 0.875, "rewards/chosen": -0.030150949954986572, "rewards/margins": 1.2556228637695312, "rewards/rejected": -1.2857739925384521, "step": 176 }, { "epoch": 0.5113759479956663, "grad_norm": 3.1716370582580566, "learning_rate": 9.491329479768786e-07, "logits/chosen": -2.3686683177948, "logits/rejected": -2.4404296875, "logps/chosen": -48.117618560791016, "logps/rejected": -63.83859634399414, "loss": 0.7537, "rewards/accuracies": 0.875, "rewards/chosen": 0.06565667688846588, "rewards/margins": 1.3247724771499634, "rewards/rejected": -1.2591158151626587, "step": 177 }, { "epoch": 0.5142650776453593, "grad_norm": 3.3642094135284424, "learning_rate": 9.488439306358381e-07, "logits/chosen": -2.4229869842529297, "logits/rejected": -2.460686683654785, "logps/chosen": -48.098628997802734, "logps/rejected": -63.79861831665039, "loss": 0.7892, "rewards/accuracies": 0.875, "rewards/chosen": -0.2574460506439209, "rewards/margins": 1.2954460382461548, "rewards/rejected": -1.5528922080993652, "step": 178 }, { "epoch": 0.5171542072950523, "grad_norm": 3.1985249519348145, "learning_rate": 9.485549132947977e-07, "logits/chosen": -2.418065309524536, "logits/rejected": -2.3865458965301514, "logps/chosen": -48.65021514892578, "logps/rejected": -63.61997985839844, "loss": 0.857, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5942423343658447, "rewards/margins": 0.9400255680084229, "rewards/rejected": -1.5342676639556885, "step": 179 }, { "epoch": 0.5200433369447454, "grad_norm": 3.717428207397461, "learning_rate": 9.482658959537571e-07, "logits/chosen": -2.3536787033081055, "logits/rejected": -2.339796543121338, "logps/chosen": -48.195587158203125, "logps/rejected": -62.58692169189453, "loss": 0.7742, "rewards/accuracies": 0.875, "rewards/chosen": -0.13635697960853577, "rewards/margins": 1.2717196941375732, "rewards/rejected": -1.408076524734497, "step": 180 }, { "epoch": 0.5229324665944384, "grad_norm": 2.9678938388824463, "learning_rate": 9.479768786127167e-07, "logits/chosen": -2.3017899990081787, "logits/rejected": -2.350071907043457, "logps/chosen": -39.605690002441406, "logps/rejected": -61.74664306640625, "loss": 0.6792, "rewards/accuracies": 0.875, "rewards/chosen": 0.7841922044754028, "rewards/margins": 1.8973994255065918, "rewards/rejected": -1.113207221031189, "step": 181 }, { "epoch": 0.5258215962441315, "grad_norm": 2.7847087383270264, "learning_rate": 9.476878612716762e-07, "logits/chosen": -2.3748056888580322, "logits/rejected": -2.394930124282837, "logps/chosen": -43.233184814453125, "logps/rejected": -59.87007522583008, "loss": 0.7329, "rewards/accuracies": 0.75, "rewards/chosen": 0.3727853298187256, "rewards/margins": 1.4840737581253052, "rewards/rejected": -1.1112884283065796, "step": 182 }, { "epoch": 0.5287107258938245, "grad_norm": 3.041361093521118, "learning_rate": 9.473988439306358e-07, "logits/chosen": -2.3777172565460205, "logits/rejected": -2.3845226764678955, "logps/chosen": -43.81415557861328, "logps/rejected": -61.82279586791992, "loss": 0.6589, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2464689314365387, "rewards/margins": 1.828550100326538, "rewards/rejected": -1.5820810794830322, "step": 183 }, { "epoch": 0.5315998555435175, "grad_norm": 3.5515432357788086, "learning_rate": 9.471098265895954e-07, "logits/chosen": -2.3812739849090576, "logits/rejected": -2.374936580657959, "logps/chosen": -48.10561752319336, "logps/rejected": -61.188865661621094, "loss": 0.7912, "rewards/accuracies": 0.96875, "rewards/chosen": -0.13286006450653076, "rewards/margins": 1.195282220840454, "rewards/rejected": -1.3281421661376953, "step": 184 }, { "epoch": 0.5344889851932105, "grad_norm": 2.2396018505096436, "learning_rate": 9.468208092485549e-07, "logits/chosen": -2.337050437927246, "logits/rejected": -2.396160364151001, "logps/chosen": -45.007877349853516, "logps/rejected": -58.290550231933594, "loss": 0.7834, "rewards/accuracies": 0.84375, "rewards/chosen": 0.01751628890633583, "rewards/margins": 1.17159903049469, "rewards/rejected": -1.1540827751159668, "step": 185 }, { "epoch": 0.5373781148429035, "grad_norm": 2.9951846599578857, "learning_rate": 9.465317919075145e-07, "logits/chosen": -2.3893227577209473, "logits/rejected": -2.436905860900879, "logps/chosen": -46.12611389160156, "logps/rejected": -60.75209045410156, "loss": 0.7895, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1584327518939972, "rewards/margins": 1.169509768486023, "rewards/rejected": -1.3279423713684082, "step": 186 }, { "epoch": 0.5402672444925966, "grad_norm": 2.953843593597412, "learning_rate": 9.462427745664739e-07, "logits/chosen": -2.3540384769439697, "logits/rejected": -2.422950267791748, "logps/chosen": -45.830604553222656, "logps/rejected": -60.55857467651367, "loss": 0.7594, "rewards/accuracies": 0.84375, "rewards/chosen": -0.03428354859352112, "rewards/margins": 1.3441871404647827, "rewards/rejected": -1.3784706592559814, "step": 187 }, { "epoch": 0.5431563741422897, "grad_norm": 2.908754348754883, "learning_rate": 9.459537572254335e-07, "logits/chosen": -2.410837411880493, "logits/rejected": -2.402312755584717, "logps/chosen": -44.6417236328125, "logps/rejected": -63.0499267578125, "loss": 0.7199, "rewards/accuracies": 0.875, "rewards/chosen": 0.16029220819473267, "rewards/margins": 1.6347649097442627, "rewards/rejected": -1.4744727611541748, "step": 188 }, { "epoch": 0.5460455037919827, "grad_norm": 2.836951732635498, "learning_rate": 9.45664739884393e-07, "logits/chosen": -2.3992109298706055, "logits/rejected": -2.405224323272705, "logps/chosen": -43.55963897705078, "logps/rejected": -59.37586212158203, "loss": 0.7234, "rewards/accuracies": 0.90625, "rewards/chosen": 0.278353750705719, "rewards/margins": 1.4112428426742554, "rewards/rejected": -1.1328890323638916, "step": 189 }, { "epoch": 0.5489346334416757, "grad_norm": 2.8864667415618896, "learning_rate": 9.453757225433525e-07, "logits/chosen": -2.4266345500946045, "logits/rejected": -2.3807833194732666, "logps/chosen": -48.41771697998047, "logps/rejected": -60.34461212158203, "loss": 0.774, "rewards/accuracies": 0.84375, "rewards/chosen": 0.03833237662911415, "rewards/margins": 1.211247205734253, "rewards/rejected": -1.1729148626327515, "step": 190 }, { "epoch": 0.5518237630913687, "grad_norm": 2.9094765186309814, "learning_rate": 9.450867052023121e-07, "logits/chosen": -2.3085968494415283, "logits/rejected": -2.322753667831421, "logps/chosen": -40.244049072265625, "logps/rejected": -57.875732421875, "loss": 0.7111, "rewards/accuracies": 0.84375, "rewards/chosen": 0.36100077629089355, "rewards/margins": 1.5517032146453857, "rewards/rejected": -1.1907023191452026, "step": 191 }, { "epoch": 0.5547128927410617, "grad_norm": 2.998178482055664, "learning_rate": 9.447976878612717e-07, "logits/chosen": -2.297504186630249, "logits/rejected": -2.369494915008545, "logps/chosen": -41.937747955322266, "logps/rejected": -57.34153366088867, "loss": 0.7582, "rewards/accuracies": 0.84375, "rewards/chosen": 0.19088220596313477, "rewards/margins": 1.3015732765197754, "rewards/rejected": -1.1106911897659302, "step": 192 }, { "epoch": 0.5576020223907547, "grad_norm": 2.839581251144409, "learning_rate": 9.445086705202312e-07, "logits/chosen": -2.3890557289123535, "logits/rejected": -2.428040027618408, "logps/chosen": -44.32537841796875, "logps/rejected": -56.33393478393555, "loss": 0.7848, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30881792306900024, "rewards/margins": 1.1424444913864136, "rewards/rejected": -0.8336264491081238, "step": 193 }, { "epoch": 0.5604911520404479, "grad_norm": 2.8329803943634033, "learning_rate": 9.442196531791907e-07, "logits/chosen": -2.4388229846954346, "logits/rejected": -2.4753518104553223, "logps/chosen": -42.07447814941406, "logps/rejected": -63.71078872680664, "loss": 0.7148, "rewards/accuracies": 0.875, "rewards/chosen": 0.23605872690677643, "rewards/margins": 1.6880855560302734, "rewards/rejected": -1.452026605606079, "step": 194 }, { "epoch": 0.5633802816901409, "grad_norm": 3.2380728721618652, "learning_rate": 9.439306358381503e-07, "logits/chosen": -2.340550184249878, "logits/rejected": -2.318730354309082, "logps/chosen": -41.822288513183594, "logps/rejected": -66.3376235961914, "loss": 0.6714, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40174630284309387, "rewards/margins": 1.9015274047851562, "rewards/rejected": -1.4997808933258057, "step": 195 }, { "epoch": 0.5662694113398339, "grad_norm": 2.600170850753784, "learning_rate": 9.436416184971098e-07, "logits/chosen": -2.3637571334838867, "logits/rejected": -2.3651864528656006, "logps/chosen": -45.98854446411133, "logps/rejected": -62.4269905090332, "loss": 0.7218, "rewards/accuracies": 0.96875, "rewards/chosen": 0.07987090200185776, "rewards/margins": 1.613500714302063, "rewards/rejected": -1.5336300134658813, "step": 196 }, { "epoch": 0.5691585409895269, "grad_norm": 3.371495246887207, "learning_rate": 9.433526011560693e-07, "logits/chosen": -2.366495370864868, "logits/rejected": -2.3871426582336426, "logps/chosen": -47.994930267333984, "logps/rejected": -65.29827880859375, "loss": 0.7805, "rewards/accuracies": 0.875, "rewards/chosen": -0.09062989056110382, "rewards/margins": 1.4487261772155762, "rewards/rejected": -1.539355993270874, "step": 197 }, { "epoch": 0.5720476706392199, "grad_norm": 3.329141616821289, "learning_rate": 9.430635838150288e-07, "logits/chosen": -2.367084503173828, "logits/rejected": -2.378183364868164, "logps/chosen": -46.721038818359375, "logps/rejected": -63.2609977722168, "loss": 0.7048, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10747575759887695, "rewards/margins": 1.7354298830032349, "rewards/rejected": -1.627954125404358, "step": 198 }, { "epoch": 0.5749368002889129, "grad_norm": 5.602270603179932, "learning_rate": 9.427745664739884e-07, "logits/chosen": -2.2770822048187256, "logits/rejected": -2.3389759063720703, "logps/chosen": -35.112545013427734, "logps/rejected": -57.791221618652344, "loss": 0.6303, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7994493842124939, "rewards/margins": 2.0253689289093018, "rewards/rejected": -1.225919485092163, "step": 199 }, { "epoch": 0.577825929938606, "grad_norm": 3.0671958923339844, "learning_rate": 9.424855491329479e-07, "logits/chosen": -2.319162368774414, "logits/rejected": -2.3333024978637695, "logps/chosen": -38.87583923339844, "logps/rejected": -53.70672607421875, "loss": 0.7214, "rewards/accuracies": 0.875, "rewards/chosen": 0.5686999559402466, "rewards/margins": 1.4867463111877441, "rewards/rejected": -0.9180463552474976, "step": 200 }, { "epoch": 0.580715059588299, "grad_norm": 3.0341153144836426, "learning_rate": 9.421965317919075e-07, "logits/chosen": -2.3065683841705322, "logits/rejected": -2.33486008644104, "logps/chosen": -42.66212463378906, "logps/rejected": -61.82978820800781, "loss": 0.7521, "rewards/accuracies": 0.84375, "rewards/chosen": 0.24838417768478394, "rewards/margins": 1.4916948080062866, "rewards/rejected": -1.243310570716858, "step": 201 }, { "epoch": 0.5836041892379921, "grad_norm": 2.745147466659546, "learning_rate": 9.41907514450867e-07, "logits/chosen": -2.3542654514312744, "logits/rejected": -2.3550662994384766, "logps/chosen": -43.39604187011719, "logps/rejected": -60.77286911010742, "loss": 0.7534, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17716994881629944, "rewards/margins": 1.5029187202453613, "rewards/rejected": -1.3257488012313843, "step": 202 }, { "epoch": 0.5864933188876851, "grad_norm": 3.132131814956665, "learning_rate": 9.416184971098266e-07, "logits/chosen": -2.2805140018463135, "logits/rejected": -2.331120729446411, "logps/chosen": -41.29029083251953, "logps/rejected": -58.58720779418945, "loss": 0.7686, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26279711723327637, "rewards/margins": 1.3171426057815552, "rewards/rejected": -1.0543454885482788, "step": 203 }, { "epoch": 0.5893824485373781, "grad_norm": 2.9049148559570312, "learning_rate": 9.41329479768786e-07, "logits/chosen": -2.3321189880371094, "logits/rejected": -2.3081600666046143, "logps/chosen": -42.908447265625, "logps/rejected": -62.805381774902344, "loss": 0.7419, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3000158369541168, "rewards/margins": 1.635756015777588, "rewards/rejected": -1.3357402086257935, "step": 204 }, { "epoch": 0.5922715781870711, "grad_norm": 3.258268356323242, "learning_rate": 9.410404624277456e-07, "logits/chosen": -2.319244146347046, "logits/rejected": -2.422391653060913, "logps/chosen": -39.953468322753906, "logps/rejected": -57.857704162597656, "loss": 0.7111, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6537701487541199, "rewards/margins": 1.6051630973815918, "rewards/rejected": -0.9513929486274719, "step": 205 }, { "epoch": 0.5951607078367642, "grad_norm": 3.000689744949341, "learning_rate": 9.407514450867052e-07, "logits/chosen": -2.3660035133361816, "logits/rejected": -2.414260149002075, "logps/chosen": -47.480934143066406, "logps/rejected": -69.16351318359375, "loss": 0.7498, "rewards/accuracies": 0.96875, "rewards/chosen": -0.13614621758460999, "rewards/margins": 1.6641069650650024, "rewards/rejected": -1.8002530336380005, "step": 206 }, { "epoch": 0.5980498374864572, "grad_norm": 3.6796936988830566, "learning_rate": 9.404624277456646e-07, "logits/chosen": -2.419508695602417, "logits/rejected": -2.465574026107788, "logps/chosen": -38.265785217285156, "logps/rejected": -57.90287780761719, "loss": 0.6898, "rewards/accuracies": 0.9375, "rewards/chosen": 0.541540265083313, "rewards/margins": 1.7540528774261475, "rewards/rejected": -1.212512731552124, "step": 207 }, { "epoch": 0.6009389671361502, "grad_norm": 3.531259536743164, "learning_rate": 9.401734104046243e-07, "logits/chosen": -2.3134267330169678, "logits/rejected": -2.350153684616089, "logps/chosen": -44.13142776489258, "logps/rejected": -61.386085510253906, "loss": 0.7421, "rewards/accuracies": 0.9375, "rewards/chosen": 0.37140658497810364, "rewards/margins": 1.5971095561981201, "rewards/rejected": -1.2257030010223389, "step": 208 }, { "epoch": 0.6038280967858433, "grad_norm": 3.1668481826782227, "learning_rate": 9.398843930635838e-07, "logits/chosen": -2.3468456268310547, "logits/rejected": -2.39508318901062, "logps/chosen": -43.531494140625, "logps/rejected": -68.27076721191406, "loss": 0.6532, "rewards/accuracies": 0.875, "rewards/chosen": 0.3307722806930542, "rewards/margins": 2.120400905609131, "rewards/rejected": -1.7896287441253662, "step": 209 }, { "epoch": 0.6067172264355363, "grad_norm": 3.083158016204834, "learning_rate": 9.395953757225434e-07, "logits/chosen": -2.3983888626098633, "logits/rejected": -2.4787657260894775, "logps/chosen": -43.750518798828125, "logps/rejected": -64.18580627441406, "loss": 0.6684, "rewards/accuracies": 0.875, "rewards/chosen": 0.37933406233787537, "rewards/margins": 1.962712287902832, "rewards/rejected": -1.5833783149719238, "step": 210 }, { "epoch": 0.6067172264355363, "eval_logits/chosen": -2.388397693634033, "eval_logits/rejected": -2.4097301959991455, "eval_logps/chosen": -43.41181945800781, "eval_logps/rejected": -62.437965393066406, "eval_loss": 0.683803379535675, "eval_rewards/accuracies": 0.9112903475761414, "eval_rewards/chosen": 0.29046016931533813, "eval_rewards/margins": 1.8392393589019775, "eval_rewards/rejected": -1.5487791299819946, "eval_runtime": 233.515, "eval_samples_per_second": 0.527, "eval_steps_per_second": 0.266, "step": 210 }, { "epoch": 0.6096063560852293, "grad_norm": 3.343902111053467, "learning_rate": 9.393063583815028e-07, "logits/chosen": -2.33109974861145, "logits/rejected": -2.437838554382324, "logps/chosen": -42.95936584472656, "logps/rejected": -62.915225982666016, "loss": 0.6715, "rewards/accuracies": 0.90625, "rewards/chosen": 0.180616557598114, "rewards/margins": 1.9110515117645264, "rewards/rejected": -1.7304350137710571, "step": 211 }, { "epoch": 0.6124954857349224, "grad_norm": 2.750523328781128, "learning_rate": 9.390173410404624e-07, "logits/chosen": -2.311305522918701, "logits/rejected": -2.3737690448760986, "logps/chosen": -42.72809600830078, "logps/rejected": -59.75407409667969, "loss": 0.7266, "rewards/accuracies": 0.875, "rewards/chosen": 0.2634056806564331, "rewards/margins": 1.598074197769165, "rewards/rejected": -1.334668517112732, "step": 212 }, { "epoch": 0.6153846153846154, "grad_norm": 3.1016433238983154, "learning_rate": 9.387283236994219e-07, "logits/chosen": -2.378206491470337, "logits/rejected": -2.3866708278656006, "logps/chosen": -44.500762939453125, "logps/rejected": -66.25657653808594, "loss": 0.6863, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14031347632408142, "rewards/margins": 1.944046974182129, "rewards/rejected": -1.8037335872650146, "step": 213 }, { "epoch": 0.6182737450343084, "grad_norm": 3.3249285221099854, "learning_rate": 9.384393063583814e-07, "logits/chosen": -2.3150699138641357, "logits/rejected": -2.3835535049438477, "logps/chosen": -43.63330841064453, "logps/rejected": -59.55425262451172, "loss": 0.7456, "rewards/accuracies": 0.90625, "rewards/chosen": 0.054222509264945984, "rewards/margins": 1.4205846786499023, "rewards/rejected": -1.3663620948791504, "step": 214 }, { "epoch": 0.6211628746840014, "grad_norm": 3.084545612335205, "learning_rate": 9.381502890173409e-07, "logits/chosen": -2.368077278137207, "logits/rejected": -2.3781044483184814, "logps/chosen": -43.576698303222656, "logps/rejected": -61.597721099853516, "loss": 0.7692, "rewards/accuracies": 0.84375, "rewards/chosen": -0.14434625208377838, "rewards/margins": 1.4359506368637085, "rewards/rejected": -1.5802969932556152, "step": 215 }, { "epoch": 0.6240520043336945, "grad_norm": 3.17987322807312, "learning_rate": 9.378612716763006e-07, "logits/chosen": -2.4100430011749268, "logits/rejected": -2.4726436138153076, "logps/chosen": -48.009315490722656, "logps/rejected": -68.30410766601562, "loss": 0.7193, "rewards/accuracies": 0.90625, "rewards/chosen": -0.16807985305786133, "rewards/margins": 1.7344818115234375, "rewards/rejected": -1.9025615453720093, "step": 216 }, { "epoch": 0.6269411339833875, "grad_norm": 2.8982911109924316, "learning_rate": 9.375722543352601e-07, "logits/chosen": -2.31890869140625, "logits/rejected": -2.4044032096862793, "logps/chosen": -39.74444580078125, "logps/rejected": -60.03290557861328, "loss": 0.6417, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6656346917152405, "rewards/margins": 2.0730507373809814, "rewards/rejected": -1.4074161052703857, "step": 217 }, { "epoch": 0.6298302636330806, "grad_norm": 3.056142568588257, "learning_rate": 9.372832369942196e-07, "logits/chosen": -2.4744246006011963, "logits/rejected": -2.4521710872650146, "logps/chosen": -44.35829544067383, "logps/rejected": -62.373802185058594, "loss": 0.7457, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1000618040561676, "rewards/margins": 1.3952710628509521, "rewards/rejected": -1.4953328371047974, "step": 218 }, { "epoch": 0.6327193932827736, "grad_norm": 3.5300240516662598, "learning_rate": 9.369942196531792e-07, "logits/chosen": -2.384413719177246, "logits/rejected": -2.4357423782348633, "logps/chosen": -47.987308502197266, "logps/rejected": -67.78047180175781, "loss": 0.7494, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3375641107559204, "rewards/margins": 1.6694798469543457, "rewards/rejected": -2.0070438385009766, "step": 219 }, { "epoch": 0.6356085229324666, "grad_norm": 2.5477054119110107, "learning_rate": 9.367052023121387e-07, "logits/chosen": -2.416285514831543, "logits/rejected": -2.4718029499053955, "logps/chosen": -43.00476837158203, "logps/rejected": -63.462928771972656, "loss": 0.6769, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3342089056968689, "rewards/margins": 1.918051838874817, "rewards/rejected": -1.5838428735733032, "step": 220 }, { "epoch": 0.6384976525821596, "grad_norm": 3.66827392578125, "learning_rate": 9.364161849710982e-07, "logits/chosen": -2.461031675338745, "logits/rejected": -2.4775278568267822, "logps/chosen": -43.486183166503906, "logps/rejected": -59.74700164794922, "loss": 0.7418, "rewards/accuracies": 0.875, "rewards/chosen": 0.06752738356590271, "rewards/margins": 1.3548539876937866, "rewards/rejected": -1.287326693534851, "step": 221 }, { "epoch": 0.6413867822318526, "grad_norm": 3.510223627090454, "learning_rate": 9.361271676300577e-07, "logits/chosen": -2.319551944732666, "logits/rejected": -2.3902390003204346, "logps/chosen": -38.66592788696289, "logps/rejected": -63.25918197631836, "loss": 0.5831, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9130045175552368, "rewards/margins": 2.199897050857544, "rewards/rejected": -1.2868926525115967, "step": 222 }, { "epoch": 0.6442759118815456, "grad_norm": 2.594172716140747, "learning_rate": 9.358381502890173e-07, "logits/chosen": -2.469304323196411, "logits/rejected": -2.465294599533081, "logps/chosen": -41.24940872192383, "logps/rejected": -63.2137451171875, "loss": 0.7002, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2730734050273895, "rewards/margins": 1.924410343170166, "rewards/rejected": -1.6513367891311646, "step": 223 }, { "epoch": 0.6471650415312387, "grad_norm": 3.4168612957000732, "learning_rate": 9.355491329479768e-07, "logits/chosen": -2.392885446548462, "logits/rejected": -2.4281060695648193, "logps/chosen": -43.67170333862305, "logps/rejected": -59.703975677490234, "loss": 0.7427, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18917107582092285, "rewards/margins": 1.5090937614440918, "rewards/rejected": -1.319922685623169, "step": 224 }, { "epoch": 0.6500541711809318, "grad_norm": 3.7614173889160156, "learning_rate": 9.352601156069364e-07, "logits/chosen": -2.36893630027771, "logits/rejected": -2.3739073276519775, "logps/chosen": -42.48311996459961, "logps/rejected": -65.38565063476562, "loss": 0.7162, "rewards/accuracies": 0.90625, "rewards/chosen": 0.33256396651268005, "rewards/margins": 1.8180973529815674, "rewards/rejected": -1.485533356666565, "step": 225 }, { "epoch": 0.6529433008306248, "grad_norm": 2.9030890464782715, "learning_rate": 9.34971098265896e-07, "logits/chosen": -2.3989081382751465, "logits/rejected": -2.472454071044922, "logps/chosen": -46.59208297729492, "logps/rejected": -73.7762680053711, "loss": 0.6385, "rewards/accuracies": 0.84375, "rewards/chosen": 0.07025131583213806, "rewards/margins": 2.3649282455444336, "rewards/rejected": -2.2946767807006836, "step": 226 }, { "epoch": 0.6558324304803178, "grad_norm": 3.3116471767425537, "learning_rate": 9.346820809248555e-07, "logits/chosen": -2.3760766983032227, "logits/rejected": -2.477018356323242, "logps/chosen": -42.72669982910156, "logps/rejected": -63.77056121826172, "loss": 0.6447, "rewards/accuracies": 0.875, "rewards/chosen": 0.4199894666671753, "rewards/margins": 2.064450263977051, "rewards/rejected": -1.644460916519165, "step": 227 }, { "epoch": 0.6587215601300108, "grad_norm": 3.2879390716552734, "learning_rate": 9.34393063583815e-07, "logits/chosen": -2.4289674758911133, "logits/rejected": -2.494957208633423, "logps/chosen": -44.9857063293457, "logps/rejected": -65.45561218261719, "loss": 0.6967, "rewards/accuracies": 0.84375, "rewards/chosen": 0.20772086083889008, "rewards/margins": 1.8624329566955566, "rewards/rejected": -1.654712200164795, "step": 228 }, { "epoch": 0.6616106897797038, "grad_norm": 2.827012777328491, "learning_rate": 9.341040462427745e-07, "logits/chosen": -2.409533977508545, "logits/rejected": -2.474635124206543, "logps/chosen": -41.5572509765625, "logps/rejected": -64.55766296386719, "loss": 0.6853, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5530557036399841, "rewards/margins": 1.9349128007888794, "rewards/rejected": -1.3818570375442505, "step": 229 }, { "epoch": 0.6644998194293968, "grad_norm": 3.1656312942504883, "learning_rate": 9.338150289017341e-07, "logits/chosen": -2.447114944458008, "logits/rejected": -2.438410758972168, "logps/chosen": -44.94252014160156, "logps/rejected": -64.98225402832031, "loss": 0.6756, "rewards/accuracies": 0.875, "rewards/chosen": 0.17447510361671448, "rewards/margins": 1.7838224172592163, "rewards/rejected": -1.6093473434448242, "step": 230 }, { "epoch": 0.66738894907909, "grad_norm": 2.7864890098571777, "learning_rate": 9.335260115606935e-07, "logits/chosen": -2.302251100540161, "logits/rejected": -2.44164776802063, "logps/chosen": -38.832847595214844, "logps/rejected": -64.41265106201172, "loss": 0.6117, "rewards/accuracies": 0.875, "rewards/chosen": 0.7420086860656738, "rewards/margins": 2.319333553314209, "rewards/rejected": -1.5773248672485352, "step": 231 }, { "epoch": 0.670278078728783, "grad_norm": 3.271606206893921, "learning_rate": 9.332369942196532e-07, "logits/chosen": -2.4246320724487305, "logits/rejected": -2.4531314373016357, "logps/chosen": -45.94568634033203, "logps/rejected": -62.88578414916992, "loss": 0.748, "rewards/accuracies": 0.84375, "rewards/chosen": -0.076453298330307, "rewards/margins": 1.5326231718063354, "rewards/rejected": -1.6090763807296753, "step": 232 }, { "epoch": 0.673167208378476, "grad_norm": 3.1658873558044434, "learning_rate": 9.329479768786127e-07, "logits/chosen": -2.4460442066192627, "logits/rejected": -2.502411365509033, "logps/chosen": -46.970458984375, "logps/rejected": -64.1877670288086, "loss": 0.7558, "rewards/accuracies": 0.84375, "rewards/chosen": 0.07559677213430405, "rewards/margins": 1.580013394355774, "rewards/rejected": -1.5044167041778564, "step": 233 }, { "epoch": 0.676056338028169, "grad_norm": 3.6443798542022705, "learning_rate": 9.326589595375722e-07, "logits/chosen": -2.5317306518554688, "logits/rejected": -2.522693157196045, "logps/chosen": -45.432621002197266, "logps/rejected": -64.5593032836914, "loss": 0.7491, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13109466433525085, "rewards/margins": 1.5544226169586182, "rewards/rejected": -1.6855171918869019, "step": 234 }, { "epoch": 0.678945467677862, "grad_norm": 3.142045497894287, "learning_rate": 9.323699421965317e-07, "logits/chosen": -2.3829150199890137, "logits/rejected": -2.435121536254883, "logps/chosen": -44.225318908691406, "logps/rejected": -64.57733917236328, "loss": 0.7545, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04684029147028923, "rewards/margins": 1.5296306610107422, "rewards/rejected": -1.482790470123291, "step": 235 }, { "epoch": 0.681834597327555, "grad_norm": 3.54897141456604, "learning_rate": 9.320809248554913e-07, "logits/chosen": -2.394425392150879, "logits/rejected": -2.4788336753845215, "logps/chosen": -42.197574615478516, "logps/rejected": -63.33639144897461, "loss": 0.6398, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5247761011123657, "rewards/margins": 2.240112543106079, "rewards/rejected": -1.7153362035751343, "step": 236 }, { "epoch": 0.6847237269772481, "grad_norm": 3.614816427230835, "learning_rate": 9.317919075144509e-07, "logits/chosen": -2.430112838745117, "logits/rejected": -2.46321964263916, "logps/chosen": -42.23335647583008, "logps/rejected": -65.45967102050781, "loss": 0.7059, "rewards/accuracies": 0.875, "rewards/chosen": 0.1677156388759613, "rewards/margins": 2.0227181911468506, "rewards/rejected": -1.8550026416778564, "step": 237 }, { "epoch": 0.6876128566269412, "grad_norm": 3.1061975955963135, "learning_rate": 9.315028901734103e-07, "logits/chosen": -2.4547650814056396, "logits/rejected": -2.5138602256774902, "logps/chosen": -48.07855224609375, "logps/rejected": -68.40777587890625, "loss": 0.7096, "rewards/accuracies": 0.90625, "rewards/chosen": 0.031758278608322144, "rewards/margins": 1.819270372390747, "rewards/rejected": -1.7875120639801025, "step": 238 }, { "epoch": 0.6905019862766342, "grad_norm": 3.029616355895996, "learning_rate": 9.312138728323699e-07, "logits/chosen": -2.4792447090148926, "logits/rejected": -2.5177829265594482, "logps/chosen": -42.31736373901367, "logps/rejected": -65.71548461914062, "loss": 0.6397, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3538481295108795, "rewards/margins": 2.357322931289673, "rewards/rejected": -2.003474712371826, "step": 239 }, { "epoch": 0.6933911159263272, "grad_norm": 3.469893217086792, "learning_rate": 9.309248554913295e-07, "logits/chosen": -2.358518362045288, "logits/rejected": -2.410309076309204, "logps/chosen": -48.300601959228516, "logps/rejected": -62.81208038330078, "loss": 0.801, "rewards/accuracies": 0.78125, "rewards/chosen": -0.026813454926013947, "rewards/margins": 1.167003870010376, "rewards/rejected": -1.193817377090454, "step": 240 }, { "epoch": 0.6962802455760202, "grad_norm": 3.3930399417877197, "learning_rate": 9.30635838150289e-07, "logits/chosen": -2.3976480960845947, "logits/rejected": -2.455986976623535, "logps/chosen": -36.520626068115234, "logps/rejected": -60.392677307128906, "loss": 0.6563, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7650952339172363, "rewards/margins": 2.204737663269043, "rewards/rejected": -1.4396425485610962, "step": 241 }, { "epoch": 0.6991693752257132, "grad_norm": 3.051813840866089, "learning_rate": 9.303468208092485e-07, "logits/chosen": -2.425654172897339, "logits/rejected": -2.4463047981262207, "logps/chosen": -50.015018463134766, "logps/rejected": -70.49362182617188, "loss": 0.7234, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2034038007259369, "rewards/margins": 1.8485537767410278, "rewards/rejected": -2.051957607269287, "step": 242 }, { "epoch": 0.7020585048754063, "grad_norm": 3.5102062225341797, "learning_rate": 9.300578034682081e-07, "logits/chosen": -2.462451457977295, "logits/rejected": -2.4227795600891113, "logps/chosen": -43.86286926269531, "logps/rejected": -70.74295043945312, "loss": 0.6512, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2810259163379669, "rewards/margins": 2.4676151275634766, "rewards/rejected": -2.186589479446411, "step": 243 }, { "epoch": 0.7049476345250993, "grad_norm": 3.3236570358276367, "learning_rate": 9.297687861271676e-07, "logits/chosen": -2.445957660675049, "logits/rejected": -2.499586582183838, "logps/chosen": -38.89114761352539, "logps/rejected": -65.3713150024414, "loss": 0.6006, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6739312410354614, "rewards/margins": 2.4846673011779785, "rewards/rejected": -1.8107361793518066, "step": 244 }, { "epoch": 0.7078367641747924, "grad_norm": 4.141207695007324, "learning_rate": 9.294797687861271e-07, "logits/chosen": -2.4112517833709717, "logits/rejected": -2.489060640335083, "logps/chosen": -44.63983154296875, "logps/rejected": -61.65996551513672, "loss": 0.6738, "rewards/accuracies": 0.84375, "rewards/chosen": 0.35495948791503906, "rewards/margins": 1.922616720199585, "rewards/rejected": -1.5676573514938354, "step": 245 }, { "epoch": 0.7107258938244854, "grad_norm": 3.640693426132202, "learning_rate": 9.291907514450866e-07, "logits/chosen": -2.4216229915618896, "logits/rejected": -2.456334352493286, "logps/chosen": -40.318424224853516, "logps/rejected": -61.89943313598633, "loss": 0.6885, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5388501882553101, "rewards/margins": 1.8666306734085083, "rewards/rejected": -1.3277804851531982, "step": 246 }, { "epoch": 0.7136150234741784, "grad_norm": 3.4722509384155273, "learning_rate": 9.289017341040462e-07, "logits/chosen": -2.4019992351531982, "logits/rejected": -2.4723360538482666, "logps/chosen": -39.1639518737793, "logps/rejected": -61.585968017578125, "loss": 0.663, "rewards/accuracies": 0.90625, "rewards/chosen": 0.44599443674087524, "rewards/margins": 1.8800784349441528, "rewards/rejected": -1.4340838193893433, "step": 247 }, { "epoch": 0.7165041531238714, "grad_norm": 4.178869724273682, "learning_rate": 9.286127167630058e-07, "logits/chosen": -2.38395619392395, "logits/rejected": -2.4707727432250977, "logps/chosen": -46.04563903808594, "logps/rejected": -68.51762390136719, "loss": 0.6849, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09884285926818848, "rewards/margins": 2.0691301822662354, "rewards/rejected": -2.1679728031158447, "step": 248 }, { "epoch": 0.7193932827735645, "grad_norm": 4.01619291305542, "learning_rate": 9.283236994219653e-07, "logits/chosen": -2.3799002170562744, "logits/rejected": -2.4336538314819336, "logps/chosen": -38.75762939453125, "logps/rejected": -70.06504821777344, "loss": 0.5869, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6923506855964661, "rewards/margins": 2.5831894874572754, "rewards/rejected": -1.8908389806747437, "step": 249 }, { "epoch": 0.7222824124232575, "grad_norm": 3.2909371852874756, "learning_rate": 9.280346820809249e-07, "logits/chosen": -2.4959113597869873, "logits/rejected": -2.523707151412964, "logps/chosen": -48.14604949951172, "logps/rejected": -67.05722045898438, "loss": 0.7013, "rewards/accuracies": 0.875, "rewards/chosen": 0.196534663438797, "rewards/margins": 1.9859517812728882, "rewards/rejected": -1.789417028427124, "step": 250 }, { "epoch": 0.7251715420729505, "grad_norm": 3.2420284748077393, "learning_rate": 9.277456647398843e-07, "logits/chosen": -2.518670082092285, "logits/rejected": -2.6291985511779785, "logps/chosen": -41.96446990966797, "logps/rejected": -67.37039184570312, "loss": 0.621, "rewards/accuracies": 0.90625, "rewards/chosen": 0.27726656198501587, "rewards/margins": 2.2622499465942383, "rewards/rejected": -1.9849834442138672, "step": 251 }, { "epoch": 0.7280606717226435, "grad_norm": 3.121448040008545, "learning_rate": 9.274566473988439e-07, "logits/chosen": -2.3559794425964355, "logits/rejected": -2.4928174018859863, "logps/chosen": -41.637420654296875, "logps/rejected": -65.81178283691406, "loss": 0.6272, "rewards/accuracies": 0.9375, "rewards/chosen": 0.592205286026001, "rewards/margins": 2.237271785736084, "rewards/rejected": -1.6450666189193726, "step": 252 }, { "epoch": 0.7309498013723366, "grad_norm": 3.2870371341705322, "learning_rate": 9.271676300578034e-07, "logits/chosen": -2.398446559906006, "logits/rejected": -2.4768240451812744, "logps/chosen": -45.6719970703125, "logps/rejected": -65.17205047607422, "loss": 0.6698, "rewards/accuracies": 0.875, "rewards/chosen": 0.10151119530200958, "rewards/margins": 1.8878767490386963, "rewards/rejected": -1.7863656282424927, "step": 253 }, { "epoch": 0.7338389310220296, "grad_norm": 3.28837251663208, "learning_rate": 9.26878612716763e-07, "logits/chosen": -2.4255053997039795, "logits/rejected": -2.4748618602752686, "logps/chosen": -44.71167755126953, "logps/rejected": -68.60025024414062, "loss": 0.6669, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1821301281452179, "rewards/margins": 2.261673927307129, "rewards/rejected": -2.079543352127075, "step": 254 }, { "epoch": 0.7367280606717227, "grad_norm": 3.2449686527252197, "learning_rate": 9.265895953757224e-07, "logits/chosen": -2.5110151767730713, "logits/rejected": -2.497730255126953, "logps/chosen": -44.614463806152344, "logps/rejected": -64.130859375, "loss": 0.7654, "rewards/accuracies": 0.78125, "rewards/chosen": 0.04725974425673485, "rewards/margins": 1.6098748445510864, "rewards/rejected": -1.562615156173706, "step": 255 }, { "epoch": 0.7396171903214157, "grad_norm": 4.386717319488525, "learning_rate": 9.263005780346821e-07, "logits/chosen": -2.381074905395508, "logits/rejected": -2.4651596546173096, "logps/chosen": -47.64090347290039, "logps/rejected": -64.15583038330078, "loss": 0.7095, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19230279326438904, "rewards/margins": 1.6889746189117432, "rewards/rejected": -1.4966716766357422, "step": 256 }, { "epoch": 0.7425063199711087, "grad_norm": 3.3001387119293213, "learning_rate": 9.260115606936416e-07, "logits/chosen": -2.480133533477783, "logits/rejected": -2.5944204330444336, "logps/chosen": -40.53097152709961, "logps/rejected": -68.91915130615234, "loss": 0.6043, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4694287180900574, "rewards/margins": 2.428231716156006, "rewards/rejected": -1.9588031768798828, "step": 257 }, { "epoch": 0.7453954496208017, "grad_norm": 3.23453426361084, "learning_rate": 9.257225433526011e-07, "logits/chosen": -2.5030901432037354, "logits/rejected": -2.534526824951172, "logps/chosen": -45.12353515625, "logps/rejected": -63.57447814941406, "loss": 0.7451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.015907496213912964, "rewards/margins": 1.6678990125656128, "rewards/rejected": -1.6838066577911377, "step": 258 }, { "epoch": 0.7482845792704947, "grad_norm": 4.275283336639404, "learning_rate": 9.254335260115607e-07, "logits/chosen": -2.395054817199707, "logits/rejected": -2.462977409362793, "logps/chosen": -45.91474914550781, "logps/rejected": -71.77691650390625, "loss": 0.6427, "rewards/accuracies": 0.84375, "rewards/chosen": 0.011908292770385742, "rewards/margins": 2.157731533050537, "rewards/rejected": -2.1458234786987305, "step": 259 }, { "epoch": 0.7511737089201878, "grad_norm": 3.0128467082977295, "learning_rate": 9.251445086705202e-07, "logits/chosen": -2.3830103874206543, "logits/rejected": -2.494330883026123, "logps/chosen": -40.92268371582031, "logps/rejected": -70.515380859375, "loss": 0.5992, "rewards/accuracies": 1.0, "rewards/chosen": 0.40378421545028687, "rewards/margins": 2.6742103099823, "rewards/rejected": -2.270426034927368, "step": 260 }, { "epoch": 0.7540628385698809, "grad_norm": 3.840834856033325, "learning_rate": 9.248554913294797e-07, "logits/chosen": -2.475538730621338, "logits/rejected": -2.5120649337768555, "logps/chosen": -47.48195266723633, "logps/rejected": -71.46372985839844, "loss": 0.7088, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22744929790496826, "rewards/margins": 2.162015676498413, "rewards/rejected": -2.389465093612671, "step": 261 }, { "epoch": 0.7569519682195739, "grad_norm": 4.022007942199707, "learning_rate": 9.245664739884392e-07, "logits/chosen": -2.3699288368225098, "logits/rejected": -2.4191534519195557, "logps/chosen": -45.758148193359375, "logps/rejected": -66.73216247558594, "loss": 0.7236, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1233632043004036, "rewards/margins": 1.4373282194137573, "rewards/rejected": -1.5606913566589355, "step": 262 }, { "epoch": 0.7598410978692669, "grad_norm": 3.9498722553253174, "learning_rate": 9.242774566473988e-07, "logits/chosen": -2.484650135040283, "logits/rejected": -2.560020685195923, "logps/chosen": -45.565269470214844, "logps/rejected": -74.2047348022461, "loss": 0.6711, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18025030195713043, "rewards/margins": 2.5487356185913086, "rewards/rejected": -2.368485450744629, "step": 263 }, { "epoch": 0.7627302275189599, "grad_norm": 3.883610486984253, "learning_rate": 9.239884393063584e-07, "logits/chosen": -2.489962100982666, "logits/rejected": -2.532402753829956, "logps/chosen": -45.92567443847656, "logps/rejected": -67.43284606933594, "loss": 0.7299, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10026935487985611, "rewards/margins": 2.0646140575408936, "rewards/rejected": -1.96434485912323, "step": 264 }, { "epoch": 0.7656193571686529, "grad_norm": 3.1797797679901123, "learning_rate": 9.236994219653179e-07, "logits/chosen": -2.4063587188720703, "logits/rejected": -2.5504889488220215, "logps/chosen": -37.751129150390625, "logps/rejected": -67.85531616210938, "loss": 0.5263, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7012264728546143, "rewards/margins": 2.942140817642212, "rewards/rejected": -2.2409143447875977, "step": 265 }, { "epoch": 0.7685084868183459, "grad_norm": 3.485259771347046, "learning_rate": 9.234104046242774e-07, "logits/chosen": -2.3995540142059326, "logits/rejected": -2.458651065826416, "logps/chosen": -43.20692443847656, "logps/rejected": -64.39857482910156, "loss": 0.6466, "rewards/accuracies": 0.90625, "rewards/chosen": 0.17133493721485138, "rewards/margins": 2.097611427307129, "rewards/rejected": -1.926276445388794, "step": 266 }, { "epoch": 0.771397616468039, "grad_norm": 3.5613832473754883, "learning_rate": 9.23121387283237e-07, "logits/chosen": -2.4345486164093018, "logits/rejected": -2.5327308177948, "logps/chosen": -44.759273529052734, "logps/rejected": -69.31208801269531, "loss": 0.6609, "rewards/accuracies": 0.9375, "rewards/chosen": 0.27498704195022583, "rewards/margins": 2.2735865116119385, "rewards/rejected": -1.9985995292663574, "step": 267 }, { "epoch": 0.7742867461177321, "grad_norm": 3.117969512939453, "learning_rate": 9.228323699421964e-07, "logits/chosen": -2.393704891204834, "logits/rejected": -2.4577081203460693, "logps/chosen": -40.83298110961914, "logps/rejected": -58.384944915771484, "loss": 0.7593, "rewards/accuracies": 0.8125, "rewards/chosen": 0.45059508085250854, "rewards/margins": 1.629565954208374, "rewards/rejected": -1.1789709329605103, "step": 268 }, { "epoch": 0.7771758757674251, "grad_norm": 3.6475157737731934, "learning_rate": 9.22543352601156e-07, "logits/chosen": -2.5049936771392822, "logits/rejected": -2.4979405403137207, "logps/chosen": -38.867713928222656, "logps/rejected": -67.32798767089844, "loss": 0.5998, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6716524958610535, "rewards/margins": 2.5514330863952637, "rewards/rejected": -1.8797805309295654, "step": 269 }, { "epoch": 0.7800650054171181, "grad_norm": 4.0406012535095215, "learning_rate": 9.222543352601156e-07, "logits/chosen": -2.476184129714966, "logits/rejected": -2.471676826477051, "logps/chosen": -42.27669906616211, "logps/rejected": -68.86135864257812, "loss": 0.6732, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30062276124954224, "rewards/margins": 2.229818344116211, "rewards/rejected": -1.9291954040527344, "step": 270 }, { "epoch": 0.7829541350668111, "grad_norm": 3.234572172164917, "learning_rate": 9.219653179190751e-07, "logits/chosen": -2.405979633331299, "logits/rejected": -2.524130344390869, "logps/chosen": -41.15300369262695, "logps/rejected": -65.87324523925781, "loss": 0.5823, "rewards/accuracies": 0.90625, "rewards/chosen": 0.42995625734329224, "rewards/margins": 2.4747872352600098, "rewards/rejected": -2.0448312759399414, "step": 271 }, { "epoch": 0.7858432647165041, "grad_norm": 3.833636522293091, "learning_rate": 9.216763005780347e-07, "logits/chosen": -2.4554195404052734, "logits/rejected": -2.539419651031494, "logps/chosen": -44.6583137512207, "logps/rejected": -72.87726593017578, "loss": 0.6459, "rewards/accuracies": 0.96875, "rewards/chosen": 0.24064956605434418, "rewards/margins": 2.404108762741089, "rewards/rejected": -2.163459300994873, "step": 272 }, { "epoch": 0.7887323943661971, "grad_norm": 3.102644681930542, "learning_rate": 9.213872832369942e-07, "logits/chosen": -2.4764914512634277, "logits/rejected": -2.5330934524536133, "logps/chosen": -49.97911071777344, "logps/rejected": -70.9179916381836, "loss": 0.7503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3866407573223114, "rewards/margins": 1.7446074485778809, "rewards/rejected": -2.1312482357025146, "step": 273 }, { "epoch": 0.7916215240158903, "grad_norm": 3.787971019744873, "learning_rate": 9.210982658959538e-07, "logits/chosen": -2.566422939300537, "logits/rejected": -2.5850980281829834, "logps/chosen": -39.56183624267578, "logps/rejected": -61.10340881347656, "loss": 0.7019, "rewards/accuracies": 0.875, "rewards/chosen": 0.5593673586845398, "rewards/margins": 2.172682285308838, "rewards/rejected": -1.6133151054382324, "step": 274 }, { "epoch": 0.7945106536655833, "grad_norm": 3.617879629135132, "learning_rate": 9.208092485549132e-07, "logits/chosen": -2.504697322845459, "logits/rejected": -2.5740532875061035, "logps/chosen": -49.80826950073242, "logps/rejected": -67.46434020996094, "loss": 0.7084, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08540436625480652, "rewards/margins": 1.950144648551941, "rewards/rejected": -2.035548686981201, "step": 275 }, { "epoch": 0.7973997833152763, "grad_norm": 3.2384798526763916, "learning_rate": 9.205202312138728e-07, "logits/chosen": -2.380239963531494, "logits/rejected": -2.534738302230835, "logps/chosen": -43.29826736450195, "logps/rejected": -72.6145248413086, "loss": 0.6382, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39506906270980835, "rewards/margins": 2.7234039306640625, "rewards/rejected": -2.3283348083496094, "step": 276 }, { "epoch": 0.8002889129649693, "grad_norm": 3.4461843967437744, "learning_rate": 9.202312138728323e-07, "logits/chosen": -2.4549002647399902, "logits/rejected": -2.522564172744751, "logps/chosen": -46.93656921386719, "logps/rejected": -71.52021789550781, "loss": 0.6924, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009806867688894272, "rewards/margins": 2.020381450653076, "rewards/rejected": -2.0301883220672607, "step": 277 }, { "epoch": 0.8031780426146623, "grad_norm": 3.469305992126465, "learning_rate": 9.199421965317918e-07, "logits/chosen": -2.3956665992736816, "logits/rejected": -2.510859966278076, "logps/chosen": -40.31458282470703, "logps/rejected": -64.0567855834961, "loss": 0.6633, "rewards/accuracies": 0.90625, "rewards/chosen": 0.560263991355896, "rewards/margins": 2.0939269065856934, "rewards/rejected": -1.5336626768112183, "step": 278 }, { "epoch": 0.8060671722643553, "grad_norm": 3.5162360668182373, "learning_rate": 9.196531791907513e-07, "logits/chosen": -2.411567211151123, "logits/rejected": -2.5344676971435547, "logps/chosen": -36.35032653808594, "logps/rejected": -64.64751434326172, "loss": 0.5752, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8904430270195007, "rewards/margins": 2.6893656253814697, "rewards/rejected": -1.7989225387573242, "step": 279 }, { "epoch": 0.8089563019140484, "grad_norm": 3.368431568145752, "learning_rate": 9.19364161849711e-07, "logits/chosen": -2.432854175567627, "logits/rejected": -2.5339980125427246, "logps/chosen": -39.98001480102539, "logps/rejected": -70.32357025146484, "loss": 0.6312, "rewards/accuracies": 0.96875, "rewards/chosen": 0.207392156124115, "rewards/margins": 2.4195663928985596, "rewards/rejected": -2.2121741771698, "step": 280 }, { "epoch": 0.8089563019140484, "eval_logits/chosen": -2.4886653423309326, "eval_logits/rejected": -2.5542562007904053, "eval_logps/chosen": -41.619781494140625, "eval_logps/rejected": -67.21428680419922, "eval_loss": 0.6253829598426819, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.4696633517742157, "eval_rewards/margins": 2.4960739612579346, "eval_rewards/rejected": -2.0264108180999756, "eval_runtime": 226.1979, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.274, "step": 280 }, { "epoch": 0.8118454315637414, "grad_norm": 3.9951047897338867, "learning_rate": 9.190751445086706e-07, "logits/chosen": -2.552478551864624, "logits/rejected": -2.6336252689361572, "logps/chosen": -35.975074768066406, "logps/rejected": -60.37914276123047, "loss": 0.6043, "rewards/accuracies": 0.9375, "rewards/chosen": 1.072455883026123, "rewards/margins": 2.4385673999786377, "rewards/rejected": -1.3661115169525146, "step": 281 }, { "epoch": 0.8147345612134345, "grad_norm": 4.034847736358643, "learning_rate": 9.1878612716763e-07, "logits/chosen": -2.455024242401123, "logits/rejected": -2.4704644680023193, "logps/chosen": -42.213802337646484, "logps/rejected": -69.82737731933594, "loss": 0.5977, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3700176775455475, "rewards/margins": 2.4584693908691406, "rewards/rejected": -2.088451385498047, "step": 282 }, { "epoch": 0.8176236908631275, "grad_norm": 4.204030513763428, "learning_rate": 9.184971098265896e-07, "logits/chosen": -2.528496503829956, "logits/rejected": -2.5552875995635986, "logps/chosen": -37.135902404785156, "logps/rejected": -63.24402618408203, "loss": 0.6422, "rewards/accuracies": 0.875, "rewards/chosen": 0.8916536569595337, "rewards/margins": 2.2236905097961426, "rewards/rejected": -1.3320367336273193, "step": 283 }, { "epoch": 0.8205128205128205, "grad_norm": 3.6883456707000732, "learning_rate": 9.182080924855491e-07, "logits/chosen": -2.4612183570861816, "logits/rejected": -2.5836119651794434, "logps/chosen": -35.44035339355469, "logps/rejected": -64.86527252197266, "loss": 0.6298, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7473670840263367, "rewards/margins": 2.4685230255126953, "rewards/rejected": -1.7211557626724243, "step": 284 }, { "epoch": 0.8234019501625135, "grad_norm": 3.0016567707061768, "learning_rate": 9.179190751445086e-07, "logits/chosen": -2.459878921508789, "logits/rejected": -2.4889872074127197, "logps/chosen": -39.882423400878906, "logps/rejected": -68.34754180908203, "loss": 0.6361, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5655270218849182, "rewards/margins": 2.687227725982666, "rewards/rejected": -2.1217007637023926, "step": 285 }, { "epoch": 0.8262910798122066, "grad_norm": 3.553039789199829, "learning_rate": 9.176300578034681e-07, "logits/chosen": -2.4982852935791016, "logits/rejected": -2.597928285598755, "logps/chosen": -41.47071075439453, "logps/rejected": -70.321044921875, "loss": 0.5719, "rewards/accuracies": 1.0, "rewards/chosen": 0.5355979204177856, "rewards/margins": 2.8970491886138916, "rewards/rejected": -2.3614513874053955, "step": 286 }, { "epoch": 0.8291802094618996, "grad_norm": 4.1403608322143555, "learning_rate": 9.173410404624277e-07, "logits/chosen": -2.5399892330169678, "logits/rejected": -2.6276533603668213, "logps/chosen": -41.49819564819336, "logps/rejected": -69.3299789428711, "loss": 0.6659, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39784497022628784, "rewards/margins": 2.2857861518859863, "rewards/rejected": -1.8879411220550537, "step": 287 }, { "epoch": 0.8320693391115926, "grad_norm": 4.009652614593506, "learning_rate": 9.170520231213873e-07, "logits/chosen": -2.5400567054748535, "logits/rejected": -2.621927261352539, "logps/chosen": -47.09807205200195, "logps/rejected": -73.31185913085938, "loss": 0.6377, "rewards/accuracies": 0.90625, "rewards/chosen": 0.336742103099823, "rewards/margins": 2.902808427810669, "rewards/rejected": -2.5660665035247803, "step": 288 }, { "epoch": 0.8349584687612857, "grad_norm": 3.883091688156128, "learning_rate": 9.167630057803468e-07, "logits/chosen": -2.4889349937438965, "logits/rejected": -2.652920722961426, "logps/chosen": -44.13095474243164, "logps/rejected": -71.95037841796875, "loss": 0.6349, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2663761377334595, "rewards/margins": 2.6645655632019043, "rewards/rejected": -2.398189067840576, "step": 289 }, { "epoch": 0.8378475984109787, "grad_norm": 5.977077484130859, "learning_rate": 9.164739884393063e-07, "logits/chosen": -2.453486442565918, "logits/rejected": -2.5640861988067627, "logps/chosen": -37.9553337097168, "logps/rejected": -71.37313842773438, "loss": 0.5338, "rewards/accuracies": 0.9375, "rewards/chosen": 1.005055546760559, "rewards/margins": 3.201676845550537, "rewards/rejected": -2.1966214179992676, "step": 290 }, { "epoch": 0.8407367280606717, "grad_norm": 3.615544080734253, "learning_rate": 9.161849710982659e-07, "logits/chosen": -2.462028980255127, "logits/rejected": -2.5216217041015625, "logps/chosen": -43.958919525146484, "logps/rejected": -62.56788635253906, "loss": 0.696, "rewards/accuracies": 0.84375, "rewards/chosen": 0.16503024101257324, "rewards/margins": 1.8514580726623535, "rewards/rejected": -1.6864278316497803, "step": 291 }, { "epoch": 0.8436258577103648, "grad_norm": 3.5122017860412598, "learning_rate": 9.158959537572254e-07, "logits/chosen": -2.4896271228790283, "logits/rejected": -2.568814277648926, "logps/chosen": -35.9182243347168, "logps/rejected": -67.78746795654297, "loss": 0.5411, "rewards/accuracies": 1.0, "rewards/chosen": 0.847554624080658, "rewards/margins": 3.045525550842285, "rewards/rejected": -2.1979708671569824, "step": 292 }, { "epoch": 0.8465149873600578, "grad_norm": 3.5581135749816895, "learning_rate": 9.156069364161849e-07, "logits/chosen": -2.4738845825195312, "logits/rejected": -2.5835037231445312, "logps/chosen": -35.21149826049805, "logps/rejected": -64.04547882080078, "loss": 0.5996, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0379579067230225, "rewards/margins": 2.5596566200256348, "rewards/rejected": -1.5216988325119019, "step": 293 }, { "epoch": 0.8494041170097508, "grad_norm": 5.401801109313965, "learning_rate": 9.153179190751445e-07, "logits/chosen": -2.5522656440734863, "logits/rejected": -2.661315441131592, "logps/chosen": -40.49526596069336, "logps/rejected": -68.09928131103516, "loss": 0.6614, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5266401767730713, "rewards/margins": 2.620157241821289, "rewards/rejected": -2.093517303466797, "step": 294 }, { "epoch": 0.8522932466594438, "grad_norm": 4.048588275909424, "learning_rate": 9.150289017341039e-07, "logits/chosen": -2.475501537322998, "logits/rejected": -2.5921573638916016, "logps/chosen": -40.697349548339844, "logps/rejected": -66.0309066772461, "loss": 0.5905, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7147305607795715, "rewards/margins": 2.4487221240997314, "rewards/rejected": -1.7339916229248047, "step": 295 }, { "epoch": 0.8551823763091368, "grad_norm": 3.515406608581543, "learning_rate": 9.147398843930635e-07, "logits/chosen": -2.534817695617676, "logits/rejected": -2.6662962436676025, "logps/chosen": -39.72247314453125, "logps/rejected": -76.31913757324219, "loss": 0.5641, "rewards/accuracies": 1.0, "rewards/chosen": 0.6094422936439514, "rewards/margins": 3.331801652908325, "rewards/rejected": -2.7223594188690186, "step": 296 }, { "epoch": 0.8580715059588299, "grad_norm": 3.912428617477417, "learning_rate": 9.144508670520231e-07, "logits/chosen": -2.4501607418060303, "logits/rejected": -2.5892672538757324, "logps/chosen": -43.91957092285156, "logps/rejected": -65.66881561279297, "loss": 0.6975, "rewards/accuracies": 0.875, "rewards/chosen": 0.34623777866363525, "rewards/margins": 1.9166061878204346, "rewards/rejected": -1.5703685283660889, "step": 297 }, { "epoch": 0.860960635608523, "grad_norm": 3.3454926013946533, "learning_rate": 9.141618497109827e-07, "logits/chosen": -2.6113290786743164, "logits/rejected": -2.5816550254821777, "logps/chosen": -44.269386291503906, "logps/rejected": -68.05392456054688, "loss": 0.7432, "rewards/accuracies": 0.875, "rewards/chosen": 0.1852259784936905, "rewards/margins": 2.204892873764038, "rewards/rejected": -2.019666910171509, "step": 298 }, { "epoch": 0.863849765258216, "grad_norm": 3.502080202102661, "learning_rate": 9.138728323699421e-07, "logits/chosen": -2.4228122234344482, "logits/rejected": -2.539158582687378, "logps/chosen": -33.781959533691406, "logps/rejected": -60.97136688232422, "loss": 0.5742, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9389117956161499, "rewards/margins": 2.4651377201080322, "rewards/rejected": -1.5262258052825928, "step": 299 }, { "epoch": 0.866738894907909, "grad_norm": 3.0817880630493164, "learning_rate": 9.135838150289017e-07, "logits/chosen": -2.511807918548584, "logits/rejected": -2.590658187866211, "logps/chosen": -42.36675262451172, "logps/rejected": -69.91401672363281, "loss": 0.6602, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5288019180297852, "rewards/margins": 2.4033119678497314, "rewards/rejected": -1.874510407447815, "step": 300 }, { "epoch": 0.869628024557602, "grad_norm": 3.33506178855896, "learning_rate": 9.132947976878612e-07, "logits/chosen": -2.5483992099761963, "logits/rejected": -2.5716564655303955, "logps/chosen": -45.75339126586914, "logps/rejected": -66.6498031616211, "loss": 0.675, "rewards/accuracies": 0.90625, "rewards/chosen": 0.34455955028533936, "rewards/margins": 2.1185386180877686, "rewards/rejected": -1.7739789485931396, "step": 301 }, { "epoch": 0.872517154207295, "grad_norm": 3.0678393840789795, "learning_rate": 9.130057803468207e-07, "logits/chosen": -2.424287796020508, "logits/rejected": -2.6127328872680664, "logps/chosen": -35.42959213256836, "logps/rejected": -67.87315368652344, "loss": 0.5149, "rewards/accuracies": 1.0, "rewards/chosen": 1.0535515546798706, "rewards/margins": 3.268799066543579, "rewards/rejected": -2.215247392654419, "step": 302 }, { "epoch": 0.875406283856988, "grad_norm": 3.4821019172668457, "learning_rate": 9.127167630057803e-07, "logits/chosen": -2.5368146896362305, "logits/rejected": -2.6703102588653564, "logps/chosen": -38.7606315612793, "logps/rejected": -61.090553283691406, "loss": 0.6639, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8089661598205566, "rewards/margins": 2.4483702182769775, "rewards/rejected": -1.6394039392471313, "step": 303 }, { "epoch": 0.8782954135066812, "grad_norm": 4.079732894897461, "learning_rate": 9.124277456647398e-07, "logits/chosen": -2.453587293624878, "logits/rejected": -2.5938048362731934, "logps/chosen": -49.1068000793457, "logps/rejected": -70.29792785644531, "loss": 0.7534, "rewards/accuracies": 0.84375, "rewards/chosen": 0.08599613606929779, "rewards/margins": 1.8649377822875977, "rewards/rejected": -1.7789416313171387, "step": 304 }, { "epoch": 0.8811845431563742, "grad_norm": 3.542505979537964, "learning_rate": 9.121387283236995e-07, "logits/chosen": -2.540121078491211, "logits/rejected": -2.642054557800293, "logps/chosen": -33.04487228393555, "logps/rejected": -67.15391540527344, "loss": 0.5384, "rewards/accuracies": 1.0, "rewards/chosen": 1.0053293704986572, "rewards/margins": 3.163139581680298, "rewards/rejected": -2.1578099727630615, "step": 305 }, { "epoch": 0.8840736728060672, "grad_norm": 4.178450584411621, "learning_rate": 9.118497109826589e-07, "logits/chosen": -2.533841848373413, "logits/rejected": -2.6150870323181152, "logps/chosen": -40.57527160644531, "logps/rejected": -74.33956909179688, "loss": 0.5884, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5392240285873413, "rewards/margins": 2.8842785358428955, "rewards/rejected": -2.3450541496276855, "step": 306 }, { "epoch": 0.8869628024557602, "grad_norm": 3.3869917392730713, "learning_rate": 9.115606936416185e-07, "logits/chosen": -2.587411880493164, "logits/rejected": -2.748539924621582, "logps/chosen": -38.92563247680664, "logps/rejected": -71.56584167480469, "loss": 0.5712, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6082615256309509, "rewards/margins": 3.0774667263031006, "rewards/rejected": -2.469204902648926, "step": 307 }, { "epoch": 0.8898519321054532, "grad_norm": 3.9779317378997803, "learning_rate": 9.11271676300578e-07, "logits/chosen": -2.4584460258483887, "logits/rejected": -2.6223058700561523, "logps/chosen": -37.00040817260742, "logps/rejected": -68.1635513305664, "loss": 0.6262, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7683927416801453, "rewards/margins": 2.813746929168701, "rewards/rejected": -2.0453543663024902, "step": 308 }, { "epoch": 0.8927410617551462, "grad_norm": 3.7044858932495117, "learning_rate": 9.109826589595375e-07, "logits/chosen": -2.4832444190979004, "logits/rejected": -2.615065097808838, "logps/chosen": -39.1903076171875, "logps/rejected": -62.02070617675781, "loss": 0.6747, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6972198486328125, "rewards/margins": 2.1993565559387207, "rewards/rejected": -1.5021368265151978, "step": 309 }, { "epoch": 0.8956301914048392, "grad_norm": 4.212526321411133, "learning_rate": 9.10693641618497e-07, "logits/chosen": -2.50657320022583, "logits/rejected": -2.6892480850219727, "logps/chosen": -35.599281311035156, "logps/rejected": -59.99262237548828, "loss": 0.66, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8573276400566101, "rewards/margins": 2.1136112213134766, "rewards/rejected": -1.2562836408615112, "step": 310 }, { "epoch": 0.8985193210545324, "grad_norm": 3.8935935497283936, "learning_rate": 9.104046242774566e-07, "logits/chosen": -2.554143190383911, "logits/rejected": -2.6395509243011475, "logps/chosen": -43.24195098876953, "logps/rejected": -68.70059204101562, "loss": 0.695, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1635969877243042, "rewards/margins": 2.3491814136505127, "rewards/rejected": -2.185584545135498, "step": 311 }, { "epoch": 0.9014084507042254, "grad_norm": 3.8799362182617188, "learning_rate": 9.10115606936416e-07, "logits/chosen": -2.513380527496338, "logits/rejected": -2.624927520751953, "logps/chosen": -38.35611343383789, "logps/rejected": -71.56069946289062, "loss": 0.5228, "rewards/accuracies": 0.96875, "rewards/chosen": 0.822606086730957, "rewards/margins": 2.965503454208374, "rewards/rejected": -2.142897605895996, "step": 312 }, { "epoch": 0.9042975803539184, "grad_norm": 10.589539527893066, "learning_rate": 9.098265895953757e-07, "logits/chosen": -2.4539265632629395, "logits/rejected": -2.611717700958252, "logps/chosen": -37.54859161376953, "logps/rejected": -63.97444534301758, "loss": 0.6597, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0226821899414062, "rewards/margins": 2.479374885559082, "rewards/rejected": -1.4566924571990967, "step": 313 }, { "epoch": 0.9071867100036114, "grad_norm": 4.473581314086914, "learning_rate": 9.095375722543353e-07, "logits/chosen": -2.546827554702759, "logits/rejected": -2.627554178237915, "logps/chosen": -43.40639114379883, "logps/rejected": -68.19022369384766, "loss": 0.6459, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4464014768600464, "rewards/margins": 2.5574398040771484, "rewards/rejected": -2.1110382080078125, "step": 314 }, { "epoch": 0.9100758396533044, "grad_norm": 3.3407227993011475, "learning_rate": 9.092485549132948e-07, "logits/chosen": -2.538987159729004, "logits/rejected": -2.630944013595581, "logps/chosen": -39.28359603881836, "logps/rejected": -66.84771728515625, "loss": 0.5962, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6391288042068481, "rewards/margins": 2.5313191413879395, "rewards/rejected": -1.8921902179718018, "step": 315 }, { "epoch": 0.9129649693029974, "grad_norm": 3.959900379180908, "learning_rate": 9.089595375722543e-07, "logits/chosen": -2.5553252696990967, "logits/rejected": -2.7143120765686035, "logps/chosen": -46.74531936645508, "logps/rejected": -71.58466339111328, "loss": 0.6398, "rewards/accuracies": 0.84375, "rewards/chosen": -0.019035007804632187, "rewards/margins": 2.3773717880249023, "rewards/rejected": -2.396406888961792, "step": 316 }, { "epoch": 0.9158540989526905, "grad_norm": 4.067241191864014, "learning_rate": 9.086705202312138e-07, "logits/chosen": -2.5827717781066895, "logits/rejected": -2.6258020401000977, "logps/chosen": -39.96958923339844, "logps/rejected": -64.82997131347656, "loss": 0.6848, "rewards/accuracies": 0.875, "rewards/chosen": 0.8043922185897827, "rewards/margins": 2.3543598651885986, "rewards/rejected": -1.549967885017395, "step": 317 }, { "epoch": 0.9187432286023836, "grad_norm": 4.309684753417969, "learning_rate": 9.083815028901734e-07, "logits/chosen": -2.594193935394287, "logits/rejected": -2.64048171043396, "logps/chosen": -50.887916564941406, "logps/rejected": -75.3998031616211, "loss": 0.6712, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2921513319015503, "rewards/margins": 2.485818386077881, "rewards/rejected": -2.7779698371887207, "step": 318 }, { "epoch": 0.9216323582520766, "grad_norm": 3.47698974609375, "learning_rate": 9.080924855491328e-07, "logits/chosen": -2.6102797985076904, "logits/rejected": -2.7170825004577637, "logps/chosen": -40.53758239746094, "logps/rejected": -69.62661743164062, "loss": 0.6956, "rewards/accuracies": 0.875, "rewards/chosen": 0.3017171621322632, "rewards/margins": 2.4922971725463867, "rewards/rejected": -2.190580129623413, "step": 319 }, { "epoch": 0.9245214879017696, "grad_norm": 4.052387714385986, "learning_rate": 9.078034682080924e-07, "logits/chosen": -2.535029411315918, "logits/rejected": -2.530482053756714, "logps/chosen": -43.616065979003906, "logps/rejected": -68.04935455322266, "loss": 0.7149, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2768706679344177, "rewards/margins": 2.0126953125, "rewards/rejected": -1.7358245849609375, "step": 320 }, { "epoch": 0.9274106175514626, "grad_norm": 4.2843475341796875, "learning_rate": 9.07514450867052e-07, "logits/chosen": -2.604017734527588, "logits/rejected": -2.7277121543884277, "logps/chosen": -36.27265167236328, "logps/rejected": -68.59797668457031, "loss": 0.5828, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7869418859481812, "rewards/margins": 2.8353772163391113, "rewards/rejected": -2.0484352111816406, "step": 321 }, { "epoch": 0.9302997472011556, "grad_norm": 3.943394660949707, "learning_rate": 9.072254335260116e-07, "logits/chosen": -2.5433437824249268, "logits/rejected": -2.6021742820739746, "logps/chosen": -39.89887237548828, "logps/rejected": -73.9595947265625, "loss": 0.5851, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4822889566421509, "rewards/margins": 3.230360507965088, "rewards/rejected": -2.7480711936950684, "step": 322 }, { "epoch": 0.9331888768508487, "grad_norm": 4.175582408905029, "learning_rate": 9.06936416184971e-07, "logits/chosen": -2.5564684867858887, "logits/rejected": -2.6773805618286133, "logps/chosen": -42.7093505859375, "logps/rejected": -79.44746398925781, "loss": 0.5403, "rewards/accuracies": 1.0, "rewards/chosen": 0.5161958932876587, "rewards/margins": 3.4385979175567627, "rewards/rejected": -2.9224023818969727, "step": 323 }, { "epoch": 0.9360780065005417, "grad_norm": 4.191171169281006, "learning_rate": 9.066473988439306e-07, "logits/chosen": -2.4692399501800537, "logits/rejected": -2.623198986053467, "logps/chosen": -44.751792907714844, "logps/rejected": -68.65464782714844, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": 0.42441681027412415, "rewards/margins": 2.565269947052002, "rewards/rejected": -2.140852928161621, "step": 324 }, { "epoch": 0.9389671361502347, "grad_norm": 3.6699161529541016, "learning_rate": 9.063583815028902e-07, "logits/chosen": -2.530395746231079, "logits/rejected": -2.6101040840148926, "logps/chosen": -36.472198486328125, "logps/rejected": -67.37355041503906, "loss": 0.5667, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0177892446517944, "rewards/margins": 2.997868776321411, "rewards/rejected": -1.9800792932510376, "step": 325 }, { "epoch": 0.9418562657999278, "grad_norm": 3.6798505783081055, "learning_rate": 9.060693641618496e-07, "logits/chosen": -2.4548022747039795, "logits/rejected": -2.620945692062378, "logps/chosen": -37.46780776977539, "logps/rejected": -77.91193389892578, "loss": 0.4886, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9554864764213562, "rewards/margins": 3.720430612564087, "rewards/rejected": -2.764943838119507, "step": 326 }, { "epoch": 0.9447453954496208, "grad_norm": 5.1189494132995605, "learning_rate": 9.057803468208092e-07, "logits/chosen": -2.5380563735961914, "logits/rejected": -2.657862901687622, "logps/chosen": -39.2568473815918, "logps/rejected": -73.6739501953125, "loss": 0.573, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6028609275817871, "rewards/margins": 3.080925703048706, "rewards/rejected": -2.478064775466919, "step": 327 }, { "epoch": 0.9476345250993138, "grad_norm": 3.8155901432037354, "learning_rate": 9.054913294797687e-07, "logits/chosen": -2.5188636779785156, "logits/rejected": -2.700515031814575, "logps/chosen": -37.44816970825195, "logps/rejected": -74.22052764892578, "loss": 0.5807, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9475712776184082, "rewards/margins": 3.4847822189331055, "rewards/rejected": -2.5372109413146973, "step": 328 }, { "epoch": 0.9505236547490069, "grad_norm": 6.08412504196167, "learning_rate": 9.052023121387284e-07, "logits/chosen": -2.501875400543213, "logits/rejected": -2.649557113647461, "logps/chosen": -36.36819076538086, "logps/rejected": -70.97380828857422, "loss": 0.5661, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9740036129951477, "rewards/margins": 3.071885108947754, "rewards/rejected": -2.09788179397583, "step": 329 }, { "epoch": 0.9534127843986999, "grad_norm": 3.481214761734009, "learning_rate": 9.049132947976878e-07, "logits/chosen": -2.667644500732422, "logits/rejected": -2.78558349609375, "logps/chosen": -43.35844421386719, "logps/rejected": -76.49736785888672, "loss": 0.6034, "rewards/accuracies": 0.90625, "rewards/chosen": 0.44476422667503357, "rewards/margins": 3.2631213665008545, "rewards/rejected": -2.818356990814209, "step": 330 }, { "epoch": 0.9563019140483929, "grad_norm": 3.7142152786254883, "learning_rate": 9.046242774566474e-07, "logits/chosen": -2.5198638439178467, "logits/rejected": -2.627474784851074, "logps/chosen": -40.8130989074707, "logps/rejected": -75.5346450805664, "loss": 0.5577, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7035675644874573, "rewards/margins": 3.149885654449463, "rewards/rejected": -2.4463181495666504, "step": 331 }, { "epoch": 0.9591910436980859, "grad_norm": 3.398216962814331, "learning_rate": 9.043352601156069e-07, "logits/chosen": -2.5769877433776855, "logits/rejected": -2.704854965209961, "logps/chosen": -40.1043586730957, "logps/rejected": -72.2789306640625, "loss": 0.5396, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6180805563926697, "rewards/margins": 2.878488540649414, "rewards/rejected": -2.2604079246520996, "step": 332 }, { "epoch": 0.962080173347779, "grad_norm": 4.1903076171875, "learning_rate": 9.040462427745664e-07, "logits/chosen": -2.595999240875244, "logits/rejected": -2.7414121627807617, "logps/chosen": -36.535316467285156, "logps/rejected": -70.72069549560547, "loss": 0.5331, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301936626434326, "rewards/margins": 3.0833730697631836, "rewards/rejected": -2.153179407119751, "step": 333 }, { "epoch": 0.964969302997472, "grad_norm": 4.576474666595459, "learning_rate": 9.03757225433526e-07, "logits/chosen": -2.5251078605651855, "logits/rejected": -2.698233127593994, "logps/chosen": -36.57093048095703, "logps/rejected": -67.20500183105469, "loss": 0.5576, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1779340505599976, "rewards/margins": 3.0365188121795654, "rewards/rejected": -1.858585000038147, "step": 334 }, { "epoch": 0.9678584326471651, "grad_norm": 4.008988857269287, "learning_rate": 9.034682080924855e-07, "logits/chosen": -2.601259231567383, "logits/rejected": -2.7556710243225098, "logps/chosen": -44.61309814453125, "logps/rejected": -80.91131591796875, "loss": 0.6011, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2519819140434265, "rewards/margins": 3.451770305633545, "rewards/rejected": -3.1997883319854736, "step": 335 }, { "epoch": 0.9707475622968581, "grad_norm": 3.6837098598480225, "learning_rate": 9.03179190751445e-07, "logits/chosen": -2.5415561199188232, "logits/rejected": -2.6987063884735107, "logps/chosen": -32.57243728637695, "logps/rejected": -62.08921813964844, "loss": 0.5649, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2499992847442627, "rewards/margins": 2.922354221343994, "rewards/rejected": -1.672354817390442, "step": 336 }, { "epoch": 0.9736366919465511, "grad_norm": 4.882418155670166, "learning_rate": 9.028901734104046e-07, "logits/chosen": -2.703327178955078, "logits/rejected": -2.8208601474761963, "logps/chosen": -40.292724609375, "logps/rejected": -68.74337005615234, "loss": 0.5984, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6841131448745728, "rewards/margins": 2.8242759704589844, "rewards/rejected": -2.140162706375122, "step": 337 }, { "epoch": 0.9765258215962441, "grad_norm": 3.9461865425109863, "learning_rate": 9.026011560693642e-07, "logits/chosen": -2.637037992477417, "logits/rejected": -2.719202756881714, "logps/chosen": -40.9707145690918, "logps/rejected": -67.67610168457031, "loss": 0.6218, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6344377398490906, "rewards/margins": 2.4455244541168213, "rewards/rejected": -1.8110867738723755, "step": 338 }, { "epoch": 0.9794149512459371, "grad_norm": 5.216695308685303, "learning_rate": 9.023121387283237e-07, "logits/chosen": -2.641049385070801, "logits/rejected": -2.724747896194458, "logps/chosen": -37.31519317626953, "logps/rejected": -71.93550109863281, "loss": 0.5353, "rewards/accuracies": 0.96875, "rewards/chosen": 0.604995846748352, "rewards/margins": 3.094020128250122, "rewards/rejected": -2.4890241622924805, "step": 339 }, { "epoch": 0.9823040808956301, "grad_norm": 4.011502265930176, "learning_rate": 9.020231213872832e-07, "logits/chosen": -2.612579345703125, "logits/rejected": -2.621985912322998, "logps/chosen": -40.4416389465332, "logps/rejected": -75.08846282958984, "loss": 0.5687, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4743841886520386, "rewards/margins": 3.262747049331665, "rewards/rejected": -2.788362741470337, "step": 340 }, { "epoch": 0.9851932105453233, "grad_norm": 3.774113178253174, "learning_rate": 9.017341040462427e-07, "logits/chosen": -2.581984043121338, "logits/rejected": -2.7366297245025635, "logps/chosen": -42.34621810913086, "logps/rejected": -74.31827545166016, "loss": 0.5855, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4694390594959259, "rewards/margins": 3.0040781497955322, "rewards/rejected": -2.5346388816833496, "step": 341 }, { "epoch": 0.9880823401950163, "grad_norm": 3.7254021167755127, "learning_rate": 9.014450867052023e-07, "logits/chosen": -2.635998249053955, "logits/rejected": -2.752227544784546, "logps/chosen": -43.205528259277344, "logps/rejected": -66.92916107177734, "loss": 0.681, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35815632343292236, "rewards/margins": 2.3112077713012695, "rewards/rejected": -1.9530513286590576, "step": 342 }, { "epoch": 0.9909714698447093, "grad_norm": 3.831116199493408, "learning_rate": 9.011560693641617e-07, "logits/chosen": -2.6379480361938477, "logits/rejected": -2.6836259365081787, "logps/chosen": -46.09840393066406, "logps/rejected": -71.81094360351562, "loss": 0.72, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05712345987558365, "rewards/margins": 2.2850470542907715, "rewards/rejected": -2.342170238494873, "step": 343 }, { "epoch": 0.9938605994944023, "grad_norm": 3.7269649505615234, "learning_rate": 9.008670520231213e-07, "logits/chosen": -2.529177665710449, "logits/rejected": -2.677931308746338, "logps/chosen": -36.41141128540039, "logps/rejected": -73.14506530761719, "loss": 0.587, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7896242141723633, "rewards/margins": 3.047970771789551, "rewards/rejected": -2.2583467960357666, "step": 344 }, { "epoch": 0.9967497291440953, "grad_norm": 4.2797393798828125, "learning_rate": 9.00578034682081e-07, "logits/chosen": -2.5898146629333496, "logits/rejected": -2.6978299617767334, "logps/chosen": -36.522377014160156, "logps/rejected": -72.42797088623047, "loss": 0.5835, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8778678774833679, "rewards/margins": 2.9076108932495117, "rewards/rejected": -2.029743194580078, "step": 345 }, { "epoch": 0.9996388587937883, "grad_norm": 3.5425171852111816, "learning_rate": 9.002890173410405e-07, "logits/chosen": -2.4954326152801514, "logits/rejected": -2.6007137298583984, "logps/chosen": -37.552093505859375, "logps/rejected": -72.54142761230469, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 0.6055393815040588, "rewards/margins": 3.3106627464294434, "rewards/rejected": -2.7051236629486084, "step": 346 }, { "epoch": 1.0, "grad_norm": 0.9838401079177856, "learning_rate": 9e-07, "logits/chosen": -2.497622489929199, "logits/rejected": -2.597386360168457, "logps/chosen": -20.67485809326172, "logps/rejected": -62.22892761230469, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 2.3754537105560303, "rewards/margins": 4.088649272918701, "rewards/rejected": -1.713195562362671, "step": 347 }, { "epoch": 1.0028891296496931, "grad_norm": 3.3233981132507324, "learning_rate": 8.997109826589595e-07, "logits/chosen": -2.6513736248016357, "logits/rejected": -2.7475152015686035, "logps/chosen": -45.383506774902344, "logps/rejected": -75.44622039794922, "loss": 0.6406, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1529727578163147, "rewards/margins": 3.0368824005126953, "rewards/rejected": -2.8839094638824463, "step": 348 }, { "epoch": 1.005778259299386, "grad_norm": 3.9817113876342773, "learning_rate": 8.994219653179191e-07, "logits/chosen": -2.655919313430786, "logits/rejected": -2.8471693992614746, "logps/chosen": -40.17938232421875, "logps/rejected": -72.53995513916016, "loss": 0.6106, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3957988917827606, "rewards/margins": 3.0632717609405518, "rewards/rejected": -2.6674728393554688, "step": 349 }, { "epoch": 1.0086673889490791, "grad_norm": 4.333801746368408, "learning_rate": 8.991329479768785e-07, "logits/chosen": -2.6788835525512695, "logits/rejected": -2.7244813442230225, "logps/chosen": -42.556453704833984, "logps/rejected": -76.15573120117188, "loss": 0.608, "rewards/accuracies": 0.90625, "rewards/chosen": 0.49776360392570496, "rewards/margins": 3.221371650695801, "rewards/rejected": -2.7236084938049316, "step": 350 }, { "epoch": 1.0086673889490791, "eval_logits/chosen": -2.6560726165771484, "eval_logits/rejected": -2.757128953933716, "eval_logps/chosen": -41.24240493774414, "eval_logps/rejected": -75.23573303222656, "eval_loss": 0.5771867632865906, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.5074013471603394, "eval_rewards/margins": 3.3359577655792236, "eval_rewards/rejected": -2.8285562992095947, "eval_runtime": 226.0953, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.274, "step": 350 }, { "epoch": 1.011556518598772, "grad_norm": 4.245736122131348, "learning_rate": 8.988439306358381e-07, "logits/chosen": -2.5468339920043945, "logits/rejected": -2.7034616470336914, "logps/chosen": -33.670448303222656, "logps/rejected": -68.75988006591797, "loss": 0.4997, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1815358400344849, "rewards/margins": 3.7310919761657715, "rewards/rejected": -2.549556255340576, "step": 351 }, { "epoch": 1.0144456482484652, "grad_norm": 4.16458797454834, "learning_rate": 8.985549132947976e-07, "logits/chosen": -2.6288397312164307, "logits/rejected": -2.7724053859710693, "logps/chosen": -31.70697784423828, "logps/rejected": -74.14129638671875, "loss": 0.4798, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3817578554153442, "rewards/margins": 4.1868896484375, "rewards/rejected": -2.8051319122314453, "step": 352 }, { "epoch": 1.017334777898158, "grad_norm": 4.569166660308838, "learning_rate": 8.982658959537572e-07, "logits/chosen": -2.6571054458618164, "logits/rejected": -2.73004150390625, "logps/chosen": -51.3411750793457, "logps/rejected": -81.9998779296875, "loss": 0.623, "rewards/accuracies": 0.875, "rewards/chosen": -0.34957993030548096, "rewards/margins": 3.023090124130249, "rewards/rejected": -3.3726696968078613, "step": 353 }, { "epoch": 1.0202239075478512, "grad_norm": 4.798895835876465, "learning_rate": 8.979768786127167e-07, "logits/chosen": -2.578137159347534, "logits/rejected": -2.639063835144043, "logps/chosen": -32.45537567138672, "logps/rejected": -67.75145721435547, "loss": 0.4423, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4644650220870972, "rewards/margins": 3.4173295497894287, "rewards/rejected": -1.9528642892837524, "step": 354 }, { "epoch": 1.0231130371975443, "grad_norm": 4.026383399963379, "learning_rate": 8.976878612716763e-07, "logits/chosen": -2.69832181930542, "logits/rejected": -2.7830233573913574, "logps/chosen": -46.68137741088867, "logps/rejected": -76.31501007080078, "loss": 0.6145, "rewards/accuracies": 0.96875, "rewards/chosen": -0.020418232306838036, "rewards/margins": 2.8765604496002197, "rewards/rejected": -2.8969786167144775, "step": 355 }, { "epoch": 1.0260021668472372, "grad_norm": 4.502602577209473, "learning_rate": 8.973988439306359e-07, "logits/chosen": -2.711526870727539, "logits/rejected": -2.801483392715454, "logps/chosen": -47.382774353027344, "logps/rejected": -79.66889953613281, "loss": 0.6518, "rewards/accuracies": 0.90625, "rewards/chosen": -0.18938440084457397, "rewards/margins": 2.813958168029785, "rewards/rejected": -3.003342390060425, "step": 356 }, { "epoch": 1.0288912964969303, "grad_norm": 4.061535358428955, "learning_rate": 8.971098265895953e-07, "logits/chosen": -2.563891649246216, "logits/rejected": -2.664153814315796, "logps/chosen": -35.74896240234375, "logps/rejected": -72.98462677001953, "loss": 0.5247, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8447089195251465, "rewards/margins": 3.3718504905700684, "rewards/rejected": -2.52714204788208, "step": 357 }, { "epoch": 1.0317804261466232, "grad_norm": 4.174027442932129, "learning_rate": 8.968208092485549e-07, "logits/chosen": -2.5889229774475098, "logits/rejected": -2.7253928184509277, "logps/chosen": -38.98970031738281, "logps/rejected": -71.5126724243164, "loss": 0.6172, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6851992011070251, "rewards/margins": 3.148819923400879, "rewards/rejected": -2.463620901107788, "step": 358 }, { "epoch": 1.0346695557963164, "grad_norm": 4.085376739501953, "learning_rate": 8.965317919075144e-07, "logits/chosen": -2.6645214557647705, "logits/rejected": -2.7790884971618652, "logps/chosen": -38.92276382446289, "logps/rejected": -79.03520202636719, "loss": 0.5559, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6072142124176025, "rewards/margins": 3.4720964431762695, "rewards/rejected": -2.864882230758667, "step": 359 }, { "epoch": 1.0375586854460095, "grad_norm": 4.076954364776611, "learning_rate": 8.962427745664739e-07, "logits/chosen": -2.5741708278656006, "logits/rejected": -2.755178928375244, "logps/chosen": -35.45353698730469, "logps/rejected": -68.057373046875, "loss": 0.5261, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0925025939941406, "rewards/margins": 3.1355042457580566, "rewards/rejected": -2.043001413345337, "step": 360 }, { "epoch": 1.0404478150957024, "grad_norm": 4.298234462738037, "learning_rate": 8.959537572254335e-07, "logits/chosen": -2.674018144607544, "logits/rejected": -2.798316240310669, "logps/chosen": -42.18428039550781, "logps/rejected": -77.34943389892578, "loss": 0.5872, "rewards/accuracies": 0.9375, "rewards/chosen": 0.49577420949935913, "rewards/margins": 3.06772518157959, "rewards/rejected": -2.571950912475586, "step": 361 }, { "epoch": 1.0433369447453955, "grad_norm": 3.9694089889526367, "learning_rate": 8.956647398843931e-07, "logits/chosen": -2.583324432373047, "logits/rejected": -2.8108761310577393, "logps/chosen": -30.998029708862305, "logps/rejected": -68.20577239990234, "loss": 0.4845, "rewards/accuracies": 1.0, "rewards/chosen": 1.3087923526763916, "rewards/margins": 3.4818739891052246, "rewards/rejected": -2.173081874847412, "step": 362 }, { "epoch": 1.0462260743950884, "grad_norm": 3.7666094303131104, "learning_rate": 8.953757225433526e-07, "logits/chosen": -2.6847457885742188, "logits/rejected": -2.769507646560669, "logps/chosen": -39.647579193115234, "logps/rejected": -71.65984344482422, "loss": 0.5769, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6884243488311768, "rewards/margins": 3.0851964950561523, "rewards/rejected": -2.3967723846435547, "step": 363 }, { "epoch": 1.0491152040447815, "grad_norm": 4.184115886688232, "learning_rate": 8.950867052023121e-07, "logits/chosen": -2.5889270305633545, "logits/rejected": -2.7022836208343506, "logps/chosen": -36.179718017578125, "logps/rejected": -79.02323913574219, "loss": 0.5343, "rewards/accuracies": 0.875, "rewards/chosen": 1.0400224924087524, "rewards/margins": 3.9413888454437256, "rewards/rejected": -2.9013662338256836, "step": 364 }, { "epoch": 1.0520043336944744, "grad_norm": 3.4295694828033447, "learning_rate": 8.947976878612716e-07, "logits/chosen": -2.6274805068969727, "logits/rejected": -2.77897572517395, "logps/chosen": -38.49458312988281, "logps/rejected": -81.05567169189453, "loss": 0.4896, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7275655269622803, "rewards/margins": 3.902857780456543, "rewards/rejected": -3.1752920150756836, "step": 365 }, { "epoch": 1.0548934633441676, "grad_norm": 4.391941070556641, "learning_rate": 8.945086705202312e-07, "logits/chosen": -2.5908727645874023, "logits/rejected": -2.76975154876709, "logps/chosen": -30.21854019165039, "logps/rejected": -67.23657989501953, "loss": 0.5326, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2213455438613892, "rewards/margins": 3.3225646018981934, "rewards/rejected": -2.1012191772460938, "step": 366 }, { "epoch": 1.0577825929938607, "grad_norm": 4.115214824676514, "learning_rate": 8.942196531791906e-07, "logits/chosen": -2.7148776054382324, "logits/rejected": -2.817162036895752, "logps/chosen": -39.17529296875, "logps/rejected": -69.69668579101562, "loss": 0.5744, "rewards/accuracies": 0.875, "rewards/chosen": 0.869823157787323, "rewards/margins": 3.1502022743225098, "rewards/rejected": -2.280379056930542, "step": 367 }, { "epoch": 1.0606717226435536, "grad_norm": 3.815997362136841, "learning_rate": 8.939306358381502e-07, "logits/chosen": -2.716233730316162, "logits/rejected": -2.8169775009155273, "logps/chosen": -38.88613510131836, "logps/rejected": -70.28046417236328, "loss": 0.5821, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8400521874427795, "rewards/margins": 3.1169867515563965, "rewards/rejected": -2.2769346237182617, "step": 368 }, { "epoch": 1.0635608522932467, "grad_norm": 5.7282538414001465, "learning_rate": 8.936416184971099e-07, "logits/chosen": -2.707303762435913, "logits/rejected": -2.7893030643463135, "logps/chosen": -36.4865837097168, "logps/rejected": -76.44750213623047, "loss": 0.5207, "rewards/accuracies": 1.0, "rewards/chosen": 1.0300416946411133, "rewards/margins": 3.560987949371338, "rewards/rejected": -2.5309460163116455, "step": 369 }, { "epoch": 1.0664499819429396, "grad_norm": 4.000889778137207, "learning_rate": 8.933526011560693e-07, "logits/chosen": -2.647754669189453, "logits/rejected": -2.7487518787384033, "logps/chosen": -40.235923767089844, "logps/rejected": -67.27999114990234, "loss": 0.6103, "rewards/accuracies": 0.875, "rewards/chosen": 0.5401511788368225, "rewards/margins": 2.5919322967529297, "rewards/rejected": -2.051781177520752, "step": 370 }, { "epoch": 1.0693391115926327, "grad_norm": 3.800471782684326, "learning_rate": 8.930635838150289e-07, "logits/chosen": -2.5877697467803955, "logits/rejected": -2.7140374183654785, "logps/chosen": -38.25746154785156, "logps/rejected": -74.94580841064453, "loss": 0.5784, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7334631681442261, "rewards/margins": 3.1605255603790283, "rewards/rejected": -2.427062511444092, "step": 371 }, { "epoch": 1.0722282412423259, "grad_norm": 3.3378031253814697, "learning_rate": 8.927745664739884e-07, "logits/chosen": -2.612712860107422, "logits/rejected": -2.7114951610565186, "logps/chosen": -45.95894241333008, "logps/rejected": -77.95696258544922, "loss": 0.6409, "rewards/accuracies": 0.90625, "rewards/chosen": 0.015276536345481873, "rewards/margins": 2.92169189453125, "rewards/rejected": -2.9064152240753174, "step": 372 }, { "epoch": 1.0751173708920188, "grad_norm": 5.105685234069824, "learning_rate": 8.92485549132948e-07, "logits/chosen": -2.6808009147644043, "logits/rejected": -2.8769354820251465, "logps/chosen": -40.77914810180664, "logps/rejected": -62.73938751220703, "loss": 0.6787, "rewards/accuracies": 0.875, "rewards/chosen": 0.8619775772094727, "rewards/margins": 2.307441234588623, "rewards/rejected": -1.4454636573791504, "step": 373 }, { "epoch": 1.0780065005417119, "grad_norm": 4.5050048828125, "learning_rate": 8.921965317919074e-07, "logits/chosen": -2.694953203201294, "logits/rejected": -2.813584566116333, "logps/chosen": -35.72710418701172, "logps/rejected": -78.33296203613281, "loss": 0.5023, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8362966179847717, "rewards/margins": 3.8365471363067627, "rewards/rejected": -3.0002505779266357, "step": 374 }, { "epoch": 1.0808956301914048, "grad_norm": 4.606315612792969, "learning_rate": 8.91907514450867e-07, "logits/chosen": -2.6429505348205566, "logits/rejected": -2.7533328533172607, "logps/chosen": -37.199073791503906, "logps/rejected": -64.58412170410156, "loss": 0.6549, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8086665868759155, "rewards/margins": 2.41715931892395, "rewards/rejected": -1.6084928512573242, "step": 375 }, { "epoch": 1.083784759841098, "grad_norm": 4.287848949432373, "learning_rate": 8.916184971098265e-07, "logits/chosen": -2.616523265838623, "logits/rejected": -2.7270827293395996, "logps/chosen": -32.501976013183594, "logps/rejected": -66.85604095458984, "loss": 0.5087, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5083478689193726, "rewards/margins": 3.653153419494629, "rewards/rejected": -2.144805431365967, "step": 376 }, { "epoch": 1.0866738894907908, "grad_norm": 4.7546610832214355, "learning_rate": 8.913294797687861e-07, "logits/chosen": -2.6369495391845703, "logits/rejected": -2.8009378910064697, "logps/chosen": -34.13892364501953, "logps/rejected": -62.84431457519531, "loss": 0.5781, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2841787338256836, "rewards/margins": 2.928971529006958, "rewards/rejected": -1.6447926759719849, "step": 377 }, { "epoch": 1.089563019140484, "grad_norm": 4.097519397735596, "learning_rate": 8.910404624277457e-07, "logits/chosen": -2.6674346923828125, "logits/rejected": -2.8234317302703857, "logps/chosen": -37.382266998291016, "logps/rejected": -76.36051940917969, "loss": 0.4709, "rewards/accuracies": 1.0, "rewards/chosen": 1.059966802597046, "rewards/margins": 3.704927444458008, "rewards/rejected": -2.6449601650238037, "step": 378 }, { "epoch": 1.092452148790177, "grad_norm": 3.9319417476654053, "learning_rate": 8.907514450867052e-07, "logits/chosen": -2.59186053276062, "logits/rejected": -2.6682050228118896, "logps/chosen": -30.398130416870117, "logps/rejected": -72.88426208496094, "loss": 0.4506, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6080918312072754, "rewards/margins": 3.92158842086792, "rewards/rejected": -2.3134965896606445, "step": 379 }, { "epoch": 1.09534127843987, "grad_norm": 4.520718097686768, "learning_rate": 8.904624277456647e-07, "logits/chosen": -2.6662437915802, "logits/rejected": -2.6979880332946777, "logps/chosen": -43.91696548461914, "logps/rejected": -86.04647827148438, "loss": 0.5609, "rewards/accuracies": 0.96875, "rewards/chosen": 0.48574501276016235, "rewards/margins": 3.9556491374969482, "rewards/rejected": -3.4699044227600098, "step": 380 }, { "epoch": 1.098230408089563, "grad_norm": 5.119112491607666, "learning_rate": 8.901734104046242e-07, "logits/chosen": -2.6846835613250732, "logits/rejected": -2.8772716522216797, "logps/chosen": -33.24864959716797, "logps/rejected": -63.430084228515625, "loss": 0.5752, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1046857833862305, "rewards/margins": 2.844252109527588, "rewards/rejected": -1.7395659685134888, "step": 381 }, { "epoch": 1.101119537739256, "grad_norm": 5.203680038452148, "learning_rate": 8.898843930635838e-07, "logits/chosen": -2.639986991882324, "logits/rejected": -2.8273942470550537, "logps/chosen": -32.76747512817383, "logps/rejected": -71.0276870727539, "loss": 0.4869, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5199198722839355, "rewards/margins": 3.8611972332000732, "rewards/rejected": -2.341277599334717, "step": 382 }, { "epoch": 1.104008667388949, "grad_norm": 4.195711135864258, "learning_rate": 8.895953757225433e-07, "logits/chosen": -2.586557388305664, "logits/rejected": -2.7604846954345703, "logps/chosen": -31.725234985351562, "logps/rejected": -66.67027282714844, "loss": 0.5478, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4709930419921875, "rewards/margins": 3.2202858924865723, "rewards/rejected": -1.7492926120758057, "step": 383 }, { "epoch": 1.1068977970386422, "grad_norm": 4.448134899139404, "learning_rate": 8.893063583815028e-07, "logits/chosen": -2.595475196838379, "logits/rejected": -2.718139171600342, "logps/chosen": -40.147884368896484, "logps/rejected": -83.6457290649414, "loss": 0.5209, "rewards/accuracies": 1.0, "rewards/chosen": 0.5680930614471436, "rewards/margins": 3.911783456802368, "rewards/rejected": -3.3436903953552246, "step": 384 }, { "epoch": 1.1097869266883351, "grad_norm": 3.820861577987671, "learning_rate": 8.890173410404624e-07, "logits/chosen": -2.56107759475708, "logits/rejected": -2.7080538272857666, "logps/chosen": -35.25724792480469, "logps/rejected": -69.7938003540039, "loss": 0.5872, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1053979396820068, "rewards/margins": 3.360851526260376, "rewards/rejected": -2.255453586578369, "step": 385 }, { "epoch": 1.1126760563380282, "grad_norm": 4.443717956542969, "learning_rate": 8.88728323699422e-07, "logits/chosen": -2.7117831707000732, "logits/rejected": -2.8422939777374268, "logps/chosen": -37.33143615722656, "logps/rejected": -78.2411880493164, "loss": 0.571, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7799036502838135, "rewards/margins": 3.8490095138549805, "rewards/rejected": -3.069105625152588, "step": 386 }, { "epoch": 1.1155651859877211, "grad_norm": 3.8960816860198975, "learning_rate": 8.884393063583814e-07, "logits/chosen": -2.7303829193115234, "logits/rejected": -2.8859949111938477, "logps/chosen": -35.78809356689453, "logps/rejected": -83.34320831298828, "loss": 0.4933, "rewards/accuracies": 0.96875, "rewards/chosen": 1.198372721672058, "rewards/margins": 4.303105354309082, "rewards/rejected": -3.1047322750091553, "step": 387 }, { "epoch": 1.1184543156374143, "grad_norm": 3.7495932579040527, "learning_rate": 8.88150289017341e-07, "logits/chosen": -2.6963415145874023, "logits/rejected": -2.8378100395202637, "logps/chosen": -43.61357498168945, "logps/rejected": -75.1953353881836, "loss": 0.653, "rewards/accuracies": 0.96875, "rewards/chosen": 0.539381206035614, "rewards/margins": 3.281782627105713, "rewards/rejected": -2.742401599884033, "step": 388 }, { "epoch": 1.1213434452871072, "grad_norm": 4.852271556854248, "learning_rate": 8.878612716763006e-07, "logits/chosen": -2.7159271240234375, "logits/rejected": -2.766693115234375, "logps/chosen": -36.49671936035156, "logps/rejected": -74.44003295898438, "loss": 0.5288, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8248002529144287, "rewards/margins": 3.4152514934539795, "rewards/rejected": -2.590451240539551, "step": 389 }, { "epoch": 1.1242325749368003, "grad_norm": 3.4910240173339844, "learning_rate": 8.875722543352601e-07, "logits/chosen": -2.6837315559387207, "logits/rejected": -2.8103346824645996, "logps/chosen": -39.46384811401367, "logps/rejected": -78.43911743164062, "loss": 0.6118, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4987215995788574, "rewards/margins": 3.516116142272949, "rewards/rejected": -3.017394781112671, "step": 390 }, { "epoch": 1.1271217045864934, "grad_norm": 4.421627998352051, "learning_rate": 8.872832369942196e-07, "logits/chosen": -2.7595365047454834, "logits/rejected": -2.8590078353881836, "logps/chosen": -40.07503890991211, "logps/rejected": -72.86799621582031, "loss": 0.5797, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5742632746696472, "rewards/margins": 3.076848030090332, "rewards/rejected": -2.50258469581604, "step": 391 }, { "epoch": 1.1300108342361863, "grad_norm": 3.873779058456421, "learning_rate": 8.869942196531791e-07, "logits/chosen": -2.6963140964508057, "logits/rejected": -2.836185932159424, "logps/chosen": -33.9482307434082, "logps/rejected": -75.56643676757812, "loss": 0.5357, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2973594665527344, "rewards/margins": 3.918779134750366, "rewards/rejected": -2.621419906616211, "step": 392 }, { "epoch": 1.1328999638858794, "grad_norm": 4.417798042297363, "learning_rate": 8.867052023121387e-07, "logits/chosen": -2.7001137733459473, "logits/rejected": -2.767489433288574, "logps/chosen": -38.320796966552734, "logps/rejected": -73.93550872802734, "loss": 0.6165, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6084245443344116, "rewards/margins": 3.2792775630950928, "rewards/rejected": -2.6708531379699707, "step": 393 }, { "epoch": 1.1357890935355723, "grad_norm": 4.567755222320557, "learning_rate": 8.864161849710982e-07, "logits/chosen": -2.603849411010742, "logits/rejected": -2.7687649726867676, "logps/chosen": -33.923828125, "logps/rejected": -67.75118255615234, "loss": 0.4985, "rewards/accuracies": 1.0, "rewards/chosen": 1.0084538459777832, "rewards/margins": 3.3150391578674316, "rewards/rejected": -2.3065853118896484, "step": 394 }, { "epoch": 1.1386782231852655, "grad_norm": 4.571526050567627, "learning_rate": 8.861271676300578e-07, "logits/chosen": -2.7011799812316895, "logits/rejected": -2.873833417892456, "logps/chosen": -34.9163703918457, "logps/rejected": -63.023582458496094, "loss": 0.6274, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0786833763122559, "rewards/margins": 2.729031801223755, "rewards/rejected": -1.650348424911499, "step": 395 }, { "epoch": 1.1415673528349584, "grad_norm": 5.447819709777832, "learning_rate": 8.858381502890173e-07, "logits/chosen": -2.656836986541748, "logits/rejected": -2.7667465209960938, "logps/chosen": -34.49205780029297, "logps/rejected": -77.8834457397461, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 1.0143662691116333, "rewards/margins": 3.9117655754089355, "rewards/rejected": -2.8973991870880127, "step": 396 }, { "epoch": 1.1444564824846515, "grad_norm": 5.010695457458496, "learning_rate": 8.855491329479768e-07, "logits/chosen": -2.601531744003296, "logits/rejected": -2.7134859561920166, "logps/chosen": -39.23756790161133, "logps/rejected": -80.86209869384766, "loss": 0.5559, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7050048112869263, "rewards/margins": 3.9246737957000732, "rewards/rejected": -3.2196688652038574, "step": 397 }, { "epoch": 1.1473456121343446, "grad_norm": 10.614958763122559, "learning_rate": 8.852601156069363e-07, "logits/chosen": -2.6540987491607666, "logits/rejected": -2.8740921020507812, "logps/chosen": -34.130455017089844, "logps/rejected": -73.70214080810547, "loss": 0.5609, "rewards/accuracies": 1.0, "rewards/chosen": 1.2075939178466797, "rewards/margins": 3.856597423553467, "rewards/rejected": -2.649003505706787, "step": 398 }, { "epoch": 1.1502347417840375, "grad_norm": 4.696315288543701, "learning_rate": 8.849710982658959e-07, "logits/chosen": -2.646531105041504, "logits/rejected": -2.777148485183716, "logps/chosen": -44.282386779785156, "logps/rejected": -71.73926544189453, "loss": 0.6736, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3998558521270752, "rewards/margins": 2.570723056793213, "rewards/rejected": -2.1708672046661377, "step": 399 }, { "epoch": 1.1531238714337306, "grad_norm": 5.293782711029053, "learning_rate": 8.846820809248555e-07, "logits/chosen": -2.6952064037323, "logits/rejected": -2.85719633102417, "logps/chosen": -33.302555084228516, "logps/rejected": -70.3379898071289, "loss": 0.5559, "rewards/accuracies": 0.9375, "rewards/chosen": 1.314673900604248, "rewards/margins": 3.6022276878356934, "rewards/rejected": -2.2875540256500244, "step": 400 }, { "epoch": 1.1560130010834235, "grad_norm": 4.217169761657715, "learning_rate": 8.843930635838149e-07, "logits/chosen": -2.664534568786621, "logits/rejected": -2.7651610374450684, "logps/chosen": -48.26874542236328, "logps/rejected": -80.17000579833984, "loss": 0.6724, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35078269243240356, "rewards/margins": 2.833745241165161, "rewards/rejected": -3.184528112411499, "step": 401 }, { "epoch": 1.1589021307331167, "grad_norm": 3.645480155944824, "learning_rate": 8.841040462427746e-07, "logits/chosen": -2.7515487670898438, "logits/rejected": -2.8162059783935547, "logps/chosen": -37.97970962524414, "logps/rejected": -75.37366485595703, "loss": 0.5483, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5657361745834351, "rewards/margins": 3.440424680709839, "rewards/rejected": -2.8746886253356934, "step": 402 }, { "epoch": 1.1617912603828096, "grad_norm": 5.526108264923096, "learning_rate": 8.838150289017341e-07, "logits/chosen": -2.6186935901641846, "logits/rejected": -2.7155447006225586, "logps/chosen": -42.78927230834961, "logps/rejected": -86.1871337890625, "loss": 0.5744, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31486985087394714, "rewards/margins": 3.86824107170105, "rewards/rejected": -3.5533714294433594, "step": 403 }, { "epoch": 1.1646803900325027, "grad_norm": 5.4208269119262695, "learning_rate": 8.835260115606936e-07, "logits/chosen": -2.7949934005737305, "logits/rejected": -2.7570061683654785, "logps/chosen": -38.388885498046875, "logps/rejected": -78.5318603515625, "loss": 0.5766, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6491004228591919, "rewards/margins": 3.4858458042144775, "rewards/rejected": -2.836745262145996, "step": 404 }, { "epoch": 1.1675695196821958, "grad_norm": 5.321201324462891, "learning_rate": 8.832369942196531e-07, "logits/chosen": -2.8210983276367188, "logits/rejected": -2.875443458557129, "logps/chosen": -37.25994110107422, "logps/rejected": -72.69224548339844, "loss": 0.6092, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6131730675697327, "rewards/margins": 3.0758004188537598, "rewards/rejected": -2.46262788772583, "step": 405 }, { "epoch": 1.1704586493318887, "grad_norm": 4.314303874969482, "learning_rate": 8.829479768786127e-07, "logits/chosen": -2.673849582672119, "logits/rejected": -2.770573139190674, "logps/chosen": -28.970577239990234, "logps/rejected": -65.72933197021484, "loss": 0.5657, "rewards/accuracies": 0.875, "rewards/chosen": 1.4263311624526978, "rewards/margins": 3.048949718475342, "rewards/rejected": -1.622618556022644, "step": 406 }, { "epoch": 1.1733477789815818, "grad_norm": 4.760498523712158, "learning_rate": 8.826589595375722e-07, "logits/chosen": -2.803218126296997, "logits/rejected": -2.8597984313964844, "logps/chosen": -36.2917366027832, "logps/rejected": -74.27033233642578, "loss": 0.5564, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1513960361480713, "rewards/margins": 3.3471157550811768, "rewards/rejected": -2.1957199573516846, "step": 407 }, { "epoch": 1.176236908631275, "grad_norm": 4.767838954925537, "learning_rate": 8.823699421965317e-07, "logits/chosen": -2.683689832687378, "logits/rejected": -2.7666618824005127, "logps/chosen": -33.9878044128418, "logps/rejected": -68.52696990966797, "loss": 0.5454, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8845517635345459, "rewards/margins": 3.173886775970459, "rewards/rejected": -2.289335250854492, "step": 408 }, { "epoch": 1.1791260382809678, "grad_norm": 4.63846492767334, "learning_rate": 8.820809248554912e-07, "logits/chosen": -2.6688179969787598, "logits/rejected": -2.8492374420166016, "logps/chosen": -37.00197982788086, "logps/rejected": -65.50816345214844, "loss": 0.6497, "rewards/accuracies": 0.875, "rewards/chosen": 0.8976666927337646, "rewards/margins": 2.7745981216430664, "rewards/rejected": -1.8769315481185913, "step": 409 }, { "epoch": 1.182015167930661, "grad_norm": 4.959630966186523, "learning_rate": 8.817919075144509e-07, "logits/chosen": -2.747529983520508, "logits/rejected": -2.8279848098754883, "logps/chosen": -38.51792526245117, "logps/rejected": -69.94867706298828, "loss": 0.5682, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8562703132629395, "rewards/margins": 3.090345859527588, "rewards/rejected": -2.2340755462646484, "step": 410 }, { "epoch": 1.1849042975803539, "grad_norm": 5.828198432922363, "learning_rate": 8.815028901734104e-07, "logits/chosen": -2.674741744995117, "logits/rejected": -2.7915751934051514, "logps/chosen": -34.95844268798828, "logps/rejected": -71.99874877929688, "loss": 0.4764, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0237369537353516, "rewards/margins": 3.5601444244384766, "rewards/rejected": -2.536407470703125, "step": 411 }, { "epoch": 1.187793427230047, "grad_norm": 5.415954113006592, "learning_rate": 8.812138728323699e-07, "logits/chosen": -2.690664291381836, "logits/rejected": -2.857930898666382, "logps/chosen": -45.44537353515625, "logps/rejected": -78.08106994628906, "loss": 0.6028, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04367390275001526, "rewards/margins": 3.12933349609375, "rewards/rejected": -3.1730074882507324, "step": 412 }, { "epoch": 1.19068255687974, "grad_norm": 11.096631050109863, "learning_rate": 8.809248554913295e-07, "logits/chosen": -2.6459004878997803, "logits/rejected": -2.779163360595703, "logps/chosen": -34.97566223144531, "logps/rejected": -68.373779296875, "loss": 0.5106, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3187159299850464, "rewards/margins": 3.2909042835235596, "rewards/rejected": -1.9721884727478027, "step": 413 }, { "epoch": 1.193571686529433, "grad_norm": 4.363132476806641, "learning_rate": 8.806358381502889e-07, "logits/chosen": -2.7269585132598877, "logits/rejected": -2.842881202697754, "logps/chosen": -35.969112396240234, "logps/rejected": -70.01860046386719, "loss": 0.5602, "rewards/accuracies": 0.96875, "rewards/chosen": 1.135250210762024, "rewards/margins": 3.626439094543457, "rewards/rejected": -2.4911882877349854, "step": 414 }, { "epoch": 1.1964608161791261, "grad_norm": 4.189739227294922, "learning_rate": 8.803468208092485e-07, "logits/chosen": -2.783520221710205, "logits/rejected": -2.8271918296813965, "logps/chosen": -37.135467529296875, "logps/rejected": -77.63946533203125, "loss": 0.5725, "rewards/accuracies": 1.0, "rewards/chosen": 0.9615842700004578, "rewards/margins": 3.6996665000915527, "rewards/rejected": -2.7380824089050293, "step": 415 }, { "epoch": 1.199349945828819, "grad_norm": 3.9679324626922607, "learning_rate": 8.80057803468208e-07, "logits/chosen": -2.719433546066284, "logits/rejected": -2.7794110774993896, "logps/chosen": -36.31254577636719, "logps/rejected": -68.19381713867188, "loss": 0.6527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9835607409477234, "rewards/margins": 3.075997829437256, "rewards/rejected": -2.092437267303467, "step": 416 }, { "epoch": 1.2022390754785122, "grad_norm": 4.1102423667907715, "learning_rate": 8.797687861271676e-07, "logits/chosen": -2.696917772293091, "logits/rejected": -2.826016426086426, "logps/chosen": -39.1036262512207, "logps/rejected": -84.30925750732422, "loss": 0.4544, "rewards/accuracies": 0.9375, "rewards/chosen": 0.906946063041687, "rewards/margins": 4.279753684997559, "rewards/rejected": -3.372807741165161, "step": 417 }, { "epoch": 1.205128205128205, "grad_norm": 5.257472991943359, "learning_rate": 8.794797687861271e-07, "logits/chosen": -2.695589780807495, "logits/rejected": -2.800180673599243, "logps/chosen": -32.31498336791992, "logps/rejected": -77.95870971679688, "loss": 0.4429, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4337422847747803, "rewards/margins": 4.293133735656738, "rewards/rejected": -2.859391689300537, "step": 418 }, { "epoch": 1.2080173347778982, "grad_norm": 4.386075019836426, "learning_rate": 8.791907514450867e-07, "logits/chosen": -2.7151756286621094, "logits/rejected": -2.8277535438537598, "logps/chosen": -35.461978912353516, "logps/rejected": -70.59474182128906, "loss": 0.5777, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1487724781036377, "rewards/margins": 3.304598331451416, "rewards/rejected": -2.1558260917663574, "step": 419 }, { "epoch": 1.210906464427591, "grad_norm": 4.576193332672119, "learning_rate": 8.789017341040463e-07, "logits/chosen": -2.7332358360290527, "logits/rejected": -2.841491222381592, "logps/chosen": -40.76188278198242, "logps/rejected": -80.75080108642578, "loss": 0.6009, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5478619337081909, "rewards/margins": 3.6950790882110596, "rewards/rejected": -3.147217035293579, "step": 420 }, { "epoch": 1.210906464427591, "eval_logits/chosen": -2.7678651809692383, "eval_logits/rejected": -2.8934261798858643, "eval_logps/chosen": -37.1412239074707, "eval_logps/rejected": -75.38317108154297, "eval_loss": 0.5447859764099121, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.9175196886062622, "eval_rewards/margins": 3.760819673538208, "eval_rewards/rejected": -2.8433001041412354, "eval_runtime": 225.2183, "eval_samples_per_second": 0.546, "eval_steps_per_second": 0.275, "step": 420 }, { "epoch": 1.2137955940772842, "grad_norm": 4.624328136444092, "learning_rate": 8.786127167630057e-07, "logits/chosen": -2.678075075149536, "logits/rejected": -2.782327175140381, "logps/chosen": -41.900184631347656, "logps/rejected": -75.0003433227539, "loss": 0.5994, "rewards/accuracies": 0.875, "rewards/chosen": 0.8105798363685608, "rewards/margins": 3.2758662700653076, "rewards/rejected": -2.4652862548828125, "step": 421 }, { "epoch": 1.2166847237269773, "grad_norm": 5.227085113525391, "learning_rate": 8.783236994219653e-07, "logits/chosen": -2.8757693767547607, "logits/rejected": -2.9884181022644043, "logps/chosen": -35.9775276184082, "logps/rejected": -71.68123626708984, "loss": 0.5398, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7908434867858887, "rewards/margins": 3.2878737449645996, "rewards/rejected": -2.49703049659729, "step": 422 }, { "epoch": 1.2195738533766702, "grad_norm": 4.352758407592773, "learning_rate": 8.780346820809248e-07, "logits/chosen": -2.6959564685821533, "logits/rejected": -2.8545141220092773, "logps/chosen": -39.34725570678711, "logps/rejected": -79.40071868896484, "loss": 0.5905, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7301769852638245, "rewards/margins": 3.8670945167541504, "rewards/rejected": -3.1369171142578125, "step": 423 }, { "epoch": 1.2224629830263634, "grad_norm": 5.082805633544922, "learning_rate": 8.777456647398844e-07, "logits/chosen": -2.7640867233276367, "logits/rejected": -2.8567121028900146, "logps/chosen": -33.79572296142578, "logps/rejected": -76.1879653930664, "loss": 0.5151, "rewards/accuracies": 1.0, "rewards/chosen": 1.0792731046676636, "rewards/margins": 3.9319570064544678, "rewards/rejected": -2.8526837825775146, "step": 424 }, { "epoch": 1.2253521126760563, "grad_norm": 4.153040885925293, "learning_rate": 8.774566473988438e-07, "logits/chosen": -2.6738409996032715, "logits/rejected": -2.8047215938568115, "logps/chosen": -44.478309631347656, "logps/rejected": -71.54761505126953, "loss": 0.6999, "rewards/accuracies": 0.90625, "rewards/chosen": 0.06620743870735168, "rewards/margins": 2.4611761569976807, "rewards/rejected": -2.3949685096740723, "step": 425 }, { "epoch": 1.2282412423257494, "grad_norm": 5.1140570640563965, "learning_rate": 8.771676300578035e-07, "logits/chosen": -2.7175941467285156, "logits/rejected": -2.826666831970215, "logps/chosen": -34.636314392089844, "logps/rejected": -69.7265853881836, "loss": 0.5649, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2315003871917725, "rewards/margins": 3.4566309452056885, "rewards/rejected": -2.225130558013916, "step": 426 }, { "epoch": 1.2311303719754423, "grad_norm": 5.279741287231445, "learning_rate": 8.76878612716763e-07, "logits/chosen": -2.742215156555176, "logits/rejected": -2.801013469696045, "logps/chosen": -40.77836608886719, "logps/rejected": -76.24119567871094, "loss": 0.6269, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5099973678588867, "rewards/margins": 3.0800135135650635, "rewards/rejected": -2.5700161457061768, "step": 427 }, { "epoch": 1.2340195016251354, "grad_norm": 4.672144889831543, "learning_rate": 8.765895953757225e-07, "logits/chosen": -2.8198516368865967, "logits/rejected": -2.891038417816162, "logps/chosen": -45.769691467285156, "logps/rejected": -90.80643463134766, "loss": 0.6191, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28314727544784546, "rewards/margins": 4.039517402648926, "rewards/rejected": -3.7563705444335938, "step": 428 }, { "epoch": 1.2369086312748285, "grad_norm": 4.471892833709717, "learning_rate": 8.76300578034682e-07, "logits/chosen": -2.699690580368042, "logits/rejected": -2.8810839653015137, "logps/chosen": -39.90394973754883, "logps/rejected": -67.92735290527344, "loss": 0.6389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6688891053199768, "rewards/margins": 2.8126580715179443, "rewards/rejected": -2.143768787384033, "step": 429 }, { "epoch": 1.2397977609245214, "grad_norm": 4.523414134979248, "learning_rate": 8.760115606936416e-07, "logits/chosen": -2.725552558898926, "logits/rejected": -2.893455982208252, "logps/chosen": -40.625244140625, "logps/rejected": -84.12871551513672, "loss": 0.5722, "rewards/accuracies": 1.0, "rewards/chosen": 0.5917713642120361, "rewards/margins": 3.8172082901000977, "rewards/rejected": -3.2254369258880615, "step": 430 }, { "epoch": 1.2426868905742146, "grad_norm": 4.6731181144714355, "learning_rate": 8.75722543352601e-07, "logits/chosen": -2.7196595668792725, "logits/rejected": -2.8844504356384277, "logps/chosen": -36.05049133300781, "logps/rejected": -82.12178039550781, "loss": 0.5371, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2439333200454712, "rewards/margins": 4.615616321563721, "rewards/rejected": -3.371683359146118, "step": 431 }, { "epoch": 1.2455760202239075, "grad_norm": 5.008205890655518, "learning_rate": 8.754335260115606e-07, "logits/chosen": -2.7236340045928955, "logits/rejected": -2.8835086822509766, "logps/chosen": -33.549922943115234, "logps/rejected": -69.07763671875, "loss": 0.5934, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0692138671875, "rewards/margins": 3.2445507049560547, "rewards/rejected": -2.1753363609313965, "step": 432 }, { "epoch": 1.2484651498736006, "grad_norm": 4.430170059204102, "learning_rate": 8.751445086705202e-07, "logits/chosen": -2.8028788566589355, "logits/rejected": -2.893110990524292, "logps/chosen": -37.615962982177734, "logps/rejected": -76.26776885986328, "loss": 0.5356, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9204255938529968, "rewards/margins": 3.8570706844329834, "rewards/rejected": -2.936645030975342, "step": 433 }, { "epoch": 1.2513542795232935, "grad_norm": 6.2711052894592285, "learning_rate": 8.748554913294798e-07, "logits/chosen": -2.7720322608947754, "logits/rejected": -2.95858097076416, "logps/chosen": -49.2936897277832, "logps/rejected": -86.44499206542969, "loss": 0.5868, "rewards/accuracies": 0.96875, "rewards/chosen": -0.05711996555328369, "rewards/margins": 3.697941303253174, "rewards/rejected": -3.755061149597168, "step": 434 }, { "epoch": 1.2542434091729866, "grad_norm": 4.672637462615967, "learning_rate": 8.745664739884393e-07, "logits/chosen": -2.683133840560913, "logits/rejected": -2.8268959522247314, "logps/chosen": -31.175844192504883, "logps/rejected": -74.44174194335938, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 1.4744740724563599, "rewards/margins": 4.108086109161377, "rewards/rejected": -2.6336121559143066, "step": 435 }, { "epoch": 1.2571325388226797, "grad_norm": 4.736401081085205, "learning_rate": 8.742774566473988e-07, "logits/chosen": -2.7310378551483154, "logits/rejected": -2.8620333671569824, "logps/chosen": -38.13949966430664, "logps/rejected": -81.59400939941406, "loss": 0.5392, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8322504162788391, "rewards/margins": 4.03175163269043, "rewards/rejected": -3.1995010375976562, "step": 436 }, { "epoch": 1.2600216684723726, "grad_norm": 4.3230881690979, "learning_rate": 8.739884393063584e-07, "logits/chosen": -2.7729902267456055, "logits/rejected": -2.8223047256469727, "logps/chosen": -39.73530197143555, "logps/rejected": -85.56317901611328, "loss": 0.4641, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9063411355018616, "rewards/margins": 4.459737300872803, "rewards/rejected": -3.553395986557007, "step": 437 }, { "epoch": 1.2629107981220657, "grad_norm": 5.208999156951904, "learning_rate": 8.736994219653178e-07, "logits/chosen": -2.755342721939087, "logits/rejected": -2.918727397918701, "logps/chosen": -32.01211166381836, "logps/rejected": -77.4506607055664, "loss": 0.5032, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4197477102279663, "rewards/margins": 4.112761497497559, "rewards/rejected": -2.693013906478882, "step": 438 }, { "epoch": 1.2657999277717589, "grad_norm": 6.019786357879639, "learning_rate": 8.734104046242774e-07, "logits/chosen": -2.7843399047851562, "logits/rejected": -2.9184255599975586, "logps/chosen": -33.55283737182617, "logps/rejected": -73.92412567138672, "loss": 0.4898, "rewards/accuracies": 0.96875, "rewards/chosen": 1.275992751121521, "rewards/margins": 3.9589927196502686, "rewards/rejected": -2.682999849319458, "step": 439 }, { "epoch": 1.2686890574214518, "grad_norm": 5.056728363037109, "learning_rate": 8.731213872832369e-07, "logits/chosen": -2.7187554836273193, "logits/rejected": -2.93357253074646, "logps/chosen": -35.32181167602539, "logps/rejected": -72.80290985107422, "loss": 0.5777, "rewards/accuracies": 0.875, "rewards/chosen": 1.005698800086975, "rewards/margins": 3.4086904525756836, "rewards/rejected": -2.402991533279419, "step": 440 }, { "epoch": 1.271578187071145, "grad_norm": 4.9927215576171875, "learning_rate": 8.728323699421965e-07, "logits/chosen": -2.698293924331665, "logits/rejected": -2.8382792472839355, "logps/chosen": -32.40521240234375, "logps/rejected": -72.90019989013672, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 1.1709896326065063, "rewards/margins": 3.6544604301452637, "rewards/rejected": -2.483470916748047, "step": 441 }, { "epoch": 1.2744673167208378, "grad_norm": 5.236688137054443, "learning_rate": 8.72543352601156e-07, "logits/chosen": -2.8073441982269287, "logits/rejected": -2.9317736625671387, "logps/chosen": -37.74362564086914, "logps/rejected": -83.48320007324219, "loss": 0.4814, "rewards/accuracies": 1.0, "rewards/chosen": 0.6183822751045227, "rewards/margins": 4.198382377624512, "rewards/rejected": -3.580000400543213, "step": 442 }, { "epoch": 1.277356446370531, "grad_norm": 4.8349385261535645, "learning_rate": 8.722543352601156e-07, "logits/chosen": -2.7691972255706787, "logits/rejected": -2.935199737548828, "logps/chosen": -35.38799285888672, "logps/rejected": -74.75297546386719, "loss": 0.4935, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0413532257080078, "rewards/margins": 3.787895679473877, "rewards/rejected": -2.746542453765869, "step": 443 }, { "epoch": 1.2802455760202238, "grad_norm": 4.958865165710449, "learning_rate": 8.719653179190752e-07, "logits/chosen": -2.865434169769287, "logits/rejected": -2.983201026916504, "logps/chosen": -38.406280517578125, "logps/rejected": -76.03527069091797, "loss": 0.5542, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8820172548294067, "rewards/margins": 3.6524250507354736, "rewards/rejected": -2.7704076766967773, "step": 444 }, { "epoch": 1.283134705669917, "grad_norm": 5.309213638305664, "learning_rate": 8.716763005780346e-07, "logits/chosen": -2.816439628601074, "logits/rejected": -2.997514009475708, "logps/chosen": -46.15966033935547, "logps/rejected": -72.27320098876953, "loss": 0.7242, "rewards/accuracies": 0.84375, "rewards/chosen": 0.10111969709396362, "rewards/margins": 2.4936578273773193, "rewards/rejected": -2.392537832260132, "step": 445 }, { "epoch": 1.28602383531961, "grad_norm": 4.3404083251953125, "learning_rate": 8.713872832369942e-07, "logits/chosen": -2.8198482990264893, "logits/rejected": -3.068601608276367, "logps/chosen": -39.64863204956055, "logps/rejected": -79.35392761230469, "loss": 0.5108, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6846644878387451, "rewards/margins": 3.5911636352539062, "rewards/rejected": -2.906498908996582, "step": 446 }, { "epoch": 1.288912964969303, "grad_norm": 3.8316311836242676, "learning_rate": 8.710982658959537e-07, "logits/chosen": -2.7471210956573486, "logits/rejected": -2.933821439743042, "logps/chosen": -38.15103530883789, "logps/rejected": -75.36856079101562, "loss": 0.5516, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7086495757102966, "rewards/margins": 3.4639217853546143, "rewards/rejected": -2.755272150039673, "step": 447 }, { "epoch": 1.291802094618996, "grad_norm": 6.028949737548828, "learning_rate": 8.708092485549132e-07, "logits/chosen": -2.7934036254882812, "logits/rejected": -2.908989429473877, "logps/chosen": -34.22704315185547, "logps/rejected": -68.52715301513672, "loss": 0.5579, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3431901931762695, "rewards/margins": 3.287559747695923, "rewards/rejected": -1.9443693161010742, "step": 448 }, { "epoch": 1.294691224268689, "grad_norm": 4.961402893066406, "learning_rate": 8.705202312138727e-07, "logits/chosen": -2.7940239906311035, "logits/rejected": -2.934861660003662, "logps/chosen": -36.886409759521484, "logps/rejected": -75.4515151977539, "loss": 0.5551, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7367959022521973, "rewards/margins": 3.523159980773926, "rewards/rejected": -2.7863643169403076, "step": 449 }, { "epoch": 1.2975803539183821, "grad_norm": 4.775138854980469, "learning_rate": 8.702312138728324e-07, "logits/chosen": -2.7407169342041016, "logits/rejected": -2.9240574836730957, "logps/chosen": -37.243648529052734, "logps/rejected": -74.2890853881836, "loss": 0.5255, "rewards/accuracies": 0.9375, "rewards/chosen": 1.149469017982483, "rewards/margins": 3.7668211460113525, "rewards/rejected": -2.617352247238159, "step": 450 }, { "epoch": 1.300469483568075, "grad_norm": 5.8019208908081055, "learning_rate": 8.699421965317919e-07, "logits/chosen": -2.8531250953674316, "logits/rejected": -3.0119290351867676, "logps/chosen": -30.091724395751953, "logps/rejected": -71.05150604248047, "loss": 0.551, "rewards/accuracies": 0.90625, "rewards/chosen": 1.424283742904663, "rewards/margins": 3.9558916091918945, "rewards/rejected": -2.5316081047058105, "step": 451 }, { "epoch": 1.3033586132177681, "grad_norm": 4.535440921783447, "learning_rate": 8.696531791907514e-07, "logits/chosen": -2.824815034866333, "logits/rejected": -3.0352108478546143, "logps/chosen": -33.35322189331055, "logps/rejected": -81.41117095947266, "loss": 0.4256, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4413789510726929, "rewards/margins": 4.803022861480713, "rewards/rejected": -3.3616433143615723, "step": 452 }, { "epoch": 1.3062477428674613, "grad_norm": 4.982152462005615, "learning_rate": 8.69364161849711e-07, "logits/chosen": -2.86645770072937, "logits/rejected": -2.956289052963257, "logps/chosen": -38.46508026123047, "logps/rejected": -77.80690002441406, "loss": 0.6154, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6436825394630432, "rewards/margins": 3.614814043045044, "rewards/rejected": -2.9711313247680664, "step": 453 }, { "epoch": 1.3091368725171542, "grad_norm": 3.806307554244995, "learning_rate": 8.690751445086705e-07, "logits/chosen": -2.759852647781372, "logits/rejected": -2.886793613433838, "logps/chosen": -40.38119888305664, "logps/rejected": -87.66114044189453, "loss": 0.4474, "rewards/accuracies": 0.96875, "rewards/chosen": 0.850340723991394, "rewards/margins": 4.7124128341674805, "rewards/rejected": -3.8620715141296387, "step": 454 }, { "epoch": 1.3120260021668473, "grad_norm": 4.511470794677734, "learning_rate": 8.6878612716763e-07, "logits/chosen": -2.8225245475769043, "logits/rejected": -2.99293851852417, "logps/chosen": -31.107614517211914, "logps/rejected": -70.64469146728516, "loss": 0.5962, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0877379179000854, "rewards/margins": 3.592844009399414, "rewards/rejected": -2.505105972290039, "step": 455 }, { "epoch": 1.3149151318165404, "grad_norm": 4.278866291046143, "learning_rate": 8.684971098265895e-07, "logits/chosen": -2.8370516300201416, "logits/rejected": -2.9592154026031494, "logps/chosen": -45.782649993896484, "logps/rejected": -79.47604370117188, "loss": 0.6143, "rewards/accuracies": 0.9375, "rewards/chosen": -0.026744753122329712, "rewards/margins": 3.2368860244750977, "rewards/rejected": -3.2636303901672363, "step": 456 }, { "epoch": 1.3178042614662333, "grad_norm": 5.481095314025879, "learning_rate": 8.682080924855491e-07, "logits/chosen": -2.839883804321289, "logits/rejected": -3.000767230987549, "logps/chosen": -35.029815673828125, "logps/rejected": -73.94115447998047, "loss": 0.5889, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1072138547897339, "rewards/margins": 3.5285520553588867, "rewards/rejected": -2.4213385581970215, "step": 457 }, { "epoch": 1.3206933911159262, "grad_norm": 4.927905082702637, "learning_rate": 8.679190751445087e-07, "logits/chosen": -2.775681495666504, "logits/rejected": -2.9392733573913574, "logps/chosen": -44.71532440185547, "logps/rejected": -84.72069549560547, "loss": 0.5457, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13403868675231934, "rewards/margins": 3.8792638778686523, "rewards/rejected": -3.745224952697754, "step": 458 }, { "epoch": 1.3235825207656193, "grad_norm": 4.978816509246826, "learning_rate": 8.676300578034682e-07, "logits/chosen": -2.8252480030059814, "logits/rejected": -2.9670770168304443, "logps/chosen": -39.3198356628418, "logps/rejected": -82.33750915527344, "loss": 0.5245, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7262114882469177, "rewards/margins": 4.006939888000488, "rewards/rejected": -3.280728340148926, "step": 459 }, { "epoch": 1.3264716504153125, "grad_norm": 4.851628303527832, "learning_rate": 8.673410404624277e-07, "logits/chosen": -2.7195050716400146, "logits/rejected": -2.978254795074463, "logps/chosen": -35.67596435546875, "logps/rejected": -82.69499206542969, "loss": 0.4917, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8452720642089844, "rewards/margins": 4.205634117126465, "rewards/rejected": -3.3603620529174805, "step": 460 }, { "epoch": 1.3293607800650054, "grad_norm": 5.097532272338867, "learning_rate": 8.670520231213873e-07, "logits/chosen": -2.8602705001831055, "logits/rejected": -2.985159158706665, "logps/chosen": -43.48902130126953, "logps/rejected": -81.47476959228516, "loss": 0.6592, "rewards/accuracies": 0.84375, "rewards/chosen": 0.059732019901275635, "rewards/margins": 3.4308066368103027, "rewards/rejected": -3.371074676513672, "step": 461 }, { "epoch": 1.3322499097146985, "grad_norm": 4.072976112365723, "learning_rate": 8.667630057803467e-07, "logits/chosen": -2.8558833599090576, "logits/rejected": -2.9495489597320557, "logps/chosen": -36.235843658447266, "logps/rejected": -78.67971801757812, "loss": 0.5259, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6205334663391113, "rewards/margins": 3.89579439163208, "rewards/rejected": -3.275261163711548, "step": 462 }, { "epoch": 1.3351390393643916, "grad_norm": 5.3163042068481445, "learning_rate": 8.664739884393063e-07, "logits/chosen": -2.7773079872131348, "logits/rejected": -3.00466251373291, "logps/chosen": -37.96452331542969, "logps/rejected": -75.76390075683594, "loss": 0.5532, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9740297794342041, "rewards/margins": 3.733351945877075, "rewards/rejected": -2.75932240486145, "step": 463 }, { "epoch": 1.3380281690140845, "grad_norm": 7.07990026473999, "learning_rate": 8.661849710982659e-07, "logits/chosen": -2.714033603668213, "logits/rejected": -2.9774961471557617, "logps/chosen": -28.17632484436035, "logps/rejected": -64.1485824584961, "loss": 0.5466, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9976109266281128, "rewards/margins": 3.460076332092285, "rewards/rejected": -1.4624650478363037, "step": 464 }, { "epoch": 1.3409172986637774, "grad_norm": 5.817718029022217, "learning_rate": 8.658959537572253e-07, "logits/chosen": -2.788031578063965, "logits/rejected": -2.9505178928375244, "logps/chosen": -36.33378982543945, "logps/rejected": -73.20217895507812, "loss": 0.5747, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8354604244232178, "rewards/margins": 3.5224976539611816, "rewards/rejected": -2.687037467956543, "step": 465 }, { "epoch": 1.3438064283134705, "grad_norm": 4.189802646636963, "learning_rate": 8.65606936416185e-07, "logits/chosen": -2.727475643157959, "logits/rejected": -2.8999013900756836, "logps/chosen": -33.12579345703125, "logps/rejected": -82.27655792236328, "loss": 0.4777, "rewards/accuracies": 0.9375, "rewards/chosen": 1.326291561126709, "rewards/margins": 4.756531238555908, "rewards/rejected": -3.4302401542663574, "step": 466 }, { "epoch": 1.3466955579631636, "grad_norm": 5.114218235015869, "learning_rate": 8.653179190751445e-07, "logits/chosen": -2.743811845779419, "logits/rejected": -2.9211032390594482, "logps/chosen": -33.62108612060547, "logps/rejected": -72.07921600341797, "loss": 0.5654, "rewards/accuracies": 0.9375, "rewards/chosen": 1.22584068775177, "rewards/margins": 3.5303943157196045, "rewards/rejected": -2.304553747177124, "step": 467 }, { "epoch": 1.3495846876128565, "grad_norm": 4.164299488067627, "learning_rate": 8.650289017341041e-07, "logits/chosen": -2.7877869606018066, "logits/rejected": -2.9523630142211914, "logps/chosen": -36.92043685913086, "logps/rejected": -81.80934143066406, "loss": 0.5303, "rewards/accuracies": 0.9375, "rewards/chosen": 0.834252119064331, "rewards/margins": 4.287662506103516, "rewards/rejected": -3.4534101486206055, "step": 468 }, { "epoch": 1.3524738172625497, "grad_norm": 3.7396445274353027, "learning_rate": 8.647398843930635e-07, "logits/chosen": -2.806626558303833, "logits/rejected": -2.9790666103363037, "logps/chosen": -35.72969436645508, "logps/rejected": -78.11731719970703, "loss": 0.4729, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0530378818511963, "rewards/margins": 4.242936134338379, "rewards/rejected": -3.1898984909057617, "step": 469 }, { "epoch": 1.3553629469122428, "grad_norm": 6.15748929977417, "learning_rate": 8.644508670520231e-07, "logits/chosen": -2.775326728820801, "logits/rejected": -2.9336392879486084, "logps/chosen": -32.93771743774414, "logps/rejected": -67.1951904296875, "loss": 0.5941, "rewards/accuracies": 0.875, "rewards/chosen": 1.3558802604675293, "rewards/margins": 3.0758707523345947, "rewards/rejected": -1.7199904918670654, "step": 470 }, { "epoch": 1.3582520765619357, "grad_norm": 5.025757312774658, "learning_rate": 8.641618497109826e-07, "logits/chosen": -2.906747579574585, "logits/rejected": -3.041851043701172, "logps/chosen": -39.209693908691406, "logps/rejected": -84.14530944824219, "loss": 0.5368, "rewards/accuracies": 1.0, "rewards/chosen": 0.6926413178443909, "rewards/margins": 4.335644721984863, "rewards/rejected": -3.6430039405822754, "step": 471 }, { "epoch": 1.3611412062116288, "grad_norm": 4.259728908538818, "learning_rate": 8.638728323699421e-07, "logits/chosen": -2.7182979583740234, "logits/rejected": -2.914658546447754, "logps/chosen": -31.912151336669922, "logps/rejected": -76.17743682861328, "loss": 0.4591, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3938807249069214, "rewards/margins": 4.1492109298706055, "rewards/rejected": -2.7553305625915527, "step": 472 }, { "epoch": 1.3640303358613217, "grad_norm": 6.305069446563721, "learning_rate": 8.635838150289016e-07, "logits/chosen": -2.8170130252838135, "logits/rejected": -2.930980920791626, "logps/chosen": -40.73451232910156, "logps/rejected": -72.90861511230469, "loss": 0.6129, "rewards/accuracies": 0.875, "rewards/chosen": 0.5649321675300598, "rewards/margins": 3.0019922256469727, "rewards/rejected": -2.4370601177215576, "step": 473 }, { "epoch": 1.3669194655110148, "grad_norm": 5.992141246795654, "learning_rate": 8.632947976878613e-07, "logits/chosen": -2.7938718795776367, "logits/rejected": -2.9787516593933105, "logps/chosen": -30.731271743774414, "logps/rejected": -69.7202377319336, "loss": 0.5071, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4521815776824951, "rewards/margins": 3.6843912601470947, "rewards/rejected": -2.2322096824645996, "step": 474 }, { "epoch": 1.3698085951607077, "grad_norm": 5.110053062438965, "learning_rate": 8.630057803468209e-07, "logits/chosen": -2.813521385192871, "logits/rejected": -2.9182240962982178, "logps/chosen": -43.69904708862305, "logps/rejected": -90.57720947265625, "loss": 0.5959, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2946796715259552, "rewards/margins": 4.1905598640441895, "rewards/rejected": -3.8958804607391357, "step": 475 }, { "epoch": 1.3726977248104009, "grad_norm": 5.300953388214111, "learning_rate": 8.627167630057803e-07, "logits/chosen": -2.9244351387023926, "logits/rejected": -3.0175418853759766, "logps/chosen": -39.600257873535156, "logps/rejected": -88.1875, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 0.6742545366287231, "rewards/margins": 4.393219947814941, "rewards/rejected": -3.718965768814087, "step": 476 }, { "epoch": 1.375586854460094, "grad_norm": 4.702014446258545, "learning_rate": 8.624277456647399e-07, "logits/chosen": -2.831960916519165, "logits/rejected": -2.994563579559326, "logps/chosen": -31.149959564208984, "logps/rejected": -77.2069091796875, "loss": 0.4799, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4388375282287598, "rewards/margins": 4.654277324676514, "rewards/rejected": -3.215439796447754, "step": 477 }, { "epoch": 1.378475984109787, "grad_norm": 5.9743499755859375, "learning_rate": 8.621387283236994e-07, "logits/chosen": -2.742887496948242, "logits/rejected": -2.9690804481506348, "logps/chosen": -38.23392105102539, "logps/rejected": -78.25518798828125, "loss": 0.5166, "rewards/accuracies": 0.875, "rewards/chosen": 1.0967557430267334, "rewards/margins": 4.050760269165039, "rewards/rejected": -2.9540042877197266, "step": 478 }, { "epoch": 1.38136511375948, "grad_norm": 5.3449506759643555, "learning_rate": 8.618497109826589e-07, "logits/chosen": -2.795515775680542, "logits/rejected": -2.9254143238067627, "logps/chosen": -33.56208419799805, "logps/rejected": -64.46864318847656, "loss": 0.6228, "rewards/accuracies": 0.96875, "rewards/chosen": 1.018775463104248, "rewards/margins": 3.0759241580963135, "rewards/rejected": -2.0571489334106445, "step": 479 }, { "epoch": 1.384254243409173, "grad_norm": 5.411711692810059, "learning_rate": 8.615606936416184e-07, "logits/chosen": -2.807673931121826, "logits/rejected": -2.9505765438079834, "logps/chosen": -37.97096633911133, "logps/rejected": -69.3465576171875, "loss": 0.5613, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9105626344680786, "rewards/margins": 3.2901394367218018, "rewards/rejected": -2.3795769214630127, "step": 480 }, { "epoch": 1.387143373058866, "grad_norm": 4.585638046264648, "learning_rate": 8.61271676300578e-07, "logits/chosen": -2.760918140411377, "logits/rejected": -2.973991632461548, "logps/chosen": -30.44251823425293, "logps/rejected": -80.97653198242188, "loss": 0.4016, "rewards/accuracies": 1.0, "rewards/chosen": 1.7074700593948364, "rewards/margins": 5.052731037139893, "rewards/rejected": -3.3452610969543457, "step": 481 }, { "epoch": 1.390032502708559, "grad_norm": 6.584817409515381, "learning_rate": 8.609826589595374e-07, "logits/chosen": -2.842930316925049, "logits/rejected": -2.928795576095581, "logps/chosen": -28.986557006835938, "logps/rejected": -68.39892578125, "loss": 0.4928, "rewards/accuracies": 1.0, "rewards/chosen": 1.6220492124557495, "rewards/margins": 3.929259777069092, "rewards/rejected": -2.307210683822632, "step": 482 }, { "epoch": 1.392921632358252, "grad_norm": 5.2399983406066895, "learning_rate": 8.606936416184971e-07, "logits/chosen": -2.8298325538635254, "logits/rejected": -3.0109729766845703, "logps/chosen": -36.18417739868164, "logps/rejected": -85.63337707519531, "loss": 0.494, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1250920295715332, "rewards/margins": 4.855281352996826, "rewards/rejected": -3.730189561843872, "step": 483 }, { "epoch": 1.3958107620079452, "grad_norm": 5.266636371612549, "learning_rate": 8.604046242774566e-07, "logits/chosen": -2.875777244567871, "logits/rejected": -2.9699795246124268, "logps/chosen": -34.29158401489258, "logps/rejected": -74.39845275878906, "loss": 0.5246, "rewards/accuracies": 1.0, "rewards/chosen": 1.32461416721344, "rewards/margins": 3.931121349334717, "rewards/rejected": -2.6065073013305664, "step": 484 }, { "epoch": 1.398699891657638, "grad_norm": 4.798684597015381, "learning_rate": 8.601156069364162e-07, "logits/chosen": -2.7796380519866943, "logits/rejected": -2.9181394577026367, "logps/chosen": -36.28969192504883, "logps/rejected": -73.37995147705078, "loss": 0.5517, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9576826095581055, "rewards/margins": 3.6894073486328125, "rewards/rejected": -2.731724977493286, "step": 485 }, { "epoch": 1.4015890213073312, "grad_norm": 5.37644624710083, "learning_rate": 8.598265895953757e-07, "logits/chosen": -2.7120361328125, "logits/rejected": -2.904144287109375, "logps/chosen": -32.45103073120117, "logps/rejected": -74.20600128173828, "loss": 0.5521, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3779481649398804, "rewards/margins": 3.902850866317749, "rewards/rejected": -2.5249030590057373, "step": 486 }, { "epoch": 1.4044781509570243, "grad_norm": 5.196961402893066, "learning_rate": 8.595375722543352e-07, "logits/chosen": -2.827479839324951, "logits/rejected": -2.9298112392425537, "logps/chosen": -40.052921295166016, "logps/rejected": -84.5960922241211, "loss": 0.5363, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6545298099517822, "rewards/margins": 4.211878299713135, "rewards/rejected": -3.5573484897613525, "step": 487 }, { "epoch": 1.4073672806067172, "grad_norm": 4.46522331237793, "learning_rate": 8.592485549132948e-07, "logits/chosen": -2.8187191486358643, "logits/rejected": -3.034477472305298, "logps/chosen": -30.29120635986328, "logps/rejected": -75.37663269042969, "loss": 0.522, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4455540180206299, "rewards/margins": 4.1688456535339355, "rewards/rejected": -2.7232918739318848, "step": 488 }, { "epoch": 1.4102564102564101, "grad_norm": 5.317508220672607, "learning_rate": 8.589595375722542e-07, "logits/chosen": -2.8216190338134766, "logits/rejected": -2.9796597957611084, "logps/chosen": -36.20509719848633, "logps/rejected": -78.69483184814453, "loss": 0.5069, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0738444328308105, "rewards/margins": 4.138364791870117, "rewards/rejected": -3.0645205974578857, "step": 489 }, { "epoch": 1.4131455399061033, "grad_norm": 5.689909934997559, "learning_rate": 8.586705202312138e-07, "logits/chosen": -2.857208728790283, "logits/rejected": -3.001729965209961, "logps/chosen": -36.77024459838867, "logps/rejected": -84.82415008544922, "loss": 0.5213, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9671823978424072, "rewards/margins": 4.465588092803955, "rewards/rejected": -3.498405694961548, "step": 490 }, { "epoch": 1.4131455399061033, "eval_logits/chosen": -2.879606246948242, "eval_logits/rejected": -3.023911237716675, "eval_logps/chosen": -38.820030212402344, "eval_logps/rejected": -83.6218032836914, "eval_loss": 0.5325563549995422, "eval_rewards/accuracies": 0.9435483813285828, "eval_rewards/chosen": 0.7496388554573059, "eval_rewards/margins": 4.416802883148193, "eval_rewards/rejected": -3.667163610458374, "eval_runtime": 225.5598, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.275, "step": 490 }, { "epoch": 1.4160346695557964, "grad_norm": 4.376262664794922, "learning_rate": 8.583815028901734e-07, "logits/chosen": -2.8714046478271484, "logits/rejected": -2.958890914916992, "logps/chosen": -38.66341018676758, "logps/rejected": -79.9447250366211, "loss": 0.5678, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7394583821296692, "rewards/margins": 3.8064346313476562, "rewards/rejected": -3.0669760704040527, "step": 491 }, { "epoch": 1.4189237992054893, "grad_norm": 8.375000953674316, "learning_rate": 8.58092485549133e-07, "logits/chosen": -2.8456196784973145, "logits/rejected": -2.9597809314727783, "logps/chosen": -40.19248962402344, "logps/rejected": -85.13108825683594, "loss": 0.5611, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5715253353118896, "rewards/margins": 4.1237969398498535, "rewards/rejected": -3.552271842956543, "step": 492 }, { "epoch": 1.4218129288551824, "grad_norm": 4.804337978363037, "learning_rate": 8.578034682080924e-07, "logits/chosen": -2.921769142150879, "logits/rejected": -2.996828556060791, "logps/chosen": -41.36530303955078, "logps/rejected": -80.02484893798828, "loss": 0.623, "rewards/accuracies": 0.96875, "rewards/chosen": 0.41631799936294556, "rewards/margins": 3.5545711517333984, "rewards/rejected": -3.1382532119750977, "step": 493 }, { "epoch": 1.4247020585048755, "grad_norm": 4.92618465423584, "learning_rate": 8.57514450867052e-07, "logits/chosen": -2.720742702484131, "logits/rejected": -2.9131276607513428, "logps/chosen": -39.58763122558594, "logps/rejected": -87.52918243408203, "loss": 0.5524, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7949979305267334, "rewards/margins": 4.49998664855957, "rewards/rejected": -3.704988956451416, "step": 494 }, { "epoch": 1.4275911881545684, "grad_norm": 7.055140495300293, "learning_rate": 8.572254335260115e-07, "logits/chosen": -2.7689247131347656, "logits/rejected": -2.9362668991088867, "logps/chosen": -38.513206481933594, "logps/rejected": -74.66799926757812, "loss": 0.5003, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3343929052352905, "rewards/margins": 3.5852746963500977, "rewards/rejected": -2.2508819103240967, "step": 495 }, { "epoch": 1.4304803178042615, "grad_norm": 6.308236122131348, "learning_rate": 8.56936416184971e-07, "logits/chosen": -2.9129374027252197, "logits/rejected": -3.0743496417999268, "logps/chosen": -41.380306243896484, "logps/rejected": -84.47854614257812, "loss": 0.5709, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25370752811431885, "rewards/margins": 4.111812591552734, "rewards/rejected": -3.858104944229126, "step": 496 }, { "epoch": 1.4333694474539544, "grad_norm": 5.0766448974609375, "learning_rate": 8.566473988439306e-07, "logits/chosen": -2.932018280029297, "logits/rejected": -3.0889806747436523, "logps/chosen": -33.890708923339844, "logps/rejected": -81.53955078125, "loss": 0.5248, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9664660096168518, "rewards/margins": 4.405239582061768, "rewards/rejected": -3.4387736320495605, "step": 497 }, { "epoch": 1.4362585771036476, "grad_norm": 4.894867420196533, "learning_rate": 8.563583815028901e-07, "logits/chosen": -2.863499402999878, "logits/rejected": -2.984708309173584, "logps/chosen": -36.28825378417969, "logps/rejected": -80.10680389404297, "loss": 0.535, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8964564800262451, "rewards/margins": 3.994307518005371, "rewards/rejected": -3.097851276397705, "step": 498 }, { "epoch": 1.4391477067533405, "grad_norm": 5.814592361450195, "learning_rate": 8.560693641618497e-07, "logits/chosen": -2.835535764694214, "logits/rejected": -2.997244358062744, "logps/chosen": -43.31106948852539, "logps/rejected": -88.48612976074219, "loss": 0.6573, "rewards/accuracies": 0.9375, "rewards/chosen": 0.37227892875671387, "rewards/margins": 3.8431246280670166, "rewards/rejected": -3.4708456993103027, "step": 499 }, { "epoch": 1.4420368364030336, "grad_norm": 4.953680038452148, "learning_rate": 8.557803468208092e-07, "logits/chosen": -2.793914794921875, "logits/rejected": -2.9797778129577637, "logps/chosen": -37.890838623046875, "logps/rejected": -73.50547790527344, "loss": 0.6216, "rewards/accuracies": 0.875, "rewards/chosen": 0.6936928033828735, "rewards/margins": 3.0482349395751953, "rewards/rejected": -2.3545424938201904, "step": 500 }, { "epoch": 1.4449259660527267, "grad_norm": 4.454192638397217, "learning_rate": 8.554913294797688e-07, "logits/chosen": -2.8273067474365234, "logits/rejected": -3.0185768604278564, "logps/chosen": -29.83706283569336, "logps/rejected": -80.76670837402344, "loss": 0.4555, "rewards/accuracies": 0.9375, "rewards/chosen": 1.47136652469635, "rewards/margins": 4.6129536628723145, "rewards/rejected": -3.141587018966675, "step": 501 }, { "epoch": 1.4478150957024196, "grad_norm": 4.750903606414795, "learning_rate": 8.552023121387283e-07, "logits/chosen": -2.878018856048584, "logits/rejected": -2.977294921875, "logps/chosen": -37.87764358520508, "logps/rejected": -84.04717254638672, "loss": 0.6232, "rewards/accuracies": 0.9375, "rewards/chosen": 0.691935658454895, "rewards/margins": 3.9389848709106445, "rewards/rejected": -3.24704909324646, "step": 502 }, { "epoch": 1.4507042253521127, "grad_norm": 4.823765277862549, "learning_rate": 8.549132947976878e-07, "logits/chosen": -2.760106325149536, "logits/rejected": -2.898512601852417, "logps/chosen": -44.010108947753906, "logps/rejected": -86.44173431396484, "loss": 0.5796, "rewards/accuracies": 0.875, "rewards/chosen": 0.4723835587501526, "rewards/margins": 4.065179824829102, "rewards/rejected": -3.5927960872650146, "step": 503 }, { "epoch": 1.4535933550018056, "grad_norm": 6.866596221923828, "learning_rate": 8.546242774566473e-07, "logits/chosen": -2.9152469635009766, "logits/rejected": -3.0345189571380615, "logps/chosen": -38.52988052368164, "logps/rejected": -73.99510955810547, "loss": 0.6106, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6230095624923706, "rewards/margins": 3.3455018997192383, "rewards/rejected": -2.722492218017578, "step": 504 }, { "epoch": 1.4564824846514988, "grad_norm": 4.852906227111816, "learning_rate": 8.543352601156069e-07, "logits/chosen": -2.9676876068115234, "logits/rejected": -3.1129908561706543, "logps/chosen": -36.68492126464844, "logps/rejected": -78.21269226074219, "loss": 0.5639, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7597385048866272, "rewards/margins": 4.039193153381348, "rewards/rejected": -3.279454469680786, "step": 505 }, { "epoch": 1.4593716143011917, "grad_norm": 8.660745620727539, "learning_rate": 8.540462427745663e-07, "logits/chosen": -2.968233108520508, "logits/rejected": -3.0475916862487793, "logps/chosen": -39.89090347290039, "logps/rejected": -92.43059539794922, "loss": 0.5203, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8688349723815918, "rewards/margins": 5.033344268798828, "rewards/rejected": -4.1645097732543945, "step": 506 }, { "epoch": 1.4622607439508848, "grad_norm": 6.254988670349121, "learning_rate": 8.53757225433526e-07, "logits/chosen": -2.8835861682891846, "logits/rejected": -3.0386312007904053, "logps/chosen": -33.4464225769043, "logps/rejected": -72.37200927734375, "loss": 0.5242, "rewards/accuracies": 0.90625, "rewards/chosen": 1.508113145828247, "rewards/margins": 4.016477108001709, "rewards/rejected": -2.508363723754883, "step": 507 }, { "epoch": 1.465149873600578, "grad_norm": 4.652965068817139, "learning_rate": 8.534682080924856e-07, "logits/chosen": -2.6557397842407227, "logits/rejected": -2.849747657775879, "logps/chosen": -26.856779098510742, "logps/rejected": -72.51593780517578, "loss": 0.4882, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9603663682937622, "rewards/margins": 4.45388650894165, "rewards/rejected": -2.4935202598571777, "step": 508 }, { "epoch": 1.4680390032502708, "grad_norm": 4.648406505584717, "learning_rate": 8.531791907514451e-07, "logits/chosen": -2.8713395595550537, "logits/rejected": -3.002530574798584, "logps/chosen": -35.85606002807617, "logps/rejected": -79.28520965576172, "loss": 0.4995, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0395764112472534, "rewards/margins": 4.210742473602295, "rewards/rejected": -3.1711666584014893, "step": 509 }, { "epoch": 1.470928132899964, "grad_norm": 6.485387325286865, "learning_rate": 8.528901734104046e-07, "logits/chosen": -2.894598960876465, "logits/rejected": -3.0752475261688232, "logps/chosen": -28.217205047607422, "logps/rejected": -69.2518539428711, "loss": 0.4892, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8187456130981445, "rewards/margins": 4.058292388916016, "rewards/rejected": -2.239546298980713, "step": 510 }, { "epoch": 1.4738172625496568, "grad_norm": 5.107236385345459, "learning_rate": 8.526011560693641e-07, "logits/chosen": -2.885706901550293, "logits/rejected": -2.9088377952575684, "logps/chosen": -33.92036437988281, "logps/rejected": -76.28575897216797, "loss": 0.5165, "rewards/accuracies": 0.875, "rewards/chosen": 1.1448192596435547, "rewards/margins": 4.077378749847412, "rewards/rejected": -2.9325594902038574, "step": 511 }, { "epoch": 1.47670639219935, "grad_norm": 6.3228983879089355, "learning_rate": 8.523121387283237e-07, "logits/chosen": -2.877225637435913, "logits/rejected": -2.993406295776367, "logps/chosen": -32.04007339477539, "logps/rejected": -69.0103759765625, "loss": 0.5471, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3186954259872437, "rewards/margins": 3.586280584335327, "rewards/rejected": -2.267585515975952, "step": 512 }, { "epoch": 1.4795955218490429, "grad_norm": 5.514613628387451, "learning_rate": 8.520231213872831e-07, "logits/chosen": -2.789038896560669, "logits/rejected": -2.9361724853515625, "logps/chosen": -31.87209129333496, "logps/rejected": -73.00839233398438, "loss": 0.5609, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0216784477233887, "rewards/margins": 3.5497500896453857, "rewards/rejected": -2.528071403503418, "step": 513 }, { "epoch": 1.482484651498736, "grad_norm": 5.002336502075195, "learning_rate": 8.517341040462427e-07, "logits/chosen": -2.9004290103912354, "logits/rejected": -3.10964298248291, "logps/chosen": -34.996238708496094, "logps/rejected": -89.68195343017578, "loss": 0.4592, "rewards/accuracies": 0.96875, "rewards/chosen": 1.055063009262085, "rewards/margins": 5.257054805755615, "rewards/rejected": -4.201991558074951, "step": 514 }, { "epoch": 1.485373781148429, "grad_norm": 4.465389728546143, "learning_rate": 8.514450867052023e-07, "logits/chosen": -2.8360743522644043, "logits/rejected": -2.998246192932129, "logps/chosen": -35.323692321777344, "logps/rejected": -77.61909484863281, "loss": 0.5631, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8671560287475586, "rewards/margins": 3.7537918090820312, "rewards/rejected": -2.8866360187530518, "step": 515 }, { "epoch": 1.488262910798122, "grad_norm": 4.9843525886535645, "learning_rate": 8.511560693641618e-07, "logits/chosen": -2.795457601547241, "logits/rejected": -2.992534875869751, "logps/chosen": -36.30201721191406, "logps/rejected": -77.32571411132812, "loss": 0.4701, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0124891996383667, "rewards/margins": 3.981069564819336, "rewards/rejected": -2.968580722808838, "step": 516 }, { "epoch": 1.4911520404478151, "grad_norm": 5.904293060302734, "learning_rate": 8.508670520231213e-07, "logits/chosen": -2.8839237689971924, "logits/rejected": -3.0261571407318115, "logps/chosen": -39.483673095703125, "logps/rejected": -75.34327697753906, "loss": 0.5194, "rewards/accuracies": 0.875, "rewards/chosen": 0.6698137521743774, "rewards/margins": 3.4694998264312744, "rewards/rejected": -2.7996859550476074, "step": 517 }, { "epoch": 1.4940411700975083, "grad_norm": 4.56111478805542, "learning_rate": 8.505780346820809e-07, "logits/chosen": -2.8774116039276123, "logits/rejected": -2.9190444946289062, "logps/chosen": -44.67308807373047, "logps/rejected": -86.72640228271484, "loss": 0.5783, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4429689645767212, "rewards/margins": 3.959906578063965, "rewards/rejected": -3.516937732696533, "step": 518 }, { "epoch": 1.4969302997472012, "grad_norm": 6.669811725616455, "learning_rate": 8.502890173410405e-07, "logits/chosen": -2.7617812156677246, "logits/rejected": -2.9562575817108154, "logps/chosen": -43.74060821533203, "logps/rejected": -83.7746353149414, "loss": 0.5906, "rewards/accuracies": 0.875, "rewards/chosen": 0.49553534388542175, "rewards/margins": 4.0483527183532715, "rewards/rejected": -3.5528173446655273, "step": 519 }, { "epoch": 1.499819429396894, "grad_norm": 5.847562789916992, "learning_rate": 8.499999999999999e-07, "logits/chosen": -2.8710761070251465, "logits/rejected": -3.035953998565674, "logps/chosen": -33.22985076904297, "logps/rejected": -91.01107025146484, "loss": 0.4193, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0972232818603516, "rewards/margins": 5.388428688049316, "rewards/rejected": -4.291204929351807, "step": 520 }, { "epoch": 1.5027085590465872, "grad_norm": 5.124783039093018, "learning_rate": 8.497109826589595e-07, "logits/chosen": -2.816801071166992, "logits/rejected": -2.9890239238739014, "logps/chosen": -36.249263763427734, "logps/rejected": -84.91102600097656, "loss": 0.4538, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8514481782913208, "rewards/margins": 4.649965763092041, "rewards/rejected": -3.7985172271728516, "step": 521 }, { "epoch": 1.5055976886962803, "grad_norm": 6.464309215545654, "learning_rate": 8.49421965317919e-07, "logits/chosen": -2.9010353088378906, "logits/rejected": -3.1532981395721436, "logps/chosen": -33.288604736328125, "logps/rejected": -87.8751220703125, "loss": 0.4885, "rewards/accuracies": 1.0, "rewards/chosen": 1.0561829805374146, "rewards/margins": 5.1956000328063965, "rewards/rejected": -4.139416694641113, "step": 522 }, { "epoch": 1.5084868183459732, "grad_norm": 6.305094242095947, "learning_rate": 8.491329479768786e-07, "logits/chosen": -2.894796848297119, "logits/rejected": -2.9984586238861084, "logps/chosen": -33.61380386352539, "logps/rejected": -79.0778579711914, "loss": 0.4979, "rewards/accuracies": 1.0, "rewards/chosen": 0.9295316338539124, "rewards/margins": 4.264792442321777, "rewards/rejected": -3.3352606296539307, "step": 523 }, { "epoch": 1.5113759479956663, "grad_norm": 5.550131797790527, "learning_rate": 8.488439306358381e-07, "logits/chosen": -2.9115092754364014, "logits/rejected": -3.0928213596343994, "logps/chosen": -34.9962158203125, "logps/rejected": -82.88037872314453, "loss": 0.4822, "rewards/accuracies": 0.9375, "rewards/chosen": 1.147985816001892, "rewards/margins": 4.5063090324401855, "rewards/rejected": -3.358323574066162, "step": 524 }, { "epoch": 1.5142650776453594, "grad_norm": 5.399001598358154, "learning_rate": 8.485549132947977e-07, "logits/chosen": -2.9987082481384277, "logits/rejected": -3.1683647632598877, "logps/chosen": -35.62678146362305, "logps/rejected": -81.18851470947266, "loss": 0.4907, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9452550411224365, "rewards/margins": 4.4766011238098145, "rewards/rejected": -3.531346321105957, "step": 525 }, { "epoch": 1.5171542072950523, "grad_norm": 5.634058952331543, "learning_rate": 8.482658959537572e-07, "logits/chosen": -2.843346357345581, "logits/rejected": -2.9609057903289795, "logps/chosen": -37.87563705444336, "logps/rejected": -84.86378479003906, "loss": 0.5293, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8951582312583923, "rewards/margins": 4.347986698150635, "rewards/rejected": -3.4528286457061768, "step": 526 }, { "epoch": 1.5200433369447452, "grad_norm": 5.478619575500488, "learning_rate": 8.479768786127167e-07, "logits/chosen": -2.7839601039886475, "logits/rejected": -2.966874837875366, "logps/chosen": -41.941768646240234, "logps/rejected": -88.36266326904297, "loss": 0.5447, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5694379210472107, "rewards/margins": 4.494062423706055, "rewards/rejected": -3.924624443054199, "step": 527 }, { "epoch": 1.5229324665944384, "grad_norm": 6.047064781188965, "learning_rate": 8.476878612716762e-07, "logits/chosen": -2.8991944789886475, "logits/rejected": -3.0594704151153564, "logps/chosen": -34.998905181884766, "logps/rejected": -86.83697509765625, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": 1.015048861503601, "rewards/margins": 4.883323669433594, "rewards/rejected": -3.868274211883545, "step": 528 }, { "epoch": 1.5258215962441315, "grad_norm": 5.442681312561035, "learning_rate": 8.473988439306358e-07, "logits/chosen": -2.9588539600372314, "logits/rejected": -3.100282907485962, "logps/chosen": -31.85093879699707, "logps/rejected": -81.3811264038086, "loss": 0.4965, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0205270051956177, "rewards/margins": 4.584382057189941, "rewards/rejected": -3.5638551712036133, "step": 529 }, { "epoch": 1.5287107258938244, "grad_norm": 4.650325298309326, "learning_rate": 8.471098265895953e-07, "logits/chosen": -2.7853918075561523, "logits/rejected": -2.9871950149536133, "logps/chosen": -37.55100631713867, "logps/rejected": -83.42362976074219, "loss": 0.4497, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8729323744773865, "rewards/margins": 4.330994129180908, "rewards/rejected": -3.458061695098877, "step": 530 }, { "epoch": 1.5315998555435175, "grad_norm": 6.241758346557617, "learning_rate": 8.468208092485549e-07, "logits/chosen": -2.832517623901367, "logits/rejected": -3.020355701446533, "logps/chosen": -36.22041702270508, "logps/rejected": -73.72248840332031, "loss": 0.5865, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8767622709274292, "rewards/margins": 3.763015031814575, "rewards/rejected": -2.8862528800964355, "step": 531 }, { "epoch": 1.5344889851932106, "grad_norm": 5.752298831939697, "learning_rate": 8.465317919075145e-07, "logits/chosen": -2.9602532386779785, "logits/rejected": -3.0638487339019775, "logps/chosen": -38.309326171875, "logps/rejected": -78.25000762939453, "loss": 0.557, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4337731599807739, "rewards/margins": 3.6152453422546387, "rewards/rejected": -3.181472063064575, "step": 532 }, { "epoch": 1.5373781148429035, "grad_norm": 5.3358540534973145, "learning_rate": 8.462427745664739e-07, "logits/chosen": -2.9744319915771484, "logits/rejected": -2.9937987327575684, "logps/chosen": -37.489990234375, "logps/rejected": -85.55256652832031, "loss": 0.4968, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7201905846595764, "rewards/margins": 4.391959190368652, "rewards/rejected": -3.6717684268951416, "step": 533 }, { "epoch": 1.5402672444925964, "grad_norm": 5.3339009284973145, "learning_rate": 8.459537572254335e-07, "logits/chosen": -3.000953435897827, "logits/rejected": -3.220940589904785, "logps/chosen": -28.34293556213379, "logps/rejected": -85.7071533203125, "loss": 0.4008, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5475428104400635, "rewards/margins": 5.55186653137207, "rewards/rejected": -4.0043230056762695, "step": 534 }, { "epoch": 1.5431563741422898, "grad_norm": 5.527181625366211, "learning_rate": 8.45664739884393e-07, "logits/chosen": -2.791100025177002, "logits/rejected": -2.9532179832458496, "logps/chosen": -34.79048156738281, "logps/rejected": -70.12191772460938, "loss": 0.5965, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0323145389556885, "rewards/margins": 3.3617162704467773, "rewards/rejected": -2.329401731491089, "step": 535 }, { "epoch": 1.5460455037919827, "grad_norm": 5.246410846710205, "learning_rate": 8.453757225433526e-07, "logits/chosen": -2.8435866832733154, "logits/rejected": -2.990196943283081, "logps/chosen": -38.027793884277344, "logps/rejected": -69.85352325439453, "loss": 0.6128, "rewards/accuracies": 0.875, "rewards/chosen": 0.9082375764846802, "rewards/margins": 3.0660629272460938, "rewards/rejected": -2.157825469970703, "step": 536 }, { "epoch": 1.5489346334416756, "grad_norm": 3.9340264797210693, "learning_rate": 8.45086705202312e-07, "logits/chosen": -2.7930121421813965, "logits/rejected": -2.998404026031494, "logps/chosen": -48.92507553100586, "logps/rejected": -92.17303466796875, "loss": 0.6062, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1081695631146431, "rewards/margins": 4.0976881980896, "rewards/rejected": -4.20585823059082, "step": 537 }, { "epoch": 1.5518237630913687, "grad_norm": 6.447320938110352, "learning_rate": 8.447976878612716e-07, "logits/chosen": -2.938324451446533, "logits/rejected": -2.9899353981018066, "logps/chosen": -35.59672546386719, "logps/rejected": -80.83482360839844, "loss": 0.5808, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9871600866317749, "rewards/margins": 4.116922378540039, "rewards/rejected": -3.1297621726989746, "step": 538 }, { "epoch": 1.5547128927410618, "grad_norm": 6.411078453063965, "learning_rate": 8.445086705202313e-07, "logits/chosen": -2.949838638305664, "logits/rejected": -3.110128402709961, "logps/chosen": -37.965911865234375, "logps/rejected": -93.57537078857422, "loss": 0.4635, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0113686323165894, "rewards/margins": 5.504851341247559, "rewards/rejected": -4.493483066558838, "step": 539 }, { "epoch": 1.5576020223907547, "grad_norm": 6.689858436584473, "learning_rate": 8.442196531791907e-07, "logits/chosen": -2.9329752922058105, "logits/rejected": -3.049015522003174, "logps/chosen": -37.800872802734375, "logps/rejected": -84.22163391113281, "loss": 0.5359, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8359326720237732, "rewards/margins": 4.196547508239746, "rewards/rejected": -3.360614538192749, "step": 540 }, { "epoch": 1.5604911520404479, "grad_norm": 5.707031726837158, "learning_rate": 8.439306358381503e-07, "logits/chosen": -2.8252949714660645, "logits/rejected": -3.0325334072113037, "logps/chosen": -30.461069107055664, "logps/rejected": -70.8265609741211, "loss": 0.4717, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7505152225494385, "rewards/margins": 3.998080015182495, "rewards/rejected": -2.2475647926330566, "step": 541 }, { "epoch": 1.563380281690141, "grad_norm": 6.490290641784668, "learning_rate": 8.436416184971098e-07, "logits/chosen": -3.0217297077178955, "logits/rejected": -3.101407766342163, "logps/chosen": -36.618553161621094, "logps/rejected": -84.51885986328125, "loss": 0.5164, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7734043598175049, "rewards/margins": 4.352943420410156, "rewards/rejected": -3.5795388221740723, "step": 542 }, { "epoch": 1.5662694113398339, "grad_norm": 5.395206928253174, "learning_rate": 8.433526011560694e-07, "logits/chosen": -2.9387965202331543, "logits/rejected": -3.049835205078125, "logps/chosen": -42.1044921875, "logps/rejected": -84.58711242675781, "loss": 0.5934, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12341994047164917, "rewards/margins": 3.923992156982422, "rewards/rejected": -3.800572156906128, "step": 543 }, { "epoch": 1.5691585409895268, "grad_norm": 4.7293596267700195, "learning_rate": 8.430635838150288e-07, "logits/chosen": -2.9716053009033203, "logits/rejected": -3.088226795196533, "logps/chosen": -31.58833885192871, "logps/rejected": -85.06055450439453, "loss": 0.4693, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4680695533752441, "rewards/margins": 4.839232444763184, "rewards/rejected": -3.3711626529693604, "step": 544 }, { "epoch": 1.57204767063922, "grad_norm": 6.35274076461792, "learning_rate": 8.427745664739884e-07, "logits/chosen": -2.935830593109131, "logits/rejected": -3.102137565612793, "logps/chosen": -33.814666748046875, "logps/rejected": -80.01080322265625, "loss": 0.4511, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2573742866516113, "rewards/margins": 4.420088291168213, "rewards/rejected": -3.1627142429351807, "step": 545 }, { "epoch": 1.574936800288913, "grad_norm": 4.890665054321289, "learning_rate": 8.424855491329479e-07, "logits/chosen": -2.9318933486938477, "logits/rejected": -3.0205044746398926, "logps/chosen": -43.458984375, "logps/rejected": -91.95165252685547, "loss": 0.5804, "rewards/accuracies": 0.96875, "rewards/chosen": 0.48831257224082947, "rewards/margins": 4.391110420227051, "rewards/rejected": -3.9027976989746094, "step": 546 }, { "epoch": 1.577825929938606, "grad_norm": 5.629630088806152, "learning_rate": 8.421965317919075e-07, "logits/chosen": -2.8844501972198486, "logits/rejected": -3.0895934104919434, "logps/chosen": -41.18265151977539, "logps/rejected": -83.54656219482422, "loss": 0.568, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8794753551483154, "rewards/margins": 4.3105549812316895, "rewards/rejected": -3.431079387664795, "step": 547 }, { "epoch": 1.580715059588299, "grad_norm": 5.166134357452393, "learning_rate": 8.41907514450867e-07, "logits/chosen": -2.805149555206299, "logits/rejected": -3.000971794128418, "logps/chosen": -35.81270217895508, "logps/rejected": -79.15937805175781, "loss": 0.4717, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3115148544311523, "rewards/margins": 4.201268672943115, "rewards/rejected": -2.889753580093384, "step": 548 }, { "epoch": 1.5836041892379922, "grad_norm": 6.499272346496582, "learning_rate": 8.416184971098266e-07, "logits/chosen": -2.865851879119873, "logits/rejected": -3.0028634071350098, "logps/chosen": -35.82699203491211, "logps/rejected": -74.9052734375, "loss": 0.4778, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1839150190353394, "rewards/margins": 3.9434428215026855, "rewards/rejected": -2.7595276832580566, "step": 549 }, { "epoch": 1.586493318887685, "grad_norm": 5.176476955413818, "learning_rate": 8.41329479768786e-07, "logits/chosen": -2.8420462608337402, "logits/rejected": -2.9911434650421143, "logps/chosen": -32.505828857421875, "logps/rejected": -83.21354675292969, "loss": 0.4602, "rewards/accuracies": 1.0, "rewards/chosen": 1.3156919479370117, "rewards/margins": 4.838355541229248, "rewards/rejected": -3.5226635932922363, "step": 550 }, { "epoch": 1.589382448537378, "grad_norm": 6.037923336029053, "learning_rate": 8.410404624277456e-07, "logits/chosen": -2.9421305656433105, "logits/rejected": -3.0649032592773438, "logps/chosen": -34.7432975769043, "logps/rejected": -73.50973510742188, "loss": 0.5554, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9480359554290771, "rewards/margins": 3.709869146347046, "rewards/rejected": -2.7618329524993896, "step": 551 }, { "epoch": 1.592271578187071, "grad_norm": 4.706053256988525, "learning_rate": 8.407514450867052e-07, "logits/chosen": -2.924790382385254, "logits/rejected": -3.108372688293457, "logps/chosen": -34.085166931152344, "logps/rejected": -75.39935302734375, "loss": 0.535, "rewards/accuracies": 1.0, "rewards/chosen": 1.3154267072677612, "rewards/margins": 4.0307512283325195, "rewards/rejected": -2.715324878692627, "step": 552 }, { "epoch": 1.5951607078367642, "grad_norm": 5.010000228881836, "learning_rate": 8.404624277456647e-07, "logits/chosen": -2.930959701538086, "logits/rejected": -3.0764501094818115, "logps/chosen": -34.17338180541992, "logps/rejected": -81.60545349121094, "loss": 0.5189, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1072590351104736, "rewards/margins": 4.612486839294434, "rewards/rejected": -3.505228042602539, "step": 553 }, { "epoch": 1.5980498374864571, "grad_norm": 6.299757480621338, "learning_rate": 8.401734104046242e-07, "logits/chosen": -2.9575366973876953, "logits/rejected": -3.0464847087860107, "logps/chosen": -41.43682098388672, "logps/rejected": -88.00402069091797, "loss": 0.6266, "rewards/accuracies": 0.96875, "rewards/chosen": 0.45792335271835327, "rewards/margins": 4.22069787979126, "rewards/rejected": -3.7627742290496826, "step": 554 }, { "epoch": 1.6009389671361502, "grad_norm": 5.336593151092529, "learning_rate": 8.398843930635838e-07, "logits/chosen": -2.9510536193847656, "logits/rejected": -3.058969736099243, "logps/chosen": -31.42938995361328, "logps/rejected": -80.74948120117188, "loss": 0.4125, "rewards/accuracies": 1.0, "rewards/chosen": 1.365181803703308, "rewards/margins": 4.47907829284668, "rewards/rejected": -3.113896608352661, "step": 555 }, { "epoch": 1.6038280967858434, "grad_norm": 5.504385471343994, "learning_rate": 8.395953757225434e-07, "logits/chosen": -2.898261547088623, "logits/rejected": -3.070767879486084, "logps/chosen": -36.51664733886719, "logps/rejected": -77.56098175048828, "loss": 0.522, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9455909729003906, "rewards/margins": 4.1319804191589355, "rewards/rejected": -3.186389684677124, "step": 556 }, { "epoch": 1.6067172264355363, "grad_norm": 4.69291353225708, "learning_rate": 8.393063583815028e-07, "logits/chosen": -2.867715358734131, "logits/rejected": -2.9781723022460938, "logps/chosen": -31.92168617248535, "logps/rejected": -74.9223403930664, "loss": 0.5027, "rewards/accuracies": 1.0, "rewards/chosen": 1.383091926574707, "rewards/margins": 4.119671821594238, "rewards/rejected": -2.7365801334381104, "step": 557 }, { "epoch": 1.6096063560852292, "grad_norm": 7.218979835510254, "learning_rate": 8.390173410404624e-07, "logits/chosen": -3.0434117317199707, "logits/rejected": -3.130735397338867, "logps/chosen": -34.85606384277344, "logps/rejected": -80.6331787109375, "loss": 0.4723, "rewards/accuracies": 0.96875, "rewards/chosen": 0.835258424282074, "rewards/margins": 4.217602252960205, "rewards/rejected": -3.3823437690734863, "step": 558 }, { "epoch": 1.6124954857349225, "grad_norm": 5.780701637268066, "learning_rate": 8.387283236994219e-07, "logits/chosen": -2.865917444229126, "logits/rejected": -3.0747296810150146, "logps/chosen": -27.188934326171875, "logps/rejected": -77.66658020019531, "loss": 0.4368, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8280422687530518, "rewards/margins": 4.785882472991943, "rewards/rejected": -2.95784068107605, "step": 559 }, { "epoch": 1.6153846153846154, "grad_norm": 5.266223430633545, "learning_rate": 8.384393063583815e-07, "logits/chosen": -2.9238054752349854, "logits/rejected": -3.0618038177490234, "logps/chosen": -33.50039291381836, "logps/rejected": -81.81497192382812, "loss": 0.4601, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9811907410621643, "rewards/margins": 4.396491050720215, "rewards/rejected": -3.4153003692626953, "step": 560 }, { "epoch": 1.6153846153846154, "eval_logits/chosen": -2.9868621826171875, "eval_logits/rejected": -3.131486654281616, "eval_logps/chosen": -38.45983123779297, "eval_logps/rejected": -86.50846099853516, "eval_loss": 0.5202704668045044, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.785658597946167, "eval_rewards/margins": 4.741487503051758, "eval_rewards/rejected": -3.9558286666870117, "eval_runtime": 224.6216, "eval_samples_per_second": 0.548, "eval_steps_per_second": 0.276, "step": 560 }, { "epoch": 1.6182737450343083, "grad_norm": 5.015097141265869, "learning_rate": 8.38150289017341e-07, "logits/chosen": -2.848797559738159, "logits/rejected": -2.9815118312835693, "logps/chosen": -40.42072677612305, "logps/rejected": -81.99705505371094, "loss": 0.5459, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4381978511810303, "rewards/margins": 3.9758737087249756, "rewards/rejected": -3.5376760959625244, "step": 561 }, { "epoch": 1.6211628746840014, "grad_norm": 5.343723773956299, "learning_rate": 8.378612716763005e-07, "logits/chosen": -2.9649672508239746, "logits/rejected": -3.117180347442627, "logps/chosen": -36.85393142700195, "logps/rejected": -82.58875274658203, "loss": 0.5904, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6899961233139038, "rewards/margins": 4.284644603729248, "rewards/rejected": -3.594648838043213, "step": 562 }, { "epoch": 1.6240520043336946, "grad_norm": 7.862930774688721, "learning_rate": 8.375722543352602e-07, "logits/chosen": -2.9994359016418457, "logits/rejected": -3.1050024032592773, "logps/chosen": -41.85871887207031, "logps/rejected": -87.94921875, "loss": 0.5771, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5170707106590271, "rewards/margins": 4.3917388916015625, "rewards/rejected": -3.8746681213378906, "step": 563 }, { "epoch": 1.6269411339833875, "grad_norm": 6.988171100616455, "learning_rate": 8.372832369942196e-07, "logits/chosen": -3.041736364364624, "logits/rejected": -3.1122612953186035, "logps/chosen": -33.502315521240234, "logps/rejected": -72.4446792602539, "loss": 0.4828, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2442210912704468, "rewards/margins": 3.7893927097320557, "rewards/rejected": -2.5451714992523193, "step": 564 }, { "epoch": 1.6298302636330806, "grad_norm": 5.7988715171813965, "learning_rate": 8.369942196531792e-07, "logits/chosen": -2.7909138202667236, "logits/rejected": -3.073004961013794, "logps/chosen": -31.946849822998047, "logps/rejected": -74.46458435058594, "loss": 0.5649, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1418075561523438, "rewards/margins": 4.01780366897583, "rewards/rejected": -2.8759963512420654, "step": 565 }, { "epoch": 1.6327193932827737, "grad_norm": 6.360963821411133, "learning_rate": 8.367052023121387e-07, "logits/chosen": -2.9669723510742188, "logits/rejected": -3.066706895828247, "logps/chosen": -31.72788429260254, "logps/rejected": -85.31500244140625, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 1.5607978105545044, "rewards/margins": 4.968331813812256, "rewards/rejected": -3.407534122467041, "step": 566 }, { "epoch": 1.6356085229324666, "grad_norm": 5.980303764343262, "learning_rate": 8.364161849710982e-07, "logits/chosen": -3.082207202911377, "logits/rejected": -3.245044231414795, "logps/chosen": -39.61912155151367, "logps/rejected": -81.638427734375, "loss": 0.5223, "rewards/accuracies": 0.875, "rewards/chosen": 0.9691084623336792, "rewards/margins": 4.424490451812744, "rewards/rejected": -3.4553818702697754, "step": 567 }, { "epoch": 1.6384976525821595, "grad_norm": 5.120550632476807, "learning_rate": 8.361271676300577e-07, "logits/chosen": -2.964932918548584, "logits/rejected": -2.978151798248291, "logps/chosen": -46.612579345703125, "logps/rejected": -93.0313949584961, "loss": 0.5881, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01841449737548828, "rewards/margins": 4.3015828132629395, "rewards/rejected": -4.283168792724609, "step": 568 }, { "epoch": 1.6413867822318526, "grad_norm": 7.267858982086182, "learning_rate": 8.358381502890173e-07, "logits/chosen": -2.9100470542907715, "logits/rejected": -3.074343204498291, "logps/chosen": -33.03060531616211, "logps/rejected": -76.11921691894531, "loss": 0.4862, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3751636743545532, "rewards/margins": 4.165558338165283, "rewards/rejected": -2.7903952598571777, "step": 569 }, { "epoch": 1.6442759118815458, "grad_norm": 6.538259506225586, "learning_rate": 8.355491329479768e-07, "logits/chosen": -2.89776349067688, "logits/rejected": -3.043333053588867, "logps/chosen": -31.282257080078125, "logps/rejected": -70.60037231445312, "loss": 0.5436, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3989460468292236, "rewards/margins": 3.8915445804595947, "rewards/rejected": -2.492598533630371, "step": 570 }, { "epoch": 1.6471650415312387, "grad_norm": 5.246339797973633, "learning_rate": 8.352601156069364e-07, "logits/chosen": -3.027642250061035, "logits/rejected": -3.1379289627075195, "logps/chosen": -39.47615432739258, "logps/rejected": -88.739501953125, "loss": 0.5673, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7930524349212646, "rewards/margins": 4.679956912994385, "rewards/rejected": -3.88690447807312, "step": 571 }, { "epoch": 1.6500541711809318, "grad_norm": 5.41150426864624, "learning_rate": 8.34971098265896e-07, "logits/chosen": -2.9506547451019287, "logits/rejected": -3.088325262069702, "logps/chosen": -42.17538070678711, "logps/rejected": -84.22631072998047, "loss": 0.5311, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23045772314071655, "rewards/margins": 4.000404357910156, "rewards/rejected": -3.769946336746216, "step": 572 }, { "epoch": 1.652943300830625, "grad_norm": 7.3377766609191895, "learning_rate": 8.346820809248555e-07, "logits/chosen": -2.9909958839416504, "logits/rejected": -3.0879576206207275, "logps/chosen": -35.229522705078125, "logps/rejected": -78.63722229003906, "loss": 0.4661, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4035279750823975, "rewards/margins": 4.431962966918945, "rewards/rejected": -3.028435230255127, "step": 573 }, { "epoch": 1.6558324304803178, "grad_norm": 6.610383033752441, "learning_rate": 8.34393063583815e-07, "logits/chosen": -2.8597888946533203, "logits/rejected": -3.0439767837524414, "logps/chosen": -29.713701248168945, "logps/rejected": -74.06674194335938, "loss": 0.4671, "rewards/accuracies": 0.96875, "rewards/chosen": 1.585608720779419, "rewards/margins": 4.239022731781006, "rewards/rejected": -2.653414011001587, "step": 574 }, { "epoch": 1.6587215601300107, "grad_norm": 6.850522518157959, "learning_rate": 8.341040462427745e-07, "logits/chosen": -2.9418246746063232, "logits/rejected": -3.1736035346984863, "logps/chosen": -46.42430114746094, "logps/rejected": -95.52348327636719, "loss": 0.5775, "rewards/accuracies": 1.0, "rewards/chosen": -0.10655544698238373, "rewards/margins": 4.7316179275512695, "rewards/rejected": -4.838172912597656, "step": 575 }, { "epoch": 1.6616106897797038, "grad_norm": 5.070699214935303, "learning_rate": 8.338150289017341e-07, "logits/chosen": -2.9737086296081543, "logits/rejected": -2.9941165447235107, "logps/chosen": -44.342254638671875, "logps/rejected": -86.61956024169922, "loss": 0.4962, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1542888730764389, "rewards/margins": 3.9809412956237793, "rewards/rejected": -3.8266522884368896, "step": 576 }, { "epoch": 1.664499819429397, "grad_norm": 6.139413356781006, "learning_rate": 8.335260115606936e-07, "logits/chosen": -2.9963550567626953, "logits/rejected": -3.1824474334716797, "logps/chosen": -34.852020263671875, "logps/rejected": -81.01537322998047, "loss": 0.5004, "rewards/accuracies": 1.0, "rewards/chosen": 1.0003440380096436, "rewards/margins": 4.41264533996582, "rewards/rejected": -3.4123010635375977, "step": 577 }, { "epoch": 1.6673889490790899, "grad_norm": 7.116971015930176, "learning_rate": 8.332369942196531e-07, "logits/chosen": -2.989762783050537, "logits/rejected": -3.124937057495117, "logps/chosen": -35.958343505859375, "logps/rejected": -82.71610260009766, "loss": 0.4972, "rewards/accuracies": 0.875, "rewards/chosen": 0.8308160305023193, "rewards/margins": 4.466343402862549, "rewards/rejected": -3.6355278491973877, "step": 578 }, { "epoch": 1.670278078728783, "grad_norm": 5.232962131500244, "learning_rate": 8.329479768786126e-07, "logits/chosen": -2.97214937210083, "logits/rejected": -3.109567880630493, "logps/chosen": -42.38941192626953, "logps/rejected": -84.15459442138672, "loss": 0.5853, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2784712612628937, "rewards/margins": 3.8916728496551514, "rewards/rejected": -3.613201379776001, "step": 579 }, { "epoch": 1.673167208378476, "grad_norm": 5.636703014373779, "learning_rate": 8.326589595375723e-07, "logits/chosen": -3.00995135307312, "logits/rejected": -3.1179099082946777, "logps/chosen": -29.080883026123047, "logps/rejected": -70.17930603027344, "loss": 0.5108, "rewards/accuracies": 1.0, "rewards/chosen": 1.445608139038086, "rewards/margins": 3.9392151832580566, "rewards/rejected": -2.4936070442199707, "step": 580 }, { "epoch": 1.676056338028169, "grad_norm": 5.478577613830566, "learning_rate": 8.323699421965317e-07, "logits/chosen": -2.9576809406280518, "logits/rejected": -3.0703396797180176, "logps/chosen": -34.364654541015625, "logps/rejected": -77.597900390625, "loss": 0.4362, "rewards/accuracies": 0.9375, "rewards/chosen": 1.072531819343567, "rewards/margins": 4.470047473907471, "rewards/rejected": -3.397515296936035, "step": 581 }, { "epoch": 1.678945467677862, "grad_norm": 6.835014343261719, "learning_rate": 8.320809248554913e-07, "logits/chosen": -3.0224063396453857, "logits/rejected": -3.134758710861206, "logps/chosen": -38.452056884765625, "logps/rejected": -77.1511001586914, "loss": 0.5826, "rewards/accuracies": 0.90625, "rewards/chosen": 0.762798547744751, "rewards/margins": 3.8000614643096924, "rewards/rejected": -3.037262439727783, "step": 582 }, { "epoch": 1.681834597327555, "grad_norm": 5.055516242980957, "learning_rate": 8.317919075144509e-07, "logits/chosen": -3.0052738189697266, "logits/rejected": -3.134793758392334, "logps/chosen": -39.232200622558594, "logps/rejected": -86.2968521118164, "loss": 0.5363, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5356695055961609, "rewards/margins": 4.361815929412842, "rewards/rejected": -3.826146364212036, "step": 583 }, { "epoch": 1.6847237269772481, "grad_norm": 7.185488224029541, "learning_rate": 8.315028901734103e-07, "logits/chosen": -2.9495081901550293, "logits/rejected": -3.185056447982788, "logps/chosen": -32.84742736816406, "logps/rejected": -77.11138153076172, "loss": 0.5484, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2948682308197021, "rewards/margins": 4.1156487464904785, "rewards/rejected": -2.8207807540893555, "step": 584 }, { "epoch": 1.687612856626941, "grad_norm": 4.905453205108643, "learning_rate": 8.312138728323699e-07, "logits/chosen": -2.9914374351501465, "logits/rejected": -3.119941234588623, "logps/chosen": -43.071067810058594, "logps/rejected": -94.53150939941406, "loss": 0.5082, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2592986226081848, "rewards/margins": 4.863262176513672, "rewards/rejected": -4.603963851928711, "step": 585 }, { "epoch": 1.6905019862766342, "grad_norm": 5.066962242126465, "learning_rate": 8.309248554913294e-07, "logits/chosen": -2.905642032623291, "logits/rejected": -3.0411996841430664, "logps/chosen": -35.0206298828125, "logps/rejected": -79.23263549804688, "loss": 0.5244, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3091154098510742, "rewards/margins": 4.265486240386963, "rewards/rejected": -2.9563710689544678, "step": 586 }, { "epoch": 1.6933911159263273, "grad_norm": 5.185830593109131, "learning_rate": 8.30635838150289e-07, "logits/chosen": -2.949571132659912, "logits/rejected": -3.0806822776794434, "logps/chosen": -38.26598358154297, "logps/rejected": -85.5084228515625, "loss": 0.5189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7123066186904907, "rewards/margins": 4.6239142417907715, "rewards/rejected": -3.9116079807281494, "step": 587 }, { "epoch": 1.6962802455760202, "grad_norm": 6.41165828704834, "learning_rate": 8.303468208092485e-07, "logits/chosen": -2.7958641052246094, "logits/rejected": -2.960505723953247, "logps/chosen": -35.9156494140625, "logps/rejected": -92.65199279785156, "loss": 0.4879, "rewards/accuracies": 0.9375, "rewards/chosen": 0.680739164352417, "rewards/margins": 5.082223415374756, "rewards/rejected": -4.401483535766602, "step": 588 }, { "epoch": 1.699169375225713, "grad_norm": 5.092557430267334, "learning_rate": 8.300578034682081e-07, "logits/chosen": -3.0195322036743164, "logits/rejected": -3.1165761947631836, "logps/chosen": -33.837066650390625, "logps/rejected": -92.6607437133789, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 0.9760910272598267, "rewards/margins": 5.469136714935303, "rewards/rejected": -4.493045806884766, "step": 589 }, { "epoch": 1.7020585048754064, "grad_norm": 5.078556537628174, "learning_rate": 8.297687861271676e-07, "logits/chosen": -2.980234146118164, "logits/rejected": -3.121985912322998, "logps/chosen": -35.390830993652344, "logps/rejected": -82.63925170898438, "loss": 0.4479, "rewards/accuracies": 0.96875, "rewards/chosen": 1.475090742111206, "rewards/margins": 4.804927349090576, "rewards/rejected": -3.329836845397949, "step": 590 }, { "epoch": 1.7049476345250993, "grad_norm": 6.147855758666992, "learning_rate": 8.294797687861271e-07, "logits/chosen": -3.0156197547912598, "logits/rejected": -3.144033432006836, "logps/chosen": -31.486042022705078, "logps/rejected": -69.15717315673828, "loss": 0.608, "rewards/accuracies": 0.96875, "rewards/chosen": 1.265994906425476, "rewards/margins": 3.5014071464538574, "rewards/rejected": -2.235412120819092, "step": 591 }, { "epoch": 1.7078367641747922, "grad_norm": 4.901125431060791, "learning_rate": 8.291907514450866e-07, "logits/chosen": -2.9586093425750732, "logits/rejected": -3.170210599899292, "logps/chosen": -32.87324142456055, "logps/rejected": -88.63568115234375, "loss": 0.485, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3067972660064697, "rewards/margins": 5.343923568725586, "rewards/rejected": -4.037126064300537, "step": 592 }, { "epoch": 1.7107258938244854, "grad_norm": 5.397507190704346, "learning_rate": 8.289017341040462e-07, "logits/chosen": -3.041165351867676, "logits/rejected": -3.127078056335449, "logps/chosen": -45.413509368896484, "logps/rejected": -91.99909973144531, "loss": 0.5861, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22452908754348755, "rewards/margins": 4.564725875854492, "rewards/rejected": -4.3401970863342285, "step": 593 }, { "epoch": 1.7136150234741785, "grad_norm": 5.514498233795166, "learning_rate": 8.286127167630058e-07, "logits/chosen": -3.0359997749328613, "logits/rejected": -3.1049530506134033, "logps/chosen": -40.390586853027344, "logps/rejected": -89.9292221069336, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 0.5619018077850342, "rewards/margins": 4.734234809875488, "rewards/rejected": -4.172333717346191, "step": 594 }, { "epoch": 1.7165041531238714, "grad_norm": 6.767085075378418, "learning_rate": 8.283236994219652e-07, "logits/chosen": -2.9729831218719482, "logits/rejected": -3.189377784729004, "logps/chosen": -41.00813293457031, "logps/rejected": -91.06271362304688, "loss": 0.465, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9085961580276489, "rewards/margins": 5.214243412017822, "rewards/rejected": -4.305647373199463, "step": 595 }, { "epoch": 1.7193932827735645, "grad_norm": 5.297351837158203, "learning_rate": 8.280346820809249e-07, "logits/chosen": -2.9819047451019287, "logits/rejected": -3.156393527984619, "logps/chosen": -31.694828033447266, "logps/rejected": -81.03192138671875, "loss": 0.4232, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3530659675598145, "rewards/margins": 4.6907734870910645, "rewards/rejected": -3.33770751953125, "step": 596 }, { "epoch": 1.7222824124232576, "grad_norm": 5.542720794677734, "learning_rate": 8.277456647398844e-07, "logits/chosen": -3.0575523376464844, "logits/rejected": -3.1281609535217285, "logps/chosen": -33.33456039428711, "logps/rejected": -80.12397766113281, "loss": 0.4407, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2749147415161133, "rewards/margins": 4.471574783325195, "rewards/rejected": -3.196659803390503, "step": 597 }, { "epoch": 1.7251715420729505, "grad_norm": 8.764702796936035, "learning_rate": 8.274566473988439e-07, "logits/chosen": -2.967693567276001, "logits/rejected": -3.1423354148864746, "logps/chosen": -39.32445526123047, "logps/rejected": -94.50997924804688, "loss": 0.481, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6185977458953857, "rewards/margins": 5.279813766479492, "rewards/rejected": -4.6612162590026855, "step": 598 }, { "epoch": 1.7280606717226434, "grad_norm": 5.60723352432251, "learning_rate": 8.271676300578034e-07, "logits/chosen": -2.951319694519043, "logits/rejected": -3.189392328262329, "logps/chosen": -31.224205017089844, "logps/rejected": -84.28052520751953, "loss": 0.4324, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6058545112609863, "rewards/margins": 5.4491753578186035, "rewards/rejected": -3.843320846557617, "step": 599 }, { "epoch": 1.7309498013723366, "grad_norm": 6.410114765167236, "learning_rate": 8.26878612716763e-07, "logits/chosen": -3.0103259086608887, "logits/rejected": -3.0790507793426514, "logps/chosen": -40.070621490478516, "logps/rejected": -92.4290771484375, "loss": 0.5934, "rewards/accuracies": 1.0, "rewards/chosen": 0.4487622380256653, "rewards/margins": 4.720920562744141, "rewards/rejected": -4.272158145904541, "step": 600 }, { "epoch": 1.7338389310220297, "grad_norm": 5.635537147521973, "learning_rate": 8.265895953757224e-07, "logits/chosen": -2.958164691925049, "logits/rejected": -3.0150861740112305, "logps/chosen": -36.900909423828125, "logps/rejected": -84.7981948852539, "loss": 0.4986, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0187759399414062, "rewards/margins": 4.401679039001465, "rewards/rejected": -3.3829030990600586, "step": 601 }, { "epoch": 1.7367280606717226, "grad_norm": 4.21838903427124, "learning_rate": 8.26300578034682e-07, "logits/chosen": -3.0078134536743164, "logits/rejected": -3.096529483795166, "logps/chosen": -43.893863677978516, "logps/rejected": -85.03565216064453, "loss": 0.5358, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4510440230369568, "rewards/margins": 4.009797096252441, "rewards/rejected": -3.5587527751922607, "step": 602 }, { "epoch": 1.7396171903214157, "grad_norm": 4.520734786987305, "learning_rate": 8.260115606936415e-07, "logits/chosen": -2.995366096496582, "logits/rejected": -3.1591992378234863, "logps/chosen": -34.37445831298828, "logps/rejected": -87.23716735839844, "loss": 0.3545, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4159760475158691, "rewards/margins": 5.367801666259766, "rewards/rejected": -3.9518256187438965, "step": 603 }, { "epoch": 1.7425063199711088, "grad_norm": 5.321105003356934, "learning_rate": 8.257225433526012e-07, "logits/chosen": -2.866715669631958, "logits/rejected": -3.0572705268859863, "logps/chosen": -36.399070739746094, "logps/rejected": -89.33610534667969, "loss": 0.5002, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7761214375495911, "rewards/margins": 4.9723310470581055, "rewards/rejected": -4.196209907531738, "step": 604 }, { "epoch": 1.7453954496208017, "grad_norm": 5.329153060913086, "learning_rate": 8.254335260115607e-07, "logits/chosen": -3.0398216247558594, "logits/rejected": -3.149423122406006, "logps/chosen": -35.264495849609375, "logps/rejected": -88.58855438232422, "loss": 0.5268, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8344831466674805, "rewards/margins": 4.864467620849609, "rewards/rejected": -4.029984474182129, "step": 605 }, { "epoch": 1.7482845792704946, "grad_norm": 8.38354778289795, "learning_rate": 8.251445086705202e-07, "logits/chosen": -2.8852293491363525, "logits/rejected": -3.077263593673706, "logps/chosen": -27.246414184570312, "logps/rejected": -68.86322784423828, "loss": 0.4416, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9532887935638428, "rewards/margins": 4.057790279388428, "rewards/rejected": -2.1045007705688477, "step": 606 }, { "epoch": 1.7511737089201878, "grad_norm": 7.388906002044678, "learning_rate": 8.248554913294798e-07, "logits/chosen": -2.996931791305542, "logits/rejected": -3.1075656414031982, "logps/chosen": -30.850065231323242, "logps/rejected": -79.14715576171875, "loss": 0.5173, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3924458026885986, "rewards/margins": 4.654299259185791, "rewards/rejected": -3.2618532180786133, "step": 607 }, { "epoch": 1.7540628385698809, "grad_norm": 5.448512077331543, "learning_rate": 8.245664739884392e-07, "logits/chosen": -2.9890825748443604, "logits/rejected": -3.0406441688537598, "logps/chosen": -38.467132568359375, "logps/rejected": -84.12008666992188, "loss": 0.6021, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8451201915740967, "rewards/margins": 4.338707447052002, "rewards/rejected": -3.4935874938964844, "step": 608 }, { "epoch": 1.7569519682195738, "grad_norm": 4.288078308105469, "learning_rate": 8.242774566473988e-07, "logits/chosen": -3.013829469680786, "logits/rejected": -3.105149745941162, "logps/chosen": -33.551944732666016, "logps/rejected": -86.89997863769531, "loss": 0.4609, "rewards/accuracies": 1.0, "rewards/chosen": 1.1601448059082031, "rewards/margins": 4.819759845733643, "rewards/rejected": -3.6596150398254395, "step": 609 }, { "epoch": 1.759841097869267, "grad_norm": 5.873926639556885, "learning_rate": 8.239884393063583e-07, "logits/chosen": -3.0066678524017334, "logits/rejected": -3.210447072982788, "logps/chosen": -27.470674514770508, "logps/rejected": -80.65792846679688, "loss": 0.4385, "rewards/accuracies": 0.96875, "rewards/chosen": 1.857364296913147, "rewards/margins": 5.155390739440918, "rewards/rejected": -3.2980260848999023, "step": 610 }, { "epoch": 1.76273022751896, "grad_norm": 4.199736595153809, "learning_rate": 8.236994219653179e-07, "logits/chosen": -2.9392318725585938, "logits/rejected": -3.226116180419922, "logps/chosen": -29.6333065032959, "logps/rejected": -86.395263671875, "loss": 0.4608, "rewards/accuracies": 1.0, "rewards/chosen": 1.5211005210876465, "rewards/margins": 5.1521124839782715, "rewards/rejected": -3.631011962890625, "step": 611 }, { "epoch": 1.765619357168653, "grad_norm": 5.971253395080566, "learning_rate": 8.234104046242774e-07, "logits/chosen": -2.93794584274292, "logits/rejected": -3.1527504920959473, "logps/chosen": -49.6580924987793, "logps/rejected": -91.31189727783203, "loss": 0.5846, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20560336112976074, "rewards/margins": 4.062130928039551, "rewards/rejected": -4.267734527587891, "step": 612 }, { "epoch": 1.7685084868183458, "grad_norm": 6.703039169311523, "learning_rate": 8.23121387283237e-07, "logits/chosen": -2.7842562198638916, "logits/rejected": -2.9395182132720947, "logps/chosen": -42.918460845947266, "logps/rejected": -78.89824676513672, "loss": 0.6042, "rewards/accuracies": 0.9375, "rewards/chosen": 0.380743145942688, "rewards/margins": 3.235267400741577, "rewards/rejected": -2.8545241355895996, "step": 613 }, { "epoch": 1.771397616468039, "grad_norm": 7.292159080505371, "learning_rate": 8.228323699421966e-07, "logits/chosen": -2.951747179031372, "logits/rejected": -3.164541244506836, "logps/chosen": -34.04856872558594, "logps/rejected": -77.33439636230469, "loss": 0.5879, "rewards/accuracies": 1.0, "rewards/chosen": 0.9253284931182861, "rewards/margins": 3.886298894882202, "rewards/rejected": -2.960970401763916, "step": 614 }, { "epoch": 1.774286746117732, "grad_norm": 5.35520601272583, "learning_rate": 8.22543352601156e-07, "logits/chosen": -2.9530839920043945, "logits/rejected": -3.209146022796631, "logps/chosen": -28.478330612182617, "logps/rejected": -77.42288970947266, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 1.5920071601867676, "rewards/margins": 4.769176483154297, "rewards/rejected": -3.1771695613861084, "step": 615 }, { "epoch": 1.777175875767425, "grad_norm": 7.353893756866455, "learning_rate": 8.222543352601156e-07, "logits/chosen": -2.9176669120788574, "logits/rejected": -3.0474905967712402, "logps/chosen": -27.85068702697754, "logps/rejected": -73.91122436523438, "loss": 0.5165, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5820434093475342, "rewards/margins": 4.023802757263184, "rewards/rejected": -2.4417593479156494, "step": 616 }, { "epoch": 1.780065005417118, "grad_norm": 4.3310546875, "learning_rate": 8.219653179190751e-07, "logits/chosen": -3.040290355682373, "logits/rejected": -3.1785295009613037, "logps/chosen": -35.24732971191406, "logps/rejected": -78.5286865234375, "loss": 0.573, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3377944231033325, "rewards/margins": 4.30049991607666, "rewards/rejected": -2.962705612182617, "step": 617 }, { "epoch": 1.7829541350668112, "grad_norm": 5.655146598815918, "learning_rate": 8.216763005780346e-07, "logits/chosen": -2.8849306106567383, "logits/rejected": -3.022913932800293, "logps/chosen": -37.44658279418945, "logps/rejected": -84.71302795410156, "loss": 0.4854, "rewards/accuracies": 0.96875, "rewards/chosen": 1.131503701210022, "rewards/margins": 4.502411365509033, "rewards/rejected": -3.3709075450897217, "step": 618 }, { "epoch": 1.7858432647165041, "grad_norm": 6.402068614959717, "learning_rate": 8.213872832369941e-07, "logits/chosen": -2.862344980239868, "logits/rejected": -2.976181983947754, "logps/chosen": -31.61269760131836, "logps/rejected": -68.46392059326172, "loss": 0.5856, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3571683168411255, "rewards/margins": 3.511690139770508, "rewards/rejected": -2.1545217037200928, "step": 619 }, { "epoch": 1.788732394366197, "grad_norm": 7.82240629196167, "learning_rate": 8.210982658959538e-07, "logits/chosen": -2.9667751789093018, "logits/rejected": -3.173785448074341, "logps/chosen": -33.918575286865234, "logps/rejected": -77.2135009765625, "loss": 0.4578, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3693773746490479, "rewards/margins": 4.301879405975342, "rewards/rejected": -2.9325027465820312, "step": 620 }, { "epoch": 1.7916215240158904, "grad_norm": 6.50620698928833, "learning_rate": 8.208092485549133e-07, "logits/chosen": -2.8809447288513184, "logits/rejected": -3.102128505706787, "logps/chosen": -33.08806610107422, "logps/rejected": -81.19566345214844, "loss": 0.5327, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4030952453613281, "rewards/margins": 4.521734714508057, "rewards/rejected": -3.1186392307281494, "step": 621 }, { "epoch": 1.7945106536655833, "grad_norm": 5.395105838775635, "learning_rate": 8.205202312138728e-07, "logits/chosen": -2.9029197692871094, "logits/rejected": -3.051201343536377, "logps/chosen": -33.21516036987305, "logps/rejected": -80.169677734375, "loss": 0.4603, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3055156469345093, "rewards/margins": 4.412240982055664, "rewards/rejected": -3.1067254543304443, "step": 622 }, { "epoch": 1.7973997833152762, "grad_norm": 6.106100559234619, "learning_rate": 8.202312138728323e-07, "logits/chosen": -3.03208065032959, "logits/rejected": -3.1228785514831543, "logps/chosen": -36.03227996826172, "logps/rejected": -86.21102905273438, "loss": 0.5303, "rewards/accuracies": 0.875, "rewards/chosen": 0.9085787534713745, "rewards/margins": 4.704180717468262, "rewards/rejected": -3.7956013679504395, "step": 623 }, { "epoch": 1.8002889129649693, "grad_norm": 6.749969005584717, "learning_rate": 8.199421965317919e-07, "logits/chosen": -2.977936267852783, "logits/rejected": -3.0554261207580566, "logps/chosen": -33.103492736816406, "logps/rejected": -80.23279571533203, "loss": 0.5023, "rewards/accuracies": 0.9375, "rewards/chosen": 1.266503930091858, "rewards/margins": 4.546993255615234, "rewards/rejected": -3.280489206314087, "step": 624 }, { "epoch": 1.8031780426146624, "grad_norm": 4.979244232177734, "learning_rate": 8.196531791907513e-07, "logits/chosen": -2.956930637359619, "logits/rejected": -3.0376977920532227, "logps/chosen": -34.25605010986328, "logps/rejected": -80.17422485351562, "loss": 0.5232, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8778418898582458, "rewards/margins": 4.235307693481445, "rewards/rejected": -3.357465982437134, "step": 625 }, { "epoch": 1.8060671722643553, "grad_norm": 5.4333815574646, "learning_rate": 8.193641618497109e-07, "logits/chosen": -3.057795524597168, "logits/rejected": -3.2130002975463867, "logps/chosen": -39.14981460571289, "logps/rejected": -91.9941177368164, "loss": 0.5312, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6686781048774719, "rewards/margins": 5.057835578918457, "rewards/rejected": -4.389157295227051, "step": 626 }, { "epoch": 1.8089563019140484, "grad_norm": 5.118998050689697, "learning_rate": 8.190751445086705e-07, "logits/chosen": -2.937938690185547, "logits/rejected": -3.1209933757781982, "logps/chosen": -34.45170211791992, "logps/rejected": -88.7099838256836, "loss": 0.4896, "rewards/accuracies": 1.0, "rewards/chosen": 1.3449147939682007, "rewards/margins": 5.429948329925537, "rewards/rejected": -4.085033416748047, "step": 627 }, { "epoch": 1.8118454315637416, "grad_norm": 6.071027755737305, "learning_rate": 8.187861271676301e-07, "logits/chosen": -2.8995392322540283, "logits/rejected": -3.101074457168579, "logps/chosen": -31.1778621673584, "logps/rejected": -82.02810668945312, "loss": 0.4682, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5304964780807495, "rewards/margins": 4.849705219268799, "rewards/rejected": -3.3192079067230225, "step": 628 }, { "epoch": 1.8147345612134345, "grad_norm": 6.129658222198486, "learning_rate": 8.184971098265896e-07, "logits/chosen": -3.0050809383392334, "logits/rejected": -3.1968491077423096, "logps/chosen": -30.640483856201172, "logps/rejected": -91.49943542480469, "loss": 0.4546, "rewards/accuracies": 1.0, "rewards/chosen": 1.3003723621368408, "rewards/margins": 5.500196933746338, "rewards/rejected": -4.199824333190918, "step": 629 }, { "epoch": 1.8176236908631274, "grad_norm": 5.403126239776611, "learning_rate": 8.182080924855491e-07, "logits/chosen": -2.9639155864715576, "logits/rejected": -3.115312337875366, "logps/chosen": -35.48090744018555, "logps/rejected": -82.41761016845703, "loss": 0.5479, "rewards/accuracies": 1.0, "rewards/chosen": 0.9364205002784729, "rewards/margins": 4.325657367706299, "rewards/rejected": -3.3892364501953125, "step": 630 }, { "epoch": 1.8176236908631274, "eval_logits/chosen": -3.036954164505005, "eval_logits/rejected": -3.1882131099700928, "eval_logps/chosen": -38.51020431518555, "eval_logps/rejected": -88.78728485107422, "eval_loss": 0.5168507099151611, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.7806212902069092, "eval_rewards/margins": 4.964332103729248, "eval_rewards/rejected": -4.183711528778076, "eval_runtime": 222.9117, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.278, "step": 630 }, { "epoch": 1.8205128205128205, "grad_norm": 4.348379135131836, "learning_rate": 8.179190751445087e-07, "logits/chosen": -2.9608139991760254, "logits/rejected": -3.1232638359069824, "logps/chosen": -37.974769592285156, "logps/rejected": -85.86041259765625, "loss": 0.54, "rewards/accuracies": 0.96875, "rewards/chosen": 0.759053647518158, "rewards/margins": 4.631721496582031, "rewards/rejected": -3.8726675510406494, "step": 631 }, { "epoch": 1.8234019501625136, "grad_norm": 5.252662658691406, "learning_rate": 8.176300578034681e-07, "logits/chosen": -2.93670392036438, "logits/rejected": -3.0905940532684326, "logps/chosen": -36.263919830322266, "logps/rejected": -82.09907531738281, "loss": 0.5091, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8741349577903748, "rewards/margins": 4.432958126068115, "rewards/rejected": -3.5588231086730957, "step": 632 }, { "epoch": 1.8262910798122065, "grad_norm": 5.664395332336426, "learning_rate": 8.173410404624277e-07, "logits/chosen": -2.9527831077575684, "logits/rejected": -3.1055383682250977, "logps/chosen": -33.3294563293457, "logps/rejected": -80.28186798095703, "loss": 0.5162, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2627534866333008, "rewards/margins": 4.462331295013428, "rewards/rejected": -3.199578285217285, "step": 633 }, { "epoch": 1.8291802094618996, "grad_norm": 6.268341541290283, "learning_rate": 8.170520231213872e-07, "logits/chosen": -2.9692416191101074, "logits/rejected": -3.1805496215820312, "logps/chosen": -31.205528259277344, "logps/rejected": -81.1873550415039, "loss": 0.4842, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4686051607131958, "rewards/margins": 4.805546760559082, "rewards/rejected": -3.336941719055176, "step": 634 }, { "epoch": 1.8320693391115928, "grad_norm": 6.547710418701172, "learning_rate": 8.167630057803467e-07, "logits/chosen": -2.935215950012207, "logits/rejected": -3.017056703567505, "logps/chosen": -28.544275283813477, "logps/rejected": -77.68939971923828, "loss": 0.5023, "rewards/accuracies": 1.0, "rewards/chosen": 1.7420316934585571, "rewards/margins": 4.852575302124023, "rewards/rejected": -3.1105432510375977, "step": 635 }, { "epoch": 1.8349584687612857, "grad_norm": 7.55168342590332, "learning_rate": 8.164739884393064e-07, "logits/chosen": -2.9872164726257324, "logits/rejected": -3.1176326274871826, "logps/chosen": -34.14030456542969, "logps/rejected": -86.88125610351562, "loss": 0.462, "rewards/accuracies": 0.96875, "rewards/chosen": 1.164391040802002, "rewards/margins": 4.984152317047119, "rewards/rejected": -3.819761037826538, "step": 636 }, { "epoch": 1.8378475984109786, "grad_norm": 6.944372177124023, "learning_rate": 8.161849710982659e-07, "logits/chosen": -2.9416067600250244, "logits/rejected": -3.0287389755249023, "logps/chosen": -35.794952392578125, "logps/rejected": -81.40647888183594, "loss": 0.5187, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0683623552322388, "rewards/margins": 4.603111267089844, "rewards/rejected": -3.5347490310668945, "step": 637 }, { "epoch": 1.8407367280606717, "grad_norm": 6.344137191772461, "learning_rate": 8.158959537572255e-07, "logits/chosen": -2.9847874641418457, "logits/rejected": -3.190206527709961, "logps/chosen": -39.635066986083984, "logps/rejected": -86.98387908935547, "loss": 0.5307, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6575032472610474, "rewards/margins": 4.494219779968262, "rewards/rejected": -3.8367161750793457, "step": 638 }, { "epoch": 1.8436258577103648, "grad_norm": 5.914822101593018, "learning_rate": 8.156069364161849e-07, "logits/chosen": -2.9563608169555664, "logits/rejected": -3.189479351043701, "logps/chosen": -26.935317993164062, "logps/rejected": -79.61907958984375, "loss": 0.4346, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5560582876205444, "rewards/margins": 4.908036231994629, "rewards/rejected": -3.351977825164795, "step": 639 }, { "epoch": 1.8465149873600577, "grad_norm": 6.618429660797119, "learning_rate": 8.153179190751445e-07, "logits/chosen": -2.9810256958007812, "logits/rejected": -3.1779110431671143, "logps/chosen": -30.39217758178711, "logps/rejected": -83.99485778808594, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 1.6573755741119385, "rewards/margins": 5.106306076049805, "rewards/rejected": -3.448930025100708, "step": 640 }, { "epoch": 1.8494041170097508, "grad_norm": 6.389671802520752, "learning_rate": 8.15028901734104e-07, "logits/chosen": -2.97924542427063, "logits/rejected": -3.168506145477295, "logps/chosen": -37.71305847167969, "logps/rejected": -96.78425598144531, "loss": 0.5439, "rewards/accuracies": 1.0, "rewards/chosen": 0.7369674444198608, "rewards/margins": 5.536897659301758, "rewards/rejected": -4.799930572509766, "step": 641 }, { "epoch": 1.852293246659444, "grad_norm": 5.501306056976318, "learning_rate": 8.147398843930635e-07, "logits/chosen": -2.9978761672973633, "logits/rejected": -3.1400277614593506, "logps/chosen": -38.53581237792969, "logps/rejected": -86.70414733886719, "loss": 0.5188, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7296884059906006, "rewards/margins": 4.404979228973389, "rewards/rejected": -3.675291061401367, "step": 642 }, { "epoch": 1.8551823763091368, "grad_norm": 4.890214920043945, "learning_rate": 8.14450867052023e-07, "logits/chosen": -2.9729650020599365, "logits/rejected": -3.0984251499176025, "logps/chosen": -46.45745086669922, "logps/rejected": -82.0829849243164, "loss": 0.6671, "rewards/accuracies": 0.90625, "rewards/chosen": -0.039590418338775635, "rewards/margins": 3.3553857803344727, "rewards/rejected": -3.3949761390686035, "step": 643 }, { "epoch": 1.8580715059588297, "grad_norm": 5.153567314147949, "learning_rate": 8.141618497109827e-07, "logits/chosen": -2.9915783405303955, "logits/rejected": -3.1685447692871094, "logps/chosen": -38.19207763671875, "logps/rejected": -80.9034652709961, "loss": 0.602, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7549100518226624, "rewards/margins": 3.925792932510376, "rewards/rejected": -3.1708827018737793, "step": 644 }, { "epoch": 1.860960635608523, "grad_norm": 5.6725263595581055, "learning_rate": 8.138728323699422e-07, "logits/chosen": -3.015535354614258, "logits/rejected": -3.144338369369507, "logps/chosen": -37.03944778442383, "logps/rejected": -91.535400390625, "loss": 0.4659, "rewards/accuracies": 0.96875, "rewards/chosen": 0.800851583480835, "rewards/margins": 5.255871772766113, "rewards/rejected": -4.455020427703857, "step": 645 }, { "epoch": 1.863849765258216, "grad_norm": 6.042547702789307, "learning_rate": 8.135838150289017e-07, "logits/chosen": -2.9429471492767334, "logits/rejected": -3.180408477783203, "logps/chosen": -35.46075439453125, "logps/rejected": -73.03401947021484, "loss": 0.5136, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1098581552505493, "rewards/margins": 3.5422327518463135, "rewards/rejected": -2.4323747158050537, "step": 646 }, { "epoch": 1.866738894907909, "grad_norm": 6.974422454833984, "learning_rate": 8.132947976878613e-07, "logits/chosen": -2.9918265342712402, "logits/rejected": -3.0580527782440186, "logps/chosen": -39.71244430541992, "logps/rejected": -82.4761962890625, "loss": 0.512, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6374382376670837, "rewards/margins": 4.32309627532959, "rewards/rejected": -3.6856582164764404, "step": 647 }, { "epoch": 1.869628024557602, "grad_norm": 7.992709159851074, "learning_rate": 8.130057803468208e-07, "logits/chosen": -3.021246910095215, "logits/rejected": -3.186558246612549, "logps/chosen": -36.09872055053711, "logps/rejected": -89.31275939941406, "loss": 0.3975, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9173091053962708, "rewards/margins": 5.35239315032959, "rewards/rejected": -4.435084342956543, "step": 648 }, { "epoch": 1.8725171542072951, "grad_norm": 11.040325164794922, "learning_rate": 8.127167630057803e-07, "logits/chosen": -2.98490571975708, "logits/rejected": -3.0551106929779053, "logps/chosen": -36.74375915527344, "logps/rejected": -80.60696411132812, "loss": 0.5155, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8388140797615051, "rewards/margins": 4.606074810028076, "rewards/rejected": -3.7672600746154785, "step": 649 }, { "epoch": 1.875406283856988, "grad_norm": 7.445559024810791, "learning_rate": 8.124277456647398e-07, "logits/chosen": -3.026155471801758, "logits/rejected": -3.163059949874878, "logps/chosen": -36.61015319824219, "logps/rejected": -97.72160339355469, "loss": 0.4768, "rewards/accuracies": 1.0, "rewards/chosen": 0.8992023468017578, "rewards/margins": 5.7196125984191895, "rewards/rejected": -4.82041072845459, "step": 650 }, { "epoch": 1.8782954135066812, "grad_norm": 6.131134986877441, "learning_rate": 8.121387283236994e-07, "logits/chosen": -2.929870843887329, "logits/rejected": -3.118533134460449, "logps/chosen": -40.61466598510742, "logps/rejected": -88.40288543701172, "loss": 0.5814, "rewards/accuracies": 0.96875, "rewards/chosen": 0.44728556275367737, "rewards/margins": 4.4659600257873535, "rewards/rejected": -4.018674850463867, "step": 651 }, { "epoch": 1.8811845431563743, "grad_norm": 5.380061626434326, "learning_rate": 8.118497109826589e-07, "logits/chosen": -2.902421474456787, "logits/rejected": -3.0673317909240723, "logps/chosen": -33.022193908691406, "logps/rejected": -84.75384521484375, "loss": 0.5596, "rewards/accuracies": 1.0, "rewards/chosen": 1.059475064277649, "rewards/margins": 4.67938232421875, "rewards/rejected": -3.6199071407318115, "step": 652 }, { "epoch": 1.8840736728060672, "grad_norm": 5.939985275268555, "learning_rate": 8.115606936416185e-07, "logits/chosen": -2.9751510620117188, "logits/rejected": -3.0657331943511963, "logps/chosen": -40.1938591003418, "logps/rejected": -76.76475524902344, "loss": 0.6334, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6060484647750854, "rewards/margins": 3.510798692703247, "rewards/rejected": -2.904750347137451, "step": 653 }, { "epoch": 1.88696280245576, "grad_norm": 5.649683952331543, "learning_rate": 8.11271676300578e-07, "logits/chosen": -2.899231433868408, "logits/rejected": -3.0559122562408447, "logps/chosen": -53.277191162109375, "logps/rejected": -95.1552505493164, "loss": 0.6299, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6029395461082458, "rewards/margins": 3.958367109298706, "rewards/rejected": -4.561305999755859, "step": 654 }, { "epoch": 1.8898519321054532, "grad_norm": 5.0117998123168945, "learning_rate": 8.109826589595376e-07, "logits/chosen": -3.023536205291748, "logits/rejected": -3.1634793281555176, "logps/chosen": -41.6141357421875, "logps/rejected": -91.44081115722656, "loss": 0.5401, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1351771056652069, "rewards/margins": 4.747908115386963, "rewards/rejected": -4.61273193359375, "step": 655 }, { "epoch": 1.8927410617551463, "grad_norm": 6.480145454406738, "learning_rate": 8.10693641618497e-07, "logits/chosen": -3.0236170291900635, "logits/rejected": -3.109318256378174, "logps/chosen": -40.33126449584961, "logps/rejected": -88.38697052001953, "loss": 0.5079, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4476148784160614, "rewards/margins": 4.3831682205200195, "rewards/rejected": -3.935553550720215, "step": 656 }, { "epoch": 1.8956301914048392, "grad_norm": 6.521762847900391, "learning_rate": 8.104046242774566e-07, "logits/chosen": -2.98250412940979, "logits/rejected": -3.114968776702881, "logps/chosen": -31.66532325744629, "logps/rejected": -80.02324676513672, "loss": 0.5361, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5886961221694946, "rewards/margins": 4.668105602264404, "rewards/rejected": -3.07940936088562, "step": 657 }, { "epoch": 1.8985193210545324, "grad_norm": 4.334444999694824, "learning_rate": 8.101156069364162e-07, "logits/chosen": -3.078040361404419, "logits/rejected": -3.1394002437591553, "logps/chosen": -47.70879364013672, "logps/rejected": -95.84917449951172, "loss": 0.5484, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0016074702143669128, "rewards/margins": 4.845538139343262, "rewards/rejected": -4.8471455574035645, "step": 658 }, { "epoch": 1.9014084507042255, "grad_norm": 6.501220226287842, "learning_rate": 8.098265895953756e-07, "logits/chosen": -2.9393444061279297, "logits/rejected": -3.0893611907958984, "logps/chosen": -37.064666748046875, "logps/rejected": -73.83495330810547, "loss": 0.5697, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6431670784950256, "rewards/margins": 3.4061439037323, "rewards/rejected": -2.7629764080047607, "step": 659 }, { "epoch": 1.9042975803539184, "grad_norm": 5.726454734802246, "learning_rate": 8.095375722543353e-07, "logits/chosen": -2.909200668334961, "logits/rejected": -3.0888924598693848, "logps/chosen": -42.985694885253906, "logps/rejected": -93.50128173828125, "loss": 0.5396, "rewards/accuracies": 0.96875, "rewards/chosen": 0.08721169829368591, "rewards/margins": 4.4482316970825195, "rewards/rejected": -4.361020088195801, "step": 660 }, { "epoch": 1.9071867100036113, "grad_norm": 5.621708869934082, "learning_rate": 8.092485549132948e-07, "logits/chosen": -2.900198221206665, "logits/rejected": -3.0312061309814453, "logps/chosen": -26.09656524658203, "logps/rejected": -78.39443969726562, "loss": 0.4747, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6629376411437988, "rewards/margins": 4.577685356140137, "rewards/rejected": -2.914747953414917, "step": 661 }, { "epoch": 1.9100758396533044, "grad_norm": 7.359631538391113, "learning_rate": 8.089595375722544e-07, "logits/chosen": -2.847289562225342, "logits/rejected": -3.0585389137268066, "logps/chosen": -29.31728172302246, "logps/rejected": -77.63565826416016, "loss": 0.5141, "rewards/accuracies": 1.0, "rewards/chosen": 1.5481106042861938, "rewards/margins": 4.783447265625, "rewards/rejected": -3.2353367805480957, "step": 662 }, { "epoch": 1.9129649693029975, "grad_norm": 6.026761054992676, "learning_rate": 8.086705202312138e-07, "logits/chosen": -2.946091890335083, "logits/rejected": -3.0833582878112793, "logps/chosen": -33.40098190307617, "logps/rejected": -89.85920715332031, "loss": 0.4021, "rewards/accuracies": 0.96875, "rewards/chosen": 1.362295150756836, "rewards/margins": 5.453577041625977, "rewards/rejected": -4.091281414031982, "step": 663 }, { "epoch": 1.9158540989526904, "grad_norm": 6.292086124420166, "learning_rate": 8.083815028901734e-07, "logits/chosen": -2.9832470417022705, "logits/rejected": -3.0572731494903564, "logps/chosen": -23.641983032226562, "logps/rejected": -77.7940444946289, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": 2.058725118637085, "rewards/margins": 4.8776326179504395, "rewards/rejected": -2.8189074993133545, "step": 664 }, { "epoch": 1.9187432286023836, "grad_norm": 5.766408920288086, "learning_rate": 8.080924855491329e-07, "logits/chosen": -2.875854969024658, "logits/rejected": -3.0044407844543457, "logps/chosen": -35.06092071533203, "logps/rejected": -85.40806579589844, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 1.3416409492492676, "rewards/margins": 4.99652099609375, "rewards/rejected": -3.6548800468444824, "step": 665 }, { "epoch": 1.9216323582520767, "grad_norm": 5.815342903137207, "learning_rate": 8.078034682080924e-07, "logits/chosen": -2.955214738845825, "logits/rejected": -3.196073532104492, "logps/chosen": -44.32563400268555, "logps/rejected": -79.72505187988281, "loss": 0.6298, "rewards/accuracies": 0.84375, "rewards/chosen": 0.15126359462738037, "rewards/margins": 3.3441321849823, "rewards/rejected": -3.192868232727051, "step": 666 }, { "epoch": 1.9245214879017696, "grad_norm": 6.327131271362305, "learning_rate": 8.075144508670519e-07, "logits/chosen": -2.8855161666870117, "logits/rejected": -3.0603151321411133, "logps/chosen": -36.15216064453125, "logps/rejected": -89.31976318359375, "loss": 0.4683, "rewards/accuracies": 1.0, "rewards/chosen": 1.321886658668518, "rewards/margins": 5.041203022003174, "rewards/rejected": -3.7193169593811035, "step": 667 }, { "epoch": 1.9274106175514625, "grad_norm": 5.322356224060059, "learning_rate": 8.072254335260116e-07, "logits/chosen": -3.0521907806396484, "logits/rejected": -3.1584672927856445, "logps/chosen": -53.387237548828125, "logps/rejected": -102.3505859375, "loss": 0.5048, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09176050126552582, "rewards/margins": 5.131049156188965, "rewards/rejected": -5.222809791564941, "step": 668 }, { "epoch": 1.9302997472011556, "grad_norm": 5.926821231842041, "learning_rate": 8.06936416184971e-07, "logits/chosen": -2.880244731903076, "logits/rejected": -3.174180030822754, "logps/chosen": -28.50019645690918, "logps/rejected": -89.452880859375, "loss": 0.4734, "rewards/accuracies": 1.0, "rewards/chosen": 1.4300963878631592, "rewards/margins": 5.639631748199463, "rewards/rejected": -4.209535121917725, "step": 669 }, { "epoch": 1.9331888768508487, "grad_norm": 5.1184468269348145, "learning_rate": 8.066473988439306e-07, "logits/chosen": -2.9582366943359375, "logits/rejected": -3.0702836513519287, "logps/chosen": -36.80727005004883, "logps/rejected": -78.49073028564453, "loss": 0.5157, "rewards/accuracies": 0.875, "rewards/chosen": 0.9787735939025879, "rewards/margins": 4.025045394897461, "rewards/rejected": -3.046271324157715, "step": 670 }, { "epoch": 1.9360780065005416, "grad_norm": 6.23281192779541, "learning_rate": 8.063583815028902e-07, "logits/chosen": -2.979663610458374, "logits/rejected": -3.1269407272338867, "logps/chosen": -36.596588134765625, "logps/rejected": -98.7701644897461, "loss": 0.4431, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8545740246772766, "rewards/margins": 5.698371410369873, "rewards/rejected": -4.843797206878662, "step": 671 }, { "epoch": 1.9389671361502347, "grad_norm": 7.39448356628418, "learning_rate": 8.060693641618497e-07, "logits/chosen": -2.9705004692077637, "logits/rejected": -3.10622239112854, "logps/chosen": -33.169471740722656, "logps/rejected": -67.08755493164062, "loss": 0.6086, "rewards/accuracies": 0.875, "rewards/chosen": 1.065769910812378, "rewards/margins": 3.1125106811523438, "rewards/rejected": -2.046740770339966, "step": 672 }, { "epoch": 1.9418562657999279, "grad_norm": 5.856557846069336, "learning_rate": 8.057803468208092e-07, "logits/chosen": -2.9048428535461426, "logits/rejected": -3.164964199066162, "logps/chosen": -32.07964324951172, "logps/rejected": -74.12225341796875, "loss": 0.5144, "rewards/accuracies": 1.0, "rewards/chosen": 1.4794905185699463, "rewards/margins": 4.059309005737305, "rewards/rejected": -2.5798180103302, "step": 673 }, { "epoch": 1.9447453954496208, "grad_norm": 5.171868324279785, "learning_rate": 8.054913294797687e-07, "logits/chosen": -2.901639461517334, "logits/rejected": -3.0840206146240234, "logps/chosen": -37.92768478393555, "logps/rejected": -94.47046661376953, "loss": 0.4885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8089265823364258, "rewards/margins": 5.425121307373047, "rewards/rejected": -4.616194248199463, "step": 674 }, { "epoch": 1.9476345250993137, "grad_norm": 6.282932758331299, "learning_rate": 8.052023121387283e-07, "logits/chosen": -2.899199962615967, "logits/rejected": -3.0768942832946777, "logps/chosen": -37.07691955566406, "logps/rejected": -84.76333618164062, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": 0.9387211799621582, "rewards/margins": 4.602282524108887, "rewards/rejected": -3.6635613441467285, "step": 675 }, { "epoch": 1.950523654749007, "grad_norm": 6.06162166595459, "learning_rate": 8.049132947976877e-07, "logits/chosen": -2.913146734237671, "logits/rejected": -3.1349949836730957, "logps/chosen": -31.881500244140625, "logps/rejected": -78.16751098632812, "loss": 0.423, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3345134258270264, "rewards/margins": 4.630168437957764, "rewards/rejected": -3.295654773712158, "step": 676 }, { "epoch": 1.9534127843987, "grad_norm": 7.746712684631348, "learning_rate": 8.046242774566474e-07, "logits/chosen": -2.925950527191162, "logits/rejected": -3.1128625869750977, "logps/chosen": -28.978073120117188, "logps/rejected": -72.84159088134766, "loss": 0.5456, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5638902187347412, "rewards/margins": 4.131750583648682, "rewards/rejected": -2.5678603649139404, "step": 677 }, { "epoch": 1.9563019140483928, "grad_norm": 5.554107666015625, "learning_rate": 8.043352601156069e-07, "logits/chosen": -3.039438009262085, "logits/rejected": -3.1892690658569336, "logps/chosen": -42.64824676513672, "logps/rejected": -97.61021423339844, "loss": 0.4527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.48523056507110596, "rewards/margins": 5.166911602020264, "rewards/rejected": -4.681680679321289, "step": 678 }, { "epoch": 1.959191043698086, "grad_norm": 5.910806179046631, "learning_rate": 8.040462427745665e-07, "logits/chosen": -2.995664119720459, "logits/rejected": -3.167083740234375, "logps/chosen": -37.13373565673828, "logps/rejected": -73.50940704345703, "loss": 0.6288, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8464086651802063, "rewards/margins": 3.447892427444458, "rewards/rejected": -2.6014838218688965, "step": 679 }, { "epoch": 1.962080173347779, "grad_norm": 6.2992472648620605, "learning_rate": 8.03757225433526e-07, "logits/chosen": -2.986361026763916, "logits/rejected": -3.1164352893829346, "logps/chosen": -40.4788703918457, "logps/rejected": -96.547607421875, "loss": 0.5168, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5288674831390381, "rewards/margins": 5.307679176330566, "rewards/rejected": -4.778812408447266, "step": 680 }, { "epoch": 1.964969302997472, "grad_norm": 5.078592300415039, "learning_rate": 8.034682080924855e-07, "logits/chosen": -3.028524398803711, "logits/rejected": -3.1507019996643066, "logps/chosen": -38.55777359008789, "logps/rejected": -81.84357452392578, "loss": 0.5947, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9617204070091248, "rewards/margins": 4.085726737976074, "rewards/rejected": -3.1240057945251465, "step": 681 }, { "epoch": 1.967858432647165, "grad_norm": 5.993879795074463, "learning_rate": 8.031791907514451e-07, "logits/chosen": -2.985337257385254, "logits/rejected": -3.232396125793457, "logps/chosen": -38.60255813598633, "logps/rejected": -87.2102279663086, "loss": 0.4738, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7949625253677368, "rewards/margins": 4.831089973449707, "rewards/rejected": -4.03612756729126, "step": 682 }, { "epoch": 1.9707475622968582, "grad_norm": 5.845585823059082, "learning_rate": 8.028901734104045e-07, "logits/chosen": -3.0505194664001465, "logits/rejected": -3.2557356357574463, "logps/chosen": -36.20124053955078, "logps/rejected": -84.59063720703125, "loss": 0.458, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0472619533538818, "rewards/margins": 4.6977410316467285, "rewards/rejected": -3.6504788398742676, "step": 683 }, { "epoch": 1.9736366919465511, "grad_norm": 5.729684352874756, "learning_rate": 8.026011560693641e-07, "logits/chosen": -2.97812557220459, "logits/rejected": -3.1200385093688965, "logps/chosen": -39.094444274902344, "logps/rejected": -88.58628845214844, "loss": 0.5065, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7655878067016602, "rewards/margins": 4.793065547943115, "rewards/rejected": -4.027477741241455, "step": 684 }, { "epoch": 1.976525821596244, "grad_norm": 6.13563346862793, "learning_rate": 8.023121387283237e-07, "logits/chosen": -2.9799869060516357, "logits/rejected": -3.1528515815734863, "logps/chosen": -38.23456573486328, "logps/rejected": -93.73779296875, "loss": 0.4892, "rewards/accuracies": 0.96875, "rewards/chosen": 1.017842173576355, "rewards/margins": 5.362667560577393, "rewards/rejected": -4.344825267791748, "step": 685 }, { "epoch": 1.9794149512459371, "grad_norm": 7.936887741088867, "learning_rate": 8.020231213872832e-07, "logits/chosen": -3.1005172729492188, "logits/rejected": -3.223294973373413, "logps/chosen": -37.46775436401367, "logps/rejected": -89.77969360351562, "loss": 0.4951, "rewards/accuracies": 1.0, "rewards/chosen": 0.8594723343849182, "rewards/margins": 5.115813255310059, "rewards/rejected": -4.256340980529785, "step": 686 }, { "epoch": 1.9823040808956303, "grad_norm": 5.150296688079834, "learning_rate": 8.017341040462427e-07, "logits/chosen": -2.9882616996765137, "logits/rejected": -3.268350124359131, "logps/chosen": -30.068458557128906, "logps/rejected": -85.70452880859375, "loss": 0.4623, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5475133657455444, "rewards/margins": 5.4526896476745605, "rewards/rejected": -3.9051761627197266, "step": 687 }, { "epoch": 1.9851932105453232, "grad_norm": 5.264040470123291, "learning_rate": 8.014450867052023e-07, "logits/chosen": -2.970850706100464, "logits/rejected": -3.081343173980713, "logps/chosen": -37.131046295166016, "logps/rejected": -86.20039367675781, "loss": 0.5014, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8634870052337646, "rewards/margins": 4.504019737243652, "rewards/rejected": -3.640532970428467, "step": 688 }, { "epoch": 1.9880823401950163, "grad_norm": 5.040195465087891, "learning_rate": 8.011560693641618e-07, "logits/chosen": -2.9382729530334473, "logits/rejected": -3.046417236328125, "logps/chosen": -40.684940338134766, "logps/rejected": -97.37854766845703, "loss": 0.4566, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6490885019302368, "rewards/margins": 5.360957622528076, "rewards/rejected": -4.711868762969971, "step": 689 }, { "epoch": 1.9909714698447094, "grad_norm": 8.882342338562012, "learning_rate": 8.008670520231213e-07, "logits/chosen": -3.0084638595581055, "logits/rejected": -3.213249683380127, "logps/chosen": -48.857337951660156, "logps/rejected": -100.72569274902344, "loss": 0.5663, "rewards/accuracies": 0.90625, "rewards/chosen": 0.05295965075492859, "rewards/margins": 5.185053825378418, "rewards/rejected": -5.132094383239746, "step": 690 }, { "epoch": 1.9938605994944023, "grad_norm": 5.703953266143799, "learning_rate": 8.005780346820809e-07, "logits/chosen": -2.8425638675689697, "logits/rejected": -2.976069450378418, "logps/chosen": -36.28514099121094, "logps/rejected": -73.9004135131836, "loss": 0.5152, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3009943962097168, "rewards/margins": 3.8213069438934326, "rewards/rejected": -2.5203123092651367, "step": 691 }, { "epoch": 1.9967497291440952, "grad_norm": 6.961340427398682, "learning_rate": 8.002890173410404e-07, "logits/chosen": -2.967747926712036, "logits/rejected": -3.125976800918579, "logps/chosen": -39.39066696166992, "logps/rejected": -91.12981414794922, "loss": 0.5226, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5182185769081116, "rewards/margins": 4.6886467933654785, "rewards/rejected": -4.170428276062012, "step": 692 }, { "epoch": 1.9996388587937883, "grad_norm": 6.709586143493652, "learning_rate": 8e-07, "logits/chosen": -2.987633466720581, "logits/rejected": -3.190544605255127, "logps/chosen": -32.63578414916992, "logps/rejected": -82.37600708007812, "loss": 0.4406, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1243561506271362, "rewards/margins": 4.6096296310424805, "rewards/rejected": -3.485273599624634, "step": 693 }, { "epoch": 2.0, "grad_norm": 1.401968002319336, "learning_rate": 7.997109826589595e-07, "logits/chosen": -2.8360862731933594, "logits/rejected": -2.842268943786621, "logps/chosen": -20.761390686035156, "logps/rejected": -65.69432830810547, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 2.929753303527832, "rewards/margins": 4.535924911499023, "rewards/rejected": -1.6061716079711914, "step": 694 }, { "epoch": 2.002889129649693, "grad_norm": 6.8562421798706055, "learning_rate": 7.994219653179191e-07, "logits/chosen": -2.9529571533203125, "logits/rejected": -3.05726957321167, "logps/chosen": -28.10107421875, "logps/rejected": -75.10818481445312, "loss": 0.4256, "rewards/accuracies": 0.96875, "rewards/chosen": 1.512214183807373, "rewards/margins": 4.474913120269775, "rewards/rejected": -2.9626989364624023, "step": 695 }, { "epoch": 2.0057782592993862, "grad_norm": 5.7676005363464355, "learning_rate": 7.991329479768786e-07, "logits/chosen": -2.986109972000122, "logits/rejected": -3.1936559677124023, "logps/chosen": -34.541259765625, "logps/rejected": -90.24646759033203, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297841191291809, "rewards/margins": 5.418911933898926, "rewards/rejected": -4.4891276359558105, "step": 696 }, { "epoch": 2.008667388949079, "grad_norm": 8.037938117980957, "learning_rate": 7.988439306358381e-07, "logits/chosen": -2.9166696071624756, "logits/rejected": -3.1207969188690186, "logps/chosen": -23.478824615478516, "logps/rejected": -74.52700805664062, "loss": 0.4096, "rewards/accuracies": 1.0, "rewards/chosen": 2.1930060386657715, "rewards/margins": 4.943049430847168, "rewards/rejected": -2.7500433921813965, "step": 697 }, { "epoch": 2.011556518598772, "grad_norm": 4.7359538078308105, "learning_rate": 7.985549132947976e-07, "logits/chosen": -2.963585615158081, "logits/rejected": -3.039701461791992, "logps/chosen": -31.549711227416992, "logps/rejected": -72.77433776855469, "loss": 0.5674, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6456918716430664, "rewards/margins": 4.1162824630737305, "rewards/rejected": -2.470590591430664, "step": 698 }, { "epoch": 2.014445648248465, "grad_norm": 5.23918342590332, "learning_rate": 7.982658959537572e-07, "logits/chosen": -2.9175868034362793, "logits/rejected": -3.106886386871338, "logps/chosen": -32.02226257324219, "logps/rejected": -78.75151062011719, "loss": 0.5261, "rewards/accuracies": 1.0, "rewards/chosen": 1.3091003894805908, "rewards/margins": 4.4271626472473145, "rewards/rejected": -3.1180622577667236, "step": 699 }, { "epoch": 2.0173347778981583, "grad_norm": 6.860978126525879, "learning_rate": 7.979768786127166e-07, "logits/chosen": -2.9223480224609375, "logits/rejected": -3.122480630874634, "logps/chosen": -36.8601188659668, "logps/rejected": -94.07476043701172, "loss": 0.4548, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9118767380714417, "rewards/margins": 5.625308513641357, "rewards/rejected": -4.7134318351745605, "step": 700 }, { "epoch": 2.0173347778981583, "eval_logits/chosen": -3.02260684967041, "eval_logits/rejected": -3.184335470199585, "eval_logps/chosen": -37.05479049682617, "eval_logps/rejected": -89.26454162597656, "eval_loss": 0.49909529089927673, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.9261625409126282, "eval_rewards/margins": 5.157599449157715, "eval_rewards/rejected": -4.2314372062683105, "eval_runtime": 250.0917, "eval_samples_per_second": 0.492, "eval_steps_per_second": 0.248, "step": 700 }, { "epoch": 2.020223907547851, "grad_norm": 7.2794671058654785, "learning_rate": 7.976878612716763e-07, "logits/chosen": -2.888192653656006, "logits/rejected": -3.1444971561431885, "logps/chosen": -28.27557373046875, "logps/rejected": -75.97660064697266, "loss": 0.5, "rewards/accuracies": 0.96875, "rewards/chosen": 1.746099591255188, "rewards/margins": 4.580921173095703, "rewards/rejected": -2.834821939468384, "step": 701 }, { "epoch": 2.023113037197544, "grad_norm": 4.987190246582031, "learning_rate": 7.973988439306359e-07, "logits/chosen": -2.952162981033325, "logits/rejected": -3.116055965423584, "logps/chosen": -30.948013305664062, "logps/rejected": -79.80277252197266, "loss": 0.4649, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3605382442474365, "rewards/margins": 4.5741868019104, "rewards/rejected": -3.213648796081543, "step": 702 }, { "epoch": 2.0260021668472374, "grad_norm": 6.597548007965088, "learning_rate": 7.971098265895953e-07, "logits/chosen": -2.9266412258148193, "logits/rejected": -3.0563342571258545, "logps/chosen": -37.612823486328125, "logps/rejected": -76.3134765625, "loss": 0.5386, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8606700301170349, "rewards/margins": 3.759525775909424, "rewards/rejected": -2.898855686187744, "step": 703 }, { "epoch": 2.0288912964969303, "grad_norm": 6.418520450592041, "learning_rate": 7.968208092485549e-07, "logits/chosen": -2.919698715209961, "logits/rejected": -3.107138156890869, "logps/chosen": -31.192407608032227, "logps/rejected": -85.7509765625, "loss": 0.472, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4116394519805908, "rewards/margins": 4.881299018859863, "rewards/rejected": -3.4696598052978516, "step": 704 }, { "epoch": 2.0317804261466232, "grad_norm": 4.7809157371521, "learning_rate": 7.965317919075144e-07, "logits/chosen": -2.9525837898254395, "logits/rejected": -3.0449557304382324, "logps/chosen": -34.54072189331055, "logps/rejected": -88.07892608642578, "loss": 0.5074, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8218435049057007, "rewards/margins": 4.789416313171387, "rewards/rejected": -3.9675724506378174, "step": 705 }, { "epoch": 2.034669555796316, "grad_norm": 4.618563175201416, "learning_rate": 7.96242774566474e-07, "logits/chosen": -3.048676013946533, "logits/rejected": -3.143731117248535, "logps/chosen": -31.071439743041992, "logps/rejected": -83.87669372558594, "loss": 0.4557, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4606343507766724, "rewards/margins": 5.040596961975098, "rewards/rejected": -3.579962730407715, "step": 706 }, { "epoch": 2.0375586854460095, "grad_norm": 6.310376167297363, "learning_rate": 7.959537572254334e-07, "logits/chosen": -3.080840587615967, "logits/rejected": -3.2143783569335938, "logps/chosen": -40.11782455444336, "logps/rejected": -86.11071014404297, "loss": 0.518, "rewards/accuracies": 1.0, "rewards/chosen": 0.41728782653808594, "rewards/margins": 4.474344730377197, "rewards/rejected": -4.057056903839111, "step": 707 }, { "epoch": 2.0404478150957024, "grad_norm": 4.520307540893555, "learning_rate": 7.95664739884393e-07, "logits/chosen": -3.0505855083465576, "logits/rejected": -3.173330068588257, "logps/chosen": -40.34224319458008, "logps/rejected": -87.24307250976562, "loss": 0.53, "rewards/accuracies": 0.875, "rewards/chosen": 0.8092873096466064, "rewards/margins": 4.4743170738220215, "rewards/rejected": -3.665030002593994, "step": 708 }, { "epoch": 2.0433369447453953, "grad_norm": 6.281489849090576, "learning_rate": 7.953757225433526e-07, "logits/chosen": -3.038400888442993, "logits/rejected": -3.156240940093994, "logps/chosen": -35.31046676635742, "logps/rejected": -90.43082427978516, "loss": 0.4332, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1110025644302368, "rewards/margins": 5.626880168914795, "rewards/rejected": -4.515878200531006, "step": 709 }, { "epoch": 2.0462260743950886, "grad_norm": 4.915785789489746, "learning_rate": 7.950867052023121e-07, "logits/chosen": -2.907407760620117, "logits/rejected": -3.009284019470215, "logps/chosen": -32.01826477050781, "logps/rejected": -92.51264953613281, "loss": 0.472, "rewards/accuracies": 0.96875, "rewards/chosen": 1.740984559059143, "rewards/margins": 5.648333549499512, "rewards/rejected": -3.907349109649658, "step": 710 }, { "epoch": 2.0491152040447815, "grad_norm": 6.4368696212768555, "learning_rate": 7.947976878612716e-07, "logits/chosen": -2.9456992149353027, "logits/rejected": -3.083733558654785, "logps/chosen": -34.54573059082031, "logps/rejected": -90.56426239013672, "loss": 0.4536, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1951971054077148, "rewards/margins": 5.158744812011719, "rewards/rejected": -3.963548183441162, "step": 711 }, { "epoch": 2.0520043336944744, "grad_norm": 6.135793209075928, "learning_rate": 7.945086705202312e-07, "logits/chosen": -3.0073812007904053, "logits/rejected": -3.2323219776153564, "logps/chosen": -33.492897033691406, "logps/rejected": -96.89982604980469, "loss": 0.3734, "rewards/accuracies": 1.0, "rewards/chosen": 1.2207906246185303, "rewards/margins": 5.794336795806885, "rewards/rejected": -4.573546409606934, "step": 712 }, { "epoch": 2.054893463344168, "grad_norm": 6.321246147155762, "learning_rate": 7.942196531791908e-07, "logits/chosen": -3.042720079421997, "logits/rejected": -3.262045383453369, "logps/chosen": -34.910255432128906, "logps/rejected": -80.38431549072266, "loss": 0.5162, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1551896333694458, "rewards/margins": 4.491057395935059, "rewards/rejected": -3.3358676433563232, "step": 713 }, { "epoch": 2.0577825929938607, "grad_norm": 6.022965431213379, "learning_rate": 7.939306358381502e-07, "logits/chosen": -2.9321141242980957, "logits/rejected": -3.0189318656921387, "logps/chosen": -32.46451950073242, "logps/rejected": -90.03841400146484, "loss": 0.4472, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2877639532089233, "rewards/margins": 5.446091651916504, "rewards/rejected": -4.158327579498291, "step": 714 }, { "epoch": 2.0606717226435536, "grad_norm": 8.460423469543457, "learning_rate": 7.936416184971098e-07, "logits/chosen": -2.992858409881592, "logits/rejected": -3.2361855506896973, "logps/chosen": -34.27162170410156, "logps/rejected": -96.00141906738281, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 1.0362247228622437, "rewards/margins": 5.653861999511719, "rewards/rejected": -4.6176371574401855, "step": 715 }, { "epoch": 2.0635608522932465, "grad_norm": 7.020964622497559, "learning_rate": 7.933526011560693e-07, "logits/chosen": -3.0349936485290527, "logits/rejected": -3.170804023742676, "logps/chosen": -32.543922424316406, "logps/rejected": -80.59713745117188, "loss": 0.4628, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2447434663772583, "rewards/margins": 4.516265869140625, "rewards/rejected": -3.271522045135498, "step": 716 }, { "epoch": 2.06644998194294, "grad_norm": 5.677659511566162, "learning_rate": 7.930635838150289e-07, "logits/chosen": -3.0051651000976562, "logits/rejected": -3.0997252464294434, "logps/chosen": -30.389949798583984, "logps/rejected": -77.81022644042969, "loss": 0.5798, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2261536121368408, "rewards/margins": 4.468410491943359, "rewards/rejected": -3.2422568798065186, "step": 717 }, { "epoch": 2.0693391115926327, "grad_norm": 6.843306064605713, "learning_rate": 7.927745664739884e-07, "logits/chosen": -3.0452675819396973, "logits/rejected": -3.1844992637634277, "logps/chosen": -30.942556381225586, "logps/rejected": -76.35688781738281, "loss": 0.461, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3894994258880615, "rewards/margins": 4.382437229156494, "rewards/rejected": -2.9929378032684326, "step": 718 }, { "epoch": 2.0722282412423256, "grad_norm": 5.4665093421936035, "learning_rate": 7.92485549132948e-07, "logits/chosen": -2.944218158721924, "logits/rejected": -3.12618350982666, "logps/chosen": -32.285736083984375, "logps/rejected": -92.16851043701172, "loss": 0.4164, "rewards/accuracies": 1.0, "rewards/chosen": 1.3505882024765015, "rewards/margins": 5.896647930145264, "rewards/rejected": -4.546059608459473, "step": 719 }, { "epoch": 2.075117370892019, "grad_norm": 4.840363025665283, "learning_rate": 7.921965317919074e-07, "logits/chosen": -2.937197208404541, "logits/rejected": -3.172323703765869, "logps/chosen": -37.27968215942383, "logps/rejected": -82.439208984375, "loss": 0.4724, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0026384592056274, "rewards/margins": 4.622501850128174, "rewards/rejected": -3.619863748550415, "step": 720 }, { "epoch": 2.078006500541712, "grad_norm": 17.80331802368164, "learning_rate": 7.91907514450867e-07, "logits/chosen": -2.961294412612915, "logits/rejected": -3.0352158546447754, "logps/chosen": -28.737672805786133, "logps/rejected": -73.1213607788086, "loss": 0.4936, "rewards/accuracies": 1.0, "rewards/chosen": 1.7672371864318848, "rewards/margins": 4.320464611053467, "rewards/rejected": -2.553226947784424, "step": 721 }, { "epoch": 2.0808956301914048, "grad_norm": 7.141528129577637, "learning_rate": 7.916184971098265e-07, "logits/chosen": -3.0764541625976562, "logits/rejected": -3.2035136222839355, "logps/chosen": -39.24583053588867, "logps/rejected": -79.32208251953125, "loss": 0.5053, "rewards/accuracies": 0.875, "rewards/chosen": 0.7054703831672668, "rewards/margins": 3.8574867248535156, "rewards/rejected": -3.1520166397094727, "step": 722 }, { "epoch": 2.0837847598410977, "grad_norm": 6.357395172119141, "learning_rate": 7.913294797687861e-07, "logits/chosen": -2.9688379764556885, "logits/rejected": -3.2063167095184326, "logps/chosen": -34.99431610107422, "logps/rejected": -84.08784484863281, "loss": 0.5314, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9042757153511047, "rewards/margins": 4.837939739227295, "rewards/rejected": -3.933664083480835, "step": 723 }, { "epoch": 2.086673889490791, "grad_norm": 5.655086040496826, "learning_rate": 7.910404624277456e-07, "logits/chosen": -2.9122211933135986, "logits/rejected": -3.099104166030884, "logps/chosen": -34.87295913696289, "logps/rejected": -83.96915435791016, "loss": 0.5024, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0472062826156616, "rewards/margins": 4.714405536651611, "rewards/rejected": -3.6671996116638184, "step": 724 }, { "epoch": 2.089563019140484, "grad_norm": 12.972542762756348, "learning_rate": 7.907514450867052e-07, "logits/chosen": -3.006436824798584, "logits/rejected": -3.2770090103149414, "logps/chosen": -37.43876266479492, "logps/rejected": -90.66030883789062, "loss": 0.5404, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7922465801239014, "rewards/margins": 4.935737609863281, "rewards/rejected": -4.143490314483643, "step": 725 }, { "epoch": 2.092452148790177, "grad_norm": 5.659564018249512, "learning_rate": 7.904624277456648e-07, "logits/chosen": -2.9433531761169434, "logits/rejected": -3.1599698066711426, "logps/chosen": -31.11492919921875, "logps/rejected": -77.48043823242188, "loss": 0.5073, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3857172727584839, "rewards/margins": 4.424158096313477, "rewards/rejected": -3.038440227508545, "step": 726 }, { "epoch": 2.09534127843987, "grad_norm": 6.587910175323486, "learning_rate": 7.901734104046242e-07, "logits/chosen": -2.9890990257263184, "logits/rejected": -3.149970293045044, "logps/chosen": -27.504268646240234, "logps/rejected": -86.45973205566406, "loss": 0.4355, "rewards/accuracies": 1.0, "rewards/chosen": 1.7089130878448486, "rewards/margins": 5.406412124633789, "rewards/rejected": -3.6974987983703613, "step": 727 }, { "epoch": 2.098230408089563, "grad_norm": 6.718311309814453, "learning_rate": 7.898843930635838e-07, "logits/chosen": -2.911851406097412, "logits/rejected": -3.104433298110962, "logps/chosen": -31.47101402282715, "logps/rejected": -87.35575103759766, "loss": 0.4225, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2247806787490845, "rewards/margins": 5.2898268699646, "rewards/rejected": -4.065046310424805, "step": 728 }, { "epoch": 2.101119537739256, "grad_norm": 6.337939262390137, "learning_rate": 7.895953757225433e-07, "logits/chosen": -2.928579807281494, "logits/rejected": -3.084653854370117, "logps/chosen": -31.12701988220215, "logps/rejected": -81.37339782714844, "loss": 0.4898, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6513149738311768, "rewards/margins": 4.985055446624756, "rewards/rejected": -3.333740234375, "step": 729 }, { "epoch": 2.104008667388949, "grad_norm": 8.103276252746582, "learning_rate": 7.893063583815029e-07, "logits/chosen": -2.9459362030029297, "logits/rejected": -3.098142147064209, "logps/chosen": -29.741870880126953, "logps/rejected": -76.80816650390625, "loss": 0.4576, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6028882265090942, "rewards/margins": 4.334708213806152, "rewards/rejected": -2.7318203449249268, "step": 730 }, { "epoch": 2.106897797038642, "grad_norm": 6.11631441116333, "learning_rate": 7.890173410404623e-07, "logits/chosen": -2.979278326034546, "logits/rejected": -3.0995500087738037, "logps/chosen": -33.73371887207031, "logps/rejected": -88.19420623779297, "loss": 0.4395, "rewards/accuracies": 1.0, "rewards/chosen": 0.9756962656974792, "rewards/margins": 5.087556838989258, "rewards/rejected": -4.111860752105713, "step": 731 }, { "epoch": 2.109786926688335, "grad_norm": 6.971851348876953, "learning_rate": 7.887283236994219e-07, "logits/chosen": -2.9925031661987305, "logits/rejected": -3.138026475906372, "logps/chosen": -39.571014404296875, "logps/rejected": -84.92323303222656, "loss": 0.4953, "rewards/accuracies": 1.0, "rewards/chosen": 1.0561673641204834, "rewards/margins": 4.8554253578186035, "rewards/rejected": -3.799258232116699, "step": 732 }, { "epoch": 2.112676056338028, "grad_norm": 8.449911117553711, "learning_rate": 7.884393063583816e-07, "logits/chosen": -3.0706653594970703, "logits/rejected": -3.1267611980438232, "logps/chosen": -44.73291015625, "logps/rejected": -84.25572204589844, "loss": 0.5634, "rewards/accuracies": 0.90625, "rewards/chosen": 0.31574517488479614, "rewards/margins": 3.939117670059204, "rewards/rejected": -3.6233725547790527, "step": 733 }, { "epoch": 2.1155651859877214, "grad_norm": 6.28912353515625, "learning_rate": 7.88150289017341e-07, "logits/chosen": -2.9533510208129883, "logits/rejected": -3.1109323501586914, "logps/chosen": -41.9156379699707, "logps/rejected": -99.68185424804688, "loss": 0.504, "rewards/accuracies": 1.0, "rewards/chosen": 0.6306040287017822, "rewards/margins": 5.507490158081055, "rewards/rejected": -4.876885890960693, "step": 734 }, { "epoch": 2.1184543156374143, "grad_norm": 6.606508255004883, "learning_rate": 7.878612716763006e-07, "logits/chosen": -3.026280641555786, "logits/rejected": -3.272963047027588, "logps/chosen": -45.97677993774414, "logps/rejected": -103.50189208984375, "loss": 0.5216, "rewards/accuracies": 0.96875, "rewards/chosen": -0.06356006860733032, "rewards/margins": 5.78660249710083, "rewards/rejected": -5.850163459777832, "step": 735 }, { "epoch": 2.121343445287107, "grad_norm": 7.174530506134033, "learning_rate": 7.875722543352601e-07, "logits/chosen": -2.9956159591674805, "logits/rejected": -3.09269642829895, "logps/chosen": -29.783960342407227, "logps/rejected": -82.39452362060547, "loss": 0.4517, "rewards/accuracies": 1.0, "rewards/chosen": 1.4611526727676392, "rewards/margins": 5.021931171417236, "rewards/rejected": -3.5607781410217285, "step": 736 }, { "epoch": 2.1242325749368005, "grad_norm": 5.514847278594971, "learning_rate": 7.872832369942196e-07, "logits/chosen": -2.953570604324341, "logits/rejected": -3.0856385231018066, "logps/chosen": -33.94776153564453, "logps/rejected": -87.23409271240234, "loss": 0.4664, "rewards/accuracies": 1.0, "rewards/chosen": 0.9319786429405212, "rewards/margins": 5.069998741149902, "rewards/rejected": -4.138019561767578, "step": 737 }, { "epoch": 2.1271217045864934, "grad_norm": 6.274872303009033, "learning_rate": 7.869942196531791e-07, "logits/chosen": -2.978684902191162, "logits/rejected": -3.1195058822631836, "logps/chosen": -43.21322250366211, "logps/rejected": -91.16759490966797, "loss": 0.6226, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2053849995136261, "rewards/margins": 4.317602157592773, "rewards/rejected": -4.112217426300049, "step": 738 }, { "epoch": 2.1300108342361863, "grad_norm": 6.273994445800781, "learning_rate": 7.867052023121387e-07, "logits/chosen": -2.9032862186431885, "logits/rejected": -3.0864436626434326, "logps/chosen": -37.87358474731445, "logps/rejected": -87.28439331054688, "loss": 0.4753, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9062228202819824, "rewards/margins": 4.840488910675049, "rewards/rejected": -3.9342665672302246, "step": 739 }, { "epoch": 2.132899963885879, "grad_norm": 6.015315055847168, "learning_rate": 7.864161849710982e-07, "logits/chosen": -2.9858100414276123, "logits/rejected": -3.210482597351074, "logps/chosen": -32.121124267578125, "logps/rejected": -89.93128204345703, "loss": 0.4364, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2092779874801636, "rewards/margins": 5.352938175201416, "rewards/rejected": -4.143661022186279, "step": 740 }, { "epoch": 2.1357890935355726, "grad_norm": 3.755516529083252, "learning_rate": 7.861271676300578e-07, "logits/chosen": -3.040121078491211, "logits/rejected": -3.173438549041748, "logps/chosen": -41.87519073486328, "logps/rejected": -89.95869445800781, "loss": 0.5269, "rewards/accuracies": 0.90625, "rewards/chosen": 0.11013069748878479, "rewards/margins": 4.456624507904053, "rewards/rejected": -4.346493721008301, "step": 741 }, { "epoch": 2.1386782231852655, "grad_norm": 9.310901641845703, "learning_rate": 7.858381502890173e-07, "logits/chosen": -2.9814295768737793, "logits/rejected": -3.105982780456543, "logps/chosen": -23.641254425048828, "logps/rejected": -77.10796356201172, "loss": 0.4192, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0129010677337646, "rewards/margins": 5.156200408935547, "rewards/rejected": -3.143299102783203, "step": 742 }, { "epoch": 2.1415673528349584, "grad_norm": 6.834477424621582, "learning_rate": 7.855491329479769e-07, "logits/chosen": -2.989231586456299, "logits/rejected": -3.139280319213867, "logps/chosen": -42.277400970458984, "logps/rejected": -77.64715576171875, "loss": 0.5674, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29802206158638, "rewards/margins": 3.6812350749969482, "rewards/rejected": -3.3832130432128906, "step": 743 }, { "epoch": 2.1444564824846517, "grad_norm": 5.025114059448242, "learning_rate": 7.852601156069363e-07, "logits/chosen": -2.9860191345214844, "logits/rejected": -3.2737293243408203, "logps/chosen": -36.156768798828125, "logps/rejected": -102.41941833496094, "loss": 0.4629, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0910975933074951, "rewards/margins": 6.2518310546875, "rewards/rejected": -5.160734176635742, "step": 744 }, { "epoch": 2.1473456121343446, "grad_norm": 5.662807464599609, "learning_rate": 7.849710982658959e-07, "logits/chosen": -3.0086331367492676, "logits/rejected": -3.1849381923675537, "logps/chosen": -27.4437198638916, "logps/rejected": -84.08712005615234, "loss": 0.4157, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6172171831130981, "rewards/margins": 5.447690486907959, "rewards/rejected": -3.8304731845855713, "step": 745 }, { "epoch": 2.1502347417840375, "grad_norm": 5.705535411834717, "learning_rate": 7.846820809248555e-07, "logits/chosen": -3.074054718017578, "logits/rejected": -3.268482208251953, "logps/chosen": -31.018869400024414, "logps/rejected": -80.46067810058594, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 1.6455118656158447, "rewards/margins": 4.934002876281738, "rewards/rejected": -3.2884905338287354, "step": 746 }, { "epoch": 2.1531238714337304, "grad_norm": 6.041797637939453, "learning_rate": 7.843930635838149e-07, "logits/chosen": -3.066805839538574, "logits/rejected": -3.2807517051696777, "logps/chosen": -26.268421173095703, "logps/rejected": -84.15154266357422, "loss": 0.3937, "rewards/accuracies": 0.96875, "rewards/chosen": 1.858206033706665, "rewards/margins": 5.197409629821777, "rewards/rejected": -3.3392038345336914, "step": 747 }, { "epoch": 2.1560130010834238, "grad_norm": 5.5632500648498535, "learning_rate": 7.841040462427745e-07, "logits/chosen": -3.015317440032959, "logits/rejected": -3.212616205215454, "logps/chosen": -32.38658905029297, "logps/rejected": -90.75804901123047, "loss": 0.4676, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5197981595993042, "rewards/margins": 5.589811325073242, "rewards/rejected": -4.070013046264648, "step": 748 }, { "epoch": 2.1589021307331167, "grad_norm": 6.919811248779297, "learning_rate": 7.838150289017341e-07, "logits/chosen": -3.042743682861328, "logits/rejected": -3.141134738922119, "logps/chosen": -43.4727897644043, "logps/rejected": -91.40890502929688, "loss": 0.5504, "rewards/accuracies": 1.0, "rewards/chosen": 0.2589297592639923, "rewards/margins": 4.463791847229004, "rewards/rejected": -4.204862117767334, "step": 749 }, { "epoch": 2.1617912603828096, "grad_norm": 5.9210638999938965, "learning_rate": 7.835260115606937e-07, "logits/chosen": -2.956435203552246, "logits/rejected": -3.159775733947754, "logps/chosen": -31.811119079589844, "logps/rejected": -79.19149780273438, "loss": 0.5795, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3113930225372314, "rewards/margins": 4.484503269195557, "rewards/rejected": -3.173110008239746, "step": 750 }, { "epoch": 2.164680390032503, "grad_norm": 6.325962543487549, "learning_rate": 7.832369942196531e-07, "logits/chosen": -3.0422399044036865, "logits/rejected": -3.168555736541748, "logps/chosen": -37.49549102783203, "logps/rejected": -88.64521026611328, "loss": 0.4145, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2878117561340332, "rewards/margins": 5.307209491729736, "rewards/rejected": -4.019397735595703, "step": 751 }, { "epoch": 2.167569519682196, "grad_norm": 6.233355522155762, "learning_rate": 7.829479768786127e-07, "logits/chosen": -2.8885014057159424, "logits/rejected": -2.9981536865234375, "logps/chosen": -38.5592155456543, "logps/rejected": -81.27747344970703, "loss": 0.506, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7427980303764343, "rewards/margins": 3.8892557621002197, "rewards/rejected": -3.1464579105377197, "step": 752 }, { "epoch": 2.1704586493318887, "grad_norm": 5.257051467895508, "learning_rate": 7.826589595375722e-07, "logits/chosen": -2.99542236328125, "logits/rejected": -3.186781883239746, "logps/chosen": -37.793697357177734, "logps/rejected": -78.9001693725586, "loss": 0.5827, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7105236649513245, "rewards/margins": 3.791496992111206, "rewards/rejected": -3.0809733867645264, "step": 753 }, { "epoch": 2.1733477789815816, "grad_norm": 6.893218994140625, "learning_rate": 7.823699421965317e-07, "logits/chosen": -3.051917791366577, "logits/rejected": -3.2446224689483643, "logps/chosen": -35.29770278930664, "logps/rejected": -81.05123138427734, "loss": 0.5135, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9697397351264954, "rewards/margins": 4.460207939147949, "rewards/rejected": -3.4904682636260986, "step": 754 }, { "epoch": 2.176236908631275, "grad_norm": 5.531877040863037, "learning_rate": 7.820809248554912e-07, "logits/chosen": -3.047898292541504, "logits/rejected": -3.23604679107666, "logps/chosen": -30.166810989379883, "logps/rejected": -81.825927734375, "loss": 0.3724, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5876976251602173, "rewards/margins": 5.424752712249756, "rewards/rejected": -3.837055206298828, "step": 755 }, { "epoch": 2.179126038280968, "grad_norm": 6.017573833465576, "learning_rate": 7.817919075144508e-07, "logits/chosen": -3.1148314476013184, "logits/rejected": -3.1489405632019043, "logps/chosen": -38.285465240478516, "logps/rejected": -78.74629211425781, "loss": 0.5692, "rewards/accuracies": 0.875, "rewards/chosen": 1.1192975044250488, "rewards/margins": 4.112434387207031, "rewards/rejected": -2.9931373596191406, "step": 756 }, { "epoch": 2.1820151679306607, "grad_norm": 6.352273464202881, "learning_rate": 7.815028901734105e-07, "logits/chosen": -2.952488899230957, "logits/rejected": -3.044002056121826, "logps/chosen": -35.30575180053711, "logps/rejected": -81.82289123535156, "loss": 0.4645, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9610450267791748, "rewards/margins": 4.532365798950195, "rewards/rejected": -3.5713207721710205, "step": 757 }, { "epoch": 2.184904297580354, "grad_norm": 6.385003566741943, "learning_rate": 7.812138728323699e-07, "logits/chosen": -3.0244884490966797, "logits/rejected": -3.1600422859191895, "logps/chosen": -30.38421630859375, "logps/rejected": -77.73979187011719, "loss": 0.3792, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9174721240997314, "rewards/margins": 4.831384658813477, "rewards/rejected": -2.913912296295166, "step": 758 }, { "epoch": 2.187793427230047, "grad_norm": 7.461294174194336, "learning_rate": 7.809248554913295e-07, "logits/chosen": -2.9424540996551514, "logits/rejected": -3.1173958778381348, "logps/chosen": -37.929691314697266, "logps/rejected": -94.24299621582031, "loss": 0.4207, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6328985691070557, "rewards/margins": 5.334482669830322, "rewards/rejected": -4.701584339141846, "step": 759 }, { "epoch": 2.19068255687974, "grad_norm": 7.412065505981445, "learning_rate": 7.80635838150289e-07, "logits/chosen": -2.906187057495117, "logits/rejected": -3.009211778640747, "logps/chosen": -31.900951385498047, "logps/rejected": -80.74220275878906, "loss": 0.3937, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5402882099151611, "rewards/margins": 4.826944828033447, "rewards/rejected": -3.286656618118286, "step": 760 }, { "epoch": 2.193571686529433, "grad_norm": 5.869570732116699, "learning_rate": 7.803468208092485e-07, "logits/chosen": -3.0531649589538574, "logits/rejected": -3.15552020072937, "logps/chosen": -38.28168487548828, "logps/rejected": -89.9604263305664, "loss": 0.4863, "rewards/accuracies": 0.875, "rewards/chosen": 0.7310044765472412, "rewards/margins": 4.8249101638793945, "rewards/rejected": -4.093905448913574, "step": 761 }, { "epoch": 2.196460816179126, "grad_norm": 5.263199329376221, "learning_rate": 7.80057803468208e-07, "logits/chosen": -2.974557876586914, "logits/rejected": -3.21832275390625, "logps/chosen": -35.47771072387695, "logps/rejected": -82.00511169433594, "loss": 0.4557, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2224147319793701, "rewards/margins": 4.6531662940979, "rewards/rejected": -3.430751323699951, "step": 762 }, { "epoch": 2.199349945828819, "grad_norm": 6.038288116455078, "learning_rate": 7.797687861271676e-07, "logits/chosen": -2.9662961959838867, "logits/rejected": -3.145958423614502, "logps/chosen": -34.38801956176758, "logps/rejected": -86.25798797607422, "loss": 0.4594, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0664496421813965, "rewards/margins": 4.984440803527832, "rewards/rejected": -3.9179909229278564, "step": 763 }, { "epoch": 2.202239075478512, "grad_norm": 5.992225646972656, "learning_rate": 7.79479768786127e-07, "logits/chosen": -2.979759931564331, "logits/rejected": -3.1423983573913574, "logps/chosen": -31.483701705932617, "logps/rejected": -79.21722412109375, "loss": 0.4792, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2094366550445557, "rewards/margins": 4.550536632537842, "rewards/rejected": -3.341099739074707, "step": 764 }, { "epoch": 2.2051282051282053, "grad_norm": 7.609710693359375, "learning_rate": 7.791907514450867e-07, "logits/chosen": -2.996736526489258, "logits/rejected": -3.2080800533294678, "logps/chosen": -36.41436767578125, "logps/rejected": -95.46406555175781, "loss": 0.4569, "rewards/accuracies": 1.0, "rewards/chosen": 0.9983922243118286, "rewards/margins": 5.648761749267578, "rewards/rejected": -4.650369167327881, "step": 765 }, { "epoch": 2.208017334777898, "grad_norm": 6.459946155548096, "learning_rate": 7.789017341040463e-07, "logits/chosen": -3.00062894821167, "logits/rejected": -3.1957814693450928, "logps/chosen": -38.86110305786133, "logps/rejected": -91.17980194091797, "loss": 0.4848, "rewards/accuracies": 0.875, "rewards/chosen": 0.8025227189064026, "rewards/margins": 5.151808738708496, "rewards/rejected": -4.349286079406738, "step": 766 }, { "epoch": 2.210906464427591, "grad_norm": 6.946235179901123, "learning_rate": 7.786127167630058e-07, "logits/chosen": -2.8983051776885986, "logits/rejected": -3.0979809761047363, "logps/chosen": -25.647905349731445, "logps/rejected": -78.35060119628906, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 2.332484006881714, "rewards/margins": 5.378349304199219, "rewards/rejected": -3.0458648204803467, "step": 767 }, { "epoch": 2.2137955940772844, "grad_norm": 6.894688606262207, "learning_rate": 7.783236994219653e-07, "logits/chosen": -3.0074312686920166, "logits/rejected": -3.1201834678649902, "logps/chosen": -30.097003936767578, "logps/rejected": -79.4720458984375, "loss": 0.5639, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5586533546447754, "rewards/margins": 4.557915687561035, "rewards/rejected": -2.9992618560791016, "step": 768 }, { "epoch": 2.2166847237269773, "grad_norm": 7.63638162612915, "learning_rate": 7.780346820809248e-07, "logits/chosen": -2.8955254554748535, "logits/rejected": -3.0645923614501953, "logps/chosen": -27.74175453186035, "logps/rejected": -79.56930541992188, "loss": 0.4676, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6939682960510254, "rewards/margins": 4.825389385223389, "rewards/rejected": -3.1314210891723633, "step": 769 }, { "epoch": 2.2195738533766702, "grad_norm": 5.851343154907227, "learning_rate": 7.777456647398844e-07, "logits/chosen": -2.9883506298065186, "logits/rejected": -3.2263402938842773, "logps/chosen": -31.09695816040039, "logps/rejected": -84.33662414550781, "loss": 0.4445, "rewards/accuracies": 1.0, "rewards/chosen": 1.5773981809616089, "rewards/margins": 5.232708930969238, "rewards/rejected": -3.655311346054077, "step": 770 }, { "epoch": 2.2195738533766702, "eval_logits/chosen": -3.066568374633789, "eval_logits/rejected": -3.2303764820098877, "eval_logps/chosen": -38.223304748535156, "eval_logps/rejected": -91.79434204101562, "eval_loss": 0.505393922328949, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.809311032295227, "eval_rewards/margins": 5.293727397918701, "eval_rewards/rejected": -4.4844160079956055, "eval_runtime": 226.8145, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.273, "step": 770 }, { "epoch": 2.222462983026363, "grad_norm": 4.347232818603516, "learning_rate": 7.774566473988438e-07, "logits/chosen": -3.03483510017395, "logits/rejected": -3.200991153717041, "logps/chosen": -37.116451263427734, "logps/rejected": -98.26908111572266, "loss": 0.502, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9575788974761963, "rewards/margins": 5.867281913757324, "rewards/rejected": -4.909702777862549, "step": 771 }, { "epoch": 2.2253521126760565, "grad_norm": 7.303581714630127, "learning_rate": 7.771676300578034e-07, "logits/chosen": -2.935122489929199, "logits/rejected": -3.079983711242676, "logps/chosen": -35.2635612487793, "logps/rejected": -81.37926483154297, "loss": 0.5662, "rewards/accuracies": 0.90625, "rewards/chosen": 1.200096607208252, "rewards/margins": 4.453435897827148, "rewards/rejected": -3.2533388137817383, "step": 772 }, { "epoch": 2.2282412423257494, "grad_norm": 6.188920974731445, "learning_rate": 7.768786127167629e-07, "logits/chosen": -3.0771355628967285, "logits/rejected": -3.1965951919555664, "logps/chosen": -35.09613800048828, "logps/rejected": -85.69706726074219, "loss": 0.4071, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1690007448196411, "rewards/margins": 5.1799211502075195, "rewards/rejected": -4.010921001434326, "step": 773 }, { "epoch": 2.2311303719754423, "grad_norm": 7.114441394805908, "learning_rate": 7.765895953757226e-07, "logits/chosen": -3.1681172847747803, "logits/rejected": -3.267181873321533, "logps/chosen": -33.05453109741211, "logps/rejected": -89.62509155273438, "loss": 0.464, "rewards/accuracies": 1.0, "rewards/chosen": 1.3872708082199097, "rewards/margins": 5.315178394317627, "rewards/rejected": -3.9279074668884277, "step": 774 }, { "epoch": 2.2340195016251356, "grad_norm": 8.644691467285156, "learning_rate": 7.76300578034682e-07, "logits/chosen": -3.135103225708008, "logits/rejected": -3.243813991546631, "logps/chosen": -37.61027526855469, "logps/rejected": -90.46954345703125, "loss": 0.4654, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5364813208580017, "rewards/margins": 5.123570919036865, "rewards/rejected": -4.587089538574219, "step": 775 }, { "epoch": 2.2369086312748285, "grad_norm": 6.39396858215332, "learning_rate": 7.760115606936416e-07, "logits/chosen": -3.097006320953369, "logits/rejected": -3.263004779815674, "logps/chosen": -35.686275482177734, "logps/rejected": -94.88957977294922, "loss": 0.4802, "rewards/accuracies": 0.96875, "rewards/chosen": 1.09142005443573, "rewards/margins": 5.690501689910889, "rewards/rejected": -4.599081993103027, "step": 776 }, { "epoch": 2.2397977609245214, "grad_norm": 6.9708027839660645, "learning_rate": 7.757225433526012e-07, "logits/chosen": -2.987311601638794, "logits/rejected": -3.1462559700012207, "logps/chosen": -48.99359893798828, "logps/rejected": -97.88687896728516, "loss": 0.5402, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1550697386264801, "rewards/margins": 4.854072570800781, "rewards/rejected": -5.0091423988342285, "step": 777 }, { "epoch": 2.2426868905742143, "grad_norm": 4.272688865661621, "learning_rate": 7.754335260115606e-07, "logits/chosen": -3.0230674743652344, "logits/rejected": -3.1947529315948486, "logps/chosen": -35.35139846801758, "logps/rejected": -97.27832794189453, "loss": 0.4748, "rewards/accuracies": 0.9375, "rewards/chosen": 0.952195942401886, "rewards/margins": 5.627264976501465, "rewards/rejected": -4.6750688552856445, "step": 778 }, { "epoch": 2.2455760202239077, "grad_norm": 6.408928871154785, "learning_rate": 7.751445086705202e-07, "logits/chosen": -3.1046154499053955, "logits/rejected": -3.214602470397949, "logps/chosen": -49.313072204589844, "logps/rejected": -110.81935119628906, "loss": 0.4877, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3137931525707245, "rewards/margins": 5.922170639038086, "rewards/rejected": -5.608377933502197, "step": 779 }, { "epoch": 2.2484651498736006, "grad_norm": 7.403957843780518, "learning_rate": 7.748554913294797e-07, "logits/chosen": -2.9401676654815674, "logits/rejected": -3.1281726360321045, "logps/chosen": -32.72539138793945, "logps/rejected": -92.10433959960938, "loss": 0.3887, "rewards/accuracies": 1.0, "rewards/chosen": 1.5498439073562622, "rewards/margins": 5.728662014007568, "rewards/rejected": -4.1788177490234375, "step": 780 }, { "epoch": 2.2513542795232935, "grad_norm": 8.931777000427246, "learning_rate": 7.745664739884392e-07, "logits/chosen": -3.0268261432647705, "logits/rejected": -3.166330337524414, "logps/chosen": -28.758102416992188, "logps/rejected": -83.6677474975586, "loss": 0.4306, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7027051448822021, "rewards/margins": 5.2276716232299805, "rewards/rejected": -3.5249667167663574, "step": 781 }, { "epoch": 2.254243409172987, "grad_norm": 6.639174461364746, "learning_rate": 7.742774566473988e-07, "logits/chosen": -2.9809226989746094, "logits/rejected": -3.1448235511779785, "logps/chosen": -38.04843521118164, "logps/rejected": -91.68146514892578, "loss": 0.4898, "rewards/accuracies": 0.90625, "rewards/chosen": 1.020439624786377, "rewards/margins": 5.088812351226807, "rewards/rejected": -4.06837272644043, "step": 782 }, { "epoch": 2.2571325388226797, "grad_norm": 7.130983829498291, "learning_rate": 7.739884393063584e-07, "logits/chosen": -3.023581027984619, "logits/rejected": -3.1610965728759766, "logps/chosen": -30.48259735107422, "logps/rejected": -77.28678894042969, "loss": 0.5076, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3372530937194824, "rewards/margins": 4.3780951499938965, "rewards/rejected": -3.040842294692993, "step": 783 }, { "epoch": 2.2600216684723726, "grad_norm": 5.848807334899902, "learning_rate": 7.736994219653179e-07, "logits/chosen": -3.032809019088745, "logits/rejected": -3.1873526573181152, "logps/chosen": -44.055545806884766, "logps/rejected": -95.42835235595703, "loss": 0.5975, "rewards/accuracies": 0.90625, "rewards/chosen": 0.24942216277122498, "rewards/margins": 4.799055099487305, "rewards/rejected": -4.549632549285889, "step": 784 }, { "epoch": 2.262910798122066, "grad_norm": 6.264895915985107, "learning_rate": 7.734104046242774e-07, "logits/chosen": -3.0102596282958984, "logits/rejected": -3.323775291442871, "logps/chosen": -25.998191833496094, "logps/rejected": -82.38720703125, "loss": 0.4435, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9932544231414795, "rewards/margins": 5.371459007263184, "rewards/rejected": -3.378204584121704, "step": 785 }, { "epoch": 2.265799927771759, "grad_norm": 7.492628574371338, "learning_rate": 7.731213872832369e-07, "logits/chosen": -2.996995449066162, "logits/rejected": -3.1946890354156494, "logps/chosen": -26.42367172241211, "logps/rejected": -80.76595306396484, "loss": 0.4441, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9842190742492676, "rewards/margins": 5.346397399902344, "rewards/rejected": -3.362178325653076, "step": 786 }, { "epoch": 2.2686890574214518, "grad_norm": 8.005013465881348, "learning_rate": 7.728323699421965e-07, "logits/chosen": -3.0832087993621826, "logits/rejected": -3.176790952682495, "logps/chosen": -43.19381332397461, "logps/rejected": -82.10325622558594, "loss": 0.6091, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3535003662109375, "rewards/margins": 3.866173028945923, "rewards/rejected": -3.512671947479248, "step": 787 }, { "epoch": 2.2715781870711447, "grad_norm": 7.64493989944458, "learning_rate": 7.72543352601156e-07, "logits/chosen": -2.99308180809021, "logits/rejected": -3.090984344482422, "logps/chosen": -41.106788635253906, "logps/rejected": -97.1788558959961, "loss": 0.5089, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5729259252548218, "rewards/margins": 5.5653276443481445, "rewards/rejected": -4.992402076721191, "step": 788 }, { "epoch": 2.274467316720838, "grad_norm": 7.523717403411865, "learning_rate": 7.722543352601155e-07, "logits/chosen": -3.0011284351348877, "logits/rejected": -3.0739030838012695, "logps/chosen": -36.60017395019531, "logps/rejected": -78.14659881591797, "loss": 0.5513, "rewards/accuracies": 0.9375, "rewards/chosen": 1.017719030380249, "rewards/margins": 4.016940116882324, "rewards/rejected": -2.999220848083496, "step": 789 }, { "epoch": 2.277356446370531, "grad_norm": 6.275721549987793, "learning_rate": 7.719653179190752e-07, "logits/chosen": -3.1132733821868896, "logits/rejected": -3.1432478427886963, "logps/chosen": -42.05107879638672, "logps/rejected": -93.02056121826172, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 0.5994644165039062, "rewards/margins": 4.9283671379089355, "rewards/rejected": -4.3289031982421875, "step": 790 }, { "epoch": 2.280245576020224, "grad_norm": 8.181063652038574, "learning_rate": 7.716763005780347e-07, "logits/chosen": -3.0504722595214844, "logits/rejected": -3.211282730102539, "logps/chosen": -37.07589340209961, "logps/rejected": -87.99382781982422, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 0.882259726524353, "rewards/margins": 4.810269355773926, "rewards/rejected": -3.9280097484588623, "step": 791 }, { "epoch": 2.2831347056699167, "grad_norm": 5.9081621170043945, "learning_rate": 7.713872832369942e-07, "logits/chosen": -3.0306217670440674, "logits/rejected": -3.098280906677246, "logps/chosen": -47.0698356628418, "logps/rejected": -103.3870849609375, "loss": 0.516, "rewards/accuracies": 0.96875, "rewards/chosen": 0.33869919180870056, "rewards/margins": 5.421341896057129, "rewards/rejected": -5.082642555236816, "step": 792 }, { "epoch": 2.28602383531961, "grad_norm": 5.702414035797119, "learning_rate": 7.710982658959537e-07, "logits/chosen": -3.0079498291015625, "logits/rejected": -3.21006178855896, "logps/chosen": -31.354537963867188, "logps/rejected": -84.71969604492188, "loss": 0.4582, "rewards/accuracies": 1.0, "rewards/chosen": 1.3502092361450195, "rewards/margins": 4.780604362487793, "rewards/rejected": -3.4303953647613525, "step": 793 }, { "epoch": 2.288912964969303, "grad_norm": 6.3059163093566895, "learning_rate": 7.708092485549133e-07, "logits/chosen": -3.096944808959961, "logits/rejected": -3.2997589111328125, "logps/chosen": -36.96029281616211, "logps/rejected": -87.50593566894531, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": 1.0584725141525269, "rewards/margins": 4.865203857421875, "rewards/rejected": -3.8067312240600586, "step": 794 }, { "epoch": 2.291802094618996, "grad_norm": 5.698260307312012, "learning_rate": 7.705202312138727e-07, "logits/chosen": -3.1583821773529053, "logits/rejected": -3.2506394386291504, "logps/chosen": -44.46715545654297, "logps/rejected": -96.68314361572266, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 0.7154088020324707, "rewards/margins": 5.243915557861328, "rewards/rejected": -4.528507232666016, "step": 795 }, { "epoch": 2.294691224268689, "grad_norm": 8.574338912963867, "learning_rate": 7.702312138728323e-07, "logits/chosen": -3.2771754264831543, "logits/rejected": -3.258988380432129, "logps/chosen": -40.85321807861328, "logps/rejected": -104.39664459228516, "loss": 0.5006, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3557373881340027, "rewards/margins": 5.810889720916748, "rewards/rejected": -5.45515251159668, "step": 796 }, { "epoch": 2.297580353918382, "grad_norm": 7.3108744621276855, "learning_rate": 7.699421965317918e-07, "logits/chosen": -3.0166680812835693, "logits/rejected": -3.196021318435669, "logps/chosen": -28.937414169311523, "logps/rejected": -82.22786712646484, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 1.6583797931671143, "rewards/margins": 5.254579544067383, "rewards/rejected": -3.5961999893188477, "step": 797 }, { "epoch": 2.300469483568075, "grad_norm": 6.545724391937256, "learning_rate": 7.696531791907515e-07, "logits/chosen": -3.0649125576019287, "logits/rejected": -3.2092676162719727, "logps/chosen": -43.378849029541016, "logps/rejected": -89.22132873535156, "loss": 0.5257, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5424851179122925, "rewards/margins": 4.768599033355713, "rewards/rejected": -4.226113796234131, "step": 798 }, { "epoch": 2.3033586132177684, "grad_norm": 6.6016740798950195, "learning_rate": 7.69364161849711e-07, "logits/chosen": -3.0500969886779785, "logits/rejected": -3.2156195640563965, "logps/chosen": -41.472869873046875, "logps/rejected": -102.07572937011719, "loss": 0.5204, "rewards/accuracies": 0.9375, "rewards/chosen": 0.48447251319885254, "rewards/margins": 5.631127834320068, "rewards/rejected": -5.1466546058654785, "step": 799 }, { "epoch": 2.3062477428674613, "grad_norm": 8.208829879760742, "learning_rate": 7.690751445086705e-07, "logits/chosen": -2.940166711807251, "logits/rejected": -3.1592373847961426, "logps/chosen": -38.33790588378906, "logps/rejected": -89.95974731445312, "loss": 0.4645, "rewards/accuracies": 1.0, "rewards/chosen": 0.7521089315414429, "rewards/margins": 5.095324516296387, "rewards/rejected": -4.3432159423828125, "step": 800 }, { "epoch": 2.309136872517154, "grad_norm": 6.4143805503845215, "learning_rate": 7.687861271676301e-07, "logits/chosen": -2.940607786178589, "logits/rejected": -3.2125680446624756, "logps/chosen": -38.343231201171875, "logps/rejected": -96.00888061523438, "loss": 0.5131, "rewards/accuracies": 1.0, "rewards/chosen": 0.7090911269187927, "rewards/margins": 5.286875247955322, "rewards/rejected": -4.577783584594727, "step": 801 }, { "epoch": 2.312026002166847, "grad_norm": 6.047307968139648, "learning_rate": 7.684971098265895e-07, "logits/chosen": -3.0859837532043457, "logits/rejected": -3.2449982166290283, "logps/chosen": -46.55959701538086, "logps/rejected": -96.82316589355469, "loss": 0.5079, "rewards/accuracies": 1.0, "rewards/chosen": 0.2807428538799286, "rewards/margins": 5.050294876098633, "rewards/rejected": -4.7695512771606445, "step": 802 }, { "epoch": 2.3149151318165404, "grad_norm": 7.343916893005371, "learning_rate": 7.682080924855491e-07, "logits/chosen": -2.9906187057495117, "logits/rejected": -3.2221360206604004, "logps/chosen": -33.97502899169922, "logps/rejected": -90.54178619384766, "loss": 0.5184, "rewards/accuracies": 0.96875, "rewards/chosen": 1.180734395980835, "rewards/margins": 5.5956130027771, "rewards/rejected": -4.4148783683776855, "step": 803 }, { "epoch": 2.3178042614662333, "grad_norm": 6.788966178894043, "learning_rate": 7.679190751445086e-07, "logits/chosen": -2.8883891105651855, "logits/rejected": -3.0573880672454834, "logps/chosen": -27.99823570251465, "logps/rejected": -74.80245971679688, "loss": 0.498, "rewards/accuracies": 0.96875, "rewards/chosen": 1.646669864654541, "rewards/margins": 4.482073783874512, "rewards/rejected": -2.835404396057129, "step": 804 }, { "epoch": 2.320693391115926, "grad_norm": 7.070558071136475, "learning_rate": 7.676300578034681e-07, "logits/chosen": -3.016228199005127, "logits/rejected": -3.181541919708252, "logps/chosen": -28.98084831237793, "logps/rejected": -81.80950164794922, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 1.6495656967163086, "rewards/margins": 4.759071350097656, "rewards/rejected": -3.1095054149627686, "step": 805 }, { "epoch": 2.323582520765619, "grad_norm": 8.175030708312988, "learning_rate": 7.673410404624277e-07, "logits/chosen": -3.041330099105835, "logits/rejected": -3.195037364959717, "logps/chosen": -33.32106399536133, "logps/rejected": -95.72648620605469, "loss": 0.4502, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0024263858795166, "rewards/margins": 5.701193332672119, "rewards/rejected": -4.698766708374023, "step": 806 }, { "epoch": 2.3264716504153125, "grad_norm": 7.011484622955322, "learning_rate": 7.670520231213873e-07, "logits/chosen": -3.046602964401245, "logits/rejected": -3.233116865158081, "logps/chosen": -31.564367294311523, "logps/rejected": -86.97819519042969, "loss": 0.4265, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2048630714416504, "rewards/margins": 5.016744613647461, "rewards/rejected": -3.8118815422058105, "step": 807 }, { "epoch": 2.3293607800650054, "grad_norm": 6.267205238342285, "learning_rate": 7.667630057803469e-07, "logits/chosen": -3.097764015197754, "logits/rejected": -3.2030959129333496, "logps/chosen": -44.64999771118164, "logps/rejected": -84.61982727050781, "loss": 0.6279, "rewards/accuracies": 0.96875, "rewards/chosen": 0.25144869089126587, "rewards/margins": 3.7365448474884033, "rewards/rejected": -3.4850964546203613, "step": 808 }, { "epoch": 2.3322499097146983, "grad_norm": 6.639354705810547, "learning_rate": 7.664739884393063e-07, "logits/chosen": -2.9878737926483154, "logits/rejected": -3.173110246658325, "logps/chosen": -36.914363861083984, "logps/rejected": -105.63969421386719, "loss": 0.3987, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9474353194236755, "rewards/margins": 6.522622585296631, "rewards/rejected": -5.575187683105469, "step": 809 }, { "epoch": 2.3351390393643916, "grad_norm": 7.372828960418701, "learning_rate": 7.661849710982659e-07, "logits/chosen": -3.0141100883483887, "logits/rejected": -3.2004106044769287, "logps/chosen": -37.553627014160156, "logps/rejected": -88.52012634277344, "loss": 0.5276, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9088641405105591, "rewards/margins": 4.905967712402344, "rewards/rejected": -3.997103452682495, "step": 810 }, { "epoch": 2.3380281690140845, "grad_norm": 6.901037693023682, "learning_rate": 7.658959537572254e-07, "logits/chosen": -2.987156391143799, "logits/rejected": -3.084721565246582, "logps/chosen": -32.385337829589844, "logps/rejected": -85.93006896972656, "loss": 0.4315, "rewards/accuracies": 0.9375, "rewards/chosen": 1.579974889755249, "rewards/margins": 5.168354511260986, "rewards/rejected": -3.588379383087158, "step": 811 }, { "epoch": 2.3409172986637774, "grad_norm": 6.721704959869385, "learning_rate": 7.656069364161849e-07, "logits/chosen": -2.975752830505371, "logits/rejected": -3.1244611740112305, "logps/chosen": -27.23630142211914, "logps/rejected": -86.8199462890625, "loss": 0.3971, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8905816078186035, "rewards/margins": 5.830223083496094, "rewards/rejected": -3.9396421909332275, "step": 812 }, { "epoch": 2.3438064283134707, "grad_norm": 4.501123905181885, "learning_rate": 7.653179190751444e-07, "logits/chosen": -2.964911460876465, "logits/rejected": -3.0837955474853516, "logps/chosen": -40.86470413208008, "logps/rejected": -94.62437438964844, "loss": 0.5472, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5712671875953674, "rewards/margins": 4.970544338226318, "rewards/rejected": -4.399276256561279, "step": 813 }, { "epoch": 2.3466955579631636, "grad_norm": 4.529665470123291, "learning_rate": 7.650289017341041e-07, "logits/chosen": -3.132183313369751, "logits/rejected": -3.328632116317749, "logps/chosen": -27.2799015045166, "logps/rejected": -84.81092834472656, "loss": 0.4367, "rewards/accuracies": 0.96875, "rewards/chosen": 1.642725944519043, "rewards/margins": 5.253782749176025, "rewards/rejected": -3.6110568046569824, "step": 814 }, { "epoch": 2.3495846876128565, "grad_norm": 7.134300708770752, "learning_rate": 7.647398843930636e-07, "logits/chosen": -2.9136757850646973, "logits/rejected": -3.139591932296753, "logps/chosen": -29.22779083251953, "logps/rejected": -80.0166015625, "loss": 0.4496, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4816604852676392, "rewards/margins": 4.615758419036865, "rewards/rejected": -3.1340980529785156, "step": 815 }, { "epoch": 2.35247381726255, "grad_norm": 5.89410924911499, "learning_rate": 7.644508670520231e-07, "logits/chosen": -2.905600070953369, "logits/rejected": -3.091099739074707, "logps/chosen": -29.745241165161133, "logps/rejected": -88.06546020507812, "loss": 0.3713, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9216837882995605, "rewards/margins": 5.929923057556152, "rewards/rejected": -4.00823974609375, "step": 816 }, { "epoch": 2.355362946912243, "grad_norm": 7.221127510070801, "learning_rate": 7.641618497109826e-07, "logits/chosen": -3.0908708572387695, "logits/rejected": -3.1592440605163574, "logps/chosen": -34.612953186035156, "logps/rejected": -83.13213348388672, "loss": 0.5032, "rewards/accuracies": 0.875, "rewards/chosen": 0.7000576257705688, "rewards/margins": 4.4678053855896, "rewards/rejected": -3.767747640609741, "step": 817 }, { "epoch": 2.3582520765619357, "grad_norm": 5.887415885925293, "learning_rate": 7.638728323699422e-07, "logits/chosen": -2.9175758361816406, "logits/rejected": -3.1340646743774414, "logps/chosen": -32.8663444519043, "logps/rejected": -94.86422729492188, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 1.5272717475891113, "rewards/margins": 5.905208587646484, "rewards/rejected": -4.377937316894531, "step": 818 }, { "epoch": 2.3611412062116286, "grad_norm": 5.598973751068115, "learning_rate": 7.635838150289016e-07, "logits/chosen": -3.0662598609924316, "logits/rejected": -3.216427803039551, "logps/chosen": -26.407453536987305, "logps/rejected": -93.0563735961914, "loss": 0.3308, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7809123992919922, "rewards/margins": 6.20606803894043, "rewards/rejected": -4.4251556396484375, "step": 819 }, { "epoch": 2.364030335861322, "grad_norm": 6.225068092346191, "learning_rate": 7.632947976878612e-07, "logits/chosen": -3.0548298358917236, "logits/rejected": -3.1988346576690674, "logps/chosen": -37.065711975097656, "logps/rejected": -89.3818130493164, "loss": 0.5043, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7025814056396484, "rewards/margins": 4.894538879394531, "rewards/rejected": -4.191956996917725, "step": 820 }, { "epoch": 2.366919465511015, "grad_norm": 6.186283111572266, "learning_rate": 7.630057803468208e-07, "logits/chosen": -2.9642088413238525, "logits/rejected": -3.1023755073547363, "logps/chosen": -32.05230712890625, "logps/rejected": -81.84619903564453, "loss": 0.4614, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3466625213623047, "rewards/margins": 4.91215705871582, "rewards/rejected": -3.5654947757720947, "step": 821 }, { "epoch": 2.3698085951607077, "grad_norm": 4.814180850982666, "learning_rate": 7.627167630057803e-07, "logits/chosen": -2.936882972717285, "logits/rejected": -3.1092722415924072, "logps/chosen": -34.91107940673828, "logps/rejected": -98.10433959960938, "loss": 0.4218, "rewards/accuracies": 0.96875, "rewards/chosen": 1.19455087184906, "rewards/margins": 6.039534568786621, "rewards/rejected": -4.84498405456543, "step": 822 }, { "epoch": 2.3726977248104006, "grad_norm": 7.174637317657471, "learning_rate": 7.624277456647399e-07, "logits/chosen": -3.0918045043945312, "logits/rejected": -3.2717444896698, "logps/chosen": -35.84741973876953, "logps/rejected": -94.99324798583984, "loss": 0.4262, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8450899720191956, "rewards/margins": 5.577546119689941, "rewards/rejected": -4.732455730438232, "step": 823 }, { "epoch": 2.375586854460094, "grad_norm": 7.917307376861572, "learning_rate": 7.621387283236994e-07, "logits/chosen": -2.941373348236084, "logits/rejected": -3.11293363571167, "logps/chosen": -39.46183776855469, "logps/rejected": -79.01727294921875, "loss": 0.5695, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5445269346237183, "rewards/margins": 3.706601142883301, "rewards/rejected": -3.162073850631714, "step": 824 }, { "epoch": 2.378475984109787, "grad_norm": 6.020567893981934, "learning_rate": 7.61849710982659e-07, "logits/chosen": -2.9569859504699707, "logits/rejected": -3.0975184440612793, "logps/chosen": -29.867177963256836, "logps/rejected": -91.34683227539062, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 1.4826176166534424, "rewards/margins": 5.585046768188477, "rewards/rejected": -4.102428913116455, "step": 825 }, { "epoch": 2.38136511375948, "grad_norm": 5.7849650382995605, "learning_rate": 7.615606936416184e-07, "logits/chosen": -2.9641506671905518, "logits/rejected": -3.1256136894226074, "logps/chosen": -36.09562683105469, "logps/rejected": -96.79310607910156, "loss": 0.3965, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1034834384918213, "rewards/margins": 5.840942859649658, "rewards/rejected": -4.737459182739258, "step": 826 }, { "epoch": 2.384254243409173, "grad_norm": 6.68578577041626, "learning_rate": 7.61271676300578e-07, "logits/chosen": -3.0796074867248535, "logits/rejected": -3.2023701667785645, "logps/chosen": -29.642845153808594, "logps/rejected": -87.78987121582031, "loss": 0.43, "rewards/accuracies": 1.0, "rewards/chosen": 1.3011319637298584, "rewards/margins": 5.375452041625977, "rewards/rejected": -4.074320316314697, "step": 827 }, { "epoch": 2.387143373058866, "grad_norm": 5.452661514282227, "learning_rate": 7.609826589595375e-07, "logits/chosen": -3.129819393157959, "logits/rejected": -3.30415678024292, "logps/chosen": -39.32756423950195, "logps/rejected": -74.89128112792969, "loss": 0.5172, "rewards/accuracies": 0.875, "rewards/chosen": 0.9013874530792236, "rewards/margins": 3.562549114227295, "rewards/rejected": -2.6611618995666504, "step": 828 }, { "epoch": 2.390032502708559, "grad_norm": 6.487278938293457, "learning_rate": 7.60693641618497e-07, "logits/chosen": -2.9855823516845703, "logits/rejected": -3.15129017829895, "logps/chosen": -32.87815856933594, "logps/rejected": -87.20403289794922, "loss": 0.4394, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2469563484191895, "rewards/margins": 4.918574810028076, "rewards/rejected": -3.6716182231903076, "step": 829 }, { "epoch": 2.3929216323582523, "grad_norm": 5.193930625915527, "learning_rate": 7.604046242774567e-07, "logits/chosen": -2.8993072509765625, "logits/rejected": -3.0079991817474365, "logps/chosen": -34.229461669921875, "logps/rejected": -86.09512329101562, "loss": 0.468, "rewards/accuracies": 1.0, "rewards/chosen": 1.4917597770690918, "rewards/margins": 5.119998455047607, "rewards/rejected": -3.6282386779785156, "step": 830 }, { "epoch": 2.395810762007945, "grad_norm": 7.401700973510742, "learning_rate": 7.601156069364162e-07, "logits/chosen": -3.0222105979919434, "logits/rejected": -3.239767551422119, "logps/chosen": -32.67233657836914, "logps/rejected": -74.3736801147461, "loss": 0.481, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0905117988586426, "rewards/margins": 3.926856517791748, "rewards/rejected": -2.8363442420959473, "step": 831 }, { "epoch": 2.398699891657638, "grad_norm": 8.126103401184082, "learning_rate": 7.598265895953758e-07, "logits/chosen": -2.9177041053771973, "logits/rejected": -3.098245620727539, "logps/chosen": -38.60244369506836, "logps/rejected": -84.1904296875, "loss": 0.5596, "rewards/accuracies": 0.9375, "rewards/chosen": 0.856947660446167, "rewards/margins": 4.452012062072754, "rewards/rejected": -3.595064163208008, "step": 832 }, { "epoch": 2.401589021307331, "grad_norm": 9.14980411529541, "learning_rate": 7.595375722543352e-07, "logits/chosen": -2.9506616592407227, "logits/rejected": -3.0924184322357178, "logps/chosen": -34.69971466064453, "logps/rejected": -76.2582778930664, "loss": 0.5155, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0708163976669312, "rewards/margins": 4.037282943725586, "rewards/rejected": -2.9664664268493652, "step": 833 }, { "epoch": 2.4044781509570243, "grad_norm": 6.029335975646973, "learning_rate": 7.592485549132948e-07, "logits/chosen": -3.0796608924865723, "logits/rejected": -3.2183709144592285, "logps/chosen": -30.39975357055664, "logps/rejected": -93.23634338378906, "loss": 0.3869, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3428677320480347, "rewards/margins": 5.808917999267578, "rewards/rejected": -4.466050624847412, "step": 834 }, { "epoch": 2.4073672806067172, "grad_norm": 8.773011207580566, "learning_rate": 7.589595375722543e-07, "logits/chosen": -3.111551523208618, "logits/rejected": -3.3059775829315186, "logps/chosen": -27.902420043945312, "logps/rejected": -80.49171447753906, "loss": 0.3454, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6989786624908447, "rewards/margins": 5.16819429397583, "rewards/rejected": -3.4692153930664062, "step": 835 }, { "epoch": 2.41025641025641, "grad_norm": 8.87929916381836, "learning_rate": 7.586705202312138e-07, "logits/chosen": -3.132289409637451, "logits/rejected": -3.2416157722473145, "logps/chosen": -31.178340911865234, "logps/rejected": -85.34992980957031, "loss": 0.4266, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5261144638061523, "rewards/margins": 5.3076043128967285, "rewards/rejected": -3.781489849090576, "step": 836 }, { "epoch": 2.4131455399061035, "grad_norm": 7.201249599456787, "learning_rate": 7.583815028901733e-07, "logits/chosen": -2.9555771350860596, "logits/rejected": -3.1617817878723145, "logps/chosen": -23.28835105895996, "logps/rejected": -81.29730987548828, "loss": 0.3526, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0464699268341064, "rewards/margins": 5.325154781341553, "rewards/rejected": -3.2786850929260254, "step": 837 }, { "epoch": 2.4160346695557964, "grad_norm": 6.742241382598877, "learning_rate": 7.58092485549133e-07, "logits/chosen": -2.902555227279663, "logits/rejected": -3.0926918983459473, "logps/chosen": -32.82780456542969, "logps/rejected": -88.47476196289062, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.4754236936569214, "rewards/margins": 5.467804908752441, "rewards/rejected": -3.9923815727233887, "step": 838 }, { "epoch": 2.4189237992054893, "grad_norm": 6.6811347007751465, "learning_rate": 7.578034682080924e-07, "logits/chosen": -3.107694625854492, "logits/rejected": -3.2547342777252197, "logps/chosen": -37.35139846801758, "logps/rejected": -93.58861541748047, "loss": 0.5402, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7497139573097229, "rewards/margins": 5.3141961097717285, "rewards/rejected": -4.564481735229492, "step": 839 }, { "epoch": 2.421812928855182, "grad_norm": 7.937631607055664, "learning_rate": 7.57514450867052e-07, "logits/chosen": -2.991267681121826, "logits/rejected": -3.1466572284698486, "logps/chosen": -30.504446029663086, "logps/rejected": -79.99848175048828, "loss": 0.4384, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2989815473556519, "rewards/margins": 4.644842147827148, "rewards/rejected": -3.3458609580993652, "step": 840 }, { "epoch": 2.421812928855182, "eval_logits/chosen": -3.0614097118377686, "eval_logits/rejected": -3.2352702617645264, "eval_logps/chosen": -38.86855697631836, "eval_logps/rejected": -94.59771728515625, "eval_loss": 0.5023149847984314, "eval_rewards/accuracies": 0.9274193644523621, "eval_rewards/chosen": 0.7447859644889832, "eval_rewards/margins": 5.50954008102417, "eval_rewards/rejected": -4.764754295349121, "eval_runtime": 224.3963, "eval_samples_per_second": 0.548, "eval_steps_per_second": 0.276, "step": 840 }, { "epoch": 2.4247020585048755, "grad_norm": 7.77288818359375, "learning_rate": 7.572254335260116e-07, "logits/chosen": -3.049692153930664, "logits/rejected": -3.2015957832336426, "logps/chosen": -32.96240997314453, "logps/rejected": -91.47537994384766, "loss": 0.3781, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4886354207992554, "rewards/margins": 5.632765293121338, "rewards/rejected": -4.144129753112793, "step": 841 }, { "epoch": 2.4275911881545684, "grad_norm": 8.20938491821289, "learning_rate": 7.569364161849711e-07, "logits/chosen": -2.966094970703125, "logits/rejected": -3.238837718963623, "logps/chosen": -33.17336654663086, "logps/rejected": -94.17745208740234, "loss": 0.3774, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3925448656082153, "rewards/margins": 5.909165382385254, "rewards/rejected": -4.516619682312012, "step": 842 }, { "epoch": 2.4304803178042613, "grad_norm": 7.6978840827941895, "learning_rate": 7.566473988439306e-07, "logits/chosen": -2.9130289554595947, "logits/rejected": -3.072643756866455, "logps/chosen": -29.302780151367188, "logps/rejected": -91.85082244873047, "loss": 0.4218, "rewards/accuracies": 1.0, "rewards/chosen": 1.469096064567566, "rewards/margins": 5.744505882263184, "rewards/rejected": -4.27540922164917, "step": 843 }, { "epoch": 2.4333694474539547, "grad_norm": 7.976936340332031, "learning_rate": 7.563583815028901e-07, "logits/chosen": -2.953439712524414, "logits/rejected": -3.1004862785339355, "logps/chosen": -40.891910552978516, "logps/rejected": -88.50614929199219, "loss": 0.5252, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5159652829170227, "rewards/margins": 4.693696975708008, "rewards/rejected": -4.177731513977051, "step": 844 }, { "epoch": 2.4362585771036476, "grad_norm": 6.104488849639893, "learning_rate": 7.560693641618497e-07, "logits/chosen": -3.0211939811706543, "logits/rejected": -3.185293197631836, "logps/chosen": -34.4317626953125, "logps/rejected": -83.56092834472656, "loss": 0.4472, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2557460069656372, "rewards/margins": 4.939545631408691, "rewards/rejected": -3.683799982070923, "step": 845 }, { "epoch": 2.4391477067533405, "grad_norm": 7.729788780212402, "learning_rate": 7.557803468208092e-07, "logits/chosen": -3.016409397125244, "logits/rejected": -3.204678535461426, "logps/chosen": -31.374134063720703, "logps/rejected": -80.90098571777344, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.2851327657699585, "rewards/margins": 4.710079669952393, "rewards/rejected": -3.4249470233917236, "step": 846 }, { "epoch": 2.442036836403034, "grad_norm": 5.847208499908447, "learning_rate": 7.554913294797688e-07, "logits/chosen": -2.9102182388305664, "logits/rejected": -3.091501474380493, "logps/chosen": -28.456302642822266, "logps/rejected": -83.51007080078125, "loss": 0.4102, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9334810972213745, "rewards/margins": 5.576583385467529, "rewards/rejected": -3.643101692199707, "step": 847 }, { "epoch": 2.4449259660527267, "grad_norm": 7.00287389755249, "learning_rate": 7.552023121387283e-07, "logits/chosen": -2.962043046951294, "logits/rejected": -3.0959668159484863, "logps/chosen": -40.826290130615234, "logps/rejected": -105.61198425292969, "loss": 0.432, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3164115250110626, "rewards/margins": 5.966396331787109, "rewards/rejected": -5.649984359741211, "step": 848 }, { "epoch": 2.4478150957024196, "grad_norm": 7.5802998542785645, "learning_rate": 7.549132947976879e-07, "logits/chosen": -2.983426570892334, "logits/rejected": -3.2523860931396484, "logps/chosen": -30.029224395751953, "logps/rejected": -93.26683807373047, "loss": 0.3991, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3748286962509155, "rewards/margins": 6.090542316436768, "rewards/rejected": -4.7157135009765625, "step": 849 }, { "epoch": 2.4507042253521125, "grad_norm": 7.552842617034912, "learning_rate": 7.546242774566473e-07, "logits/chosen": -2.9618887901306152, "logits/rejected": -3.15394926071167, "logps/chosen": -38.86567687988281, "logps/rejected": -95.00456237792969, "loss": 0.5142, "rewards/accuracies": 1.0, "rewards/chosen": 0.8416578769683838, "rewards/margins": 5.654567718505859, "rewards/rejected": -4.8129096031188965, "step": 850 }, { "epoch": 2.453593355001806, "grad_norm": 6.922358512878418, "learning_rate": 7.543352601156069e-07, "logits/chosen": -2.920952796936035, "logits/rejected": -3.1943001747131348, "logps/chosen": -35.84394073486328, "logps/rejected": -99.09619903564453, "loss": 0.4823, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2769191265106201, "rewards/margins": 6.231209754943848, "rewards/rejected": -4.95428991317749, "step": 851 }, { "epoch": 2.4564824846514988, "grad_norm": 6.5385355949401855, "learning_rate": 7.540462427745665e-07, "logits/chosen": -3.071357011795044, "logits/rejected": -3.30588960647583, "logps/chosen": -45.13351058959961, "logps/rejected": -101.447265625, "loss": 0.5353, "rewards/accuracies": 1.0, "rewards/chosen": 0.09036006033420563, "rewards/margins": 5.442699909210205, "rewards/rejected": -5.352339744567871, "step": 852 }, { "epoch": 2.4593716143011917, "grad_norm": 4.983462810516357, "learning_rate": 7.537572254335259e-07, "logits/chosen": -2.9273836612701416, "logits/rejected": -3.124185800552368, "logps/chosen": -35.10961151123047, "logps/rejected": -96.78028869628906, "loss": 0.4308, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2506301403045654, "rewards/margins": 5.957820415496826, "rewards/rejected": -4.707190990447998, "step": 853 }, { "epoch": 2.4622607439508846, "grad_norm": 6.931430339813232, "learning_rate": 7.534682080924856e-07, "logits/chosen": -2.9788265228271484, "logits/rejected": -3.1387717723846436, "logps/chosen": -48.61039352416992, "logps/rejected": -88.881591796875, "loss": 0.5906, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01696734130382538, "rewards/margins": 4.0932793617248535, "rewards/rejected": -4.110246658325195, "step": 854 }, { "epoch": 2.465149873600578, "grad_norm": 5.810131549835205, "learning_rate": 7.531791907514451e-07, "logits/chosen": -3.0973496437072754, "logits/rejected": -3.262392520904541, "logps/chosen": -28.906579971313477, "logps/rejected": -85.8479232788086, "loss": 0.4094, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6767513751983643, "rewards/margins": 5.418859004974365, "rewards/rejected": -3.7421071529388428, "step": 855 }, { "epoch": 2.468039003250271, "grad_norm": 8.261255264282227, "learning_rate": 7.528901734104046e-07, "logits/chosen": -3.098083734512329, "logits/rejected": -3.2704524993896484, "logps/chosen": -36.108211517333984, "logps/rejected": -98.29535675048828, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 1.0484505891799927, "rewards/margins": 6.094175338745117, "rewards/rejected": -5.045724868774414, "step": 856 }, { "epoch": 2.4709281328999637, "grad_norm": 5.502718925476074, "learning_rate": 7.526011560693641e-07, "logits/chosen": -2.9024386405944824, "logits/rejected": -3.1464173793792725, "logps/chosen": -38.30472183227539, "logps/rejected": -101.50714874267578, "loss": 0.4549, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5906434655189514, "rewards/margins": 5.915807247161865, "rewards/rejected": -5.325163841247559, "step": 857 }, { "epoch": 2.473817262549657, "grad_norm": 5.411170482635498, "learning_rate": 7.523121387283237e-07, "logits/chosen": -3.0769598484039307, "logits/rejected": -3.2258801460266113, "logps/chosen": -32.44417190551758, "logps/rejected": -91.98713684082031, "loss": 0.3612, "rewards/accuracies": 1.0, "rewards/chosen": 1.2585155963897705, "rewards/margins": 5.655717849731445, "rewards/rejected": -4.3972015380859375, "step": 858 }, { "epoch": 2.47670639219935, "grad_norm": 6.67277193069458, "learning_rate": 7.520231213872832e-07, "logits/chosen": -3.0435056686401367, "logits/rejected": -3.1465089321136475, "logps/chosen": -39.77975082397461, "logps/rejected": -94.12092590332031, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 0.5333652496337891, "rewards/margins": 5.115746974945068, "rewards/rejected": -4.582381248474121, "step": 859 }, { "epoch": 2.479595521849043, "grad_norm": 8.250070571899414, "learning_rate": 7.517341040462427e-07, "logits/chosen": -3.032805919647217, "logits/rejected": -3.2224361896514893, "logps/chosen": -33.58226776123047, "logps/rejected": -99.63184356689453, "loss": 0.4883, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3412028551101685, "rewards/margins": 6.2983293533325195, "rewards/rejected": -4.957126617431641, "step": 860 }, { "epoch": 2.482484651498736, "grad_norm": 4.67252779006958, "learning_rate": 7.514450867052022e-07, "logits/chosen": -2.9329068660736084, "logits/rejected": -3.1507186889648438, "logps/chosen": -44.91040802001953, "logps/rejected": -96.8740005493164, "loss": 0.5282, "rewards/accuracies": 0.96875, "rewards/chosen": 0.27793455123901367, "rewards/margins": 5.02626371383667, "rewards/rejected": -4.748328685760498, "step": 861 }, { "epoch": 2.485373781148429, "grad_norm": 7.803878307342529, "learning_rate": 7.511560693641619e-07, "logits/chosen": -3.068413496017456, "logits/rejected": -3.1656837463378906, "logps/chosen": -32.77662658691406, "logps/rejected": -92.56845092773438, "loss": 0.4244, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2080236673355103, "rewards/margins": 5.905834197998047, "rewards/rejected": -4.697810649871826, "step": 862 }, { "epoch": 2.488262910798122, "grad_norm": 11.134756088256836, "learning_rate": 7.508670520231214e-07, "logits/chosen": -3.0792958736419678, "logits/rejected": -3.275721788406372, "logps/chosen": -43.81731414794922, "logps/rejected": -98.28813171386719, "loss": 0.5172, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07868987321853638, "rewards/margins": 5.2611308097839355, "rewards/rejected": -5.182440757751465, "step": 863 }, { "epoch": 2.491152040447815, "grad_norm": 6.394914627075195, "learning_rate": 7.505780346820809e-07, "logits/chosen": -3.0766375064849854, "logits/rejected": -3.240605354309082, "logps/chosen": -30.833166122436523, "logps/rejected": -93.48429107666016, "loss": 0.4202, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3862171173095703, "rewards/margins": 6.387482643127441, "rewards/rejected": -5.001265525817871, "step": 864 }, { "epoch": 2.4940411700975083, "grad_norm": 7.376036643981934, "learning_rate": 7.502890173410405e-07, "logits/chosen": -3.0410094261169434, "logits/rejected": -3.195709228515625, "logps/chosen": -32.753597259521484, "logps/rejected": -90.095947265625, "loss": 0.4439, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3445658683776855, "rewards/margins": 5.331656455993652, "rewards/rejected": -3.987090587615967, "step": 865 }, { "epoch": 2.496930299747201, "grad_norm": 6.7352471351623535, "learning_rate": 7.5e-07, "logits/chosen": -2.9641425609588623, "logits/rejected": -3.139408588409424, "logps/chosen": -25.071075439453125, "logps/rejected": -83.45817565917969, "loss": 0.389, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9181733131408691, "rewards/margins": 5.433215618133545, "rewards/rejected": -3.5150420665740967, "step": 866 }, { "epoch": 2.499819429396894, "grad_norm": 6.544764041900635, "learning_rate": 7.497109826589595e-07, "logits/chosen": -3.0042624473571777, "logits/rejected": -3.143125534057617, "logps/chosen": -37.647945404052734, "logps/rejected": -84.1249771118164, "loss": 0.5652, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9617373943328857, "rewards/margins": 4.606240749359131, "rewards/rejected": -3.644503355026245, "step": 867 }, { "epoch": 2.502708559046587, "grad_norm": 5.785152435302734, "learning_rate": 7.49421965317919e-07, "logits/chosen": -3.040043592453003, "logits/rejected": -3.1901512145996094, "logps/chosen": -34.42786407470703, "logps/rejected": -85.98779296875, "loss": 0.5093, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8931098580360413, "rewards/margins": 4.708312511444092, "rewards/rejected": -3.8152029514312744, "step": 868 }, { "epoch": 2.5055976886962803, "grad_norm": 4.821086883544922, "learning_rate": 7.491329479768786e-07, "logits/chosen": -2.9510297775268555, "logits/rejected": -3.1520142555236816, "logps/chosen": -38.34654235839844, "logps/rejected": -93.72547912597656, "loss": 0.4662, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5713947415351868, "rewards/margins": 5.172936916351318, "rewards/rejected": -4.601541996002197, "step": 869 }, { "epoch": 2.508486818345973, "grad_norm": 6.825074195861816, "learning_rate": 7.48843930635838e-07, "logits/chosen": -3.07829213142395, "logits/rejected": -3.3285577297210693, "logps/chosen": -36.16766357421875, "logps/rejected": -87.69561767578125, "loss": 0.4441, "rewards/accuracies": 1.0, "rewards/chosen": 1.2031182050704956, "rewards/margins": 5.1580047607421875, "rewards/rejected": -3.954885959625244, "step": 870 }, { "epoch": 2.511375947995666, "grad_norm": 5.96443510055542, "learning_rate": 7.485549132947977e-07, "logits/chosen": -3.0493099689483643, "logits/rejected": -3.2029292583465576, "logps/chosen": -34.24468994140625, "logps/rejected": -88.63258361816406, "loss": 0.4366, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1131682395935059, "rewards/margins": 4.952602386474609, "rewards/rejected": -3.8394346237182617, "step": 871 }, { "epoch": 2.5142650776453594, "grad_norm": 6.724757194519043, "learning_rate": 7.482658959537572e-07, "logits/chosen": -3.100111961364746, "logits/rejected": -3.263580322265625, "logps/chosen": -46.71084976196289, "logps/rejected": -84.39302062988281, "loss": 0.6379, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1595587134361267, "rewards/margins": 3.3885154724121094, "rewards/rejected": -3.548074722290039, "step": 872 }, { "epoch": 2.5171542072950523, "grad_norm": 5.363777160644531, "learning_rate": 7.479768786127167e-07, "logits/chosen": -3.0510804653167725, "logits/rejected": -3.097675085067749, "logps/chosen": -38.13088607788086, "logps/rejected": -89.19412231445312, "loss": 0.4922, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5913410782814026, "rewards/margins": 4.684396266937256, "rewards/rejected": -4.093055725097656, "step": 873 }, { "epoch": 2.5200433369447452, "grad_norm": 6.673641681671143, "learning_rate": 7.476878612716763e-07, "logits/chosen": -3.0155556201934814, "logits/rejected": -3.1477997303009033, "logps/chosen": -26.61675262451172, "logps/rejected": -86.91303253173828, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.7613855600357056, "rewards/margins": 5.635967254638672, "rewards/rejected": -3.8745815753936768, "step": 874 }, { "epoch": 2.5229324665944386, "grad_norm": 8.160761833190918, "learning_rate": 7.473988439306358e-07, "logits/chosen": -2.9727609157562256, "logits/rejected": -3.14410662651062, "logps/chosen": -26.901676177978516, "logps/rejected": -71.54906463623047, "loss": 0.5546, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7682528495788574, "rewards/margins": 4.299082279205322, "rewards/rejected": -2.5308291912078857, "step": 875 }, { "epoch": 2.5258215962441315, "grad_norm": 8.090862274169922, "learning_rate": 7.471098265895954e-07, "logits/chosen": -2.9901933670043945, "logits/rejected": -3.2562084197998047, "logps/chosen": -30.401918411254883, "logps/rejected": -83.10142517089844, "loss": 0.4349, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5288134813308716, "rewards/margins": 5.116360664367676, "rewards/rejected": -3.5875470638275146, "step": 876 }, { "epoch": 2.5287107258938244, "grad_norm": 7.03707218170166, "learning_rate": 7.468208092485548e-07, "logits/chosen": -2.9930458068847656, "logits/rejected": -3.1309421062469482, "logps/chosen": -31.94817352294922, "logps/rejected": -78.7020263671875, "loss": 0.4596, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4077305793762207, "rewards/margins": 4.402027130126953, "rewards/rejected": -2.9942965507507324, "step": 877 }, { "epoch": 2.5315998555435177, "grad_norm": 6.351861953735352, "learning_rate": 7.465317919075144e-07, "logits/chosen": -3.0085694789886475, "logits/rejected": -3.169839382171631, "logps/chosen": -37.61924362182617, "logps/rejected": -92.47894287109375, "loss": 0.4808, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7363203763961792, "rewards/margins": 5.093493461608887, "rewards/rejected": -4.357172966003418, "step": 878 }, { "epoch": 2.5344889851932106, "grad_norm": 6.368489742279053, "learning_rate": 7.46242774566474e-07, "logits/chosen": -2.991530418395996, "logits/rejected": -3.1654279232025146, "logps/chosen": -30.972766876220703, "logps/rejected": -92.5012435913086, "loss": 0.4127, "rewards/accuracies": 1.0, "rewards/chosen": 1.6568914651870728, "rewards/margins": 6.059305191040039, "rewards/rejected": -4.402413845062256, "step": 879 }, { "epoch": 2.5373781148429035, "grad_norm": 8.13575267791748, "learning_rate": 7.459537572254335e-07, "logits/chosen": -2.932849645614624, "logits/rejected": -3.092749834060669, "logps/chosen": -38.68522262573242, "logps/rejected": -84.8893814086914, "loss": 0.5301, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7995508909225464, "rewards/margins": 4.430875778198242, "rewards/rejected": -3.6313250064849854, "step": 880 }, { "epoch": 2.5402672444925964, "grad_norm": 8.155953407287598, "learning_rate": 7.45664739884393e-07, "logits/chosen": -3.172050952911377, "logits/rejected": -3.2574079036712646, "logps/chosen": -31.50913429260254, "logps/rejected": -94.52819061279297, "loss": 0.4317, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3568016290664673, "rewards/margins": 5.9141340255737305, "rewards/rejected": -4.557332515716553, "step": 881 }, { "epoch": 2.54315637414229, "grad_norm": 4.844716548919678, "learning_rate": 7.453757225433526e-07, "logits/chosen": -3.1063144207000732, "logits/rejected": -3.232449769973755, "logps/chosen": -37.931480407714844, "logps/rejected": -87.39840698242188, "loss": 0.5713, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5260922908782959, "rewards/margins": 4.593554496765137, "rewards/rejected": -4.06746244430542, "step": 882 }, { "epoch": 2.5460455037919827, "grad_norm": 6.922062397003174, "learning_rate": 7.45086705202312e-07, "logits/chosen": -3.040834665298462, "logits/rejected": -3.110229969024658, "logps/chosen": -42.25288391113281, "logps/rejected": -92.7133560180664, "loss": 0.5326, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4680430293083191, "rewards/margins": 4.65861177444458, "rewards/rejected": -4.190568447113037, "step": 883 }, { "epoch": 2.5489346334416756, "grad_norm": 6.507717609405518, "learning_rate": 7.447976878612716e-07, "logits/chosen": -2.9204554557800293, "logits/rejected": -3.072535991668701, "logps/chosen": -35.09549331665039, "logps/rejected": -77.3082046508789, "loss": 0.5186, "rewards/accuracies": 0.9375, "rewards/chosen": 1.218951940536499, "rewards/margins": 4.104152202606201, "rewards/rejected": -2.885200262069702, "step": 884 }, { "epoch": 2.5518237630913685, "grad_norm": 6.753575325012207, "learning_rate": 7.445086705202312e-07, "logits/chosen": -3.044602155685425, "logits/rejected": -3.186495780944824, "logps/chosen": -26.03386688232422, "logps/rejected": -91.18244171142578, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 1.9086493253707886, "rewards/margins": 6.120146751403809, "rewards/rejected": -4.2114973068237305, "step": 885 }, { "epoch": 2.554712892741062, "grad_norm": 4.358463764190674, "learning_rate": 7.442196531791907e-07, "logits/chosen": -2.820258855819702, "logits/rejected": -3.0584816932678223, "logps/chosen": -35.33610534667969, "logps/rejected": -98.00807189941406, "loss": 0.4157, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1568174362182617, "rewards/margins": 6.319195747375488, "rewards/rejected": -5.162377834320068, "step": 886 }, { "epoch": 2.5576020223907547, "grad_norm": 7.666214942932129, "learning_rate": 7.439306358381503e-07, "logits/chosen": -2.9576239585876465, "logits/rejected": -3.078767776489258, "logps/chosen": -31.549638748168945, "logps/rejected": -85.83226013183594, "loss": 0.4338, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5526477098464966, "rewards/margins": 5.108861446380615, "rewards/rejected": -3.55621337890625, "step": 887 }, { "epoch": 2.5604911520404476, "grad_norm": 6.629401206970215, "learning_rate": 7.436416184971098e-07, "logits/chosen": -2.9389467239379883, "logits/rejected": -3.0696592330932617, "logps/chosen": -32.87179183959961, "logps/rejected": -79.4817123413086, "loss": 0.4627, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2374324798583984, "rewards/margins": 4.565976142883301, "rewards/rejected": -3.3285434246063232, "step": 888 }, { "epoch": 2.563380281690141, "grad_norm": 7.136954307556152, "learning_rate": 7.433526011560694e-07, "logits/chosen": -3.0937535762786865, "logits/rejected": -3.188474178314209, "logps/chosen": -34.427955627441406, "logps/rejected": -95.155517578125, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 1.1561498641967773, "rewards/margins": 5.8788909912109375, "rewards/rejected": -4.72274112701416, "step": 889 }, { "epoch": 2.566269411339834, "grad_norm": 8.515907287597656, "learning_rate": 7.430635838150288e-07, "logits/chosen": -2.9251365661621094, "logits/rejected": -3.1778924465179443, "logps/chosen": -28.0836238861084, "logps/rejected": -88.12008666992188, "loss": 0.4311, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6521852016448975, "rewards/margins": 5.684402942657471, "rewards/rejected": -4.032217979431152, "step": 890 }, { "epoch": 2.569158540989527, "grad_norm": 8.094847679138184, "learning_rate": 7.427745664739884e-07, "logits/chosen": -3.0273380279541016, "logits/rejected": -3.2317421436309814, "logps/chosen": -36.71263885498047, "logps/rejected": -97.21135711669922, "loss": 0.4962, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7084287405014038, "rewards/margins": 5.683969020843506, "rewards/rejected": -4.975539684295654, "step": 891 }, { "epoch": 2.57204767063922, "grad_norm": 4.917546272277832, "learning_rate": 7.424855491329479e-07, "logits/chosen": -2.8389010429382324, "logits/rejected": -3.0782227516174316, "logps/chosen": -33.40945053100586, "logps/rejected": -99.68690490722656, "loss": 0.3447, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2288212776184082, "rewards/margins": 6.50432825088501, "rewards/rejected": -5.275506973266602, "step": 892 }, { "epoch": 2.574936800288913, "grad_norm": 9.195207595825195, "learning_rate": 7.421965317919075e-07, "logits/chosen": -2.9948647022247314, "logits/rejected": -3.1393916606903076, "logps/chosen": -27.536537170410156, "logps/rejected": -82.17382049560547, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.8433072566986084, "rewards/margins": 5.456326961517334, "rewards/rejected": -3.6130197048187256, "step": 893 }, { "epoch": 2.577825929938606, "grad_norm": 5.626406669616699, "learning_rate": 7.419075144508669e-07, "logits/chosen": -2.956540584564209, "logits/rejected": -3.071268081665039, "logps/chosen": -32.9405403137207, "logps/rejected": -80.04915618896484, "loss": 0.4687, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2904844284057617, "rewards/margins": 4.612174034118652, "rewards/rejected": -3.3216891288757324, "step": 894 }, { "epoch": 2.5807150595882993, "grad_norm": 18.932498931884766, "learning_rate": 7.416184971098266e-07, "logits/chosen": -3.0377705097198486, "logits/rejected": -3.0950980186462402, "logps/chosen": -41.03721237182617, "logps/rejected": -95.53816223144531, "loss": 0.4744, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6683703064918518, "rewards/margins": 5.4444050788879395, "rewards/rejected": -4.776034832000732, "step": 895 }, { "epoch": 2.583604189237992, "grad_norm": 7.8929266929626465, "learning_rate": 7.413294797687862e-07, "logits/chosen": -3.0207722187042236, "logits/rejected": -3.0955872535705566, "logps/chosen": -32.016639709472656, "logps/rejected": -77.99920654296875, "loss": 0.5063, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4137051105499268, "rewards/margins": 4.671976566314697, "rewards/rejected": -3.2582712173461914, "step": 896 }, { "epoch": 2.586493318887685, "grad_norm": 6.674401760101318, "learning_rate": 7.410404624277456e-07, "logits/chosen": -2.9949862957000732, "logits/rejected": -3.163600206375122, "logps/chosen": -34.76240158081055, "logps/rejected": -94.04583740234375, "loss": 0.4217, "rewards/accuracies": 0.96875, "rewards/chosen": 0.941516101360321, "rewards/margins": 5.625859260559082, "rewards/rejected": -4.684343338012695, "step": 897 }, { "epoch": 2.589382448537378, "grad_norm": 6.4363813400268555, "learning_rate": 7.407514450867052e-07, "logits/chosen": -2.9867498874664307, "logits/rejected": -3.115583896636963, "logps/chosen": -41.56609344482422, "logps/rejected": -106.26126098632812, "loss": 0.5131, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5096938610076904, "rewards/margins": 6.315803527832031, "rewards/rejected": -5.806109428405762, "step": 898 }, { "epoch": 2.592271578187071, "grad_norm": 6.940569877624512, "learning_rate": 7.404624277456647e-07, "logits/chosen": -2.983525514602661, "logits/rejected": -3.1688497066497803, "logps/chosen": -36.34449768066406, "logps/rejected": -91.87911987304688, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": 0.7622076272964478, "rewards/margins": 5.4456257820129395, "rewards/rejected": -4.683418273925781, "step": 899 }, { "epoch": 2.5951607078367642, "grad_norm": 6.692649841308594, "learning_rate": 7.401734104046242e-07, "logits/chosen": -3.0781190395355225, "logits/rejected": -3.2020721435546875, "logps/chosen": -28.623737335205078, "logps/rejected": -90.72283172607422, "loss": 0.3982, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5299034118652344, "rewards/margins": 5.952215194702148, "rewards/rejected": -4.422312259674072, "step": 900 }, { "epoch": 2.598049837486457, "grad_norm": 20.97068214416504, "learning_rate": 7.398843930635837e-07, "logits/chosen": -3.0257644653320312, "logits/rejected": -3.1970551013946533, "logps/chosen": -34.05372619628906, "logps/rejected": -92.13410186767578, "loss": 0.4505, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1516367197036743, "rewards/margins": 5.50809383392334, "rewards/rejected": -4.356456756591797, "step": 901 }, { "epoch": 2.60093896713615, "grad_norm": 7.144614219665527, "learning_rate": 7.395953757225433e-07, "logits/chosen": -2.9436440467834473, "logits/rejected": -3.14725661277771, "logps/chosen": -34.21250534057617, "logps/rejected": -95.36980438232422, "loss": 0.4001, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3512308597564697, "rewards/margins": 5.990185737609863, "rewards/rejected": -4.638955116271973, "step": 902 }, { "epoch": 2.6038280967858434, "grad_norm": 4.483549118041992, "learning_rate": 7.393063583815029e-07, "logits/chosen": -3.116210699081421, "logits/rejected": -3.2524282932281494, "logps/chosen": -33.752899169921875, "logps/rejected": -93.72920227050781, "loss": 0.423, "rewards/accuracies": 1.0, "rewards/chosen": 1.0163524150848389, "rewards/margins": 5.751034259796143, "rewards/rejected": -4.734681606292725, "step": 903 }, { "epoch": 2.6067172264355363, "grad_norm": 7.326252460479736, "learning_rate": 7.390173410404624e-07, "logits/chosen": -2.992072105407715, "logits/rejected": -3.210350513458252, "logps/chosen": -32.811676025390625, "logps/rejected": -83.05061340332031, "loss": 0.5347, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2710829973220825, "rewards/margins": 4.937805652618408, "rewards/rejected": -3.6667227745056152, "step": 904 }, { "epoch": 2.609606356085229, "grad_norm": 6.788186073303223, "learning_rate": 7.38728323699422e-07, "logits/chosen": -3.0280089378356934, "logits/rejected": -3.1981563568115234, "logps/chosen": -30.22918701171875, "logps/rejected": -95.84149932861328, "loss": 0.3906, "rewards/accuracies": 1.0, "rewards/chosen": 1.544578194618225, "rewards/margins": 6.258042812347412, "rewards/rejected": -4.71346378326416, "step": 905 }, { "epoch": 2.6124954857349225, "grad_norm": 6.037217617034912, "learning_rate": 7.384393063583815e-07, "logits/chosen": -3.0708541870117188, "logits/rejected": -3.1153130531311035, "logps/chosen": -46.1581916809082, "logps/rejected": -98.6912841796875, "loss": 0.5614, "rewards/accuracies": 0.90625, "rewards/chosen": -0.18584135174751282, "rewards/margins": 4.936847686767578, "rewards/rejected": -5.1226887702941895, "step": 906 }, { "epoch": 2.6153846153846154, "grad_norm": 6.958624362945557, "learning_rate": 7.38150289017341e-07, "logits/chosen": -2.995453357696533, "logits/rejected": -3.0934219360351562, "logps/chosen": -33.60007095336914, "logps/rejected": -88.61856842041016, "loss": 0.4284, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1389532089233398, "rewards/margins": 5.201254367828369, "rewards/rejected": -4.062300682067871, "step": 907 }, { "epoch": 2.6182737450343083, "grad_norm": 8.778841972351074, "learning_rate": 7.378612716763005e-07, "logits/chosen": -3.124816417694092, "logits/rejected": -3.2910356521606445, "logps/chosen": -39.581600189208984, "logps/rejected": -95.24461364746094, "loss": 0.5001, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4345700144767761, "rewards/margins": 5.432680130004883, "rewards/rejected": -4.998110294342041, "step": 908 }, { "epoch": 2.6211628746840017, "grad_norm": 8.051214218139648, "learning_rate": 7.375722543352601e-07, "logits/chosen": -3.022212028503418, "logits/rejected": -3.190589666366577, "logps/chosen": -47.7587776184082, "logps/rejected": -96.85159301757812, "loss": 0.5359, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0669863224029541, "rewards/margins": 4.901760578155518, "rewards/rejected": -4.834774017333984, "step": 909 }, { "epoch": 2.6240520043336946, "grad_norm": 11.360918045043945, "learning_rate": 7.372832369942196e-07, "logits/chosen": -2.8983283042907715, "logits/rejected": -3.0843348503112793, "logps/chosen": -26.399734497070312, "logps/rejected": -85.82413482666016, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.9980841875076294, "rewards/margins": 5.737035274505615, "rewards/rejected": -3.7389514446258545, "step": 910 }, { "epoch": 2.6240520043336946, "eval_logits/chosen": -3.070948600769043, "eval_logits/rejected": -3.2416152954101562, "eval_logps/chosen": -39.304622650146484, "eval_logps/rejected": -95.29138946533203, "eval_loss": 0.510146975517273, "eval_rewards/accuracies": 0.9354838728904724, "eval_rewards/chosen": 0.7011792063713074, "eval_rewards/margins": 5.535299777984619, "eval_rewards/rejected": -4.834120750427246, "eval_runtime": 224.354, "eval_samples_per_second": 0.548, "eval_steps_per_second": 0.276, "step": 910 }, { "epoch": 2.6269411339833875, "grad_norm": 8.617365837097168, "learning_rate": 7.369942196531792e-07, "logits/chosen": -2.960292100906372, "logits/rejected": -3.0370616912841797, "logps/chosen": -36.11150360107422, "logps/rejected": -86.26840209960938, "loss": 0.4722, "rewards/accuracies": 0.90625, "rewards/chosen": 1.265243411064148, "rewards/margins": 4.931544780731201, "rewards/rejected": -3.6663014888763428, "step": 911 }, { "epoch": 2.629830263633081, "grad_norm": 7.240300178527832, "learning_rate": 7.367052023121387e-07, "logits/chosen": -3.1123645305633545, "logits/rejected": -3.175504207611084, "logps/chosen": -38.06614303588867, "logps/rejected": -87.67793273925781, "loss": 0.5194, "rewards/accuracies": 1.0, "rewards/chosen": 0.7773455381393433, "rewards/margins": 4.802985191345215, "rewards/rejected": -4.025639533996582, "step": 912 }, { "epoch": 2.6327193932827737, "grad_norm": 8.141420364379883, "learning_rate": 7.364161849710983e-07, "logits/chosen": -3.0502700805664062, "logits/rejected": -3.2035346031188965, "logps/chosen": -41.652252197265625, "logps/rejected": -96.08008575439453, "loss": 0.4752, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9505918025970459, "rewards/margins": 5.601536273956299, "rewards/rejected": -4.650943756103516, "step": 913 }, { "epoch": 2.6356085229324666, "grad_norm": 7.404236793518066, "learning_rate": 7.361271676300577e-07, "logits/chosen": -3.1005754470825195, "logits/rejected": -3.128451347351074, "logps/chosen": -35.27710723876953, "logps/rejected": -80.54894256591797, "loss": 0.5332, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9388489723205566, "rewards/margins": 4.366587162017822, "rewards/rejected": -3.427738666534424, "step": 914 }, { "epoch": 2.6384976525821595, "grad_norm": 6.7237772941589355, "learning_rate": 7.358381502890173e-07, "logits/chosen": -2.975921392440796, "logits/rejected": -3.082920789718628, "logps/chosen": -39.075164794921875, "logps/rejected": -87.8164291381836, "loss": 0.5417, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8111557960510254, "rewards/margins": 4.8231201171875, "rewards/rejected": -4.011963844299316, "step": 915 }, { "epoch": 2.6413867822318524, "grad_norm": 7.620573997497559, "learning_rate": 7.355491329479768e-07, "logits/chosen": -3.1104214191436768, "logits/rejected": -3.2758285999298096, "logps/chosen": -49.07560348510742, "logps/rejected": -95.56307220458984, "loss": 0.5595, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1301286518573761, "rewards/margins": 4.646026611328125, "rewards/rejected": -4.776155471801758, "step": 916 }, { "epoch": 2.6442759118815458, "grad_norm": 7.535297870635986, "learning_rate": 7.352601156069363e-07, "logits/chosen": -3.0335047245025635, "logits/rejected": -3.185600757598877, "logps/chosen": -35.0740966796875, "logps/rejected": -98.44783782958984, "loss": 0.449, "rewards/accuracies": 0.9375, "rewards/chosen": 1.119848608970642, "rewards/margins": 6.086170196533203, "rewards/rejected": -4.96632194519043, "step": 917 }, { "epoch": 2.6471650415312387, "grad_norm": 6.020998477935791, "learning_rate": 7.349710982658959e-07, "logits/chosen": -2.996631145477295, "logits/rejected": -3.20815372467041, "logps/chosen": -30.831100463867188, "logps/rejected": -86.59548950195312, "loss": 0.45, "rewards/accuracies": 1.0, "rewards/chosen": 1.435611367225647, "rewards/margins": 5.326748847961426, "rewards/rejected": -3.8911385536193848, "step": 918 }, { "epoch": 2.6500541711809316, "grad_norm": 7.9245147705078125, "learning_rate": 7.346820809248555e-07, "logits/chosen": -2.9817235469818115, "logits/rejected": -3.121910810470581, "logps/chosen": -41.80120849609375, "logps/rejected": -90.26697540283203, "loss": 0.5479, "rewards/accuracies": 0.90625, "rewards/chosen": 0.34524792432785034, "rewards/margins": 4.509967803955078, "rewards/rejected": -4.164719581604004, "step": 919 }, { "epoch": 2.652943300830625, "grad_norm": 8.046812057495117, "learning_rate": 7.343930635838151e-07, "logits/chosen": -2.9836132526397705, "logits/rejected": -3.160444736480713, "logps/chosen": -37.53295135498047, "logps/rejected": -88.4162826538086, "loss": 0.5065, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0048022270202637, "rewards/margins": 5.0910186767578125, "rewards/rejected": -4.086216926574707, "step": 920 }, { "epoch": 2.655832430480318, "grad_norm": 7.27815580368042, "learning_rate": 7.341040462427745e-07, "logits/chosen": -2.9932732582092285, "logits/rejected": -3.1388838291168213, "logps/chosen": -33.78784942626953, "logps/rejected": -80.4949722290039, "loss": 0.5077, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1200344562530518, "rewards/margins": 4.3305344581604, "rewards/rejected": -3.2104997634887695, "step": 921 }, { "epoch": 2.6587215601300107, "grad_norm": 6.310403823852539, "learning_rate": 7.338150289017341e-07, "logits/chosen": -3.065218687057495, "logits/rejected": -3.271629810333252, "logps/chosen": -31.290130615234375, "logps/rejected": -95.84548950195312, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 1.820929765701294, "rewards/margins": 6.607476234436035, "rewards/rejected": -4.78654670715332, "step": 922 }, { "epoch": 2.661610689779704, "grad_norm": 9.46094036102295, "learning_rate": 7.335260115606936e-07, "logits/chosen": -2.9797654151916504, "logits/rejected": -3.116645336151123, "logps/chosen": -31.16606330871582, "logps/rejected": -81.01914978027344, "loss": 0.4632, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2768380641937256, "rewards/margins": 4.613790035247803, "rewards/rejected": -3.3369524478912354, "step": 923 }, { "epoch": 2.664499819429397, "grad_norm": 7.428957939147949, "learning_rate": 7.332369942196531e-07, "logits/chosen": -2.8203811645507812, "logits/rejected": -2.9936137199401855, "logps/chosen": -37.19200897216797, "logps/rejected": -86.9395523071289, "loss": 0.5518, "rewards/accuracies": 0.9375, "rewards/chosen": 1.091843843460083, "rewards/margins": 5.055415153503418, "rewards/rejected": -3.9635720252990723, "step": 924 }, { "epoch": 2.66738894907909, "grad_norm": 6.162697792053223, "learning_rate": 7.329479768786126e-07, "logits/chosen": -2.9950931072235107, "logits/rejected": -3.2005908489227295, "logps/chosen": -29.736722946166992, "logps/rejected": -100.17631530761719, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 1.5231742858886719, "rewards/margins": 6.8190412521362305, "rewards/rejected": -5.295866966247559, "step": 925 }, { "epoch": 2.670278078728783, "grad_norm": 6.62402868270874, "learning_rate": 7.326589595375722e-07, "logits/chosen": -2.9426236152648926, "logits/rejected": -3.1058359146118164, "logps/chosen": -32.253971099853516, "logps/rejected": -89.96549224853516, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 1.356264352798462, "rewards/margins": 5.491562843322754, "rewards/rejected": -4.135298728942871, "step": 926 }, { "epoch": 2.673167208378476, "grad_norm": 5.674437046051025, "learning_rate": 7.323699421965319e-07, "logits/chosen": -2.9812874794006348, "logits/rejected": -3.1539745330810547, "logps/chosen": -34.96050262451172, "logps/rejected": -100.41993713378906, "loss": 0.4156, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0357056856155396, "rewards/margins": 6.081836223602295, "rewards/rejected": -5.046130657196045, "step": 927 }, { "epoch": 2.676056338028169, "grad_norm": 4.464150905609131, "learning_rate": 7.320809248554913e-07, "logits/chosen": -2.8500590324401855, "logits/rejected": -3.001804828643799, "logps/chosen": -33.514835357666016, "logps/rejected": -99.44061279296875, "loss": 0.4259, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7012057304382324, "rewards/margins": 6.38706636428833, "rewards/rejected": -4.685860633850098, "step": 928 }, { "epoch": 2.678945467677862, "grad_norm": 8.405232429504395, "learning_rate": 7.317919075144509e-07, "logits/chosen": -3.00148868560791, "logits/rejected": -3.1764631271362305, "logps/chosen": -40.19147872924805, "logps/rejected": -89.39556884765625, "loss": 0.475, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6017460823059082, "rewards/margins": 4.7569193840026855, "rewards/rejected": -4.155173301696777, "step": 929 }, { "epoch": 2.681834597327555, "grad_norm": 5.316399574279785, "learning_rate": 7.315028901734104e-07, "logits/chosen": -2.9663023948669434, "logits/rejected": -3.1034088134765625, "logps/chosen": -34.953941345214844, "logps/rejected": -89.81080627441406, "loss": 0.4728, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0498689413070679, "rewards/margins": 5.397370338439941, "rewards/rejected": -4.347501277923584, "step": 930 }, { "epoch": 2.684723726977248, "grad_norm": 7.209290504455566, "learning_rate": 7.312138728323699e-07, "logits/chosen": -2.823117733001709, "logits/rejected": -3.0949389934539795, "logps/chosen": -34.275333404541016, "logps/rejected": -78.66317749023438, "loss": 0.5742, "rewards/accuracies": 1.0, "rewards/chosen": 1.4148869514465332, "rewards/margins": 4.392204284667969, "rewards/rejected": -2.9773168563842773, "step": 931 }, { "epoch": 2.687612856626941, "grad_norm": 6.994409561157227, "learning_rate": 7.309248554913294e-07, "logits/chosen": -3.0439977645874023, "logits/rejected": -3.23034930229187, "logps/chosen": -39.9583740234375, "logps/rejected": -91.16382598876953, "loss": 0.4853, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0881291627883911, "rewards/margins": 5.381572246551514, "rewards/rejected": -4.293442726135254, "step": 932 }, { "epoch": 2.690501986276634, "grad_norm": 7.526727676391602, "learning_rate": 7.30635838150289e-07, "logits/chosen": -2.8724918365478516, "logits/rejected": -3.050915241241455, "logps/chosen": -40.704383850097656, "logps/rejected": -96.03874969482422, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 0.8395367860794067, "rewards/margins": 5.359458923339844, "rewards/rejected": -4.519922256469727, "step": 933 }, { "epoch": 2.6933911159263273, "grad_norm": 7.039274215698242, "learning_rate": 7.303468208092484e-07, "logits/chosen": -2.911428451538086, "logits/rejected": -3.169187307357788, "logps/chosen": -22.8956241607666, "logps/rejected": -86.11416625976562, "loss": 0.3371, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3365285396575928, "rewards/margins": 6.117691516876221, "rewards/rejected": -3.7811625003814697, "step": 934 }, { "epoch": 2.69628024557602, "grad_norm": 6.390506744384766, "learning_rate": 7.300578034682081e-07, "logits/chosen": -2.970963478088379, "logits/rejected": -3.1519856452941895, "logps/chosen": -32.47160720825195, "logps/rejected": -91.82861328125, "loss": 0.3744, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3735469579696655, "rewards/margins": 5.256251811981201, "rewards/rejected": -3.882704973220825, "step": 935 }, { "epoch": 2.699169375225713, "grad_norm": 6.354522705078125, "learning_rate": 7.297687861271676e-07, "logits/chosen": -3.0635039806365967, "logits/rejected": -3.1095352172851562, "logps/chosen": -41.698482513427734, "logps/rejected": -95.43252563476562, "loss": 0.4583, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7314327955245972, "rewards/margins": 5.544281482696533, "rewards/rejected": -4.812849044799805, "step": 936 }, { "epoch": 2.7020585048754064, "grad_norm": 8.302850723266602, "learning_rate": 7.294797687861272e-07, "logits/chosen": -2.981642484664917, "logits/rejected": -3.269359588623047, "logps/chosen": -34.24940872192383, "logps/rejected": -89.65772247314453, "loss": 0.4947, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1488103866577148, "rewards/margins": 5.147812843322754, "rewards/rejected": -3.999001979827881, "step": 937 }, { "epoch": 2.7049476345250993, "grad_norm": 8.223237991333008, "learning_rate": 7.291907514450866e-07, "logits/chosen": -3.011960983276367, "logits/rejected": -3.1366729736328125, "logps/chosen": -33.19101333618164, "logps/rejected": -84.80142211914062, "loss": 0.482, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1467232704162598, "rewards/margins": 5.210882186889648, "rewards/rejected": -4.064158916473389, "step": 938 }, { "epoch": 2.7078367641747922, "grad_norm": 7.211634159088135, "learning_rate": 7.289017341040462e-07, "logits/chosen": -2.9527130126953125, "logits/rejected": -3.1228485107421875, "logps/chosen": -26.035057067871094, "logps/rejected": -77.23947143554688, "loss": 0.4414, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8252466917037964, "rewards/margins": 4.6549973487854, "rewards/rejected": -2.8297505378723145, "step": 939 }, { "epoch": 2.7107258938244856, "grad_norm": 7.061811447143555, "learning_rate": 7.286127167630058e-07, "logits/chosen": -3.0581116676330566, "logits/rejected": -3.1541717052459717, "logps/chosen": -31.412296295166016, "logps/rejected": -85.34078216552734, "loss": 0.4182, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3957561254501343, "rewards/margins": 5.212162494659424, "rewards/rejected": -3.816406726837158, "step": 940 }, { "epoch": 2.7136150234741785, "grad_norm": 7.894237041473389, "learning_rate": 7.283236994219652e-07, "logits/chosen": -3.027146100997925, "logits/rejected": -3.194796085357666, "logps/chosen": -39.14202880859375, "logps/rejected": -95.18103790283203, "loss": 0.5369, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6139816045761108, "rewards/margins": 5.220067024230957, "rewards/rejected": -4.606085777282715, "step": 941 }, { "epoch": 2.7165041531238714, "grad_norm": 5.127690315246582, "learning_rate": 7.280346820809248e-07, "logits/chosen": -3.015624523162842, "logits/rejected": -3.1918158531188965, "logps/chosen": -33.590911865234375, "logps/rejected": -90.50579071044922, "loss": 0.4431, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2251062393188477, "rewards/margins": 5.298584938049316, "rewards/rejected": -4.073478698730469, "step": 942 }, { "epoch": 2.7193932827735647, "grad_norm": 7.60872220993042, "learning_rate": 7.277456647398844e-07, "logits/chosen": -2.9691414833068848, "logits/rejected": -3.1645445823669434, "logps/chosen": -25.45850372314453, "logps/rejected": -80.60620880126953, "loss": 0.3873, "rewards/accuracies": 0.96875, "rewards/chosen": 2.052061080932617, "rewards/margins": 5.469659328460693, "rewards/rejected": -3.4175987243652344, "step": 943 }, { "epoch": 2.7222824124232576, "grad_norm": 8.788397789001465, "learning_rate": 7.27456647398844e-07, "logits/chosen": -3.064758539199829, "logits/rejected": -3.2050528526306152, "logps/chosen": -40.92625045776367, "logps/rejected": -104.03227996826172, "loss": 0.4468, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2965853810310364, "rewards/margins": 5.493799686431885, "rewards/rejected": -5.197214126586914, "step": 944 }, { "epoch": 2.7251715420729505, "grad_norm": 6.517539978027344, "learning_rate": 7.271676300578034e-07, "logits/chosen": -3.0617611408233643, "logits/rejected": -3.256915330886841, "logps/chosen": -29.093807220458984, "logps/rejected": -90.40067291259766, "loss": 0.2893, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5322153568267822, "rewards/margins": 6.067557334899902, "rewards/rejected": -4.535341262817383, "step": 945 }, { "epoch": 2.7280606717226434, "grad_norm": 7.16837739944458, "learning_rate": 7.26878612716763e-07, "logits/chosen": -3.0104258060455322, "logits/rejected": -3.1036510467529297, "logps/chosen": -39.69757843017578, "logps/rejected": -85.21241760253906, "loss": 0.5624, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7201111316680908, "rewards/margins": 4.255279064178467, "rewards/rejected": -3.535168170928955, "step": 946 }, { "epoch": 2.7309498013723363, "grad_norm": 5.766932964324951, "learning_rate": 7.265895953757225e-07, "logits/chosen": -3.036508560180664, "logits/rejected": -3.2483952045440674, "logps/chosen": -42.02604675292969, "logps/rejected": -90.84982299804688, "loss": 0.5146, "rewards/accuracies": 0.96875, "rewards/chosen": 0.39571794867515564, "rewards/margins": 4.663054943084717, "rewards/rejected": -4.267337322235107, "step": 947 }, { "epoch": 2.7338389310220297, "grad_norm": 8.843334197998047, "learning_rate": 7.26300578034682e-07, "logits/chosen": -3.000779628753662, "logits/rejected": -3.195145845413208, "logps/chosen": -34.50890350341797, "logps/rejected": -85.6613540649414, "loss": 0.434, "rewards/accuracies": 1.0, "rewards/chosen": 1.2454276084899902, "rewards/margins": 5.0758185386657715, "rewards/rejected": -3.8303909301757812, "step": 948 }, { "epoch": 2.7367280606717226, "grad_norm": 7.146130561828613, "learning_rate": 7.260115606936415e-07, "logits/chosen": -2.952489137649536, "logits/rejected": -3.1870033740997314, "logps/chosen": -36.853424072265625, "logps/rejected": -101.11506652832031, "loss": 0.4691, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8918730020523071, "rewards/margins": 5.815433979034424, "rewards/rejected": -4.923561096191406, "step": 949 }, { "epoch": 2.7396171903214155, "grad_norm": 5.790938377380371, "learning_rate": 7.257225433526011e-07, "logits/chosen": -3.017735481262207, "logits/rejected": -3.1214559078216553, "logps/chosen": -31.149288177490234, "logps/rejected": -94.42745208740234, "loss": 0.3789, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2226858139038086, "rewards/margins": 5.832364082336426, "rewards/rejected": -4.609677314758301, "step": 950 }, { "epoch": 2.742506319971109, "grad_norm": 6.542479515075684, "learning_rate": 7.254335260115608e-07, "logits/chosen": -3.0148279666900635, "logits/rejected": -3.141465187072754, "logps/chosen": -37.522972106933594, "logps/rejected": -91.24754333496094, "loss": 0.4961, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9164448380470276, "rewards/margins": 5.252535343170166, "rewards/rejected": -4.336091041564941, "step": 951 }, { "epoch": 2.7453954496208017, "grad_norm": 6.426310062408447, "learning_rate": 7.251445086705202e-07, "logits/chosen": -2.936466693878174, "logits/rejected": -3.0877060890197754, "logps/chosen": -48.58612060546875, "logps/rejected": -98.40394592285156, "loss": 0.5452, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1891024112701416, "rewards/margins": 4.758913040161133, "rewards/rejected": -4.948016166687012, "step": 952 }, { "epoch": 2.7482845792704946, "grad_norm": 10.43020248413086, "learning_rate": 7.248554913294798e-07, "logits/chosen": -3.0935757160186768, "logits/rejected": -3.082885980606079, "logps/chosen": -32.95305633544922, "logps/rejected": -92.38052368164062, "loss": 0.4633, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0475895404815674, "rewards/margins": 5.3821611404418945, "rewards/rejected": -4.334571838378906, "step": 953 }, { "epoch": 2.751173708920188, "grad_norm": 6.6886138916015625, "learning_rate": 7.245664739884393e-07, "logits/chosen": -2.9750280380249023, "logits/rejected": -3.1237130165100098, "logps/chosen": -36.40309143066406, "logps/rejected": -94.63519287109375, "loss": 0.475, "rewards/accuracies": 0.96875, "rewards/chosen": 1.136078119277954, "rewards/margins": 5.787240028381348, "rewards/rejected": -4.6511616706848145, "step": 954 }, { "epoch": 2.754062838569881, "grad_norm": 7.391120910644531, "learning_rate": 7.242774566473988e-07, "logits/chosen": -3.0150744915008545, "logits/rejected": -3.22070574760437, "logps/chosen": -38.529518127441406, "logps/rejected": -97.92715454101562, "loss": 0.4581, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7680704593658447, "rewards/margins": 6.056295394897461, "rewards/rejected": -5.288225173950195, "step": 955 }, { "epoch": 2.756951968219574, "grad_norm": 7.858236789703369, "learning_rate": 7.239884393063583e-07, "logits/chosen": -2.945171356201172, "logits/rejected": -3.129709243774414, "logps/chosen": -37.79121398925781, "logps/rejected": -85.67948150634766, "loss": 0.4434, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2174450159072876, "rewards/margins": 4.7917938232421875, "rewards/rejected": -3.5743489265441895, "step": 956 }, { "epoch": 2.759841097869267, "grad_norm": 5.859660625457764, "learning_rate": 7.236994219653179e-07, "logits/chosen": -2.989328145980835, "logits/rejected": -3.2254626750946045, "logps/chosen": -30.768301010131836, "logps/rejected": -102.27668762207031, "loss": 0.3715, "rewards/accuracies": 1.0, "rewards/chosen": 1.4636731147766113, "rewards/margins": 6.778595447540283, "rewards/rejected": -5.31492280960083, "step": 957 }, { "epoch": 2.76273022751896, "grad_norm": 5.774516582489014, "learning_rate": 7.234104046242773e-07, "logits/chosen": -2.9313547611236572, "logits/rejected": -3.1325535774230957, "logps/chosen": -35.315757751464844, "logps/rejected": -90.12808227539062, "loss": 0.4103, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3019065856933594, "rewards/margins": 5.410433292388916, "rewards/rejected": -4.108526229858398, "step": 958 }, { "epoch": 2.765619357168653, "grad_norm": 5.809350490570068, "learning_rate": 7.23121387283237e-07, "logits/chosen": -2.8873956203460693, "logits/rejected": -3.158529281616211, "logps/chosen": -35.20159149169922, "logps/rejected": -80.04351043701172, "loss": 0.5141, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2009848356246948, "rewards/margins": 4.55112361907959, "rewards/rejected": -3.3501386642456055, "step": 959 }, { "epoch": 2.768508486818346, "grad_norm": 6.66062068939209, "learning_rate": 7.228323699421966e-07, "logits/chosen": -2.99086856842041, "logits/rejected": -3.245821714401245, "logps/chosen": -37.87018966674805, "logps/rejected": -93.80907440185547, "loss": 0.4197, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1269991397857666, "rewards/margins": 5.691450595855713, "rewards/rejected": -4.564451217651367, "step": 960 }, { "epoch": 2.7713976164680387, "grad_norm": 6.466590881347656, "learning_rate": 7.225433526011561e-07, "logits/chosen": -2.979005813598633, "logits/rejected": -3.1522810459136963, "logps/chosen": -35.34963607788086, "logps/rejected": -90.152099609375, "loss": 0.4185, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1058279275894165, "rewards/margins": 5.303696632385254, "rewards/rejected": -4.197868347167969, "step": 961 }, { "epoch": 2.774286746117732, "grad_norm": 5.533064842224121, "learning_rate": 7.222543352601156e-07, "logits/chosen": -2.9232845306396484, "logits/rejected": -3.1667609214782715, "logps/chosen": -33.402015686035156, "logps/rejected": -91.203369140625, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": 1.2639484405517578, "rewards/margins": 5.503685474395752, "rewards/rejected": -4.239736557006836, "step": 962 }, { "epoch": 2.777175875767425, "grad_norm": 8.243989944458008, "learning_rate": 7.219653179190751e-07, "logits/chosen": -2.8915138244628906, "logits/rejected": -3.108940601348877, "logps/chosen": -26.78769302368164, "logps/rejected": -85.0464096069336, "loss": 0.4082, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5822961330413818, "rewards/margins": 5.5269927978515625, "rewards/rejected": -3.9446964263916016, "step": 963 }, { "epoch": 2.780065005417118, "grad_norm": 7.120800971984863, "learning_rate": 7.216763005780347e-07, "logits/chosen": -3.0267837047576904, "logits/rejected": -3.1598920822143555, "logps/chosen": -30.13060760498047, "logps/rejected": -79.175048828125, "loss": 0.4711, "rewards/accuracies": 0.9375, "rewards/chosen": 1.534551978111267, "rewards/margins": 4.649069786071777, "rewards/rejected": -3.1145176887512207, "step": 964 }, { "epoch": 2.782954135066811, "grad_norm": 8.388925552368164, "learning_rate": 7.213872832369941e-07, "logits/chosen": -2.973059892654419, "logits/rejected": -3.1255674362182617, "logps/chosen": -35.1217041015625, "logps/rejected": -90.2308578491211, "loss": 0.4588, "rewards/accuracies": 0.96875, "rewards/chosen": 1.333032250404358, "rewards/margins": 5.135455131530762, "rewards/rejected": -3.8024227619171143, "step": 965 }, { "epoch": 2.785843264716504, "grad_norm": 5.271235466003418, "learning_rate": 7.210982658959537e-07, "logits/chosen": -2.9904398918151855, "logits/rejected": -3.163768768310547, "logps/chosen": -32.07188415527344, "logps/rejected": -79.06791687011719, "loss": 0.4248, "rewards/accuracies": 0.875, "rewards/chosen": 1.3301715850830078, "rewards/margins": 4.804322242736816, "rewards/rejected": -3.474151134490967, "step": 966 }, { "epoch": 2.788732394366197, "grad_norm": 6.861327171325684, "learning_rate": 7.208092485549132e-07, "logits/chosen": -2.983786106109619, "logits/rejected": -3.1356353759765625, "logps/chosen": -24.44883155822754, "logps/rejected": -80.72299194335938, "loss": 0.3199, "rewards/accuracies": 1.0, "rewards/chosen": 2.1292896270751953, "rewards/margins": 5.229248046875, "rewards/rejected": -3.0999579429626465, "step": 967 }, { "epoch": 2.7916215240158904, "grad_norm": 9.100025177001953, "learning_rate": 7.205202312138729e-07, "logits/chosen": -2.978358745574951, "logits/rejected": -3.10972261428833, "logps/chosen": -38.54871368408203, "logps/rejected": -105.06187438964844, "loss": 0.3686, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7697430849075317, "rewards/margins": 5.887059688568115, "rewards/rejected": -5.117316722869873, "step": 968 }, { "epoch": 2.7945106536655833, "grad_norm": 6.177901268005371, "learning_rate": 7.202312138728323e-07, "logits/chosen": -3.0045230388641357, "logits/rejected": -3.195956230163574, "logps/chosen": -47.26136779785156, "logps/rejected": -92.62621307373047, "loss": 0.5338, "rewards/accuracies": 0.90625, "rewards/chosen": 0.33205050230026245, "rewards/margins": 4.549076080322266, "rewards/rejected": -4.2170257568359375, "step": 969 }, { "epoch": 2.797399783315276, "grad_norm": 8.372014045715332, "learning_rate": 7.199421965317919e-07, "logits/chosen": -2.98006272315979, "logits/rejected": -3.1778564453125, "logps/chosen": -34.67993927001953, "logps/rejected": -89.53557586669922, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9939017295837402, "rewards/margins": 5.082971096038818, "rewards/rejected": -4.089069843292236, "step": 970 }, { "epoch": 2.8002889129649695, "grad_norm": 6.661905288696289, "learning_rate": 7.196531791907515e-07, "logits/chosen": -2.830031633377075, "logits/rejected": -3.0146491527557373, "logps/chosen": -35.6373291015625, "logps/rejected": -77.92195129394531, "loss": 0.5286, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1727721691131592, "rewards/margins": 4.078889846801758, "rewards/rejected": -2.9061179161071777, "step": 971 }, { "epoch": 2.8031780426146624, "grad_norm": 5.630934715270996, "learning_rate": 7.193641618497109e-07, "logits/chosen": -2.9918577671051025, "logits/rejected": -3.1560702323913574, "logps/chosen": -45.16999053955078, "logps/rejected": -98.4234390258789, "loss": 0.5597, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22811301052570343, "rewards/margins": 5.02404260635376, "rewards/rejected": -4.795929908752441, "step": 972 }, { "epoch": 2.8060671722643553, "grad_norm": 6.8068060874938965, "learning_rate": 7.190751445086705e-07, "logits/chosen": -2.97697114944458, "logits/rejected": -3.1179752349853516, "logps/chosen": -25.78350830078125, "logps/rejected": -94.54249572753906, "loss": 0.2737, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0767621994018555, "rewards/margins": 6.671787738800049, "rewards/rejected": -4.595025062561035, "step": 973 }, { "epoch": 2.8089563019140487, "grad_norm": 8.439992904663086, "learning_rate": 7.1878612716763e-07, "logits/chosen": -3.077406406402588, "logits/rejected": -3.1236114501953125, "logps/chosen": -39.792537689208984, "logps/rejected": -85.94898986816406, "loss": 0.5093, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6332634687423706, "rewards/margins": 4.527629852294922, "rewards/rejected": -3.89436674118042, "step": 974 }, { "epoch": 2.8118454315637416, "grad_norm": 6.531951904296875, "learning_rate": 7.184971098265895e-07, "logits/chosen": -3.0135457515716553, "logits/rejected": -3.180147886276245, "logps/chosen": -30.777589797973633, "logps/rejected": -85.50437927246094, "loss": 0.487, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3101085424423218, "rewards/margins": 5.147443771362305, "rewards/rejected": -3.837334632873535, "step": 975 }, { "epoch": 2.8147345612134345, "grad_norm": 4.597621917724609, "learning_rate": 7.182080924855491e-07, "logits/chosen": -2.8157665729522705, "logits/rejected": -3.089702844619751, "logps/chosen": -32.30543899536133, "logps/rejected": -98.46954345703125, "loss": 0.4233, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6253464221954346, "rewards/margins": 6.417829990386963, "rewards/rejected": -4.792483329772949, "step": 976 }, { "epoch": 2.8176236908631274, "grad_norm": 7.220633506774902, "learning_rate": 7.179190751445087e-07, "logits/chosen": -2.99220871925354, "logits/rejected": -3.057114362716675, "logps/chosen": -33.57260513305664, "logps/rejected": -87.5672607421875, "loss": 0.415, "rewards/accuracies": 1.0, "rewards/chosen": 1.391315221786499, "rewards/margins": 5.355691909790039, "rewards/rejected": -3.964376926422119, "step": 977 }, { "epoch": 2.8205128205128203, "grad_norm": 6.587898254394531, "learning_rate": 7.176300578034682e-07, "logits/chosen": -2.9871745109558105, "logits/rejected": -3.15337872505188, "logps/chosen": -28.824966430664062, "logps/rejected": -75.57685852050781, "loss": 0.5397, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5352519750595093, "rewards/margins": 4.275236129760742, "rewards/rejected": -2.7399842739105225, "step": 978 }, { "epoch": 2.8234019501625136, "grad_norm": 7.360163688659668, "learning_rate": 7.173410404624277e-07, "logits/chosen": -3.0347049236297607, "logits/rejected": -3.1696629524230957, "logps/chosen": -34.25322723388672, "logps/rejected": -89.19109344482422, "loss": 0.4554, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1055876016616821, "rewards/margins": 5.469577789306641, "rewards/rejected": -4.363990306854248, "step": 979 }, { "epoch": 2.8262910798122065, "grad_norm": 7.4325385093688965, "learning_rate": 7.170520231213872e-07, "logits/chosen": -2.9920167922973633, "logits/rejected": -3.169858455657959, "logps/chosen": -35.98303985595703, "logps/rejected": -95.7249526977539, "loss": 0.3958, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8169717788696289, "rewards/margins": 5.617566108703613, "rewards/rejected": -4.800594329833984, "step": 980 }, { "epoch": 2.8262910798122065, "eval_logits/chosen": -3.0549142360687256, "eval_logits/rejected": -3.2207605838775635, "eval_logps/chosen": -39.130027770996094, "eval_logps/rejected": -97.08900451660156, "eval_loss": 0.504331111907959, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.7186389565467834, "eval_rewards/margins": 5.7325215339660645, "eval_rewards/rejected": -5.013882637023926, "eval_runtime": 228.5246, "eval_samples_per_second": 0.538, "eval_steps_per_second": 0.271, "step": 980 }, { "epoch": 2.8291802094618994, "grad_norm": 11.254090309143066, "learning_rate": 7.167630057803468e-07, "logits/chosen": -2.975646734237671, "logits/rejected": -3.1933324337005615, "logps/chosen": -30.453166961669922, "logps/rejected": -80.70867156982422, "loss": 0.4724, "rewards/accuracies": 0.96875, "rewards/chosen": 1.66585373878479, "rewards/margins": 4.910594463348389, "rewards/rejected": -3.2447409629821777, "step": 981 }, { "epoch": 2.8320693391115928, "grad_norm": 7.038717269897461, "learning_rate": 7.164739884393062e-07, "logits/chosen": -3.0258560180664062, "logits/rejected": -3.1012842655181885, "logps/chosen": -26.68010902404785, "logps/rejected": -86.54371643066406, "loss": 0.4358, "rewards/accuracies": 1.0, "rewards/chosen": 1.6494609117507935, "rewards/margins": 5.6789751052856445, "rewards/rejected": -4.029513359069824, "step": 982 }, { "epoch": 2.8349584687612857, "grad_norm": 6.445420742034912, "learning_rate": 7.161849710982658e-07, "logits/chosen": -3.0009241104125977, "logits/rejected": -3.145671844482422, "logps/chosen": -30.17250633239746, "logps/rejected": -85.41028594970703, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.267147183418274, "rewards/margins": 5.292344093322754, "rewards/rejected": -4.025197505950928, "step": 983 }, { "epoch": 2.8378475984109786, "grad_norm": 6.986949920654297, "learning_rate": 7.158959537572255e-07, "logits/chosen": -2.9649229049682617, "logits/rejected": -3.1878573894500732, "logps/chosen": -26.00152587890625, "logps/rejected": -87.81948852539062, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": 1.7378320693969727, "rewards/margins": 5.834458351135254, "rewards/rejected": -4.096626281738281, "step": 984 }, { "epoch": 2.840736728060672, "grad_norm": 7.04943323135376, "learning_rate": 7.15606936416185e-07, "logits/chosen": -2.8967533111572266, "logits/rejected": -3.1121654510498047, "logps/chosen": -38.87841796875, "logps/rejected": -97.314208984375, "loss": 0.4999, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6885395646095276, "rewards/margins": 5.406418323516846, "rewards/rejected": -4.717878341674805, "step": 985 }, { "epoch": 2.843625857710365, "grad_norm": 8.459012031555176, "learning_rate": 7.153179190751445e-07, "logits/chosen": -2.9387598037719727, "logits/rejected": -3.1809141635894775, "logps/chosen": -26.22281265258789, "logps/rejected": -85.73045349121094, "loss": 0.392, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0602731704711914, "rewards/margins": 5.811758041381836, "rewards/rejected": -3.7514848709106445, "step": 986 }, { "epoch": 2.8465149873600577, "grad_norm": 8.102219581604004, "learning_rate": 7.15028901734104e-07, "logits/chosen": -3.0433130264282227, "logits/rejected": -3.1766128540039062, "logps/chosen": -29.47807502746582, "logps/rejected": -88.50350189208984, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": 1.4663150310516357, "rewards/margins": 5.539514064788818, "rewards/rejected": -4.073199272155762, "step": 987 }, { "epoch": 2.849404117009751, "grad_norm": 10.066532135009766, "learning_rate": 7.147398843930636e-07, "logits/chosen": -3.057119846343994, "logits/rejected": -3.1348876953125, "logps/chosen": -43.93482971191406, "logps/rejected": -97.76451110839844, "loss": 0.5159, "rewards/accuracies": 1.0, "rewards/chosen": 0.3155054450035095, "rewards/margins": 5.237244129180908, "rewards/rejected": -4.921738624572754, "step": 988 }, { "epoch": 2.852293246659444, "grad_norm": 6.500981330871582, "learning_rate": 7.14450867052023e-07, "logits/chosen": -3.0427465438842773, "logits/rejected": -3.1961309909820557, "logps/chosen": -38.242942810058594, "logps/rejected": -96.99739074707031, "loss": 0.4994, "rewards/accuracies": 0.875, "rewards/chosen": 0.7119153738021851, "rewards/margins": 5.5655517578125, "rewards/rejected": -4.853636741638184, "step": 989 }, { "epoch": 2.855182376309137, "grad_norm": 7.922822952270508, "learning_rate": 7.141618497109826e-07, "logits/chosen": -2.9686026573181152, "logits/rejected": -3.0827155113220215, "logps/chosen": -30.764772415161133, "logps/rejected": -90.63553619384766, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 1.4919195175170898, "rewards/margins": 5.672472953796387, "rewards/rejected": -4.180553436279297, "step": 990 }, { "epoch": 2.8580715059588297, "grad_norm": 8.155617713928223, "learning_rate": 7.138728323699421e-07, "logits/chosen": -2.970848560333252, "logits/rejected": -3.138521671295166, "logps/chosen": -35.446983337402344, "logps/rejected": -90.83836364746094, "loss": 0.4796, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7622018456459045, "rewards/margins": 5.1435546875, "rewards/rejected": -4.381352424621582, "step": 991 }, { "epoch": 2.860960635608523, "grad_norm": 9.61320972442627, "learning_rate": 7.135838150289017e-07, "logits/chosen": -2.8700337409973145, "logits/rejected": -3.0384979248046875, "logps/chosen": -28.032123565673828, "logps/rejected": -83.71194458007812, "loss": 0.4587, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7001502513885498, "rewards/margins": 5.167546272277832, "rewards/rejected": -3.4673962593078613, "step": 992 }, { "epoch": 2.863849765258216, "grad_norm": 7.816274166107178, "learning_rate": 7.132947976878613e-07, "logits/chosen": -2.9360969066619873, "logits/rejected": -3.0558345317840576, "logps/chosen": -39.129276275634766, "logps/rejected": -76.96192932128906, "loss": 0.5654, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9552640318870544, "rewards/margins": 3.945242404937744, "rewards/rejected": -2.989978313446045, "step": 993 }, { "epoch": 2.866738894907909, "grad_norm": 6.492423057556152, "learning_rate": 7.130057803468208e-07, "logits/chosen": -2.973186492919922, "logits/rejected": -3.1498358249664307, "logps/chosen": -29.39301300048828, "logps/rejected": -86.40326690673828, "loss": 0.4461, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9150147438049316, "rewards/margins": 5.838479042053223, "rewards/rejected": -3.923463821411133, "step": 994 }, { "epoch": 2.869628024557602, "grad_norm": 5.916327476501465, "learning_rate": 7.127167630057804e-07, "logits/chosen": -2.8696353435516357, "logits/rejected": -3.085972547531128, "logps/chosen": -30.817684173583984, "logps/rejected": -94.02547454833984, "loss": 0.4024, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5167925357818604, "rewards/margins": 6.162175178527832, "rewards/rejected": -4.645382404327393, "step": 995 }, { "epoch": 2.872517154207295, "grad_norm": 6.953657627105713, "learning_rate": 7.124277456647398e-07, "logits/chosen": -3.0225253105163574, "logits/rejected": -3.0514025688171387, "logps/chosen": -43.660125732421875, "logps/rejected": -92.23800659179688, "loss": 0.505, "rewards/accuracies": 0.90625, "rewards/chosen": 0.430608332157135, "rewards/margins": 4.959977626800537, "rewards/rejected": -4.529368877410889, "step": 996 }, { "epoch": 2.875406283856988, "grad_norm": 6.804243087768555, "learning_rate": 7.121387283236994e-07, "logits/chosen": -3.089470148086548, "logits/rejected": -3.2415354251861572, "logps/chosen": -37.44913864135742, "logps/rejected": -104.76988220214844, "loss": 0.3819, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8968833684921265, "rewards/margins": 6.56879997253418, "rewards/rejected": -5.6719160079956055, "step": 997 }, { "epoch": 2.878295413506681, "grad_norm": 8.434679985046387, "learning_rate": 7.118497109826589e-07, "logits/chosen": -2.9768829345703125, "logits/rejected": -3.1742048263549805, "logps/chosen": -28.900453567504883, "logps/rejected": -81.95858001708984, "loss": 0.3214, "rewards/accuracies": 1.0, "rewards/chosen": 2.1142427921295166, "rewards/margins": 5.585526943206787, "rewards/rejected": -3.4712839126586914, "step": 998 }, { "epoch": 2.8811845431563743, "grad_norm": 7.231769561767578, "learning_rate": 7.115606936416184e-07, "logits/chosen": -2.93237566947937, "logits/rejected": -3.0951662063598633, "logps/chosen": -44.756874084472656, "logps/rejected": -101.85889434814453, "loss": 0.4988, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2729862332344055, "rewards/margins": 5.278562545776367, "rewards/rejected": -5.005576133728027, "step": 999 }, { "epoch": 2.884073672806067, "grad_norm": 9.003702163696289, "learning_rate": 7.11271676300578e-07, "logits/chosen": -3.0522589683532715, "logits/rejected": -3.205174207687378, "logps/chosen": -27.14014434814453, "logps/rejected": -92.10714721679688, "loss": 0.3948, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7024719715118408, "rewards/margins": 6.22304105758667, "rewards/rejected": -4.52056884765625, "step": 1000 }, { "epoch": 2.88696280245576, "grad_norm": 6.563841819763184, "learning_rate": 7.109826589595376e-07, "logits/chosen": -3.0037035942077637, "logits/rejected": -3.1603009700775146, "logps/chosen": -30.790468215942383, "logps/rejected": -94.48722076416016, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": 1.5979268550872803, "rewards/margins": 6.080195903778076, "rewards/rejected": -4.482269763946533, "step": 1001 }, { "epoch": 2.8898519321054534, "grad_norm": 7.862727165222168, "learning_rate": 7.10693641618497e-07, "logits/chosen": -3.0027570724487305, "logits/rejected": -3.1621525287628174, "logps/chosen": -33.595706939697266, "logps/rejected": -102.20893096923828, "loss": 0.4207, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0861977338790894, "rewards/margins": 6.723673343658447, "rewards/rejected": -5.637476444244385, "step": 1002 }, { "epoch": 2.8927410617551463, "grad_norm": 9.764883041381836, "learning_rate": 7.104046242774566e-07, "logits/chosen": -2.966398239135742, "logits/rejected": -3.0493626594543457, "logps/chosen": -33.202457427978516, "logps/rejected": -95.13011169433594, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.195424199104309, "rewards/margins": 6.1937079429626465, "rewards/rejected": -4.998283386230469, "step": 1003 }, { "epoch": 2.8956301914048392, "grad_norm": 7.125820636749268, "learning_rate": 7.101156069364162e-07, "logits/chosen": -2.934251070022583, "logits/rejected": -3.1436071395874023, "logps/chosen": -36.80155563354492, "logps/rejected": -109.89813995361328, "loss": 0.4064, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0289161205291748, "rewards/margins": 7.103194713592529, "rewards/rejected": -6.074278831481934, "step": 1004 }, { "epoch": 2.8985193210545326, "grad_norm": 8.97346305847168, "learning_rate": 7.098265895953757e-07, "logits/chosen": -3.081575870513916, "logits/rejected": -3.137113571166992, "logps/chosen": -31.193241119384766, "logps/rejected": -100.84976196289062, "loss": 0.3632, "rewards/accuracies": 1.0, "rewards/chosen": 1.363560676574707, "rewards/margins": 6.386486053466797, "rewards/rejected": -5.02292537689209, "step": 1005 }, { "epoch": 2.9014084507042255, "grad_norm": 6.889889717102051, "learning_rate": 7.095375722543352e-07, "logits/chosen": -3.004350423812866, "logits/rejected": -3.079406261444092, "logps/chosen": -34.84596252441406, "logps/rejected": -100.05247497558594, "loss": 0.4398, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0205285549163818, "rewards/margins": 5.92478609085083, "rewards/rejected": -4.904257774353027, "step": 1006 }, { "epoch": 2.9042975803539184, "grad_norm": 7.391822338104248, "learning_rate": 7.092485549132947e-07, "logits/chosen": -2.979118585586548, "logits/rejected": -3.1414475440979004, "logps/chosen": -29.04714584350586, "logps/rejected": -98.15115356445312, "loss": 0.3614, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8900258541107178, "rewards/margins": 6.703832626342773, "rewards/rejected": -4.813807010650635, "step": 1007 }, { "epoch": 2.9071867100036113, "grad_norm": 6.969891548156738, "learning_rate": 7.089595375722544e-07, "logits/chosen": -2.9849085807800293, "logits/rejected": -3.2181382179260254, "logps/chosen": -33.0518913269043, "logps/rejected": -87.47872161865234, "loss": 0.3675, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5340911149978638, "rewards/margins": 5.60134220123291, "rewards/rejected": -4.0672502517700195, "step": 1008 }, { "epoch": 2.910075839653304, "grad_norm": 7.127676963806152, "learning_rate": 7.086705202312138e-07, "logits/chosen": -3.050684690475464, "logits/rejected": -3.186229705810547, "logps/chosen": -35.820613861083984, "logps/rejected": -88.1971206665039, "loss": 0.4887, "rewards/accuracies": 0.90625, "rewards/chosen": 0.855420708656311, "rewards/margins": 4.668940544128418, "rewards/rejected": -3.8135199546813965, "step": 1009 }, { "epoch": 2.9129649693029975, "grad_norm": 7.249298095703125, "learning_rate": 7.083815028901734e-07, "logits/chosen": -2.9061319828033447, "logits/rejected": -3.132096290588379, "logps/chosen": -26.29736328125, "logps/rejected": -84.82160949707031, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 1.8767627477645874, "rewards/margins": 5.709851264953613, "rewards/rejected": -3.833087921142578, "step": 1010 }, { "epoch": 2.9158540989526904, "grad_norm": 7.612448692321777, "learning_rate": 7.080924855491329e-07, "logits/chosen": -3.0099642276763916, "logits/rejected": -3.096172571182251, "logps/chosen": -39.09932327270508, "logps/rejected": -99.69551086425781, "loss": 0.4692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9908605217933655, "rewards/margins": 5.753332138061523, "rewards/rejected": -4.762472152709961, "step": 1011 }, { "epoch": 2.9187432286023833, "grad_norm": 7.798447132110596, "learning_rate": 7.078034682080925e-07, "logits/chosen": -3.0096523761749268, "logits/rejected": -3.1823415756225586, "logps/chosen": -27.23872947692871, "logps/rejected": -91.10102844238281, "loss": 0.3658, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7017686367034912, "rewards/margins": 5.934868812561035, "rewards/rejected": -4.233099937438965, "step": 1012 }, { "epoch": 2.9216323582520767, "grad_norm": 6.6146039962768555, "learning_rate": 7.075144508670519e-07, "logits/chosen": -2.965531826019287, "logits/rejected": -3.188694477081299, "logps/chosen": -24.980016708374023, "logps/rejected": -82.6378402709961, "loss": 0.4583, "rewards/accuracies": 0.9375, "rewards/chosen": 2.05800724029541, "rewards/margins": 5.61596155166626, "rewards/rejected": -3.5579538345336914, "step": 1013 }, { "epoch": 2.9245214879017696, "grad_norm": 6.504480838775635, "learning_rate": 7.072254335260115e-07, "logits/chosen": -3.073784112930298, "logits/rejected": -3.1941378116607666, "logps/chosen": -41.58153533935547, "logps/rejected": -99.36056518554688, "loss": 0.4642, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6123394966125488, "rewards/margins": 5.496784210205078, "rewards/rejected": -4.884444236755371, "step": 1014 }, { "epoch": 2.9274106175514625, "grad_norm": 7.208093166351318, "learning_rate": 7.069364161849711e-07, "logits/chosen": -3.0020322799682617, "logits/rejected": -3.1302638053894043, "logps/chosen": -41.90190887451172, "logps/rejected": -94.87074279785156, "loss": 0.4692, "rewards/accuracies": 0.875, "rewards/chosen": 0.506743848323822, "rewards/margins": 5.238024711608887, "rewards/rejected": -4.731280326843262, "step": 1015 }, { "epoch": 2.930299747201156, "grad_norm": 6.106919288635254, "learning_rate": 7.066473988439306e-07, "logits/chosen": -2.9882419109344482, "logits/rejected": -3.079885721206665, "logps/chosen": -45.87771987915039, "logps/rejected": -82.35261535644531, "loss": 0.6063, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2061937004327774, "rewards/margins": 3.4453697204589844, "rewards/rejected": -3.239176034927368, "step": 1016 }, { "epoch": 2.9331888768508487, "grad_norm": 7.761849880218506, "learning_rate": 7.063583815028902e-07, "logits/chosen": -2.8944475650787354, "logits/rejected": -3.114841938018799, "logps/chosen": -37.995277404785156, "logps/rejected": -100.25733184814453, "loss": 0.5179, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8020846843719482, "rewards/margins": 5.708067893981934, "rewards/rejected": -4.905982494354248, "step": 1017 }, { "epoch": 2.9360780065005416, "grad_norm": 5.698955535888672, "learning_rate": 7.060693641618497e-07, "logits/chosen": -2.9161479473114014, "logits/rejected": -3.184178113937378, "logps/chosen": -23.856847763061523, "logps/rejected": -79.27925109863281, "loss": 0.4065, "rewards/accuracies": 0.9375, "rewards/chosen": 2.271728515625, "rewards/margins": 5.460130214691162, "rewards/rejected": -3.188401460647583, "step": 1018 }, { "epoch": 2.938967136150235, "grad_norm": 9.013781547546387, "learning_rate": 7.057803468208092e-07, "logits/chosen": -3.020265817642212, "logits/rejected": -3.242158889770508, "logps/chosen": -26.75867462158203, "logps/rejected": -88.03825378417969, "loss": 0.4588, "rewards/accuracies": 1.0, "rewards/chosen": 1.5880323648452759, "rewards/margins": 5.757523059844971, "rewards/rejected": -4.169490814208984, "step": 1019 }, { "epoch": 2.941856265799928, "grad_norm": 8.105777740478516, "learning_rate": 7.054913294797687e-07, "logits/chosen": -2.9690754413604736, "logits/rejected": -3.152019500732422, "logps/chosen": -33.47718811035156, "logps/rejected": -86.74366760253906, "loss": 0.4624, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3931517601013184, "rewards/margins": 5.145664215087891, "rewards/rejected": -3.7525126934051514, "step": 1020 }, { "epoch": 2.9447453954496208, "grad_norm": 8.588433265686035, "learning_rate": 7.052023121387283e-07, "logits/chosen": -2.8472039699554443, "logits/rejected": -3.043454647064209, "logps/chosen": -28.687864303588867, "logps/rejected": -86.7995834350586, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.7695472240447998, "rewards/margins": 5.846602916717529, "rewards/rejected": -4.07705545425415, "step": 1021 }, { "epoch": 2.9476345250993137, "grad_norm": 7.417272567749023, "learning_rate": 7.049132947976878e-07, "logits/chosen": -3.0492210388183594, "logits/rejected": -3.2008795738220215, "logps/chosen": -37.596744537353516, "logps/rejected": -89.0696029663086, "loss": 0.4916, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6343592405319214, "rewards/margins": 4.996713161468506, "rewards/rejected": -4.362353801727295, "step": 1022 }, { "epoch": 2.950523654749007, "grad_norm": 8.477234840393066, "learning_rate": 7.046242774566473e-07, "logits/chosen": -3.0347230434417725, "logits/rejected": -3.1314899921417236, "logps/chosen": -30.811559677124023, "logps/rejected": -84.12635040283203, "loss": 0.5302, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3837698698043823, "rewards/margins": 4.832367897033691, "rewards/rejected": -3.4485979080200195, "step": 1023 }, { "epoch": 2.9534127843987, "grad_norm": 9.324573516845703, "learning_rate": 7.04335260115607e-07, "logits/chosen": -2.9415581226348877, "logits/rejected": -3.0586178302764893, "logps/chosen": -29.619009017944336, "logps/rejected": -77.6225814819336, "loss": 0.5558, "rewards/accuracies": 0.96875, "rewards/chosen": 1.437255859375, "rewards/margins": 4.323945045471191, "rewards/rejected": -2.8866889476776123, "step": 1024 }, { "epoch": 2.956301914048393, "grad_norm": 4.850113391876221, "learning_rate": 7.040462427745665e-07, "logits/chosen": -2.8678085803985596, "logits/rejected": -3.0624277591705322, "logps/chosen": -28.601064682006836, "logps/rejected": -88.92808532714844, "loss": 0.3776, "rewards/accuracies": 1.0, "rewards/chosen": 1.8295334577560425, "rewards/margins": 5.805248737335205, "rewards/rejected": -3.975715398788452, "step": 1025 }, { "epoch": 2.9591910436980857, "grad_norm": 9.754194259643555, "learning_rate": 7.03757225433526e-07, "logits/chosen": -3.002737045288086, "logits/rejected": -3.238125801086426, "logps/chosen": -31.169639587402344, "logps/rejected": -93.53041076660156, "loss": 0.4552, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4515525102615356, "rewards/margins": 5.856266498565674, "rewards/rejected": -4.404714584350586, "step": 1026 }, { "epoch": 2.962080173347779, "grad_norm": 7.573886394500732, "learning_rate": 7.034682080924855e-07, "logits/chosen": -2.85725998878479, "logits/rejected": -3.054041624069214, "logps/chosen": -29.749374389648438, "logps/rejected": -95.60755157470703, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 1.647189974784851, "rewards/margins": 6.142976760864258, "rewards/rejected": -4.495786666870117, "step": 1027 }, { "epoch": 2.964969302997472, "grad_norm": 6.048768997192383, "learning_rate": 7.031791907514451e-07, "logits/chosen": -2.9894464015960693, "logits/rejected": -3.1431751251220703, "logps/chosen": -41.880775451660156, "logps/rejected": -87.70603942871094, "loss": 0.5445, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4867055118083954, "rewards/margins": 4.497745513916016, "rewards/rejected": -4.011039733886719, "step": 1028 }, { "epoch": 2.967858432647165, "grad_norm": 6.609114170074463, "learning_rate": 7.028901734104046e-07, "logits/chosen": -2.998081922531128, "logits/rejected": -3.2230031490325928, "logps/chosen": -38.78182601928711, "logps/rejected": -98.59590148925781, "loss": 0.4427, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6216474771499634, "rewards/margins": 5.585301876068115, "rewards/rejected": -4.963654518127441, "step": 1029 }, { "epoch": 2.970747562296858, "grad_norm": 9.69709300994873, "learning_rate": 7.026011560693641e-07, "logits/chosen": -2.8818247318267822, "logits/rejected": -3.100083827972412, "logps/chosen": -22.995685577392578, "logps/rejected": -67.4728775024414, "loss": 0.5102, "rewards/accuracies": 0.90625, "rewards/chosen": 2.02713942527771, "rewards/margins": 4.008665561676025, "rewards/rejected": -1.9815263748168945, "step": 1030 }, { "epoch": 2.973636691946551, "grad_norm": 6.237269878387451, "learning_rate": 7.023121387283236e-07, "logits/chosen": -2.8773353099823, "logits/rejected": -3.0706400871276855, "logps/chosen": -27.513492584228516, "logps/rejected": -88.53453826904297, "loss": 0.4615, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5712566375732422, "rewards/margins": 5.5266523361206055, "rewards/rejected": -3.9553959369659424, "step": 1031 }, { "epoch": 2.976525821596244, "grad_norm": 7.080986499786377, "learning_rate": 7.020231213872833e-07, "logits/chosen": -2.941112518310547, "logits/rejected": -3.1891720294952393, "logps/chosen": -30.901912689208984, "logps/rejected": -83.39530944824219, "loss": 0.4076, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4617303609848022, "rewards/margins": 5.151494026184082, "rewards/rejected": -3.689763307571411, "step": 1032 }, { "epoch": 2.9794149512459374, "grad_norm": 7.5630903244018555, "learning_rate": 7.017341040462427e-07, "logits/chosen": -2.8675365447998047, "logits/rejected": -3.076551914215088, "logps/chosen": -31.64057159423828, "logps/rejected": -87.51078796386719, "loss": 0.4415, "rewards/accuracies": 1.0, "rewards/chosen": 1.3834863901138306, "rewards/margins": 5.272146224975586, "rewards/rejected": -3.888659715652466, "step": 1033 }, { "epoch": 2.9823040808956303, "grad_norm": 7.7480244636535645, "learning_rate": 7.014450867052023e-07, "logits/chosen": -2.9201412200927734, "logits/rejected": -3.112175703048706, "logps/chosen": -27.305179595947266, "logps/rejected": -82.00569152832031, "loss": 0.4872, "rewards/accuracies": 1.0, "rewards/chosen": 1.5777117013931274, "rewards/margins": 4.900051116943359, "rewards/rejected": -3.3223397731781006, "step": 1034 }, { "epoch": 2.985193210545323, "grad_norm": 6.825854301452637, "learning_rate": 7.011560693641619e-07, "logits/chosen": -2.9200432300567627, "logits/rejected": -3.1319220066070557, "logps/chosen": -33.829856872558594, "logps/rejected": -80.49784851074219, "loss": 0.4872, "rewards/accuracies": 1.0, "rewards/chosen": 0.9606227278709412, "rewards/margins": 4.266664505004883, "rewards/rejected": -3.306041717529297, "step": 1035 }, { "epoch": 2.9880823401950165, "grad_norm": 8.178459167480469, "learning_rate": 7.008670520231213e-07, "logits/chosen": -3.0565972328186035, "logits/rejected": -3.2282838821411133, "logps/chosen": -39.995361328125, "logps/rejected": -85.59381103515625, "loss": 0.5778, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7172827124595642, "rewards/margins": 4.378743648529053, "rewards/rejected": -3.6614603996276855, "step": 1036 }, { "epoch": 2.9909714698447094, "grad_norm": 7.36652135848999, "learning_rate": 7.005780346820809e-07, "logits/chosen": -2.9130337238311768, "logits/rejected": -3.025453567504883, "logps/chosen": -43.87529373168945, "logps/rejected": -103.9572982788086, "loss": 0.4657, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5232289433479309, "rewards/margins": 5.772222518920898, "rewards/rejected": -5.248992919921875, "step": 1037 }, { "epoch": 2.9938605994944023, "grad_norm": 8.19088363647461, "learning_rate": 7.002890173410404e-07, "logits/chosen": -2.930388927459717, "logits/rejected": -3.1662468910217285, "logps/chosen": -41.28852081298828, "logps/rejected": -92.22834777832031, "loss": 0.4651, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7839375734329224, "rewards/margins": 5.271246910095215, "rewards/rejected": -4.487308979034424, "step": 1038 }, { "epoch": 2.996749729144095, "grad_norm": 7.356391906738281, "learning_rate": 7e-07, "logits/chosen": -2.9074549674987793, "logits/rejected": -3.078335762023926, "logps/chosen": -27.292945861816406, "logps/rejected": -98.02582550048828, "loss": 0.3199, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8868821859359741, "rewards/margins": 6.937186241149902, "rewards/rejected": -5.050304412841797, "step": 1039 }, { "epoch": 2.999638858793788, "grad_norm": 8.34415054321289, "learning_rate": 6.997109826589595e-07, "logits/chosen": -3.049859046936035, "logits/rejected": -3.135390281677246, "logps/chosen": -34.296085357666016, "logps/rejected": -94.37155151367188, "loss": 0.4387, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0943411588668823, "rewards/margins": 5.856232166290283, "rewards/rejected": -4.761890888214111, "step": 1040 }, { "epoch": 3.0, "grad_norm": 2.9997916221618652, "learning_rate": 6.994219653179191e-07, "logits/chosen": -2.895270347595215, "logits/rejected": -3.117094039916992, "logps/chosen": -18.722986221313477, "logps/rejected": -56.060123443603516, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 2.0221729278564453, "rewards/margins": 3.4505045413970947, "rewards/rejected": -1.4283316135406494, "step": 1041 }, { "epoch": 3.002889129649693, "grad_norm": 8.655268669128418, "learning_rate": 6.991329479768786e-07, "logits/chosen": -2.974208354949951, "logits/rejected": -3.171431064605713, "logps/chosen": -27.087650299072266, "logps/rejected": -83.74198913574219, "loss": 0.3738, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0352933406829834, "rewards/margins": 5.61800479888916, "rewards/rejected": -3.582711935043335, "step": 1042 }, { "epoch": 3.0057782592993862, "grad_norm": 6.514278888702393, "learning_rate": 6.988439306358381e-07, "logits/chosen": -2.971078395843506, "logits/rejected": -3.1878252029418945, "logps/chosen": -30.764890670776367, "logps/rejected": -98.73310089111328, "loss": 0.3765, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5683114528656006, "rewards/margins": 6.4997992515563965, "rewards/rejected": -4.931487560272217, "step": 1043 }, { "epoch": 3.008667388949079, "grad_norm": 9.383719444274902, "learning_rate": 6.985549132947976e-07, "logits/chosen": -3.057041645050049, "logits/rejected": -3.154763698577881, "logps/chosen": -37.32014846801758, "logps/rejected": -90.00489807128906, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 0.6144899129867554, "rewards/margins": 4.975955963134766, "rewards/rejected": -4.361466884613037, "step": 1044 }, { "epoch": 3.011556518598772, "grad_norm": 7.166605472564697, "learning_rate": 6.982658959537572e-07, "logits/chosen": -2.9758615493774414, "logits/rejected": -3.115218162536621, "logps/chosen": -29.957761764526367, "logps/rejected": -93.27092742919922, "loss": 0.3329, "rewards/accuracies": 1.0, "rewards/chosen": 1.7791714668273926, "rewards/margins": 5.886290073394775, "rewards/rejected": -4.107118606567383, "step": 1045 }, { "epoch": 3.014445648248465, "grad_norm": 8.198643684387207, "learning_rate": 6.979768786127168e-07, "logits/chosen": -3.077780246734619, "logits/rejected": -3.1344175338745117, "logps/chosen": -34.81055450439453, "logps/rejected": -81.3553466796875, "loss": 0.5115, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8423600196838379, "rewards/margins": 4.460418224334717, "rewards/rejected": -3.6180579662323, "step": 1046 }, { "epoch": 3.0173347778981583, "grad_norm": 8.679461479187012, "learning_rate": 6.976878612716762e-07, "logits/chosen": -2.9568142890930176, "logits/rejected": -3.1431164741516113, "logps/chosen": -30.176185607910156, "logps/rejected": -88.20356750488281, "loss": 0.4798, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6856179237365723, "rewards/margins": 5.735868453979492, "rewards/rejected": -4.050250053405762, "step": 1047 }, { "epoch": 3.020223907547851, "grad_norm": 6.3697285652160645, "learning_rate": 6.973988439306359e-07, "logits/chosen": -2.9558990001678467, "logits/rejected": -3.1407203674316406, "logps/chosen": -35.82304000854492, "logps/rejected": -88.47881317138672, "loss": 0.4479, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7212896347045898, "rewards/margins": 4.8312482833862305, "rewards/rejected": -4.109959125518799, "step": 1048 }, { "epoch": 3.023113037197544, "grad_norm": 6.3477325439453125, "learning_rate": 6.971098265895954e-07, "logits/chosen": -3.0067973136901855, "logits/rejected": -3.190176486968994, "logps/chosen": -38.129966735839844, "logps/rejected": -101.04280853271484, "loss": 0.411, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7362529039382935, "rewards/margins": 6.210220813751221, "rewards/rejected": -5.473967552185059, "step": 1049 }, { "epoch": 3.0260021668472374, "grad_norm": 6.971385955810547, "learning_rate": 6.968208092485549e-07, "logits/chosen": -2.9164326190948486, "logits/rejected": -3.0994160175323486, "logps/chosen": -33.56732177734375, "logps/rejected": -86.32809448242188, "loss": 0.3704, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2781051397323608, "rewards/margins": 5.0268049240112305, "rewards/rejected": -3.74869966506958, "step": 1050 }, { "epoch": 3.0260021668472374, "eval_logits/chosen": -3.0305984020233154, "eval_logits/rejected": -3.206969738006592, "eval_logps/chosen": -38.988983154296875, "eval_logps/rejected": -97.94397735595703, "eval_loss": 0.5022231936454773, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.7327435612678528, "eval_rewards/margins": 5.832122802734375, "eval_rewards/rejected": -5.099380016326904, "eval_runtime": 245.1279, "eval_samples_per_second": 0.502, "eval_steps_per_second": 0.253, "step": 1050 }, { "epoch": 3.0288912964969303, "grad_norm": 5.477357864379883, "learning_rate": 6.965317919075144e-07, "logits/chosen": -2.8838160037994385, "logits/rejected": -3.0979456901550293, "logps/chosen": -34.802490234375, "logps/rejected": -107.08964538574219, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 0.9891892671585083, "rewards/margins": 6.6325883865356445, "rewards/rejected": -5.643399238586426, "step": 1051 }, { "epoch": 3.0317804261466232, "grad_norm": 6.079814910888672, "learning_rate": 6.96242774566474e-07, "logits/chosen": -3.003931999206543, "logits/rejected": -3.1560134887695312, "logps/chosen": -31.343353271484375, "logps/rejected": -100.36822509765625, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": 1.5658323764801025, "rewards/margins": 6.87000846862793, "rewards/rejected": -5.304176330566406, "step": 1052 }, { "epoch": 3.034669555796316, "grad_norm": 6.608917236328125, "learning_rate": 6.959537572254334e-07, "logits/chosen": -2.9818522930145264, "logits/rejected": -3.2284491062164307, "logps/chosen": -34.5339241027832, "logps/rejected": -97.55612182617188, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.0514225959777832, "rewards/margins": 6.124515533447266, "rewards/rejected": -5.073092460632324, "step": 1053 }, { "epoch": 3.0375586854460095, "grad_norm": 7.277270793914795, "learning_rate": 6.95664739884393e-07, "logits/chosen": -2.9681272506713867, "logits/rejected": -3.1319022178649902, "logps/chosen": -34.49784851074219, "logps/rejected": -92.55354309082031, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 1.2808287143707275, "rewards/margins": 5.707004070281982, "rewards/rejected": -4.426175594329834, "step": 1054 }, { "epoch": 3.0404478150957024, "grad_norm": 6.783650875091553, "learning_rate": 6.953757225433525e-07, "logits/chosen": -3.0414483547210693, "logits/rejected": -3.11830472946167, "logps/chosen": -37.86640167236328, "logps/rejected": -90.9355239868164, "loss": 0.3693, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9451969265937805, "rewards/margins": 5.180409908294678, "rewards/rejected": -4.235212802886963, "step": 1055 }, { "epoch": 3.0433369447453953, "grad_norm": 7.432433605194092, "learning_rate": 6.950867052023122e-07, "logits/chosen": -2.9505841732025146, "logits/rejected": -3.046408176422119, "logps/chosen": -27.37981414794922, "logps/rejected": -88.54680633544922, "loss": 0.4433, "rewards/accuracies": 0.90625, "rewards/chosen": 1.639890193939209, "rewards/margins": 5.521781921386719, "rewards/rejected": -3.8818914890289307, "step": 1056 }, { "epoch": 3.0462260743950886, "grad_norm": 8.561920166015625, "learning_rate": 6.947976878612717e-07, "logits/chosen": -2.9182682037353516, "logits/rejected": -3.0659267902374268, "logps/chosen": -38.31745910644531, "logps/rejected": -97.90846252441406, "loss": 0.4518, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9247487187385559, "rewards/margins": 5.703170299530029, "rewards/rejected": -4.778421878814697, "step": 1057 }, { "epoch": 3.0491152040447815, "grad_norm": 6.001120567321777, "learning_rate": 6.945086705202312e-07, "logits/chosen": -2.8797500133514404, "logits/rejected": -3.136345624923706, "logps/chosen": -32.917240142822266, "logps/rejected": -94.13410949707031, "loss": 0.4071, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1177433729171753, "rewards/margins": 5.88363790512085, "rewards/rejected": -4.765894889831543, "step": 1058 }, { "epoch": 3.0520043336944744, "grad_norm": 6.361936092376709, "learning_rate": 6.942196531791908e-07, "logits/chosen": -2.9453089237213135, "logits/rejected": -3.0798938274383545, "logps/chosen": -28.376466751098633, "logps/rejected": -101.84860229492188, "loss": 0.3403, "rewards/accuracies": 1.0, "rewards/chosen": 1.6734530925750732, "rewards/margins": 7.222127914428711, "rewards/rejected": -5.548674583435059, "step": 1059 }, { "epoch": 3.054893463344168, "grad_norm": 8.247048377990723, "learning_rate": 6.939306358381502e-07, "logits/chosen": -2.9600460529327393, "logits/rejected": -3.164147138595581, "logps/chosen": -28.877357482910156, "logps/rejected": -90.536376953125, "loss": 0.4731, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5438882112503052, "rewards/margins": 5.877116680145264, "rewards/rejected": -4.333227634429932, "step": 1060 }, { "epoch": 3.0577825929938607, "grad_norm": 8.956374168395996, "learning_rate": 6.936416184971098e-07, "logits/chosen": -3.0750350952148438, "logits/rejected": -3.2241721153259277, "logps/chosen": -35.3936882019043, "logps/rejected": -84.90377044677734, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": 1.218109130859375, "rewards/margins": 5.184865951538086, "rewards/rejected": -3.966756820678711, "step": 1061 }, { "epoch": 3.0606717226435536, "grad_norm": 7.976274013519287, "learning_rate": 6.933526011560693e-07, "logits/chosen": -2.7246408462524414, "logits/rejected": -2.9675967693328857, "logps/chosen": -31.08050537109375, "logps/rejected": -94.33982849121094, "loss": 0.4678, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5400222539901733, "rewards/margins": 5.851654052734375, "rewards/rejected": -4.311631679534912, "step": 1062 }, { "epoch": 3.0635608522932465, "grad_norm": 8.405101776123047, "learning_rate": 6.930635838150289e-07, "logits/chosen": -3.039780616760254, "logits/rejected": -3.2694220542907715, "logps/chosen": -28.359289169311523, "logps/rejected": -101.5490951538086, "loss": 0.2925, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7516313791275024, "rewards/margins": 6.899847507476807, "rewards/rejected": -5.148216247558594, "step": 1063 }, { "epoch": 3.06644998194294, "grad_norm": 8.844460487365723, "learning_rate": 6.927745664739883e-07, "logits/chosen": -2.965789318084717, "logits/rejected": -3.135026693344116, "logps/chosen": -20.84684944152832, "logps/rejected": -73.15711212158203, "loss": 0.3729, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2263271808624268, "rewards/margins": 4.961186408996582, "rewards/rejected": -2.7348592281341553, "step": 1064 }, { "epoch": 3.0693391115926327, "grad_norm": 8.659218788146973, "learning_rate": 6.92485549132948e-07, "logits/chosen": -2.993798017501831, "logits/rejected": -3.130769968032837, "logps/chosen": -38.16000747680664, "logps/rejected": -94.70951080322266, "loss": 0.4462, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8559349775314331, "rewards/margins": 5.8384294509887695, "rewards/rejected": -4.982494354248047, "step": 1065 }, { "epoch": 3.0722282412423256, "grad_norm": 6.341363906860352, "learning_rate": 6.921965317919075e-07, "logits/chosen": -3.0478529930114746, "logits/rejected": -3.164508819580078, "logps/chosen": -42.08983612060547, "logps/rejected": -93.62057495117188, "loss": 0.5214, "rewards/accuracies": 0.90625, "rewards/chosen": 0.051178961992263794, "rewards/margins": 4.758386135101318, "rewards/rejected": -4.707207679748535, "step": 1066 }, { "epoch": 3.075117370892019, "grad_norm": 7.522541522979736, "learning_rate": 6.91907514450867e-07, "logits/chosen": -2.9029083251953125, "logits/rejected": -3.0753979682922363, "logps/chosen": -30.84494400024414, "logps/rejected": -88.1226806640625, "loss": 0.374, "rewards/accuracies": 1.0, "rewards/chosen": 1.5337116718292236, "rewards/margins": 5.806659698486328, "rewards/rejected": -4.272948265075684, "step": 1067 }, { "epoch": 3.078006500541712, "grad_norm": 7.853333473205566, "learning_rate": 6.916184971098266e-07, "logits/chosen": -2.9563374519348145, "logits/rejected": -3.138817071914673, "logps/chosen": -44.37503433227539, "logps/rejected": -95.14920043945312, "loss": 0.5567, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04226207733154297, "rewards/margins": 5.023468017578125, "rewards/rejected": -4.981205940246582, "step": 1068 }, { "epoch": 3.0808956301914048, "grad_norm": 6.153560638427734, "learning_rate": 6.913294797687861e-07, "logits/chosen": -2.946904420852661, "logits/rejected": -3.034534454345703, "logps/chosen": -44.831058502197266, "logps/rejected": -98.69902038574219, "loss": 0.5552, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10717444121837616, "rewards/margins": 5.1640143394470215, "rewards/rejected": -5.056840419769287, "step": 1069 }, { "epoch": 3.0837847598410977, "grad_norm": 7.036251068115234, "learning_rate": 6.910404624277456e-07, "logits/chosen": -2.976745128631592, "logits/rejected": -3.1901931762695312, "logps/chosen": -38.06911087036133, "logps/rejected": -93.66549682617188, "loss": 0.4034, "rewards/accuracies": 0.90625, "rewards/chosen": 0.778319239616394, "rewards/margins": 5.544676303863525, "rewards/rejected": -4.766357421875, "step": 1070 }, { "epoch": 3.086673889490791, "grad_norm": 9.462154388427734, "learning_rate": 6.907514450867051e-07, "logits/chosen": -3.003833055496216, "logits/rejected": -3.1002135276794434, "logps/chosen": -40.39048767089844, "logps/rejected": -94.95343017578125, "loss": 0.4965, "rewards/accuracies": 0.875, "rewards/chosen": 0.5655775666236877, "rewards/margins": 5.31961727142334, "rewards/rejected": -4.754039764404297, "step": 1071 }, { "epoch": 3.089563019140484, "grad_norm": 6.6296162605285645, "learning_rate": 6.904624277456647e-07, "logits/chosen": -2.8813915252685547, "logits/rejected": -3.053690195083618, "logps/chosen": -29.026546478271484, "logps/rejected": -91.16659545898438, "loss": 0.4184, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7098641395568848, "rewards/margins": 5.940976142883301, "rewards/rejected": -4.231112003326416, "step": 1072 }, { "epoch": 3.092452148790177, "grad_norm": 5.40433406829834, "learning_rate": 6.901734104046243e-07, "logits/chosen": -3.0442614555358887, "logits/rejected": -3.1720218658447266, "logps/chosen": -41.3781623840332, "logps/rejected": -90.4642562866211, "loss": 0.5214, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6625763177871704, "rewards/margins": 5.018177032470703, "rewards/rejected": -4.355600833892822, "step": 1073 }, { "epoch": 3.09534127843987, "grad_norm": 8.318293571472168, "learning_rate": 6.898843930635838e-07, "logits/chosen": -2.970888614654541, "logits/rejected": -3.135284900665283, "logps/chosen": -27.445615768432617, "logps/rejected": -85.32743835449219, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 1.5959079265594482, "rewards/margins": 5.25495719909668, "rewards/rejected": -3.6590487957000732, "step": 1074 }, { "epoch": 3.098230408089563, "grad_norm": 10.339656829833984, "learning_rate": 6.895953757225433e-07, "logits/chosen": -2.933354377746582, "logits/rejected": -3.1391377449035645, "logps/chosen": -38.560787200927734, "logps/rejected": -99.60823059082031, "loss": 0.4356, "rewards/accuracies": 0.96875, "rewards/chosen": 0.747337818145752, "rewards/margins": 5.9548797607421875, "rewards/rejected": -5.2075419425964355, "step": 1075 }, { "epoch": 3.101119537739256, "grad_norm": 5.9685211181640625, "learning_rate": 6.893063583815029e-07, "logits/chosen": -2.858339309692383, "logits/rejected": -3.060328722000122, "logps/chosen": -30.956892013549805, "logps/rejected": -84.02294158935547, "loss": 0.4034, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6619188785552979, "rewards/margins": 5.37868595123291, "rewards/rejected": -3.7167673110961914, "step": 1076 }, { "epoch": 3.104008667388949, "grad_norm": 7.655157566070557, "learning_rate": 6.890173410404623e-07, "logits/chosen": -3.0240516662597656, "logits/rejected": -3.135807991027832, "logps/chosen": -22.794921875, "logps/rejected": -83.76409149169922, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 2.229602575302124, "rewards/margins": 5.914067268371582, "rewards/rejected": -3.684464693069458, "step": 1077 }, { "epoch": 3.106897797038642, "grad_norm": 6.901151180267334, "learning_rate": 6.887283236994219e-07, "logits/chosen": -2.9708807468414307, "logits/rejected": -3.0543079376220703, "logps/chosen": -29.280481338500977, "logps/rejected": -90.233642578125, "loss": 0.3997, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5385187864303589, "rewards/margins": 6.109985828399658, "rewards/rejected": -4.571466445922852, "step": 1078 }, { "epoch": 3.109786926688335, "grad_norm": 8.929713249206543, "learning_rate": 6.884393063583815e-07, "logits/chosen": -3.074685573577881, "logits/rejected": -3.1943225860595703, "logps/chosen": -27.738689422607422, "logps/rejected": -80.46551513671875, "loss": 0.4056, "rewards/accuracies": 0.9375, "rewards/chosen": 1.754768967628479, "rewards/margins": 4.910357475280762, "rewards/rejected": -3.1555891036987305, "step": 1079 }, { "epoch": 3.112676056338028, "grad_norm": 7.9555559158325195, "learning_rate": 6.88150289017341e-07, "logits/chosen": -2.944380521774292, "logits/rejected": -3.1063191890716553, "logps/chosen": -30.697980880737305, "logps/rejected": -87.4502182006836, "loss": 0.4261, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6087119579315186, "rewards/margins": 5.837274074554443, "rewards/rejected": -4.228562355041504, "step": 1080 }, { "epoch": 3.1155651859877214, "grad_norm": 7.3632073402404785, "learning_rate": 6.878612716763006e-07, "logits/chosen": -2.8981494903564453, "logits/rejected": -3.0189738273620605, "logps/chosen": -29.446853637695312, "logps/rejected": -84.26390838623047, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": 1.4438164234161377, "rewards/margins": 5.103682041168213, "rewards/rejected": -3.659864902496338, "step": 1081 }, { "epoch": 3.1184543156374143, "grad_norm": 7.137562274932861, "learning_rate": 6.875722543352601e-07, "logits/chosen": -2.917050361633301, "logits/rejected": -3.0424556732177734, "logps/chosen": -31.74191665649414, "logps/rejected": -87.6261978149414, "loss": 0.3845, "rewards/accuracies": 0.96875, "rewards/chosen": 1.549453854560852, "rewards/margins": 5.572031497955322, "rewards/rejected": -4.022578239440918, "step": 1082 }, { "epoch": 3.121343445287107, "grad_norm": 8.254206657409668, "learning_rate": 6.872832369942197e-07, "logits/chosen": -2.861593723297119, "logits/rejected": -3.1333718299865723, "logps/chosen": -33.14597702026367, "logps/rejected": -80.67408752441406, "loss": 0.4025, "rewards/accuracies": 0.96875, "rewards/chosen": 1.782370686531067, "rewards/margins": 4.967679023742676, "rewards/rejected": -3.1853082180023193, "step": 1083 }, { "epoch": 3.1242325749368005, "grad_norm": 7.290658473968506, "learning_rate": 6.869942196531791e-07, "logits/chosen": -2.9665138721466064, "logits/rejected": -3.171579599380493, "logps/chosen": -26.444459915161133, "logps/rejected": -95.45358276367188, "loss": 0.3176, "rewards/accuracies": 1.0, "rewards/chosen": 1.9338639974594116, "rewards/margins": 6.465426921844482, "rewards/rejected": -4.5315632820129395, "step": 1084 }, { "epoch": 3.1271217045864934, "grad_norm": 6.43713903427124, "learning_rate": 6.867052023121387e-07, "logits/chosen": -2.9241256713867188, "logits/rejected": -3.0775997638702393, "logps/chosen": -43.471927642822266, "logps/rejected": -102.45735931396484, "loss": 0.4616, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5207817554473877, "rewards/margins": 5.718257427215576, "rewards/rejected": -5.197474956512451, "step": 1085 }, { "epoch": 3.1300108342361863, "grad_norm": 9.743998527526855, "learning_rate": 6.864161849710982e-07, "logits/chosen": -3.0328969955444336, "logits/rejected": -3.1600215435028076, "logps/chosen": -35.53805160522461, "logps/rejected": -90.5384521484375, "loss": 0.4496, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0129778385162354, "rewards/margins": 5.230287075042725, "rewards/rejected": -4.21730899810791, "step": 1086 }, { "epoch": 3.132899963885879, "grad_norm": 7.32438850402832, "learning_rate": 6.861271676300577e-07, "logits/chosen": -2.951223373413086, "logits/rejected": -3.1977109909057617, "logps/chosen": -35.711360931396484, "logps/rejected": -104.45552825927734, "loss": 0.3669, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8323777914047241, "rewards/margins": 6.448084354400635, "rewards/rejected": -5.615706920623779, "step": 1087 }, { "epoch": 3.1357890935355726, "grad_norm": 7.74755334854126, "learning_rate": 6.858381502890172e-07, "logits/chosen": -3.0631957054138184, "logits/rejected": -3.244495153427124, "logps/chosen": -34.85624313354492, "logps/rejected": -104.20152282714844, "loss": 0.4046, "rewards/accuracies": 1.0, "rewards/chosen": 1.1851894855499268, "rewards/margins": 6.6071672439575195, "rewards/rejected": -5.421977519989014, "step": 1088 }, { "epoch": 3.1386782231852655, "grad_norm": 9.056041717529297, "learning_rate": 6.855491329479769e-07, "logits/chosen": -3.012695074081421, "logits/rejected": -3.186544418334961, "logps/chosen": -35.6223258972168, "logps/rejected": -92.57254028320312, "loss": 0.4111, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0121158361434937, "rewards/margins": 5.399301052093506, "rewards/rejected": -4.387185096740723, "step": 1089 }, { "epoch": 3.1415673528349584, "grad_norm": 7.623884201049805, "learning_rate": 6.852601156069365e-07, "logits/chosen": -2.875335216522217, "logits/rejected": -2.9853134155273438, "logps/chosen": -23.160188674926758, "logps/rejected": -74.96711730957031, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 2.1813008785247803, "rewards/margins": 5.177597522735596, "rewards/rejected": -2.9962961673736572, "step": 1090 }, { "epoch": 3.1444564824846517, "grad_norm": 9.609661102294922, "learning_rate": 6.849710982658959e-07, "logits/chosen": -2.959489345550537, "logits/rejected": -3.0486702919006348, "logps/chosen": -40.96274948120117, "logps/rejected": -99.3431167602539, "loss": 0.4716, "rewards/accuracies": 1.0, "rewards/chosen": 0.5464834570884705, "rewards/margins": 5.641820907592773, "rewards/rejected": -5.095337867736816, "step": 1091 }, { "epoch": 3.1473456121343446, "grad_norm": 8.085561752319336, "learning_rate": 6.846820809248555e-07, "logits/chosen": -3.0260846614837646, "logits/rejected": -3.1694717407226562, "logps/chosen": -33.904884338378906, "logps/rejected": -101.14601135253906, "loss": 0.3963, "rewards/accuracies": 0.96875, "rewards/chosen": 0.922608494758606, "rewards/margins": 6.107451438903809, "rewards/rejected": -5.184843063354492, "step": 1092 }, { "epoch": 3.1502347417840375, "grad_norm": 7.7593770027160645, "learning_rate": 6.84393063583815e-07, "logits/chosen": -2.934476852416992, "logits/rejected": -3.095773220062256, "logps/chosen": -28.458663940429688, "logps/rejected": -86.18451690673828, "loss": 0.4298, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5746420621871948, "rewards/margins": 5.603529453277588, "rewards/rejected": -4.028887748718262, "step": 1093 }, { "epoch": 3.1531238714337304, "grad_norm": 9.052186965942383, "learning_rate": 6.841040462427745e-07, "logits/chosen": -2.9732635021209717, "logits/rejected": -3.077697515487671, "logps/chosen": -28.912277221679688, "logps/rejected": -78.67254638671875, "loss": 0.4556, "rewards/accuracies": 1.0, "rewards/chosen": 1.5962488651275635, "rewards/margins": 4.8146748542785645, "rewards/rejected": -3.2184255123138428, "step": 1094 }, { "epoch": 3.1560130010834238, "grad_norm": 6.592229843139648, "learning_rate": 6.83815028901734e-07, "logits/chosen": -2.94498872756958, "logits/rejected": -2.9999566078186035, "logps/chosen": -33.36788558959961, "logps/rejected": -84.7956771850586, "loss": 0.4797, "rewards/accuracies": 0.875, "rewards/chosen": 1.5704693794250488, "rewards/margins": 4.542809963226318, "rewards/rejected": -2.9723401069641113, "step": 1095 }, { "epoch": 3.1589021307331167, "grad_norm": 8.487543106079102, "learning_rate": 6.835260115606936e-07, "logits/chosen": -2.7923591136932373, "logits/rejected": -2.9640860557556152, "logps/chosen": -25.683500289916992, "logps/rejected": -77.66715240478516, "loss": 0.3777, "rewards/accuracies": 1.0, "rewards/chosen": 2.2808289527893066, "rewards/margins": 5.041024208068848, "rewards/rejected": -2.760195255279541, "step": 1096 }, { "epoch": 3.1617912603828096, "grad_norm": 7.340328216552734, "learning_rate": 6.832369942196532e-07, "logits/chosen": -2.915348529815674, "logits/rejected": -3.131683588027954, "logps/chosen": -26.320205688476562, "logps/rejected": -76.3829345703125, "loss": 0.3631, "rewards/accuracies": 0.90625, "rewards/chosen": 1.786217451095581, "rewards/margins": 4.983068943023682, "rewards/rejected": -3.1968512535095215, "step": 1097 }, { "epoch": 3.164680390032503, "grad_norm": 8.632430076599121, "learning_rate": 6.829479768786127e-07, "logits/chosen": -2.890244245529175, "logits/rejected": -3.1248505115509033, "logps/chosen": -26.353418350219727, "logps/rejected": -82.48915100097656, "loss": 0.3787, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9747728109359741, "rewards/margins": 5.576591491699219, "rewards/rejected": -3.601818561553955, "step": 1098 }, { "epoch": 3.167569519682196, "grad_norm": 5.0771026611328125, "learning_rate": 6.826589595375722e-07, "logits/chosen": -2.9520790576934814, "logits/rejected": -3.1694343090057373, "logps/chosen": -27.049827575683594, "logps/rejected": -94.18456268310547, "loss": 0.3523, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7833614349365234, "rewards/margins": 6.295957088470459, "rewards/rejected": -4.5125956535339355, "step": 1099 }, { "epoch": 3.1704586493318887, "grad_norm": 6.461648941040039, "learning_rate": 6.823699421965318e-07, "logits/chosen": -2.871774196624756, "logits/rejected": -3.0903372764587402, "logps/chosen": -40.35097885131836, "logps/rejected": -104.37425994873047, "loss": 0.414, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5787716507911682, "rewards/margins": 6.066462516784668, "rewards/rejected": -5.4876909255981445, "step": 1100 }, { "epoch": 3.1733477789815816, "grad_norm": 5.984121322631836, "learning_rate": 6.820809248554913e-07, "logits/chosen": -2.9864375591278076, "logits/rejected": -3.173529863357544, "logps/chosen": -36.60149002075195, "logps/rejected": -93.95584106445312, "loss": 0.3867, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9820646047592163, "rewards/margins": 5.586619853973389, "rewards/rejected": -4.604555130004883, "step": 1101 }, { "epoch": 3.176236908631275, "grad_norm": 7.2152509689331055, "learning_rate": 6.817919075144508e-07, "logits/chosen": -2.917901039123535, "logits/rejected": -3.075770854949951, "logps/chosen": -28.76615333557129, "logps/rejected": -94.69629669189453, "loss": 0.3647, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7407971620559692, "rewards/margins": 6.20065450668335, "rewards/rejected": -4.4598565101623535, "step": 1102 }, { "epoch": 3.179126038280968, "grad_norm": 7.901645660400391, "learning_rate": 6.815028901734104e-07, "logits/chosen": -2.931480884552002, "logits/rejected": -3.1380209922790527, "logps/chosen": -29.15249252319336, "logps/rejected": -93.39216613769531, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 1.9268407821655273, "rewards/margins": 6.174460411071777, "rewards/rejected": -4.247620105743408, "step": 1103 }, { "epoch": 3.1820151679306607, "grad_norm": 9.362436294555664, "learning_rate": 6.812138728323698e-07, "logits/chosen": -2.9723727703094482, "logits/rejected": -3.0341854095458984, "logps/chosen": -33.66691970825195, "logps/rejected": -92.80327606201172, "loss": 0.4692, "rewards/accuracies": 0.90625, "rewards/chosen": 1.269653558731079, "rewards/margins": 5.666813850402832, "rewards/rejected": -4.397159576416016, "step": 1104 }, { "epoch": 3.184904297580354, "grad_norm": 8.451431274414062, "learning_rate": 6.809248554913295e-07, "logits/chosen": -2.861527442932129, "logits/rejected": -3.060823678970337, "logps/chosen": -36.68347930908203, "logps/rejected": -83.6566390991211, "loss": 0.5599, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1175048351287842, "rewards/margins": 4.692575454711914, "rewards/rejected": -3.575071096420288, "step": 1105 }, { "epoch": 3.187793427230047, "grad_norm": 7.5692524909973145, "learning_rate": 6.80635838150289e-07, "logits/chosen": -3.0553369522094727, "logits/rejected": -3.2535033226013184, "logps/chosen": -29.15985107421875, "logps/rejected": -98.391357421875, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 1.2280186414718628, "rewards/margins": 6.4108757972717285, "rewards/rejected": -5.182857990264893, "step": 1106 }, { "epoch": 3.19068255687974, "grad_norm": 8.829919815063477, "learning_rate": 6.803468208092486e-07, "logits/chosen": -2.9640324115753174, "logits/rejected": -3.157973289489746, "logps/chosen": -22.041561126708984, "logps/rejected": -86.8866195678711, "loss": 0.3485, "rewards/accuracies": 0.9375, "rewards/chosen": 2.170281171798706, "rewards/margins": 5.935110092163086, "rewards/rejected": -3.76482892036438, "step": 1107 }, { "epoch": 3.193571686529433, "grad_norm": 6.364317893981934, "learning_rate": 6.80057803468208e-07, "logits/chosen": -3.016188621520996, "logits/rejected": -3.1415514945983887, "logps/chosen": -40.3745002746582, "logps/rejected": -97.14999389648438, "loss": 0.4608, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6695420145988464, "rewards/margins": 5.657318115234375, "rewards/rejected": -4.9877753257751465, "step": 1108 }, { "epoch": 3.196460816179126, "grad_norm": 8.174789428710938, "learning_rate": 6.797687861271676e-07, "logits/chosen": -2.8560478687286377, "logits/rejected": -3.1250033378601074, "logps/chosen": -25.45450782775879, "logps/rejected": -82.2366943359375, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 1.9317176342010498, "rewards/margins": 5.465859413146973, "rewards/rejected": -3.534142255783081, "step": 1109 }, { "epoch": 3.199349945828819, "grad_norm": 6.4451904296875, "learning_rate": 6.794797687861271e-07, "logits/chosen": -2.9258103370666504, "logits/rejected": -3.112175703048706, "logps/chosen": -37.917030334472656, "logps/rejected": -99.80653381347656, "loss": 0.4443, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8714628219604492, "rewards/margins": 6.136251449584961, "rewards/rejected": -5.26478910446167, "step": 1110 }, { "epoch": 3.202239075478512, "grad_norm": 7.167956829071045, "learning_rate": 6.791907514450866e-07, "logits/chosen": -2.961075782775879, "logits/rejected": -3.1590096950531006, "logps/chosen": -34.8183479309082, "logps/rejected": -98.88211059570312, "loss": 0.4319, "rewards/accuracies": 1.0, "rewards/chosen": 0.8423032760620117, "rewards/margins": 6.15290641784668, "rewards/rejected": -5.310603141784668, "step": 1111 }, { "epoch": 3.2051282051282053, "grad_norm": 8.902471542358398, "learning_rate": 6.789017341040462e-07, "logits/chosen": -2.947436809539795, "logits/rejected": -3.2083029747009277, "logps/chosen": -36.9202766418457, "logps/rejected": -101.46366882324219, "loss": 0.4645, "rewards/accuracies": 1.0, "rewards/chosen": 1.110818862915039, "rewards/margins": 6.565610408782959, "rewards/rejected": -5.454792022705078, "step": 1112 }, { "epoch": 3.208017334777898, "grad_norm": 5.582228660583496, "learning_rate": 6.786127167630058e-07, "logits/chosen": -3.0347890853881836, "logits/rejected": -3.1254570484161377, "logps/chosen": -36.33754348754883, "logps/rejected": -103.39714050292969, "loss": 0.5049, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0070853233337402, "rewards/margins": 6.579894065856934, "rewards/rejected": -5.572808265686035, "step": 1113 }, { "epoch": 3.210906464427591, "grad_norm": 10.502250671386719, "learning_rate": 6.783236994219654e-07, "logits/chosen": -2.992494583129883, "logits/rejected": -3.1458024978637695, "logps/chosen": -28.975046157836914, "logps/rejected": -100.75527954101562, "loss": 0.355, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6556227207183838, "rewards/margins": 7.322080612182617, "rewards/rejected": -5.666456699371338, "step": 1114 }, { "epoch": 3.2137955940772844, "grad_norm": 8.142538070678711, "learning_rate": 6.780346820809248e-07, "logits/chosen": -2.9181320667266846, "logits/rejected": -3.102332592010498, "logps/chosen": -35.5823974609375, "logps/rejected": -90.59376525878906, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 1.2614588737487793, "rewards/margins": 5.768867492675781, "rewards/rejected": -4.507408618927002, "step": 1115 }, { "epoch": 3.2166847237269773, "grad_norm": 10.682500839233398, "learning_rate": 6.777456647398844e-07, "logits/chosen": -2.888967990875244, "logits/rejected": -3.099576473236084, "logps/chosen": -22.465160369873047, "logps/rejected": -74.47038269042969, "loss": 0.4486, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2475228309631348, "rewards/margins": 4.9376959800720215, "rewards/rejected": -2.6901731491088867, "step": 1116 }, { "epoch": 3.2195738533766702, "grad_norm": 7.292617321014404, "learning_rate": 6.774566473988439e-07, "logits/chosen": -2.9349093437194824, "logits/rejected": -3.080709457397461, "logps/chosen": -37.06263732910156, "logps/rejected": -92.96084594726562, "loss": 0.399, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8751789331436157, "rewards/margins": 5.5562615394592285, "rewards/rejected": -4.681081771850586, "step": 1117 }, { "epoch": 3.222462983026363, "grad_norm": 7.146094799041748, "learning_rate": 6.771676300578034e-07, "logits/chosen": -2.992584705352783, "logits/rejected": -3.061310052871704, "logps/chosen": -28.131851196289062, "logps/rejected": -96.0514144897461, "loss": 0.3303, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4459311962127686, "rewards/margins": 6.2543487548828125, "rewards/rejected": -4.808417797088623, "step": 1118 }, { "epoch": 3.2253521126760565, "grad_norm": 8.371424674987793, "learning_rate": 6.768786127167629e-07, "logits/chosen": -2.9118218421936035, "logits/rejected": -3.077974557876587, "logps/chosen": -37.73115158081055, "logps/rejected": -89.66812896728516, "loss": 0.4643, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8889783620834351, "rewards/margins": 5.223452568054199, "rewards/rejected": -4.334473133087158, "step": 1119 }, { "epoch": 3.2282412423257494, "grad_norm": 11.351994514465332, "learning_rate": 6.765895953757225e-07, "logits/chosen": -2.9835546016693115, "logits/rejected": -3.1370906829833984, "logps/chosen": -43.906986236572266, "logps/rejected": -102.52887725830078, "loss": 0.543, "rewards/accuracies": 1.0, "rewards/chosen": 0.18561959266662598, "rewards/margins": 5.416900634765625, "rewards/rejected": -5.23128080368042, "step": 1120 }, { "epoch": 3.2282412423257494, "eval_logits/chosen": -3.032665967941284, "eval_logits/rejected": -3.201531171798706, "eval_logps/chosen": -41.435176849365234, "eval_logps/rejected": -102.23127746582031, "eval_loss": 0.5208070278167725, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.48812395334243774, "eval_rewards/margins": 6.016234397888184, "eval_rewards/rejected": -5.528109550476074, "eval_runtime": 225.7528, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.275, "step": 1120 }, { "epoch": 3.2311303719754423, "grad_norm": 7.338752269744873, "learning_rate": 6.76300578034682e-07, "logits/chosen": -2.990037679672241, "logits/rejected": -3.1087164878845215, "logps/chosen": -34.83611297607422, "logps/rejected": -87.18364715576172, "loss": 0.4577, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1622670888900757, "rewards/margins": 5.0843963623046875, "rewards/rejected": -3.9221296310424805, "step": 1121 }, { "epoch": 3.2340195016251356, "grad_norm": 10.210882186889648, "learning_rate": 6.760115606936416e-07, "logits/chosen": -3.037564277648926, "logits/rejected": -3.238056182861328, "logps/chosen": -44.130767822265625, "logps/rejected": -109.2099838256836, "loss": 0.4653, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3413291573524475, "rewards/margins": 6.425462245941162, "rewards/rejected": -6.084133148193359, "step": 1122 }, { "epoch": 3.2369086312748285, "grad_norm": 7.469390869140625, "learning_rate": 6.757225433526012e-07, "logits/chosen": -2.9734554290771484, "logits/rejected": -3.0861995220184326, "logps/chosen": -27.728939056396484, "logps/rejected": -90.68671417236328, "loss": 0.3156, "rewards/accuracies": 1.0, "rewards/chosen": 1.7256743907928467, "rewards/margins": 6.0939788818359375, "rewards/rejected": -4.368304252624512, "step": 1123 }, { "epoch": 3.2397977609245214, "grad_norm": 9.869345664978027, "learning_rate": 6.754335260115607e-07, "logits/chosen": -2.823392868041992, "logits/rejected": -3.1009416580200195, "logps/chosen": -28.940561294555664, "logps/rejected": -88.2731704711914, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 1.6686450242996216, "rewards/margins": 5.712466239929199, "rewards/rejected": -4.043821334838867, "step": 1124 }, { "epoch": 3.2426868905742143, "grad_norm": 9.73472785949707, "learning_rate": 6.751445086705202e-07, "logits/chosen": -2.9267139434814453, "logits/rejected": -3.0780017375946045, "logps/chosen": -38.78782653808594, "logps/rejected": -92.75542449951172, "loss": 0.5018, "rewards/accuracies": 1.0, "rewards/chosen": 0.8344171643257141, "rewards/margins": 5.3143157958984375, "rewards/rejected": -4.479898929595947, "step": 1125 }, { "epoch": 3.2455760202239077, "grad_norm": 7.335124492645264, "learning_rate": 6.748554913294797e-07, "logits/chosen": -2.9009273052215576, "logits/rejected": -3.05010986328125, "logps/chosen": -36.541786193847656, "logps/rejected": -108.00987243652344, "loss": 0.4179, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9419667720794678, "rewards/margins": 6.561823844909668, "rewards/rejected": -5.619856834411621, "step": 1126 }, { "epoch": 3.2484651498736006, "grad_norm": 7.828618049621582, "learning_rate": 6.745664739884393e-07, "logits/chosen": -3.0415079593658447, "logits/rejected": -3.1207478046417236, "logps/chosen": -39.50275421142578, "logps/rejected": -95.83662414550781, "loss": 0.4881, "rewards/accuracies": 1.0, "rewards/chosen": 0.5800409913063049, "rewards/margins": 5.3387322425842285, "rewards/rejected": -4.75869083404541, "step": 1127 }, { "epoch": 3.2513542795232935, "grad_norm": 8.635397911071777, "learning_rate": 6.742774566473987e-07, "logits/chosen": -2.829932928085327, "logits/rejected": -2.9729347229003906, "logps/chosen": -27.834386825561523, "logps/rejected": -91.72992706298828, "loss": 0.323, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0638697147369385, "rewards/margins": 6.172399044036865, "rewards/rejected": -4.108529090881348, "step": 1128 }, { "epoch": 3.254243409172987, "grad_norm": 8.38090705871582, "learning_rate": 6.739884393063584e-07, "logits/chosen": -2.9867191314697266, "logits/rejected": -3.1279308795928955, "logps/chosen": -43.79297637939453, "logps/rejected": -92.36766052246094, "loss": 0.5106, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2347724735736847, "rewards/margins": 4.8414411544799805, "rewards/rejected": -4.606668472290039, "step": 1129 }, { "epoch": 3.2571325388226797, "grad_norm": 7.699036121368408, "learning_rate": 6.736994219653179e-07, "logits/chosen": -2.9904799461364746, "logits/rejected": -3.1948294639587402, "logps/chosen": -39.68174362182617, "logps/rejected": -114.3167724609375, "loss": 0.4126, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6754813194274902, "rewards/margins": 7.173931121826172, "rewards/rejected": -6.49845027923584, "step": 1130 }, { "epoch": 3.2600216684723726, "grad_norm": 9.093860626220703, "learning_rate": 6.734104046242775e-07, "logits/chosen": -3.0123720169067383, "logits/rejected": -3.2037785053253174, "logps/chosen": -40.895530700683594, "logps/rejected": -100.90084838867188, "loss": 0.477, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5869402885437012, "rewards/margins": 5.9859538078308105, "rewards/rejected": -5.399013519287109, "step": 1131 }, { "epoch": 3.262910798122066, "grad_norm": 7.7018890380859375, "learning_rate": 6.73121387283237e-07, "logits/chosen": -3.0116100311279297, "logits/rejected": -3.1101319789886475, "logps/chosen": -32.96857833862305, "logps/rejected": -86.64228057861328, "loss": 0.4443, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3564732074737549, "rewards/margins": 5.2184648513793945, "rewards/rejected": -3.8619918823242188, "step": 1132 }, { "epoch": 3.265799927771759, "grad_norm": 9.594058990478516, "learning_rate": 6.728323699421965e-07, "logits/chosen": -2.953883647918701, "logits/rejected": -3.171375274658203, "logps/chosen": -18.77123260498047, "logps/rejected": -78.41181182861328, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 2.585232734680176, "rewards/margins": 5.6125264167785645, "rewards/rejected": -3.0272936820983887, "step": 1133 }, { "epoch": 3.2686890574214518, "grad_norm": 5.252825736999512, "learning_rate": 6.725433526011561e-07, "logits/chosen": -2.937654972076416, "logits/rejected": -3.1073741912841797, "logps/chosen": -27.377038955688477, "logps/rejected": -82.40264892578125, "loss": 0.4179, "rewards/accuracies": 1.0, "rewards/chosen": 1.8021947145462036, "rewards/margins": 5.537286758422852, "rewards/rejected": -3.7350921630859375, "step": 1134 }, { "epoch": 3.2715781870711447, "grad_norm": 8.774269104003906, "learning_rate": 6.722543352601155e-07, "logits/chosen": -2.8628883361816406, "logits/rejected": -3.1256399154663086, "logps/chosen": -32.318153381347656, "logps/rejected": -83.82781219482422, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 1.390027403831482, "rewards/margins": 4.983666896820068, "rewards/rejected": -3.593639373779297, "step": 1135 }, { "epoch": 3.274467316720838, "grad_norm": 9.040322303771973, "learning_rate": 6.719653179190751e-07, "logits/chosen": -2.995636224746704, "logits/rejected": -3.1734726428985596, "logps/chosen": -25.34115219116211, "logps/rejected": -92.6556396484375, "loss": 0.3521, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6676597595214844, "rewards/margins": 6.055270195007324, "rewards/rejected": -4.387610912322998, "step": 1136 }, { "epoch": 3.277356446370531, "grad_norm": 6.526458263397217, "learning_rate": 6.716763005780347e-07, "logits/chosen": -2.910215377807617, "logits/rejected": -3.1185667514801025, "logps/chosen": -32.142005920410156, "logps/rejected": -90.19015502929688, "loss": 0.431, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1916732788085938, "rewards/margins": 5.544375896453857, "rewards/rejected": -4.352702617645264, "step": 1137 }, { "epoch": 3.280245576020224, "grad_norm": 6.6003947257995605, "learning_rate": 6.713872832369942e-07, "logits/chosen": -2.944094657897949, "logits/rejected": -3.0862348079681396, "logps/chosen": -38.998321533203125, "logps/rejected": -107.858154296875, "loss": 0.436, "rewards/accuracies": 1.0, "rewards/chosen": 0.6935598850250244, "rewards/margins": 6.547449111938477, "rewards/rejected": -5.853888988494873, "step": 1138 }, { "epoch": 3.2831347056699167, "grad_norm": 7.960841655731201, "learning_rate": 6.710982658959537e-07, "logits/chosen": -3.0318171977996826, "logits/rejected": -3.1899712085723877, "logps/chosen": -37.60247039794922, "logps/rejected": -94.32508850097656, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9751060605049133, "rewards/margins": 5.866357326507568, "rewards/rejected": -4.891251564025879, "step": 1139 }, { "epoch": 3.28602383531961, "grad_norm": 7.135639667510986, "learning_rate": 6.708092485549133e-07, "logits/chosen": -2.936007022857666, "logits/rejected": -3.1068551540374756, "logps/chosen": -24.248348236083984, "logps/rejected": -84.49359130859375, "loss": 0.378, "rewards/accuracies": 1.0, "rewards/chosen": 2.099820852279663, "rewards/margins": 5.776991367340088, "rewards/rejected": -3.677170753479004, "step": 1140 }, { "epoch": 3.288912964969303, "grad_norm": 5.908297538757324, "learning_rate": 6.705202312138728e-07, "logits/chosen": -2.911057233810425, "logits/rejected": -3.0449562072753906, "logps/chosen": -34.901763916015625, "logps/rejected": -99.13328552246094, "loss": 0.319, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2180269956588745, "rewards/margins": 6.09932804107666, "rewards/rejected": -4.881300926208496, "step": 1141 }, { "epoch": 3.291802094618996, "grad_norm": 8.637467384338379, "learning_rate": 6.702312138728323e-07, "logits/chosen": -2.925649642944336, "logits/rejected": -3.1419010162353516, "logps/chosen": -23.14232635498047, "logps/rejected": -82.05215454101562, "loss": 0.3973, "rewards/accuracies": 0.9375, "rewards/chosen": 1.971496820449829, "rewards/margins": 5.259834289550781, "rewards/rejected": -3.288337469100952, "step": 1142 }, { "epoch": 3.294691224268689, "grad_norm": 8.744532585144043, "learning_rate": 6.699421965317918e-07, "logits/chosen": -2.867208957672119, "logits/rejected": -3.141702175140381, "logps/chosen": -34.10907745361328, "logps/rejected": -90.91251373291016, "loss": 0.3885, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3865406513214111, "rewards/margins": 5.9686479568481445, "rewards/rejected": -4.5821075439453125, "step": 1143 }, { "epoch": 3.297580353918382, "grad_norm": 6.927666187286377, "learning_rate": 6.696531791907514e-07, "logits/chosen": -3.092756748199463, "logits/rejected": -3.0839381217956543, "logps/chosen": -39.33696746826172, "logps/rejected": -89.69991302490234, "loss": 0.4891, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7755428552627563, "rewards/margins": 4.823001861572266, "rewards/rejected": -4.047459125518799, "step": 1144 }, { "epoch": 3.300469483568075, "grad_norm": 10.231202125549316, "learning_rate": 6.69364161849711e-07, "logits/chosen": -2.783945083618164, "logits/rejected": -2.9566450119018555, "logps/chosen": -29.468542098999023, "logps/rejected": -72.2046127319336, "loss": 0.4974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.696235179901123, "rewards/margins": 4.549047470092773, "rewards/rejected": -2.852811813354492, "step": 1145 }, { "epoch": 3.3033586132177684, "grad_norm": 10.030852317810059, "learning_rate": 6.690751445086705e-07, "logits/chosen": -2.944270610809326, "logits/rejected": -3.0873186588287354, "logps/chosen": -22.6407470703125, "logps/rejected": -87.27885437011719, "loss": 0.2719, "rewards/accuracies": 1.0, "rewards/chosen": 1.9892823696136475, "rewards/margins": 6.12428617477417, "rewards/rejected": -4.135003089904785, "step": 1146 }, { "epoch": 3.3062477428674613, "grad_norm": 6.63667631149292, "learning_rate": 6.687861271676301e-07, "logits/chosen": -2.8604416847229004, "logits/rejected": -3.073668956756592, "logps/chosen": -30.488222122192383, "logps/rejected": -91.02366638183594, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": 1.4959802627563477, "rewards/margins": 5.7914042472839355, "rewards/rejected": -4.295424461364746, "step": 1147 }, { "epoch": 3.309136872517154, "grad_norm": 6.5090484619140625, "learning_rate": 6.684971098265896e-07, "logits/chosen": -2.961132526397705, "logits/rejected": -3.0474772453308105, "logps/chosen": -40.58101272583008, "logps/rejected": -91.59550476074219, "loss": 0.5309, "rewards/accuracies": 0.9375, "rewards/chosen": 0.537729799747467, "rewards/margins": 4.7289323806762695, "rewards/rejected": -4.191202640533447, "step": 1148 }, { "epoch": 3.312026002166847, "grad_norm": 7.625451564788818, "learning_rate": 6.682080924855491e-07, "logits/chosen": -3.014270305633545, "logits/rejected": -3.159895896911621, "logps/chosen": -41.004783630371094, "logps/rejected": -100.76163482666016, "loss": 0.4353, "rewards/accuracies": 1.0, "rewards/chosen": 0.4132654666900635, "rewards/margins": 5.778007507324219, "rewards/rejected": -5.364742279052734, "step": 1149 }, { "epoch": 3.3149151318165404, "grad_norm": 6.352293491363525, "learning_rate": 6.679190751445086e-07, "logits/chosen": -2.9561431407928467, "logits/rejected": -3.0579376220703125, "logps/chosen": -32.74283218383789, "logps/rejected": -100.68021392822266, "loss": 0.3782, "rewards/accuracies": 0.96875, "rewards/chosen": 1.372828483581543, "rewards/margins": 6.632473945617676, "rewards/rejected": -5.259645462036133, "step": 1150 }, { "epoch": 3.3178042614662333, "grad_norm": 6.69126558303833, "learning_rate": 6.676300578034682e-07, "logits/chosen": -2.9483394622802734, "logits/rejected": -3.1084747314453125, "logps/chosen": -33.60860061645508, "logps/rejected": -88.77405548095703, "loss": 0.4577, "rewards/accuracies": 0.90625, "rewards/chosen": 1.337059736251831, "rewards/margins": 5.568593978881836, "rewards/rejected": -4.231533527374268, "step": 1151 }, { "epoch": 3.320693391115926, "grad_norm": 7.17863655090332, "learning_rate": 6.673410404624276e-07, "logits/chosen": -2.992528200149536, "logits/rejected": -3.109541416168213, "logps/chosen": -31.011737823486328, "logps/rejected": -95.45381164550781, "loss": 0.3686, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3243393898010254, "rewards/margins": 6.123620986938477, "rewards/rejected": -4.799281597137451, "step": 1152 }, { "epoch": 3.323582520765619, "grad_norm": 7.933640956878662, "learning_rate": 6.670520231213873e-07, "logits/chosen": -2.981813907623291, "logits/rejected": -3.1497814655303955, "logps/chosen": -30.93709373474121, "logps/rejected": -106.36029052734375, "loss": 0.4726, "rewards/accuracies": 0.96875, "rewards/chosen": 1.68696129322052, "rewards/margins": 7.259613990783691, "rewards/rejected": -5.572653293609619, "step": 1153 }, { "epoch": 3.3264716504153125, "grad_norm": 9.329155921936035, "learning_rate": 6.667630057803469e-07, "logits/chosen": -2.9713551998138428, "logits/rejected": -3.1068482398986816, "logps/chosen": -28.583839416503906, "logps/rejected": -79.01005554199219, "loss": 0.4125, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5993751287460327, "rewards/margins": 4.695761203765869, "rewards/rejected": -3.096386194229126, "step": 1154 }, { "epoch": 3.3293607800650054, "grad_norm": 7.5958333015441895, "learning_rate": 6.664739884393063e-07, "logits/chosen": -2.9320011138916016, "logits/rejected": -3.1260876655578613, "logps/chosen": -28.468725204467773, "logps/rejected": -89.60223388671875, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": 1.9470716714859009, "rewards/margins": 6.147886753082275, "rewards/rejected": -4.200814723968506, "step": 1155 }, { "epoch": 3.3322499097146983, "grad_norm": 6.327671527862549, "learning_rate": 6.661849710982659e-07, "logits/chosen": -2.9602210521698, "logits/rejected": -3.1063125133514404, "logps/chosen": -38.72031784057617, "logps/rejected": -105.42236328125, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207967281341553, "rewards/margins": 6.6174821853637695, "rewards/rejected": -5.796685695648193, "step": 1156 }, { "epoch": 3.3351390393643916, "grad_norm": 7.61992073059082, "learning_rate": 6.658959537572254e-07, "logits/chosen": -2.9013116359710693, "logits/rejected": -2.987628936767578, "logps/chosen": -43.27040100097656, "logps/rejected": -100.26988220214844, "loss": 0.4502, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6679575443267822, "rewards/margins": 5.919259548187256, "rewards/rejected": -5.2513017654418945, "step": 1157 }, { "epoch": 3.3380281690140845, "grad_norm": 5.720763683319092, "learning_rate": 6.65606936416185e-07, "logits/chosen": -2.8375234603881836, "logits/rejected": -3.0189671516418457, "logps/chosen": -32.894710540771484, "logps/rejected": -91.4463882446289, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 1.505902886390686, "rewards/margins": 5.787133693695068, "rewards/rejected": -4.281230449676514, "step": 1158 }, { "epoch": 3.3409172986637774, "grad_norm": 6.885571002960205, "learning_rate": 6.653179190751444e-07, "logits/chosen": -2.9010603427886963, "logits/rejected": -3.0313243865966797, "logps/chosen": -34.11402893066406, "logps/rejected": -85.59977722167969, "loss": 0.4991, "rewards/accuracies": 0.9375, "rewards/chosen": 1.209869623184204, "rewards/margins": 4.912221431732178, "rewards/rejected": -3.7023518085479736, "step": 1159 }, { "epoch": 3.3438064283134707, "grad_norm": 6.425413131713867, "learning_rate": 6.65028901734104e-07, "logits/chosen": -2.9251770973205566, "logits/rejected": -3.167825222015381, "logps/chosen": -30.00826644897461, "logps/rejected": -101.4412612915039, "loss": 0.3759, "rewards/accuracies": 1.0, "rewards/chosen": 1.5805566310882568, "rewards/margins": 6.623287200927734, "rewards/rejected": -5.042730331420898, "step": 1160 }, { "epoch": 3.3466955579631636, "grad_norm": 5.630822658538818, "learning_rate": 6.647398843930635e-07, "logits/chosen": -2.980992317199707, "logits/rejected": -3.1718287467956543, "logps/chosen": -36.58369445800781, "logps/rejected": -99.50775909423828, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.7844582200050354, "rewards/margins": 5.989752292633057, "rewards/rejected": -5.205294609069824, "step": 1161 }, { "epoch": 3.3495846876128565, "grad_norm": 7.409828186035156, "learning_rate": 6.644508670520231e-07, "logits/chosen": -2.9825375080108643, "logits/rejected": -3.102735996246338, "logps/chosen": -41.037330627441406, "logps/rejected": -86.6535873413086, "loss": 0.4804, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7751534581184387, "rewards/margins": 4.792898178100586, "rewards/rejected": -4.017745018005371, "step": 1162 }, { "epoch": 3.35247381726255, "grad_norm": 10.912490844726562, "learning_rate": 6.641618497109826e-07, "logits/chosen": -2.996246337890625, "logits/rejected": -3.123526096343994, "logps/chosen": -41.25068664550781, "logps/rejected": -104.71024322509766, "loss": 0.5148, "rewards/accuracies": 0.90625, "rewards/chosen": 0.37428319454193115, "rewards/margins": 5.968832492828369, "rewards/rejected": -5.594549179077148, "step": 1163 }, { "epoch": 3.355362946912243, "grad_norm": 7.883077144622803, "learning_rate": 6.638728323699422e-07, "logits/chosen": -2.9580070972442627, "logits/rejected": -3.1471104621887207, "logps/chosen": -40.125160217285156, "logps/rejected": -93.84523010253906, "loss": 0.4864, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7595306634902954, "rewards/margins": 4.900563716888428, "rewards/rejected": -4.141033172607422, "step": 1164 }, { "epoch": 3.3582520765619357, "grad_norm": 4.634730339050293, "learning_rate": 6.635838150289018e-07, "logits/chosen": -2.8900468349456787, "logits/rejected": -3.122659921646118, "logps/chosen": -26.643930435180664, "logps/rejected": -93.04237365722656, "loss": 0.4214, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8183917999267578, "rewards/margins": 6.343979358673096, "rewards/rejected": -4.5255889892578125, "step": 1165 }, { "epoch": 3.3611412062116286, "grad_norm": 7.014386177062988, "learning_rate": 6.632947976878612e-07, "logits/chosen": -2.998014450073242, "logits/rejected": -3.1907906532287598, "logps/chosen": -35.309104919433594, "logps/rejected": -96.04341125488281, "loss": 0.402, "rewards/accuracies": 1.0, "rewards/chosen": 1.552594780921936, "rewards/margins": 6.037311553955078, "rewards/rejected": -4.484716415405273, "step": 1166 }, { "epoch": 3.364030335861322, "grad_norm": 5.631239891052246, "learning_rate": 6.630057803468208e-07, "logits/chosen": -3.0466859340667725, "logits/rejected": -3.2456088066101074, "logps/chosen": -34.062652587890625, "logps/rejected": -83.63455200195312, "loss": 0.5031, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2434642314910889, "rewards/margins": 4.792796611785889, "rewards/rejected": -3.5493319034576416, "step": 1167 }, { "epoch": 3.366919465511015, "grad_norm": 7.209017276763916, "learning_rate": 6.627167630057803e-07, "logits/chosen": -3.0232784748077393, "logits/rejected": -3.2736809253692627, "logps/chosen": -33.46652603149414, "logps/rejected": -97.40567779541016, "loss": 0.3777, "rewards/accuracies": 0.9375, "rewards/chosen": 1.198300838470459, "rewards/margins": 5.83237886428833, "rewards/rejected": -4.634078025817871, "step": 1168 }, { "epoch": 3.3698085951607077, "grad_norm": 9.293299674987793, "learning_rate": 6.624277456647398e-07, "logits/chosen": -2.931091547012329, "logits/rejected": -3.0472447872161865, "logps/chosen": -25.581588745117188, "logps/rejected": -75.38117980957031, "loss": 0.4909, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8332159519195557, "rewards/margins": 4.764425277709961, "rewards/rejected": -2.931208848953247, "step": 1169 }, { "epoch": 3.3726977248104006, "grad_norm": 10.372138977050781, "learning_rate": 6.621387283236994e-07, "logits/chosen": -2.9071078300476074, "logits/rejected": -3.151092529296875, "logps/chosen": -35.76829528808594, "logps/rejected": -88.73760986328125, "loss": 0.4575, "rewards/accuracies": 0.96875, "rewards/chosen": 1.320026159286499, "rewards/margins": 5.061858177185059, "rewards/rejected": -3.7418317794799805, "step": 1170 }, { "epoch": 3.375586854460094, "grad_norm": 9.585521697998047, "learning_rate": 6.61849710982659e-07, "logits/chosen": -2.975118637084961, "logits/rejected": -3.1628031730651855, "logps/chosen": -46.09466552734375, "logps/rejected": -110.51539611816406, "loss": 0.5324, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1351126730442047, "rewards/margins": 6.258009433746338, "rewards/rejected": -6.122897148132324, "step": 1171 }, { "epoch": 3.378475984109787, "grad_norm": 4.577286243438721, "learning_rate": 6.615606936416184e-07, "logits/chosen": -2.9904699325561523, "logits/rejected": -3.1693854331970215, "logps/chosen": -35.114933013916016, "logps/rejected": -117.08765411376953, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.3274266719818115, "rewards/margins": 7.935360431671143, "rewards/rejected": -6.607933044433594, "step": 1172 }, { "epoch": 3.38136511375948, "grad_norm": 7.875155448913574, "learning_rate": 6.61271676300578e-07, "logits/chosen": -2.889267683029175, "logits/rejected": -3.0049779415130615, "logps/chosen": -38.47202682495117, "logps/rejected": -105.46700286865234, "loss": 0.3968, "rewards/accuracies": 0.96875, "rewards/chosen": 0.806753933429718, "rewards/margins": 6.418295383453369, "rewards/rejected": -5.611541748046875, "step": 1173 }, { "epoch": 3.384254243409173, "grad_norm": 6.875993251800537, "learning_rate": 6.609826589595375e-07, "logits/chosen": -3.0134177207946777, "logits/rejected": -3.1422946453094482, "logps/chosen": -31.11956214904785, "logps/rejected": -91.8653793334961, "loss": 0.3712, "rewards/accuracies": 0.96875, "rewards/chosen": 1.255359411239624, "rewards/margins": 5.749382495880127, "rewards/rejected": -4.494022846221924, "step": 1174 }, { "epoch": 3.387143373058866, "grad_norm": 7.490189075469971, "learning_rate": 6.606936416184971e-07, "logits/chosen": -2.8642959594726562, "logits/rejected": -3.0745794773101807, "logps/chosen": -19.941282272338867, "logps/rejected": -85.65182495117188, "loss": 0.4224, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3155314922332764, "rewards/margins": 5.883464336395264, "rewards/rejected": -3.567932605743408, "step": 1175 }, { "epoch": 3.390032502708559, "grad_norm": 6.659086227416992, "learning_rate": 6.604046242774565e-07, "logits/chosen": -3.037597894668579, "logits/rejected": -3.171318769454956, "logps/chosen": -34.07804489135742, "logps/rejected": -95.62974548339844, "loss": 0.3833, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0604852437973022, "rewards/margins": 5.982115745544434, "rewards/rejected": -4.921630382537842, "step": 1176 }, { "epoch": 3.3929216323582523, "grad_norm": 8.770858764648438, "learning_rate": 6.601156069364161e-07, "logits/chosen": -2.946432113647461, "logits/rejected": -3.0532608032226562, "logps/chosen": -32.574092864990234, "logps/rejected": -91.4446029663086, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 1.1843230724334717, "rewards/margins": 5.597712516784668, "rewards/rejected": -4.413389205932617, "step": 1177 }, { "epoch": 3.395810762007945, "grad_norm": 7.222620010375977, "learning_rate": 6.598265895953758e-07, "logits/chosen": -3.00657320022583, "logits/rejected": -3.0865330696105957, "logps/chosen": -33.114131927490234, "logps/rejected": -87.24146270751953, "loss": 0.499, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4272733926773071, "rewards/margins": 5.27847957611084, "rewards/rejected": -3.8512063026428223, "step": 1178 }, { "epoch": 3.398699891657638, "grad_norm": 8.928431510925293, "learning_rate": 6.595375722543352e-07, "logits/chosen": -2.8992807865142822, "logits/rejected": -3.061394691467285, "logps/chosen": -32.61311340332031, "logps/rejected": -86.58439636230469, "loss": 0.4664, "rewards/accuracies": 0.9375, "rewards/chosen": 1.743781566619873, "rewards/margins": 5.312830924987793, "rewards/rejected": -3.569049835205078, "step": 1179 }, { "epoch": 3.401589021307331, "grad_norm": 7.665157794952393, "learning_rate": 6.592485549132948e-07, "logits/chosen": -2.955437183380127, "logits/rejected": -3.133408546447754, "logps/chosen": -31.833925247192383, "logps/rejected": -85.94158935546875, "loss": 0.4516, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3470675945281982, "rewards/margins": 5.274678707122803, "rewards/rejected": -3.9276113510131836, "step": 1180 }, { "epoch": 3.4044781509570243, "grad_norm": 7.75153112411499, "learning_rate": 6.589595375722543e-07, "logits/chosen": -2.9180870056152344, "logits/rejected": -3.0903725624084473, "logps/chosen": -34.774295806884766, "logps/rejected": -90.01785278320312, "loss": 0.5045, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9286811351776123, "rewards/margins": 5.228360652923584, "rewards/rejected": -4.299679756164551, "step": 1181 }, { "epoch": 3.4073672806067172, "grad_norm": 7.390554904937744, "learning_rate": 6.586705202312139e-07, "logits/chosen": -2.966555595397949, "logits/rejected": -3.1883482933044434, "logps/chosen": -36.04998779296875, "logps/rejected": -78.9727554321289, "loss": 0.454, "rewards/accuracies": 0.90625, "rewards/chosen": 1.287245750427246, "rewards/margins": 4.360153675079346, "rewards/rejected": -3.0729076862335205, "step": 1182 }, { "epoch": 3.41025641025641, "grad_norm": 7.824517726898193, "learning_rate": 6.583815028901733e-07, "logits/chosen": -2.9096062183380127, "logits/rejected": -3.1155035495758057, "logps/chosen": -30.133874893188477, "logps/rejected": -84.15745544433594, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 1.581912875175476, "rewards/margins": 5.115543365478516, "rewards/rejected": -3.5336310863494873, "step": 1183 }, { "epoch": 3.4131455399061035, "grad_norm": 6.273678302764893, "learning_rate": 6.580924855491329e-07, "logits/chosen": -3.0568106174468994, "logits/rejected": -3.128861904144287, "logps/chosen": -38.02037048339844, "logps/rejected": -87.7366714477539, "loss": 0.44, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8188021183013916, "rewards/margins": 4.85684061050415, "rewards/rejected": -4.0380377769470215, "step": 1184 }, { "epoch": 3.4160346695557964, "grad_norm": 5.387135982513428, "learning_rate": 6.578034682080924e-07, "logits/chosen": -3.0138001441955566, "logits/rejected": -3.1982200145721436, "logps/chosen": -36.038719177246094, "logps/rejected": -97.31689453125, "loss": 0.4269, "rewards/accuracies": 0.96875, "rewards/chosen": 1.058012843132019, "rewards/margins": 5.897914409637451, "rewards/rejected": -4.839901924133301, "step": 1185 }, { "epoch": 3.4189237992054893, "grad_norm": 9.021865844726562, "learning_rate": 6.57514450867052e-07, "logits/chosen": -3.027353525161743, "logits/rejected": -3.180368661880493, "logps/chosen": -37.261573791503906, "logps/rejected": -87.34783935546875, "loss": 0.493, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7562281489372253, "rewards/margins": 4.6967244148254395, "rewards/rejected": -3.9404962062835693, "step": 1186 }, { "epoch": 3.421812928855182, "grad_norm": 6.687682151794434, "learning_rate": 6.572254335260116e-07, "logits/chosen": -2.8218514919281006, "logits/rejected": -3.095712661743164, "logps/chosen": -29.544036865234375, "logps/rejected": -100.00978088378906, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 1.510624647140503, "rewards/margins": 6.735953330993652, "rewards/rejected": -5.225329399108887, "step": 1187 }, { "epoch": 3.4247020585048755, "grad_norm": 6.651750087738037, "learning_rate": 6.569364161849711e-07, "logits/chosen": -2.866000175476074, "logits/rejected": -3.071326494216919, "logps/chosen": -25.94731903076172, "logps/rejected": -91.8892822265625, "loss": 0.3474, "rewards/accuracies": 0.9375, "rewards/chosen": 1.915204405784607, "rewards/margins": 6.271011829376221, "rewards/rejected": -4.355806350708008, "step": 1188 }, { "epoch": 3.4275911881545684, "grad_norm": 6.833232402801514, "learning_rate": 6.566473988439306e-07, "logits/chosen": -2.9401378631591797, "logits/rejected": -3.1240386962890625, "logps/chosen": -31.69731330871582, "logps/rejected": -89.86800384521484, "loss": 0.3261, "rewards/accuracies": 1.0, "rewards/chosen": 1.5100722312927246, "rewards/margins": 5.834771633148193, "rewards/rejected": -4.324699878692627, "step": 1189 }, { "epoch": 3.4304803178042613, "grad_norm": 7.567733287811279, "learning_rate": 6.563583815028901e-07, "logits/chosen": -2.8989367485046387, "logits/rejected": -3.0294785499572754, "logps/chosen": -36.26030349731445, "logps/rejected": -94.56551361083984, "loss": 0.459, "rewards/accuracies": 1.0, "rewards/chosen": 1.0468873977661133, "rewards/margins": 5.49797248840332, "rewards/rejected": -4.451085567474365, "step": 1190 }, { "epoch": 3.4304803178042613, "eval_logits/chosen": -3.010016441345215, "eval_logits/rejected": -3.192237615585327, "eval_logps/chosen": -38.501976013183594, "eval_logps/rejected": -98.1452407836914, "eval_loss": 0.49832701683044434, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.7814433574676514, "eval_rewards/margins": 5.9009504318237305, "eval_rewards/rejected": -5.1195068359375, "eval_runtime": 221.4893, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.28, "step": 1190 }, { "epoch": 3.4333694474539547, "grad_norm": 7.813441753387451, "learning_rate": 6.560693641618497e-07, "logits/chosen": -2.9026761054992676, "logits/rejected": -3.1066229343414307, "logps/chosen": -28.073278427124023, "logps/rejected": -90.75816345214844, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.5109996795654297, "rewards/margins": 5.899188041687012, "rewards/rejected": -4.388188362121582, "step": 1191 }, { "epoch": 3.4362585771036476, "grad_norm": 7.754255771636963, "learning_rate": 6.557803468208092e-07, "logits/chosen": -2.8325493335723877, "logits/rejected": -2.9796061515808105, "logps/chosen": -34.95858383178711, "logps/rejected": -87.00569152832031, "loss": 0.369, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2079001665115356, "rewards/margins": 5.138439178466797, "rewards/rejected": -3.930539131164551, "step": 1192 }, { "epoch": 3.4391477067533405, "grad_norm": 8.818733215332031, "learning_rate": 6.554913294797687e-07, "logits/chosen": -2.852367639541626, "logits/rejected": -2.980334520339966, "logps/chosen": -42.293941497802734, "logps/rejected": -97.95160675048828, "loss": 0.5035, "rewards/accuracies": 1.0, "rewards/chosen": 0.6009363532066345, "rewards/margins": 5.392393112182617, "rewards/rejected": -4.791456699371338, "step": 1193 }, { "epoch": 3.442036836403034, "grad_norm": 6.475344181060791, "learning_rate": 6.552023121387283e-07, "logits/chosen": -2.970595121383667, "logits/rejected": -3.203674554824829, "logps/chosen": -27.404098510742188, "logps/rejected": -107.02018737792969, "loss": 0.3432, "rewards/accuracies": 1.0, "rewards/chosen": 1.8317346572875977, "rewards/margins": 7.664312839508057, "rewards/rejected": -5.832577705383301, "step": 1194 }, { "epoch": 3.4449259660527267, "grad_norm": 8.83211898803711, "learning_rate": 6.549132947976879e-07, "logits/chosen": -2.979494094848633, "logits/rejected": -3.161970853805542, "logps/chosen": -24.868099212646484, "logps/rejected": -93.3631591796875, "loss": 0.3453, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8995988368988037, "rewards/margins": 6.56655216217041, "rewards/rejected": -4.6669535636901855, "step": 1195 }, { "epoch": 3.4478150957024196, "grad_norm": 7.8446364402771, "learning_rate": 6.546242774566473e-07, "logits/chosen": -2.8619742393493652, "logits/rejected": -2.9349849224090576, "logps/chosen": -46.969417572021484, "logps/rejected": -96.7087173461914, "loss": 0.5535, "rewards/accuracies": 1.0, "rewards/chosen": 0.12071564793586731, "rewards/margins": 4.760495662689209, "rewards/rejected": -4.639779567718506, "step": 1196 }, { "epoch": 3.4507042253521125, "grad_norm": 7.266077041625977, "learning_rate": 6.543352601156069e-07, "logits/chosen": -2.9504685401916504, "logits/rejected": -3.2466840744018555, "logps/chosen": -27.968914031982422, "logps/rejected": -97.31230926513672, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 1.9196981191635132, "rewards/margins": 6.769850730895996, "rewards/rejected": -4.850152969360352, "step": 1197 }, { "epoch": 3.453593355001806, "grad_norm": 6.892640590667725, "learning_rate": 6.540462427745665e-07, "logits/chosen": -2.989354133605957, "logits/rejected": -3.240401268005371, "logps/chosen": -32.989051818847656, "logps/rejected": -94.62863159179688, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 1.5843431949615479, "rewards/margins": 6.419270038604736, "rewards/rejected": -4.834926605224609, "step": 1198 }, { "epoch": 3.4564824846514988, "grad_norm": 8.687429428100586, "learning_rate": 6.53757225433526e-07, "logits/chosen": -2.982879877090454, "logits/rejected": -3.1676089763641357, "logps/chosen": -34.43013000488281, "logps/rejected": -82.70875549316406, "loss": 0.5244, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2673364877700806, "rewards/margins": 4.733578681945801, "rewards/rejected": -3.4662418365478516, "step": 1199 }, { "epoch": 3.4593716143011917, "grad_norm": 9.177407264709473, "learning_rate": 6.534682080924855e-07, "logits/chosen": -2.9733431339263916, "logits/rejected": -3.1888415813446045, "logps/chosen": -34.92578887939453, "logps/rejected": -103.15664672851562, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 1.1121536493301392, "rewards/margins": 6.4855122566223145, "rewards/rejected": -5.373358726501465, "step": 1200 }, { "epoch": 3.4622607439508846, "grad_norm": 7.133665561676025, "learning_rate": 6.53179190751445e-07, "logits/chosen": -2.930936813354492, "logits/rejected": -3.0735907554626465, "logps/chosen": -30.189712524414062, "logps/rejected": -102.69230651855469, "loss": 0.3806, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4493975639343262, "rewards/margins": 6.841971397399902, "rewards/rejected": -5.392574310302734, "step": 1201 }, { "epoch": 3.465149873600578, "grad_norm": 6.849958896636963, "learning_rate": 6.528901734104047e-07, "logits/chosen": -2.9681005477905273, "logits/rejected": -3.109126329421997, "logps/chosen": -27.60321617126465, "logps/rejected": -97.32830047607422, "loss": 0.3282, "rewards/accuracies": 0.96875, "rewards/chosen": 2.153848648071289, "rewards/margins": 6.62473726272583, "rewards/rejected": -4.470887660980225, "step": 1202 }, { "epoch": 3.468039003250271, "grad_norm": 7.215054035186768, "learning_rate": 6.526011560693641e-07, "logits/chosen": -2.9898293018341064, "logits/rejected": -3.146735429763794, "logps/chosen": -31.022573471069336, "logps/rejected": -87.80887603759766, "loss": 0.4381, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4002118110656738, "rewards/margins": 5.6501569747924805, "rewards/rejected": -4.249945640563965, "step": 1203 }, { "epoch": 3.4709281328999637, "grad_norm": 5.573791980743408, "learning_rate": 6.523121387283237e-07, "logits/chosen": -2.968106269836426, "logits/rejected": -3.1025900840759277, "logps/chosen": -32.73833084106445, "logps/rejected": -102.96227264404297, "loss": 0.436, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3660706281661987, "rewards/margins": 6.896907806396484, "rewards/rejected": -5.530837535858154, "step": 1204 }, { "epoch": 3.473817262549657, "grad_norm": 8.74062728881836, "learning_rate": 6.520231213872832e-07, "logits/chosen": -2.9894893169403076, "logits/rejected": -3.093731641769409, "logps/chosen": -43.2652702331543, "logps/rejected": -94.17662048339844, "loss": 0.4982, "rewards/accuracies": 0.90625, "rewards/chosen": 0.27087080478668213, "rewards/margins": 4.941316604614258, "rewards/rejected": -4.670445442199707, "step": 1205 }, { "epoch": 3.47670639219935, "grad_norm": 8.002286911010742, "learning_rate": 6.517341040462427e-07, "logits/chosen": -2.9791507720947266, "logits/rejected": -3.119478702545166, "logps/chosen": -37.49748229980469, "logps/rejected": -106.97254943847656, "loss": 0.4403, "rewards/accuracies": 1.0, "rewards/chosen": 0.9103296995162964, "rewards/margins": 6.816275119781494, "rewards/rejected": -5.905945301055908, "step": 1206 }, { "epoch": 3.479595521849043, "grad_norm": 7.0509467124938965, "learning_rate": 6.514450867052022e-07, "logits/chosen": -3.0444886684417725, "logits/rejected": -3.199779987335205, "logps/chosen": -42.841331481933594, "logps/rejected": -104.39757537841797, "loss": 0.5015, "rewards/accuracies": 0.84375, "rewards/chosen": 0.14184334874153137, "rewards/margins": 5.901840686798096, "rewards/rejected": -5.759997367858887, "step": 1207 }, { "epoch": 3.482484651498736, "grad_norm": 9.093733787536621, "learning_rate": 6.511560693641618e-07, "logits/chosen": -2.928572654724121, "logits/rejected": -3.1248533725738525, "logps/chosen": -29.21430015563965, "logps/rejected": -92.15713500976562, "loss": 0.393, "rewards/accuracies": 0.96875, "rewards/chosen": 1.429091215133667, "rewards/margins": 5.888828754425049, "rewards/rejected": -4.459737777709961, "step": 1208 }, { "epoch": 3.485373781148429, "grad_norm": 9.934209823608398, "learning_rate": 6.508670520231214e-07, "logits/chosen": -2.938390016555786, "logits/rejected": -3.1332833766937256, "logps/chosen": -35.5466194152832, "logps/rejected": -95.60855102539062, "loss": 0.4621, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9514822959899902, "rewards/margins": 5.5891642570495605, "rewards/rejected": -4.63768196105957, "step": 1209 }, { "epoch": 3.488262910798122, "grad_norm": 7.87867546081543, "learning_rate": 6.505780346820809e-07, "logits/chosen": -2.9307026863098145, "logits/rejected": -3.0342726707458496, "logps/chosen": -41.864471435546875, "logps/rejected": -100.10031127929688, "loss": 0.4671, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5259483456611633, "rewards/margins": 5.356874465942383, "rewards/rejected": -4.830925941467285, "step": 1210 }, { "epoch": 3.491152040447815, "grad_norm": 6.289589881896973, "learning_rate": 6.502890173410405e-07, "logits/chosen": -2.946017026901245, "logits/rejected": -3.0882411003112793, "logps/chosen": -25.6102294921875, "logps/rejected": -90.05644989013672, "loss": 0.2915, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7871453762054443, "rewards/margins": 6.067050457000732, "rewards/rejected": -4.279904842376709, "step": 1211 }, { "epoch": 3.4940411700975083, "grad_norm": 8.377429962158203, "learning_rate": 6.5e-07, "logits/chosen": -2.856842279434204, "logits/rejected": -3.0445313453674316, "logps/chosen": -29.840171813964844, "logps/rejected": -89.66841888427734, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": 1.5206040143966675, "rewards/margins": 5.716944217681885, "rewards/rejected": -4.196340560913086, "step": 1212 }, { "epoch": 3.496930299747201, "grad_norm": 10.43875789642334, "learning_rate": 6.497109826589595e-07, "logits/chosen": -3.017268657684326, "logits/rejected": -3.1619858741760254, "logps/chosen": -30.507837295532227, "logps/rejected": -90.2428970336914, "loss": 0.4767, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3670604228973389, "rewards/margins": 5.596538543701172, "rewards/rejected": -4.229477882385254, "step": 1213 }, { "epoch": 3.499819429396894, "grad_norm": 8.18755054473877, "learning_rate": 6.49421965317919e-07, "logits/chosen": -3.006850242614746, "logits/rejected": -2.989393472671509, "logps/chosen": -39.77470397949219, "logps/rejected": -97.23899841308594, "loss": 0.4801, "rewards/accuracies": 0.875, "rewards/chosen": 0.9327096939086914, "rewards/margins": 5.789101600646973, "rewards/rejected": -4.856391429901123, "step": 1214 }, { "epoch": 3.502708559046587, "grad_norm": 8.249961853027344, "learning_rate": 6.491329479768786e-07, "logits/chosen": -2.845686197280884, "logits/rejected": -3.058565616607666, "logps/chosen": -29.23155975341797, "logps/rejected": -89.19596099853516, "loss": 0.4107, "rewards/accuracies": 0.875, "rewards/chosen": 1.6856811046600342, "rewards/margins": 5.846856594085693, "rewards/rejected": -4.16117525100708, "step": 1215 }, { "epoch": 3.5055976886962803, "grad_norm": 8.037659645080566, "learning_rate": 6.488439306358381e-07, "logits/chosen": -2.9382524490356445, "logits/rejected": -3.0983996391296387, "logps/chosen": -37.21372985839844, "logps/rejected": -96.94890594482422, "loss": 0.4083, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8608141541481018, "rewards/margins": 5.808571815490723, "rewards/rejected": -4.947758197784424, "step": 1216 }, { "epoch": 3.508486818345973, "grad_norm": 7.019534111022949, "learning_rate": 6.485549132947976e-07, "logits/chosen": -2.8899686336517334, "logits/rejected": -3.005509376525879, "logps/chosen": -37.99907302856445, "logps/rejected": -88.30801391601562, "loss": 0.4923, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8494772911071777, "rewards/margins": 4.673811912536621, "rewards/rejected": -3.824334144592285, "step": 1217 }, { "epoch": 3.511375947995666, "grad_norm": 7.874727725982666, "learning_rate": 6.482658959537573e-07, "logits/chosen": -3.0458176136016846, "logits/rejected": -3.0874736309051514, "logps/chosen": -35.45365905761719, "logps/rejected": -92.91163635253906, "loss": 0.4668, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9893369078636169, "rewards/margins": 5.500915050506592, "rewards/rejected": -4.51157808303833, "step": 1218 }, { "epoch": 3.5142650776453594, "grad_norm": 6.3698883056640625, "learning_rate": 6.479768786127168e-07, "logits/chosen": -2.973655939102173, "logits/rejected": -3.2157230377197266, "logps/chosen": -19.431163787841797, "logps/rejected": -84.90454864501953, "loss": 0.3561, "rewards/accuracies": 0.96875, "rewards/chosen": 2.410310745239258, "rewards/margins": 6.316949367523193, "rewards/rejected": -3.9066381454467773, "step": 1219 }, { "epoch": 3.5171542072950523, "grad_norm": 7.793456554412842, "learning_rate": 6.476878612716763e-07, "logits/chosen": -3.014634609222412, "logits/rejected": -3.2039687633514404, "logps/chosen": -30.296422958374023, "logps/rejected": -101.3128662109375, "loss": 0.3975, "rewards/accuracies": 1.0, "rewards/chosen": 1.3563472032546997, "rewards/margins": 6.589505195617676, "rewards/rejected": -5.233157157897949, "step": 1220 }, { "epoch": 3.5200433369447452, "grad_norm": 8.637478828430176, "learning_rate": 6.473988439306358e-07, "logits/chosen": -2.9676432609558105, "logits/rejected": -3.1644442081451416, "logps/chosen": -30.066843032836914, "logps/rejected": -93.79949188232422, "loss": 0.3805, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6679936647415161, "rewards/margins": 6.079108238220215, "rewards/rejected": -4.411114692687988, "step": 1221 }, { "epoch": 3.5229324665944386, "grad_norm": 11.361743927001953, "learning_rate": 6.471098265895954e-07, "logits/chosen": -2.9500668048858643, "logits/rejected": -3.1605334281921387, "logps/chosen": -28.5344181060791, "logps/rejected": -90.54347229003906, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": 1.7476998567581177, "rewards/margins": 6.04937744140625, "rewards/rejected": -4.301677227020264, "step": 1222 }, { "epoch": 3.5258215962441315, "grad_norm": 7.506185054779053, "learning_rate": 6.468208092485548e-07, "logits/chosen": -2.8712007999420166, "logits/rejected": -3.01094388961792, "logps/chosen": -29.39912223815918, "logps/rejected": -84.37739562988281, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": 1.7277075052261353, "rewards/margins": 5.302696228027344, "rewards/rejected": -3.574988842010498, "step": 1223 }, { "epoch": 3.5287107258938244, "grad_norm": 7.1562819480896, "learning_rate": 6.465317919075144e-07, "logits/chosen": -2.963535785675049, "logits/rejected": -3.003194570541382, "logps/chosen": -38.45402526855469, "logps/rejected": -97.59296417236328, "loss": 0.4588, "rewards/accuracies": 1.0, "rewards/chosen": 0.9794670939445496, "rewards/margins": 5.558536529541016, "rewards/rejected": -4.579068660736084, "step": 1224 }, { "epoch": 3.5315998555435177, "grad_norm": 9.75706958770752, "learning_rate": 6.462427745664739e-07, "logits/chosen": -2.8683271408081055, "logits/rejected": -2.9837002754211426, "logps/chosen": -30.965600967407227, "logps/rejected": -84.55060577392578, "loss": 0.4156, "rewards/accuracies": 0.9375, "rewards/chosen": 1.686657428741455, "rewards/margins": 5.352930068969727, "rewards/rejected": -3.6662726402282715, "step": 1225 }, { "epoch": 3.5344889851932106, "grad_norm": 7.215769290924072, "learning_rate": 6.459537572254336e-07, "logits/chosen": -3.0040650367736816, "logits/rejected": -3.1872568130493164, "logps/chosen": -30.604522705078125, "logps/rejected": -94.168212890625, "loss": 0.3825, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1878774166107178, "rewards/margins": 5.99579381942749, "rewards/rejected": -4.80791711807251, "step": 1226 }, { "epoch": 3.5373781148429035, "grad_norm": 8.277982711791992, "learning_rate": 6.45664739884393e-07, "logits/chosen": -2.9692459106445312, "logits/rejected": -3.086988925933838, "logps/chosen": -29.58713150024414, "logps/rejected": -85.84359741210938, "loss": 0.4268, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4270594120025635, "rewards/margins": 5.338814735412598, "rewards/rejected": -3.911755084991455, "step": 1227 }, { "epoch": 3.5402672444925964, "grad_norm": 9.152341842651367, "learning_rate": 6.453757225433526e-07, "logits/chosen": -2.9192609786987305, "logits/rejected": -3.032089948654175, "logps/chosen": -27.110532760620117, "logps/rejected": -89.43904113769531, "loss": 0.3964, "rewards/accuracies": 1.0, "rewards/chosen": 1.7016956806182861, "rewards/margins": 5.838253021240234, "rewards/rejected": -4.1365580558776855, "step": 1228 }, { "epoch": 3.54315637414229, "grad_norm": 10.160305976867676, "learning_rate": 6.450867052023122e-07, "logits/chosen": -2.9451560974121094, "logits/rejected": -3.154487371444702, "logps/chosen": -33.79972457885742, "logps/rejected": -77.13689422607422, "loss": 0.49, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9979040622711182, "rewards/margins": 4.478257179260254, "rewards/rejected": -3.4803526401519775, "step": 1229 }, { "epoch": 3.5460455037919827, "grad_norm": 7.475276470184326, "learning_rate": 6.447976878612716e-07, "logits/chosen": -2.8626627922058105, "logits/rejected": -3.0126264095306396, "logps/chosen": -36.83469772338867, "logps/rejected": -90.08132934570312, "loss": 0.481, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0368261337280273, "rewards/margins": 5.167102813720703, "rewards/rejected": -4.130276679992676, "step": 1230 }, { "epoch": 3.5489346334416756, "grad_norm": 10.445663452148438, "learning_rate": 6.445086705202312e-07, "logits/chosen": -2.9333600997924805, "logits/rejected": -3.1190388202667236, "logps/chosen": -30.602964401245117, "logps/rejected": -84.45314025878906, "loss": 0.4738, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5905400514602661, "rewards/margins": 5.136867046356201, "rewards/rejected": -3.546327590942383, "step": 1231 }, { "epoch": 3.5518237630913685, "grad_norm": 8.825756072998047, "learning_rate": 6.442196531791907e-07, "logits/chosen": -2.9245262145996094, "logits/rejected": -3.0870044231414795, "logps/chosen": -27.412065505981445, "logps/rejected": -86.08302307128906, "loss": 0.3792, "rewards/accuracies": 0.9375, "rewards/chosen": 1.844475507736206, "rewards/margins": 5.5756754875183105, "rewards/rejected": -3.7311997413635254, "step": 1232 }, { "epoch": 3.554712892741062, "grad_norm": 9.236495971679688, "learning_rate": 6.439306358381503e-07, "logits/chosen": -3.00203537940979, "logits/rejected": -3.133101224899292, "logps/chosen": -40.4885139465332, "logps/rejected": -93.68370056152344, "loss": 0.5421, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4945104122161865, "rewards/margins": 4.930638313293457, "rewards/rejected": -4.43612813949585, "step": 1233 }, { "epoch": 3.5576020223907547, "grad_norm": 9.213461875915527, "learning_rate": 6.436416184971098e-07, "logits/chosen": -3.007295846939087, "logits/rejected": -3.1365039348602295, "logps/chosen": -27.974456787109375, "logps/rejected": -93.53524780273438, "loss": 0.3189, "rewards/accuracies": 0.9375, "rewards/chosen": 1.793336033821106, "rewards/margins": 6.164029121398926, "rewards/rejected": -4.370693683624268, "step": 1234 }, { "epoch": 3.5604911520404476, "grad_norm": 7.540794372558594, "learning_rate": 6.433526011560694e-07, "logits/chosen": -2.933330535888672, "logits/rejected": -3.1158790588378906, "logps/chosen": -33.787479400634766, "logps/rejected": -89.52913665771484, "loss": 0.4877, "rewards/accuracies": 0.96875, "rewards/chosen": 1.113220453262329, "rewards/margins": 5.337453365325928, "rewards/rejected": -4.224233150482178, "step": 1235 }, { "epoch": 3.563380281690141, "grad_norm": 8.40842056274414, "learning_rate": 6.430635838150289e-07, "logits/chosen": -2.8361144065856934, "logits/rejected": -3.044712543487549, "logps/chosen": -25.59959602355957, "logps/rejected": -79.72248077392578, "loss": 0.4166, "rewards/accuracies": 1.0, "rewards/chosen": 2.0432844161987305, "rewards/margins": 5.417537212371826, "rewards/rejected": -3.374253273010254, "step": 1236 }, { "epoch": 3.566269411339834, "grad_norm": 9.145458221435547, "learning_rate": 6.427745664739884e-07, "logits/chosen": -2.948995590209961, "logits/rejected": -3.2094759941101074, "logps/chosen": -27.112655639648438, "logps/rejected": -79.4100112915039, "loss": 0.4758, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7725131511688232, "rewards/margins": 5.043639659881592, "rewards/rejected": -3.2711257934570312, "step": 1237 }, { "epoch": 3.569158540989527, "grad_norm": 8.920918464660645, "learning_rate": 6.424855491329479e-07, "logits/chosen": -2.832505702972412, "logits/rejected": -3.0362558364868164, "logps/chosen": -25.203399658203125, "logps/rejected": -80.86105346679688, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 2.1308040618896484, "rewards/margins": 5.485569477081299, "rewards/rejected": -3.3547656536102295, "step": 1238 }, { "epoch": 3.57204767063922, "grad_norm": 9.466201782226562, "learning_rate": 6.421965317919075e-07, "logits/chosen": -2.9332873821258545, "logits/rejected": -3.0575881004333496, "logps/chosen": -33.612308502197266, "logps/rejected": -79.11341094970703, "loss": 0.5334, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9029427766799927, "rewards/margins": 4.069962501525879, "rewards/rejected": -3.167019844055176, "step": 1239 }, { "epoch": 3.574936800288913, "grad_norm": 8.4860200881958, "learning_rate": 6.419075144508669e-07, "logits/chosen": -2.9858741760253906, "logits/rejected": -3.20950984954834, "logps/chosen": -28.2838191986084, "logps/rejected": -84.21061706542969, "loss": 0.4675, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7629954814910889, "rewards/margins": 5.2871599197387695, "rewards/rejected": -3.5241641998291016, "step": 1240 }, { "epoch": 3.577825929938606, "grad_norm": 8.994297981262207, "learning_rate": 6.416184971098265e-07, "logits/chosen": -2.8558547496795654, "logits/rejected": -3.1094908714294434, "logps/chosen": -19.59609603881836, "logps/rejected": -74.54452514648438, "loss": 0.3656, "rewards/accuracies": 0.96875, "rewards/chosen": 2.428563117980957, "rewards/margins": 5.229296684265137, "rewards/rejected": -2.800733804702759, "step": 1241 }, { "epoch": 3.5807150595882993, "grad_norm": 8.678279876708984, "learning_rate": 6.413294797687862e-07, "logits/chosen": -2.8992364406585693, "logits/rejected": -3.1029205322265625, "logps/chosen": -31.849231719970703, "logps/rejected": -95.31643676757812, "loss": 0.4195, "rewards/accuracies": 1.0, "rewards/chosen": 1.189420223236084, "rewards/margins": 5.8580780029296875, "rewards/rejected": -4.6686577796936035, "step": 1242 }, { "epoch": 3.583604189237992, "grad_norm": 7.877676963806152, "learning_rate": 6.410404624277457e-07, "logits/chosen": -2.898036479949951, "logits/rejected": -3.0398874282836914, "logps/chosen": -36.92192840576172, "logps/rejected": -88.4075927734375, "loss": 0.5341, "rewards/accuracies": 0.9375, "rewards/chosen": 0.91129070520401, "rewards/margins": 4.695516109466553, "rewards/rejected": -3.7842259407043457, "step": 1243 }, { "epoch": 3.586493318887685, "grad_norm": 9.204252243041992, "learning_rate": 6.407514450867052e-07, "logits/chosen": -2.895111322402954, "logits/rejected": -3.0419697761535645, "logps/chosen": -29.167774200439453, "logps/rejected": -92.14838409423828, "loss": 0.3665, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7164334058761597, "rewards/margins": 6.023285865783691, "rewards/rejected": -4.306852340698242, "step": 1244 }, { "epoch": 3.589382448537378, "grad_norm": 6.825695991516113, "learning_rate": 6.404624277456647e-07, "logits/chosen": -2.929023504257202, "logits/rejected": -3.052938222885132, "logps/chosen": -33.45384216308594, "logps/rejected": -98.30229949951172, "loss": 0.3576, "rewards/accuracies": 1.0, "rewards/chosen": 1.5508506298065186, "rewards/margins": 6.557228088378906, "rewards/rejected": -5.006377220153809, "step": 1245 }, { "epoch": 3.592271578187071, "grad_norm": 8.285348892211914, "learning_rate": 6.401734104046243e-07, "logits/chosen": -2.8752970695495605, "logits/rejected": -3.067911386489868, "logps/chosen": -34.16585922241211, "logps/rejected": -100.17144775390625, "loss": 0.4173, "rewards/accuracies": 0.96875, "rewards/chosen": 1.277269959449768, "rewards/margins": 6.265329360961914, "rewards/rejected": -4.988059043884277, "step": 1246 }, { "epoch": 3.5951607078367642, "grad_norm": 7.339543342590332, "learning_rate": 6.398843930635837e-07, "logits/chosen": -2.912506341934204, "logits/rejected": -3.0001461505889893, "logps/chosen": -28.192447662353516, "logps/rejected": -86.46900939941406, "loss": 0.3653, "rewards/accuracies": 1.0, "rewards/chosen": 1.7898435592651367, "rewards/margins": 5.79799222946167, "rewards/rejected": -4.008149147033691, "step": 1247 }, { "epoch": 3.598049837486457, "grad_norm": 8.546707153320312, "learning_rate": 6.395953757225433e-07, "logits/chosen": -2.962319850921631, "logits/rejected": -3.1828808784484863, "logps/chosen": -36.652626037597656, "logps/rejected": -90.96659088134766, "loss": 0.5226, "rewards/accuracies": 1.0, "rewards/chosen": 0.6067295074462891, "rewards/margins": 5.111895561218262, "rewards/rejected": -4.505166053771973, "step": 1248 }, { "epoch": 3.60093896713615, "grad_norm": 8.468091011047363, "learning_rate": 6.393063583815028e-07, "logits/chosen": -3.064500570297241, "logits/rejected": -3.1690220832824707, "logps/chosen": -39.99603271484375, "logps/rejected": -106.20087432861328, "loss": 0.3907, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6233406662940979, "rewards/margins": 6.322566032409668, "rewards/rejected": -5.699225425720215, "step": 1249 }, { "epoch": 3.6038280967858434, "grad_norm": 8.981438636779785, "learning_rate": 6.390173410404625e-07, "logits/chosen": -3.0344948768615723, "logits/rejected": -3.1598198413848877, "logps/chosen": -30.781564712524414, "logps/rejected": -92.44019317626953, "loss": 0.383, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2135969400405884, "rewards/margins": 5.881299018859863, "rewards/rejected": -4.667702674865723, "step": 1250 }, { "epoch": 3.6067172264355363, "grad_norm": 6.708261966705322, "learning_rate": 6.38728323699422e-07, "logits/chosen": -2.9003703594207764, "logits/rejected": -3.1286780834198, "logps/chosen": -36.775028228759766, "logps/rejected": -101.00663757324219, "loss": 0.4414, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0973063707351685, "rewards/margins": 6.19039249420166, "rewards/rejected": -5.0930867195129395, "step": 1251 }, { "epoch": 3.609606356085229, "grad_norm": 6.909271717071533, "learning_rate": 6.384393063583815e-07, "logits/chosen": -3.016122341156006, "logits/rejected": -3.1688549518585205, "logps/chosen": -34.70520782470703, "logps/rejected": -108.47164154052734, "loss": 0.3662, "rewards/accuracies": 0.96875, "rewards/chosen": 1.062281608581543, "rewards/margins": 6.793678283691406, "rewards/rejected": -5.7313971519470215, "step": 1252 }, { "epoch": 3.6124954857349225, "grad_norm": 8.312588691711426, "learning_rate": 6.381502890173411e-07, "logits/chosen": -2.944061756134033, "logits/rejected": -3.0993809700012207, "logps/chosen": -42.46772766113281, "logps/rejected": -104.24906921386719, "loss": 0.4872, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5311974287033081, "rewards/margins": 6.1976318359375, "rewards/rejected": -5.666434288024902, "step": 1253 }, { "epoch": 3.6153846153846154, "grad_norm": 7.004942417144775, "learning_rate": 6.378612716763005e-07, "logits/chosen": -2.910881757736206, "logits/rejected": -3.0741944313049316, "logps/chosen": -40.195465087890625, "logps/rejected": -95.2978286743164, "loss": 0.4999, "rewards/accuracies": 0.90625, "rewards/chosen": 0.886313796043396, "rewards/margins": 5.657858371734619, "rewards/rejected": -4.771543502807617, "step": 1254 }, { "epoch": 3.6182737450343083, "grad_norm": 8.615281105041504, "learning_rate": 6.375722543352601e-07, "logits/chosen": -2.945834159851074, "logits/rejected": -3.028817892074585, "logps/chosen": -37.91835403442383, "logps/rejected": -90.60009002685547, "loss": 0.4504, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8373653292655945, "rewards/margins": 5.1907548904418945, "rewards/rejected": -4.353389263153076, "step": 1255 }, { "epoch": 3.6211628746840017, "grad_norm": 9.410245895385742, "learning_rate": 6.372832369942196e-07, "logits/chosen": -2.895263433456421, "logits/rejected": -3.1565561294555664, "logps/chosen": -30.348867416381836, "logps/rejected": -88.51463317871094, "loss": 0.4002, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4102795124053955, "rewards/margins": 5.329163551330566, "rewards/rejected": -3.918883800506592, "step": 1256 }, { "epoch": 3.6240520043336946, "grad_norm": 8.582783699035645, "learning_rate": 6.369942196531791e-07, "logits/chosen": -3.036875009536743, "logits/rejected": -3.107023000717163, "logps/chosen": -37.05952072143555, "logps/rejected": -89.54093933105469, "loss": 0.4214, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7655221819877625, "rewards/margins": 5.069192886352539, "rewards/rejected": -4.303670883178711, "step": 1257 }, { "epoch": 3.6269411339833875, "grad_norm": 7.015293121337891, "learning_rate": 6.367052023121386e-07, "logits/chosen": -2.928175687789917, "logits/rejected": -3.1846513748168945, "logps/chosen": -40.7780876159668, "logps/rejected": -111.16459655761719, "loss": 0.3631, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6327213048934937, "rewards/margins": 6.726260185241699, "rewards/rejected": -6.093538761138916, "step": 1258 }, { "epoch": 3.629830263633081, "grad_norm": 8.813332557678223, "learning_rate": 6.364161849710983e-07, "logits/chosen": -2.965904951095581, "logits/rejected": -3.193467378616333, "logps/chosen": -26.350292205810547, "logps/rejected": -85.18939971923828, "loss": 0.4045, "rewards/accuracies": 0.9375, "rewards/chosen": 1.985276460647583, "rewards/margins": 5.565190315246582, "rewards/rejected": -3.579914093017578, "step": 1259 }, { "epoch": 3.6327193932827737, "grad_norm": 7.513662815093994, "learning_rate": 6.361271676300578e-07, "logits/chosen": -2.9071717262268066, "logits/rejected": -3.1528396606445312, "logps/chosen": -33.502586364746094, "logps/rejected": -104.00751495361328, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 1.8256784677505493, "rewards/margins": 7.343685150146484, "rewards/rejected": -5.518007278442383, "step": 1260 }, { "epoch": 3.6327193932827737, "eval_logits/chosen": -3.008526563644409, "eval_logits/rejected": -3.193986177444458, "eval_logps/chosen": -38.94639205932617, "eval_logps/rejected": -99.6908187866211, "eval_loss": 0.5012088418006897, "eval_rewards/accuracies": 0.9516128897666931, "eval_rewards/chosen": 0.7370027303695679, "eval_rewards/margins": 6.011066913604736, "eval_rewards/rejected": -5.274064064025879, "eval_runtime": 225.8217, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.275, "step": 1260 }, { "epoch": 3.6356085229324666, "grad_norm": 8.651247024536133, "learning_rate": 6.358381502890173e-07, "logits/chosen": -2.9020297527313232, "logits/rejected": -3.131164073944092, "logps/chosen": -29.607004165649414, "logps/rejected": -87.72140502929688, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.7781766653060913, "rewards/margins": 5.800894737243652, "rewards/rejected": -4.0227179527282715, "step": 1261 }, { "epoch": 3.6384976525821595, "grad_norm": 10.72392749786377, "learning_rate": 6.355491329479769e-07, "logits/chosen": -3.034162998199463, "logits/rejected": -3.0947651863098145, "logps/chosen": -33.65589141845703, "logps/rejected": -94.8243179321289, "loss": 0.4635, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2625701427459717, "rewards/margins": 5.810977935791016, "rewards/rejected": -4.548407554626465, "step": 1262 }, { "epoch": 3.6413867822318524, "grad_norm": 7.951934814453125, "learning_rate": 6.352601156069364e-07, "logits/chosen": -2.8824455738067627, "logits/rejected": -3.0081567764282227, "logps/chosen": -31.326457977294922, "logps/rejected": -97.51856994628906, "loss": 0.3287, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8120312690734863, "rewards/margins": 6.558890342712402, "rewards/rejected": -4.746859073638916, "step": 1263 }, { "epoch": 3.6442759118815458, "grad_norm": 6.501068115234375, "learning_rate": 6.349710982658959e-07, "logits/chosen": -3.032130479812622, "logits/rejected": -3.1707499027252197, "logps/chosen": -41.47872543334961, "logps/rejected": -99.60432434082031, "loss": 0.5137, "rewards/accuracies": 1.0, "rewards/chosen": 0.69719398021698, "rewards/margins": 5.480794429779053, "rewards/rejected": -4.783599853515625, "step": 1264 }, { "epoch": 3.6471650415312387, "grad_norm": 6.216990947723389, "learning_rate": 6.346820809248554e-07, "logits/chosen": -2.911954879760742, "logits/rejected": -3.0588181018829346, "logps/chosen": -33.84471130371094, "logps/rejected": -101.1288070678711, "loss": 0.4007, "rewards/accuracies": 0.96875, "rewards/chosen": 1.260469913482666, "rewards/margins": 6.580338478088379, "rewards/rejected": -5.319869041442871, "step": 1265 }, { "epoch": 3.6500541711809316, "grad_norm": 8.10071086883545, "learning_rate": 6.34393063583815e-07, "logits/chosen": -2.8740811347961426, "logits/rejected": -3.1627378463745117, "logps/chosen": -29.657670974731445, "logps/rejected": -88.42770385742188, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 1.7509692907333374, "rewards/margins": 5.775938987731934, "rewards/rejected": -4.024969577789307, "step": 1266 }, { "epoch": 3.652943300830625, "grad_norm": 7.832666873931885, "learning_rate": 6.341040462427746e-07, "logits/chosen": -2.988461494445801, "logits/rejected": -3.10882306098938, "logps/chosen": -32.31313705444336, "logps/rejected": -92.81288146972656, "loss": 0.4518, "rewards/accuracies": 0.96875, "rewards/chosen": 1.01151442527771, "rewards/margins": 5.597645282745361, "rewards/rejected": -4.586131572723389, "step": 1267 }, { "epoch": 3.655832430480318, "grad_norm": 10.159040451049805, "learning_rate": 6.338150289017341e-07, "logits/chosen": -2.962517499923706, "logits/rejected": -3.099689483642578, "logps/chosen": -31.13443374633789, "logps/rejected": -91.37663269042969, "loss": 0.4266, "rewards/accuracies": 1.0, "rewards/chosen": 1.324827790260315, "rewards/margins": 5.64141845703125, "rewards/rejected": -4.316590785980225, "step": 1268 }, { "epoch": 3.6587215601300107, "grad_norm": 10.42715835571289, "learning_rate": 6.335260115606936e-07, "logits/chosen": -2.9960882663726807, "logits/rejected": -3.199423313140869, "logps/chosen": -32.648658752441406, "logps/rejected": -75.19529724121094, "loss": 0.5175, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3349902629852295, "rewards/margins": 4.139366626739502, "rewards/rejected": -2.8043766021728516, "step": 1269 }, { "epoch": 3.661610689779704, "grad_norm": 8.158656120300293, "learning_rate": 6.332369942196532e-07, "logits/chosen": -2.917339563369751, "logits/rejected": -3.203784465789795, "logps/chosen": -36.022056579589844, "logps/rejected": -95.25109100341797, "loss": 0.4454, "rewards/accuracies": 0.96875, "rewards/chosen": 1.349752426147461, "rewards/margins": 6.054520606994629, "rewards/rejected": -4.704768180847168, "step": 1270 }, { "epoch": 3.664499819429397, "grad_norm": 7.766666889190674, "learning_rate": 6.329479768786126e-07, "logits/chosen": -3.004528284072876, "logits/rejected": -3.200566530227661, "logps/chosen": -35.77241134643555, "logps/rejected": -103.2925033569336, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9117356538772583, "rewards/margins": 6.529834747314453, "rewards/rejected": -5.618098735809326, "step": 1271 }, { "epoch": 3.66738894907909, "grad_norm": 8.723661422729492, "learning_rate": 6.326589595375722e-07, "logits/chosen": -2.9288454055786133, "logits/rejected": -3.0945301055908203, "logps/chosen": -34.12077713012695, "logps/rejected": -94.21281433105469, "loss": 0.4173, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0061585903167725, "rewards/margins": 5.55726432800293, "rewards/rejected": -4.5511064529418945, "step": 1272 }, { "epoch": 3.670278078728783, "grad_norm": 10.636380195617676, "learning_rate": 6.323699421965318e-07, "logits/chosen": -2.9453649520874023, "logits/rejected": -3.1181657314300537, "logps/chosen": -28.582992553710938, "logps/rejected": -88.62783813476562, "loss": 0.4376, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5678489208221436, "rewards/margins": 5.647963047027588, "rewards/rejected": -4.080114364624023, "step": 1273 }, { "epoch": 3.673167208378476, "grad_norm": 11.469963073730469, "learning_rate": 6.320809248554912e-07, "logits/chosen": -3.026515007019043, "logits/rejected": -3.192876100540161, "logps/chosen": -43.76711654663086, "logps/rejected": -104.7932357788086, "loss": 0.5105, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44364404678344727, "rewards/margins": 5.881948471069336, "rewards/rejected": -5.438304424285889, "step": 1274 }, { "epoch": 3.676056338028169, "grad_norm": 9.576238632202148, "learning_rate": 6.317919075144509e-07, "logits/chosen": -2.9993271827697754, "logits/rejected": -3.215329170227051, "logps/chosen": -20.572526931762695, "logps/rejected": -86.45536804199219, "loss": 0.2822, "rewards/accuracies": 1.0, "rewards/chosen": 2.191366672515869, "rewards/margins": 6.0251288414001465, "rewards/rejected": -3.8337624073028564, "step": 1275 }, { "epoch": 3.678945467677862, "grad_norm": 11.28487777709961, "learning_rate": 6.315028901734104e-07, "logits/chosen": -2.852379322052002, "logits/rejected": -3.086193799972534, "logps/chosen": -26.660858154296875, "logps/rejected": -84.0394287109375, "loss": 0.3865, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0371901988983154, "rewards/margins": 5.467850685119629, "rewards/rejected": -3.4306602478027344, "step": 1276 }, { "epoch": 3.681834597327555, "grad_norm": 8.467432975769043, "learning_rate": 6.3121387283237e-07, "logits/chosen": -2.952434539794922, "logits/rejected": -3.0182394981384277, "logps/chosen": -39.75359344482422, "logps/rejected": -95.61465454101562, "loss": 0.4634, "rewards/accuracies": 1.0, "rewards/chosen": 0.8731884360313416, "rewards/margins": 5.583732604980469, "rewards/rejected": -4.710544586181641, "step": 1277 }, { "epoch": 3.684723726977248, "grad_norm": 12.228453636169434, "learning_rate": 6.309248554913294e-07, "logits/chosen": -2.986982822418213, "logits/rejected": -3.2089011669158936, "logps/chosen": -28.046567916870117, "logps/rejected": -81.62374877929688, "loss": 0.4105, "rewards/accuracies": 0.96875, "rewards/chosen": 1.572021722793579, "rewards/margins": 5.1242475509643555, "rewards/rejected": -3.5522255897521973, "step": 1278 }, { "epoch": 3.687612856626941, "grad_norm": 6.8759918212890625, "learning_rate": 6.30635838150289e-07, "logits/chosen": -2.942202091217041, "logits/rejected": -3.2127113342285156, "logps/chosen": -26.995054244995117, "logps/rejected": -91.74103546142578, "loss": 0.425, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6337214708328247, "rewards/margins": 6.251190185546875, "rewards/rejected": -4.61746883392334, "step": 1279 }, { "epoch": 3.690501986276634, "grad_norm": 8.54127025604248, "learning_rate": 6.303468208092485e-07, "logits/chosen": -3.001784563064575, "logits/rejected": -3.0979089736938477, "logps/chosen": -39.14813232421875, "logps/rejected": -106.13145446777344, "loss": 0.4382, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8357616662979126, "rewards/margins": 6.442821025848389, "rewards/rejected": -5.607059478759766, "step": 1280 }, { "epoch": 3.6933911159263273, "grad_norm": 10.30473804473877, "learning_rate": 6.30057803468208e-07, "logits/chosen": -3.053086519241333, "logits/rejected": -3.204604148864746, "logps/chosen": -26.53069305419922, "logps/rejected": -91.05410766601562, "loss": 0.369, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7811554670333862, "rewards/margins": 5.934622287750244, "rewards/rejected": -4.153467178344727, "step": 1281 }, { "epoch": 3.69628024557602, "grad_norm": 5.874949932098389, "learning_rate": 6.297687861271675e-07, "logits/chosen": -2.926111936569214, "logits/rejected": -3.0671074390411377, "logps/chosen": -31.30632972717285, "logps/rejected": -93.29002380371094, "loss": 0.3642, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4815101623535156, "rewards/margins": 5.873227119445801, "rewards/rejected": -4.391716003417969, "step": 1282 }, { "epoch": 3.699169375225713, "grad_norm": 7.544445991516113, "learning_rate": 6.294797687861272e-07, "logits/chosen": -2.8806920051574707, "logits/rejected": -3.1091766357421875, "logps/chosen": -30.29092025756836, "logps/rejected": -96.11750793457031, "loss": 0.3988, "rewards/accuracies": 1.0, "rewards/chosen": 1.498199224472046, "rewards/margins": 6.034296035766602, "rewards/rejected": -4.536096572875977, "step": 1283 }, { "epoch": 3.7020585048754064, "grad_norm": 7.132835865020752, "learning_rate": 6.291907514450868e-07, "logits/chosen": -2.9753618240356445, "logits/rejected": -3.141683578491211, "logps/chosen": -36.16944885253906, "logps/rejected": -94.68183898925781, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 0.7259795665740967, "rewards/margins": 5.485468864440918, "rewards/rejected": -4.759490013122559, "step": 1284 }, { "epoch": 3.7049476345250993, "grad_norm": 9.490225791931152, "learning_rate": 6.289017341040462e-07, "logits/chosen": -2.9024858474731445, "logits/rejected": -3.0767874717712402, "logps/chosen": -36.26067352294922, "logps/rejected": -95.51725006103516, "loss": 0.4744, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9274134635925293, "rewards/margins": 5.51552677154541, "rewards/rejected": -4.588112831115723, "step": 1285 }, { "epoch": 3.7078367641747922, "grad_norm": 7.394033908843994, "learning_rate": 6.286127167630058e-07, "logits/chosen": -2.8792781829833984, "logits/rejected": -3.047779083251953, "logps/chosen": -32.31446838378906, "logps/rejected": -91.15709686279297, "loss": 0.3925, "rewards/accuracies": 0.9375, "rewards/chosen": 1.234331727027893, "rewards/margins": 5.4244384765625, "rewards/rejected": -4.190106391906738, "step": 1286 }, { "epoch": 3.7107258938244856, "grad_norm": 10.804142951965332, "learning_rate": 6.283236994219653e-07, "logits/chosen": -2.7796483039855957, "logits/rejected": -3.0226571559906006, "logps/chosen": -31.995439529418945, "logps/rejected": -85.03958892822266, "loss": 0.4106, "rewards/accuracies": 1.0, "rewards/chosen": 1.54103684425354, "rewards/margins": 5.362735748291016, "rewards/rejected": -3.8216991424560547, "step": 1287 }, { "epoch": 3.7136150234741785, "grad_norm": 10.560297012329102, "learning_rate": 6.280346820809248e-07, "logits/chosen": -2.892746925354004, "logits/rejected": -3.196336030960083, "logps/chosen": -31.752071380615234, "logps/rejected": -87.85303497314453, "loss": 0.4109, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3729321956634521, "rewards/margins": 5.3232831954956055, "rewards/rejected": -3.9503509998321533, "step": 1288 }, { "epoch": 3.7165041531238714, "grad_norm": 8.798260688781738, "learning_rate": 6.277456647398843e-07, "logits/chosen": -2.965150833129883, "logits/rejected": -3.1560606956481934, "logps/chosen": -29.44830894470215, "logps/rejected": -81.08546447753906, "loss": 0.4247, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7392700910568237, "rewards/margins": 5.173094749450684, "rewards/rejected": -3.4338245391845703, "step": 1289 }, { "epoch": 3.7193932827735647, "grad_norm": 6.321654796600342, "learning_rate": 6.274566473988439e-07, "logits/chosen": -2.9867191314697266, "logits/rejected": -3.1849727630615234, "logps/chosen": -30.75218963623047, "logps/rejected": -102.37690734863281, "loss": 0.3949, "rewards/accuracies": 0.96875, "rewards/chosen": 1.398028016090393, "rewards/margins": 6.859350204467773, "rewards/rejected": -5.461322784423828, "step": 1290 }, { "epoch": 3.7222824124232576, "grad_norm": 6.9565324783325195, "learning_rate": 6.271676300578034e-07, "logits/chosen": -2.9544665813446045, "logits/rejected": -3.1614129543304443, "logps/chosen": -37.29814529418945, "logps/rejected": -97.78783416748047, "loss": 0.4603, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8973726034164429, "rewards/margins": 5.604400157928467, "rewards/rejected": -4.707027435302734, "step": 1291 }, { "epoch": 3.7251715420729505, "grad_norm": 8.862770080566406, "learning_rate": 6.26878612716763e-07, "logits/chosen": -2.9781455993652344, "logits/rejected": -2.968607187271118, "logps/chosen": -35.6072883605957, "logps/rejected": -93.04724884033203, "loss": 0.4355, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0734314918518066, "rewards/margins": 5.586458206176758, "rewards/rejected": -4.513027191162109, "step": 1292 }, { "epoch": 3.7280606717226434, "grad_norm": 8.486722946166992, "learning_rate": 6.265895953757225e-07, "logits/chosen": -3.0120797157287598, "logits/rejected": -3.1828064918518066, "logps/chosen": -35.67335510253906, "logps/rejected": -99.4171142578125, "loss": 0.4516, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1009981632232666, "rewards/margins": 6.195237159729004, "rewards/rejected": -5.094239234924316, "step": 1293 }, { "epoch": 3.7309498013723363, "grad_norm": 8.776713371276855, "learning_rate": 6.263005780346821e-07, "logits/chosen": -2.904238224029541, "logits/rejected": -3.250941753387451, "logps/chosen": -24.616870880126953, "logps/rejected": -86.07714080810547, "loss": 0.3476, "rewards/accuracies": 1.0, "rewards/chosen": 1.9754822254180908, "rewards/margins": 5.872725009918213, "rewards/rejected": -3.8972432613372803, "step": 1294 }, { "epoch": 3.7338389310220297, "grad_norm": 8.880350112915039, "learning_rate": 6.260115606936416e-07, "logits/chosen": -2.7833704948425293, "logits/rejected": -3.027312755584717, "logps/chosen": -24.228862762451172, "logps/rejected": -84.49358367919922, "loss": 0.2795, "rewards/accuracies": 1.0, "rewards/chosen": 1.969635009765625, "rewards/margins": 5.661406517028809, "rewards/rejected": -3.6917715072631836, "step": 1295 }, { "epoch": 3.7367280606717226, "grad_norm": 8.518406867980957, "learning_rate": 6.257225433526011e-07, "logits/chosen": -2.9650044441223145, "logits/rejected": -3.100206136703491, "logps/chosen": -27.03685760498047, "logps/rejected": -89.01072692871094, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 1.898200273513794, "rewards/margins": 6.135906219482422, "rewards/rejected": -4.237705707550049, "step": 1296 }, { "epoch": 3.7396171903214155, "grad_norm": 6.026510715484619, "learning_rate": 6.254335260115607e-07, "logits/chosen": -2.849933624267578, "logits/rejected": -3.0017430782318115, "logps/chosen": -30.230304718017578, "logps/rejected": -86.92579650878906, "loss": 0.4252, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6874921321868896, "rewards/margins": 5.643777370452881, "rewards/rejected": -3.956284999847412, "step": 1297 }, { "epoch": 3.742506319971109, "grad_norm": 7.325735569000244, "learning_rate": 6.251445086705201e-07, "logits/chosen": -2.890697717666626, "logits/rejected": -3.1268725395202637, "logps/chosen": -31.441905975341797, "logps/rejected": -89.51827239990234, "loss": 0.393, "rewards/accuracies": 1.0, "rewards/chosen": 1.4528748989105225, "rewards/margins": 5.664659023284912, "rewards/rejected": -4.211784362792969, "step": 1298 }, { "epoch": 3.7453954496208017, "grad_norm": 9.43138599395752, "learning_rate": 6.248554913294798e-07, "logits/chosen": -2.9255828857421875, "logits/rejected": -3.1297242641448975, "logps/chosen": -35.78047180175781, "logps/rejected": -92.50003051757812, "loss": 0.5387, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0284216403961182, "rewards/margins": 5.376994609832764, "rewards/rejected": -4.348572731018066, "step": 1299 }, { "epoch": 3.7482845792704946, "grad_norm": 7.800970554351807, "learning_rate": 6.245664739884393e-07, "logits/chosen": -2.8595213890075684, "logits/rejected": -3.0280466079711914, "logps/chosen": -26.743465423583984, "logps/rejected": -96.67745971679688, "loss": 0.3234, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9357280731201172, "rewards/margins": 6.433441162109375, "rewards/rejected": -4.497713088989258, "step": 1300 }, { "epoch": 3.751173708920188, "grad_norm": 7.636995315551758, "learning_rate": 6.242774566473989e-07, "logits/chosen": -3.0153157711029053, "logits/rejected": -3.1430768966674805, "logps/chosen": -34.400970458984375, "logps/rejected": -102.57356262207031, "loss": 0.43, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1373168230056763, "rewards/margins": 6.549903869628906, "rewards/rejected": -5.412588119506836, "step": 1301 }, { "epoch": 3.754062838569881, "grad_norm": 7.0435662269592285, "learning_rate": 6.239884393063583e-07, "logits/chosen": -2.9665298461914062, "logits/rejected": -3.182804584503174, "logps/chosen": -31.239091873168945, "logps/rejected": -92.1754150390625, "loss": 0.4018, "rewards/accuracies": 1.0, "rewards/chosen": 1.580966591835022, "rewards/margins": 5.681421756744385, "rewards/rejected": -4.100454807281494, "step": 1302 }, { "epoch": 3.756951968219574, "grad_norm": 7.142702102661133, "learning_rate": 6.236994219653179e-07, "logits/chosen": -2.9029347896575928, "logits/rejected": -3.1449270248413086, "logps/chosen": -35.379425048828125, "logps/rejected": -100.2436294555664, "loss": 0.4154, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1559923887252808, "rewards/margins": 6.46164608001709, "rewards/rejected": -5.305653095245361, "step": 1303 }, { "epoch": 3.759841097869267, "grad_norm": 6.767080307006836, "learning_rate": 6.234104046242774e-07, "logits/chosen": -2.8333141803741455, "logits/rejected": -3.1252388954162598, "logps/chosen": -31.46160888671875, "logps/rejected": -105.51744842529297, "loss": 0.393, "rewards/accuracies": 0.90625, "rewards/chosen": 1.838028907775879, "rewards/margins": 7.214391708374023, "rewards/rejected": -5.376363277435303, "step": 1304 }, { "epoch": 3.76273022751896, "grad_norm": 8.990958213806152, "learning_rate": 6.231213872832369e-07, "logits/chosen": -2.9816713333129883, "logits/rejected": -3.1828510761260986, "logps/chosen": -29.201387405395508, "logps/rejected": -88.30374145507812, "loss": 0.4054, "rewards/accuracies": 0.96875, "rewards/chosen": 1.856555461883545, "rewards/margins": 5.91706657409668, "rewards/rejected": -4.060511112213135, "step": 1305 }, { "epoch": 3.765619357168653, "grad_norm": 11.448803901672363, "learning_rate": 6.228323699421965e-07, "logits/chosen": -2.9838056564331055, "logits/rejected": -3.1232481002807617, "logps/chosen": -33.30843734741211, "logps/rejected": -85.64945983886719, "loss": 0.4777, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2868794202804565, "rewards/margins": 4.889901638031006, "rewards/rejected": -3.6030220985412598, "step": 1306 }, { "epoch": 3.768508486818346, "grad_norm": 9.90556526184082, "learning_rate": 6.225433526011561e-07, "logits/chosen": -2.8940396308898926, "logits/rejected": -3.116547107696533, "logps/chosen": -41.925506591796875, "logps/rejected": -95.68726348876953, "loss": 0.4979, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29552143812179565, "rewards/margins": 5.27392578125, "rewards/rejected": -4.9784040451049805, "step": 1307 }, { "epoch": 3.7713976164680387, "grad_norm": 4.652599334716797, "learning_rate": 6.222543352601156e-07, "logits/chosen": -3.0434064865112305, "logits/rejected": -3.132833480834961, "logps/chosen": -41.78955078125, "logps/rejected": -95.18156433105469, "loss": 0.4407, "rewards/accuracies": 0.90625, "rewards/chosen": 0.48681822419166565, "rewards/margins": 5.110825061798096, "rewards/rejected": -4.624006271362305, "step": 1308 }, { "epoch": 3.774286746117732, "grad_norm": 7.4146037101745605, "learning_rate": 6.219653179190751e-07, "logits/chosen": -2.9681386947631836, "logits/rejected": -3.102508783340454, "logps/chosen": -30.967300415039062, "logps/rejected": -101.53083801269531, "loss": 0.3874, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3581275939941406, "rewards/margins": 6.728959560394287, "rewards/rejected": -5.370831489562988, "step": 1309 }, { "epoch": 3.777175875767425, "grad_norm": 8.26491928100586, "learning_rate": 6.216763005780347e-07, "logits/chosen": -3.046229362487793, "logits/rejected": -3.082897663116455, "logps/chosen": -37.51806640625, "logps/rejected": -99.03260040283203, "loss": 0.5143, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7308778166770935, "rewards/margins": 5.750983238220215, "rewards/rejected": -5.020104885101318, "step": 1310 }, { "epoch": 3.780065005417118, "grad_norm": 9.198770523071289, "learning_rate": 6.213872832369942e-07, "logits/chosen": -2.8291399478912354, "logits/rejected": -3.1426048278808594, "logps/chosen": -29.5699462890625, "logps/rejected": -77.91183471679688, "loss": 0.4202, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9895058870315552, "rewards/margins": 4.9624481201171875, "rewards/rejected": -2.9729421138763428, "step": 1311 }, { "epoch": 3.782954135066811, "grad_norm": 8.516637802124023, "learning_rate": 6.210982658959537e-07, "logits/chosen": -2.9078431129455566, "logits/rejected": -3.1357767581939697, "logps/chosen": -35.340362548828125, "logps/rejected": -99.2737045288086, "loss": 0.3907, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1004703044891357, "rewards/margins": 6.2952165603637695, "rewards/rejected": -5.194746971130371, "step": 1312 }, { "epoch": 3.785843264716504, "grad_norm": 9.254905700683594, "learning_rate": 6.208092485549132e-07, "logits/chosen": -2.854177474975586, "logits/rejected": -3.0429232120513916, "logps/chosen": -29.43996810913086, "logps/rejected": -81.63568115234375, "loss": 0.4364, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6932263374328613, "rewards/margins": 4.900639533996582, "rewards/rejected": -3.2074131965637207, "step": 1313 }, { "epoch": 3.788732394366197, "grad_norm": 9.763927459716797, "learning_rate": 6.205202312138728e-07, "logits/chosen": -2.878307819366455, "logits/rejected": -3.1016199588775635, "logps/chosen": -30.36964225769043, "logps/rejected": -81.01396179199219, "loss": 0.554, "rewards/accuracies": 0.9375, "rewards/chosen": 1.24788498878479, "rewards/margins": 4.55085563659668, "rewards/rejected": -3.3029708862304688, "step": 1314 }, { "epoch": 3.7916215240158904, "grad_norm": 9.266138076782227, "learning_rate": 6.202312138728323e-07, "logits/chosen": -2.965878963470459, "logits/rejected": -3.1496856212615967, "logps/chosen": -40.66381072998047, "logps/rejected": -88.31173706054688, "loss": 0.5309, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7339428067207336, "rewards/margins": 4.821751594543457, "rewards/rejected": -4.087809085845947, "step": 1315 }, { "epoch": 3.7945106536655833, "grad_norm": 7.181875228881836, "learning_rate": 6.199421965317919e-07, "logits/chosen": -2.8800578117370605, "logits/rejected": -3.038363218307495, "logps/chosen": -30.121395111083984, "logps/rejected": -100.32392120361328, "loss": 0.4285, "rewards/accuracies": 0.96875, "rewards/chosen": 1.525291919708252, "rewards/margins": 6.596563816070557, "rewards/rejected": -5.071271896362305, "step": 1316 }, { "epoch": 3.797399783315276, "grad_norm": 9.642192840576172, "learning_rate": 6.196531791907515e-07, "logits/chosen": -3.034661054611206, "logits/rejected": -3.2039642333984375, "logps/chosen": -27.570697784423828, "logps/rejected": -85.83975219726562, "loss": 0.3828, "rewards/accuracies": 1.0, "rewards/chosen": 1.7659170627593994, "rewards/margins": 5.635101318359375, "rewards/rejected": -3.8691837787628174, "step": 1317 }, { "epoch": 3.8002889129649695, "grad_norm": 6.801894664764404, "learning_rate": 6.19364161849711e-07, "logits/chosen": -3.0789079666137695, "logits/rejected": -3.160175323486328, "logps/chosen": -29.678791046142578, "logps/rejected": -86.43124389648438, "loss": 0.4072, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2127935886383057, "rewards/margins": 5.446372985839844, "rewards/rejected": -4.233579635620117, "step": 1318 }, { "epoch": 3.8031780426146624, "grad_norm": 11.4086275100708, "learning_rate": 6.190751445086705e-07, "logits/chosen": -2.977652072906494, "logits/rejected": -3.1269869804382324, "logps/chosen": -29.449831008911133, "logps/rejected": -83.84745788574219, "loss": 0.4091, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8032032251358032, "rewards/margins": 5.449938774108887, "rewards/rejected": -3.646735191345215, "step": 1319 }, { "epoch": 3.8060671722643553, "grad_norm": 9.216382026672363, "learning_rate": 6.1878612716763e-07, "logits/chosen": -2.8904712200164795, "logits/rejected": -3.0881259441375732, "logps/chosen": -27.553316116333008, "logps/rejected": -82.30638885498047, "loss": 0.4234, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8825953006744385, "rewards/margins": 5.5572004318237305, "rewards/rejected": -3.674605369567871, "step": 1320 }, { "epoch": 3.8089563019140487, "grad_norm": 7.786829471588135, "learning_rate": 6.184971098265896e-07, "logits/chosen": -2.8670432567596436, "logits/rejected": -3.1132872104644775, "logps/chosen": -37.376102447509766, "logps/rejected": -100.35604858398438, "loss": 0.441, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1206047534942627, "rewards/margins": 5.8544135093688965, "rewards/rejected": -4.733808517456055, "step": 1321 }, { "epoch": 3.8118454315637416, "grad_norm": 7.223175525665283, "learning_rate": 6.18208092485549e-07, "logits/chosen": -2.9220125675201416, "logits/rejected": -3.112122058868408, "logps/chosen": -33.22113037109375, "logps/rejected": -90.89326477050781, "loss": 0.4276, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4023501873016357, "rewards/margins": 5.527824878692627, "rewards/rejected": -4.125474452972412, "step": 1322 }, { "epoch": 3.8147345612134345, "grad_norm": 9.806281089782715, "learning_rate": 6.179190751445087e-07, "logits/chosen": -3.056105852127075, "logits/rejected": -3.1464078426361084, "logps/chosen": -35.39714050292969, "logps/rejected": -105.7602310180664, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 1.149552583694458, "rewards/margins": 6.855291366577148, "rewards/rejected": -5.705738544464111, "step": 1323 }, { "epoch": 3.8176236908631274, "grad_norm": 9.20378589630127, "learning_rate": 6.176300578034682e-07, "logits/chosen": -2.930341958999634, "logits/rejected": -3.0687973499298096, "logps/chosen": -47.38954544067383, "logps/rejected": -99.28984832763672, "loss": 0.5465, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1679038405418396, "rewards/margins": 5.1587371826171875, "rewards/rejected": -5.326640605926514, "step": 1324 }, { "epoch": 3.8205128205128203, "grad_norm": 8.973315238952637, "learning_rate": 6.173410404624277e-07, "logits/chosen": -2.9308996200561523, "logits/rejected": -3.099889039993286, "logps/chosen": -40.79496765136719, "logps/rejected": -106.27552032470703, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 0.5278165340423584, "rewards/margins": 6.19487190246582, "rewards/rejected": -5.667055606842041, "step": 1325 }, { "epoch": 3.8234019501625136, "grad_norm": 9.016422271728516, "learning_rate": 6.170520231213872e-07, "logits/chosen": -3.0504276752471924, "logits/rejected": -3.204780101776123, "logps/chosen": -35.74081802368164, "logps/rejected": -85.25267028808594, "loss": 0.4602, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9483072757720947, "rewards/margins": 4.616374969482422, "rewards/rejected": -3.668067693710327, "step": 1326 }, { "epoch": 3.8262910798122065, "grad_norm": 7.369863033294678, "learning_rate": 6.167630057803468e-07, "logits/chosen": -2.8666181564331055, "logits/rejected": -3.1520326137542725, "logps/chosen": -34.674049377441406, "logps/rejected": -98.61238098144531, "loss": 0.3678, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2480497360229492, "rewards/margins": 6.074270248413086, "rewards/rejected": -4.826220512390137, "step": 1327 }, { "epoch": 3.8291802094618994, "grad_norm": 12.434473991394043, "learning_rate": 6.164739884393064e-07, "logits/chosen": -2.9082205295562744, "logits/rejected": -2.9641690254211426, "logps/chosen": -33.202022552490234, "logps/rejected": -78.36689758300781, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 1.3553084135055542, "rewards/margins": 4.305822849273682, "rewards/rejected": -2.950514554977417, "step": 1328 }, { "epoch": 3.8320693391115928, "grad_norm": 6.980872631072998, "learning_rate": 6.161849710982658e-07, "logits/chosen": -2.876234292984009, "logits/rejected": -3.090965747833252, "logps/chosen": -32.10536575317383, "logps/rejected": -91.3846206665039, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 1.5599260330200195, "rewards/margins": 5.739920139312744, "rewards/rejected": -4.179994106292725, "step": 1329 }, { "epoch": 3.8349584687612857, "grad_norm": 8.388582229614258, "learning_rate": 6.158959537572254e-07, "logits/chosen": -3.051640033721924, "logits/rejected": -3.148284912109375, "logps/chosen": -37.38250732421875, "logps/rejected": -91.83979797363281, "loss": 0.4177, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7641358375549316, "rewards/margins": 5.39351224899292, "rewards/rejected": -4.629376411437988, "step": 1330 }, { "epoch": 3.8349584687612857, "eval_logits/chosen": -2.981549024581909, "eval_logits/rejected": -3.169443368911743, "eval_logps/chosen": -38.374942779541016, "eval_logps/rejected": -99.9098129272461, "eval_loss": 0.49186035990715027, "eval_rewards/accuracies": 0.975806474685669, "eval_rewards/chosen": 0.7941473126411438, "eval_rewards/margins": 6.090111255645752, "eval_rewards/rejected": -5.295963287353516, "eval_runtime": 223.6073, "eval_samples_per_second": 0.55, "eval_steps_per_second": 0.277, "step": 1330 } ], "logging_steps": 1.0, "max_steps": 3460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 70, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }