{ "best_global_step": 672, "best_metric": 2.965437173843384, "best_model_checkpoint": "/tmp/svadugur/39824/wnr_change_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=True-contexts=hard-39824/checkpoint-672", "epoch": 0.8002977298101972, "eval_steps": 168, "global_step": 672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.10208797454834, "eval_logits/rejected": -2.152259349822998, "eval_logps/chosen": -70.71012878417969, "eval_logps/rejected": -59.822689056396484, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 1957.5803, "eval_samples_per_second": 0.533, "eval_steps_per_second": 0.267, "step": 0 }, { "epoch": 0.001190919240788984, "grad_norm": 2.9556796550750732, "learning_rate": 1e-06, "logits/chosen": -2.1261730194091797, "logits/rejected": -2.188065767288208, "logps/chosen": -74.85185241699219, "logps/rejected": -57.64994812011719, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.002381838481577968, "grad_norm": 2.83097243309021, "learning_rate": 9.998808104886768e-07, "logits/chosen": -2.063091993331909, "logits/rejected": -2.097243070602417, "logps/chosen": -72.62290954589844, "logps/rejected": -54.48834991455078, "loss": 1.0029, "rewards/accuracies": 0.40625, "rewards/chosen": 0.024543095380067825, "rewards/margins": -0.011489558964967728, "rewards/rejected": 0.03603265434503555, "step": 2 }, { "epoch": 0.003572757722366952, "grad_norm": 3.000828266143799, "learning_rate": 9.997616209773539e-07, "logits/chosen": -2.089338541030884, "logits/rejected": -2.2037031650543213, "logps/chosen": -74.82547760009766, "logps/rejected": -57.98599624633789, "loss": 1.0052, "rewards/accuracies": 0.375, "rewards/chosen": -0.021430063992738724, "rewards/margins": -0.020783554762601852, "rewards/rejected": -0.0006465082988142967, "step": 3 }, { "epoch": 0.004763676963155936, "grad_norm": 2.502246618270874, "learning_rate": 9.99642431466031e-07, "logits/chosen": -2.071497678756714, "logits/rejected": -2.076540470123291, "logps/chosen": -68.75935363769531, "logps/rejected": -58.04631805419922, "loss": 1.0052, "rewards/accuracies": 0.46875, "rewards/chosen": -0.02013782411813736, "rewards/margins": -0.020892463624477386, "rewards/rejected": 0.0007546426495537162, "step": 4 }, { "epoch": 0.00595459620394492, "grad_norm": 2.4348957538604736, "learning_rate": 9.99523241954708e-07, "logits/chosen": -2.105354070663452, "logits/rejected": -2.171637773513794, "logps/chosen": -67.39825439453125, "logps/rejected": -58.28297424316406, "loss": 0.9969, "rewards/accuracies": 0.53125, "rewards/chosen": 0.021353675052523613, "rewards/margins": 0.012472379021346569, "rewards/rejected": 0.008881294168531895, "step": 5 }, { "epoch": 0.007145515444733904, "grad_norm": 2.9869771003723145, "learning_rate": 9.99404052443385e-07, "logits/chosen": -2.163001298904419, "logits/rejected": -2.14371395111084, "logps/chosen": -68.87754821777344, "logps/rejected": -64.19584655761719, "loss": 1.0002, "rewards/accuracies": 0.5, "rewards/chosen": 0.002550060860812664, "rewards/margins": -0.0007917839102447033, "rewards/rejected": 0.0033418426755815744, "step": 6 }, { "epoch": 0.008336434685522889, "grad_norm": 3.0197970867156982, "learning_rate": 9.99284862932062e-07, "logits/chosen": -2.1279282569885254, "logits/rejected": -2.2354867458343506, "logps/chosen": -69.26617431640625, "logps/rejected": -62.50086975097656, "loss": 1.0005, "rewards/accuracies": 0.46875, "rewards/chosen": 0.006436777301132679, "rewards/margins": -0.0019582370296120644, "rewards/rejected": 0.008395012468099594, "step": 7 }, { "epoch": 0.009527353926311872, "grad_norm": 2.8384740352630615, "learning_rate": 9.991656734207388e-07, "logits/chosen": -2.083566665649414, "logits/rejected": -2.0853514671325684, "logps/chosen": -71.22711944580078, "logps/rejected": -63.66456604003906, "loss": 0.9962, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004151535220444202, "rewards/margins": 0.015153147280216217, "rewards/rejected": -0.011001610197126865, "step": 8 }, { "epoch": 0.010718273167100856, "grad_norm": 3.503282070159912, "learning_rate": 9.99046483909416e-07, "logits/chosen": -2.0324482917785645, "logits/rejected": -2.0936965942382812, "logps/chosen": -69.4775619506836, "logps/rejected": -64.57220458984375, "loss": 1.0011, "rewards/accuracies": 0.5, "rewards/chosen": -0.008526635356247425, "rewards/margins": -0.004305863752961159, "rewards/rejected": -0.004220771603286266, "step": 9 }, { "epoch": 0.01190919240788984, "grad_norm": 2.5504684448242188, "learning_rate": 9.989272943980928e-07, "logits/chosen": -2.10343599319458, "logits/rejected": -2.137152671813965, "logps/chosen": -71.69035339355469, "logps/rejected": -63.80641555786133, "loss": 0.998, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0012466839980334044, "rewards/margins": 0.008074636571109295, "rewards/rejected": -0.009321319870650768, "step": 10 }, { "epoch": 0.013100111648678824, "grad_norm": 2.882068157196045, "learning_rate": 9.988081048867698e-07, "logits/chosen": -2.2072951793670654, "logits/rejected": -2.167363166809082, "logps/chosen": -69.69296264648438, "logps/rejected": -60.367671966552734, "loss": 1.0003, "rewards/accuracies": 0.5625, "rewards/chosen": -0.023274589329957962, "rewards/margins": -0.0010082365479320288, "rewards/rejected": -0.02226635254919529, "step": 11 }, { "epoch": 0.014291030889467808, "grad_norm": 2.27917218208313, "learning_rate": 9.98688915375447e-07, "logits/chosen": -2.146411418914795, "logits/rejected": -2.18780255317688, "logps/chosen": -66.66046905517578, "logps/rejected": -57.04871368408203, "loss": 1.0062, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0313134640455246, "rewards/margins": -0.02501366101205349, "rewards/rejected": -0.006299805827438831, "step": 12 }, { "epoch": 0.015481950130256793, "grad_norm": 3.0425612926483154, "learning_rate": 9.98569725864124e-07, "logits/chosen": -2.069312334060669, "logits/rejected": -2.1323840618133545, "logps/chosen": -71.17840576171875, "logps/rejected": -59.36183547973633, "loss": 1.0001, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0035680995788425207, "rewards/margins": -0.0005238289013504982, "rewards/rejected": 0.004091931506991386, "step": 13 }, { "epoch": 0.016672869371045777, "grad_norm": 3.2356395721435547, "learning_rate": 9.984505363528009e-07, "logits/chosen": -2.0985915660858154, "logits/rejected": -2.1956639289855957, "logps/chosen": -73.30680847167969, "logps/rejected": -59.56623458862305, "loss": 0.9975, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00831148587167263, "rewards/margins": 0.009911739267408848, "rewards/rejected": -0.0016002531629055738, "step": 14 }, { "epoch": 0.01786378861183476, "grad_norm": 3.5333335399627686, "learning_rate": 9.98331346841478e-07, "logits/chosen": -2.1203396320343018, "logits/rejected": -2.120502471923828, "logps/chosen": -74.4678726196289, "logps/rejected": -63.93107223510742, "loss": 1.0024, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01827826537191868, "rewards/margins": -0.009508490562438965, "rewards/rejected": -0.008769773878157139, "step": 15 }, { "epoch": 0.019054707852623743, "grad_norm": 2.4912896156311035, "learning_rate": 9.982121573301548e-07, "logits/chosen": -2.1713521480560303, "logits/rejected": -2.2329022884368896, "logps/chosen": -68.79865264892578, "logps/rejected": -59.343711853027344, "loss": 1.0022, "rewards/accuracies": 0.5, "rewards/chosen": -0.0037726988084614277, "rewards/margins": -0.008941184729337692, "rewards/rejected": 0.005168485455214977, "step": 16 }, { "epoch": 0.020245627093412728, "grad_norm": 2.482011079788208, "learning_rate": 9.980929678188319e-07, "logits/chosen": -2.1543784141540527, "logits/rejected": -2.191584587097168, "logps/chosen": -68.84781646728516, "logps/rejected": -60.68608093261719, "loss": 0.9981, "rewards/accuracies": 0.625, "rewards/chosen": 0.0315876379609108, "rewards/margins": 0.00746381189674139, "rewards/rejected": 0.024123825132846832, "step": 17 }, { "epoch": 0.021436546334201712, "grad_norm": 3.3822102546691895, "learning_rate": 9.97973778307509e-07, "logits/chosen": -2.1380093097686768, "logits/rejected": -2.147953510284424, "logps/chosen": -75.77896118164062, "logps/rejected": -59.920570373535156, "loss": 0.9918, "rewards/accuracies": 0.5, "rewards/chosen": -0.00562058761715889, "rewards/margins": 0.0329722985625267, "rewards/rejected": -0.03859288990497589, "step": 18 }, { "epoch": 0.022627465574990697, "grad_norm": 3.245479106903076, "learning_rate": 9.978545887961858e-07, "logits/chosen": -2.146111249923706, "logits/rejected": -2.1624724864959717, "logps/chosen": -70.14439392089844, "logps/rejected": -57.989356994628906, "loss": 1.0021, "rewards/accuracies": 0.46875, "rewards/chosen": 0.005433977581560612, "rewards/margins": -0.008410370908677578, "rewards/rejected": 0.013844347558915615, "step": 19 }, { "epoch": 0.02381838481577968, "grad_norm": 2.572532892227173, "learning_rate": 9.977353992848629e-07, "logits/chosen": -2.0595502853393555, "logits/rejected": -2.1493234634399414, "logps/chosen": -68.56172180175781, "logps/rejected": -60.948875427246094, "loss": 0.9969, "rewards/accuracies": 0.625, "rewards/chosen": 0.013733913190662861, "rewards/margins": 0.01241089217364788, "rewards/rejected": 0.0013230210170149803, "step": 20 }, { "epoch": 0.025009304056568663, "grad_norm": 2.8745832443237305, "learning_rate": 9.9761620977354e-07, "logits/chosen": -2.145564556121826, "logits/rejected": -2.18635630607605, "logps/chosen": -73.96602630615234, "logps/rejected": -59.88689422607422, "loss": 0.9956, "rewards/accuracies": 0.59375, "rewards/chosen": 0.014617693610489368, "rewards/margins": 0.017503835260868073, "rewards/rejected": -0.0028861418832093477, "step": 21 }, { "epoch": 0.026200223297357647, "grad_norm": 3.025338649749756, "learning_rate": 9.974970202622168e-07, "logits/chosen": -2.0228800773620605, "logits/rejected": -2.0998806953430176, "logps/chosen": -70.67532348632812, "logps/rejected": -61.890445709228516, "loss": 0.9941, "rewards/accuracies": 0.625, "rewards/chosen": 0.03250552713871002, "rewards/margins": 0.02384599670767784, "rewards/rejected": 0.008659528568387032, "step": 22 }, { "epoch": 0.02739114253814663, "grad_norm": 2.770050048828125, "learning_rate": 9.973778307508939e-07, "logits/chosen": -2.125861644744873, "logits/rejected": -2.2060837745666504, "logps/chosen": -65.18946838378906, "logps/rejected": -60.1985969543457, "loss": 1.0019, "rewards/accuracies": 0.46875, "rewards/chosen": 0.009228968992829323, "rewards/margins": -0.007653176784515381, "rewards/rejected": 0.016882145777344704, "step": 23 }, { "epoch": 0.028582061778935616, "grad_norm": 2.5358779430389404, "learning_rate": 9.972586412395707e-07, "logits/chosen": -2.0910990238189697, "logits/rejected": -2.1015467643737793, "logps/chosen": -70.2918472290039, "logps/rejected": -62.71061325073242, "loss": 0.9974, "rewards/accuracies": 0.5625, "rewards/chosen": 0.016874730587005615, "rewards/margins": 0.010525466874241829, "rewards/rejected": 0.00634926650673151, "step": 24 }, { "epoch": 0.0297729810197246, "grad_norm": 2.545015811920166, "learning_rate": 9.971394517282478e-07, "logits/chosen": -2.0483245849609375, "logits/rejected": -2.115046262741089, "logps/chosen": -61.53189468383789, "logps/rejected": -62.63129806518555, "loss": 1.0043, "rewards/accuracies": 0.40625, "rewards/chosen": 0.01698998175561428, "rewards/margins": -0.017092108726501465, "rewards/rejected": 0.034082088619470596, "step": 25 }, { "epoch": 0.030963900260513585, "grad_norm": 3.0461254119873047, "learning_rate": 9.970202622169249e-07, "logits/chosen": -2.102782964706421, "logits/rejected": -2.1467134952545166, "logps/chosen": -74.34263610839844, "logps/rejected": -64.75270080566406, "loss": 1.0021, "rewards/accuracies": 0.53125, "rewards/chosen": 0.004559122957289219, "rewards/margins": -0.008457290008664131, "rewards/rejected": 0.013016415759921074, "step": 26 }, { "epoch": 0.03215481950130257, "grad_norm": 3.1236517429351807, "learning_rate": 9.96901072705602e-07, "logits/chosen": -2.1444623470306396, "logits/rejected": -2.153841495513916, "logps/chosen": -75.41719818115234, "logps/rejected": -61.428466796875, "loss": 0.9955, "rewards/accuracies": 0.59375, "rewards/chosen": 0.045157622545957565, "rewards/margins": 0.018078196793794632, "rewards/rejected": 0.027079429477453232, "step": 27 }, { "epoch": 0.033345738742091555, "grad_norm": 3.1750268936157227, "learning_rate": 9.967818831942788e-07, "logits/chosen": -2.073230028152466, "logits/rejected": -2.1347484588623047, "logps/chosen": -79.23770141601562, "logps/rejected": -58.034889221191406, "loss": 0.9983, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03362405300140381, "rewards/margins": 0.006712662987411022, "rewards/rejected": 0.026911389082670212, "step": 28 }, { "epoch": 0.03453665798288054, "grad_norm": 2.805912494659424, "learning_rate": 9.96662693682956e-07, "logits/chosen": -2.11061692237854, "logits/rejected": -2.1494195461273193, "logps/chosen": -66.59683990478516, "logps/rejected": -57.80373764038086, "loss": 1.003, "rewards/accuracies": 0.46875, "rewards/chosen": 0.018703389912843704, "rewards/margins": -0.011956297792494297, "rewards/rejected": 0.030659688636660576, "step": 29 }, { "epoch": 0.03572757722366952, "grad_norm": 3.0219945907592773, "learning_rate": 9.965435041716328e-07, "logits/chosen": -2.1472086906433105, "logits/rejected": -2.138707160949707, "logps/chosen": -71.09435272216797, "logps/rejected": -58.785037994384766, "loss": 0.995, "rewards/accuracies": 0.625, "rewards/chosen": 0.03876239061355591, "rewards/margins": 0.019963061437010765, "rewards/rejected": 0.018799329176545143, "step": 30 }, { "epoch": 0.0369184964644585, "grad_norm": 3.121483087539673, "learning_rate": 9.964243146603098e-07, "logits/chosen": -2.1233959197998047, "logits/rejected": -2.207313060760498, "logps/chosen": -72.66891479492188, "logps/rejected": -57.58470916748047, "loss": 0.9947, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03569481149315834, "rewards/margins": 0.02124917507171631, "rewards/rejected": 0.014445639215409756, "step": 31 }, { "epoch": 0.038109415705247486, "grad_norm": 3.183081865310669, "learning_rate": 9.963051251489867e-07, "logits/chosen": -2.061000108718872, "logits/rejected": -2.086076498031616, "logps/chosen": -71.74437713623047, "logps/rejected": -57.249473571777344, "loss": 0.9921, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04521138221025467, "rewards/margins": 0.031943414360284805, "rewards/rejected": 0.013267969712615013, "step": 32 }, { "epoch": 0.03930033494603647, "grad_norm": 2.855787754058838, "learning_rate": 9.961859356376638e-07, "logits/chosen": -2.067350387573242, "logits/rejected": -2.0882670879364014, "logps/chosen": -73.80513000488281, "logps/rejected": -60.819610595703125, "loss": 0.9896, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03863553702831268, "rewards/margins": 0.04181652143597603, "rewards/rejected": -0.003180979285389185, "step": 33 }, { "epoch": 0.040491254186825455, "grad_norm": 3.0384507179260254, "learning_rate": 9.960667461263408e-07, "logits/chosen": -2.1004209518432617, "logits/rejected": -2.163328170776367, "logps/chosen": -75.18993377685547, "logps/rejected": -62.35363006591797, "loss": 0.9988, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06429854780435562, "rewards/margins": 0.0048142061568796635, "rewards/rejected": 0.059484340250492096, "step": 34 }, { "epoch": 0.04168217342761444, "grad_norm": 3.504845380783081, "learning_rate": 9.95947556615018e-07, "logits/chosen": -2.1168060302734375, "logits/rejected": -2.176920175552368, "logps/chosen": -71.2417221069336, "logps/rejected": -59.30324935913086, "loss": 0.9961, "rewards/accuracies": 0.53125, "rewards/chosen": 0.040868308395147324, "rewards/margins": 0.015663383528590202, "rewards/rejected": 0.025204923003911972, "step": 35 }, { "epoch": 0.042873092668403424, "grad_norm": 3.002854108810425, "learning_rate": 9.958283671036948e-07, "logits/chosen": -2.111664056777954, "logits/rejected": -2.2406811714172363, "logps/chosen": -68.69439697265625, "logps/rejected": -61.3931770324707, "loss": 0.9886, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03283758461475372, "rewards/margins": 0.045583367347717285, "rewards/rejected": -0.01274578832089901, "step": 36 }, { "epoch": 0.04406401190919241, "grad_norm": 2.9200422763824463, "learning_rate": 9.957091775923718e-07, "logits/chosen": -2.1143743991851807, "logits/rejected": -2.1632628440856934, "logps/chosen": -69.47038269042969, "logps/rejected": -61.52223587036133, "loss": 0.9915, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0579463467001915, "rewards/margins": 0.033970415592193604, "rewards/rejected": 0.023975932970643044, "step": 37 }, { "epoch": 0.045254931149981394, "grad_norm": 3.1568901538848877, "learning_rate": 9.955899880810487e-07, "logits/chosen": -2.0893168449401855, "logits/rejected": -2.155588388442993, "logps/chosen": -67.1396484375, "logps/rejected": -62.47167205810547, "loss": 0.9885, "rewards/accuracies": 0.59375, "rewards/chosen": 0.058427631855010986, "rewards/margins": 0.04611816257238388, "rewards/rejected": 0.012309467419981956, "step": 38 }, { "epoch": 0.04644585039077038, "grad_norm": 2.9592390060424805, "learning_rate": 9.954707985697258e-07, "logits/chosen": -2.099496364593506, "logits/rejected": -2.165286064147949, "logps/chosen": -73.25782775878906, "logps/rejected": -59.28329086303711, "loss": 0.9863, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07341372966766357, "rewards/margins": 0.054925765842199326, "rewards/rejected": 0.018487967550754547, "step": 39 }, { "epoch": 0.04763676963155936, "grad_norm": 2.618873357772827, "learning_rate": 9.953516090584029e-07, "logits/chosen": -2.0202560424804688, "logits/rejected": -2.083470344543457, "logps/chosen": -68.5243911743164, "logps/rejected": -61.91728210449219, "loss": 0.9976, "rewards/accuracies": 0.5, "rewards/chosen": 0.08767521381378174, "rewards/margins": 0.009407281875610352, "rewards/rejected": 0.07826793938875198, "step": 40 }, { "epoch": 0.04882768887234834, "grad_norm": 3.3390657901763916, "learning_rate": 9.952324195470797e-07, "logits/chosen": -2.0049169063568115, "logits/rejected": -2.107433795928955, "logps/chosen": -71.25166320800781, "logps/rejected": -58.15052795410156, "loss": 0.988, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09065859019756317, "rewards/margins": 0.04814638942480087, "rewards/rejected": 0.0425121933221817, "step": 41 }, { "epoch": 0.050018608113137325, "grad_norm": 2.9964346885681152, "learning_rate": 9.951132300357568e-07, "logits/chosen": -2.06874942779541, "logits/rejected": -2.0768520832061768, "logps/chosen": -70.0673828125, "logps/rejected": -57.602962493896484, "loss": 0.9834, "rewards/accuracies": 0.71875, "rewards/chosen": 0.09599728137254715, "rewards/margins": 0.06674634665250778, "rewards/rejected": 0.029250945895910263, "step": 42 }, { "epoch": 0.05120952735392631, "grad_norm": 2.6889421939849854, "learning_rate": 9.949940405244339e-07, "logits/chosen": -2.0455241203308105, "logits/rejected": -2.1661438941955566, "logps/chosen": -71.58322143554688, "logps/rejected": -59.69987487792969, "loss": 0.9906, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10223518311977386, "rewards/margins": 0.037813615053892136, "rewards/rejected": 0.06442156434059143, "step": 43 }, { "epoch": 0.052400446594715294, "grad_norm": 4.754739284515381, "learning_rate": 9.948748510131107e-07, "logits/chosen": -2.0812788009643555, "logits/rejected": -2.1302907466888428, "logps/chosen": -67.7397689819336, "logps/rejected": -59.486473083496094, "loss": 0.9932, "rewards/accuracies": 0.625, "rewards/chosen": 0.08154942095279694, "rewards/margins": 0.027261365205049515, "rewards/rejected": 0.054288048297166824, "step": 44 }, { "epoch": 0.05359136583550428, "grad_norm": 3.2415924072265625, "learning_rate": 9.947556615017878e-07, "logits/chosen": -2.0488994121551514, "logits/rejected": -2.107581853866577, "logps/chosen": -74.98902130126953, "logps/rejected": -58.81890106201172, "loss": 0.9792, "rewards/accuracies": 0.75, "rewards/chosen": 0.13779376447200775, "rewards/margins": 0.08386645466089249, "rewards/rejected": 0.05392731353640556, "step": 45 }, { "epoch": 0.05478228507629326, "grad_norm": 3.014906644821167, "learning_rate": 9.946364719904647e-07, "logits/chosen": -2.0125226974487305, "logits/rejected": -2.1152451038360596, "logps/chosen": -74.94960021972656, "logps/rejected": -58.83934783935547, "loss": 0.9852, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15040120482444763, "rewards/margins": 0.059716396033763885, "rewards/rejected": 0.09068480879068375, "step": 46 }, { "epoch": 0.05597320431708225, "grad_norm": 2.5049920082092285, "learning_rate": 9.945172824791417e-07, "logits/chosen": -2.056734561920166, "logits/rejected": -2.132366180419922, "logps/chosen": -64.77973937988281, "logps/rejected": -56.898170471191406, "loss": 0.9883, "rewards/accuracies": 0.65625, "rewards/chosen": 0.13146808743476868, "rewards/margins": 0.0472959503531456, "rewards/rejected": 0.08417212963104248, "step": 47 }, { "epoch": 0.05716412355787123, "grad_norm": 3.4279298782348633, "learning_rate": 9.943980929678188e-07, "logits/chosen": -2.117225170135498, "logits/rejected": -2.1347711086273193, "logps/chosen": -71.69500732421875, "logps/rejected": -60.485923767089844, "loss": 0.9917, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10478873550891876, "rewards/margins": 0.03293478488922119, "rewards/rejected": 0.07185395061969757, "step": 48 }, { "epoch": 0.05835504279866022, "grad_norm": 2.3982832431793213, "learning_rate": 9.942789034564959e-07, "logits/chosen": -2.120838165283203, "logits/rejected": -2.1384878158569336, "logps/chosen": -66.99083709716797, "logps/rejected": -62.23066711425781, "loss": 0.9964, "rewards/accuracies": 0.65625, "rewards/chosen": 0.142441987991333, "rewards/margins": 0.014467742294073105, "rewards/rejected": 0.1279742419719696, "step": 49 }, { "epoch": 0.0595459620394492, "grad_norm": 3.038424253463745, "learning_rate": 9.941597139451727e-07, "logits/chosen": -2.107520341873169, "logits/rejected": -2.1690258979797363, "logps/chosen": -73.60063171386719, "logps/rejected": -55.07114791870117, "loss": 0.9766, "rewards/accuracies": 0.75, "rewards/chosen": 0.15072697401046753, "rewards/margins": 0.09442386031150818, "rewards/rejected": 0.05630312114953995, "step": 50 }, { "epoch": 0.060736881280238186, "grad_norm": 3.060072898864746, "learning_rate": 9.940405244338498e-07, "logits/chosen": -2.0844345092773438, "logits/rejected": -2.1335248947143555, "logps/chosen": -71.34362030029297, "logps/rejected": -57.87122344970703, "loss": 0.9773, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14347566664218903, "rewards/margins": 0.09147419780492783, "rewards/rejected": 0.0520014688372612, "step": 51 }, { "epoch": 0.06192780052102717, "grad_norm": 3.0692126750946045, "learning_rate": 9.939213349225267e-07, "logits/chosen": -2.1107821464538574, "logits/rejected": -2.195147752761841, "logps/chosen": -72.3443374633789, "logps/rejected": -59.46559143066406, "loss": 0.977, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1801864057779312, "rewards/margins": 0.0928497463464737, "rewards/rejected": 0.08733666688203812, "step": 52 }, { "epoch": 0.06311871976181616, "grad_norm": 4.043155193328857, "learning_rate": 9.938021454112038e-07, "logits/chosen": -2.050759792327881, "logits/rejected": -2.1228039264678955, "logps/chosen": -74.41062927246094, "logps/rejected": -58.72724533081055, "loss": 0.9816, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18999791145324707, "rewards/margins": 0.07435978949069977, "rewards/rejected": 0.1156381368637085, "step": 53 }, { "epoch": 0.06430963900260514, "grad_norm": 2.980093479156494, "learning_rate": 9.936829558998806e-07, "logits/chosen": -2.080554723739624, "logits/rejected": -2.1593594551086426, "logps/chosen": -68.48040771484375, "logps/rejected": -56.6127815246582, "loss": 0.9776, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14754004776477814, "rewards/margins": 0.09033909440040588, "rewards/rejected": 0.057200949639081955, "step": 54 }, { "epoch": 0.06550055824339412, "grad_norm": 4.751723289489746, "learning_rate": 9.935637663885577e-07, "logits/chosen": -2.0421042442321777, "logits/rejected": -2.1586124897003174, "logps/chosen": -74.30927276611328, "logps/rejected": -57.84846878051758, "loss": 0.9529, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23988866806030273, "rewards/margins": 0.19088631868362427, "rewards/rejected": 0.04900236055254936, "step": 55 }, { "epoch": 0.06669147748418311, "grad_norm": 2.9842798709869385, "learning_rate": 9.934445768772348e-07, "logits/chosen": -2.063551187515259, "logits/rejected": -2.1104307174682617, "logps/chosen": -64.1803970336914, "logps/rejected": -58.75416564941406, "loss": 0.9806, "rewards/accuracies": 0.53125, "rewards/chosen": 0.16593357920646667, "rewards/margins": 0.07797824591398239, "rewards/rejected": 0.08795534819364548, "step": 56 }, { "epoch": 0.0678823967249721, "grad_norm": 2.836536169052124, "learning_rate": 9.933253873659118e-07, "logits/chosen": -2.0899367332458496, "logits/rejected": -2.1251606941223145, "logps/chosen": -66.68295288085938, "logps/rejected": -57.12969207763672, "loss": 0.9787, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20211277902126312, "rewards/margins": 0.0855523943901062, "rewards/rejected": 0.11656039208173752, "step": 57 }, { "epoch": 0.06907331596576108, "grad_norm": 2.7228591442108154, "learning_rate": 9.932061978545887e-07, "logits/chosen": -2.10972261428833, "logits/rejected": -2.1162281036376953, "logps/chosen": -66.32699584960938, "logps/rejected": -63.65371322631836, "loss": 0.9903, "rewards/accuracies": 0.625, "rewards/chosen": 0.13188597559928894, "rewards/margins": 0.039817631244659424, "rewards/rejected": 0.09206835180521011, "step": 58 }, { "epoch": 0.07026423520655005, "grad_norm": 2.930144786834717, "learning_rate": 9.930870083432658e-07, "logits/chosen": -2.0634303092956543, "logits/rejected": -2.1144540309906006, "logps/chosen": -66.58775329589844, "logps/rejected": -55.836795806884766, "loss": 0.9803, "rewards/accuracies": 0.65625, "rewards/chosen": 0.23236124217510223, "rewards/margins": 0.08058193325996399, "rewards/rejected": 0.15177927911281586, "step": 59 }, { "epoch": 0.07145515444733903, "grad_norm": 3.234738826751709, "learning_rate": 9.929678188319426e-07, "logits/chosen": -2.0506489276885986, "logits/rejected": -2.161982774734497, "logps/chosen": -63.076515197753906, "logps/rejected": -51.992530822753906, "loss": 0.9715, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21192666888237, "rewards/margins": 0.11582750082015991, "rewards/rejected": 0.09609916806221008, "step": 60 }, { "epoch": 0.07264607368812802, "grad_norm": 3.180436134338379, "learning_rate": 9.928486293206197e-07, "logits/chosen": -2.020657539367676, "logits/rejected": -2.1030075550079346, "logps/chosen": -65.03704833984375, "logps/rejected": -55.86555480957031, "loss": 0.9698, "rewards/accuracies": 0.75, "rewards/chosen": 0.18103531002998352, "rewards/margins": 0.12208140641450882, "rewards/rejected": 0.058953892439603806, "step": 61 }, { "epoch": 0.073836992928917, "grad_norm": 2.7966575622558594, "learning_rate": 9.927294398092968e-07, "logits/chosen": -2.0753958225250244, "logits/rejected": -2.1148107051849365, "logps/chosen": -65.95004272460938, "logps/rejected": -54.1075553894043, "loss": 0.9695, "rewards/accuracies": 0.78125, "rewards/chosen": 0.23364107310771942, "rewards/margins": 0.12292584031820297, "rewards/rejected": 0.11071522533893585, "step": 62 }, { "epoch": 0.07502791216970599, "grad_norm": 3.2117764949798584, "learning_rate": 9.926102502979736e-07, "logits/chosen": -2.142831563949585, "logits/rejected": -2.110743522644043, "logps/chosen": -64.40410614013672, "logps/rejected": -58.213985443115234, "loss": 0.9679, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22972869873046875, "rewards/margins": 0.13060742616653442, "rewards/rejected": 0.09912126511335373, "step": 63 }, { "epoch": 0.07621883141049497, "grad_norm": 2.8063488006591797, "learning_rate": 9.924910607866507e-07, "logits/chosen": -2.0249500274658203, "logits/rejected": -2.131273031234741, "logps/chosen": -66.79505920410156, "logps/rejected": -60.89450454711914, "loss": 0.9749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.24840286374092102, "rewards/margins": 0.10170003771781921, "rewards/rejected": 0.1467028260231018, "step": 64 }, { "epoch": 0.07740975065128396, "grad_norm": 2.8345541954040527, "learning_rate": 9.923718712753278e-07, "logits/chosen": -2.0970382690429688, "logits/rejected": -2.092100143432617, "logps/chosen": -72.47186279296875, "logps/rejected": -63.38644027709961, "loss": 0.9716, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2797465920448303, "rewards/margins": 0.11646394431591034, "rewards/rejected": 0.16328266263008118, "step": 65 }, { "epoch": 0.07860066989207294, "grad_norm": 3.2718026638031006, "learning_rate": 9.922526817640047e-07, "logits/chosen": -2.0869791507720947, "logits/rejected": -2.091785430908203, "logps/chosen": -68.00900268554688, "logps/rejected": -55.79011154174805, "loss": 0.967, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27257445454597473, "rewards/margins": 0.134619802236557, "rewards/rejected": 0.13795465230941772, "step": 66 }, { "epoch": 0.07979158913286193, "grad_norm": 3.9868814945220947, "learning_rate": 9.921334922526817e-07, "logits/chosen": -2.040935754776001, "logits/rejected": -2.048142671585083, "logps/chosen": -70.20096588134766, "logps/rejected": -58.18809509277344, "loss": 0.9531, "rewards/accuracies": 0.8125, "rewards/chosen": 0.31756556034088135, "rewards/margins": 0.1912028193473816, "rewards/rejected": 0.12636275589466095, "step": 67 }, { "epoch": 0.08098250837365091, "grad_norm": 2.9870283603668213, "learning_rate": 9.920143027413586e-07, "logits/chosen": -2.0563268661499023, "logits/rejected": -2.138254404067993, "logps/chosen": -70.9878158569336, "logps/rejected": -63.29623794555664, "loss": 0.9736, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2687835693359375, "rewards/margins": 0.10817517340183258, "rewards/rejected": 0.16060838103294373, "step": 68 }, { "epoch": 0.0821734276144399, "grad_norm": 2.7701380252838135, "learning_rate": 9.918951132300357e-07, "logits/chosen": -2.129303455352783, "logits/rejected": -2.1426234245300293, "logps/chosen": -71.05921936035156, "logps/rejected": -61.66722106933594, "loss": 0.9741, "rewards/accuracies": 0.71875, "rewards/chosen": 0.28042468428611755, "rewards/margins": 0.10635799169540405, "rewards/rejected": 0.1740666925907135, "step": 69 }, { "epoch": 0.08336434685522888, "grad_norm": 3.0016634464263916, "learning_rate": 9.917759237187127e-07, "logits/chosen": -2.0921010971069336, "logits/rejected": -2.1915218830108643, "logps/chosen": -61.12738037109375, "logps/rejected": -61.70901870727539, "loss": 0.9706, "rewards/accuracies": 0.65625, "rewards/chosen": 0.26646947860717773, "rewards/margins": 0.12036387622356415, "rewards/rejected": 0.1461055725812912, "step": 70 }, { "epoch": 0.08455526609601786, "grad_norm": 4.603982925415039, "learning_rate": 9.916567342073898e-07, "logits/chosen": -2.1963024139404297, "logits/rejected": -2.234809160232544, "logps/chosen": -70.74130249023438, "logps/rejected": -62.42111587524414, "loss": 0.9665, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3218453526496887, "rewards/margins": 0.13687904179096222, "rewards/rejected": 0.1849662810564041, "step": 71 }, { "epoch": 0.08574618533680685, "grad_norm": 3.906040906906128, "learning_rate": 9.915375446960667e-07, "logits/chosen": -2.1280651092529297, "logits/rejected": -2.1452083587646484, "logps/chosen": -75.50003814697266, "logps/rejected": -58.349056243896484, "loss": 0.9533, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40310168266296387, "rewards/margins": 0.19462570548057556, "rewards/rejected": 0.2084759920835495, "step": 72 }, { "epoch": 0.08693710457759583, "grad_norm": 2.8063759803771973, "learning_rate": 9.914183551847437e-07, "logits/chosen": -2.088521957397461, "logits/rejected": -2.159076690673828, "logps/chosen": -65.07228088378906, "logps/rejected": -59.42371368408203, "loss": 0.9746, "rewards/accuracies": 0.625, "rewards/chosen": 0.3038679361343384, "rewards/margins": 0.10464370250701904, "rewards/rejected": 0.19922423362731934, "step": 73 }, { "epoch": 0.08812802381838482, "grad_norm": 2.9668681621551514, "learning_rate": 9.912991656734206e-07, "logits/chosen": -2.0250301361083984, "logits/rejected": -2.067556142807007, "logps/chosen": -72.21542358398438, "logps/rejected": -54.5234260559082, "loss": 0.9525, "rewards/accuracies": 0.75, "rewards/chosen": 0.3641531467437744, "rewards/margins": 0.19696572422981262, "rewards/rejected": 0.1671874225139618, "step": 74 }, { "epoch": 0.0893189430591738, "grad_norm": 2.787719249725342, "learning_rate": 9.911799761620977e-07, "logits/chosen": -2.0815091133117676, "logits/rejected": -2.099273204803467, "logps/chosen": -64.40299987792969, "logps/rejected": -54.32618713378906, "loss": 0.9686, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3053808808326721, "rewards/margins": 0.13059090077877045, "rewards/rejected": 0.17479000985622406, "step": 75 }, { "epoch": 0.09050986229996279, "grad_norm": 3.185534954071045, "learning_rate": 9.910607866507745e-07, "logits/chosen": -2.1047890186309814, "logits/rejected": -2.17598295211792, "logps/chosen": -69.4736099243164, "logps/rejected": -58.02397537231445, "loss": 0.9489, "rewards/accuracies": 0.75, "rewards/chosen": 0.3870241045951843, "rewards/margins": 0.2109554260969162, "rewards/rejected": 0.17606867849826813, "step": 76 }, { "epoch": 0.09170078154075177, "grad_norm": 2.9413671493530273, "learning_rate": 9.909415971394516e-07, "logits/chosen": -2.0488955974578857, "logits/rejected": -2.077265739440918, "logps/chosen": -66.46485137939453, "logps/rejected": -57.42985534667969, "loss": 0.9603, "rewards/accuracies": 0.6875, "rewards/chosen": 0.40315741300582886, "rewards/margins": 0.16546973586082458, "rewards/rejected": 0.23768764734268188, "step": 77 }, { "epoch": 0.09289170078154076, "grad_norm": 3.1215457916259766, "learning_rate": 9.908224076281287e-07, "logits/chosen": -2.122507333755493, "logits/rejected": -2.1114492416381836, "logps/chosen": -66.53837585449219, "logps/rejected": -56.33354568481445, "loss": 0.9512, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4044770300388336, "rewards/margins": 0.2071419656276703, "rewards/rejected": 0.19733506441116333, "step": 78 }, { "epoch": 0.09408262002232974, "grad_norm": 2.999993085861206, "learning_rate": 9.907032181168058e-07, "logits/chosen": -2.0921084880828857, "logits/rejected": -2.18192720413208, "logps/chosen": -63.8614501953125, "logps/rejected": -58.05756378173828, "loss": 0.9503, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3784988820552826, "rewards/margins": 0.20832213759422302, "rewards/rejected": 0.17017678916454315, "step": 79 }, { "epoch": 0.09527353926311873, "grad_norm": 52.12335968017578, "learning_rate": 9.905840286054826e-07, "logits/chosen": -2.0765879154205322, "logits/rejected": -2.1220850944519043, "logps/chosen": -63.42406463623047, "logps/rejected": -57.7418212890625, "loss": 0.9803, "rewards/accuracies": 0.625, "rewards/chosen": 0.3155377507209778, "rewards/margins": 0.08300646394491196, "rewards/rejected": 0.23253126442432404, "step": 80 }, { "epoch": 0.09646445850390771, "grad_norm": 3.3584983348846436, "learning_rate": 9.904648390941597e-07, "logits/chosen": -2.039642572402954, "logits/rejected": -2.072404146194458, "logps/chosen": -69.92835998535156, "logps/rejected": -52.94126892089844, "loss": 0.9161, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5230180621147156, "rewards/margins": 0.3522893190383911, "rewards/rejected": 0.17072872817516327, "step": 81 }, { "epoch": 0.09765537774469668, "grad_norm": 2.520843505859375, "learning_rate": 9.903456495828366e-07, "logits/chosen": -2.022801399230957, "logits/rejected": -2.0865023136138916, "logps/chosen": -63.474273681640625, "logps/rejected": -60.458656311035156, "loss": 0.9729, "rewards/accuracies": 0.53125, "rewards/chosen": 0.35608986020088196, "rewards/margins": 0.11386869102716446, "rewards/rejected": 0.2422211766242981, "step": 82 }, { "epoch": 0.09884629698548567, "grad_norm": 3.0908007621765137, "learning_rate": 9.902264600715136e-07, "logits/chosen": -2.044825553894043, "logits/rejected": -2.0719966888427734, "logps/chosen": -64.24059295654297, "logps/rejected": -55.63276290893555, "loss": 0.9375, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39238834381103516, "rewards/margins": 0.2580901086330414, "rewards/rejected": 0.1342982053756714, "step": 83 }, { "epoch": 0.10003721622627465, "grad_norm": 2.959847927093506, "learning_rate": 9.901072705601907e-07, "logits/chosen": -2.042581081390381, "logits/rejected": -2.093832492828369, "logps/chosen": -67.2495346069336, "logps/rejected": -54.32720184326172, "loss": 0.9304, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47203338146209717, "rewards/margins": 0.29207220673561096, "rewards/rejected": 0.17996113002300262, "step": 84 }, { "epoch": 0.10122813546706363, "grad_norm": 3.0653176307678223, "learning_rate": 9.899880810488676e-07, "logits/chosen": -2.0208475589752197, "logits/rejected": -2.0858027935028076, "logps/chosen": -68.57534790039062, "logps/rejected": -57.39752197265625, "loss": 0.9578, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3875155746936798, "rewards/margins": 0.1751491129398346, "rewards/rejected": 0.2123664915561676, "step": 85 }, { "epoch": 0.10241905470785262, "grad_norm": 3.071047306060791, "learning_rate": 9.898688915375446e-07, "logits/chosen": -2.089477062225342, "logits/rejected": -2.2121410369873047, "logps/chosen": -62.38296890258789, "logps/rejected": -56.31248092651367, "loss": 0.938, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3790595531463623, "rewards/margins": 0.2582724690437317, "rewards/rejected": 0.12078707665205002, "step": 86 }, { "epoch": 0.1036099739486416, "grad_norm": 3.0807950496673584, "learning_rate": 9.897497020262217e-07, "logits/chosen": -2.003408670425415, "logits/rejected": -2.072655200958252, "logps/chosen": -67.64408874511719, "logps/rejected": -54.306556701660156, "loss": 0.9348, "rewards/accuracies": 0.78125, "rewards/chosen": 0.43231478333473206, "rewards/margins": 0.2727700173854828, "rewards/rejected": 0.15954478085041046, "step": 87 }, { "epoch": 0.10480089318943059, "grad_norm": 3.5513975620269775, "learning_rate": 9.896305125148986e-07, "logits/chosen": -2.086155652999878, "logits/rejected": -2.172215700149536, "logps/chosen": -73.8574447631836, "logps/rejected": -60.61324691772461, "loss": 0.9182, "rewards/accuracies": 0.75, "rewards/chosen": 0.5365055203437805, "rewards/margins": 0.34487563371658325, "rewards/rejected": 0.19162993133068085, "step": 88 }, { "epoch": 0.10599181243021957, "grad_norm": 3.1048481464385986, "learning_rate": 9.895113230035756e-07, "logits/chosen": -2.0252766609191895, "logits/rejected": -2.104962110519409, "logps/chosen": -66.25935363769531, "logps/rejected": -58.18647384643555, "loss": 0.9509, "rewards/accuracies": 0.71875, "rewards/chosen": 0.461963415145874, "rewards/margins": 0.20703738927841187, "rewards/rejected": 0.25492602586746216, "step": 89 }, { "epoch": 0.10718273167100856, "grad_norm": 2.8967394828796387, "learning_rate": 9.893921334922525e-07, "logits/chosen": -1.9827775955200195, "logits/rejected": -2.0354697704315186, "logps/chosen": -62.99433135986328, "logps/rejected": -55.6053352355957, "loss": 0.9273, "rewards/accuracies": 0.84375, "rewards/chosen": 0.539474368095398, "rewards/margins": 0.3060680031776428, "rewards/rejected": 0.23340635001659393, "step": 90 }, { "epoch": 0.10837365091179754, "grad_norm": 3.1400201320648193, "learning_rate": 9.892729439809296e-07, "logits/chosen": -2.0274133682250977, "logits/rejected": -2.0879132747650146, "logps/chosen": -68.17367553710938, "logps/rejected": -58.50785446166992, "loss": 0.9206, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5230134129524231, "rewards/margins": 0.33339396119117737, "rewards/rejected": 0.18961945176124573, "step": 91 }, { "epoch": 0.10956457015258653, "grad_norm": 2.8190062046051025, "learning_rate": 9.891537544696067e-07, "logits/chosen": -2.036652088165283, "logits/rejected": -2.0356414318084717, "logps/chosen": -69.98944091796875, "logps/rejected": -60.47941589355469, "loss": 0.9387, "rewards/accuracies": 0.65625, "rewards/chosen": 0.498754620552063, "rewards/margins": 0.26256489753723145, "rewards/rejected": 0.23618975281715393, "step": 92 }, { "epoch": 0.11075548939337551, "grad_norm": 3.6574721336364746, "learning_rate": 9.890345649582837e-07, "logits/chosen": -1.9980363845825195, "logits/rejected": -2.0426275730133057, "logps/chosen": -67.35034942626953, "logps/rejected": -58.67851257324219, "loss": 0.9115, "rewards/accuracies": 0.75, "rewards/chosen": 0.503044843673706, "rewards/margins": 0.3705731928348541, "rewards/rejected": 0.13247168064117432, "step": 93 }, { "epoch": 0.1119464086341645, "grad_norm": 2.931793451309204, "learning_rate": 9.889153754469606e-07, "logits/chosen": -1.9867925643920898, "logits/rejected": -2.056858777999878, "logps/chosen": -65.62866973876953, "logps/rejected": -58.654266357421875, "loss": 0.9266, "rewards/accuracies": 0.75, "rewards/chosen": 0.4904864430427551, "rewards/margins": 0.31099510192871094, "rewards/rejected": 0.1794913411140442, "step": 94 }, { "epoch": 0.11313732787495348, "grad_norm": 3.1559643745422363, "learning_rate": 9.887961859356377e-07, "logits/chosen": -1.9927308559417725, "logits/rejected": -2.0795202255249023, "logps/chosen": -68.66719055175781, "logps/rejected": -60.747459411621094, "loss": 0.9182, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5855157375335693, "rewards/margins": 0.350320041179657, "rewards/rejected": 0.23519566655158997, "step": 95 }, { "epoch": 0.11432824711574247, "grad_norm": 3.7782695293426514, "learning_rate": 9.886769964243145e-07, "logits/chosen": -2.0165109634399414, "logits/rejected": -2.1176016330718994, "logps/chosen": -61.02204513549805, "logps/rejected": -55.63885498046875, "loss": 0.8987, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5333103537559509, "rewards/margins": 0.4294084906578064, "rewards/rejected": 0.10390186309814453, "step": 96 }, { "epoch": 0.11551916635653145, "grad_norm": 2.922933340072632, "learning_rate": 9.885578069129916e-07, "logits/chosen": -2.0734031200408936, "logits/rejected": -2.065685272216797, "logps/chosen": -67.10218811035156, "logps/rejected": -60.328041076660156, "loss": 0.9407, "rewards/accuracies": 0.6875, "rewards/chosen": 0.42156982421875, "rewards/margins": 0.2487458884716034, "rewards/rejected": 0.17282389104366302, "step": 97 }, { "epoch": 0.11671008559732043, "grad_norm": 2.663207530975342, "learning_rate": 9.884386174016685e-07, "logits/chosen": -2.0420186519622803, "logits/rejected": -2.062703847885132, "logps/chosen": -62.44654846191406, "logps/rejected": -62.31230926513672, "loss": 0.951, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41855213046073914, "rewards/margins": 0.2080845981836319, "rewards/rejected": 0.21046754717826843, "step": 98 }, { "epoch": 0.11790100483810942, "grad_norm": 3.526052713394165, "learning_rate": 9.883194278903455e-07, "logits/chosen": -2.031085968017578, "logits/rejected": -2.0408051013946533, "logps/chosen": -64.61476135253906, "logps/rejected": -58.5986328125, "loss": 0.9179, "rewards/accuracies": 0.75, "rewards/chosen": 0.5009661316871643, "rewards/margins": 0.3515300452709198, "rewards/rejected": 0.1494361162185669, "step": 99 }, { "epoch": 0.1190919240788984, "grad_norm": 3.4608068466186523, "learning_rate": 9.882002383790226e-07, "logits/chosen": -1.9809231758117676, "logits/rejected": -2.0780935287475586, "logps/chosen": -66.32308959960938, "logps/rejected": -60.57201385498047, "loss": 0.8865, "rewards/accuracies": 0.8125, "rewards/chosen": 0.48552268743515015, "rewards/margins": 0.4797893464565277, "rewards/rejected": 0.005733308382332325, "step": 100 }, { "epoch": 0.12028284331968739, "grad_norm": 3.0466952323913574, "learning_rate": 9.880810488676997e-07, "logits/chosen": -2.050908088684082, "logits/rejected": -2.0837204456329346, "logps/chosen": -62.83184814453125, "logps/rejected": -56.19243621826172, "loss": 0.911, "rewards/accuracies": 0.8125, "rewards/chosen": 0.43155378103256226, "rewards/margins": 0.3806958794593811, "rewards/rejected": 0.05085792392492294, "step": 101 }, { "epoch": 0.12147376256047637, "grad_norm": 3.3948607444763184, "learning_rate": 9.879618593563765e-07, "logits/chosen": -2.0714919567108154, "logits/rejected": -2.1270735263824463, "logps/chosen": -69.11546325683594, "logps/rejected": -59.758323669433594, "loss": 0.8966, "rewards/accuracies": 0.84375, "rewards/chosen": 0.44262391328811646, "rewards/margins": 0.43057817220687866, "rewards/rejected": 0.012045752257108688, "step": 102 }, { "epoch": 0.12266468180126536, "grad_norm": 3.6139605045318604, "learning_rate": 9.878426698450536e-07, "logits/chosen": -1.9971951246261597, "logits/rejected": -2.0674073696136475, "logps/chosen": -69.50723266601562, "logps/rejected": -60.062564849853516, "loss": 0.9095, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5241125822067261, "rewards/margins": 0.38886532187461853, "rewards/rejected": 0.13524730503559113, "step": 103 }, { "epoch": 0.12385560104205434, "grad_norm": 3.555863857269287, "learning_rate": 9.877234803337305e-07, "logits/chosen": -2.0723624229431152, "logits/rejected": -2.1624836921691895, "logps/chosen": -64.61151123046875, "logps/rejected": -56.062950134277344, "loss": 0.8966, "rewards/accuracies": 0.71875, "rewards/chosen": 0.36502712965011597, "rewards/margins": 0.43896782398223877, "rewards/rejected": -0.073940709233284, "step": 104 }, { "epoch": 0.12504652028284333, "grad_norm": 3.185349225997925, "learning_rate": 9.876042908224076e-07, "logits/chosen": -2.0002951622009277, "logits/rejected": -2.0542399883270264, "logps/chosen": -67.17625427246094, "logps/rejected": -60.09967041015625, "loss": 0.9159, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4498951733112335, "rewards/margins": 0.3614354133605957, "rewards/rejected": 0.08845976740121841, "step": 105 }, { "epoch": 0.1262374395236323, "grad_norm": 3.289133071899414, "learning_rate": 9.874851013110846e-07, "logits/chosen": -2.068037509918213, "logits/rejected": -2.1124651432037354, "logps/chosen": -68.92134857177734, "logps/rejected": -56.182621002197266, "loss": 0.897, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4859602451324463, "rewards/margins": 0.44136863946914673, "rewards/rejected": 0.044591665267944336, "step": 106 }, { "epoch": 0.1274283587644213, "grad_norm": 3.0434458255767822, "learning_rate": 9.873659117997615e-07, "logits/chosen": -2.0234429836273193, "logits/rejected": -2.090620517730713, "logps/chosen": -62.72702407836914, "logps/rejected": -60.054988861083984, "loss": 0.9558, "rewards/accuracies": 0.5625, "rewards/chosen": 0.22568944096565247, "rewards/margins": 0.19353830814361572, "rewards/rejected": 0.03215114027261734, "step": 107 }, { "epoch": 0.12861927800521028, "grad_norm": 3.211669683456421, "learning_rate": 9.872467222884386e-07, "logits/chosen": -2.076704740524292, "logits/rejected": -2.143450975418091, "logps/chosen": -66.25326538085938, "logps/rejected": -66.40040588378906, "loss": 0.961, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3214268386363983, "rewards/margins": 0.161854088306427, "rewards/rejected": 0.15957272052764893, "step": 108 }, { "epoch": 0.12981019724599926, "grad_norm": 3.0798308849334717, "learning_rate": 9.871275327771156e-07, "logits/chosen": -2.021026611328125, "logits/rejected": -2.0421364307403564, "logps/chosen": -62.147430419921875, "logps/rejected": -55.358699798583984, "loss": 0.9236, "rewards/accuracies": 0.71875, "rewards/chosen": 0.41467007994651794, "rewards/margins": 0.3208097815513611, "rewards/rejected": 0.09386028349399567, "step": 109 }, { "epoch": 0.13100111648678825, "grad_norm": 3.6892588138580322, "learning_rate": 9.870083432657925e-07, "logits/chosen": -2.0522563457489014, "logits/rejected": -2.1019034385681152, "logps/chosen": -66.99970245361328, "logps/rejected": -57.83711242675781, "loss": 0.8561, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5312105417251587, "rewards/margins": 0.6201417446136475, "rewards/rejected": -0.0889311209321022, "step": 110 }, { "epoch": 0.13219203572757723, "grad_norm": 3.2412502765655518, "learning_rate": 9.868891537544696e-07, "logits/chosen": -2.064802408218384, "logits/rejected": -2.1003055572509766, "logps/chosen": -67.1126937866211, "logps/rejected": -62.281768798828125, "loss": 0.9046, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5407306551933289, "rewards/margins": 0.4139191210269928, "rewards/rejected": 0.12681150436401367, "step": 111 }, { "epoch": 0.13338295496836622, "grad_norm": 2.9571566581726074, "learning_rate": 9.867699642431464e-07, "logits/chosen": -2.0430619716644287, "logits/rejected": -2.1045918464660645, "logps/chosen": -68.93502044677734, "logps/rejected": -62.133155822753906, "loss": 0.9036, "rewards/accuracies": 0.71875, "rewards/chosen": 0.42951077222824097, "rewards/margins": 0.4142611622810364, "rewards/rejected": 0.015249650925397873, "step": 112 }, { "epoch": 0.1345738742091552, "grad_norm": 3.6695497035980225, "learning_rate": 9.866507747318235e-07, "logits/chosen": -2.0360448360443115, "logits/rejected": -2.0994348526000977, "logps/chosen": -59.1708984375, "logps/rejected": -66.81745147705078, "loss": 0.8858, "rewards/accuracies": 0.6875, "rewards/chosen": 0.273764967918396, "rewards/margins": 0.49686652421951294, "rewards/rejected": -0.22310154139995575, "step": 113 }, { "epoch": 0.1357647934499442, "grad_norm": 3.300729990005493, "learning_rate": 9.865315852205006e-07, "logits/chosen": -2.0182547569274902, "logits/rejected": -2.120955467224121, "logps/chosen": -65.45956420898438, "logps/rejected": -66.12915802001953, "loss": 0.9523, "rewards/accuracies": 0.5625, "rewards/chosen": 0.367256224155426, "rewards/margins": 0.21935436129570007, "rewards/rejected": 0.14790186285972595, "step": 114 }, { "epoch": 0.13695571269073317, "grad_norm": 3.08241868019104, "learning_rate": 9.864123957091777e-07, "logits/chosen": -2.0577304363250732, "logits/rejected": -2.0931715965270996, "logps/chosen": -67.42969512939453, "logps/rejected": -61.27907943725586, "loss": 0.9055, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3365929424762726, "rewards/margins": 0.41082486510276794, "rewards/rejected": -0.07423193752765656, "step": 115 }, { "epoch": 0.13814663193152216, "grad_norm": 3.556131601333618, "learning_rate": 9.862932061978545e-07, "logits/chosen": -1.9860936403274536, "logits/rejected": -2.0718016624450684, "logps/chosen": -69.31189727783203, "logps/rejected": -58.25116729736328, "loss": 0.8609, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5405215620994568, "rewards/margins": 0.6131572723388672, "rewards/rejected": -0.07263574004173279, "step": 116 }, { "epoch": 0.13933755117231114, "grad_norm": 4.644652843475342, "learning_rate": 9.861740166865316e-07, "logits/chosen": -2.063572406768799, "logits/rejected": -2.1479711532592773, "logps/chosen": -66.28840637207031, "logps/rejected": -56.62774658203125, "loss": 0.8885, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5419347286224365, "rewards/margins": 0.5028614401817322, "rewards/rejected": 0.03907326981425285, "step": 117 }, { "epoch": 0.1405284704131001, "grad_norm": 3.6094913482666016, "learning_rate": 9.860548271752085e-07, "logits/chosen": -2.1102583408355713, "logits/rejected": -2.179460048675537, "logps/chosen": -67.51112365722656, "logps/rejected": -57.60792541503906, "loss": 0.8211, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6051231622695923, "rewards/margins": 0.7883574366569519, "rewards/rejected": -0.18323424458503723, "step": 118 }, { "epoch": 0.14171938965388908, "grad_norm": 3.0819077491760254, "learning_rate": 9.859356376638855e-07, "logits/chosen": -2.107353448867798, "logits/rejected": -2.1378657817840576, "logps/chosen": -63.070960998535156, "logps/rejected": -65.02497100830078, "loss": 0.94, "rewards/accuracies": 0.59375, "rewards/chosen": 0.29511862993240356, "rewards/margins": 0.26224610209465027, "rewards/rejected": 0.0328725203871727, "step": 119 }, { "epoch": 0.14291030889467807, "grad_norm": 3.4685652256011963, "learning_rate": 9.858164481525624e-07, "logits/chosen": -2.0386221408843994, "logits/rejected": -2.0831265449523926, "logps/chosen": -67.67759704589844, "logps/rejected": -65.43622589111328, "loss": 0.9425, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4068605303764343, "rewards/margins": 0.25570645928382874, "rewards/rejected": 0.15115408599376678, "step": 120 }, { "epoch": 0.14410122813546705, "grad_norm": 3.452723741531372, "learning_rate": 9.856972586412395e-07, "logits/chosen": -2.00689959526062, "logits/rejected": -2.165332317352295, "logps/chosen": -66.68095397949219, "logps/rejected": -57.52690124511719, "loss": 0.8219, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5243154764175415, "rewards/margins": 0.8085456490516663, "rewards/rejected": -0.28423023223876953, "step": 121 }, { "epoch": 0.14529214737625604, "grad_norm": 3.7984554767608643, "learning_rate": 9.855780691299165e-07, "logits/chosen": -2.0638980865478516, "logits/rejected": -2.1304283142089844, "logps/chosen": -65.26104736328125, "logps/rejected": -59.27961349487305, "loss": 0.8377, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5600318908691406, "rewards/margins": 0.714052677154541, "rewards/rejected": -0.15402080118656158, "step": 122 }, { "epoch": 0.14648306661704502, "grad_norm": 4.091915607452393, "learning_rate": 9.854588796185936e-07, "logits/chosen": -2.0776708126068115, "logits/rejected": -2.168989896774292, "logps/chosen": -69.57801818847656, "logps/rejected": -64.0331802368164, "loss": 0.8603, "rewards/accuracies": 0.78125, "rewards/chosen": 0.41675639152526855, "rewards/margins": 0.6307723522186279, "rewards/rejected": -0.2140159010887146, "step": 123 }, { "epoch": 0.147673985857834, "grad_norm": 3.298560619354248, "learning_rate": 9.853396901072705e-07, "logits/chosen": -2.031759023666382, "logits/rejected": -2.085998058319092, "logps/chosen": -69.09909057617188, "logps/rejected": -66.25604248046875, "loss": 0.8846, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6177868843078613, "rewards/margins": 0.5053398013114929, "rewards/rejected": 0.112447090446949, "step": 124 }, { "epoch": 0.148864905098623, "grad_norm": 3.570509910583496, "learning_rate": 9.852205005959475e-07, "logits/chosen": -2.037961006164551, "logits/rejected": -2.171517848968506, "logps/chosen": -64.94511413574219, "logps/rejected": -62.96685028076172, "loss": 0.8239, "rewards/accuracies": 0.875, "rewards/chosen": 0.36216190457344055, "rewards/margins": 0.7894969582557678, "rewards/rejected": -0.42733505368232727, "step": 125 }, { "epoch": 0.15005582433941198, "grad_norm": 3.089000701904297, "learning_rate": 9.851013110846244e-07, "logits/chosen": -2.021090507507324, "logits/rejected": -2.089552640914917, "logps/chosen": -65.77288818359375, "logps/rejected": -63.96531677246094, "loss": 0.8903, "rewards/accuracies": 0.625, "rewards/chosen": 0.3450705111026764, "rewards/margins": 0.5099721550941467, "rewards/rejected": -0.16490164399147034, "step": 126 }, { "epoch": 0.15124674358020096, "grad_norm": 3.66096568107605, "learning_rate": 9.849821215733015e-07, "logits/chosen": -2.0038154125213623, "logits/rejected": -2.1206414699554443, "logps/chosen": -71.77007293701172, "logps/rejected": -60.042911529541016, "loss": 0.8221, "rewards/accuracies": 0.84375, "rewards/chosen": 0.43710845708847046, "rewards/margins": 0.8071966767311096, "rewards/rejected": -0.37008827924728394, "step": 127 }, { "epoch": 0.15243766282098994, "grad_norm": 3.8460001945495605, "learning_rate": 9.848629320619786e-07, "logits/chosen": -2.030214309692383, "logits/rejected": -2.113687038421631, "logps/chosen": -67.85010528564453, "logps/rejected": -65.68498229980469, "loss": 0.7852, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5813971161842346, "rewards/margins": 0.9723818898200989, "rewards/rejected": -0.39098477363586426, "step": 128 }, { "epoch": 0.15362858206177893, "grad_norm": 4.174023628234863, "learning_rate": 9.847437425506554e-07, "logits/chosen": -1.9727104902267456, "logits/rejected": -2.0934336185455322, "logps/chosen": -67.58734893798828, "logps/rejected": -61.66020965576172, "loss": 0.787, "rewards/accuracies": 0.875, "rewards/chosen": 0.5289995074272156, "rewards/margins": 0.942211925983429, "rewards/rejected": -0.41321247816085815, "step": 129 }, { "epoch": 0.1548195013025679, "grad_norm": 3.7196693420410156, "learning_rate": 9.846245530393325e-07, "logits/chosen": -1.9798918962478638, "logits/rejected": -2.075251817703247, "logps/chosen": -68.61293029785156, "logps/rejected": -59.76530456542969, "loss": 0.8118, "rewards/accuracies": 0.75, "rewards/chosen": 0.3688432276248932, "rewards/margins": 0.8380426168441772, "rewards/rejected": -0.46919941902160645, "step": 130 }, { "epoch": 0.1560104205433569, "grad_norm": 3.301661252975464, "learning_rate": 9.845053635280096e-07, "logits/chosen": -2.087707996368408, "logits/rejected": -2.136826992034912, "logps/chosen": -69.33439636230469, "logps/rejected": -62.48397445678711, "loss": 0.8055, "rewards/accuracies": 0.75, "rewards/chosen": 0.44689908623695374, "rewards/margins": 0.9314046502113342, "rewards/rejected": -0.48450565338134766, "step": 131 }, { "epoch": 0.15720133978414588, "grad_norm": 4.15054988861084, "learning_rate": 9.843861740166864e-07, "logits/chosen": -2.04575514793396, "logits/rejected": -2.115335464477539, "logps/chosen": -69.427978515625, "logps/rejected": -62.618934631347656, "loss": 0.7869, "rewards/accuracies": 0.75, "rewards/chosen": 0.31083136796951294, "rewards/margins": 0.9618932008743286, "rewards/rejected": -0.6510618329048157, "step": 132 }, { "epoch": 0.15839225902493487, "grad_norm": 3.8120007514953613, "learning_rate": 9.842669845053635e-07, "logits/chosen": -2.066240072250366, "logits/rejected": -2.100522041320801, "logps/chosen": -66.57635498046875, "logps/rejected": -62.58241271972656, "loss": 0.8237, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3306209444999695, "rewards/margins": 0.8269537687301636, "rewards/rejected": -0.4963328242301941, "step": 133 }, { "epoch": 0.15958317826572385, "grad_norm": 4.075441837310791, "learning_rate": 9.841477949940404e-07, "logits/chosen": -1.9729503393173218, "logits/rejected": -2.0969998836517334, "logps/chosen": -66.75556182861328, "logps/rejected": -61.849098205566406, "loss": 0.7848, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5699818134307861, "rewards/margins": 1.007562279701233, "rewards/rejected": -0.4375804364681244, "step": 134 }, { "epoch": 0.16077409750651284, "grad_norm": 3.828011989593506, "learning_rate": 9.840286054827174e-07, "logits/chosen": -2.061943292617798, "logits/rejected": -2.194150686264038, "logps/chosen": -70.40845489501953, "logps/rejected": -65.12775421142578, "loss": 0.7636, "rewards/accuracies": 0.875, "rewards/chosen": 0.45086920261383057, "rewards/margins": 1.08955979347229, "rewards/rejected": -0.6386905908584595, "step": 135 }, { "epoch": 0.16196501674730182, "grad_norm": 4.294039726257324, "learning_rate": 9.839094159713945e-07, "logits/chosen": -2.0720303058624268, "logits/rejected": -2.2079038619995117, "logps/chosen": -72.89014434814453, "logps/rejected": -63.324222564697266, "loss": 0.7296, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6203330159187317, "rewards/margins": 1.2339322566986084, "rewards/rejected": -0.6135989427566528, "step": 136 }, { "epoch": 0.1631559359880908, "grad_norm": 3.6656291484832764, "learning_rate": 9.837902264600716e-07, "logits/chosen": -2.0360629558563232, "logits/rejected": -2.1413474082946777, "logps/chosen": -64.83580017089844, "logps/rejected": -64.41714477539062, "loss": 0.8816, "rewards/accuracies": 0.625, "rewards/chosen": 0.04554498940706253, "rewards/margins": 0.5655654668807983, "rewards/rejected": -0.5200205445289612, "step": 137 }, { "epoch": 0.1643468552288798, "grad_norm": 3.508998394012451, "learning_rate": 9.836710369487484e-07, "logits/chosen": -2.0303940773010254, "logits/rejected": -2.1780500411987305, "logps/chosen": -69.0756607055664, "logps/rejected": -62.3876838684082, "loss": 0.8292, "rewards/accuracies": 0.84375, "rewards/chosen": 0.34269410371780396, "rewards/margins": 0.8467454314231873, "rewards/rejected": -0.5040513277053833, "step": 138 }, { "epoch": 0.16553777446966877, "grad_norm": 3.8833138942718506, "learning_rate": 9.835518474374255e-07, "logits/chosen": -2.0725913047790527, "logits/rejected": -2.1661136150360107, "logps/chosen": -68.47532653808594, "logps/rejected": -66.46410369873047, "loss": 0.8047, "rewards/accuracies": 0.75, "rewards/chosen": 0.5259618163108826, "rewards/margins": 0.9880290031433105, "rewards/rejected": -0.4620671570301056, "step": 139 }, { "epoch": 0.16672869371045776, "grad_norm": 4.56343936920166, "learning_rate": 9.834326579261024e-07, "logits/chosen": -2.072979688644409, "logits/rejected": -2.2064640522003174, "logps/chosen": -71.22791290283203, "logps/rejected": -69.96098327636719, "loss": 0.6709, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5005596280097961, "rewards/margins": 1.611066222190857, "rewards/rejected": -1.1105066537857056, "step": 140 }, { "epoch": 0.16791961295124674, "grad_norm": 3.610990524291992, "learning_rate": 9.833134684147795e-07, "logits/chosen": -2.0660574436187744, "logits/rejected": -2.1572649478912354, "logps/chosen": -72.41468811035156, "logps/rejected": -66.92437744140625, "loss": 0.8593, "rewards/accuracies": 0.75, "rewards/chosen": 0.2994958460330963, "rewards/margins": 0.7047387361526489, "rewards/rejected": -0.4052428603172302, "step": 141 }, { "epoch": 0.16911053219203573, "grad_norm": 3.359017848968506, "learning_rate": 9.831942789034563e-07, "logits/chosen": -2.070753574371338, "logits/rejected": -2.138495445251465, "logps/chosen": -73.66520690917969, "logps/rejected": -72.62770080566406, "loss": 0.8211, "rewards/accuracies": 0.75, "rewards/chosen": 0.12776267528533936, "rewards/margins": 0.8522368669509888, "rewards/rejected": -0.7244741916656494, "step": 142 }, { "epoch": 0.1703014514328247, "grad_norm": 3.7041354179382324, "learning_rate": 9.830750893921334e-07, "logits/chosen": -2.0750021934509277, "logits/rejected": -2.2418642044067383, "logps/chosen": -74.01957702636719, "logps/rejected": -67.53632354736328, "loss": 0.7703, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5002488493919373, "rewards/margins": 1.1277104616165161, "rewards/rejected": -0.6274614334106445, "step": 143 }, { "epoch": 0.1714923706736137, "grad_norm": 3.9019064903259277, "learning_rate": 9.829558998808105e-07, "logits/chosen": -2.041166305541992, "logits/rejected": -2.139305591583252, "logps/chosen": -65.28072357177734, "logps/rejected": -71.2506332397461, "loss": 0.8593, "rewards/accuracies": 0.6875, "rewards/chosen": 0.057499974966049194, "rewards/margins": 0.7772101163864136, "rewards/rejected": -0.7197101712226868, "step": 144 }, { "epoch": 0.17268328991440268, "grad_norm": 3.8554937839508057, "learning_rate": 9.828367103694875e-07, "logits/chosen": -2.005864381790161, "logits/rejected": -2.0571203231811523, "logps/chosen": -69.73316192626953, "logps/rejected": -63.01100158691406, "loss": 0.8219, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5684286952018738, "rewards/margins": 0.8522120714187622, "rewards/rejected": -0.2837834358215332, "step": 145 }, { "epoch": 0.17387420915519167, "grad_norm": 3.2072224617004395, "learning_rate": 9.827175208581644e-07, "logits/chosen": -2.0738697052001953, "logits/rejected": -2.172498941421509, "logps/chosen": -68.20930480957031, "logps/rejected": -70.6118392944336, "loss": 0.7892, "rewards/accuracies": 0.75, "rewards/chosen": 0.40957191586494446, "rewards/margins": 1.1250214576721191, "rewards/rejected": -0.7154496908187866, "step": 146 }, { "epoch": 0.17506512839598065, "grad_norm": 3.8597137928009033, "learning_rate": 9.825983313468415e-07, "logits/chosen": -2.090118646621704, "logits/rejected": -2.1352641582489014, "logps/chosen": -68.11825561523438, "logps/rejected": -67.88504028320312, "loss": 0.7999, "rewards/accuracies": 0.75, "rewards/chosen": 0.23044489324092865, "rewards/margins": 1.0094027519226074, "rewards/rejected": -0.7789578437805176, "step": 147 }, { "epoch": 0.17625604763676964, "grad_norm": 3.66079044342041, "learning_rate": 9.824791418355183e-07, "logits/chosen": -2.071930408477783, "logits/rejected": -2.1434175968170166, "logps/chosen": -63.45271301269531, "logps/rejected": -71.06719970703125, "loss": 0.8298, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24694254994392395, "rewards/margins": 0.9144624471664429, "rewards/rejected": -0.6675198674201965, "step": 148 }, { "epoch": 0.17744696687755862, "grad_norm": 3.813354969024658, "learning_rate": 9.823599523241954e-07, "logits/chosen": -2.0076229572296143, "logits/rejected": -2.166483163833618, "logps/chosen": -67.3553466796875, "logps/rejected": -67.42361450195312, "loss": 0.8038, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005198769271373749, "rewards/margins": 0.9976803660392761, "rewards/rejected": -1.002879023551941, "step": 149 }, { "epoch": 0.1786378861183476, "grad_norm": 3.9033925533294678, "learning_rate": 9.822407628128725e-07, "logits/chosen": -2.1381115913391113, "logits/rejected": -2.1526906490325928, "logps/chosen": -74.60137939453125, "logps/rejected": -68.7036361694336, "loss": 0.7822, "rewards/accuracies": 0.75, "rewards/chosen": 0.27569350600242615, "rewards/margins": 1.0473655462265015, "rewards/rejected": -0.7716720700263977, "step": 150 }, { "epoch": 0.1798288053591366, "grad_norm": 3.459782361984253, "learning_rate": 9.821215733015493e-07, "logits/chosen": -2.136368989944458, "logits/rejected": -2.075028657913208, "logps/chosen": -66.05201721191406, "logps/rejected": -66.44989013671875, "loss": 0.9767, "rewards/accuracies": 0.46875, "rewards/chosen": 0.16263572871685028, "rewards/margins": 0.1576789766550064, "rewards/rejected": 0.004956766963005066, "step": 151 }, { "epoch": 0.18101972459992557, "grad_norm": 4.35509729385376, "learning_rate": 9.820023837902264e-07, "logits/chosen": -2.103015184402466, "logits/rejected": -2.1774420738220215, "logps/chosen": -68.27909851074219, "logps/rejected": -72.03514862060547, "loss": 0.8208, "rewards/accuracies": 0.75, "rewards/chosen": 0.4620397090911865, "rewards/margins": 0.9541091322898865, "rewards/rejected": -0.49206939339637756, "step": 152 }, { "epoch": 0.18221064384071456, "grad_norm": 4.262054443359375, "learning_rate": 9.818831942789035e-07, "logits/chosen": -2.045294761657715, "logits/rejected": -2.222249746322632, "logps/chosen": -64.26964569091797, "logps/rejected": -69.26112365722656, "loss": 0.6802, "rewards/accuracies": 0.84375, "rewards/chosen": 0.614319384098053, "rewards/margins": 1.6484620571136475, "rewards/rejected": -1.0341426134109497, "step": 153 }, { "epoch": 0.18340156308150354, "grad_norm": 4.5582275390625, "learning_rate": 9.817640047675803e-07, "logits/chosen": -2.079808473587036, "logits/rejected": -2.1491281986236572, "logps/chosen": -67.69279479980469, "logps/rejected": -68.80178833007812, "loss": 0.8549, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004261620342731476, "rewards/margins": 0.7605800032615662, "rewards/rejected": -0.7648415565490723, "step": 154 }, { "epoch": 0.18459248232229253, "grad_norm": 3.6772711277008057, "learning_rate": 9.816448152562574e-07, "logits/chosen": -2.1179754734039307, "logits/rejected": -2.1545450687408447, "logps/chosen": -64.47651672363281, "logps/rejected": -67.83186340332031, "loss": 0.8639, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1526850461959839, "rewards/margins": 0.782399594783783, "rewards/rejected": -0.6297144293785095, "step": 155 }, { "epoch": 0.1857834015630815, "grad_norm": 3.771129846572876, "learning_rate": 9.815256257449343e-07, "logits/chosen": -2.047318458557129, "logits/rejected": -2.1822896003723145, "logps/chosen": -71.52033233642578, "logps/rejected": -68.17339324951172, "loss": 0.7637, "rewards/accuracies": 0.71875, "rewards/chosen": 0.47775086760520935, "rewards/margins": 1.248555302619934, "rewards/rejected": -0.7708043456077576, "step": 156 }, { "epoch": 0.1869743208038705, "grad_norm": 4.038097381591797, "learning_rate": 9.814064362336114e-07, "logits/chosen": -2.03074049949646, "logits/rejected": -2.1594130992889404, "logps/chosen": -67.63143157958984, "logps/rejected": -68.26014709472656, "loss": 0.7953, "rewards/accuracies": 0.75, "rewards/chosen": 0.4917548894882202, "rewards/margins": 1.1505261659622192, "rewards/rejected": -0.6587712168693542, "step": 157 }, { "epoch": 0.18816524004465948, "grad_norm": 3.4557275772094727, "learning_rate": 9.812872467222884e-07, "logits/chosen": -2.0236685276031494, "logits/rejected": -2.0870625972747803, "logps/chosen": -64.18167877197266, "logps/rejected": -60.709964752197266, "loss": 0.9075, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16807787120342255, "rewards/margins": 0.5112019777297974, "rewards/rejected": -0.3431241810321808, "step": 158 }, { "epoch": 0.18935615928544847, "grad_norm": 4.154829502105713, "learning_rate": 9.811680572109655e-07, "logits/chosen": -2.0764193534851074, "logits/rejected": -2.125408172607422, "logps/chosen": -65.80474090576172, "logps/rejected": -65.56358337402344, "loss": 0.8664, "rewards/accuracies": 0.625, "rewards/chosen": 0.22508302330970764, "rewards/margins": 0.6572768092155457, "rewards/rejected": -0.4321938455104828, "step": 159 }, { "epoch": 0.19054707852623745, "grad_norm": 3.584824800491333, "learning_rate": 9.810488676996424e-07, "logits/chosen": -2.126493453979492, "logits/rejected": -2.214369773864746, "logps/chosen": -67.39305877685547, "logps/rejected": -65.82145690917969, "loss": 0.81, "rewards/accuracies": 0.75, "rewards/chosen": 0.14219757914543152, "rewards/margins": 1.0637831687927246, "rewards/rejected": -0.9215856194496155, "step": 160 }, { "epoch": 0.19173799776702644, "grad_norm": 3.3031015396118164, "learning_rate": 9.809296781883194e-07, "logits/chosen": -2.078460216522217, "logits/rejected": -2.1394264698028564, "logps/chosen": -66.56594848632812, "logps/rejected": -69.8752670288086, "loss": 0.8496, "rewards/accuracies": 0.59375, "rewards/chosen": 0.20721465349197388, "rewards/margins": 0.8263473510742188, "rewards/rejected": -0.6191325783729553, "step": 161 }, { "epoch": 0.19292891700781542, "grad_norm": 4.050117492675781, "learning_rate": 9.808104886769963e-07, "logits/chosen": -2.0724823474884033, "logits/rejected": -2.155473470687866, "logps/chosen": -65.60140991210938, "logps/rejected": -70.21705627441406, "loss": 0.7646, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5772321820259094, "rewards/margins": 1.2883250713348389, "rewards/rejected": -0.7110928297042847, "step": 162 }, { "epoch": 0.1941198362486044, "grad_norm": 4.184625625610352, "learning_rate": 9.806912991656734e-07, "logits/chosen": -1.9820616245269775, "logits/rejected": -2.1038618087768555, "logps/chosen": -68.83283996582031, "logps/rejected": -63.0880126953125, "loss": 0.7944, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21776309609413147, "rewards/margins": 1.1722980737686157, "rewards/rejected": -0.9545350074768066, "step": 163 }, { "epoch": 0.19531075548939336, "grad_norm": 4.221437454223633, "learning_rate": 9.805721096543502e-07, "logits/chosen": -2.014169216156006, "logits/rejected": -2.198230743408203, "logps/chosen": -64.88208770751953, "logps/rejected": -63.280330657958984, "loss": 0.6743, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9322190880775452, "rewards/margins": 1.764447808265686, "rewards/rejected": -0.8322287797927856, "step": 164 }, { "epoch": 0.19650167473018235, "grad_norm": 3.4287779331207275, "learning_rate": 9.804529201430273e-07, "logits/chosen": -2.0385358333587646, "logits/rejected": -2.1154332160949707, "logps/chosen": -66.80724334716797, "logps/rejected": -68.53314208984375, "loss": 0.7695, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5204597115516663, "rewards/margins": 1.3212864398956299, "rewards/rejected": -0.8008266687393188, "step": 165 }, { "epoch": 0.19769259397097133, "grad_norm": 4.014829158782959, "learning_rate": 9.803337306317044e-07, "logits/chosen": -2.068859338760376, "logits/rejected": -2.2108538150787354, "logps/chosen": -67.49810028076172, "logps/rejected": -66.98979187011719, "loss": 0.6959, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6892157793045044, "rewards/margins": 1.6890063285827637, "rewards/rejected": -0.9997906684875488, "step": 166 }, { "epoch": 0.19888351321176032, "grad_norm": 4.076284885406494, "learning_rate": 9.802145411203815e-07, "logits/chosen": -2.0955209732055664, "logits/rejected": -2.1867847442626953, "logps/chosen": -69.4415054321289, "logps/rejected": -64.42069244384766, "loss": 0.7533, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4622475504875183, "rewards/margins": 1.284137487411499, "rewards/rejected": -0.8218899965286255, "step": 167 }, { "epoch": 0.2000744324525493, "grad_norm": 3.8336105346679688, "learning_rate": 9.800953516090583e-07, "logits/chosen": -2.022120714187622, "logits/rejected": -2.072425127029419, "logps/chosen": -59.63624954223633, "logps/rejected": -64.46969604492188, "loss": 0.8518, "rewards/accuracies": 0.59375, "rewards/chosen": 0.7772837281227112, "rewards/margins": 0.7684420943260193, "rewards/rejected": 0.008841633796691895, "step": 168 }, { "epoch": 0.2000744324525493, "eval_logits/chosen": -2.095194101333618, "eval_logits/rejected": -2.1891181468963623, "eval_logps/chosen": -66.78530883789062, "eval_logps/rejected": -66.67434692382812, "eval_loss": 0.8102010488510132, "eval_rewards/accuracies": 0.6915708780288696, "eval_rewards/chosen": 0.3924814760684967, "eval_rewards/margins": 1.0776466131210327, "eval_rewards/rejected": -0.6851651668548584, "eval_runtime": 1876.375, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.278, "step": 168 }, { "epoch": 0.20126535169333828, "grad_norm": 4.509078502655029, "learning_rate": 9.799761620977354e-07, "logits/chosen": -2.0032639503479004, "logits/rejected": -2.1738126277923584, "logps/chosen": -69.3968276977539, "logps/rejected": -70.0867691040039, "loss": 0.7243, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5515201687812805, "rewards/margins": 1.599515438079834, "rewards/rejected": -1.0479952096939087, "step": 169 }, { "epoch": 0.20245627093412727, "grad_norm": 3.8341472148895264, "learning_rate": 9.798569725864123e-07, "logits/chosen": -2.1320559978485107, "logits/rejected": -2.2534005641937256, "logps/chosen": -65.94788360595703, "logps/rejected": -73.53800201416016, "loss": 0.6592, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4464108347892761, "rewards/margins": 2.0964910984039307, "rewards/rejected": -1.6500803232192993, "step": 170 }, { "epoch": 0.20364719017491625, "grad_norm": 4.210196018218994, "learning_rate": 9.797377830750893e-07, "logits/chosen": -2.122807264328003, "logits/rejected": -2.210653305053711, "logps/chosen": -68.03795623779297, "logps/rejected": -70.7711181640625, "loss": 0.7782, "rewards/accuracies": 0.78125, "rewards/chosen": 0.37717828154563904, "rewards/margins": 1.3744537830352783, "rewards/rejected": -0.9972752928733826, "step": 171 }, { "epoch": 0.20483810941570524, "grad_norm": 4.25936222076416, "learning_rate": 9.796185935637664e-07, "logits/chosen": -2.018360137939453, "logits/rejected": -2.099370002746582, "logps/chosen": -63.33076477050781, "logps/rejected": -69.88211822509766, "loss": 0.6958, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7823428511619568, "rewards/margins": 1.699712872505188, "rewards/rejected": -0.9173699617385864, "step": 172 }, { "epoch": 0.20602902865649422, "grad_norm": 3.6652023792266846, "learning_rate": 9.794994040524433e-07, "logits/chosen": -2.070406198501587, "logits/rejected": -2.119770050048828, "logps/chosen": -64.86528015136719, "logps/rejected": -69.3088150024414, "loss": 0.7463, "rewards/accuracies": 0.71875, "rewards/chosen": 0.47796541452407837, "rewards/margins": 1.4572774171829224, "rewards/rejected": -0.9793118238449097, "step": 173 }, { "epoch": 0.2072199478972832, "grad_norm": 5.020044803619385, "learning_rate": 9.793802145411203e-07, "logits/chosen": -2.0431876182556152, "logits/rejected": -2.06881046295166, "logps/chosen": -67.43488311767578, "logps/rejected": -71.38408660888672, "loss": 0.8151, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2391720861196518, "rewards/margins": 1.098628044128418, "rewards/rejected": -0.8594560623168945, "step": 174 }, { "epoch": 0.2084108671380722, "grad_norm": 3.4613821506500244, "learning_rate": 9.792610250297974e-07, "logits/chosen": -2.0769290924072266, "logits/rejected": -2.138162136077881, "logps/chosen": -72.55262756347656, "logps/rejected": -72.98710632324219, "loss": 0.8379, "rewards/accuracies": 0.71875, "rewards/chosen": 0.03935595229268074, "rewards/margins": 0.9902422428131104, "rewards/rejected": -0.9508862495422363, "step": 175 }, { "epoch": 0.20960178637886118, "grad_norm": 4.220311164855957, "learning_rate": 9.791418355184743e-07, "logits/chosen": -2.013784885406494, "logits/rejected": -2.1314945220947266, "logps/chosen": -64.54747009277344, "logps/rejected": -66.63037872314453, "loss": 0.7854, "rewards/accuracies": 0.78125, "rewards/chosen": 0.25128644704818726, "rewards/margins": 1.3092023134231567, "rewards/rejected": -1.0579159259796143, "step": 176 }, { "epoch": 0.21079270561965016, "grad_norm": 4.19986629486084, "learning_rate": 9.790226460071513e-07, "logits/chosen": -2.0383248329162598, "logits/rejected": -2.0758168697357178, "logps/chosen": -63.8239631652832, "logps/rejected": -70.85440826416016, "loss": 0.886, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3267938494682312, "rewards/margins": 0.7741100788116455, "rewards/rejected": -0.44731616973876953, "step": 177 }, { "epoch": 0.21198362486043915, "grad_norm": 3.912597417831421, "learning_rate": 9.789034564958282e-07, "logits/chosen": -2.0150275230407715, "logits/rejected": -2.140063762664795, "logps/chosen": -67.88175964355469, "logps/rejected": -69.7936782836914, "loss": 0.6761, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8536794185638428, "rewards/margins": 2.037966251373291, "rewards/rejected": -1.1842868328094482, "step": 178 }, { "epoch": 0.21317454410122813, "grad_norm": 4.298770427703857, "learning_rate": 9.787842669845053e-07, "logits/chosen": -2.0052337646484375, "logits/rejected": -2.1124563217163086, "logps/chosen": -70.94829559326172, "logps/rejected": -72.99111938476562, "loss": 0.806, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17831172049045563, "rewards/margins": 1.1731653213500977, "rewards/rejected": -0.9948536157608032, "step": 179 }, { "epoch": 0.21436546334201712, "grad_norm": 4.608489990234375, "learning_rate": 9.786650774731824e-07, "logits/chosen": -2.1431965827941895, "logits/rejected": -2.148815393447876, "logps/chosen": -67.91397094726562, "logps/rejected": -70.10904693603516, "loss": 0.857, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21055996417999268, "rewards/margins": 0.9587528109550476, "rewards/rejected": -0.7481929063796997, "step": 180 }, { "epoch": 0.2155563825828061, "grad_norm": 3.382219076156616, "learning_rate": 9.785458879618594e-07, "logits/chosen": -2.103175640106201, "logits/rejected": -2.169501304626465, "logps/chosen": -71.94068908691406, "logps/rejected": -71.22469329833984, "loss": 0.8148, "rewards/accuracies": 0.625, "rewards/chosen": 0.3448840379714966, "rewards/margins": 1.1543853282928467, "rewards/rejected": -0.8095013499259949, "step": 181 }, { "epoch": 0.21674730182359508, "grad_norm": 4.483565330505371, "learning_rate": 9.784266984505363e-07, "logits/chosen": -2.020340919494629, "logits/rejected": -2.094576120376587, "logps/chosen": -62.03450393676758, "logps/rejected": -67.73089599609375, "loss": 0.8169, "rewards/accuracies": 0.6875, "rewards/chosen": 0.36339107155799866, "rewards/margins": 1.0842857360839844, "rewards/rejected": -0.720894455909729, "step": 182 }, { "epoch": 0.21793822106438407, "grad_norm": 4.564268112182617, "learning_rate": 9.783075089392134e-07, "logits/chosen": -2.184807777404785, "logits/rejected": -2.249319553375244, "logps/chosen": -66.42752075195312, "logps/rejected": -70.45005798339844, "loss": 0.8919, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13213390111923218, "rewards/margins": 0.7339523434638977, "rewards/rejected": -0.6018184423446655, "step": 183 }, { "epoch": 0.21912914030517305, "grad_norm": 4.3693647384643555, "learning_rate": 9.781883194278902e-07, "logits/chosen": -2.0527477264404297, "logits/rejected": -2.203853130340576, "logps/chosen": -68.08193969726562, "logps/rejected": -72.46173095703125, "loss": 0.753, "rewards/accuracies": 0.875, "rewards/chosen": 0.08398689329624176, "rewards/margins": 1.5248041152954102, "rewards/rejected": -1.4408174753189087, "step": 184 }, { "epoch": 0.22032005954596204, "grad_norm": 4.541518688201904, "learning_rate": 9.780691299165673e-07, "logits/chosen": -2.132509708404541, "logits/rejected": -2.1862146854400635, "logps/chosen": -62.581851959228516, "logps/rejected": -75.58824157714844, "loss": 0.7893, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10172304511070251, "rewards/margins": 1.4503560066223145, "rewards/rejected": -1.348633050918579, "step": 185 }, { "epoch": 0.22151097878675102, "grad_norm": 3.9704833030700684, "learning_rate": 9.779499404052442e-07, "logits/chosen": -2.093498468399048, "logits/rejected": -2.2299716472625732, "logps/chosen": -66.90142059326172, "logps/rejected": -71.37428283691406, "loss": 0.7199, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3936837315559387, "rewards/margins": 1.9375908374786377, "rewards/rejected": -1.5439070463180542, "step": 186 }, { "epoch": 0.22270189802754, "grad_norm": 4.658227920532227, "learning_rate": 9.778307508939212e-07, "logits/chosen": -2.063769578933716, "logits/rejected": -2.17863392829895, "logps/chosen": -67.90970611572266, "logps/rejected": -72.40007019042969, "loss": 0.6723, "rewards/accuracies": 0.75, "rewards/chosen": 0.6009326577186584, "rewards/margins": 1.9633736610412598, "rewards/rejected": -1.3624409437179565, "step": 187 }, { "epoch": 0.223892817268329, "grad_norm": 4.5494608879089355, "learning_rate": 9.777115613825983e-07, "logits/chosen": -1.9910345077514648, "logits/rejected": -2.156224250793457, "logps/chosen": -61.65850830078125, "logps/rejected": -72.31427764892578, "loss": 0.6572, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6405559778213501, "rewards/margins": 2.095534324645996, "rewards/rejected": -1.4549782276153564, "step": 188 }, { "epoch": 0.22508373650911798, "grad_norm": 5.3154072761535645, "learning_rate": 9.775923718712754e-07, "logits/chosen": -2.091996192932129, "logits/rejected": -2.222421884536743, "logps/chosen": -69.68858337402344, "logps/rejected": -70.23088073730469, "loss": 0.7592, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4717257618904114, "rewards/margins": 1.5318044424057007, "rewards/rejected": -1.0600786209106445, "step": 189 }, { "epoch": 0.22627465574990696, "grad_norm": 3.6894025802612305, "learning_rate": 9.774731823599522e-07, "logits/chosen": -2.089150905609131, "logits/rejected": -2.1946969032287598, "logps/chosen": -64.80979919433594, "logps/rejected": -70.37748718261719, "loss": 0.691, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5179126262664795, "rewards/margins": 1.9195709228515625, "rewards/rejected": -1.401658535003662, "step": 190 }, { "epoch": 0.22746557499069595, "grad_norm": 5.212259769439697, "learning_rate": 9.773539928486293e-07, "logits/chosen": -2.0406267642974854, "logits/rejected": -2.1987547874450684, "logps/chosen": -64.7923355102539, "logps/rejected": -74.67061614990234, "loss": 0.6675, "rewards/accuracies": 0.75, "rewards/chosen": 0.3659212589263916, "rewards/margins": 2.140580415725708, "rewards/rejected": -1.7746591567993164, "step": 191 }, { "epoch": 0.22865649423148493, "grad_norm": 4.470324516296387, "learning_rate": 9.772348033373062e-07, "logits/chosen": -2.1291632652282715, "logits/rejected": -2.172471284866333, "logps/chosen": -67.14098358154297, "logps/rejected": -71.51541900634766, "loss": 0.7778, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2821413278579712, "rewards/margins": 1.347557783126831, "rewards/rejected": -1.0654164552688599, "step": 192 }, { "epoch": 0.22984741347227391, "grad_norm": 4.365176200866699, "learning_rate": 9.771156138259833e-07, "logits/chosen": -2.113168716430664, "logits/rejected": -2.1591379642486572, "logps/chosen": -66.10099792480469, "logps/rejected": -72.36320495605469, "loss": 0.7386, "rewards/accuracies": 0.59375, "rewards/chosen": 0.30903783440589905, "rewards/margins": 1.5732556581497192, "rewards/rejected": -1.264217734336853, "step": 193 }, { "epoch": 0.2310383327130629, "grad_norm": 3.828382968902588, "learning_rate": 9.769964243146603e-07, "logits/chosen": -2.0883846282958984, "logits/rejected": -2.161964178085327, "logps/chosen": -67.02897644042969, "logps/rejected": -76.09003448486328, "loss": 0.7405, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6336478590965271, "rewards/margins": 1.7819035053253174, "rewards/rejected": -1.1482555866241455, "step": 194 }, { "epoch": 0.23222925195385188, "grad_norm": 4.466710090637207, "learning_rate": 9.768772348033372e-07, "logits/chosen": -2.0876359939575195, "logits/rejected": -2.1192147731781006, "logps/chosen": -64.33212280273438, "logps/rejected": -72.78559112548828, "loss": 0.863, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5134353637695312, "rewards/margins": 1.0370181798934937, "rewards/rejected": -0.5235828161239624, "step": 195 }, { "epoch": 0.23342017119464087, "grad_norm": 4.989686012268066, "learning_rate": 9.767580452920143e-07, "logits/chosen": -1.9890960454940796, "logits/rejected": -2.1436655521392822, "logps/chosen": -63.948699951171875, "logps/rejected": -67.88838958740234, "loss": 0.7078, "rewards/accuracies": 0.75, "rewards/chosen": 1.0579134225845337, "rewards/margins": 1.890324354171753, "rewards/rejected": -0.8324109315872192, "step": 196 }, { "epoch": 0.23461109043542985, "grad_norm": 3.8459107875823975, "learning_rate": 9.766388557806913e-07, "logits/chosen": -2.0894532203674316, "logits/rejected": -2.1298744678497314, "logps/chosen": -68.81575012207031, "logps/rejected": -75.06775665283203, "loss": 0.7569, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6547811031341553, "rewards/margins": 1.6863417625427246, "rewards/rejected": -1.0315605401992798, "step": 197 }, { "epoch": 0.23580200967621884, "grad_norm": 4.489509582519531, "learning_rate": 9.765196662693682e-07, "logits/chosen": -2.0577783584594727, "logits/rejected": -2.17691969871521, "logps/chosen": -70.1551742553711, "logps/rejected": -69.85494232177734, "loss": 0.8081, "rewards/accuracies": 0.78125, "rewards/chosen": 0.30496376752853394, "rewards/margins": 1.5172255039215088, "rewards/rejected": -1.21226167678833, "step": 198 }, { "epoch": 0.23699292891700782, "grad_norm": 4.559385299682617, "learning_rate": 9.764004767580453e-07, "logits/chosen": -2.020599365234375, "logits/rejected": -2.182141065597534, "logps/chosen": -63.51189422607422, "logps/rejected": -71.57492065429688, "loss": 0.707, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5762036442756653, "rewards/margins": 2.013766288757324, "rewards/rejected": -1.4375628232955933, "step": 199 }, { "epoch": 0.2381838481577968, "grad_norm": 4.586977481842041, "learning_rate": 9.762812872467221e-07, "logits/chosen": -2.054384231567383, "logits/rejected": -2.168550968170166, "logps/chosen": -65.83186340332031, "logps/rejected": -72.97911834716797, "loss": 0.7878, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03160744905471802, "rewards/margins": 1.544594407081604, "rewards/rejected": -1.5762017965316772, "step": 200 }, { "epoch": 0.2393747673985858, "grad_norm": 6.149441242218018, "learning_rate": 9.761620977353992e-07, "logits/chosen": -2.0651049613952637, "logits/rejected": -2.25582218170166, "logps/chosen": -63.91303634643555, "logps/rejected": -72.27268981933594, "loss": 0.6785, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4535936415195465, "rewards/margins": 2.2461678981781006, "rewards/rejected": -1.792574405670166, "step": 201 }, { "epoch": 0.24056568663937478, "grad_norm": 3.976292848587036, "learning_rate": 9.760429082240763e-07, "logits/chosen": -2.0693507194519043, "logits/rejected": -2.148181676864624, "logps/chosen": -71.03289794921875, "logps/rejected": -72.63015747070312, "loss": 0.8331, "rewards/accuracies": 0.75, "rewards/chosen": 0.19508926570415497, "rewards/margins": 1.0774043798446655, "rewards/rejected": -0.8823151588439941, "step": 202 }, { "epoch": 0.24175660588016376, "grad_norm": 4.418228626251221, "learning_rate": 9.759237187127534e-07, "logits/chosen": -2.0654006004333496, "logits/rejected": -2.174964666366577, "logps/chosen": -68.07296752929688, "logps/rejected": -70.29798126220703, "loss": 0.7861, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1656646579504013, "rewards/margins": 1.5376962423324585, "rewards/rejected": -1.3720314502716064, "step": 203 }, { "epoch": 0.24294752512095275, "grad_norm": 4.413089275360107, "learning_rate": 9.758045292014302e-07, "logits/chosen": -2.15700626373291, "logits/rejected": -2.257018566131592, "logps/chosen": -63.60345458984375, "logps/rejected": -70.55128479003906, "loss": 0.7479, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7267858386039734, "rewards/margins": 1.889531135559082, "rewards/rejected": -1.1627453565597534, "step": 204 }, { "epoch": 0.24413844436174173, "grad_norm": 5.020565986633301, "learning_rate": 9.756853396901073e-07, "logits/chosen": -2.078622817993164, "logits/rejected": -2.1957194805145264, "logps/chosen": -63.054813385009766, "logps/rejected": -72.93460083007812, "loss": 0.7259, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6908627152442932, "rewards/margins": 1.90858793258667, "rewards/rejected": -1.2177250385284424, "step": 205 }, { "epoch": 0.24532936360253071, "grad_norm": 4.275873184204102, "learning_rate": 9.755661501787841e-07, "logits/chosen": -2.0457379817962646, "logits/rejected": -2.16450834274292, "logps/chosen": -64.46638488769531, "logps/rejected": -75.6580581665039, "loss": 0.8747, "rewards/accuracies": 0.65625, "rewards/chosen": -0.074039526283741, "rewards/margins": 1.066882610321045, "rewards/rejected": -1.1409220695495605, "step": 206 }, { "epoch": 0.2465202828433197, "grad_norm": 5.417848110198975, "learning_rate": 9.754469606674612e-07, "logits/chosen": -2.0367305278778076, "logits/rejected": -2.1446585655212402, "logps/chosen": -66.31596374511719, "logps/rejected": -67.06472778320312, "loss": 0.693, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0994064807891846, "rewards/margins": 2.038930892944336, "rewards/rejected": -0.939524233341217, "step": 207 }, { "epoch": 0.24771120208410868, "grad_norm": 4.569162845611572, "learning_rate": 9.75327771156138e-07, "logits/chosen": -2.0451059341430664, "logits/rejected": -2.1415066719055176, "logps/chosen": -63.84112548828125, "logps/rejected": -72.18502044677734, "loss": 0.9367, "rewards/accuracies": 0.625, "rewards/chosen": 0.1311434954404831, "rewards/margins": 0.8622312545776367, "rewards/rejected": -0.7310878038406372, "step": 208 }, { "epoch": 0.24890212132489767, "grad_norm": 3.8541994094848633, "learning_rate": 9.752085816448152e-07, "logits/chosen": -2.116469383239746, "logits/rejected": -2.253817558288574, "logps/chosen": -63.456520080566406, "logps/rejected": -74.64271545410156, "loss": 0.7074, "rewards/accuracies": 0.71875, "rewards/chosen": 0.32439929246902466, "rewards/margins": 1.9804308414459229, "rewards/rejected": -1.6560313701629639, "step": 209 }, { "epoch": 0.25009304056568665, "grad_norm": 3.7089409828186035, "learning_rate": 9.750893921334922e-07, "logits/chosen": -2.071775436401367, "logits/rejected": -2.2042598724365234, "logps/chosen": -69.21183013916016, "logps/rejected": -75.4842758178711, "loss": 0.6523, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5058817863464355, "rewards/margins": 2.4009504318237305, "rewards/rejected": -1.895068645477295, "step": 210 }, { "epoch": 0.2512839598064756, "grad_norm": 4.408862113952637, "learning_rate": 9.749702026221693e-07, "logits/chosen": -2.08271861076355, "logits/rejected": -2.1522023677825928, "logps/chosen": -68.2458724975586, "logps/rejected": -71.7373046875, "loss": 0.7385, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7882341742515564, "rewards/margins": 2.013417959213257, "rewards/rejected": -1.2251837253570557, "step": 211 }, { "epoch": 0.2524748790472646, "grad_norm": 5.093323707580566, "learning_rate": 9.748510131108462e-07, "logits/chosen": -2.079484701156616, "logits/rejected": -2.1456105709075928, "logps/chosen": -65.40966033935547, "logps/rejected": -72.24821472167969, "loss": 0.7715, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34504058957099915, "rewards/margins": 1.636763095855713, "rewards/rejected": -1.2917224168777466, "step": 212 }, { "epoch": 0.2536657982880536, "grad_norm": 5.202884197235107, "learning_rate": 9.747318235995232e-07, "logits/chosen": -2.0441079139709473, "logits/rejected": -2.0908896923065186, "logps/chosen": -63.82802963256836, "logps/rejected": -73.6737060546875, "loss": 0.7246, "rewards/accuracies": 0.78125, "rewards/chosen": 0.23027293384075165, "rewards/margins": 1.9351657629013062, "rewards/rejected": -1.7048927545547485, "step": 213 }, { "epoch": 0.2548567175288426, "grad_norm": 3.9744813442230225, "learning_rate": 9.746126340882e-07, "logits/chosen": -2.0012922286987305, "logits/rejected": -2.203634738922119, "logps/chosen": -60.963096618652344, "logps/rejected": -71.46150207519531, "loss": 0.652, "rewards/accuracies": 0.75, "rewards/chosen": 0.6352204084396362, "rewards/margins": 2.377963066101074, "rewards/rejected": -1.742742896080017, "step": 214 }, { "epoch": 0.25604763676963155, "grad_norm": 4.574095726013184, "learning_rate": 9.744934445768772e-07, "logits/chosen": -2.019613265991211, "logits/rejected": -2.111145496368408, "logps/chosen": -61.497833251953125, "logps/rejected": -69.71112060546875, "loss": 0.7868, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5597839951515198, "rewards/margins": 1.6737381219863892, "rewards/rejected": -1.1139541864395142, "step": 215 }, { "epoch": 0.25723855601042056, "grad_norm": 6.1001296043396, "learning_rate": 9.743742550655542e-07, "logits/chosen": -2.039886474609375, "logits/rejected": -2.1492481231689453, "logps/chosen": -65.29914855957031, "logps/rejected": -68.33444213867188, "loss": 0.6629, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7244287133216858, "rewards/margins": 2.1148862838745117, "rewards/rejected": -1.3904576301574707, "step": 216 }, { "epoch": 0.2584294752512095, "grad_norm": 6.005996227264404, "learning_rate": 9.742550655542311e-07, "logits/chosen": -2.0449187755584717, "logits/rejected": -2.135500431060791, "logps/chosen": -64.95618438720703, "logps/rejected": -69.23109436035156, "loss": 0.7926, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4655466675758362, "rewards/margins": 1.6096751689910889, "rewards/rejected": -1.1441285610198975, "step": 217 }, { "epoch": 0.25962039449199853, "grad_norm": 3.7861340045928955, "learning_rate": 9.741358760429082e-07, "logits/chosen": -2.0536160469055176, "logits/rejected": -2.2053334712982178, "logps/chosen": -64.9434585571289, "logps/rejected": -76.34773254394531, "loss": 0.6338, "rewards/accuracies": 0.78125, "rewards/chosen": 0.853361964225769, "rewards/margins": 2.644932746887207, "rewards/rejected": -1.7915705442428589, "step": 218 }, { "epoch": 0.2608113137327875, "grad_norm": 5.599305629730225, "learning_rate": 9.740166865315853e-07, "logits/chosen": -2.023293972015381, "logits/rejected": -2.2409491539001465, "logps/chosen": -64.5833740234375, "logps/rejected": -70.4532470703125, "loss": 0.5926, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2158524990081787, "rewards/margins": 2.624061107635498, "rewards/rejected": -1.4082088470458984, "step": 219 }, { "epoch": 0.2620022329735765, "grad_norm": 3.8995463848114014, "learning_rate": 9.738974970202621e-07, "logits/chosen": -2.1149144172668457, "logits/rejected": -2.24360728263855, "logps/chosen": -67.9100112915039, "logps/rejected": -80.81379699707031, "loss": 0.7379, "rewards/accuracies": 0.75, "rewards/chosen": 0.9412909746170044, "rewards/margins": 2.117875576019287, "rewards/rejected": -1.1765847206115723, "step": 220 }, { "epoch": 0.26319315221436546, "grad_norm": 4.713465213775635, "learning_rate": 9.737783075089392e-07, "logits/chosen": -2.0721511840820312, "logits/rejected": -2.204181671142578, "logps/chosen": -63.385807037353516, "logps/rejected": -71.54985809326172, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.8715008497238159, "rewards/margins": 2.3047776222229004, "rewards/rejected": -1.4332767724990845, "step": 221 }, { "epoch": 0.26438407145515447, "grad_norm": 4.0672607421875, "learning_rate": 9.73659117997616e-07, "logits/chosen": -1.9974966049194336, "logits/rejected": -2.1302897930145264, "logps/chosen": -58.44713592529297, "logps/rejected": -67.2203598022461, "loss": 0.6421, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5899746417999268, "rewards/margins": 2.6708099842071533, "rewards/rejected": -1.0808351039886475, "step": 222 }, { "epoch": 0.2655749906959434, "grad_norm": 5.275457382202148, "learning_rate": 9.735399284862931e-07, "logits/chosen": -2.096951961517334, "logits/rejected": -2.1449105739593506, "logps/chosen": -63.76134490966797, "logps/rejected": -71.77184295654297, "loss": 0.7662, "rewards/accuracies": 0.78125, "rewards/chosen": 0.46735823154449463, "rewards/margins": 1.6712943315505981, "rewards/rejected": -1.203935980796814, "step": 223 }, { "epoch": 0.26676590993673244, "grad_norm": 5.4317307472229, "learning_rate": 9.734207389749702e-07, "logits/chosen": -2.0572309494018555, "logits/rejected": -2.241177797317505, "logps/chosen": -66.0074691772461, "logps/rejected": -77.299560546875, "loss": 0.6138, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6299949884414673, "rewards/margins": 2.8418049812316895, "rewards/rejected": -2.2118098735809326, "step": 224 }, { "epoch": 0.2679568291775214, "grad_norm": 4.163811683654785, "learning_rate": 9.733015494636473e-07, "logits/chosen": -2.063748836517334, "logits/rejected": -2.1536073684692383, "logps/chosen": -68.0132827758789, "logps/rejected": -74.00108337402344, "loss": 0.782, "rewards/accuracies": 0.71875, "rewards/chosen": 0.45450836420059204, "rewards/margins": 1.6418888568878174, "rewards/rejected": -1.187380313873291, "step": 225 }, { "epoch": 0.2691477484183104, "grad_norm": 4.601158142089844, "learning_rate": 9.731823599523241e-07, "logits/chosen": -2.0918071269989014, "logits/rejected": -2.211503505706787, "logps/chosen": -66.3461685180664, "logps/rejected": -75.47848510742188, "loss": 0.7646, "rewards/accuracies": 0.6875, "rewards/chosen": 0.250731498003006, "rewards/margins": 1.812919020652771, "rewards/rejected": -1.5621873140335083, "step": 226 }, { "epoch": 0.27033866765909936, "grad_norm": 5.433541774749756, "learning_rate": 9.730631704410012e-07, "logits/chosen": -2.0259323120117188, "logits/rejected": -2.246365785598755, "logps/chosen": -62.085960388183594, "logps/rejected": -74.79238891601562, "loss": 0.6303, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5734451413154602, "rewards/margins": 2.5252699851989746, "rewards/rejected": -1.9518249034881592, "step": 227 }, { "epoch": 0.2715295868998884, "grad_norm": 4.5801615715026855, "learning_rate": 9.72943980929678e-07, "logits/chosen": -2.0982460975646973, "logits/rejected": -2.186762809753418, "logps/chosen": -63.6893424987793, "logps/rejected": -72.1757583618164, "loss": 0.8376, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06968489289283752, "rewards/margins": 1.245438575744629, "rewards/rejected": -1.1757535934448242, "step": 228 }, { "epoch": 0.27272050614067733, "grad_norm": 4.2799506187438965, "learning_rate": 9.728247914183551e-07, "logits/chosen": -2.033379077911377, "logits/rejected": -2.1439642906188965, "logps/chosen": -62.450443267822266, "logps/rejected": -70.5012435913086, "loss": 0.7162, "rewards/accuracies": 0.75, "rewards/chosen": 1.0415939092636108, "rewards/margins": 2.1317994594573975, "rewards/rejected": -1.090205430984497, "step": 229 }, { "epoch": 0.27391142538146634, "grad_norm": 5.162893295288086, "learning_rate": 9.72705601907032e-07, "logits/chosen": -2.059795379638672, "logits/rejected": -2.1940221786499023, "logps/chosen": -64.6734848022461, "logps/rejected": -71.22876739501953, "loss": 0.6595, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2121357917785645, "rewards/margins": 2.481269359588623, "rewards/rejected": -1.2691338062286377, "step": 230 }, { "epoch": 0.2751023446222553, "grad_norm": 4.854302406311035, "learning_rate": 9.72586412395709e-07, "logits/chosen": -2.061279773712158, "logits/rejected": -2.1468122005462646, "logps/chosen": -60.40235900878906, "logps/rejected": -72.70694732666016, "loss": 0.6728, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7645822763442993, "rewards/margins": 2.3024489879608154, "rewards/rejected": -1.5378665924072266, "step": 231 }, { "epoch": 0.2762932638630443, "grad_norm": 3.9446568489074707, "learning_rate": 9.724672228843862e-07, "logits/chosen": -2.015653610229492, "logits/rejected": -2.1383564472198486, "logps/chosen": -66.18637084960938, "logps/rejected": -76.45068359375, "loss": 0.7046, "rewards/accuracies": 0.75, "rewards/chosen": 0.3485923409461975, "rewards/margins": 2.2331998348236084, "rewards/rejected": -1.8846075534820557, "step": 232 }, { "epoch": 0.27748418310383327, "grad_norm": 4.106429100036621, "learning_rate": 9.723480333730632e-07, "logits/chosen": -2.0412886142730713, "logits/rejected": -2.1879706382751465, "logps/chosen": -58.433677673339844, "logps/rejected": -65.79380798339844, "loss": 0.7788, "rewards/accuracies": 0.625, "rewards/chosen": 0.4239969253540039, "rewards/margins": 1.4182395935058594, "rewards/rejected": -0.9942427277565002, "step": 233 }, { "epoch": 0.2786751023446223, "grad_norm": 5.2572855949401855, "learning_rate": 9.7222884386174e-07, "logits/chosen": -1.9684555530548096, "logits/rejected": -2.03987193107605, "logps/chosen": -59.647369384765625, "logps/rejected": -68.78548431396484, "loss": 0.676, "rewards/accuracies": 0.8125, "rewards/chosen": 1.169419527053833, "rewards/margins": 2.228832721710205, "rewards/rejected": -1.059413194656372, "step": 234 }, { "epoch": 0.27986602158541124, "grad_norm": 4.195286273956299, "learning_rate": 9.721096543504172e-07, "logits/chosen": -2.124605894088745, "logits/rejected": -2.170642137527466, "logps/chosen": -64.14769744873047, "logps/rejected": -74.6089096069336, "loss": 0.7057, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0978187322616577, "rewards/margins": 2.1849210262298584, "rewards/rejected": -1.0871022939682007, "step": 235 }, { "epoch": 0.2810569408262002, "grad_norm": 3.717033863067627, "learning_rate": 9.71990464839094e-07, "logits/chosen": -2.090604305267334, "logits/rejected": -2.114527940750122, "logps/chosen": -65.42227172851562, "logps/rejected": -66.48345947265625, "loss": 0.933, "rewards/accuracies": 0.4375, "rewards/chosen": 0.006022538989782333, "rewards/margins": 0.5497673153877258, "rewards/rejected": -0.5437447428703308, "step": 236 }, { "epoch": 0.2822478600669892, "grad_norm": 4.13685417175293, "learning_rate": 9.71871275327771e-07, "logits/chosen": -2.117845296859741, "logits/rejected": -2.215693950653076, "logps/chosen": -63.911834716796875, "logps/rejected": -70.92743682861328, "loss": 0.7701, "rewards/accuracies": 0.75, "rewards/chosen": 0.40869709849357605, "rewards/margins": 1.6877129077911377, "rewards/rejected": -1.2790157794952393, "step": 237 }, { "epoch": 0.28343877930777817, "grad_norm": 4.6425628662109375, "learning_rate": 9.717520858164482e-07, "logits/chosen": -2.0949904918670654, "logits/rejected": -2.184805154800415, "logps/chosen": -60.036231994628906, "logps/rejected": -70.91552734375, "loss": 0.6925, "rewards/accuracies": 0.65625, "rewards/chosen": 1.497302770614624, "rewards/margins": 2.3094871044158936, "rewards/rejected": -0.8121843338012695, "step": 238 }, { "epoch": 0.2846296985485672, "grad_norm": 4.7119364738464355, "learning_rate": 9.71632896305125e-07, "logits/chosen": -2.096247434616089, "logits/rejected": -2.189969062805176, "logps/chosen": -59.57472229003906, "logps/rejected": -67.63516235351562, "loss": 0.8321, "rewards/accuracies": 0.59375, "rewards/chosen": 0.6694955825805664, "rewards/margins": 1.1851533651351929, "rewards/rejected": -0.5156577825546265, "step": 239 }, { "epoch": 0.28582061778935614, "grad_norm": 4.25119161605835, "learning_rate": 9.715137067938021e-07, "logits/chosen": -2.0782291889190674, "logits/rejected": -2.1119649410247803, "logps/chosen": -61.315025329589844, "logps/rejected": -72.03547668457031, "loss": 0.8743, "rewards/accuracies": 0.625, "rewards/chosen": 0.6162082552909851, "rewards/margins": 0.9367533922195435, "rewards/rejected": -0.3205450773239136, "step": 240 }, { "epoch": 0.28701153703014515, "grad_norm": 3.957826614379883, "learning_rate": 9.713945172824792e-07, "logits/chosen": -2.1422863006591797, "logits/rejected": -2.207879066467285, "logps/chosen": -65.33467864990234, "logps/rejected": -75.01111602783203, "loss": 0.643, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9759160280227661, "rewards/margins": 2.407346248626709, "rewards/rejected": -1.4314303398132324, "step": 241 }, { "epoch": 0.2882024562709341, "grad_norm": 4.073090553283691, "learning_rate": 9.71275327771156e-07, "logits/chosen": -2.051300525665283, "logits/rejected": -2.1515309810638428, "logps/chosen": -61.373016357421875, "logps/rejected": -70.26400756835938, "loss": 0.7242, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8974589109420776, "rewards/margins": 1.896742343902588, "rewards/rejected": -0.9992835521697998, "step": 242 }, { "epoch": 0.2893933755117231, "grad_norm": 4.258055686950684, "learning_rate": 9.711561382598331e-07, "logits/chosen": -2.034534454345703, "logits/rejected": -2.1427063941955566, "logps/chosen": -68.26506805419922, "logps/rejected": -69.7530517578125, "loss": 0.7416, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1822150945663452, "rewards/margins": 2.0133514404296875, "rewards/rejected": -0.8311365842819214, "step": 243 }, { "epoch": 0.2905842947525121, "grad_norm": 4.205196857452393, "learning_rate": 9.7103694874851e-07, "logits/chosen": -2.0992562770843506, "logits/rejected": -2.1925134658813477, "logps/chosen": -61.553958892822266, "logps/rejected": -69.81407165527344, "loss": 0.669, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0720895528793335, "rewards/margins": 2.218890428543091, "rewards/rejected": -1.1468009948730469, "step": 244 }, { "epoch": 0.2917752139933011, "grad_norm": 4.336493015289307, "learning_rate": 9.70917759237187e-07, "logits/chosen": -2.0306808948516846, "logits/rejected": -2.1234045028686523, "logps/chosen": -60.45526123046875, "logps/rejected": -69.82769012451172, "loss": 0.7358, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6461515426635742, "rewards/margins": 1.864901065826416, "rewards/rejected": -1.2187495231628418, "step": 245 }, { "epoch": 0.29296613323409004, "grad_norm": 5.358771324157715, "learning_rate": 9.707985697258641e-07, "logits/chosen": -2.161972999572754, "logits/rejected": -2.16630220413208, "logps/chosen": -67.70983123779297, "logps/rejected": -70.09552764892578, "loss": 0.7596, "rewards/accuracies": 0.65625, "rewards/chosen": 0.9123355746269226, "rewards/margins": 1.6832486391067505, "rewards/rejected": -0.7709131240844727, "step": 246 }, { "epoch": 0.29415705247487905, "grad_norm": 4.083241939544678, "learning_rate": 9.706793802145412e-07, "logits/chosen": -2.071126699447632, "logits/rejected": -2.1447150707244873, "logps/chosen": -56.71778106689453, "logps/rejected": -65.2761459350586, "loss": 0.7819, "rewards/accuracies": 0.65625, "rewards/chosen": 0.985905647277832, "rewards/margins": 1.649339199066162, "rewards/rejected": -0.6634334921836853, "step": 247 }, { "epoch": 0.295347971715668, "grad_norm": 5.907279968261719, "learning_rate": 9.70560190703218e-07, "logits/chosen": -1.9931800365447998, "logits/rejected": -2.193349599838257, "logps/chosen": -64.03900909423828, "logps/rejected": -75.08411407470703, "loss": 0.811, "rewards/accuracies": 0.75, "rewards/chosen": -0.1597195863723755, "rewards/margins": 1.587841272354126, "rewards/rejected": -1.7475612163543701, "step": 248 }, { "epoch": 0.296538890956457, "grad_norm": 4.659930229187012, "learning_rate": 9.704410011918951e-07, "logits/chosen": -1.9925682544708252, "logits/rejected": -2.1594223976135254, "logps/chosen": -53.34965133666992, "logps/rejected": -69.26322937011719, "loss": 0.6163, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4652992486953735, "rewards/margins": 2.7615597248077393, "rewards/rejected": -1.2962605953216553, "step": 249 }, { "epoch": 0.297729810197246, "grad_norm": 5.575434684753418, "learning_rate": 9.70321811680572e-07, "logits/chosen": -2.071300506591797, "logits/rejected": -2.1177151203155518, "logps/chosen": -59.644866943359375, "logps/rejected": -66.1758804321289, "loss": 0.8269, "rewards/accuracies": 0.6875, "rewards/chosen": 1.070604681968689, "rewards/margins": 1.2136173248291016, "rewards/rejected": -0.14301252365112305, "step": 250 }, { "epoch": 0.298920729438035, "grad_norm": 4.6643548011779785, "learning_rate": 9.70202622169249e-07, "logits/chosen": -2.087681770324707, "logits/rejected": -2.2316291332244873, "logps/chosen": -63.4311637878418, "logps/rejected": -70.54767608642578, "loss": 0.7789, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4144554138183594, "rewards/margins": 1.5637069940567017, "rewards/rejected": -1.1492514610290527, "step": 251 }, { "epoch": 0.30011164867882395, "grad_norm": 6.783273220062256, "learning_rate": 9.70083432657926e-07, "logits/chosen": -2.0187854766845703, "logits/rejected": -2.2102630138397217, "logps/chosen": -61.89167022705078, "logps/rejected": -74.73959350585938, "loss": 0.7092, "rewards/accuracies": 0.78125, "rewards/chosen": 0.513271152973175, "rewards/margins": 2.2684969902038574, "rewards/rejected": -1.755225658416748, "step": 252 }, { "epoch": 0.30130256791961296, "grad_norm": 5.07480525970459, "learning_rate": 9.69964243146603e-07, "logits/chosen": -2.060853958129883, "logits/rejected": -2.1877822875976562, "logps/chosen": -67.2308578491211, "logps/rejected": -81.58448791503906, "loss": 0.6845, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7553887367248535, "rewards/margins": 2.8925485610961914, "rewards/rejected": -2.137159824371338, "step": 253 }, { "epoch": 0.3024934871604019, "grad_norm": 5.72538948059082, "learning_rate": 9.6984505363528e-07, "logits/chosen": -2.065664291381836, "logits/rejected": -2.2474164962768555, "logps/chosen": -59.609222412109375, "logps/rejected": -75.87594604492188, "loss": 0.5495, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7996182441711426, "rewards/margins": 3.527404308319092, "rewards/rejected": -1.7277858257293701, "step": 254 }, { "epoch": 0.30368440640119093, "grad_norm": 4.797563076019287, "learning_rate": 9.697258641239572e-07, "logits/chosen": -1.9752191305160522, "logits/rejected": -2.1384270191192627, "logps/chosen": -57.0115852355957, "logps/rejected": -68.13401794433594, "loss": 0.6839, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6939717531204224, "rewards/margins": 2.5112416744232178, "rewards/rejected": -0.817270040512085, "step": 255 }, { "epoch": 0.3048753256419799, "grad_norm": 8.129981994628906, "learning_rate": 9.69606674612634e-07, "logits/chosen": -2.0118088722229004, "logits/rejected": -2.1519601345062256, "logps/chosen": -61.4858283996582, "logps/rejected": -68.1743392944336, "loss": 0.8052, "rewards/accuracies": 0.75, "rewards/chosen": 0.9426844716072083, "rewards/margins": 1.6143218278884888, "rewards/rejected": -0.6716374158859253, "step": 256 }, { "epoch": 0.3060662448827689, "grad_norm": 4.252197265625, "learning_rate": 9.69487485101311e-07, "logits/chosen": -2.121166944503784, "logits/rejected": -2.117983102798462, "logps/chosen": -63.14940643310547, "logps/rejected": -69.66085052490234, "loss": 0.8502, "rewards/accuracies": 0.59375, "rewards/chosen": 1.1632612943649292, "rewards/margins": 1.1168262958526611, "rewards/rejected": 0.046435266733169556, "step": 257 }, { "epoch": 0.30725716412355786, "grad_norm": 5.507360458374023, "learning_rate": 9.69368295589988e-07, "logits/chosen": -2.009212017059326, "logits/rejected": -2.1902055740356445, "logps/chosen": -56.99302673339844, "logps/rejected": -73.44856262207031, "loss": 0.5617, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4761853218078613, "rewards/margins": 3.2839813232421875, "rewards/rejected": -1.8077960014343262, "step": 258 }, { "epoch": 0.30844808336434687, "grad_norm": 6.159033298492432, "learning_rate": 9.69249106078665e-07, "logits/chosen": -2.0232930183410645, "logits/rejected": -2.1288564205169678, "logps/chosen": -50.59425735473633, "logps/rejected": -64.03901672363281, "loss": 0.6862, "rewards/accuracies": 0.6875, "rewards/chosen": 1.656673789024353, "rewards/margins": 2.246936321258545, "rewards/rejected": -0.5902624130249023, "step": 259 }, { "epoch": 0.3096390026051358, "grad_norm": 4.864058494567871, "learning_rate": 9.69129916567342e-07, "logits/chosen": -2.0433621406555176, "logits/rejected": -2.165602207183838, "logps/chosen": -59.04927444458008, "logps/rejected": -67.32926940917969, "loss": 0.6926, "rewards/accuracies": 0.75, "rewards/chosen": 1.1433091163635254, "rewards/margins": 2.1398227214813232, "rewards/rejected": -0.9965137839317322, "step": 260 }, { "epoch": 0.31082992184592484, "grad_norm": 6.191671371459961, "learning_rate": 9.69010727056019e-07, "logits/chosen": -1.9974759817123413, "logits/rejected": -2.1150407791137695, "logps/chosen": -57.38859558105469, "logps/rejected": -63.15414810180664, "loss": 0.6526, "rewards/accuracies": 0.75, "rewards/chosen": 1.518768310546875, "rewards/margins": 2.4309074878692627, "rewards/rejected": -0.9121394157409668, "step": 261 }, { "epoch": 0.3120208410867138, "grad_norm": 6.7899580001831055, "learning_rate": 9.68891537544696e-07, "logits/chosen": -2.065378189086914, "logits/rejected": -2.2287027835845947, "logps/chosen": -57.25279235839844, "logps/rejected": -68.5666275024414, "loss": 0.7194, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2721599340438843, "rewards/margins": 2.32496976852417, "rewards/rejected": -1.0528099536895752, "step": 262 }, { "epoch": 0.3132117603275028, "grad_norm": 4.1083879470825195, "learning_rate": 9.68772348033373e-07, "logits/chosen": -1.9846159219741821, "logits/rejected": -2.0903759002685547, "logps/chosen": -51.54987335205078, "logps/rejected": -66.06757354736328, "loss": 0.7089, "rewards/accuracies": 0.8125, "rewards/chosen": 1.9627017974853516, "rewards/margins": 2.196500539779663, "rewards/rejected": -0.23379887640476227, "step": 263 }, { "epoch": 0.31440267956829177, "grad_norm": 5.07354736328125, "learning_rate": 9.6865315852205e-07, "logits/chosen": -2.0726685523986816, "logits/rejected": -2.1955034732818604, "logps/chosen": -59.5717887878418, "logps/rejected": -66.48152160644531, "loss": 0.824, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7690951228141785, "rewards/margins": 1.5007678270339966, "rewards/rejected": -0.7316728234291077, "step": 264 }, { "epoch": 0.3155935988090808, "grad_norm": 3.8698484897613525, "learning_rate": 9.68533969010727e-07, "logits/chosen": -2.038722515106201, "logits/rejected": -2.179316520690918, "logps/chosen": -55.7758674621582, "logps/rejected": -65.7356185913086, "loss": 0.7639, "rewards/accuracies": 0.75, "rewards/chosen": 1.2344189882278442, "rewards/margins": 1.9204294681549072, "rewards/rejected": -0.6860105991363525, "step": 265 }, { "epoch": 0.31678451804986973, "grad_norm": 4.3524699211120605, "learning_rate": 9.68414779499404e-07, "logits/chosen": -2.0776565074920654, "logits/rejected": -2.2376229763031006, "logps/chosen": -57.85504150390625, "logps/rejected": -70.78446197509766, "loss": 0.8196, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9775715470314026, "rewards/margins": 2.0063908100128174, "rewards/rejected": -1.0288190841674805, "step": 266 }, { "epoch": 0.31797543729065875, "grad_norm": 4.420696258544922, "learning_rate": 9.68295589988081e-07, "logits/chosen": -2.1044740676879883, "logits/rejected": -2.1758131980895996, "logps/chosen": -63.879005432128906, "logps/rejected": -71.8736801147461, "loss": 0.7831, "rewards/accuracies": 0.6875, "rewards/chosen": 0.500740647315979, "rewards/margins": 1.721328616142273, "rewards/rejected": -1.220587968826294, "step": 267 }, { "epoch": 0.3191663565314477, "grad_norm": 4.777884483337402, "learning_rate": 9.68176400476758e-07, "logits/chosen": -2.169586181640625, "logits/rejected": -2.243595838546753, "logps/chosen": -59.477882385253906, "logps/rejected": -73.32268524169922, "loss": 0.6626, "rewards/accuracies": 0.75, "rewards/chosen": 1.1850404739379883, "rewards/margins": 2.495879888534546, "rewards/rejected": -1.3108394145965576, "step": 268 }, { "epoch": 0.3203572757722367, "grad_norm": 4.5097246170043945, "learning_rate": 9.680572109654351e-07, "logits/chosen": -2.066986560821533, "logits/rejected": -2.2099666595458984, "logps/chosen": -60.64256286621094, "logps/rejected": -73.1756591796875, "loss": 0.6126, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4928622245788574, "rewards/margins": 2.9781932830810547, "rewards/rejected": -1.4853312969207764, "step": 269 }, { "epoch": 0.3215481950130257, "grad_norm": 5.056450843811035, "learning_rate": 9.67938021454112e-07, "logits/chosen": -1.996673583984375, "logits/rejected": -2.128770112991333, "logps/chosen": -59.97050476074219, "logps/rejected": -72.0550537109375, "loss": 0.6748, "rewards/accuracies": 0.8125, "rewards/chosen": 1.25550377368927, "rewards/margins": 2.6470248699188232, "rewards/rejected": -1.3915212154388428, "step": 270 }, { "epoch": 0.3227391142538147, "grad_norm": 4.893198013305664, "learning_rate": 9.67818831942789e-07, "logits/chosen": -2.0527522563934326, "logits/rejected": -2.1278185844421387, "logps/chosen": -66.65520477294922, "logps/rejected": -80.43985748291016, "loss": 0.668, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6066175699234009, "rewards/margins": 2.7968156337738037, "rewards/rejected": -2.1901981830596924, "step": 271 }, { "epoch": 0.32393003349460364, "grad_norm": 5.037367343902588, "learning_rate": 9.67699642431466e-07, "logits/chosen": -1.9992523193359375, "logits/rejected": -2.053532123565674, "logps/chosen": -59.81591796875, "logps/rejected": -67.00481414794922, "loss": 0.7715, "rewards/accuracies": 0.625, "rewards/chosen": 0.9632441997528076, "rewards/margins": 1.7549448013305664, "rewards/rejected": -0.7917006015777588, "step": 272 }, { "epoch": 0.32512095273539265, "grad_norm": 4.8170952796936035, "learning_rate": 9.67580452920143e-07, "logits/chosen": -2.029456615447998, "logits/rejected": -2.177724838256836, "logps/chosen": -52.301116943359375, "logps/rejected": -74.32378387451172, "loss": 0.6373, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2975057363510132, "rewards/margins": 2.618222713470459, "rewards/rejected": -1.320717215538025, "step": 273 }, { "epoch": 0.3263118719761816, "grad_norm": 4.344991207122803, "learning_rate": 9.674612634088199e-07, "logits/chosen": -2.00789213180542, "logits/rejected": -2.2211270332336426, "logps/chosen": -58.05427932739258, "logps/rejected": -72.33134460449219, "loss": 0.5433, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8619905710220337, "rewards/margins": 3.747495174407959, "rewards/rejected": -1.8855047225952148, "step": 274 }, { "epoch": 0.3275027912169706, "grad_norm": 5.928807735443115, "learning_rate": 9.67342073897497e-07, "logits/chosen": -2.0503427982330322, "logits/rejected": -2.1778314113616943, "logps/chosen": -61.94017028808594, "logps/rejected": -69.97138977050781, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": 1.6098482608795166, "rewards/margins": 2.8703665733337402, "rewards/rejected": -1.2605184316635132, "step": 275 }, { "epoch": 0.3286937104577596, "grad_norm": 5.153820514678955, "learning_rate": 9.67222884386174e-07, "logits/chosen": -2.038595199584961, "logits/rejected": -2.1464691162109375, "logps/chosen": -54.99358367919922, "logps/rejected": -64.48481750488281, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 1.2777093648910522, "rewards/margins": 2.3508965969085693, "rewards/rejected": -1.0731871128082275, "step": 276 }, { "epoch": 0.3298846296985486, "grad_norm": 6.0933027267456055, "learning_rate": 9.67103694874851e-07, "logits/chosen": -2.0025291442871094, "logits/rejected": -2.12542724609375, "logps/chosen": -58.879817962646484, "logps/rejected": -72.68545532226562, "loss": 0.7305, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9014298915863037, "rewards/margins": 2.3369152545928955, "rewards/rejected": -1.4354853630065918, "step": 277 }, { "epoch": 0.33107554893933755, "grad_norm": 4.430479049682617, "learning_rate": 9.66984505363528e-07, "logits/chosen": -2.0573410987854004, "logits/rejected": -2.1665244102478027, "logps/chosen": -65.00631713867188, "logps/rejected": -75.492431640625, "loss": 0.7248, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6707623600959778, "rewards/margins": 2.4796142578125, "rewards/rejected": -1.8088520765304565, "step": 278 }, { "epoch": 0.33226646818012656, "grad_norm": 5.5920586585998535, "learning_rate": 9.66865315852205e-07, "logits/chosen": -2.066577672958374, "logits/rejected": -2.1673004627227783, "logps/chosen": -65.13902282714844, "logps/rejected": -74.98155975341797, "loss": 0.8089, "rewards/accuracies": 0.65625, "rewards/chosen": 0.32063567638397217, "rewards/margins": 1.616420865058899, "rewards/rejected": -1.2957853078842163, "step": 279 }, { "epoch": 0.3334573874209155, "grad_norm": 5.362323760986328, "learning_rate": 9.667461263408819e-07, "logits/chosen": -2.0640902519226074, "logits/rejected": -2.191352128982544, "logps/chosen": -63.84337615966797, "logps/rejected": -78.95126342773438, "loss": 0.7516, "rewards/accuracies": 0.625, "rewards/chosen": 0.5339335203170776, "rewards/margins": 1.9537832736968994, "rewards/rejected": -1.4198496341705322, "step": 280 }, { "epoch": 0.3346483066617045, "grad_norm": 4.908367156982422, "learning_rate": 9.66626936829559e-07, "logits/chosen": -2.0424768924713135, "logits/rejected": -2.087646961212158, "logps/chosen": -60.19773483276367, "logps/rejected": -69.21307373046875, "loss": 0.7853, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9463330507278442, "rewards/margins": 1.5753707885742188, "rewards/rejected": -0.6290376782417297, "step": 281 }, { "epoch": 0.3358392259024935, "grad_norm": 5.150336265563965, "learning_rate": 9.66507747318236e-07, "logits/chosen": -2.082916259765625, "logits/rejected": -2.1852657794952393, "logps/chosen": -61.78703689575195, "logps/rejected": -79.23316955566406, "loss": 0.5922, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2426995038986206, "rewards/margins": 3.1702888011932373, "rewards/rejected": -1.9275891780853271, "step": 282 }, { "epoch": 0.33703014514328244, "grad_norm": 4.3616766929626465, "learning_rate": 9.663885578069129e-07, "logits/chosen": -2.1213228702545166, "logits/rejected": -2.2141427993774414, "logps/chosen": -60.88889694213867, "logps/rejected": -68.85999298095703, "loss": 0.7926, "rewards/accuracies": 0.6875, "rewards/chosen": 0.95714271068573, "rewards/margins": 1.7020437717437744, "rewards/rejected": -0.7449010610580444, "step": 283 }, { "epoch": 0.33822106438407146, "grad_norm": 4.510905742645264, "learning_rate": 9.6626936829559e-07, "logits/chosen": -2.088102340698242, "logits/rejected": -2.0928666591644287, "logps/chosen": -60.74573516845703, "logps/rejected": -68.18902587890625, "loss": 0.8517, "rewards/accuracies": 0.75, "rewards/chosen": 1.0510367155075073, "rewards/margins": 1.152805209159851, "rewards/rejected": -0.10176831483840942, "step": 284 }, { "epoch": 0.3394119836248604, "grad_norm": 5.6036529541015625, "learning_rate": 9.66150178784267e-07, "logits/chosen": -2.024994134902954, "logits/rejected": -2.108574867248535, "logps/chosen": -59.69922637939453, "logps/rejected": -71.19767761230469, "loss": 0.838, "rewards/accuracies": 0.53125, "rewards/chosen": 0.15597569942474365, "rewards/margins": 1.238416314125061, "rewards/rejected": -1.0824406147003174, "step": 285 }, { "epoch": 0.3406029028656494, "grad_norm": 4.533432483673096, "learning_rate": 9.660309892729439e-07, "logits/chosen": -2.130126953125, "logits/rejected": -2.17478609085083, "logps/chosen": -66.68235778808594, "logps/rejected": -70.39476776123047, "loss": 0.8713, "rewards/accuracies": 0.65625, "rewards/chosen": 0.17172345519065857, "rewards/margins": 0.9791300296783447, "rewards/rejected": -0.8074067234992981, "step": 286 }, { "epoch": 0.3417938221064384, "grad_norm": 6.448958873748779, "learning_rate": 9.65911799761621e-07, "logits/chosen": -2.0070550441741943, "logits/rejected": -2.2126882076263428, "logps/chosen": -56.64120101928711, "logps/rejected": -74.3144760131836, "loss": 0.5552, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6577577590942383, "rewards/margins": 3.493809223175049, "rewards/rejected": -1.8360515832901, "step": 287 }, { "epoch": 0.3429847413472274, "grad_norm": 4.266571044921875, "learning_rate": 9.657926102502978e-07, "logits/chosen": -2.1660237312316895, "logits/rejected": -2.243546962738037, "logps/chosen": -59.30128860473633, "logps/rejected": -72.55908966064453, "loss": 0.7396, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8783037662506104, "rewards/margins": 2.3007030487060547, "rewards/rejected": -1.4223994016647339, "step": 288 }, { "epoch": 0.34417566058801635, "grad_norm": 6.04646635055542, "learning_rate": 9.65673420738975e-07, "logits/chosen": -2.1477575302124023, "logits/rejected": -2.205188274383545, "logps/chosen": -58.31019592285156, "logps/rejected": -71.03971862792969, "loss": 0.7494, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1010186672210693, "rewards/margins": 2.00439453125, "rewards/rejected": -0.9033758044242859, "step": 289 }, { "epoch": 0.34536657982880536, "grad_norm": 5.001651287078857, "learning_rate": 9.65554231227652e-07, "logits/chosen": -1.9660470485687256, "logits/rejected": -2.089228868484497, "logps/chosen": -58.33993911743164, "logps/rejected": -69.64691925048828, "loss": 0.7486, "rewards/accuracies": 0.75, "rewards/chosen": 0.8101170659065247, "rewards/margins": 1.8417941331863403, "rewards/rejected": -1.0316771268844604, "step": 290 }, { "epoch": 0.3465574990695943, "grad_norm": 6.351467609405518, "learning_rate": 9.65435041716329e-07, "logits/chosen": -2.1367952823638916, "logits/rejected": -2.219146728515625, "logps/chosen": -55.74396896362305, "logps/rejected": -68.02373504638672, "loss": 0.7597, "rewards/accuracies": 0.625, "rewards/chosen": 1.382678747177124, "rewards/margins": 1.9294312000274658, "rewards/rejected": -0.5467524528503418, "step": 291 }, { "epoch": 0.34774841831038333, "grad_norm": 4.498994827270508, "learning_rate": 9.65315852205006e-07, "logits/chosen": -2.045718193054199, "logits/rejected": -2.1660208702087402, "logps/chosen": -58.67814636230469, "logps/rejected": -70.811279296875, "loss": 0.7202, "rewards/accuracies": 0.75, "rewards/chosen": 1.1656091213226318, "rewards/margins": 2.285017967224121, "rewards/rejected": -1.1194090843200684, "step": 292 }, { "epoch": 0.3489393375511723, "grad_norm": 5.389950752258301, "learning_rate": 9.65196662693683e-07, "logits/chosen": -2.062422752380371, "logits/rejected": -2.1661887168884277, "logps/chosen": -68.9365463256836, "logps/rejected": -73.11428833007812, "loss": 0.805, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5256232023239136, "rewards/margins": 1.7080421447753906, "rewards/rejected": -1.1824190616607666, "step": 293 }, { "epoch": 0.3501302567919613, "grad_norm": 5.5609965324401855, "learning_rate": 9.650774731823598e-07, "logits/chosen": -2.0318171977996826, "logits/rejected": -2.2027108669281006, "logps/chosen": -58.26385498046875, "logps/rejected": -77.07402038574219, "loss": 0.6842, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7663946151733398, "rewards/margins": 2.7826075553894043, "rewards/rejected": -2.0162129402160645, "step": 294 }, { "epoch": 0.35132117603275026, "grad_norm": 4.117257595062256, "learning_rate": 9.64958283671037e-07, "logits/chosen": -2.027434825897217, "logits/rejected": -2.153012752532959, "logps/chosen": -61.111061096191406, "logps/rejected": -77.08381652832031, "loss": 0.6358, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3963602781295776, "rewards/margins": 2.945718765258789, "rewards/rejected": -1.5493581295013428, "step": 295 }, { "epoch": 0.35251209527353927, "grad_norm": 5.814301013946533, "learning_rate": 9.648390941597138e-07, "logits/chosen": -2.073610305786133, "logits/rejected": -2.1862339973449707, "logps/chosen": -57.7972526550293, "logps/rejected": -66.91902160644531, "loss": 0.76, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2425628900527954, "rewards/margins": 1.9525370597839355, "rewards/rejected": -0.7099741101264954, "step": 296 }, { "epoch": 0.35370301451432823, "grad_norm": 5.177628993988037, "learning_rate": 9.647199046483909e-07, "logits/chosen": -2.0465247631073, "logits/rejected": -2.1716017723083496, "logps/chosen": -59.97663879394531, "logps/rejected": -76.82195281982422, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": 0.9444101452827454, "rewards/margins": 2.9712796211242676, "rewards/rejected": -2.026869297027588, "step": 297 }, { "epoch": 0.35489393375511724, "grad_norm": 4.405248165130615, "learning_rate": 9.64600715137068e-07, "logits/chosen": -2.032655954360962, "logits/rejected": -2.124601364135742, "logps/chosen": -60.43484878540039, "logps/rejected": -74.06610107421875, "loss": 0.8256, "rewards/accuracies": 0.65625, "rewards/chosen": 0.9598337411880493, "rewards/margins": 2.121115207672119, "rewards/rejected": -1.1612815856933594, "step": 298 }, { "epoch": 0.3560848529959062, "grad_norm": 4.143632888793945, "learning_rate": 9.64481525625745e-07, "logits/chosen": -2.06502103805542, "logits/rejected": -2.242353677749634, "logps/chosen": -65.98043060302734, "logps/rejected": -83.87583923339844, "loss": 0.671, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3673820495605469, "rewards/margins": 2.792447805404663, "rewards/rejected": -2.425065517425537, "step": 299 }, { "epoch": 0.3572757722366952, "grad_norm": 4.738718032836914, "learning_rate": 9.643623361144219e-07, "logits/chosen": -2.070469379425049, "logits/rejected": -2.138763904571533, "logps/chosen": -61.91624450683594, "logps/rejected": -73.09893035888672, "loss": 0.766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6123766899108887, "rewards/margins": 1.8910664319992065, "rewards/rejected": -1.2786898612976074, "step": 300 }, { "epoch": 0.35846669147748417, "grad_norm": 10.290288925170898, "learning_rate": 9.64243146603099e-07, "logits/chosen": -2.083531141281128, "logits/rejected": -2.170302391052246, "logps/chosen": -64.50162506103516, "logps/rejected": -72.09941101074219, "loss": 0.7666, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6850690841674805, "rewards/margins": 2.1210057735443115, "rewards/rejected": -1.435936689376831, "step": 301 }, { "epoch": 0.3596576107182732, "grad_norm": 4.876739978790283, "learning_rate": 9.641239570917758e-07, "logits/chosen": -2.032383918762207, "logits/rejected": -2.1312971115112305, "logps/chosen": -60.236629486083984, "logps/rejected": -74.77197265625, "loss": 0.7854, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5294451117515564, "rewards/margins": 2.0228962898254395, "rewards/rejected": -1.4934512376785278, "step": 302 }, { "epoch": 0.36084852995906214, "grad_norm": 5.451337814331055, "learning_rate": 9.640047675804529e-07, "logits/chosen": -2.090843677520752, "logits/rejected": -2.1753499507904053, "logps/chosen": -65.25302124023438, "logps/rejected": -74.45545959472656, "loss": 0.8221, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3568646311759949, "rewards/margins": 1.4053480625152588, "rewards/rejected": -1.0484833717346191, "step": 303 }, { "epoch": 0.36203944919985115, "grad_norm": 4.034193515777588, "learning_rate": 9.6388557806913e-07, "logits/chosen": -2.068943500518799, "logits/rejected": -2.152190685272217, "logps/chosen": -57.007022857666016, "logps/rejected": -76.06800079345703, "loss": 0.7373, "rewards/accuracies": 0.59375, "rewards/chosen": 1.3281725645065308, "rewards/margins": 2.43452525138855, "rewards/rejected": -1.1063528060913086, "step": 304 }, { "epoch": 0.3632303684406401, "grad_norm": 6.0637030601501465, "learning_rate": 9.637663885578068e-07, "logits/chosen": -2.073181629180908, "logits/rejected": -2.1401116847991943, "logps/chosen": -60.063907623291016, "logps/rejected": -68.97496032714844, "loss": 0.809, "rewards/accuracies": 0.75, "rewards/chosen": 0.7945795655250549, "rewards/margins": 1.738977074623108, "rewards/rejected": -0.9443975687026978, "step": 305 }, { "epoch": 0.3644212876814291, "grad_norm": 4.760483264923096, "learning_rate": 9.636471990464839e-07, "logits/chosen": -2.0931873321533203, "logits/rejected": -2.1429619789123535, "logps/chosen": -65.605224609375, "logps/rejected": -74.58683776855469, "loss": 0.8316, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1355072259902954, "rewards/margins": 1.7374855279922485, "rewards/rejected": -1.6019784212112427, "step": 306 }, { "epoch": 0.3656122069222181, "grad_norm": 4.480938911437988, "learning_rate": 9.63528009535161e-07, "logits/chosen": -2.0630204677581787, "logits/rejected": -2.1635501384735107, "logps/chosen": -61.6185302734375, "logps/rejected": -74.62679290771484, "loss": 0.7999, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7074642181396484, "rewards/margins": 1.8467730283737183, "rewards/rejected": -1.1393084526062012, "step": 307 }, { "epoch": 0.3668031261630071, "grad_norm": 6.022768497467041, "learning_rate": 9.634088200238378e-07, "logits/chosen": -2.0643200874328613, "logits/rejected": -2.197810649871826, "logps/chosen": -64.29682159423828, "logps/rejected": -84.76124572753906, "loss": 0.6563, "rewards/accuracies": 0.78125, "rewards/chosen": 0.21822328865528107, "rewards/margins": 3.103416919708252, "rewards/rejected": -2.8851938247680664, "step": 308 }, { "epoch": 0.36799404540379604, "grad_norm": 4.801238059997559, "learning_rate": 9.632896305125149e-07, "logits/chosen": -2.05877947807312, "logits/rejected": -2.19181752204895, "logps/chosen": -66.1600341796875, "logps/rejected": -79.94879150390625, "loss": 0.651, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4906170964241028, "rewards/margins": 2.9009437561035156, "rewards/rejected": -2.4103267192840576, "step": 309 }, { "epoch": 0.36918496464458506, "grad_norm": 6.0726518630981445, "learning_rate": 9.631704410011917e-07, "logits/chosen": -2.122303009033203, "logits/rejected": -2.2448318004608154, "logps/chosen": -62.2222900390625, "logps/rejected": -75.63919830322266, "loss": 0.7335, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6527348756790161, "rewards/margins": 2.5306060314178467, "rewards/rejected": -1.8778711557388306, "step": 310 }, { "epoch": 0.370375883885374, "grad_norm": 4.069520473480225, "learning_rate": 9.630512514898688e-07, "logits/chosen": -2.069359302520752, "logits/rejected": -2.1273701190948486, "logps/chosen": -60.80562210083008, "logps/rejected": -76.5986557006836, "loss": 0.6696, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1416927576065063, "rewards/margins": 2.8429582118988037, "rewards/rejected": -1.7012654542922974, "step": 311 }, { "epoch": 0.371566803126163, "grad_norm": 5.13167142868042, "learning_rate": 9.62932061978546e-07, "logits/chosen": -2.079087734222412, "logits/rejected": -2.134584903717041, "logps/chosen": -64.24640655517578, "logps/rejected": -74.72264862060547, "loss": 0.7711, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2567249536514282, "rewards/margins": 1.8866678476333618, "rewards/rejected": -1.629942774772644, "step": 312 }, { "epoch": 0.372757722366952, "grad_norm": 4.749898910522461, "learning_rate": 9.62812872467223e-07, "logits/chosen": -2.0297036170959473, "logits/rejected": -2.100623846054077, "logps/chosen": -63.70416259765625, "logps/rejected": -71.48663330078125, "loss": 0.7662, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8256319761276245, "rewards/margins": 2.0759787559509277, "rewards/rejected": -1.2503470182418823, "step": 313 }, { "epoch": 0.373948641607741, "grad_norm": 5.584217548370361, "learning_rate": 9.626936829558998e-07, "logits/chosen": -2.0534541606903076, "logits/rejected": -2.101585865020752, "logps/chosen": -66.89623260498047, "logps/rejected": -80.40367126464844, "loss": 0.749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2566073536872864, "rewards/margins": 2.1007649898529053, "rewards/rejected": -1.8441576957702637, "step": 314 }, { "epoch": 0.37513956084852995, "grad_norm": 5.762143135070801, "learning_rate": 9.62574493444577e-07, "logits/chosen": -2.0198569297790527, "logits/rejected": -2.211641788482666, "logps/chosen": -63.430728912353516, "logps/rejected": -84.98551940917969, "loss": 0.62, "rewards/accuracies": 0.75, "rewards/chosen": 0.9965840578079224, "rewards/margins": 3.318856716156006, "rewards/rejected": -2.322272300720215, "step": 315 }, { "epoch": 0.37633048008931896, "grad_norm": 4.745457649230957, "learning_rate": 9.624553039332538e-07, "logits/chosen": -2.1046388149261475, "logits/rejected": -2.2087559700012207, "logps/chosen": -62.775665283203125, "logps/rejected": -81.10919189453125, "loss": 0.6725, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6354756355285645, "rewards/margins": 2.4808337688446045, "rewards/rejected": -1.8453580141067505, "step": 316 }, { "epoch": 0.3775213993301079, "grad_norm": 4.434830665588379, "learning_rate": 9.623361144219308e-07, "logits/chosen": -2.0978164672851562, "logits/rejected": -2.193469285964966, "logps/chosen": -59.34352111816406, "logps/rejected": -77.67386627197266, "loss": 0.7144, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6573122143745422, "rewards/margins": 2.3139078617095947, "rewards/rejected": -1.6565957069396973, "step": 317 }, { "epoch": 0.37871231857089693, "grad_norm": 4.952322006225586, "learning_rate": 9.622169249106077e-07, "logits/chosen": -2.069267749786377, "logits/rejected": -2.1796445846557617, "logps/chosen": -54.55491638183594, "logps/rejected": -71.47491455078125, "loss": 0.8008, "rewards/accuracies": 0.5625, "rewards/chosen": 1.171583890914917, "rewards/margins": 1.6833664178848267, "rewards/rejected": -0.5117826461791992, "step": 318 }, { "epoch": 0.3799032378116859, "grad_norm": 3.8017871379852295, "learning_rate": 9.620977353992848e-07, "logits/chosen": -2.07423996925354, "logits/rejected": -2.2400941848754883, "logps/chosen": -65.2566146850586, "logps/rejected": -79.6611328125, "loss": 0.7431, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5381469130516052, "rewards/margins": 2.1292717456817627, "rewards/rejected": -1.5911247730255127, "step": 319 }, { "epoch": 0.3810941570524749, "grad_norm": 4.263278961181641, "learning_rate": 9.619785458879618e-07, "logits/chosen": -1.9985129833221436, "logits/rejected": -2.1613335609436035, "logps/chosen": -62.21971893310547, "logps/rejected": -83.48847961425781, "loss": 0.6842, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4516451060771942, "rewards/margins": 2.9614524841308594, "rewards/rejected": -2.5098073482513428, "step": 320 }, { "epoch": 0.38228507629326386, "grad_norm": 4.825644493103027, "learning_rate": 9.61859356376639e-07, "logits/chosen": -2.111597776412964, "logits/rejected": -2.2366552352905273, "logps/chosen": -60.694175720214844, "logps/rejected": -78.57465362548828, "loss": 0.6604, "rewards/accuracies": 0.75, "rewards/chosen": 1.0963318347930908, "rewards/margins": 2.947145462036133, "rewards/rejected": -1.8508137464523315, "step": 321 }, { "epoch": 0.38347599553405287, "grad_norm": 5.916278839111328, "learning_rate": 9.617401668653158e-07, "logits/chosen": -2.0361528396606445, "logits/rejected": -2.1930131912231445, "logps/chosen": -65.18759155273438, "logps/rejected": -82.61458587646484, "loss": 0.6026, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1607736349105835, "rewards/margins": 3.223864793777466, "rewards/rejected": -2.063091278076172, "step": 322 }, { "epoch": 0.38466691477484183, "grad_norm": 5.233994007110596, "learning_rate": 9.616209773539929e-07, "logits/chosen": -2.0388851165771484, "logits/rejected": -2.163778066635132, "logps/chosen": -61.59674835205078, "logps/rejected": -71.47067260742188, "loss": 0.7328, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2142999172210693, "rewards/margins": 2.2455356121063232, "rewards/rejected": -1.031235694885254, "step": 323 }, { "epoch": 0.38585783401563084, "grad_norm": 4.896060943603516, "learning_rate": 9.615017878426697e-07, "logits/chosen": -2.0863754749298096, "logits/rejected": -2.139662981033325, "logps/chosen": -62.45058822631836, "logps/rejected": -68.01310729980469, "loss": 0.8125, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0249167680740356, "rewards/margins": 1.6958131790161133, "rewards/rejected": -0.6708964109420776, "step": 324 }, { "epoch": 0.3870487532564198, "grad_norm": 3.848073720932007, "learning_rate": 9.613825983313468e-07, "logits/chosen": -2.0460875034332275, "logits/rejected": -2.2013590335845947, "logps/chosen": -57.516170501708984, "logps/rejected": -75.33316802978516, "loss": 0.6102, "rewards/accuracies": 0.65625, "rewards/chosen": 1.6438847780227661, "rewards/margins": 3.1415791511535645, "rewards/rejected": -1.4976942539215088, "step": 325 }, { "epoch": 0.3882396724972088, "grad_norm": 5.360319137573242, "learning_rate": 9.612634088200239e-07, "logits/chosen": -2.050051212310791, "logits/rejected": -2.16831374168396, "logps/chosen": -53.03232955932617, "logps/rejected": -71.25399017333984, "loss": 0.6691, "rewards/accuracies": 0.6875, "rewards/chosen": 1.475629210472107, "rewards/margins": 2.528454303741455, "rewards/rejected": -1.0528250932693481, "step": 326 }, { "epoch": 0.38943059173799777, "grad_norm": 4.53249979019165, "learning_rate": 9.611442193087007e-07, "logits/chosen": -1.9925678968429565, "logits/rejected": -2.184300184249878, "logps/chosen": -62.860816955566406, "logps/rejected": -76.70263671875, "loss": 0.6202, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2216259241104126, "rewards/margins": 3.227480411529541, "rewards/rejected": -2.005854368209839, "step": 327 }, { "epoch": 0.3906215109787867, "grad_norm": 4.6732683181762695, "learning_rate": 9.610250297973778e-07, "logits/chosen": -2.0671274662017822, "logits/rejected": -2.175478219985962, "logps/chosen": -57.319217681884766, "logps/rejected": -71.51544189453125, "loss": 0.7598, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1728706359863281, "rewards/margins": 2.1341514587402344, "rewards/rejected": -0.9612804651260376, "step": 328 }, { "epoch": 0.39181243021957574, "grad_norm": 4.781519889831543, "learning_rate": 9.609058402860549e-07, "logits/chosen": -2.0688581466674805, "logits/rejected": -2.1086771488189697, "logps/chosen": -63.15998458862305, "logps/rejected": -69.5813980102539, "loss": 0.8663, "rewards/accuracies": 0.59375, "rewards/chosen": 0.6529667973518372, "rewards/margins": 1.191946268081665, "rewards/rejected": -0.5389796495437622, "step": 329 }, { "epoch": 0.3930033494603647, "grad_norm": 4.234360694885254, "learning_rate": 9.607866507747317e-07, "logits/chosen": -2.066392660140991, "logits/rejected": -2.1427571773529053, "logps/chosen": -57.980621337890625, "logps/rejected": -74.50462341308594, "loss": 0.5772, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3527286052703857, "rewards/margins": 3.042518377304077, "rewards/rejected": -1.6897894144058228, "step": 330 }, { "epoch": 0.3941942687011537, "grad_norm": 4.753359794616699, "learning_rate": 9.606674612634088e-07, "logits/chosen": -2.0565109252929688, "logits/rejected": -2.2657246589660645, "logps/chosen": -59.76316833496094, "logps/rejected": -73.6971435546875, "loss": 0.6724, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1001635789871216, "rewards/margins": 2.7092437744140625, "rewards/rejected": -1.60908043384552, "step": 331 }, { "epoch": 0.39538518794194266, "grad_norm": 5.462575435638428, "learning_rate": 9.605482717520857e-07, "logits/chosen": -2.1400418281555176, "logits/rejected": -2.2585511207580566, "logps/chosen": -59.097434997558594, "logps/rejected": -78.48956298828125, "loss": 0.6845, "rewards/accuracies": 0.84375, "rewards/chosen": 1.444150447845459, "rewards/margins": 2.6714134216308594, "rewards/rejected": -1.2272629737854004, "step": 332 }, { "epoch": 0.3965761071827317, "grad_norm": 5.601904392242432, "learning_rate": 9.604290822407627e-07, "logits/chosen": -2.1376938819885254, "logits/rejected": -2.1822917461395264, "logps/chosen": -55.30360794067383, "logps/rejected": -70.33074188232422, "loss": 0.7818, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7609890699386597, "rewards/margins": 1.7825645208358765, "rewards/rejected": -1.0215754508972168, "step": 333 }, { "epoch": 0.39776702642352063, "grad_norm": 4.779054164886475, "learning_rate": 9.603098927294398e-07, "logits/chosen": -2.0383033752441406, "logits/rejected": -2.1814346313476562, "logps/chosen": -59.140838623046875, "logps/rejected": -73.1127700805664, "loss": 0.6946, "rewards/accuracies": 0.71875, "rewards/chosen": 1.4330402612686157, "rewards/margins": 2.4694504737854004, "rewards/rejected": -1.0364102125167847, "step": 334 }, { "epoch": 0.39895794566430964, "grad_norm": 4.181259632110596, "learning_rate": 9.601907032181169e-07, "logits/chosen": -2.0674991607666016, "logits/rejected": -2.159090995788574, "logps/chosen": -63.12284851074219, "logps/rejected": -74.44039154052734, "loss": 0.6999, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6113160252571106, "rewards/margins": 2.2457823753356934, "rewards/rejected": -1.6344664096832275, "step": 335 }, { "epoch": 0.4001488649050986, "grad_norm": 4.86850643157959, "learning_rate": 9.600715137067938e-07, "logits/chosen": -2.078183650970459, "logits/rejected": -2.2036328315734863, "logps/chosen": -64.65361785888672, "logps/rejected": -72.52267456054688, "loss": 0.6793, "rewards/accuracies": 0.71875, "rewards/chosen": 1.5353374481201172, "rewards/margins": 2.4511280059814453, "rewards/rejected": -0.9157902598381042, "step": 336 }, { "epoch": 0.4001488649050986, "eval_logits/chosen": -2.0820157527923584, "eval_logits/rejected": -2.1880459785461426, "eval_logps/chosen": -60.4847412109375, "eval_logps/rejected": -70.92153930664062, "eval_loss": 0.7437037229537964, "eval_rewards/accuracies": 0.717432975769043, "eval_rewards/chosen": 1.0225385427474976, "eval_rewards/margins": 2.132423162460327, "eval_rewards/rejected": -1.10988450050354, "eval_runtime": 1873.4332, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.279, "step": 336 }, { "epoch": 0.4013397841458876, "grad_norm": 3.8159170150756836, "learning_rate": 9.599523241954708e-07, "logits/chosen": -2.035350799560547, "logits/rejected": -2.2138750553131104, "logps/chosen": -57.0502815246582, "logps/rejected": -74.6092758178711, "loss": 0.6072, "rewards/accuracies": 0.8125, "rewards/chosen": 1.457616925239563, "rewards/margins": 3.2696430683135986, "rewards/rejected": -1.8120261430740356, "step": 337 }, { "epoch": 0.40253070338667657, "grad_norm": 4.825898170471191, "learning_rate": 9.598331346841477e-07, "logits/chosen": -2.0373923778533936, "logits/rejected": -2.2305614948272705, "logps/chosen": -66.34529113769531, "logps/rejected": -80.3983383178711, "loss": 0.6504, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5143203139305115, "rewards/margins": 2.842151165008545, "rewards/rejected": -2.327831268310547, "step": 338 }, { "epoch": 0.4037216226274656, "grad_norm": 3.926079750061035, "learning_rate": 9.597139451728248e-07, "logits/chosen": -1.999110460281372, "logits/rejected": -2.1460483074188232, "logps/chosen": -54.932559967041016, "logps/rejected": -70.13517761230469, "loss": 0.6674, "rewards/accuracies": 0.6875, "rewards/chosen": 1.9303613901138306, "rewards/margins": 2.8123040199279785, "rewards/rejected": -0.881942868232727, "step": 339 }, { "epoch": 0.40491254186825454, "grad_norm": 3.4636125564575195, "learning_rate": 9.595947556615016e-07, "logits/chosen": -2.048419952392578, "logits/rejected": -2.2352294921875, "logps/chosen": -56.88501739501953, "logps/rejected": -76.97010803222656, "loss": 0.5902, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6123476028442383, "rewards/margins": 3.5150198936462402, "rewards/rejected": -1.902672529220581, "step": 340 }, { "epoch": 0.40610346110904355, "grad_norm": 7.083526611328125, "learning_rate": 9.594755661501787e-07, "logits/chosen": -2.081402063369751, "logits/rejected": -2.2822277545928955, "logps/chosen": -62.79628372192383, "logps/rejected": -80.86520385742188, "loss": 0.6664, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6808026432991028, "rewards/margins": 2.9504570960998535, "rewards/rejected": -2.2696547508239746, "step": 341 }, { "epoch": 0.4072943803498325, "grad_norm": 4.59112548828125, "learning_rate": 9.593563766388558e-07, "logits/chosen": -2.1200413703918457, "logits/rejected": -2.22007155418396, "logps/chosen": -66.2000503540039, "logps/rejected": -81.56787109375, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": 0.44214266538619995, "rewards/margins": 2.878032684326172, "rewards/rejected": -2.4358901977539062, "step": 342 }, { "epoch": 0.4084852995906215, "grad_norm": 7.343733787536621, "learning_rate": 9.592371871275328e-07, "logits/chosen": -2.0677974224090576, "logits/rejected": -2.1035451889038086, "logps/chosen": -62.68344497680664, "logps/rejected": -71.75181579589844, "loss": 0.7564, "rewards/accuracies": 0.65625, "rewards/chosen": 0.8437380790710449, "rewards/margins": 2.0222389698028564, "rewards/rejected": -1.1785006523132324, "step": 343 }, { "epoch": 0.4096762188314105, "grad_norm": 6.05220365524292, "learning_rate": 9.591179976162097e-07, "logits/chosen": -1.9901398420333862, "logits/rejected": -2.1085686683654785, "logps/chosen": -55.81694030761719, "logps/rejected": -69.32213592529297, "loss": 0.6469, "rewards/accuracies": 0.875, "rewards/chosen": 1.4888585805892944, "rewards/margins": 3.0923612117767334, "rewards/rejected": -1.6035025119781494, "step": 344 }, { "epoch": 0.4108671380721995, "grad_norm": 5.059830665588379, "learning_rate": 9.589988081048868e-07, "logits/chosen": -2.0549917221069336, "logits/rejected": -2.1792116165161133, "logps/chosen": -65.74911499023438, "logps/rejected": -75.921630859375, "loss": 0.8179, "rewards/accuracies": 0.6875, "rewards/chosen": 0.36568060517311096, "rewards/margins": 1.7548487186431885, "rewards/rejected": -1.3891680240631104, "step": 345 }, { "epoch": 0.41205805731298845, "grad_norm": 5.126185894012451, "learning_rate": 9.588796185935636e-07, "logits/chosen": -2.080465078353882, "logits/rejected": -2.1675610542297363, "logps/chosen": -65.25645446777344, "logps/rejected": -74.84571075439453, "loss": 0.7614, "rewards/accuracies": 0.625, "rewards/chosen": 0.3343963921070099, "rewards/margins": 2.0711803436279297, "rewards/rejected": -1.7367841005325317, "step": 346 }, { "epoch": 0.41324897655377746, "grad_norm": 5.51197624206543, "learning_rate": 9.587604290822407e-07, "logits/chosen": -2.0594658851623535, "logits/rejected": -2.075834035873413, "logps/chosen": -61.96136474609375, "logps/rejected": -68.27967071533203, "loss": 0.9033, "rewards/accuracies": 0.59375, "rewards/chosen": 0.5282816290855408, "rewards/margins": 0.8481558561325073, "rewards/rejected": -0.3198741674423218, "step": 347 }, { "epoch": 0.4144398957945664, "grad_norm": 7.400087833404541, "learning_rate": 9.586412395709178e-07, "logits/chosen": -2.0619771480560303, "logits/rejected": -2.162893295288086, "logps/chosen": -62.48553466796875, "logps/rejected": -76.20463562011719, "loss": 0.6702, "rewards/accuracies": 0.6875, "rewards/chosen": 1.485717535018921, "rewards/margins": 3.0559496879577637, "rewards/rejected": -1.5702321529388428, "step": 348 }, { "epoch": 0.4156308150353554, "grad_norm": 5.890714645385742, "learning_rate": 9.585220500595947e-07, "logits/chosen": -1.990283489227295, "logits/rejected": -2.1919875144958496, "logps/chosen": -59.02941131591797, "logps/rejected": -81.58980560302734, "loss": 0.6346, "rewards/accuracies": 0.84375, "rewards/chosen": 1.248684287071228, "rewards/margins": 3.465710163116455, "rewards/rejected": -2.2170257568359375, "step": 349 }, { "epoch": 0.4168217342761444, "grad_norm": 5.977250576019287, "learning_rate": 9.584028605482717e-07, "logits/chosen": -2.1267504692077637, "logits/rejected": -2.22228741645813, "logps/chosen": -61.07741165161133, "logps/rejected": -79.08322143554688, "loss": 0.6777, "rewards/accuracies": 0.75, "rewards/chosen": 0.5742582678794861, "rewards/margins": 2.750896453857422, "rewards/rejected": -2.17663836479187, "step": 350 }, { "epoch": 0.4180126535169334, "grad_norm": 4.726648330688477, "learning_rate": 9.582836710369488e-07, "logits/chosen": -2.0434417724609375, "logits/rejected": -2.153148889541626, "logps/chosen": -59.59062957763672, "logps/rejected": -68.12052917480469, "loss": 0.8049, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9205676317214966, "rewards/margins": 1.9291279315948486, "rewards/rejected": -1.0085601806640625, "step": 351 }, { "epoch": 0.41920357275772235, "grad_norm": 4.38850736618042, "learning_rate": 9.581644815256257e-07, "logits/chosen": -1.9509074687957764, "logits/rejected": -2.0977509021759033, "logps/chosen": -59.73723220825195, "logps/rejected": -70.54544830322266, "loss": 0.7157, "rewards/accuracies": 0.6875, "rewards/chosen": 1.8100647926330566, "rewards/margins": 2.8287715911865234, "rewards/rejected": -1.0187065601348877, "step": 352 }, { "epoch": 0.42039449199851137, "grad_norm": 4.683259010314941, "learning_rate": 9.580452920143027e-07, "logits/chosen": -1.9800975322723389, "logits/rejected": -2.087035655975342, "logps/chosen": -57.36590576171875, "logps/rejected": -69.88197326660156, "loss": 0.7097, "rewards/accuracies": 0.65625, "rewards/chosen": 1.4682300090789795, "rewards/margins": 2.4573795795440674, "rewards/rejected": -0.9891496896743774, "step": 353 }, { "epoch": 0.4215854112393003, "grad_norm": 6.716897964477539, "learning_rate": 9.579261025029796e-07, "logits/chosen": -2.083285093307495, "logits/rejected": -2.127377510070801, "logps/chosen": -60.37921905517578, "logps/rejected": -72.19203186035156, "loss": 0.7497, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8349239230155945, "rewards/margins": 2.0725440979003906, "rewards/rejected": -1.237620234489441, "step": 354 }, { "epoch": 0.42277633048008934, "grad_norm": 4.736146926879883, "learning_rate": 9.578069129916567e-07, "logits/chosen": -2.132145643234253, "logits/rejected": -2.1591413021087646, "logps/chosen": -63.5974006652832, "logps/rejected": -75.17237854003906, "loss": 0.7946, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5492779612541199, "rewards/margins": 1.891312599182129, "rewards/rejected": -1.3420346975326538, "step": 355 }, { "epoch": 0.4239672497208783, "grad_norm": 4.434257507324219, "learning_rate": 9.576877234803337e-07, "logits/chosen": -2.0638976097106934, "logits/rejected": -2.140242576599121, "logps/chosen": -63.58637619018555, "logps/rejected": -67.5662841796875, "loss": 0.8153, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6296316385269165, "rewards/margins": 1.3817522525787354, "rewards/rejected": -0.7521206140518188, "step": 356 }, { "epoch": 0.4251581689616673, "grad_norm": 7.927043914794922, "learning_rate": 9.575685339690108e-07, "logits/chosen": -2.071065902709961, "logits/rejected": -2.107393503189087, "logps/chosen": -59.56000900268555, "logps/rejected": -79.64523315429688, "loss": 0.7511, "rewards/accuracies": 0.65625, "rewards/chosen": 0.17355157434940338, "rewards/margins": 2.370046615600586, "rewards/rejected": -2.1964950561523438, "step": 357 }, { "epoch": 0.42634908820245626, "grad_norm": 6.99193000793457, "learning_rate": 9.574493444576877e-07, "logits/chosen": -2.0694494247436523, "logits/rejected": -2.175675868988037, "logps/chosen": -54.561614990234375, "logps/rejected": -71.13641357421875, "loss": 0.6898, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6999918222427368, "rewards/margins": 2.688734769821167, "rewards/rejected": -0.9887429475784302, "step": 358 }, { "epoch": 0.4275400074432453, "grad_norm": 5.346283435821533, "learning_rate": 9.573301549463648e-07, "logits/chosen": -2.1266396045684814, "logits/rejected": -2.2338850498199463, "logps/chosen": -62.93007278442383, "logps/rejected": -83.17249298095703, "loss": 0.6661, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3347327411174774, "rewards/margins": 2.8276889324188232, "rewards/rejected": -2.4929559230804443, "step": 359 }, { "epoch": 0.42873092668403423, "grad_norm": 4.872267723083496, "learning_rate": 9.572109654350416e-07, "logits/chosen": -2.020106077194214, "logits/rejected": -2.2267231941223145, "logps/chosen": -56.38937759399414, "logps/rejected": -80.30964660644531, "loss": 0.4977, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6701418161392212, "rewards/margins": 4.205547332763672, "rewards/rejected": -2.535405397415161, "step": 360 }, { "epoch": 0.42992184592482324, "grad_norm": 5.455663681030273, "learning_rate": 9.570917759237187e-07, "logits/chosen": -2.099173069000244, "logits/rejected": -2.1525347232818604, "logps/chosen": -67.28924560546875, "logps/rejected": -83.65498352050781, "loss": 0.7266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06684048473834991, "rewards/margins": 2.3921475410461426, "rewards/rejected": -2.4589881896972656, "step": 361 }, { "epoch": 0.4311127651656122, "grad_norm": 6.877169609069824, "learning_rate": 9.569725864123955e-07, "logits/chosen": -2.083549976348877, "logits/rejected": -2.203047752380371, "logps/chosen": -58.7923583984375, "logps/rejected": -75.99861907958984, "loss": 0.6207, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2917927503585815, "rewards/margins": 3.153388500213623, "rewards/rejected": -1.861595869064331, "step": 362 }, { "epoch": 0.4323036844064012, "grad_norm": 4.3725504875183105, "learning_rate": 9.568533969010726e-07, "logits/chosen": -2.0327420234680176, "logits/rejected": -2.190349817276001, "logps/chosen": -54.835994720458984, "logps/rejected": -73.86656188964844, "loss": 0.5569, "rewards/accuracies": 0.8125, "rewards/chosen": 2.1053757667541504, "rewards/margins": 3.637225866317749, "rewards/rejected": -1.531849980354309, "step": 363 }, { "epoch": 0.43349460364719017, "grad_norm": 7.00095796585083, "learning_rate": 9.567342073897497e-07, "logits/chosen": -2.0400452613830566, "logits/rejected": -2.196161985397339, "logps/chosen": -66.29910278320312, "logps/rejected": -78.22321319580078, "loss": 0.6539, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6282521486282349, "rewards/margins": 2.576903820037842, "rewards/rejected": -1.9486517906188965, "step": 364 }, { "epoch": 0.4346855228879792, "grad_norm": 5.64506721496582, "learning_rate": 9.566150178784268e-07, "logits/chosen": -2.107952117919922, "logits/rejected": -2.182736396789551, "logps/chosen": -58.79761505126953, "logps/rejected": -79.36897277832031, "loss": 0.6932, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0087348222732544, "rewards/margins": 2.8258044719696045, "rewards/rejected": -1.8170697689056396, "step": 365 }, { "epoch": 0.43587644212876814, "grad_norm": 6.422011375427246, "learning_rate": 9.564958283671036e-07, "logits/chosen": -2.049006223678589, "logits/rejected": -2.1400158405303955, "logps/chosen": -57.50290298461914, "logps/rejected": -68.63572692871094, "loss": 0.7468, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1271835565567017, "rewards/margins": 2.069075345993042, "rewards/rejected": -0.9418919682502747, "step": 366 }, { "epoch": 0.43706736136955715, "grad_norm": 5.737358570098877, "learning_rate": 9.563766388557807e-07, "logits/chosen": -2.1103568077087402, "logits/rejected": -2.217919111251831, "logps/chosen": -61.49726104736328, "logps/rejected": -74.56958770751953, "loss": 0.734, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2806127667427063, "rewards/margins": 2.0038158893585205, "rewards/rejected": -1.723203182220459, "step": 367 }, { "epoch": 0.4382582806103461, "grad_norm": 6.224607944488525, "learning_rate": 9.562574493444576e-07, "logits/chosen": -2.103545904159546, "logits/rejected": -2.124147415161133, "logps/chosen": -59.5854377746582, "logps/rejected": -65.43804168701172, "loss": 0.868, "rewards/accuracies": 0.6875, "rewards/chosen": 1.068199634552002, "rewards/margins": 1.1273746490478516, "rewards/rejected": -0.0591750293970108, "step": 368 }, { "epoch": 0.4394491998511351, "grad_norm": 4.483856678009033, "learning_rate": 9.561382598331346e-07, "logits/chosen": -2.0401012897491455, "logits/rejected": -2.1753270626068115, "logps/chosen": -61.9348258972168, "logps/rejected": -84.17595672607422, "loss": 0.6535, "rewards/accuracies": 0.75, "rewards/chosen": 0.7349647283554077, "rewards/margins": 3.01501202583313, "rewards/rejected": -2.2800471782684326, "step": 369 }, { "epoch": 0.4406401190919241, "grad_norm": 8.891215324401855, "learning_rate": 9.560190703218117e-07, "logits/chosen": -2.0383434295654297, "logits/rejected": -2.1959471702575684, "logps/chosen": -59.472328186035156, "logps/rejected": -85.20333862304688, "loss": 0.5361, "rewards/accuracies": 0.875, "rewards/chosen": 1.842582106590271, "rewards/margins": 3.866873025894165, "rewards/rejected": -2.0242912769317627, "step": 370 }, { "epoch": 0.4418310383327131, "grad_norm": 5.028306007385254, "learning_rate": 9.558998808104886e-07, "logits/chosen": -2.0524232387542725, "logits/rejected": -2.161771535873413, "logps/chosen": -56.886600494384766, "logps/rejected": -77.05963897705078, "loss": 0.7044, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1511985063552856, "rewards/margins": 2.895174026489258, "rewards/rejected": -1.7439754009246826, "step": 371 }, { "epoch": 0.44302195757350205, "grad_norm": 6.0968337059021, "learning_rate": 9.557806912991656e-07, "logits/chosen": -2.0386414527893066, "logits/rejected": -2.179063081741333, "logps/chosen": -60.680789947509766, "logps/rejected": -77.35613250732422, "loss": 0.7413, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7408033609390259, "rewards/margins": 2.574425220489502, "rewards/rejected": -1.8336219787597656, "step": 372 }, { "epoch": 0.44421287681429106, "grad_norm": 5.698088645935059, "learning_rate": 9.556615017878427e-07, "logits/chosen": -2.064033269882202, "logits/rejected": -2.2251453399658203, "logps/chosen": -61.444156646728516, "logps/rejected": -79.0611343383789, "loss": 0.6623, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5018725395202637, "rewards/margins": 2.8327670097351074, "rewards/rejected": -2.3308944702148438, "step": 373 }, { "epoch": 0.44540379605508, "grad_norm": 6.674217224121094, "learning_rate": 9.555423122765196e-07, "logits/chosen": -2.030679702758789, "logits/rejected": -2.074617385864258, "logps/chosen": -50.81689453125, "logps/rejected": -67.62401580810547, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": 2.223998785018921, "rewards/margins": 2.844841957092285, "rewards/rejected": -0.6208430528640747, "step": 374 }, { "epoch": 0.44659471529586897, "grad_norm": 6.149866104125977, "learning_rate": 9.554231227651967e-07, "logits/chosen": -2.0916202068328857, "logits/rejected": -2.2200756072998047, "logps/chosen": -60.85958480834961, "logps/rejected": -67.3919448852539, "loss": 0.8934, "rewards/accuracies": 0.59375, "rewards/chosen": 0.7978518605232239, "rewards/margins": 1.303454875946045, "rewards/rejected": -0.5056030750274658, "step": 375 }, { "epoch": 0.447785634536658, "grad_norm": 5.906194686889648, "learning_rate": 9.553039332538735e-07, "logits/chosen": -2.1124112606048584, "logits/rejected": -2.189512252807617, "logps/chosen": -57.84950256347656, "logps/rejected": -72.47064208984375, "loss": 0.6531, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5460408926010132, "rewards/margins": 2.805232048034668, "rewards/rejected": -1.2591912746429443, "step": 376 }, { "epoch": 0.44897655377744694, "grad_norm": 5.421777248382568, "learning_rate": 9.551847437425506e-07, "logits/chosen": -2.0482513904571533, "logits/rejected": -2.1467814445495605, "logps/chosen": -59.26988220214844, "logps/rejected": -72.22036743164062, "loss": 0.7605, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3498375415802002, "rewards/margins": 1.8772006034851074, "rewards/rejected": -1.5273629426956177, "step": 377 }, { "epoch": 0.45016747301823595, "grad_norm": 5.470702171325684, "learning_rate": 9.550655542312277e-07, "logits/chosen": -1.9984112977981567, "logits/rejected": -2.1717705726623535, "logps/chosen": -65.15260314941406, "logps/rejected": -78.53115844726562, "loss": 0.704, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5737107992172241, "rewards/margins": 2.692276954650879, "rewards/rejected": -2.1185662746429443, "step": 378 }, { "epoch": 0.4513583922590249, "grad_norm": 4.843687057495117, "learning_rate": 9.549463647199047e-07, "logits/chosen": -2.0745689868927, "logits/rejected": -2.2068333625793457, "logps/chosen": -56.49230194091797, "logps/rejected": -70.0605697631836, "loss": 0.6517, "rewards/accuracies": 0.84375, "rewards/chosen": 2.2202725410461426, "rewards/margins": 2.9953696727752686, "rewards/rejected": -0.7750973701477051, "step": 379 }, { "epoch": 0.4525493114998139, "grad_norm": 5.827022075653076, "learning_rate": 9.548271752085816e-07, "logits/chosen": -2.0966556072235107, "logits/rejected": -2.259737491607666, "logps/chosen": -56.19529724121094, "logps/rejected": -77.65433502197266, "loss": 0.6698, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3631584644317627, "rewards/margins": 3.452023506164551, "rewards/rejected": -2.088865280151367, "step": 380 }, { "epoch": 0.4537402307406029, "grad_norm": 6.615501880645752, "learning_rate": 9.547079856972587e-07, "logits/chosen": -2.041388750076294, "logits/rejected": -2.170722723007202, "logps/chosen": -65.68104553222656, "logps/rejected": -82.9830551147461, "loss": 0.7684, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6595515608787537, "rewards/margins": 2.578282594680786, "rewards/rejected": -1.9187309741973877, "step": 381 }, { "epoch": 0.4549311499813919, "grad_norm": 7.298476219177246, "learning_rate": 9.545887961859355e-07, "logits/chosen": -2.0180768966674805, "logits/rejected": -2.0655763149261475, "logps/chosen": -56.19814682006836, "logps/rejected": -69.24427795410156, "loss": 0.9045, "rewards/accuracies": 0.59375, "rewards/chosen": 1.1999038457870483, "rewards/margins": 1.0239636898040771, "rewards/rejected": 0.17594027519226074, "step": 382 }, { "epoch": 0.45612206922218085, "grad_norm": 5.550846099853516, "learning_rate": 9.544696066746126e-07, "logits/chosen": -2.0942773818969727, "logits/rejected": -2.1285476684570312, "logps/chosen": -58.486122131347656, "logps/rejected": -71.45911407470703, "loss": 0.786, "rewards/accuracies": 0.65625, "rewards/chosen": 1.4097487926483154, "rewards/margins": 1.9384500980377197, "rewards/rejected": -0.5287013649940491, "step": 383 }, { "epoch": 0.45731298846296986, "grad_norm": 5.9580254554748535, "learning_rate": 9.543504171632895e-07, "logits/chosen": -2.0065691471099854, "logits/rejected": -2.177626132965088, "logps/chosen": -61.49897766113281, "logps/rejected": -80.16981506347656, "loss": 0.6626, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9234988689422607, "rewards/margins": 2.8023602962493896, "rewards/rejected": -1.8788613080978394, "step": 384 }, { "epoch": 0.4585039077037588, "grad_norm": 4.4135894775390625, "learning_rate": 9.542312276519665e-07, "logits/chosen": -2.1259937286376953, "logits/rejected": -2.2149569988250732, "logps/chosen": -65.74937438964844, "logps/rejected": -75.88553619384766, "loss": 0.8199, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11138676106929779, "rewards/margins": 1.6168580055236816, "rewards/rejected": -1.505471110343933, "step": 385 }, { "epoch": 0.45969482694454783, "grad_norm": 5.566812038421631, "learning_rate": 9.541120381406436e-07, "logits/chosen": -2.103306770324707, "logits/rejected": -2.156820774078369, "logps/chosen": -57.56303405761719, "logps/rejected": -64.21145629882812, "loss": 0.8679, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2184107303619385, "rewards/margins": 1.3068690299987793, "rewards/rejected": -0.08845829963684082, "step": 386 }, { "epoch": 0.4608857461853368, "grad_norm": 7.4872283935546875, "learning_rate": 9.539928486293207e-07, "logits/chosen": -2.124866485595703, "logits/rejected": -2.2249808311462402, "logps/chosen": -68.68611907958984, "logps/rejected": -72.93592834472656, "loss": 0.9299, "rewards/accuracies": 0.59375, "rewards/chosen": -0.561974048614502, "rewards/margins": 0.7530481219291687, "rewards/rejected": -1.3150219917297363, "step": 387 }, { "epoch": 0.4620766654261258, "grad_norm": 5.263156890869141, "learning_rate": 9.538736591179976e-07, "logits/chosen": -2.0254878997802734, "logits/rejected": -2.1239075660705566, "logps/chosen": -58.69213104248047, "logps/rejected": -78.84133911132812, "loss": 0.6361, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9041172862052917, "rewards/margins": 3.0658316612243652, "rewards/rejected": -2.161714553833008, "step": 388 }, { "epoch": 0.46326758466691476, "grad_norm": 5.118015766143799, "learning_rate": 9.537544696066746e-07, "logits/chosen": -2.0733399391174316, "logits/rejected": -2.1993319988250732, "logps/chosen": -65.90795135498047, "logps/rejected": -82.35076904296875, "loss": 0.7392, "rewards/accuracies": 0.75, "rewards/chosen": 0.15298688411712646, "rewards/margins": 2.406588554382324, "rewards/rejected": -2.2536017894744873, "step": 389 }, { "epoch": 0.46445850390770377, "grad_norm": 4.589547634124756, "learning_rate": 9.536352800953516e-07, "logits/chosen": -1.9614571332931519, "logits/rejected": -2.0885496139526367, "logps/chosen": -51.83538818359375, "logps/rejected": -79.23238372802734, "loss": 0.5755, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6367697715759277, "rewards/margins": 3.634571075439453, "rewards/rejected": -1.9978010654449463, "step": 390 }, { "epoch": 0.4656494231484927, "grad_norm": 4.592448711395264, "learning_rate": 9.535160905840286e-07, "logits/chosen": -2.07706356048584, "logits/rejected": -2.241504430770874, "logps/chosen": -59.9658203125, "logps/rejected": -88.30884552001953, "loss": 0.5666, "rewards/accuracies": 0.875, "rewards/chosen": 0.9946798086166382, "rewards/margins": 3.9566478729248047, "rewards/rejected": -2.961968183517456, "step": 391 }, { "epoch": 0.46684034238928174, "grad_norm": 9.315032005310059, "learning_rate": 9.533969010727055e-07, "logits/chosen": -1.931472897529602, "logits/rejected": -2.0894775390625, "logps/chosen": -57.29275894165039, "logps/rejected": -63.28105545043945, "loss": 0.6266, "rewards/accuracies": 0.75, "rewards/chosen": 2.238210916519165, "rewards/margins": 3.025779962539673, "rewards/rejected": -0.7875691652297974, "step": 392 }, { "epoch": 0.4680312616300707, "grad_norm": 5.823709011077881, "learning_rate": 9.532777115613826e-07, "logits/chosen": -2.1047170162200928, "logits/rejected": -2.217543840408325, "logps/chosen": -56.03072738647461, "logps/rejected": -78.45002746582031, "loss": 0.6406, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9501168131828308, "rewards/margins": 2.8594634532928467, "rewards/rejected": -1.9093466997146606, "step": 393 }, { "epoch": 0.4692221808708597, "grad_norm": 5.118168830871582, "learning_rate": 9.531585220500596e-07, "logits/chosen": -2.0554065704345703, "logits/rejected": -2.2108845710754395, "logps/chosen": -60.044193267822266, "logps/rejected": -84.81208801269531, "loss": 0.5501, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4131470918655396, "rewards/margins": 4.240612030029297, "rewards/rejected": -2.8274648189544678, "step": 394 }, { "epoch": 0.47041310011164866, "grad_norm": 7.380710124969482, "learning_rate": 9.530393325387365e-07, "logits/chosen": -2.022090196609497, "logits/rejected": -2.1729941368103027, "logps/chosen": -59.27381896972656, "logps/rejected": -76.38799285888672, "loss": 0.5838, "rewards/accuracies": 0.75, "rewards/chosen": 2.0460457801818848, "rewards/margins": 3.5625216960906982, "rewards/rejected": -1.516476035118103, "step": 395 }, { "epoch": 0.4716040193524377, "grad_norm": 3.899137496948242, "learning_rate": 9.529201430274135e-07, "logits/chosen": -2.071802854537964, "logits/rejected": -2.202205181121826, "logps/chosen": -63.62469482421875, "logps/rejected": -80.08545684814453, "loss": 0.7432, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6360763311386108, "rewards/margins": 2.6294374465942383, "rewards/rejected": -1.993360996246338, "step": 396 }, { "epoch": 0.47279493859322663, "grad_norm": 6.074102401733398, "learning_rate": 9.528009535160906e-07, "logits/chosen": -2.0353739261627197, "logits/rejected": -2.1979644298553467, "logps/chosen": -59.541954040527344, "logps/rejected": -81.5006332397461, "loss": 0.6099, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3792132139205933, "rewards/margins": 3.8650455474853516, "rewards/rejected": -2.4858319759368896, "step": 397 }, { "epoch": 0.47398585783401564, "grad_norm": 9.028854370117188, "learning_rate": 9.526817640047675e-07, "logits/chosen": -2.1093432903289795, "logits/rejected": -2.195864677429199, "logps/chosen": -58.793540954589844, "logps/rejected": -78.69255828857422, "loss": 0.7168, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6847477555274963, "rewards/margins": 2.3519465923309326, "rewards/rejected": -1.6671987771987915, "step": 398 }, { "epoch": 0.4751767770748046, "grad_norm": 5.4573469161987305, "learning_rate": 9.525625744934446e-07, "logits/chosen": -2.0539798736572266, "logits/rejected": -2.1958320140838623, "logps/chosen": -61.93581771850586, "logps/rejected": -85.57869720458984, "loss": 0.605, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2606730461120605, "rewards/margins": 4.177415370941162, "rewards/rejected": -2.9167423248291016, "step": 399 }, { "epoch": 0.4763676963155936, "grad_norm": 7.741009712219238, "learning_rate": 9.524433849821215e-07, "logits/chosen": -2.039515972137451, "logits/rejected": -2.1192805767059326, "logps/chosen": -56.54401397705078, "logps/rejected": -79.13418579101562, "loss": 0.5934, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9265093803405762, "rewards/margins": 3.5022308826446533, "rewards/rejected": -1.5757213830947876, "step": 400 }, { "epoch": 0.47755861555638257, "grad_norm": 8.35806655883789, "learning_rate": 9.523241954707986e-07, "logits/chosen": -2.005694627761841, "logits/rejected": -2.217440605163574, "logps/chosen": -65.83984375, "logps/rejected": -86.38520050048828, "loss": 0.6446, "rewards/accuracies": 0.78125, "rewards/chosen": 0.24870072305202484, "rewards/margins": 3.366122007369995, "rewards/rejected": -3.1174211502075195, "step": 401 }, { "epoch": 0.4787495347971716, "grad_norm": 4.755357265472412, "learning_rate": 9.522050059594755e-07, "logits/chosen": -2.0338289737701416, "logits/rejected": -2.1843113899230957, "logps/chosen": -62.298484802246094, "logps/rejected": -81.29685974121094, "loss": 0.5949, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6560587882995605, "rewards/margins": 3.6128973960876465, "rewards/rejected": -1.9568384885787964, "step": 402 }, { "epoch": 0.47994045403796054, "grad_norm": 5.135293960571289, "learning_rate": 9.520858164481526e-07, "logits/chosen": -2.1105504035949707, "logits/rejected": -2.1800992488861084, "logps/chosen": -60.48038101196289, "logps/rejected": -79.36955261230469, "loss": 0.5688, "rewards/accuracies": 0.75, "rewards/chosen": 1.6140934228897095, "rewards/margins": 3.602942943572998, "rewards/rejected": -1.988849401473999, "step": 403 }, { "epoch": 0.48113137327874955, "grad_norm": 5.392461776733398, "learning_rate": 9.519666269368295e-07, "logits/chosen": -2.0643181800842285, "logits/rejected": -2.161083936691284, "logps/chosen": -58.844879150390625, "logps/rejected": -77.64503479003906, "loss": 0.6864, "rewards/accuracies": 0.71875, "rewards/chosen": 1.333883285522461, "rewards/margins": 2.94034743309021, "rewards/rejected": -1.60646390914917, "step": 404 }, { "epoch": 0.4823222925195385, "grad_norm": 5.031263828277588, "learning_rate": 9.518474374255065e-07, "logits/chosen": -2.023526430130005, "logits/rejected": -2.0979814529418945, "logps/chosen": -59.255706787109375, "logps/rejected": -75.5120620727539, "loss": 0.6963, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5511705875396729, "rewards/margins": 2.837486505508423, "rewards/rejected": -1.286316156387329, "step": 405 }, { "epoch": 0.4835132117603275, "grad_norm": 3.988776683807373, "learning_rate": 9.517282479141835e-07, "logits/chosen": -1.9877779483795166, "logits/rejected": -2.1023480892181396, "logps/chosen": -65.88886260986328, "logps/rejected": -79.08589172363281, "loss": 0.7876, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19553399085998535, "rewards/margins": 2.0315496921539307, "rewards/rejected": -1.8360158205032349, "step": 406 }, { "epoch": 0.4847041310011165, "grad_norm": 5.310043811798096, "learning_rate": 9.516090584028606e-07, "logits/chosen": -2.059744358062744, "logits/rejected": -2.15596079826355, "logps/chosen": -59.2331428527832, "logps/rejected": -70.33969116210938, "loss": 0.8587, "rewards/accuracies": 0.65625, "rewards/chosen": 0.24825096130371094, "rewards/margins": 1.4088387489318848, "rewards/rejected": -1.1605875492095947, "step": 407 }, { "epoch": 0.4858950502419055, "grad_norm": 5.578313827514648, "learning_rate": 9.514898688915374e-07, "logits/chosen": -2.049237012863159, "logits/rejected": -2.025498867034912, "logps/chosen": -59.4207763671875, "logps/rejected": -65.84461975097656, "loss": 0.9474, "rewards/accuracies": 0.625, "rewards/chosen": 0.7227839231491089, "rewards/margins": 0.6141993403434753, "rewards/rejected": 0.10858480632305145, "step": 408 }, { "epoch": 0.48708596948269445, "grad_norm": 6.126920223236084, "learning_rate": 9.513706793802145e-07, "logits/chosen": -2.0583763122558594, "logits/rejected": -2.1982645988464355, "logps/chosen": -68.42807006835938, "logps/rejected": -77.58187103271484, "loss": 0.7109, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5912520289421082, "rewards/margins": 2.4932684898376465, "rewards/rejected": -1.902016282081604, "step": 409 }, { "epoch": 0.48827688872348346, "grad_norm": 5.582123756408691, "learning_rate": 9.512514898688915e-07, "logits/chosen": -2.124188184738159, "logits/rejected": -2.1708061695098877, "logps/chosen": -65.89220428466797, "logps/rejected": -82.47891998291016, "loss": 0.7095, "rewards/accuracies": 0.625, "rewards/chosen": 0.6353859901428223, "rewards/margins": 2.323373794555664, "rewards/rejected": -1.6879875659942627, "step": 410 }, { "epoch": 0.4894678079642724, "grad_norm": 5.047007083892822, "learning_rate": 9.511323003575686e-07, "logits/chosen": -2.0359389781951904, "logits/rejected": -2.20161509513855, "logps/chosen": -65.84139251708984, "logps/rejected": -80.87841033935547, "loss": 0.6411, "rewards/accuracies": 0.75, "rewards/chosen": 0.6478867530822754, "rewards/margins": 3.4794700145721436, "rewards/rejected": -2.831583261489868, "step": 411 }, { "epoch": 0.49065872720506143, "grad_norm": 7.54962158203125, "learning_rate": 9.510131108462455e-07, "logits/chosen": -2.075251817703247, "logits/rejected": -2.188967227935791, "logps/chosen": -61.86984634399414, "logps/rejected": -81.413330078125, "loss": 0.7136, "rewards/accuracies": 0.75, "rewards/chosen": 0.4468899369239807, "rewards/margins": 2.806140899658203, "rewards/rejected": -2.359250783920288, "step": 412 }, { "epoch": 0.4918496464458504, "grad_norm": 6.134336948394775, "learning_rate": 9.508939213349225e-07, "logits/chosen": -2.09818172454834, "logits/rejected": -2.1585352420806885, "logps/chosen": -65.53841400146484, "logps/rejected": -85.4222640991211, "loss": 0.7428, "rewards/accuracies": 0.84375, "rewards/chosen": -0.12303458154201508, "rewards/margins": 2.5141942501068115, "rewards/rejected": -2.6372287273406982, "step": 413 }, { "epoch": 0.4930405656866394, "grad_norm": 5.559001445770264, "learning_rate": 9.507747318235995e-07, "logits/chosen": -2.122840404510498, "logits/rejected": -2.177142858505249, "logps/chosen": -64.53176879882812, "logps/rejected": -71.03251647949219, "loss": 0.8604, "rewards/accuracies": 0.75, "rewards/chosen": 0.2555948793888092, "rewards/margins": 1.4709688425064087, "rewards/rejected": -1.2153738737106323, "step": 414 }, { "epoch": 0.49423148492742836, "grad_norm": 6.709038257598877, "learning_rate": 9.506555423122765e-07, "logits/chosen": -2.0267226696014404, "logits/rejected": -2.1282122135162354, "logps/chosen": -57.2590446472168, "logps/rejected": -83.84140014648438, "loss": 0.7297, "rewards/accuracies": 0.75, "rewards/chosen": 1.1288514137268066, "rewards/margins": 3.340319871902466, "rewards/rejected": -2.211468458175659, "step": 415 }, { "epoch": 0.49542240416821737, "grad_norm": 4.658398628234863, "learning_rate": 9.505363528009535e-07, "logits/chosen": -2.069049119949341, "logits/rejected": -2.158811092376709, "logps/chosen": -63.848785400390625, "logps/rejected": -81.49402618408203, "loss": 0.712, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4597509503364563, "rewards/margins": 2.735990524291992, "rewards/rejected": -2.2762396335601807, "step": 416 }, { "epoch": 0.4966133234090063, "grad_norm": 5.5168633460998535, "learning_rate": 9.504171632896305e-07, "logits/chosen": -2.1579997539520264, "logits/rejected": -2.221600294113159, "logps/chosen": -63.82947540283203, "logps/rejected": -77.32247161865234, "loss": 0.8965, "rewards/accuracies": 0.71875, "rewards/chosen": 0.041531819850206375, "rewards/margins": 1.797458529472351, "rewards/rejected": -1.7559268474578857, "step": 417 }, { "epoch": 0.49780424264979534, "grad_norm": 4.413315296173096, "learning_rate": 9.502979737783074e-07, "logits/chosen": -2.052731990814209, "logits/rejected": -2.164874315261841, "logps/chosen": -61.546470642089844, "logps/rejected": -81.37680053710938, "loss": 0.6761, "rewards/accuracies": 0.875, "rewards/chosen": 0.7579778432846069, "rewards/margins": 3.2286534309387207, "rewards/rejected": -2.4706759452819824, "step": 418 }, { "epoch": 0.4989951618905843, "grad_norm": 7.131612300872803, "learning_rate": 9.501787842669845e-07, "logits/chosen": -2.066992998123169, "logits/rejected": -2.153254508972168, "logps/chosen": -57.601131439208984, "logps/rejected": -76.2832260131836, "loss": 0.6414, "rewards/accuracies": 0.8125, "rewards/chosen": 1.270740032196045, "rewards/margins": 3.154682159423828, "rewards/rejected": -1.8839415311813354, "step": 419 }, { "epoch": 0.5001860811313733, "grad_norm": 4.2406511306762695, "learning_rate": 9.500595947556615e-07, "logits/chosen": -2.0410220623016357, "logits/rejected": -2.148576259613037, "logps/chosen": -61.825767517089844, "logps/rejected": -85.4156494140625, "loss": 0.5679, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9721894264221191, "rewards/margins": 3.481475591659546, "rewards/rejected": -2.5092861652374268, "step": 420 }, { "epoch": 0.5013770003721623, "grad_norm": 5.777607440948486, "learning_rate": 9.499404052443385e-07, "logits/chosen": -2.021477222442627, "logits/rejected": -2.0781233310699463, "logps/chosen": -64.37686157226562, "logps/rejected": -69.77725219726562, "loss": 0.8002, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9894933700561523, "rewards/margins": 1.904630184173584, "rewards/rejected": -0.9151369333267212, "step": 421 }, { "epoch": 0.5025679196129512, "grad_norm": 5.411812782287598, "learning_rate": 9.498212157330154e-07, "logits/chosen": -2.0742366313934326, "logits/rejected": -2.1960368156433105, "logps/chosen": -62.197242736816406, "logps/rejected": -81.46981811523438, "loss": 0.7069, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5512429475784302, "rewards/margins": 2.8386099338531494, "rewards/rejected": -2.2873668670654297, "step": 422 }, { "epoch": 0.5037588388537402, "grad_norm": 3.6406986713409424, "learning_rate": 9.497020262216925e-07, "logits/chosen": -2.080265760421753, "logits/rejected": -2.135622024536133, "logps/chosen": -63.45114517211914, "logps/rejected": -72.53724670410156, "loss": 0.8546, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5824188590049744, "rewards/margins": 1.5319311618804932, "rewards/rejected": -0.9495121240615845, "step": 423 }, { "epoch": 0.5049497580945292, "grad_norm": 7.073276042938232, "learning_rate": 9.495828367103694e-07, "logits/chosen": -2.061634063720703, "logits/rejected": -2.1407151222229004, "logps/chosen": -62.124202728271484, "logps/rejected": -83.67166137695312, "loss": 0.6303, "rewards/accuracies": 0.75, "rewards/chosen": 1.262550950050354, "rewards/margins": 3.515831232070923, "rewards/rejected": -2.2532801628112793, "step": 424 }, { "epoch": 0.5061406773353182, "grad_norm": 5.699594974517822, "learning_rate": 9.494636471990465e-07, "logits/chosen": -1.9886335134506226, "logits/rejected": -2.112421989440918, "logps/chosen": -62.22149658203125, "logps/rejected": -77.16929626464844, "loss": 0.8473, "rewards/accuracies": 0.65625, "rewards/chosen": 0.37519487738609314, "rewards/margins": 1.7837992906570435, "rewards/rejected": -1.408604383468628, "step": 425 }, { "epoch": 0.5073315965761072, "grad_norm": 5.06885290145874, "learning_rate": 9.493444576877234e-07, "logits/chosen": -2.098360300064087, "logits/rejected": -2.187927007675171, "logps/chosen": -66.91960906982422, "logps/rejected": -83.71408081054688, "loss": 0.7096, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5120065212249756, "rewards/margins": 2.926283359527588, "rewards/rejected": -2.4142770767211914, "step": 426 }, { "epoch": 0.5085225158168961, "grad_norm": 5.838263988494873, "learning_rate": 9.492252681764005e-07, "logits/chosen": -2.057260036468506, "logits/rejected": -2.132216215133667, "logps/chosen": -61.762451171875, "logps/rejected": -82.0263671875, "loss": 0.7091, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6513605117797852, "rewards/margins": 3.028524160385132, "rewards/rejected": -2.3771636486053467, "step": 427 }, { "epoch": 0.5097134350576852, "grad_norm": 5.054000377655029, "learning_rate": 9.491060786650774e-07, "logits/chosen": -1.9836682081222534, "logits/rejected": -2.0630533695220947, "logps/chosen": -57.56833267211914, "logps/rejected": -79.29310607910156, "loss": 0.5866, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4099249839782715, "rewards/margins": 3.4935920238494873, "rewards/rejected": -2.083667516708374, "step": 428 }, { "epoch": 0.5109043542984741, "grad_norm": 4.475489616394043, "learning_rate": 9.489868891537545e-07, "logits/chosen": -2.008068561553955, "logits/rejected": -2.095367670059204, "logps/chosen": -58.79857635498047, "logps/rejected": -78.88754272460938, "loss": 0.6312, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8362659215927124, "rewards/margins": 3.2613625526428223, "rewards/rejected": -2.4250965118408203, "step": 429 }, { "epoch": 0.5120952735392631, "grad_norm": 4.705774307250977, "learning_rate": 9.488676996424314e-07, "logits/chosen": -2.0187599658966064, "logits/rejected": -2.111584424972534, "logps/chosen": -60.369956970214844, "logps/rejected": -81.2163314819336, "loss": 0.7699, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5648252964019775, "rewards/margins": 2.580085039138794, "rewards/rejected": -2.0152595043182373, "step": 430 }, { "epoch": 0.513286192780052, "grad_norm": 4.082242488861084, "learning_rate": 9.487485101311084e-07, "logits/chosen": -2.016443967819214, "logits/rejected": -2.125413179397583, "logps/chosen": -67.62548065185547, "logps/rejected": -76.50028991699219, "loss": 0.7963, "rewards/accuracies": 0.59375, "rewards/chosen": 0.8014708757400513, "rewards/margins": 2.074507236480713, "rewards/rejected": -1.2730363607406616, "step": 431 }, { "epoch": 0.5144771120208411, "grad_norm": 5.349870681762695, "learning_rate": 9.486293206197854e-07, "logits/chosen": -2.005357265472412, "logits/rejected": -2.1655919551849365, "logps/chosen": -58.38141632080078, "logps/rejected": -83.27313995361328, "loss": 0.6316, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7782081365585327, "rewards/margins": 3.6939785480499268, "rewards/rejected": -2.9157705307006836, "step": 432 }, { "epoch": 0.5156680312616301, "grad_norm": 5.1222662925720215, "learning_rate": 9.485101311084625e-07, "logits/chosen": -2.0845422744750977, "logits/rejected": -2.1711385250091553, "logps/chosen": -69.3544921875, "logps/rejected": -83.21988677978516, "loss": 0.7211, "rewards/accuracies": 0.71875, "rewards/chosen": 0.051577091217041016, "rewards/margins": 2.6760928630828857, "rewards/rejected": -2.6245152950286865, "step": 433 }, { "epoch": 0.516858950502419, "grad_norm": 5.024703025817871, "learning_rate": 9.483909415971394e-07, "logits/chosen": -2.019563913345337, "logits/rejected": -2.134014129638672, "logps/chosen": -68.15892028808594, "logps/rejected": -87.27709197998047, "loss": 0.6859, "rewards/accuracies": 0.71875, "rewards/chosen": 0.35775601863861084, "rewards/margins": 2.943021297454834, "rewards/rejected": -2.5852653980255127, "step": 434 }, { "epoch": 0.518049869743208, "grad_norm": 5.2067341804504395, "learning_rate": 9.482717520858164e-07, "logits/chosen": -2.0537524223327637, "logits/rejected": -2.1650633811950684, "logps/chosen": -56.09587097167969, "logps/rejected": -78.11259460449219, "loss": 0.623, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2331197261810303, "rewards/margins": 3.3659443855285645, "rewards/rejected": -2.132824659347534, "step": 435 }, { "epoch": 0.5192407889839971, "grad_norm": 4.558169364929199, "learning_rate": 9.481525625744934e-07, "logits/chosen": -2.0779531002044678, "logits/rejected": -2.184032917022705, "logps/chosen": -66.09591674804688, "logps/rejected": -82.28178405761719, "loss": 0.7633, "rewards/accuracies": 0.625, "rewards/chosen": -0.054946985095739365, "rewards/margins": 2.0414469242095947, "rewards/rejected": -2.0963938236236572, "step": 436 }, { "epoch": 0.520431708224786, "grad_norm": 5.499298572540283, "learning_rate": 9.480333730631705e-07, "logits/chosen": -2.01016902923584, "logits/rejected": -2.164433717727661, "logps/chosen": -60.4771614074707, "logps/rejected": -73.96514892578125, "loss": 0.792, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6634389162063599, "rewards/margins": 2.2333433628082275, "rewards/rejected": -1.5699044466018677, "step": 437 }, { "epoch": 0.521622627465575, "grad_norm": 6.838984489440918, "learning_rate": 9.479141835518474e-07, "logits/chosen": -2.108638286590576, "logits/rejected": -2.1886916160583496, "logps/chosen": -64.15475463867188, "logps/rejected": -83.60176086425781, "loss": 0.7063, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11822853982448578, "rewards/margins": 2.9877383708953857, "rewards/rejected": -2.869509696960449, "step": 438 }, { "epoch": 0.5228135467063639, "grad_norm": 5.013105392456055, "learning_rate": 9.477949940405244e-07, "logits/chosen": -2.0367212295532227, "logits/rejected": -2.134021043777466, "logps/chosen": -66.86412811279297, "logps/rejected": -73.94998168945312, "loss": 0.8479, "rewards/accuracies": 0.625, "rewards/chosen": -0.06278449296951294, "rewards/margins": 1.6036516427993774, "rewards/rejected": -1.6664361953735352, "step": 439 }, { "epoch": 0.524004465947153, "grad_norm": 4.937006950378418, "learning_rate": 9.476758045292014e-07, "logits/chosen": -1.9879471063613892, "logits/rejected": -2.071777582168579, "logps/chosen": -61.23939514160156, "logps/rejected": -76.15548706054688, "loss": 0.7139, "rewards/accuracies": 0.71875, "rewards/chosen": 1.5234321355819702, "rewards/margins": 2.7579684257507324, "rewards/rejected": -1.2345365285873413, "step": 440 }, { "epoch": 0.525195385187942, "grad_norm": 6.23076057434082, "learning_rate": 9.475566150178784e-07, "logits/chosen": -2.0316622257232666, "logits/rejected": -2.151430606842041, "logps/chosen": -63.005470275878906, "logps/rejected": -80.26461029052734, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": 0.6260561943054199, "rewards/margins": 2.879939556121826, "rewards/rejected": -2.2538833618164062, "step": 441 }, { "epoch": 0.5263863044287309, "grad_norm": 4.864465713500977, "learning_rate": 9.474374255065554e-07, "logits/chosen": -2.0241079330444336, "logits/rejected": -2.182438373565674, "logps/chosen": -65.37443542480469, "logps/rejected": -89.14207458496094, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": 0.2579042911529541, "rewards/margins": 3.5167295932769775, "rewards/rejected": -3.2588253021240234, "step": 442 }, { "epoch": 0.5275772236695199, "grad_norm": 5.477908134460449, "learning_rate": 9.473182359952325e-07, "logits/chosen": -2.087125778198242, "logits/rejected": -2.174705982208252, "logps/chosen": -64.44747924804688, "logps/rejected": -86.09111785888672, "loss": 0.7164, "rewards/accuracies": 0.75, "rewards/chosen": 0.05260547995567322, "rewards/margins": 2.9498605728149414, "rewards/rejected": -2.8972551822662354, "step": 443 }, { "epoch": 0.5287681429103089, "grad_norm": 6.51650333404541, "learning_rate": 9.471990464839093e-07, "logits/chosen": -2.0869908332824707, "logits/rejected": -2.1878738403320312, "logps/chosen": -63.0596809387207, "logps/rejected": -82.18658447265625, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": 1.1739453077316284, "rewards/margins": 3.512042760848999, "rewards/rejected": -2.33809757232666, "step": 444 }, { "epoch": 0.5299590621510979, "grad_norm": 5.347571849822998, "learning_rate": 9.470798569725864e-07, "logits/chosen": -2.1463077068328857, "logits/rejected": -2.185598134994507, "logps/chosen": -66.52043914794922, "logps/rejected": -82.32789611816406, "loss": 0.8878, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5627257227897644, "rewards/margins": 1.3730669021606445, "rewards/rejected": -1.9357929229736328, "step": 445 }, { "epoch": 0.5311499813918868, "grad_norm": 8.213510513305664, "learning_rate": 9.469606674612634e-07, "logits/chosen": -2.0238864421844482, "logits/rejected": -2.113452911376953, "logps/chosen": -59.86419677734375, "logps/rejected": -80.09732818603516, "loss": 0.6423, "rewards/accuracies": 0.875, "rewards/chosen": 1.09601628780365, "rewards/margins": 3.142940044403076, "rewards/rejected": -2.0469236373901367, "step": 446 }, { "epoch": 0.5323409006326758, "grad_norm": 6.038510322570801, "learning_rate": 9.468414779499404e-07, "logits/chosen": -2.122096061706543, "logits/rejected": -2.204254150390625, "logps/chosen": -63.84870147705078, "logps/rejected": -76.9211196899414, "loss": 0.858, "rewards/accuracies": 0.625, "rewards/chosen": -0.035798296332359314, "rewards/margins": 1.811650276184082, "rewards/rejected": -1.8474485874176025, "step": 447 }, { "epoch": 0.5335318198734649, "grad_norm": 4.573263168334961, "learning_rate": 9.467222884386173e-07, "logits/chosen": -2.0297458171844482, "logits/rejected": -2.1657066345214844, "logps/chosen": -58.729331970214844, "logps/rejected": -74.1070785522461, "loss": 0.7557, "rewards/accuracies": 0.75, "rewards/chosen": 0.4878809452056885, "rewards/margins": 1.855330467224121, "rewards/rejected": -1.367449402809143, "step": 448 }, { "epoch": 0.5347227391142538, "grad_norm": 8.563860893249512, "learning_rate": 9.466030989272944e-07, "logits/chosen": -2.099313259124756, "logits/rejected": -2.219266414642334, "logps/chosen": -56.59629440307617, "logps/rejected": -81.81945037841797, "loss": 0.6522, "rewards/accuracies": 0.75, "rewards/chosen": 0.8270818591117859, "rewards/margins": 3.037867546081543, "rewards/rejected": -2.2107856273651123, "step": 449 }, { "epoch": 0.5359136583550428, "grad_norm": 6.982388019561768, "learning_rate": 9.464839094159713e-07, "logits/chosen": -2.046339273452759, "logits/rejected": -2.201575994491577, "logps/chosen": -57.976036071777344, "logps/rejected": -74.43959045410156, "loss": 0.6199, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4652165174484253, "rewards/margins": 3.4150474071502686, "rewards/rejected": -1.949831247329712, "step": 450 }, { "epoch": 0.5371045775958317, "grad_norm": 4.467795372009277, "learning_rate": 9.463647199046484e-07, "logits/chosen": -2.041494846343994, "logits/rejected": -2.119837999343872, "logps/chosen": -51.869964599609375, "logps/rejected": -67.6826171875, "loss": 0.7844, "rewards/accuracies": 0.71875, "rewards/chosen": 1.8310627937316895, "rewards/margins": 2.4466567039489746, "rewards/rejected": -0.6155937910079956, "step": 451 }, { "epoch": 0.5382954968366208, "grad_norm": 3.372605323791504, "learning_rate": 9.462455303933253e-07, "logits/chosen": -2.178441047668457, "logits/rejected": -2.2420666217803955, "logps/chosen": -61.22886276245117, "logps/rejected": -78.12798309326172, "loss": 0.6787, "rewards/accuracies": 0.75, "rewards/chosen": 1.5965471267700195, "rewards/margins": 2.716207504272461, "rewards/rejected": -1.1196606159210205, "step": 452 }, { "epoch": 0.5394864160774098, "grad_norm": 5.93565034866333, "learning_rate": 9.461263408820024e-07, "logits/chosen": -2.0750679969787598, "logits/rejected": -2.1671037673950195, "logps/chosen": -59.94355392456055, "logps/rejected": -82.5330810546875, "loss": 0.5769, "rewards/accuracies": 0.8125, "rewards/chosen": 1.449497103691101, "rewards/margins": 3.711385488510132, "rewards/rejected": -2.2618887424468994, "step": 453 }, { "epoch": 0.5406773353181987, "grad_norm": 5.866143226623535, "learning_rate": 9.460071513706793e-07, "logits/chosen": -2.0230939388275146, "logits/rejected": -2.0904812812805176, "logps/chosen": -69.02767944335938, "logps/rejected": -81.68034362792969, "loss": 0.7022, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5215569138526917, "rewards/margins": 2.3278865814208984, "rewards/rejected": -1.806329607963562, "step": 454 }, { "epoch": 0.5418682545589877, "grad_norm": 4.5576934814453125, "learning_rate": 9.458879618593564e-07, "logits/chosen": -2.0981392860412598, "logits/rejected": -2.1523351669311523, "logps/chosen": -59.0096321105957, "logps/rejected": -77.22565460205078, "loss": 0.6154, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4477345943450928, "rewards/margins": 3.326836585998535, "rewards/rejected": -1.8791019916534424, "step": 455 }, { "epoch": 0.5430591737997768, "grad_norm": 7.0909929275512695, "learning_rate": 9.457687723480334e-07, "logits/chosen": -2.0256507396698, "logits/rejected": -2.093125581741333, "logps/chosen": -61.308502197265625, "logps/rejected": -78.03748321533203, "loss": 0.7121, "rewards/accuracies": 0.75, "rewards/chosen": 0.9984704852104187, "rewards/margins": 2.5233848094940186, "rewards/rejected": -1.5249141454696655, "step": 456 }, { "epoch": 0.5442500930405657, "grad_norm": 6.982992172241211, "learning_rate": 9.456495828367103e-07, "logits/chosen": -2.0230817794799805, "logits/rejected": -2.1543471813201904, "logps/chosen": -59.802459716796875, "logps/rejected": -78.90797424316406, "loss": 0.6245, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7276856899261475, "rewards/margins": 3.403764247894287, "rewards/rejected": -1.6760783195495605, "step": 457 }, { "epoch": 0.5454410122813547, "grad_norm": 7.607675552368164, "learning_rate": 9.455303933253873e-07, "logits/chosen": -2.058309555053711, "logits/rejected": -2.1837661266326904, "logps/chosen": -58.430816650390625, "logps/rejected": -84.23172760009766, "loss": 0.6862, "rewards/accuracies": 0.71875, "rewards/chosen": 0.37371230125427246, "rewards/margins": 2.9801509380340576, "rewards/rejected": -2.606438636779785, "step": 458 }, { "epoch": 0.5466319315221436, "grad_norm": 4.4723334312438965, "learning_rate": 9.454112038140644e-07, "logits/chosen": -2.0461020469665527, "logits/rejected": -2.1218464374542236, "logps/chosen": -50.55552673339844, "logps/rejected": -73.2822265625, "loss": 0.7185, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4523391723632812, "rewards/margins": 2.3266799449920654, "rewards/rejected": -0.8743405342102051, "step": 459 }, { "epoch": 0.5478228507629327, "grad_norm": 3.484758138656616, "learning_rate": 9.452920143027413e-07, "logits/chosen": -2.046983003616333, "logits/rejected": -2.1787965297698975, "logps/chosen": -57.92928695678711, "logps/rejected": -77.86680603027344, "loss": 0.6626, "rewards/accuracies": 0.71875, "rewards/chosen": 1.484151005744934, "rewards/margins": 3.1696243286132812, "rewards/rejected": -1.685473084449768, "step": 460 }, { "epoch": 0.5490137700037216, "grad_norm": 6.116722583770752, "learning_rate": 9.451728247914183e-07, "logits/chosen": -2.060929775238037, "logits/rejected": -2.1270384788513184, "logps/chosen": -54.80955505371094, "logps/rejected": -75.00919342041016, "loss": 0.6132, "rewards/accuracies": 0.875, "rewards/chosen": 1.576910138130188, "rewards/margins": 2.8777084350585938, "rewards/rejected": -1.3007984161376953, "step": 461 }, { "epoch": 0.5502046892445106, "grad_norm": 5.41518497467041, "learning_rate": 9.450536352800953e-07, "logits/chosen": -1.9820048809051514, "logits/rejected": -2.0456326007843018, "logps/chosen": -64.10653686523438, "logps/rejected": -73.09156036376953, "loss": 0.7816, "rewards/accuracies": 0.59375, "rewards/chosen": 0.8672045469284058, "rewards/margins": 1.8126507997512817, "rewards/rejected": -0.9454463720321655, "step": 462 }, { "epoch": 0.5513956084852996, "grad_norm": 4.9168782234191895, "learning_rate": 9.449344457687724e-07, "logits/chosen": -2.072232484817505, "logits/rejected": -2.186692237854004, "logps/chosen": -53.77753448486328, "logps/rejected": -78.6878433227539, "loss": 0.5714, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2949191331863403, "rewards/margins": 3.5157687664031982, "rewards/rejected": -2.2208495140075684, "step": 463 }, { "epoch": 0.5525865277260886, "grad_norm": 4.613181114196777, "learning_rate": 9.448152562574493e-07, "logits/chosen": -1.975500464439392, "logits/rejected": -2.1350765228271484, "logps/chosen": -56.9305305480957, "logps/rejected": -82.65409088134766, "loss": 0.6378, "rewards/accuracies": 0.75, "rewards/chosen": 1.4439586400985718, "rewards/margins": 3.178760051727295, "rewards/rejected": -1.7348015308380127, "step": 464 }, { "epoch": 0.5537774469668776, "grad_norm": 5.025057315826416, "learning_rate": 9.446960667461263e-07, "logits/chosen": -2.0035016536712646, "logits/rejected": -2.1164016723632812, "logps/chosen": -60.33737564086914, "logps/rejected": -80.81794738769531, "loss": 0.7052, "rewards/accuracies": 0.75, "rewards/chosen": 0.7851480841636658, "rewards/margins": 2.6898789405822754, "rewards/rejected": -1.9047307968139648, "step": 465 }, { "epoch": 0.5549683662076665, "grad_norm": 4.705110549926758, "learning_rate": 9.445768772348033e-07, "logits/chosen": -2.0718464851379395, "logits/rejected": -2.1388370990753174, "logps/chosen": -68.74627685546875, "logps/rejected": -84.00959777832031, "loss": 0.7796, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08904676884412766, "rewards/margins": 2.217409372329712, "rewards/rejected": -2.3064560890197754, "step": 466 }, { "epoch": 0.5561592854484555, "grad_norm": 5.371584415435791, "learning_rate": 9.444576877234803e-07, "logits/chosen": -2.0725972652435303, "logits/rejected": -2.216618537902832, "logps/chosen": -64.27254486083984, "logps/rejected": -86.4856948852539, "loss": 0.6619, "rewards/accuracies": 0.90625, "rewards/chosen": 0.47188788652420044, "rewards/margins": 2.9455413818359375, "rewards/rejected": -2.4736533164978027, "step": 467 }, { "epoch": 0.5573502046892446, "grad_norm": 7.353106498718262, "learning_rate": 9.443384982121573e-07, "logits/chosen": -2.0677876472473145, "logits/rejected": -2.187809467315674, "logps/chosen": -63.87247848510742, "logps/rejected": -83.17523193359375, "loss": 0.8383, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11417033523321152, "rewards/margins": 2.065004348754883, "rewards/rejected": -2.1791746616363525, "step": 468 }, { "epoch": 0.5585411239300335, "grad_norm": 5.325639724731445, "learning_rate": 9.442193087008344e-07, "logits/chosen": -2.050110340118408, "logits/rejected": -2.2243266105651855, "logps/chosen": -61.48550796508789, "logps/rejected": -77.02227020263672, "loss": 0.7229, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9951452612876892, "rewards/margins": 2.814791440963745, "rewards/rejected": -1.8196463584899902, "step": 469 }, { "epoch": 0.5597320431708225, "grad_norm": 6.015285491943359, "learning_rate": 9.441001191895112e-07, "logits/chosen": -2.081470012664795, "logits/rejected": -2.195378303527832, "logps/chosen": -67.0416259765625, "logps/rejected": -82.08342742919922, "loss": 0.7562, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3975358307361603, "rewards/margins": 2.3028981685638428, "rewards/rejected": -1.9053624868392944, "step": 470 }, { "epoch": 0.5609229624116114, "grad_norm": 4.000321865081787, "learning_rate": 9.439809296781883e-07, "logits/chosen": -2.069586753845215, "logits/rejected": -2.1068146228790283, "logps/chosen": -61.25200271606445, "logps/rejected": -72.50525665283203, "loss": 0.718, "rewards/accuracies": 0.75, "rewards/chosen": 1.1674331426620483, "rewards/margins": 2.4884467124938965, "rewards/rejected": -1.3210134506225586, "step": 471 }, { "epoch": 0.5621138816524004, "grad_norm": 8.570185661315918, "learning_rate": 9.438617401668653e-07, "logits/chosen": -2.0533576011657715, "logits/rejected": -2.1571695804595947, "logps/chosen": -64.52731323242188, "logps/rejected": -74.85494995117188, "loss": 0.6944, "rewards/accuracies": 0.75, "rewards/chosen": 1.3142201900482178, "rewards/margins": 2.7825756072998047, "rewards/rejected": -1.468355417251587, "step": 472 }, { "epoch": 0.5633048008931895, "grad_norm": 5.745954990386963, "learning_rate": 9.437425506555423e-07, "logits/chosen": -2.06998872756958, "logits/rejected": -2.136941909790039, "logps/chosen": -55.52248001098633, "logps/rejected": -87.38365173339844, "loss": 0.7119, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9747969508171082, "rewards/margins": 2.919600009918213, "rewards/rejected": -1.9448028802871704, "step": 473 }, { "epoch": 0.5644957201339784, "grad_norm": 8.536637306213379, "learning_rate": 9.436233611442192e-07, "logits/chosen": -2.0700197219848633, "logits/rejected": -2.118279457092285, "logps/chosen": -64.55198669433594, "logps/rejected": -78.70846557617188, "loss": 0.7496, "rewards/accuracies": 0.78125, "rewards/chosen": 0.883358359336853, "rewards/margins": 2.648012638092041, "rewards/rejected": -1.764654278755188, "step": 474 }, { "epoch": 0.5656866393747674, "grad_norm": 6.204571723937988, "learning_rate": 9.435041716328963e-07, "logits/chosen": -2.0773355960845947, "logits/rejected": -2.186403274536133, "logps/chosen": -61.88854217529297, "logps/rejected": -81.1494140625, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": 0.6524734497070312, "rewards/margins": 3.012601852416992, "rewards/rejected": -2.36012864112854, "step": 475 }, { "epoch": 0.5668775586155563, "grad_norm": 4.723480701446533, "learning_rate": 9.433849821215733e-07, "logits/chosen": -2.0293400287628174, "logits/rejected": -2.144986629486084, "logps/chosen": -60.12078857421875, "logps/rejected": -82.41568756103516, "loss": 0.6284, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6224037408828735, "rewards/margins": 3.812221050262451, "rewards/rejected": -2.189817190170288, "step": 476 }, { "epoch": 0.5680684778563454, "grad_norm": 5.249765396118164, "learning_rate": 9.432657926102503e-07, "logits/chosen": -2.085609197616577, "logits/rejected": -2.1770782470703125, "logps/chosen": -62.4950065612793, "logps/rejected": -76.47200012207031, "loss": 0.7014, "rewards/accuracies": 0.6875, "rewards/chosen": 0.45935362577438354, "rewards/margins": 2.327876567840576, "rewards/rejected": -1.8685228824615479, "step": 477 }, { "epoch": 0.5692593970971344, "grad_norm": 6.427459716796875, "learning_rate": 9.431466030989273e-07, "logits/chosen": -2.0169570446014404, "logits/rejected": -2.1322011947631836, "logps/chosen": -53.95726013183594, "logps/rejected": -78.6175537109375, "loss": 0.5775, "rewards/accuracies": 0.75, "rewards/chosen": 1.565946340560913, "rewards/margins": 3.4554877281188965, "rewards/rejected": -1.8895412683486938, "step": 478 }, { "epoch": 0.5704503163379233, "grad_norm": 5.790453910827637, "learning_rate": 9.430274135876043e-07, "logits/chosen": -2.004453420639038, "logits/rejected": -2.1276731491088867, "logps/chosen": -63.903167724609375, "logps/rejected": -79.64225769042969, "loss": 0.6901, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8706890940666199, "rewards/margins": 2.812974452972412, "rewards/rejected": -1.9422850608825684, "step": 479 }, { "epoch": 0.5716412355787123, "grad_norm": 5.520792007446289, "learning_rate": 9.429082240762812e-07, "logits/chosen": -2.078685760498047, "logits/rejected": -2.1732077598571777, "logps/chosen": -59.1612663269043, "logps/rejected": -83.11751556396484, "loss": 0.701, "rewards/accuracies": 0.75, "rewards/chosen": 1.0776560306549072, "rewards/margins": 3.1866910457611084, "rewards/rejected": -2.1090352535247803, "step": 480 }, { "epoch": 0.5728321548195013, "grad_norm": 4.430825233459473, "learning_rate": 9.427890345649583e-07, "logits/chosen": -2.0297274589538574, "logits/rejected": -2.1665005683898926, "logps/chosen": -63.637447357177734, "logps/rejected": -80.89619445800781, "loss": 0.6996, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0844756364822388, "rewards/margins": 3.298743724822998, "rewards/rejected": -2.2142677307128906, "step": 481 }, { "epoch": 0.5740230740602903, "grad_norm": 5.667402267456055, "learning_rate": 9.426698450536353e-07, "logits/chosen": -2.082949161529541, "logits/rejected": -2.204157590866089, "logps/chosen": -69.23554229736328, "logps/rejected": -81.50869750976562, "loss": 0.7439, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4507637023925781, "rewards/margins": 2.4129726886749268, "rewards/rejected": -1.962208867073059, "step": 482 }, { "epoch": 0.5752139933010793, "grad_norm": 5.858860015869141, "learning_rate": 9.425506555423122e-07, "logits/chosen": -2.0648882389068604, "logits/rejected": -2.155804395675659, "logps/chosen": -65.02101135253906, "logps/rejected": -74.32233428955078, "loss": 0.7402, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7207483053207397, "rewards/margins": 2.411341905593872, "rewards/rejected": -1.6905936002731323, "step": 483 }, { "epoch": 0.5764049125418682, "grad_norm": 6.6823272705078125, "learning_rate": 9.424314660309892e-07, "logits/chosen": -2.11752986907959, "logits/rejected": -2.220902442932129, "logps/chosen": -66.50628662109375, "logps/rejected": -83.19381713867188, "loss": 0.8262, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36037716269493103, "rewards/margins": 1.813175916671753, "rewards/rejected": -2.173552989959717, "step": 484 }, { "epoch": 0.5775958317826573, "grad_norm": 5.48204231262207, "learning_rate": 9.423122765196663e-07, "logits/chosen": -2.029283285140991, "logits/rejected": -2.1579926013946533, "logps/chosen": -56.168636322021484, "logps/rejected": -73.98223114013672, "loss": 0.6741, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9464073181152344, "rewards/margins": 3.1922268867492676, "rewards/rejected": -1.245819330215454, "step": 485 }, { "epoch": 0.5787867510234462, "grad_norm": 5.393186569213867, "learning_rate": 9.421930870083432e-07, "logits/chosen": -2.0383048057556152, "logits/rejected": -2.136166572570801, "logps/chosen": -63.58411407470703, "logps/rejected": -81.44671630859375, "loss": 0.7827, "rewards/accuracies": 0.6875, "rewards/chosen": 0.40998756885528564, "rewards/margins": 2.2500545978546143, "rewards/rejected": -1.8400671482086182, "step": 486 }, { "epoch": 0.5799776702642352, "grad_norm": 4.934922695159912, "learning_rate": 9.420738974970202e-07, "logits/chosen": -2.0274033546447754, "logits/rejected": -2.172438621520996, "logps/chosen": -61.84455871582031, "logps/rejected": -79.23541259765625, "loss": 0.6581, "rewards/accuracies": 0.78125, "rewards/chosen": 1.006157398223877, "rewards/margins": 3.2600135803222656, "rewards/rejected": -2.2538561820983887, "step": 487 }, { "epoch": 0.5811685895050241, "grad_norm": 5.69887113571167, "learning_rate": 9.419547079856972e-07, "logits/chosen": -2.079094409942627, "logits/rejected": -2.2245049476623535, "logps/chosen": -63.126739501953125, "logps/rejected": -85.2304458618164, "loss": 0.668, "rewards/accuracies": 0.84375, "rewards/chosen": 0.24565014243125916, "rewards/margins": 2.871412754058838, "rewards/rejected": -2.625762462615967, "step": 488 }, { "epoch": 0.5823595087458132, "grad_norm": 6.644979953765869, "learning_rate": 9.418355184743743e-07, "logits/chosen": -1.9743406772613525, "logits/rejected": -2.0459606647491455, "logps/chosen": -63.987091064453125, "logps/rejected": -76.39976501464844, "loss": 0.7082, "rewards/accuracies": 0.75, "rewards/chosen": 1.0293091535568237, "rewards/margins": 2.749526023864746, "rewards/rejected": -1.7202171087265015, "step": 489 }, { "epoch": 0.5835504279866022, "grad_norm": 5.009417533874512, "learning_rate": 9.417163289630512e-07, "logits/chosen": -2.1236982345581055, "logits/rejected": -2.1758909225463867, "logps/chosen": -61.82400894165039, "logps/rejected": -78.65525817871094, "loss": 0.8203, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01632719486951828, "rewards/margins": 1.510299801826477, "rewards/rejected": -1.5266269445419312, "step": 490 }, { "epoch": 0.5847413472273911, "grad_norm": 7.540627956390381, "learning_rate": 9.415971394517283e-07, "logits/chosen": -2.0954360961914062, "logits/rejected": -2.2385473251342773, "logps/chosen": -58.605690002441406, "logps/rejected": -83.24158477783203, "loss": 0.5854, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6407978534698486, "rewards/margins": 4.206471920013428, "rewards/rejected": -2.565674066543579, "step": 491 }, { "epoch": 0.5859322664681801, "grad_norm": 4.90788459777832, "learning_rate": 9.414779499404052e-07, "logits/chosen": -2.101508378982544, "logits/rejected": -2.104369878768921, "logps/chosen": -59.191898345947266, "logps/rejected": -87.21234893798828, "loss": 0.6517, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3957775831222534, "rewards/margins": 3.415158748626709, "rewards/rejected": -2.019381284713745, "step": 492 }, { "epoch": 0.5871231857089692, "grad_norm": 4.381888389587402, "learning_rate": 9.413587604290822e-07, "logits/chosen": -2.0902512073516846, "logits/rejected": -2.200296401977539, "logps/chosen": -55.33375549316406, "logps/rejected": -82.576171875, "loss": 0.5698, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9197062253952026, "rewards/margins": 4.048729419708252, "rewards/rejected": -2.1290230751037598, "step": 493 }, { "epoch": 0.5883141049497581, "grad_norm": 5.851631164550781, "learning_rate": 9.412395709177592e-07, "logits/chosen": -2.0968637466430664, "logits/rejected": -2.2670934200286865, "logps/chosen": -56.73735046386719, "logps/rejected": -82.4952621459961, "loss": 0.7397, "rewards/accuracies": 0.65625, "rewards/chosen": 0.8379700183868408, "rewards/margins": 2.840369939804077, "rewards/rejected": -2.0023999214172363, "step": 494 }, { "epoch": 0.5895050241905471, "grad_norm": 5.706981658935547, "learning_rate": 9.411203814064363e-07, "logits/chosen": -2.0256106853485107, "logits/rejected": -2.066960096359253, "logps/chosen": -61.96274185180664, "logps/rejected": -76.6291732788086, "loss": 0.7932, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7916320562362671, "rewards/margins": 2.2485616207122803, "rewards/rejected": -1.4569295644760132, "step": 495 }, { "epoch": 0.590695943431336, "grad_norm": 7.2781500816345215, "learning_rate": 9.410011918951131e-07, "logits/chosen": -2.0752134323120117, "logits/rejected": -2.186281442642212, "logps/chosen": -61.73458480834961, "logps/rejected": -82.06429290771484, "loss": 0.7397, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5085170865058899, "rewards/margins": 2.8417956829071045, "rewards/rejected": -2.3332784175872803, "step": 496 }, { "epoch": 0.5918868626721251, "grad_norm": 7.620236873626709, "learning_rate": 9.408820023837902e-07, "logits/chosen": -2.1094846725463867, "logits/rejected": -2.2274646759033203, "logps/chosen": -64.03652954101562, "logps/rejected": -82.7719497680664, "loss": 0.73, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7936136722564697, "rewards/margins": 2.942509174346924, "rewards/rejected": -2.148895740509033, "step": 497 }, { "epoch": 0.593077781912914, "grad_norm": 6.440336227416992, "learning_rate": 9.407628128724672e-07, "logits/chosen": -2.110004186630249, "logits/rejected": -2.1497015953063965, "logps/chosen": -67.4857406616211, "logps/rejected": -68.76017761230469, "loss": 0.9452, "rewards/accuracies": 0.5625, "rewards/chosen": 0.022750310599803925, "rewards/margins": 0.8242576718330383, "rewards/rejected": -0.8015074133872986, "step": 498 }, { "epoch": 0.594268701153703, "grad_norm": 5.746518611907959, "learning_rate": 9.406436233611442e-07, "logits/chosen": -2.083132266998291, "logits/rejected": -2.1699016094207764, "logps/chosen": -55.99351501464844, "logps/rejected": -78.3394775390625, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": 1.310661792755127, "rewards/margins": 2.8697633743286133, "rewards/rejected": -1.5591018199920654, "step": 499 }, { "epoch": 0.595459620394492, "grad_norm": 4.744858264923096, "learning_rate": 9.405244338498212e-07, "logits/chosen": -2.094606637954712, "logits/rejected": -2.1400182247161865, "logps/chosen": -62.052001953125, "logps/rejected": -68.87946319580078, "loss": 0.8413, "rewards/accuracies": 0.5625, "rewards/chosen": 0.48311010003089905, "rewards/margins": 1.3090307712554932, "rewards/rejected": -0.825920581817627, "step": 500 }, { "epoch": 0.596650539635281, "grad_norm": 5.027463912963867, "learning_rate": 9.404052443384982e-07, "logits/chosen": -2.0472183227539062, "logits/rejected": -2.0808663368225098, "logps/chosen": -61.839195251464844, "logps/rejected": -69.37129974365234, "loss": 0.9709, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2317502200603485, "rewards/margins": 0.49988698959350586, "rewards/rejected": -0.26813679933547974, "step": 501 }, { "epoch": 0.59784145887607, "grad_norm": 6.217766761779785, "learning_rate": 9.402860548271752e-07, "logits/chosen": -2.114854335784912, "logits/rejected": -2.1907217502593994, "logps/chosen": -58.199893951416016, "logps/rejected": -80.0832748413086, "loss": 0.6155, "rewards/accuracies": 0.6875, "rewards/chosen": 1.5619364976882935, "rewards/margins": 3.53302001953125, "rewards/rejected": -1.9710835218429565, "step": 502 }, { "epoch": 0.5990323781168589, "grad_norm": 5.199097633361816, "learning_rate": 9.401668653158522e-07, "logits/chosen": -2.109105348587036, "logits/rejected": -2.211841344833374, "logps/chosen": -66.9583969116211, "logps/rejected": -84.06356811523438, "loss": 0.8379, "rewards/accuracies": 0.71875, "rewards/chosen": -0.039437174797058105, "rewards/margins": 1.8774163722991943, "rewards/rejected": -1.9168537855148315, "step": 503 }, { "epoch": 0.6002232973576479, "grad_norm": 4.221584796905518, "learning_rate": 9.400476758045292e-07, "logits/chosen": -2.052877426147461, "logits/rejected": -2.1138689517974854, "logps/chosen": -60.37678146362305, "logps/rejected": -73.22274017333984, "loss": 0.7922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8583395481109619, "rewards/margins": 1.827189564704895, "rewards/rejected": -0.9688501358032227, "step": 504 }, { "epoch": 0.6002232973576479, "eval_logits/chosen": -2.077465295791626, "eval_logits/rejected": -2.1846256256103516, "eval_logps/chosen": -60.6956787109375, "eval_logps/rejected": -75.1336669921875, "eval_loss": 0.7192278504371643, "eval_rewards/accuracies": 0.7318007946014404, "eval_rewards/chosen": 1.0014444589614868, "eval_rewards/margins": 2.5325419902801514, "eval_rewards/rejected": -1.5310975313186646, "eval_runtime": 1885.1242, "eval_samples_per_second": 0.553, "eval_steps_per_second": 0.277, "step": 504 }, { "epoch": 0.601414216598437, "grad_norm": 8.571722984313965, "learning_rate": 9.399284862932062e-07, "logits/chosen": -2.076796054840088, "logits/rejected": -2.1405928134918213, "logps/chosen": -61.58353805541992, "logps/rejected": -84.10604858398438, "loss": 0.8176, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29530298709869385, "rewards/margins": 2.338151693344116, "rewards/rejected": -2.042848825454712, "step": 505 }, { "epoch": 0.6026051358392259, "grad_norm": 5.664702892303467, "learning_rate": 9.398092967818831e-07, "logits/chosen": -2.030216932296753, "logits/rejected": -2.1094212532043457, "logps/chosen": -53.30809020996094, "logps/rejected": -73.69144439697266, "loss": 0.7061, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7011232376098633, "rewards/margins": 3.055046796798706, "rewards/rejected": -1.3539237976074219, "step": 506 }, { "epoch": 0.6037960550800149, "grad_norm": 6.496354103088379, "learning_rate": 9.396901072705602e-07, "logits/chosen": -2.0192947387695312, "logits/rejected": -2.1237635612487793, "logps/chosen": -55.810218811035156, "logps/rejected": -68.19090270996094, "loss": 0.6669, "rewards/accuracies": 0.78125, "rewards/chosen": 1.7526839971542358, "rewards/margins": 2.85787034034729, "rewards/rejected": -1.1051864624023438, "step": 507 }, { "epoch": 0.6049869743208038, "grad_norm": 5.008580684661865, "learning_rate": 9.395709177592372e-07, "logits/chosen": -2.0599722862243652, "logits/rejected": -2.1920480728149414, "logps/chosen": -64.8340072631836, "logps/rejected": -86.55136108398438, "loss": 0.606, "rewards/accuracies": 0.75, "rewards/chosen": 1.3779447078704834, "rewards/margins": 3.6808454990386963, "rewards/rejected": -2.302900791168213, "step": 508 }, { "epoch": 0.6061778935615929, "grad_norm": 6.014128684997559, "learning_rate": 9.394517282479141e-07, "logits/chosen": -2.069884777069092, "logits/rejected": -2.132938861846924, "logps/chosen": -54.890769958496094, "logps/rejected": -67.34212493896484, "loss": 0.7458, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3446096181869507, "rewards/margins": 2.3242430686950684, "rewards/rejected": -0.9796334505081177, "step": 509 }, { "epoch": 0.6073688128023819, "grad_norm": 4.770650863647461, "learning_rate": 9.393325387365911e-07, "logits/chosen": -2.077300548553467, "logits/rejected": -2.161210298538208, "logps/chosen": -50.366939544677734, "logps/rejected": -77.94657135009766, "loss": 0.6226, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6955677270889282, "rewards/margins": 3.471684455871582, "rewards/rejected": -1.7761163711547852, "step": 510 }, { "epoch": 0.6085597320431708, "grad_norm": 6.351809978485107, "learning_rate": 9.392133492252682e-07, "logits/chosen": -2.0904436111450195, "logits/rejected": -2.14642071723938, "logps/chosen": -56.81629180908203, "logps/rejected": -78.36703491210938, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": 1.451798439025879, "rewards/margins": 3.2690629959106445, "rewards/rejected": -1.817264437675476, "step": 511 }, { "epoch": 0.6097506512839598, "grad_norm": 4.512458801269531, "learning_rate": 9.390941597139451e-07, "logits/chosen": -2.082821846008301, "logits/rejected": -2.168276786804199, "logps/chosen": -60.695064544677734, "logps/rejected": -72.27941131591797, "loss": 0.76, "rewards/accuracies": 0.75, "rewards/chosen": 0.6056603193283081, "rewards/margins": 2.1821980476379395, "rewards/rejected": -1.576537847518921, "step": 512 }, { "epoch": 0.6109415705247488, "grad_norm": 5.832391738891602, "learning_rate": 9.389749702026222e-07, "logits/chosen": -2.0251364707946777, "logits/rejected": -2.136969566345215, "logps/chosen": -53.97468566894531, "logps/rejected": -73.76161193847656, "loss": 0.6374, "rewards/accuracies": 0.84375, "rewards/chosen": 1.8414015769958496, "rewards/margins": 3.362848997116089, "rewards/rejected": -1.5214474201202393, "step": 513 }, { "epoch": 0.6121324897655378, "grad_norm": 5.317727565765381, "learning_rate": 9.388557806912991e-07, "logits/chosen": -1.9913185834884644, "logits/rejected": -2.084792375564575, "logps/chosen": -54.303924560546875, "logps/rejected": -73.24234008789062, "loss": 0.699, "rewards/accuracies": 0.75, "rewards/chosen": 1.551003098487854, "rewards/margins": 2.9587583541870117, "rewards/rejected": -1.4077551364898682, "step": 514 }, { "epoch": 0.6133234090063268, "grad_norm": 6.56810998916626, "learning_rate": 9.387365911799762e-07, "logits/chosen": -2.1077167987823486, "logits/rejected": -2.216343402862549, "logps/chosen": -63.52043914794922, "logps/rejected": -78.13069915771484, "loss": 0.7368, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5051001906394958, "rewards/margins": 2.522722005844116, "rewards/rejected": -2.0176219940185547, "step": 515 }, { "epoch": 0.6145143282471157, "grad_norm": 5.663352012634277, "learning_rate": 9.386174016686531e-07, "logits/chosen": -2.0389156341552734, "logits/rejected": -2.1484615802764893, "logps/chosen": -55.79420471191406, "logps/rejected": -80.0696792602539, "loss": 0.6181, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3084840774536133, "rewards/margins": 3.2540295124053955, "rewards/rejected": -1.9455455541610718, "step": 516 }, { "epoch": 0.6157052474879047, "grad_norm": 5.651805400848389, "learning_rate": 9.384982121573302e-07, "logits/chosen": -2.0317702293395996, "logits/rejected": -2.1769838333129883, "logps/chosen": -56.83985137939453, "logps/rejected": -77.29360961914062, "loss": 0.7152, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9562615752220154, "rewards/margins": 2.980064868927002, "rewards/rejected": -2.023803234100342, "step": 517 }, { "epoch": 0.6168961667286937, "grad_norm": 4.671713829040527, "learning_rate": 9.383790226460071e-07, "logits/chosen": -2.1186070442199707, "logits/rejected": -2.1562929153442383, "logps/chosen": -59.97046661376953, "logps/rejected": -73.54984283447266, "loss": 0.7807, "rewards/accuracies": 0.75, "rewards/chosen": 1.643129587173462, "rewards/margins": 2.488546371459961, "rewards/rejected": -0.8454163074493408, "step": 518 }, { "epoch": 0.6180870859694827, "grad_norm": 5.158901214599609, "learning_rate": 9.382598331346841e-07, "logits/chosen": -2.0665276050567627, "logits/rejected": -2.1604652404785156, "logps/chosen": -66.71312713623047, "logps/rejected": -77.90924072265625, "loss": 0.7604, "rewards/accuracies": 0.625, "rewards/chosen": 0.42701372504234314, "rewards/margins": 2.071807861328125, "rewards/rejected": -1.644794225692749, "step": 519 }, { "epoch": 0.6192780052102717, "grad_norm": 19.22475814819336, "learning_rate": 9.381406436233611e-07, "logits/chosen": -2.1009163856506348, "logits/rejected": -2.1373350620269775, "logps/chosen": -59.4240837097168, "logps/rejected": -78.69136047363281, "loss": 0.7378, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0963177680969238, "rewards/margins": 2.291837215423584, "rewards/rejected": -1.1955193281173706, "step": 520 }, { "epoch": 0.6204689244510606, "grad_norm": 6.458510875701904, "learning_rate": 9.380214541120382e-07, "logits/chosen": -2.058680295944214, "logits/rejected": -2.117750644683838, "logps/chosen": -57.598426818847656, "logps/rejected": -73.77471160888672, "loss": 0.7663, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8923689723014832, "rewards/margins": 2.329787492752075, "rewards/rejected": -1.4374186992645264, "step": 521 }, { "epoch": 0.6216598436918497, "grad_norm": 4.206431865692139, "learning_rate": 9.379022646007151e-07, "logits/chosen": -2.0515270233154297, "logits/rejected": -2.1316397190093994, "logps/chosen": -59.319175720214844, "logps/rejected": -79.19468688964844, "loss": 0.8329, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9203413128852844, "rewards/margins": 2.250347137451172, "rewards/rejected": -1.3300061225891113, "step": 522 }, { "epoch": 0.6228507629326386, "grad_norm": 4.881239414215088, "learning_rate": 9.377830750893921e-07, "logits/chosen": -1.997342586517334, "logits/rejected": -2.144123077392578, "logps/chosen": -53.27873229980469, "logps/rejected": -84.954345703125, "loss": 0.494, "rewards/accuracies": 0.90625, "rewards/chosen": 1.349608063697815, "rewards/margins": 4.461462020874023, "rewards/rejected": -3.111854314804077, "step": 523 }, { "epoch": 0.6240416821734276, "grad_norm": 7.1884589195251465, "learning_rate": 9.376638855780691e-07, "logits/chosen": -1.9831098318099976, "logits/rejected": -2.219945192337036, "logps/chosen": -58.246681213378906, "logps/rejected": -89.1043701171875, "loss": 0.5728, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4701645374298096, "rewards/margins": 4.35382080078125, "rewards/rejected": -2.8836557865142822, "step": 524 }, { "epoch": 0.6252326014142165, "grad_norm": 7.683620452880859, "learning_rate": 9.375446960667461e-07, "logits/chosen": -2.0619282722473145, "logits/rejected": -2.2101876735687256, "logps/chosen": -64.7601089477539, "logps/rejected": -90.76367950439453, "loss": 0.6815, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4984264373779297, "rewards/margins": 3.2646000385284424, "rewards/rejected": -3.763026237487793, "step": 525 }, { "epoch": 0.6264235206550056, "grad_norm": 4.688850402832031, "learning_rate": 9.374255065554231e-07, "logits/chosen": -1.9844729900360107, "logits/rejected": -2.1754839420318604, "logps/chosen": -55.98891830444336, "logps/rejected": -82.77084350585938, "loss": 0.5448, "rewards/accuracies": 0.78125, "rewards/chosen": 1.8472521305084229, "rewards/margins": 4.431239604949951, "rewards/rejected": -2.5839874744415283, "step": 526 }, { "epoch": 0.6276144398957946, "grad_norm": 5.735210418701172, "learning_rate": 9.373063170441001e-07, "logits/chosen": -2.0653374195098877, "logits/rejected": -2.17403507232666, "logps/chosen": -56.73674774169922, "logps/rejected": -77.82350158691406, "loss": 0.698, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3786481618881226, "rewards/margins": 3.2761807441711426, "rewards/rejected": -1.8975322246551514, "step": 527 }, { "epoch": 0.6288053591365835, "grad_norm": 5.2421464920043945, "learning_rate": 9.37187127532777e-07, "logits/chosen": -2.07747745513916, "logits/rejected": -2.0903420448303223, "logps/chosen": -57.624263763427734, "logps/rejected": -70.68936157226562, "loss": 0.8642, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2910071015357971, "rewards/margins": 1.441615343093872, "rewards/rejected": -1.1506080627441406, "step": 528 }, { "epoch": 0.6299962783773725, "grad_norm": 4.172196388244629, "learning_rate": 9.370679380214541e-07, "logits/chosen": -2.043907403945923, "logits/rejected": -2.1668202877044678, "logps/chosen": -55.94355773925781, "logps/rejected": -84.72102355957031, "loss": 0.5768, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7728127241134644, "rewards/margins": 4.2406325340271, "rewards/rejected": -2.467820167541504, "step": 529 }, { "epoch": 0.6311871976181616, "grad_norm": 4.164093494415283, "learning_rate": 9.369487485101311e-07, "logits/chosen": -2.057997226715088, "logits/rejected": -2.2352771759033203, "logps/chosen": -58.41293716430664, "logps/rejected": -79.10549926757812, "loss": 0.5688, "rewards/accuracies": 0.75, "rewards/chosen": 1.7046430110931396, "rewards/margins": 3.9513378143310547, "rewards/rejected": -2.246694803237915, "step": 530 }, { "epoch": 0.6323781168589505, "grad_norm": 6.0555548667907715, "learning_rate": 9.368295589988081e-07, "logits/chosen": -2.0359957218170166, "logits/rejected": -2.185640573501587, "logps/chosen": -55.958763122558594, "logps/rejected": -86.5208969116211, "loss": 0.4679, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8087431192398071, "rewards/margins": 4.874593257904053, "rewards/rejected": -3.065850257873535, "step": 531 }, { "epoch": 0.6335690360997395, "grad_norm": 5.25529670715332, "learning_rate": 9.36710369487485e-07, "logits/chosen": -2.087388277053833, "logits/rejected": -2.1618735790252686, "logps/chosen": -65.1708755493164, "logps/rejected": -83.65644073486328, "loss": 0.7174, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6974815726280212, "rewards/margins": 2.6598899364471436, "rewards/rejected": -1.9624083042144775, "step": 532 }, { "epoch": 0.6347599553405284, "grad_norm": 8.557341575622559, "learning_rate": 9.365911799761621e-07, "logits/chosen": -2.0321078300476074, "logits/rejected": -2.1604084968566895, "logps/chosen": -60.204246520996094, "logps/rejected": -80.44601440429688, "loss": 0.7045, "rewards/accuracies": 0.75, "rewards/chosen": 0.7160237431526184, "rewards/margins": 2.695330858230591, "rewards/rejected": -1.9793070554733276, "step": 533 }, { "epoch": 0.6359508745813175, "grad_norm": 6.205986499786377, "learning_rate": 9.364719904648391e-07, "logits/chosen": -1.9947019815444946, "logits/rejected": -2.112785816192627, "logps/chosen": -52.803749084472656, "logps/rejected": -78.81884765625, "loss": 0.6617, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5885809659957886, "rewards/margins": 3.597254753112793, "rewards/rejected": -2.0086734294891357, "step": 534 }, { "epoch": 0.6371417938221065, "grad_norm": 8.218914985656738, "learning_rate": 9.363528009535161e-07, "logits/chosen": -2.046429395675659, "logits/rejected": -2.182546854019165, "logps/chosen": -63.72667694091797, "logps/rejected": -86.32437896728516, "loss": 0.5204, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7863500118255615, "rewards/margins": 4.74614143371582, "rewards/rejected": -2.9597911834716797, "step": 535 }, { "epoch": 0.6383327130628954, "grad_norm": 7.2462873458862305, "learning_rate": 9.36233611442193e-07, "logits/chosen": -2.0361053943634033, "logits/rejected": -2.1505720615386963, "logps/chosen": -49.0486946105957, "logps/rejected": -68.91242980957031, "loss": 0.6114, "rewards/accuracies": 0.84375, "rewards/chosen": 2.4144773483276367, "rewards/margins": 3.6221957206726074, "rewards/rejected": -1.2077186107635498, "step": 536 }, { "epoch": 0.6395236323036844, "grad_norm": 5.9103593826293945, "learning_rate": 9.361144219308701e-07, "logits/chosen": -2.0438592433929443, "logits/rejected": -2.146685838699341, "logps/chosen": -57.47576141357422, "logps/rejected": -74.00638580322266, "loss": 0.7112, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5897973775863647, "rewards/margins": 3.101548671722412, "rewards/rejected": -1.5117511749267578, "step": 537 }, { "epoch": 0.6407145515444734, "grad_norm": 5.437600612640381, "learning_rate": 9.35995232419547e-07, "logits/chosen": -2.01820707321167, "logits/rejected": -2.197512626647949, "logps/chosen": -61.04362869262695, "logps/rejected": -73.29283905029297, "loss": 0.7545, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0734268426895142, "rewards/margins": 2.774121046066284, "rewards/rejected": -1.7006943225860596, "step": 538 }, { "epoch": 0.6419054707852624, "grad_norm": 6.068284034729004, "learning_rate": 9.358760429082241e-07, "logits/chosen": -2.1003551483154297, "logits/rejected": -2.1138391494750977, "logps/chosen": -68.55464172363281, "logps/rejected": -66.16322326660156, "loss": 0.9463, "rewards/accuracies": 0.65625, "rewards/chosen": 0.38491690158843994, "rewards/margins": 0.8805495500564575, "rewards/rejected": -0.49563267827033997, "step": 539 }, { "epoch": 0.6430963900260513, "grad_norm": 8.055935859680176, "learning_rate": 9.35756853396901e-07, "logits/chosen": -1.9713202714920044, "logits/rejected": -2.1288466453552246, "logps/chosen": -56.03582763671875, "logps/rejected": -79.0322494506836, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": 1.4063127040863037, "rewards/margins": 3.761354684829712, "rewards/rejected": -2.355041980743408, "step": 540 }, { "epoch": 0.6442873092668403, "grad_norm": 5.135018348693848, "learning_rate": 9.356376638855781e-07, "logits/chosen": -2.0602164268493652, "logits/rejected": -2.0615878105163574, "logps/chosen": -56.35248565673828, "logps/rejected": -77.93510437011719, "loss": 0.7265, "rewards/accuracies": 0.8125, "rewards/chosen": 0.903400719165802, "rewards/margins": 2.8395869731903076, "rewards/rejected": -1.9361860752105713, "step": 541 }, { "epoch": 0.6454782285076294, "grad_norm": 4.618165493011475, "learning_rate": 9.35518474374255e-07, "logits/chosen": -2.0654897689819336, "logits/rejected": -2.2025582790374756, "logps/chosen": -58.7901725769043, "logps/rejected": -83.14234161376953, "loss": 0.562, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4897840023040771, "rewards/margins": 3.4338345527648926, "rewards/rejected": -1.944050669670105, "step": 542 }, { "epoch": 0.6466691477484183, "grad_norm": 5.4245734214782715, "learning_rate": 9.353992848629321e-07, "logits/chosen": -2.0667295455932617, "logits/rejected": -2.1077969074249268, "logps/chosen": -62.24642562866211, "logps/rejected": -73.88693237304688, "loss": 0.8092, "rewards/accuracies": 0.625, "rewards/chosen": 0.589015543460846, "rewards/margins": 1.9195780754089355, "rewards/rejected": -1.3305624723434448, "step": 543 }, { "epoch": 0.6478600669892073, "grad_norm": 5.788570404052734, "learning_rate": 9.352800953516091e-07, "logits/chosen": -1.9728822708129883, "logits/rejected": -2.028533458709717, "logps/chosen": -62.72517395019531, "logps/rejected": -71.13706970214844, "loss": 0.7961, "rewards/accuracies": 0.59375, "rewards/chosen": 1.245305061340332, "rewards/margins": 2.0530753135681152, "rewards/rejected": -0.807770311832428, "step": 544 }, { "epoch": 0.6490509862299962, "grad_norm": 9.120170593261719, "learning_rate": 9.35160905840286e-07, "logits/chosen": -2.022082567214966, "logits/rejected": -2.115356206893921, "logps/chosen": -60.27552032470703, "logps/rejected": -77.7234878540039, "loss": 0.617, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5028492212295532, "rewards/margins": 3.488696575164795, "rewards/rejected": -1.9858474731445312, "step": 545 }, { "epoch": 0.6502419054707853, "grad_norm": 5.63955020904541, "learning_rate": 9.35041716328963e-07, "logits/chosen": -2.0819737911224365, "logits/rejected": -2.1552882194519043, "logps/chosen": -57.54275131225586, "logps/rejected": -76.82593536376953, "loss": 0.6749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8390093445777893, "rewards/margins": 2.7601354122161865, "rewards/rejected": -1.9211260080337524, "step": 546 }, { "epoch": 0.6514328247115743, "grad_norm": 5.162178993225098, "learning_rate": 9.349225268176401e-07, "logits/chosen": -2.071624755859375, "logits/rejected": -2.102609634399414, "logps/chosen": -58.2882080078125, "logps/rejected": -81.3963623046875, "loss": 0.7449, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1768486499786377, "rewards/margins": 2.9417078495025635, "rewards/rejected": -1.7648591995239258, "step": 547 }, { "epoch": 0.6526237439523632, "grad_norm": 9.903887748718262, "learning_rate": 9.34803337306317e-07, "logits/chosen": -2.0702359676361084, "logits/rejected": -2.1038432121276855, "logps/chosen": -61.24104309082031, "logps/rejected": -76.29843139648438, "loss": 0.7993, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9216614961624146, "rewards/margins": 2.0447819232940674, "rewards/rejected": -1.1231204271316528, "step": 548 }, { "epoch": 0.6538146631931522, "grad_norm": 4.951479434967041, "learning_rate": 9.34684147794994e-07, "logits/chosen": -2.0714163780212402, "logits/rejected": -2.155938148498535, "logps/chosen": -62.49516296386719, "logps/rejected": -77.83843994140625, "loss": 0.7213, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5508764982223511, "rewards/margins": 2.642409563064575, "rewards/rejected": -2.0915331840515137, "step": 549 }, { "epoch": 0.6550055824339412, "grad_norm": 5.331981658935547, "learning_rate": 9.34564958283671e-07, "logits/chosen": -2.0583934783935547, "logits/rejected": -2.1697540283203125, "logps/chosen": -59.3411750793457, "logps/rejected": -86.6995849609375, "loss": 0.564, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9020302295684814, "rewards/margins": 4.612680435180664, "rewards/rejected": -2.7106502056121826, "step": 550 }, { "epoch": 0.6561965016747302, "grad_norm": 4.819728851318359, "learning_rate": 9.34445768772348e-07, "logits/chosen": -2.0935802459716797, "logits/rejected": -2.270740032196045, "logps/chosen": -63.463966369628906, "logps/rejected": -91.03096771240234, "loss": 0.5395, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1642091274261475, "rewards/margins": 4.324226379394531, "rewards/rejected": -3.160017728805542, "step": 551 }, { "epoch": 0.6573874209155192, "grad_norm": 5.5605034828186035, "learning_rate": 9.34326579261025e-07, "logits/chosen": -2.0414516925811768, "logits/rejected": -2.1687257289886475, "logps/chosen": -66.89607238769531, "logps/rejected": -82.88510131835938, "loss": 0.75, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5448755025863647, "rewards/margins": 2.7367630004882812, "rewards/rejected": -2.191887855529785, "step": 552 }, { "epoch": 0.6585783401563081, "grad_norm": 4.699146270751953, "learning_rate": 9.34207389749702e-07, "logits/chosen": -1.9994057416915894, "logits/rejected": -2.0807406902313232, "logps/chosen": -65.23576354980469, "logps/rejected": -80.89012908935547, "loss": 0.8068, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17832067608833313, "rewards/margins": 1.994524598121643, "rewards/rejected": -1.8162038326263428, "step": 553 }, { "epoch": 0.6597692593970972, "grad_norm": 5.3070197105407715, "learning_rate": 9.34088200238379e-07, "logits/chosen": -2.103069305419922, "logits/rejected": -2.185427665710449, "logps/chosen": -67.59382629394531, "logps/rejected": -84.73088073730469, "loss": 0.738, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6900717616081238, "rewards/margins": 2.6776859760284424, "rewards/rejected": -1.9876145124435425, "step": 554 }, { "epoch": 0.6609601786378861, "grad_norm": 12.146828651428223, "learning_rate": 9.33969010727056e-07, "logits/chosen": -2.0072686672210693, "logits/rejected": -2.155783176422119, "logps/chosen": -50.92415237426758, "logps/rejected": -73.25518035888672, "loss": 0.5632, "rewards/accuracies": 0.875, "rewards/chosen": 3.16032075881958, "rewards/margins": 4.496382236480713, "rewards/rejected": -1.3360613584518433, "step": 555 }, { "epoch": 0.6621510978786751, "grad_norm": 5.83470344543457, "learning_rate": 9.33849821215733e-07, "logits/chosen": -1.9844270944595337, "logits/rejected": -2.185839891433716, "logps/chosen": -58.11042022705078, "logps/rejected": -93.92730712890625, "loss": 0.5664, "rewards/accuracies": 0.875, "rewards/chosen": 1.3820821046829224, "rewards/margins": 4.6762495040893555, "rewards/rejected": -3.2941675186157227, "step": 556 }, { "epoch": 0.6633420171194641, "grad_norm": 3.718324899673462, "learning_rate": 9.337306317044101e-07, "logits/chosen": -2.05000376701355, "logits/rejected": -2.2140471935272217, "logps/chosen": -65.32917785644531, "logps/rejected": -88.96187591552734, "loss": 0.6221, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5177390575408936, "rewards/margins": 3.5710203647613525, "rewards/rejected": -3.053281784057617, "step": 557 }, { "epoch": 0.6645329363602531, "grad_norm": 6.262959003448486, "learning_rate": 9.336114421930869e-07, "logits/chosen": -2.0527169704437256, "logits/rejected": -2.111624240875244, "logps/chosen": -66.44235229492188, "logps/rejected": -79.62799835205078, "loss": 0.782, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2123083770275116, "rewards/margins": 2.133471727371216, "rewards/rejected": -2.34578013420105, "step": 558 }, { "epoch": 0.6657238556010421, "grad_norm": 6.395543098449707, "learning_rate": 9.33492252681764e-07, "logits/chosen": -2.0395004749298096, "logits/rejected": -2.127108097076416, "logps/chosen": -54.410552978515625, "logps/rejected": -73.8921127319336, "loss": 0.6963, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6095541715621948, "rewards/margins": 3.1492807865142822, "rewards/rejected": -1.5397266149520874, "step": 559 }, { "epoch": 0.666914774841831, "grad_norm": 5.173194408416748, "learning_rate": 9.33373063170441e-07, "logits/chosen": -1.961228370666504, "logits/rejected": -2.168367385864258, "logps/chosen": -60.835880279541016, "logps/rejected": -91.978515625, "loss": 0.5356, "rewards/accuracies": 0.84375, "rewards/chosen": 1.48959481716156, "rewards/margins": 4.835398197174072, "rewards/rejected": -3.3458027839660645, "step": 560 }, { "epoch": 0.66810569408262, "grad_norm": 7.454349040985107, "learning_rate": 9.33253873659118e-07, "logits/chosen": -2.0525124073028564, "logits/rejected": -2.1310718059539795, "logps/chosen": -63.490901947021484, "logps/rejected": -81.37293243408203, "loss": 0.6874, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0871219635009766, "rewards/margins": 3.2682905197143555, "rewards/rejected": -2.181168794631958, "step": 561 }, { "epoch": 0.669296613323409, "grad_norm": 5.457616806030273, "learning_rate": 9.331346841477949e-07, "logits/chosen": -2.0850651264190674, "logits/rejected": -2.168133497238159, "logps/chosen": -59.93644332885742, "logps/rejected": -90.67539978027344, "loss": 0.5565, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6666592359542847, "rewards/margins": 4.570645809173584, "rewards/rejected": -2.903986692428589, "step": 562 }, { "epoch": 0.670487532564198, "grad_norm": 5.821387767791748, "learning_rate": 9.33015494636472e-07, "logits/chosen": -1.9900736808776855, "logits/rejected": -2.1059627532958984, "logps/chosen": -63.221614837646484, "logps/rejected": -82.37303161621094, "loss": 0.6597, "rewards/accuracies": 0.8125, "rewards/chosen": 1.32173490524292, "rewards/margins": 3.8626370429992676, "rewards/rejected": -2.5409018993377686, "step": 563 }, { "epoch": 0.671678451804987, "grad_norm": 5.06032657623291, "learning_rate": 9.328963051251489e-07, "logits/chosen": -2.0572729110717773, "logits/rejected": -2.1150898933410645, "logps/chosen": -69.12310791015625, "logps/rejected": -79.90864562988281, "loss": 0.8402, "rewards/accuracies": 0.65625, "rewards/chosen": 0.34678128361701965, "rewards/margins": 1.8787516355514526, "rewards/rejected": -1.531970500946045, "step": 564 }, { "epoch": 0.6728693710457759, "grad_norm": 6.5150322914123535, "learning_rate": 9.32777115613826e-07, "logits/chosen": -2.1451258659362793, "logits/rejected": -2.2645351886749268, "logps/chosen": -65.03531646728516, "logps/rejected": -89.82403564453125, "loss": 0.7359, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4765368700027466, "rewards/margins": 3.2347512245178223, "rewards/rejected": -2.758213996887207, "step": 565 }, { "epoch": 0.6740602902865649, "grad_norm": 6.396346092224121, "learning_rate": 9.32657926102503e-07, "logits/chosen": -2.02987003326416, "logits/rejected": -2.1199145317077637, "logps/chosen": -61.38423156738281, "logps/rejected": -85.47749328613281, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": 0.7543643712997437, "rewards/margins": 3.2258799076080322, "rewards/rejected": -2.471515655517578, "step": 566 }, { "epoch": 0.675251209527354, "grad_norm": 5.256311893463135, "learning_rate": 9.3253873659118e-07, "logits/chosen": -2.0905044078826904, "logits/rejected": -2.0990583896636963, "logps/chosen": -70.35136413574219, "logps/rejected": -82.6588363647461, "loss": 0.8653, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2630240023136139, "rewards/margins": 1.763492465019226, "rewards/rejected": -2.0265164375305176, "step": 567 }, { "epoch": 0.6764421287681429, "grad_norm": 4.433882236480713, "learning_rate": 9.324195470798569e-07, "logits/chosen": -2.0556063652038574, "logits/rejected": -2.205796480178833, "logps/chosen": -68.16077423095703, "logps/rejected": -92.7958755493164, "loss": 0.5841, "rewards/accuracies": 0.75, "rewards/chosen": 0.3616257309913635, "rewards/margins": 3.975687026977539, "rewards/rejected": -3.614061117172241, "step": 568 }, { "epoch": 0.6776330480089319, "grad_norm": 4.653103351593018, "learning_rate": 9.32300357568534e-07, "logits/chosen": -2.032710075378418, "logits/rejected": -2.146111249923706, "logps/chosen": -62.11894989013672, "logps/rejected": -69.879638671875, "loss": 0.8802, "rewards/accuracies": 0.65625, "rewards/chosen": 0.685104250907898, "rewards/margins": 1.5912164449691772, "rewards/rejected": -0.9061121940612793, "step": 569 }, { "epoch": 0.6788239672497208, "grad_norm": 5.7732834815979, "learning_rate": 9.32181168057211e-07, "logits/chosen": -1.9633139371871948, "logits/rejected": -2.1394662857055664, "logps/chosen": -66.61666107177734, "logps/rejected": -94.55445861816406, "loss": 0.5589, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3159758448600769, "rewards/margins": 4.0947585105896, "rewards/rejected": -3.778782606124878, "step": 570 }, { "epoch": 0.6800148864905099, "grad_norm": 6.234752178192139, "learning_rate": 9.320619785458879e-07, "logits/chosen": -2.0530664920806885, "logits/rejected": -2.148212194442749, "logps/chosen": -59.522071838378906, "logps/rejected": -79.06482696533203, "loss": 0.6853, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2286895513534546, "rewards/margins": 3.185847759246826, "rewards/rejected": -1.9571585655212402, "step": 571 }, { "epoch": 0.6812058057312989, "grad_norm": 4.3012919425964355, "learning_rate": 9.319427890345649e-07, "logits/chosen": -2.1038339138031006, "logits/rejected": -2.2846975326538086, "logps/chosen": -56.70014190673828, "logps/rejected": -91.20567321777344, "loss": 0.4874, "rewards/accuracies": 0.75, "rewards/chosen": 1.202422857284546, "rewards/margins": 4.786464214324951, "rewards/rejected": -3.584041118621826, "step": 572 }, { "epoch": 0.6823967249720878, "grad_norm": 5.23031759262085, "learning_rate": 9.31823599523242e-07, "logits/chosen": -2.0942635536193848, "logits/rejected": -2.207315683364868, "logps/chosen": -66.54418182373047, "logps/rejected": -85.1339340209961, "loss": 0.6977, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4565780758857727, "rewards/margins": 2.9895401000976562, "rewards/rejected": -2.5329625606536865, "step": 573 }, { "epoch": 0.6835876442128768, "grad_norm": 4.784534454345703, "learning_rate": 9.317044100119189e-07, "logits/chosen": -2.035714864730835, "logits/rejected": -2.1269867420196533, "logps/chosen": -62.298309326171875, "logps/rejected": -86.89015197753906, "loss": 0.7232, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1745593547821045, "rewards/margins": 3.2189836502075195, "rewards/rejected": -2.044424295425415, "step": 574 }, { "epoch": 0.6847785634536658, "grad_norm": 6.47156286239624, "learning_rate": 9.315852205005959e-07, "logits/chosen": -2.0463507175445557, "logits/rejected": -2.1278069019317627, "logps/chosen": -66.19558715820312, "logps/rejected": -84.84830474853516, "loss": 0.7242, "rewards/accuracies": 0.6875, "rewards/chosen": 0.632526695728302, "rewards/margins": 2.8942599296569824, "rewards/rejected": -2.261733055114746, "step": 575 }, { "epoch": 0.6859694826944548, "grad_norm": 9.563532829284668, "learning_rate": 9.314660309892729e-07, "logits/chosen": -2.0245914459228516, "logits/rejected": -2.1070685386657715, "logps/chosen": -58.39350891113281, "logps/rejected": -87.19683837890625, "loss": 0.4889, "rewards/accuracies": 0.875, "rewards/chosen": 1.1379106044769287, "rewards/margins": 4.302646160125732, "rewards/rejected": -3.164735794067383, "step": 576 }, { "epoch": 0.6871604019352437, "grad_norm": 6.383600234985352, "learning_rate": 9.3134684147795e-07, "logits/chosen": -2.0563836097717285, "logits/rejected": -2.1050000190734863, "logps/chosen": -71.43286895751953, "logps/rejected": -82.19892883300781, "loss": 0.871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.484889954328537, "rewards/margins": 1.469208002090454, "rewards/rejected": -1.9540977478027344, "step": 577 }, { "epoch": 0.6883513211760327, "grad_norm": 4.2310380935668945, "learning_rate": 9.312276519666269e-07, "logits/chosen": -2.101975917816162, "logits/rejected": -2.226443290710449, "logps/chosen": -69.49107360839844, "logps/rejected": -90.36126708984375, "loss": 0.6563, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20292983949184418, "rewards/margins": 3.3092041015625, "rewards/rejected": -3.106274366378784, "step": 578 }, { "epoch": 0.6895422404168218, "grad_norm": 5.748950004577637, "learning_rate": 9.31108462455304e-07, "logits/chosen": -1.9911810159683228, "logits/rejected": -2.2701518535614014, "logps/chosen": -66.88318634033203, "logps/rejected": -98.77941131591797, "loss": 0.5624, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6249961853027344, "rewards/margins": 4.565987586975098, "rewards/rejected": -3.9409916400909424, "step": 579 }, { "epoch": 0.6907331596576107, "grad_norm": 5.136340618133545, "learning_rate": 9.309892729439809e-07, "logits/chosen": -2.011827230453491, "logits/rejected": -2.098989248275757, "logps/chosen": -61.38947677612305, "logps/rejected": -84.7833480834961, "loss": 0.6747, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3098903000354767, "rewards/margins": 3.041748046875, "rewards/rejected": -2.7318577766418457, "step": 580 }, { "epoch": 0.6919240788983997, "grad_norm": 3.2560300827026367, "learning_rate": 9.308700834326579e-07, "logits/chosen": -2.033812999725342, "logits/rejected": -2.1479156017303467, "logps/chosen": -57.779476165771484, "logps/rejected": -86.86677551269531, "loss": 0.531, "rewards/accuracies": 0.84375, "rewards/chosen": 1.69545316696167, "rewards/margins": 4.47818660736084, "rewards/rejected": -2.7827329635620117, "step": 581 }, { "epoch": 0.6931149981391886, "grad_norm": 5.278158187866211, "learning_rate": 9.307508939213349e-07, "logits/chosen": -1.992519497871399, "logits/rejected": -2.1010937690734863, "logps/chosen": -68.06216430664062, "logps/rejected": -87.33244323730469, "loss": 0.7143, "rewards/accuracies": 0.75, "rewards/chosen": 0.2111784666776657, "rewards/margins": 2.9920077323913574, "rewards/rejected": -2.7808287143707275, "step": 582 }, { "epoch": 0.6943059173799777, "grad_norm": 6.90396785736084, "learning_rate": 9.30631704410012e-07, "logits/chosen": -2.0037386417388916, "logits/rejected": -2.1887593269348145, "logps/chosen": -62.418392181396484, "logps/rejected": -87.91120147705078, "loss": 0.6696, "rewards/accuracies": 0.875, "rewards/chosen": 0.5777861475944519, "rewards/margins": 3.642629861831665, "rewards/rejected": -3.0648434162139893, "step": 583 }, { "epoch": 0.6954968366207667, "grad_norm": 6.656667232513428, "learning_rate": 9.305125148986888e-07, "logits/chosen": -2.043980836868286, "logits/rejected": -2.1463332176208496, "logps/chosen": -64.12037658691406, "logps/rejected": -81.36907196044922, "loss": 0.7037, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1957851648330688, "rewards/margins": 3.2289884090423584, "rewards/rejected": -2.033203125, "step": 584 }, { "epoch": 0.6966877558615556, "grad_norm": 5.623282432556152, "learning_rate": 9.303933253873659e-07, "logits/chosen": -2.0071825981140137, "logits/rejected": -2.1176652908325195, "logps/chosen": -56.47019958496094, "logps/rejected": -90.3737564086914, "loss": 0.5296, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4656548500061035, "rewards/margins": 4.7555036544799805, "rewards/rejected": -3.2898483276367188, "step": 585 }, { "epoch": 0.6978786751023446, "grad_norm": 5.26791524887085, "learning_rate": 9.302741358760429e-07, "logits/chosen": -2.0310585498809814, "logits/rejected": -2.1521990299224854, "logps/chosen": -64.91459655761719, "logps/rejected": -86.0724105834961, "loss": 0.7133, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9270878434181213, "rewards/margins": 2.8540563583374023, "rewards/rejected": -1.9269682168960571, "step": 586 }, { "epoch": 0.6990695943431336, "grad_norm": 5.454206943511963, "learning_rate": 9.301549463647199e-07, "logits/chosen": -1.9913359880447388, "logits/rejected": -2.0889105796813965, "logps/chosen": -55.67362594604492, "logps/rejected": -74.4872817993164, "loss": 0.7292, "rewards/accuracies": 0.8125, "rewards/chosen": 0.922332763671875, "rewards/margins": 2.6485116481781006, "rewards/rejected": -1.7261788845062256, "step": 587 }, { "epoch": 0.7002605135839226, "grad_norm": 5.335691928863525, "learning_rate": 9.300357568533969e-07, "logits/chosen": -2.0883498191833496, "logits/rejected": -2.2059497833251953, "logps/chosen": -57.40571594238281, "logps/rejected": -89.90081787109375, "loss": 0.5619, "rewards/accuracies": 0.875, "rewards/chosen": 0.9722448587417603, "rewards/margins": 4.100956916809082, "rewards/rejected": -3.1287121772766113, "step": 588 }, { "epoch": 0.7014514328247116, "grad_norm": 6.780941009521484, "learning_rate": 9.299165673420739e-07, "logits/chosen": -2.0158591270446777, "logits/rejected": -2.165972948074341, "logps/chosen": -64.84733581542969, "logps/rejected": -87.8653793334961, "loss": 0.7139, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5093948245048523, "rewards/margins": 3.1214826107025146, "rewards/rejected": -2.612088203430176, "step": 589 }, { "epoch": 0.7026423520655005, "grad_norm": 7.649287223815918, "learning_rate": 9.297973778307508e-07, "logits/chosen": -1.970332384109497, "logits/rejected": -2.1141600608825684, "logps/chosen": -59.857948303222656, "logps/rejected": -91.47373962402344, "loss": 0.616, "rewards/accuracies": 0.75, "rewards/chosen": 0.8565456867218018, "rewards/margins": 4.173666477203369, "rewards/rejected": -3.3171207904815674, "step": 590 }, { "epoch": 0.7038332713062896, "grad_norm": 3.5163519382476807, "learning_rate": 9.296781883194279e-07, "logits/chosen": -2.0312411785125732, "logits/rejected": -2.1595351696014404, "logps/chosen": -65.221435546875, "logps/rejected": -83.65908813476562, "loss": 0.7302, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8065836429595947, "rewards/margins": 2.860025405883789, "rewards/rejected": -2.0534415245056152, "step": 591 }, { "epoch": 0.7050241905470785, "grad_norm": 7.549407958984375, "learning_rate": 9.295589988081049e-07, "logits/chosen": -2.053978443145752, "logits/rejected": -2.199840545654297, "logps/chosen": -65.70941925048828, "logps/rejected": -92.74232482910156, "loss": 0.6379, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08590041846036911, "rewards/margins": 3.6754558086395264, "rewards/rejected": -3.589555501937866, "step": 592 }, { "epoch": 0.7062151097878675, "grad_norm": 4.859821796417236, "learning_rate": 9.294398092967819e-07, "logits/chosen": -2.0885684490203857, "logits/rejected": -2.1390955448150635, "logps/chosen": -69.62409210205078, "logps/rejected": -85.46839141845703, "loss": 0.8261, "rewards/accuracies": 0.71875, "rewards/chosen": 0.053818635642528534, "rewards/margins": 1.8439273834228516, "rewards/rejected": -1.7901086807250977, "step": 593 }, { "epoch": 0.7074060290286565, "grad_norm": 5.746118545532227, "learning_rate": 9.293206197854588e-07, "logits/chosen": -2.031909704208374, "logits/rejected": -2.1327438354492188, "logps/chosen": -56.058013916015625, "logps/rejected": -78.62257385253906, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": 0.9188597798347473, "rewards/margins": 2.9726147651672363, "rewards/rejected": -2.0537548065185547, "step": 594 }, { "epoch": 0.7085969482694455, "grad_norm": 3.750704765319824, "learning_rate": 9.292014302741359e-07, "logits/chosen": -2.085087776184082, "logits/rejected": -2.202199935913086, "logps/chosen": -65.58414459228516, "logps/rejected": -81.10910034179688, "loss": 0.7575, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4604765772819519, "rewards/margins": 2.5784342288970947, "rewards/rejected": -2.117957830429077, "step": 595 }, { "epoch": 0.7097878675102345, "grad_norm": 5.079291343688965, "learning_rate": 9.290822407628129e-07, "logits/chosen": -2.0259816646575928, "logits/rejected": -2.1088619232177734, "logps/chosen": -61.31565856933594, "logps/rejected": -76.72444152832031, "loss": 0.7556, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8705599904060364, "rewards/margins": 2.516780376434326, "rewards/rejected": -1.6462202072143555, "step": 596 }, { "epoch": 0.7109787867510234, "grad_norm": 5.496130466461182, "learning_rate": 9.289630512514898e-07, "logits/chosen": -2.0211384296417236, "logits/rejected": -2.143052101135254, "logps/chosen": -57.00369644165039, "logps/rejected": -71.2691650390625, "loss": 0.6737, "rewards/accuracies": 0.75, "rewards/chosen": 2.1985831260681152, "rewards/margins": 3.5476784706115723, "rewards/rejected": -1.3490954637527466, "step": 597 }, { "epoch": 0.7121697059918124, "grad_norm": 4.958765506744385, "learning_rate": 9.288438617401668e-07, "logits/chosen": -1.9894452095031738, "logits/rejected": -2.16617751121521, "logps/chosen": -59.9060173034668, "logps/rejected": -84.89080810546875, "loss": 0.6333, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0591521263122559, "rewards/margins": 3.6179959774017334, "rewards/rejected": -2.5588436126708984, "step": 598 }, { "epoch": 0.7133606252326015, "grad_norm": 8.351611137390137, "learning_rate": 9.287246722288439e-07, "logits/chosen": -2.088949680328369, "logits/rejected": -2.1460795402526855, "logps/chosen": -62.105316162109375, "logps/rejected": -86.42855072021484, "loss": 0.6884, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7952117919921875, "rewards/margins": 2.8710896968841553, "rewards/rejected": -2.075878143310547, "step": 599 }, { "epoch": 0.7145515444733904, "grad_norm": 4.623297691345215, "learning_rate": 9.286054827175208e-07, "logits/chosen": -2.057363510131836, "logits/rejected": -2.2484774589538574, "logps/chosen": -65.93575286865234, "logps/rejected": -92.15900421142578, "loss": 0.6191, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9738456010818481, "rewards/margins": 3.793975830078125, "rewards/rejected": -2.8201301097869873, "step": 600 }, { "epoch": 0.7157424637141794, "grad_norm": 6.150411605834961, "learning_rate": 9.284862932061978e-07, "logits/chosen": -2.0150716304779053, "logits/rejected": -2.1020915508270264, "logps/chosen": -64.57142639160156, "logps/rejected": -82.6600341796875, "loss": 0.7014, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3960244953632355, "rewards/margins": 2.826004981994629, "rewards/rejected": -2.4299802780151367, "step": 601 }, { "epoch": 0.7169333829549683, "grad_norm": 6.284717559814453, "learning_rate": 9.283671036948748e-07, "logits/chosen": -2.088770866394043, "logits/rejected": -2.162731170654297, "logps/chosen": -57.579612731933594, "logps/rejected": -88.28019714355469, "loss": 0.6045, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3639283180236816, "rewards/margins": 4.139405727386475, "rewards/rejected": -2.775477409362793, "step": 602 }, { "epoch": 0.7181243021957574, "grad_norm": 5.514683246612549, "learning_rate": 9.282479141835517e-07, "logits/chosen": -2.0367090702056885, "logits/rejected": -2.1407124996185303, "logps/chosen": -55.85023498535156, "logps/rejected": -78.45614624023438, "loss": 0.6697, "rewards/accuracies": 0.75, "rewards/chosen": 1.5807077884674072, "rewards/margins": 3.5581459999084473, "rewards/rejected": -1.9774385690689087, "step": 603 }, { "epoch": 0.7193152214365464, "grad_norm": 4.912137985229492, "learning_rate": 9.281287246722288e-07, "logits/chosen": -2.084434986114502, "logits/rejected": -2.164109945297241, "logps/chosen": -49.93518829345703, "logps/rejected": -76.52398681640625, "loss": 0.6642, "rewards/accuracies": 0.6875, "rewards/chosen": 2.08785080909729, "rewards/margins": 3.2184677124023438, "rewards/rejected": -1.130616307258606, "step": 604 }, { "epoch": 0.7205061406773353, "grad_norm": 9.391376495361328, "learning_rate": 9.280095351609058e-07, "logits/chosen": -1.990814208984375, "logits/rejected": -2.1170880794525146, "logps/chosen": -53.73088073730469, "logps/rejected": -78.40556335449219, "loss": 0.6571, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4790377616882324, "rewards/margins": 3.4420371055603027, "rewards/rejected": -1.9629992246627808, "step": 605 }, { "epoch": 0.7216970599181243, "grad_norm": 3.353400707244873, "learning_rate": 9.278903456495828e-07, "logits/chosen": -2.0280046463012695, "logits/rejected": -2.142568588256836, "logps/chosen": -58.76980209350586, "logps/rejected": -82.77047729492188, "loss": 0.5209, "rewards/accuracies": 0.6875, "rewards/chosen": 1.6071809530258179, "rewards/margins": 4.1281046867370605, "rewards/rejected": -2.520923614501953, "step": 606 }, { "epoch": 0.7228879791589133, "grad_norm": 5.480037689208984, "learning_rate": 9.277711561382597e-07, "logits/chosen": -1.978115439414978, "logits/rejected": -2.156242609024048, "logps/chosen": -59.03209686279297, "logps/rejected": -91.73088073730469, "loss": 0.5514, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2319231033325195, "rewards/margins": 4.431438446044922, "rewards/rejected": -3.1995153427124023, "step": 607 }, { "epoch": 0.7240788983997023, "grad_norm": 9.424610137939453, "learning_rate": 9.276519666269368e-07, "logits/chosen": -2.029099941253662, "logits/rejected": -2.1726832389831543, "logps/chosen": -52.15224075317383, "logps/rejected": -84.33854675292969, "loss": 0.6266, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3128886222839355, "rewards/margins": 3.7924113273620605, "rewards/rejected": -2.479523181915283, "step": 608 }, { "epoch": 0.7252698176404913, "grad_norm": 5.125524520874023, "learning_rate": 9.275327771156138e-07, "logits/chosen": -1.9640729427337646, "logits/rejected": -2.0501301288604736, "logps/chosen": -60.70658493041992, "logps/rejected": -78.96841430664062, "loss": 0.7101, "rewards/accuracies": 0.6875, "rewards/chosen": 1.6959151029586792, "rewards/margins": 2.824493169784546, "rewards/rejected": -1.128577709197998, "step": 609 }, { "epoch": 0.7264607368812802, "grad_norm": 5.011012554168701, "learning_rate": 9.274135876042908e-07, "logits/chosen": -2.07043194770813, "logits/rejected": -2.1895179748535156, "logps/chosen": -62.604068756103516, "logps/rejected": -85.97433471679688, "loss": 0.7402, "rewards/accuracies": 0.75, "rewards/chosen": 0.7421680688858032, "rewards/margins": 3.1482181549072266, "rewards/rejected": -2.406050205230713, "step": 610 }, { "epoch": 0.7276516561220692, "grad_norm": 5.473383903503418, "learning_rate": 9.272943980929677e-07, "logits/chosen": -2.0103919506073, "logits/rejected": -2.2061409950256348, "logps/chosen": -57.31387710571289, "logps/rejected": -75.18946075439453, "loss": 0.6327, "rewards/accuracies": 0.875, "rewards/chosen": 1.782273530960083, "rewards/margins": 3.7721967697143555, "rewards/rejected": -1.9899230003356934, "step": 611 }, { "epoch": 0.7288425753628582, "grad_norm": 5.706201553344727, "learning_rate": 9.271752085816448e-07, "logits/chosen": -2.0802199840545654, "logits/rejected": -2.138526678085327, "logps/chosen": -59.47992706298828, "logps/rejected": -74.72522735595703, "loss": 0.7624, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2259418964385986, "rewards/margins": 2.3127851486206055, "rewards/rejected": -1.0868427753448486, "step": 612 }, { "epoch": 0.7300334946036472, "grad_norm": 5.003928184509277, "learning_rate": 9.270560190703217e-07, "logits/chosen": -2.023852825164795, "logits/rejected": -2.1254448890686035, "logps/chosen": -53.18684005737305, "logps/rejected": -81.44467163085938, "loss": 0.5881, "rewards/accuracies": 0.75, "rewards/chosen": 1.4836245775222778, "rewards/margins": 3.9411072731018066, "rewards/rejected": -2.4574828147888184, "step": 613 }, { "epoch": 0.7312244138444361, "grad_norm": 9.279070854187012, "learning_rate": 9.269368295589988e-07, "logits/chosen": -1.9889533519744873, "logits/rejected": -2.160715103149414, "logps/chosen": -61.20843505859375, "logps/rejected": -78.31036376953125, "loss": 0.6807, "rewards/accuracies": 0.84375, "rewards/chosen": 1.31057608127594, "rewards/margins": 3.2393555641174316, "rewards/rejected": -1.928779125213623, "step": 614 }, { "epoch": 0.7324153330852251, "grad_norm": 5.617761135101318, "learning_rate": 9.268176400476757e-07, "logits/chosen": -2.099531650543213, "logits/rejected": -2.1799731254577637, "logps/chosen": -56.09318161010742, "logps/rejected": -77.99337768554688, "loss": 0.6593, "rewards/accuracies": 0.78125, "rewards/chosen": 1.657498836517334, "rewards/margins": 3.021178722381592, "rewards/rejected": -1.3636798858642578, "step": 615 }, { "epoch": 0.7336062523260142, "grad_norm": 5.772714138031006, "learning_rate": 9.266984505363527e-07, "logits/chosen": -2.058978319168091, "logits/rejected": -2.1180572509765625, "logps/chosen": -59.54370880126953, "logps/rejected": -78.49557495117188, "loss": 0.6851, "rewards/accuracies": 0.75, "rewards/chosen": 1.5837297439575195, "rewards/margins": 3.3514015674591064, "rewards/rejected": -1.767672061920166, "step": 616 }, { "epoch": 0.7347971715668031, "grad_norm": 5.780079364776611, "learning_rate": 9.265792610250297e-07, "logits/chosen": -2.0533227920532227, "logits/rejected": -2.0842199325561523, "logps/chosen": -51.33500671386719, "logps/rejected": -76.80592346191406, "loss": 0.6646, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3203223943710327, "rewards/margins": 2.9530420303344727, "rewards/rejected": -1.6327195167541504, "step": 617 }, { "epoch": 0.7359880908075921, "grad_norm": 6.434371471405029, "learning_rate": 9.264600715137068e-07, "logits/chosen": -2.1087565422058105, "logits/rejected": -2.183197498321533, "logps/chosen": -62.7166748046875, "logps/rejected": -78.16108703613281, "loss": 0.8167, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4955495595932007, "rewards/margins": 1.9237923622131348, "rewards/rejected": -1.4282429218292236, "step": 618 }, { "epoch": 0.737179010048381, "grad_norm": 6.303630828857422, "learning_rate": 9.263408820023836e-07, "logits/chosen": -2.0094494819641113, "logits/rejected": -2.110860824584961, "logps/chosen": -56.69042205810547, "logps/rejected": -71.39663696289062, "loss": 0.7868, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0772470235824585, "rewards/margins": 2.4478468894958496, "rewards/rejected": -1.370599627494812, "step": 619 }, { "epoch": 0.7383699292891701, "grad_norm": 5.615312576293945, "learning_rate": 9.262216924910607e-07, "logits/chosen": -2.067594289779663, "logits/rejected": -2.1751840114593506, "logps/chosen": -55.964725494384766, "logps/rejected": -75.47276306152344, "loss": 0.7148, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7447344064712524, "rewards/margins": 2.6430342197418213, "rewards/rejected": -0.8982996344566345, "step": 620 }, { "epoch": 0.7395608485299591, "grad_norm": 5.204858779907227, "learning_rate": 9.261025029797377e-07, "logits/chosen": -2.106569766998291, "logits/rejected": -2.0806078910827637, "logps/chosen": -53.91269302368164, "logps/rejected": -63.80672073364258, "loss": 0.8822, "rewards/accuracies": 0.59375, "rewards/chosen": 1.2086825370788574, "rewards/margins": 1.3115967512130737, "rewards/rejected": -0.10291427373886108, "step": 621 }, { "epoch": 0.740751767770748, "grad_norm": 4.285759449005127, "learning_rate": 9.259833134684148e-07, "logits/chosen": -2.035536527633667, "logits/rejected": -2.139349937438965, "logps/chosen": -55.12523651123047, "logps/rejected": -76.3583755493164, "loss": 0.6107, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8930113315582275, "rewards/margins": 3.9210267066955566, "rewards/rejected": -2.02801513671875, "step": 622 }, { "epoch": 0.741942687011537, "grad_norm": 4.228950500488281, "learning_rate": 9.258641239570917e-07, "logits/chosen": -2.0176031589508057, "logits/rejected": -2.210439682006836, "logps/chosen": -48.124473571777344, "logps/rejected": -87.73609161376953, "loss": 0.4281, "rewards/accuracies": 0.875, "rewards/chosen": 2.2269182205200195, "rewards/margins": 5.369488716125488, "rewards/rejected": -3.1425704956054688, "step": 623 }, { "epoch": 0.743133606252326, "grad_norm": 6.579267978668213, "learning_rate": 9.257449344457687e-07, "logits/chosen": -2.0140581130981445, "logits/rejected": -2.1151976585388184, "logps/chosen": -60.97420883178711, "logps/rejected": -77.07455444335938, "loss": 0.7723, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7476484775543213, "rewards/margins": 2.2869229316711426, "rewards/rejected": -1.5392743349075317, "step": 624 }, { "epoch": 0.744324525493115, "grad_norm": 6.218290328979492, "learning_rate": 9.256257449344457e-07, "logits/chosen": -2.039381504058838, "logits/rejected": -2.1495203971862793, "logps/chosen": -60.140403747558594, "logps/rejected": -77.96969604492188, "loss": 0.7049, "rewards/accuracies": 0.75, "rewards/chosen": 0.7456068992614746, "rewards/margins": 2.495666265487671, "rewards/rejected": -1.7500593662261963, "step": 625 }, { "epoch": 0.745515444733904, "grad_norm": 5.303410053253174, "learning_rate": 9.255065554231227e-07, "logits/chosen": -2.018404722213745, "logits/rejected": -2.23787522315979, "logps/chosen": -51.24806594848633, "logps/rejected": -83.53851318359375, "loss": 0.5224, "rewards/accuracies": 0.8125, "rewards/chosen": 2.317352771759033, "rewards/margins": 4.907583236694336, "rewards/rejected": -2.5902304649353027, "step": 626 }, { "epoch": 0.7467063639746929, "grad_norm": 5.157975196838379, "learning_rate": 9.253873659117997e-07, "logits/chosen": -2.0183401107788086, "logits/rejected": -2.157519817352295, "logps/chosen": -51.9130859375, "logps/rejected": -69.22000122070312, "loss": 0.6444, "rewards/accuracies": 0.78125, "rewards/chosen": 2.413715124130249, "rewards/margins": 3.552999973297119, "rewards/rejected": -1.1392847299575806, "step": 627 }, { "epoch": 0.747897283215482, "grad_norm": 4.6381707191467285, "learning_rate": 9.252681764004767e-07, "logits/chosen": -2.0786540508270264, "logits/rejected": -2.226592540740967, "logps/chosen": -63.45248794555664, "logps/rejected": -80.62217712402344, "loss": 0.7322, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4244003891944885, "rewards/margins": 2.540653944015503, "rewards/rejected": -2.116253614425659, "step": 628 }, { "epoch": 0.749088202456271, "grad_norm": 6.246036529541016, "learning_rate": 9.251489868891536e-07, "logits/chosen": -2.08960223197937, "logits/rejected": -2.128757953643799, "logps/chosen": -63.30988693237305, "logps/rejected": -71.64029693603516, "loss": 0.87, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4171410799026489, "rewards/margins": 1.3592671155929565, "rewards/rejected": -0.9421262145042419, "step": 629 }, { "epoch": 0.7502791216970599, "grad_norm": 8.459240913391113, "learning_rate": 9.250297973778307e-07, "logits/chosen": -2.0337512493133545, "logits/rejected": -2.078925848007202, "logps/chosen": -53.64933395385742, "logps/rejected": -70.37319946289062, "loss": 0.7252, "rewards/accuracies": 0.71875, "rewards/chosen": 1.8058099746704102, "rewards/margins": 2.6793458461761475, "rewards/rejected": -0.8735360503196716, "step": 630 }, { "epoch": 0.7514700409378489, "grad_norm": 3.7448720932006836, "learning_rate": 9.249106078665077e-07, "logits/chosen": -2.0358550548553467, "logits/rejected": -2.1451199054718018, "logps/chosen": -49.267704010009766, "logps/rejected": -80.26771545410156, "loss": 0.5339, "rewards/accuracies": 0.78125, "rewards/chosen": 2.4472832679748535, "rewards/margins": 4.351206302642822, "rewards/rejected": -1.903923511505127, "step": 631 }, { "epoch": 0.7526609601786379, "grad_norm": 7.201323509216309, "learning_rate": 9.247914183551848e-07, "logits/chosen": -1.9911574125289917, "logits/rejected": -2.130922317504883, "logps/chosen": -48.80897521972656, "logps/rejected": -78.35938262939453, "loss": 0.5414, "rewards/accuracies": 0.78125, "rewards/chosen": 2.4373152256011963, "rewards/margins": 4.74107027053833, "rewards/rejected": -2.303755283355713, "step": 632 }, { "epoch": 0.7538518794194269, "grad_norm": 5.176777362823486, "learning_rate": 9.246722288438616e-07, "logits/chosen": -2.1018359661102295, "logits/rejected": -2.249817371368408, "logps/chosen": -66.02680969238281, "logps/rejected": -92.39988708496094, "loss": 0.6266, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5397177338600159, "rewards/margins": 3.5683236122131348, "rewards/rejected": -3.0286059379577637, "step": 633 }, { "epoch": 0.7550427986602158, "grad_norm": 7.065025806427002, "learning_rate": 9.245530393325387e-07, "logits/chosen": -2.0086934566497803, "logits/rejected": -2.115095376968384, "logps/chosen": -57.615962982177734, "logps/rejected": -74.00410461425781, "loss": 0.6669, "rewards/accuracies": 0.71875, "rewards/chosen": 1.8736319541931152, "rewards/margins": 3.4184186458587646, "rewards/rejected": -1.5447869300842285, "step": 634 }, { "epoch": 0.7562337179010048, "grad_norm": 5.039332389831543, "learning_rate": 9.244338498212157e-07, "logits/chosen": -2.024085760116577, "logits/rejected": -2.1545636653900146, "logps/chosen": -58.39849853515625, "logps/rejected": -76.64482879638672, "loss": 0.6813, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2591276168823242, "rewards/margins": 2.960599660873413, "rewards/rejected": -1.7014720439910889, "step": 635 }, { "epoch": 0.7574246371417939, "grad_norm": 8.557599067687988, "learning_rate": 9.243146603098927e-07, "logits/chosen": -2.0317678451538086, "logits/rejected": -2.167958974838257, "logps/chosen": -53.40253829956055, "logps/rejected": -84.37836456298828, "loss": 0.6006, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8050668239593506, "rewards/margins": 3.7188334465026855, "rewards/rejected": -1.9137669801712036, "step": 636 }, { "epoch": 0.7586155563825828, "grad_norm": 5.147088527679443, "learning_rate": 9.241954707985696e-07, "logits/chosen": -2.086437225341797, "logits/rejected": -2.2115800380706787, "logps/chosen": -60.890052795410156, "logps/rejected": -88.153564453125, "loss": 0.6338, "rewards/accuracies": 0.6875, "rewards/chosen": 1.048736810684204, "rewards/margins": 3.440852165222168, "rewards/rejected": -2.392115354537964, "step": 637 }, { "epoch": 0.7598064756233718, "grad_norm": 7.465814590454102, "learning_rate": 9.240762812872467e-07, "logits/chosen": -1.9993630647659302, "logits/rejected": -2.1479032039642334, "logps/chosen": -62.10674285888672, "logps/rejected": -77.00485229492188, "loss": 0.7479, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0626033544540405, "rewards/margins": 2.6169490814208984, "rewards/rejected": -1.5543458461761475, "step": 638 }, { "epoch": 0.7609973948641607, "grad_norm": 4.744126796722412, "learning_rate": 9.239570917759236e-07, "logits/chosen": -2.035052537918091, "logits/rejected": -2.16973614692688, "logps/chosen": -46.93419647216797, "logps/rejected": -79.49308013916016, "loss": 0.5022, "rewards/accuracies": 0.90625, "rewards/chosen": 2.4924936294555664, "rewards/margins": 4.6742167472839355, "rewards/rejected": -2.1817233562469482, "step": 639 }, { "epoch": 0.7621883141049498, "grad_norm": 5.5098419189453125, "learning_rate": 9.238379022646007e-07, "logits/chosen": -1.979839563369751, "logits/rejected": -2.1810264587402344, "logps/chosen": -58.51254653930664, "logps/rejected": -85.18621826171875, "loss": 0.5585, "rewards/accuracies": 0.875, "rewards/chosen": 0.8218082189559937, "rewards/margins": 4.114978790283203, "rewards/rejected": -3.2931711673736572, "step": 640 }, { "epoch": 0.7633792333457388, "grad_norm": 5.870126724243164, "learning_rate": 9.237187127532776e-07, "logits/chosen": -2.015373706817627, "logits/rejected": -2.129627227783203, "logps/chosen": -53.26652526855469, "logps/rejected": -76.93531799316406, "loss": 0.6555, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3327374458312988, "rewards/margins": 3.284653425216675, "rewards/rejected": -1.9519156217575073, "step": 641 }, { "epoch": 0.7645701525865277, "grad_norm": 6.779722213745117, "learning_rate": 9.235995232419546e-07, "logits/chosen": -2.0172548294067383, "logits/rejected": -2.191756248474121, "logps/chosen": -54.68974304199219, "logps/rejected": -88.62168884277344, "loss": 0.5069, "rewards/accuracies": 0.84375, "rewards/chosen": 1.927128553390503, "rewards/margins": 4.963258266448975, "rewards/rejected": -3.0361292362213135, "step": 642 }, { "epoch": 0.7657610718273167, "grad_norm": 6.578105926513672, "learning_rate": 9.234803337306316e-07, "logits/chosen": -2.061282157897949, "logits/rejected": -2.2319912910461426, "logps/chosen": -54.32838821411133, "logps/rejected": -79.12886810302734, "loss": 0.6369, "rewards/accuracies": 0.65625, "rewards/chosen": 1.7015914916992188, "rewards/margins": 3.921652317047119, "rewards/rejected": -2.2200605869293213, "step": 643 }, { "epoch": 0.7669519910681057, "grad_norm": 5.823929309844971, "learning_rate": 9.233611442193087e-07, "logits/chosen": -2.117051839828491, "logits/rejected": -2.21502685546875, "logps/chosen": -54.379032135009766, "logps/rejected": -78.04922485351562, "loss": 0.6198, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4274952411651611, "rewards/margins": 3.66121768951416, "rewards/rejected": -2.233722448348999, "step": 644 }, { "epoch": 0.7681429103088947, "grad_norm": 6.609743595123291, "learning_rate": 9.232419547079857e-07, "logits/chosen": -2.0710055828094482, "logits/rejected": -2.2554166316986084, "logps/chosen": -53.34526062011719, "logps/rejected": -81.31503295898438, "loss": 0.5666, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7964363098144531, "rewards/margins": 4.316765785217285, "rewards/rejected": -2.5203299522399902, "step": 645 }, { "epoch": 0.7693338295496837, "grad_norm": 6.070062160491943, "learning_rate": 9.231227651966626e-07, "logits/chosen": -1.9953184127807617, "logits/rejected": -2.1491310596466064, "logps/chosen": -58.00665283203125, "logps/rejected": -86.60801696777344, "loss": 0.6251, "rewards/accuracies": 0.75, "rewards/chosen": 1.377376675605774, "rewards/margins": 4.334383010864258, "rewards/rejected": -2.9570066928863525, "step": 646 }, { "epoch": 0.7705247487904726, "grad_norm": 5.83568000793457, "learning_rate": 9.230035756853396e-07, "logits/chosen": -2.0467448234558105, "logits/rejected": -2.157357931137085, "logps/chosen": -60.451560974121094, "logps/rejected": -76.16677856445312, "loss": 0.6988, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0034230947494507, "rewards/margins": 3.318706512451172, "rewards/rejected": -2.3152832984924316, "step": 647 }, { "epoch": 0.7717156680312617, "grad_norm": 4.986474990844727, "learning_rate": 9.228843861740167e-07, "logits/chosen": -2.035411834716797, "logits/rejected": -2.1186370849609375, "logps/chosen": -64.96098327636719, "logps/rejected": -84.17206573486328, "loss": 0.7351, "rewards/accuracies": 0.75, "rewards/chosen": 0.6200524568557739, "rewards/margins": 2.876138210296631, "rewards/rejected": -2.2560858726501465, "step": 648 }, { "epoch": 0.7729065872720506, "grad_norm": 5.9861836433410645, "learning_rate": 9.227651966626936e-07, "logits/chosen": -1.966442584991455, "logits/rejected": -2.066636800765991, "logps/chosen": -55.53995132446289, "logps/rejected": -74.72180938720703, "loss": 0.7014, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8769949078559875, "rewards/margins": 2.7230770587921143, "rewards/rejected": -1.846082091331482, "step": 649 }, { "epoch": 0.7740975065128396, "grad_norm": 5.224770545959473, "learning_rate": 9.226460071513706e-07, "logits/chosen": -2.050632953643799, "logits/rejected": -2.152020215988159, "logps/chosen": -59.0282096862793, "logps/rejected": -88.68195343017578, "loss": 0.6205, "rewards/accuracies": 0.75, "rewards/chosen": 0.8047587871551514, "rewards/margins": 3.535388946533203, "rewards/rejected": -2.7306301593780518, "step": 650 }, { "epoch": 0.7752884257536286, "grad_norm": 4.847574234008789, "learning_rate": 9.225268176400476e-07, "logits/chosen": -2.077760696411133, "logits/rejected": -2.198995590209961, "logps/chosen": -62.68391799926758, "logps/rejected": -90.92389678955078, "loss": 0.5759, "rewards/accuracies": 0.875, "rewards/chosen": 1.1712605953216553, "rewards/margins": 3.941704034805298, "rewards/rejected": -2.7704434394836426, "step": 651 }, { "epoch": 0.7764793449944176, "grad_norm": 6.098292350769043, "learning_rate": 9.224076281287246e-07, "logits/chosen": -2.1087656021118164, "logits/rejected": -2.138070821762085, "logps/chosen": -58.793575286865234, "logps/rejected": -73.67060089111328, "loss": 0.7403, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1691787242889404, "rewards/margins": 2.2391624450683594, "rewards/rejected": -1.069983720779419, "step": 652 }, { "epoch": 0.7776702642352066, "grad_norm": 4.72073221206665, "learning_rate": 9.222884386174016e-07, "logits/chosen": -2.014582633972168, "logits/rejected": -2.165205478668213, "logps/chosen": -58.50836181640625, "logps/rejected": -70.0895004272461, "loss": 0.8073, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9929172396659851, "rewards/margins": 2.21323823928833, "rewards/rejected": -1.2203208208084106, "step": 653 }, { "epoch": 0.7788611834759955, "grad_norm": 9.791297912597656, "learning_rate": 9.221692491060787e-07, "logits/chosen": -2.0800719261169434, "logits/rejected": -2.1209099292755127, "logps/chosen": -64.66854858398438, "logps/rejected": -73.04179382324219, "loss": 0.7868, "rewards/accuracies": 0.625, "rewards/chosen": 0.9242310523986816, "rewards/margins": 1.915661096572876, "rewards/rejected": -0.9914299845695496, "step": 654 }, { "epoch": 0.7800521027167845, "grad_norm": 8.33814525604248, "learning_rate": 9.220500595947555e-07, "logits/chosen": -1.9906415939331055, "logits/rejected": -2.0022010803222656, "logps/chosen": -56.38947296142578, "logps/rejected": -71.89482116699219, "loss": 0.792, "rewards/accuracies": 0.6875, "rewards/chosen": 1.4079070091247559, "rewards/margins": 2.2834513187408447, "rewards/rejected": -0.8755444884300232, "step": 655 }, { "epoch": 0.7812430219575734, "grad_norm": 5.887550354003906, "learning_rate": 9.219308700834326e-07, "logits/chosen": -2.1098010540008545, "logits/rejected": -2.3184497356414795, "logps/chosen": -67.50691223144531, "logps/rejected": -100.49127197265625, "loss": 0.5179, "rewards/accuracies": 0.71875, "rewards/chosen": 0.37169140577316284, "rewards/margins": 4.927766799926758, "rewards/rejected": -4.556074619293213, "step": 656 }, { "epoch": 0.7824339411983625, "grad_norm": 5.680395126342773, "learning_rate": 9.218116805721096e-07, "logits/chosen": -2.058759927749634, "logits/rejected": -2.1798818111419678, "logps/chosen": -66.96067810058594, "logps/rejected": -81.79708862304688, "loss": 0.6948, "rewards/accuracies": 0.78125, "rewards/chosen": 1.413511037826538, "rewards/margins": 3.3672537803649902, "rewards/rejected": -1.9537427425384521, "step": 657 }, { "epoch": 0.7836248604391515, "grad_norm": 7.423297882080078, "learning_rate": 9.216924910607867e-07, "logits/chosen": -2.013681650161743, "logits/rejected": -2.1366608142852783, "logps/chosen": -66.68406677246094, "logps/rejected": -85.52183532714844, "loss": 0.7255, "rewards/accuracies": 0.65625, "rewards/chosen": 0.47576332092285156, "rewards/margins": 3.0799763202667236, "rewards/rejected": -2.604212760925293, "step": 658 }, { "epoch": 0.7848157796799404, "grad_norm": 4.934439182281494, "learning_rate": 9.215733015494635e-07, "logits/chosen": -2.0476746559143066, "logits/rejected": -2.180467367172241, "logps/chosen": -66.86254119873047, "logps/rejected": -74.79537200927734, "loss": 0.7725, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2715948224067688, "rewards/margins": 1.7655384540557861, "rewards/rejected": -1.493943452835083, "step": 659 }, { "epoch": 0.7860066989207294, "grad_norm": 7.748165607452393, "learning_rate": 9.214541120381406e-07, "logits/chosen": -2.045229434967041, "logits/rejected": -2.1088497638702393, "logps/chosen": -69.33380126953125, "logps/rejected": -82.66470336914062, "loss": 0.8598, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23643268644809723, "rewards/margins": 2.006319761276245, "rewards/rejected": -1.7698872089385986, "step": 660 }, { "epoch": 0.7871976181615185, "grad_norm": 9.782285690307617, "learning_rate": 9.213349225268176e-07, "logits/chosen": -2.097456693649292, "logits/rejected": -2.055799961090088, "logps/chosen": -63.5713996887207, "logps/rejected": -73.68312072753906, "loss": 0.9273, "rewards/accuracies": 0.625, "rewards/chosen": 0.15229028463363647, "rewards/margins": 1.2522265911102295, "rewards/rejected": -1.0999362468719482, "step": 661 }, { "epoch": 0.7883885374023074, "grad_norm": 7.248333930969238, "learning_rate": 9.212157330154946e-07, "logits/chosen": -2.088657855987549, "logits/rejected": -2.161501169204712, "logps/chosen": -65.15269470214844, "logps/rejected": -81.30773162841797, "loss": 0.7417, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4531443417072296, "rewards/margins": 2.947922706604004, "rewards/rejected": -2.4947783946990967, "step": 662 }, { "epoch": 0.7895794566430964, "grad_norm": 8.911661148071289, "learning_rate": 9.210965435041715e-07, "logits/chosen": -2.0407357215881348, "logits/rejected": -2.127894639968872, "logps/chosen": -59.47509002685547, "logps/rejected": -78.92170715332031, "loss": 0.7223, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1010856628417969, "rewards/margins": 3.137827157974243, "rewards/rejected": -2.0367414951324463, "step": 663 }, { "epoch": 0.7907703758838853, "grad_norm": 6.581150054931641, "learning_rate": 9.209773539928486e-07, "logits/chosen": -2.0562074184417725, "logits/rejected": -2.163145065307617, "logps/chosen": -58.19085693359375, "logps/rejected": -73.210693359375, "loss": 0.7994, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2331854104995728, "rewards/margins": 2.187264919281006, "rewards/rejected": -0.9540796875953674, "step": 664 }, { "epoch": 0.7919612951246744, "grad_norm": 7.313786506652832, "learning_rate": 9.208581644815255e-07, "logits/chosen": -1.9856019020080566, "logits/rejected": -2.1972126960754395, "logps/chosen": -57.07604217529297, "logps/rejected": -92.74014282226562, "loss": 0.5866, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1897923946380615, "rewards/margins": 4.945401191711426, "rewards/rejected": -3.7556092739105225, "step": 665 }, { "epoch": 0.7931522143654633, "grad_norm": 7.6575422286987305, "learning_rate": 9.207389749702026e-07, "logits/chosen": -1.995926856994629, "logits/rejected": -2.1053359508514404, "logps/chosen": -62.64823913574219, "logps/rejected": -88.80012512207031, "loss": 0.6069, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1610356569290161, "rewards/margins": 3.940547227859497, "rewards/rejected": -2.7795116901397705, "step": 666 }, { "epoch": 0.7943431336062523, "grad_norm": 6.3281426429748535, "learning_rate": 9.206197854588796e-07, "logits/chosen": -1.9730287790298462, "logits/rejected": -2.176378011703491, "logps/chosen": -59.33301544189453, "logps/rejected": -96.47058868408203, "loss": 0.5081, "rewards/accuracies": 0.875, "rewards/chosen": 1.6205689907073975, "rewards/margins": 5.785343647003174, "rewards/rejected": -4.164774417877197, "step": 667 }, { "epoch": 0.7955340528470413, "grad_norm": 5.055420875549316, "learning_rate": 9.205005959475565e-07, "logits/chosen": -2.0802676677703857, "logits/rejected": -2.1757895946502686, "logps/chosen": -61.67909622192383, "logps/rejected": -83.07132720947266, "loss": 0.6376, "rewards/accuracies": 0.78125, "rewards/chosen": 1.111585259437561, "rewards/margins": 3.0826563835144043, "rewards/rejected": -1.9710713624954224, "step": 668 }, { "epoch": 0.7967249720878303, "grad_norm": 6.190169334411621, "learning_rate": 9.203814064362335e-07, "logits/chosen": -2.0440847873687744, "logits/rejected": -2.156343460083008, "logps/chosen": -58.36088562011719, "logps/rejected": -90.65571594238281, "loss": 0.5031, "rewards/accuracies": 0.875, "rewards/chosen": 1.3037174940109253, "rewards/margins": 4.98504638671875, "rewards/rejected": -3.681328773498535, "step": 669 }, { "epoch": 0.7979158913286193, "grad_norm": 7.507907867431641, "learning_rate": 9.202622169249106e-07, "logits/chosen": -1.9821484088897705, "logits/rejected": -2.101825475692749, "logps/chosen": -60.25959014892578, "logps/rejected": -86.30553436279297, "loss": 0.6777, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2188715934753418, "rewards/margins": 3.7973313331604004, "rewards/rejected": -2.5784599781036377, "step": 670 }, { "epoch": 0.7991068105694082, "grad_norm": 6.938436508178711, "learning_rate": 9.201430274135876e-07, "logits/chosen": -2.005141258239746, "logits/rejected": -2.1527328491210938, "logps/chosen": -61.21720886230469, "logps/rejected": -82.9426040649414, "loss": 0.6001, "rewards/accuracies": 0.8125, "rewards/chosen": 1.357237458229065, "rewards/margins": 3.9699275493621826, "rewards/rejected": -2.6126904487609863, "step": 671 }, { "epoch": 0.8002977298101972, "grad_norm": 6.498921871185303, "learning_rate": 9.200238379022645e-07, "logits/chosen": -2.1475820541381836, "logits/rejected": -2.142122983932495, "logps/chosen": -76.78697204589844, "logps/rejected": -81.92558288574219, "loss": 0.9853, "rewards/accuracies": 0.65625, "rewards/chosen": -0.992220401763916, "rewards/margins": 0.5924146175384521, "rewards/rejected": -1.5846350193023682, "step": 672 }, { "epoch": 0.8002977298101972, "eval_logits/chosen": -2.0738651752471924, "eval_logits/rejected": -2.186373233795166, "eval_logps/chosen": -66.11119842529297, "eval_logps/rejected": -84.8781509399414, "eval_loss": 0.6981044411659241, "eval_rewards/accuracies": 0.7461685538291931, "eval_rewards/chosen": 0.45989206433296204, "eval_rewards/margins": 2.965437173843384, "eval_rewards/rejected": -2.505545139312744, "eval_runtime": 1868.2364, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.279, "step": 672 } ], "logging_steps": 1.0, "max_steps": 8390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 168, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }