{ "best_metric": 0.9497246742248535, "best_model_checkpoint": "saves/Vicuna-7B-v1.5/lora/orpo-salt/checkpoint-1500", "epoch": 2.9969690846635686, "eval_steps": 500, "global_step": 1854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01616488179430188, "grad_norm": 0.3899887204170227, "learning_rate": 4.999648198770648e-06, "logits/chosen": -0.8260404467582703, "logits/rejected": -0.779380202293396, "logps/chosen": -1.0734994411468506, "logps/rejected": -1.2254035472869873, "loss": 1.146, "odds_ratio_loss": 0.7249619364738464, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1073499470949173, "rewards/margins": 0.01519041694700718, "rewards/rejected": -0.12254035472869873, "sft_loss": 1.0734994411468506, "step": 10 }, { "epoch": 0.03232976358860376, "grad_norm": 0.4923989176750183, "learning_rate": 4.998578646361359e-06, "logits/chosen": -0.7854002714157104, "logits/rejected": -0.781389594078064, "logps/chosen": -1.0866433382034302, "logps/rejected": -1.2551138401031494, "loss": 1.1535, "odds_ratio_loss": 0.668422520160675, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10866433382034302, "rewards/margins": 0.016847047954797745, "rewards/rejected": -0.12551137804985046, "sft_loss": 1.0866433382034302, "step": 20 }, { "epoch": 0.04849464538290564, "grad_norm": 0.7084988951683044, "learning_rate": 4.996791614004449e-06, "logits/chosen": -0.7559419274330139, "logits/rejected": -0.7485054731369019, "logps/chosen": -1.0929394960403442, "logps/rejected": -1.1501963138580322, "loss": 1.1699, "odds_ratio_loss": 0.7694913148880005, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.10929396003484726, "rewards/margins": 0.00572569016367197, "rewards/rejected": -0.11501964181661606, "sft_loss": 1.0929394960403442, "step": 30 }, { "epoch": 0.06465952717720752, "grad_norm": 0.8286219239234924, "learning_rate": 4.994287614855618e-06, "logits/chosen": -0.8193706274032593, "logits/rejected": -0.7897969484329224, "logps/chosen": -1.1362740993499756, "logps/rejected": -1.1394835710525513, "loss": 1.2171, "odds_ratio_loss": 0.808376133441925, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.11362739652395248, "rewards/margins": 0.0003209514543414116, "rewards/rejected": -0.11394836008548737, "sft_loss": 1.1362740993499756, "step": 40 }, { "epoch": 0.0808244089715094, "grad_norm": 0.537628173828125, "learning_rate": 4.991067367951343e-06, "logits/chosen": -0.7530331015586853, "logits/rejected": -0.7703112363815308, "logps/chosen": -1.0968067646026611, "logps/rejected": -1.1828521490097046, "loss": 1.1729, "odds_ratio_loss": 0.7610759735107422, "rewards/accuracies": 0.5, "rewards/chosen": -0.10968067497015, "rewards/margins": 0.00860452838242054, "rewards/rejected": -0.11828521639108658, "sft_loss": 1.0968067646026611, "step": 50 }, { "epoch": 0.09698929076581128, "grad_norm": 0.2992643415927887, "learning_rate": 4.987131798002389e-06, "logits/chosen": -0.7554941773414612, "logits/rejected": -0.7805821299552917, "logps/chosen": -1.120224118232727, "logps/rejected": -1.1958564519882202, "loss": 1.2007, "odds_ratio_loss": 0.804762065410614, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11202241480350494, "rewards/margins": 0.007563246879726648, "rewards/rejected": -0.11958565562963486, "sft_loss": 1.120224118232727, "step": 60 }, { "epoch": 0.11315417256011315, "grad_norm": 0.5207487940788269, "learning_rate": 4.982482035128285e-06, "logits/chosen": -0.7931987643241882, "logits/rejected": -0.7725004553794861, "logps/chosen": -1.158760666847229, "logps/rejected": -1.3085857629776, "loss": 1.2342, "odds_ratio_loss": 0.7545939683914185, "rewards/accuracies": 0.5, "rewards/chosen": -0.11587607860565186, "rewards/margins": 0.01498250663280487, "rewards/rejected": -0.13085858523845673, "sft_loss": 1.158760666847229, "step": 70 }, { "epoch": 0.12931905435441504, "grad_norm": 0.8179022669792175, "learning_rate": 4.9771194145328e-06, "logits/chosen": -0.7553219199180603, "logits/rejected": -0.7355794906616211, "logps/chosen": -0.9810718297958374, "logps/rejected": -1.1142699718475342, "loss": 1.0496, "odds_ratio_loss": 0.6851751208305359, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09810719639062881, "rewards/margins": 0.013319805264472961, "rewards/rejected": -0.11142698675394058, "sft_loss": 0.9810718297958374, "step": 80 }, { "epoch": 0.1454839361487169, "grad_norm": 0.5893221497535706, "learning_rate": 4.971045476120532e-06, "logits/chosen": -0.7767540216445923, "logits/rejected": -0.7691196203231812, "logps/chosen": -1.0343536138534546, "logps/rejected": -1.1126210689544678, "loss": 1.1086, "odds_ratio_loss": 0.7424803972244263, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10343535989522934, "rewards/margins": 0.007826738059520721, "rewards/rejected": -0.11126209795475006, "sft_loss": 1.0343536138534546, "step": 90 }, { "epoch": 0.1616488179430188, "grad_norm": 0.3746645748615265, "learning_rate": 4.964261964054713e-06, "logits/chosen": -0.749561607837677, "logits/rejected": -0.7426966428756714, "logps/chosen": -1.0808948278427124, "logps/rejected": -1.1608020067214966, "loss": 1.1637, "odds_ratio_loss": 0.8280612826347351, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10808948427438736, "rewards/margins": 0.007990716025233269, "rewards/rejected": -0.11608020961284637, "sft_loss": 1.0808948278427124, "step": 100 }, { "epoch": 0.17781369973732067, "grad_norm": 0.5266828536987305, "learning_rate": 4.956770826256372e-06, "logits/chosen": -0.7276872396469116, "logits/rejected": -0.7239276766777039, "logps/chosen": -1.0891507863998413, "logps/rejected": -1.188951015472412, "loss": 1.1606, "odds_ratio_loss": 0.7148129940032959, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10891509056091309, "rewards/margins": 0.009980013594031334, "rewards/rejected": -0.11889511346817017, "sft_loss": 1.0891507863998413, "step": 110 }, { "epoch": 0.19397858153162256, "grad_norm": 0.5117731690406799, "learning_rate": 4.94857421384497e-06, "logits/chosen": -0.7153638601303101, "logits/rejected": -0.7017214894294739, "logps/chosen": -1.0659247636795044, "logps/rejected": -1.1995283365249634, "loss": 1.1411, "odds_ratio_loss": 0.7518999576568604, "rewards/accuracies": 0.5, "rewards/chosen": -0.10659247636795044, "rewards/margins": 0.013360358774662018, "rewards/rejected": -0.11995282024145126, "sft_loss": 1.0659247636795044, "step": 120 }, { "epoch": 0.21014346332592443, "grad_norm": 0.3964090049266815, "learning_rate": 4.939674480520701e-06, "logits/chosen": -0.7281032800674438, "logits/rejected": -0.6757130026817322, "logps/chosen": -0.9924377202987671, "logps/rejected": -1.0807675123214722, "loss": 1.0644, "odds_ratio_loss": 0.7199574708938599, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09924378246068954, "rewards/margins": 0.008832980878651142, "rewards/rejected": -0.10807675123214722, "sft_loss": 0.9924377202987671, "step": 130 }, { "epoch": 0.2263083451202263, "grad_norm": 0.31593117117881775, "learning_rate": 4.930074181888613e-06, "logits/chosen": -0.6932573914527893, "logits/rejected": -0.6765223741531372, "logps/chosen": -1.011648416519165, "logps/rejected": -1.1101162433624268, "loss": 1.0811, "odds_ratio_loss": 0.6949580907821655, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10116484016180038, "rewards/margins": 0.009846789762377739, "rewards/rejected": -0.11101162433624268, "sft_loss": 1.011648416519165, "step": 140 }, { "epoch": 0.2424732269145282, "grad_norm": 0.7396884560585022, "learning_rate": 4.91977607472475e-06, "logits/chosen": -0.6414996981620789, "logits/rejected": -0.6007689237594604, "logps/chosen": -1.0180175304412842, "logps/rejected": -1.0574676990509033, "loss": 1.0929, "odds_ratio_loss": 0.748645544052124, "rewards/accuracies": 0.5, "rewards/chosen": -0.10180176794528961, "rewards/margins": 0.00394500233232975, "rewards/rejected": -0.10574676841497421, "sft_loss": 1.0180175304412842, "step": 150 }, { "epoch": 0.2586381087088301, "grad_norm": 0.5049052834510803, "learning_rate": 4.908783116184534e-06, "logits/chosen": -0.6661972403526306, "logits/rejected": -0.626873791217804, "logps/chosen": -0.953465461730957, "logps/rejected": -1.0835082530975342, "loss": 1.02, "odds_ratio_loss": 0.6655644178390503, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09534655511379242, "rewards/margins": 0.013004262931644917, "rewards/rejected": -0.10835081338882446, "sft_loss": 0.953465461730957, "step": 160 }, { "epoch": 0.27480299050313195, "grad_norm": 0.4969651699066162, "learning_rate": 4.897098462953598e-06, "logits/chosen": -0.5929690599441528, "logits/rejected": -0.6147447824478149, "logps/chosen": -0.9747630953788757, "logps/rejected": -1.1718312501907349, "loss": 1.0464, "odds_ratio_loss": 0.7164822220802307, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09747631102800369, "rewards/margins": 0.019706813618540764, "rewards/rejected": -0.11718311160802841, "sft_loss": 0.9747630953788757, "step": 170 }, { "epoch": 0.2909678722974338, "grad_norm": 0.37429389357566833, "learning_rate": 4.884725470341331e-06, "logits/chosen": -0.5573834180831909, "logits/rejected": -0.544479250907898, "logps/chosen": -0.8867887258529663, "logps/rejected": -1.1076356172561646, "loss": 0.9499, "odds_ratio_loss": 0.6307954788208008, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08867888152599335, "rewards/margins": 0.022084690630435944, "rewards/rejected": -0.1107635647058487, "sft_loss": 0.8867887258529663, "step": 180 }, { "epoch": 0.3071327540917357, "grad_norm": 1.2109434604644775, "learning_rate": 4.871667691317377e-06, "logits/chosen": -0.6222495436668396, "logits/rejected": -0.6174622774124146, "logps/chosen": -1.1702499389648438, "logps/rejected": -1.0528119802474976, "loss": 1.2649, "odds_ratio_loss": 0.9465614557266235, "rewards/accuracies": 0.40625, "rewards/chosen": -0.1170249953866005, "rewards/margins": -0.011743778362870216, "rewards/rejected": -0.10528121143579483, "sft_loss": 1.1702499389648438, "step": 190 }, { "epoch": 0.3232976358860376, "grad_norm": 1.5371562242507935, "learning_rate": 4.857928875491392e-06, "logits/chosen": -0.5464112162590027, "logits/rejected": -0.5513696670532227, "logps/chosen": -0.8908155560493469, "logps/rejected": -1.0076180696487427, "loss": 0.9612, "odds_ratio_loss": 0.7040323615074158, "rewards/accuracies": 0.5, "rewards/chosen": -0.08908155560493469, "rewards/margins": 0.011680259369313717, "rewards/rejected": -0.10076181590557098, "sft_loss": 0.8908155560493469, "step": 200 }, { "epoch": 0.33946251768033947, "grad_norm": 0.6159927845001221, "learning_rate": 4.843512968036314e-06, "logits/chosen": -0.6329461932182312, "logits/rejected": -0.592659592628479, "logps/chosen": -0.975503146648407, "logps/rejected": -0.9970613718032837, "loss": 1.0514, "odds_ratio_loss": 0.7591590881347656, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09755031019449234, "rewards/margins": 0.00215582805685699, "rewards/rejected": -0.09970613569021225, "sft_loss": 0.975503146648407, "step": 210 }, { "epoch": 0.35562739947464134, "grad_norm": 0.3111410439014435, "learning_rate": 4.828424108555486e-06, "logits/chosen": -0.5221891403198242, "logits/rejected": -0.5304391980171204, "logps/chosen": -1.1862733364105225, "logps/rejected": -1.2753493785858154, "loss": 1.2641, "odds_ratio_loss": 0.7783994674682617, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11862732470035553, "rewards/margins": 0.008907611481845379, "rewards/rejected": -0.12753494083881378, "sft_loss": 1.1862733364105225, "step": 220 }, { "epoch": 0.3717922812689432, "grad_norm": 0.301698237657547, "learning_rate": 4.812666629893957e-06, "logits/chosen": -0.4992770254611969, "logits/rejected": -0.4967115521430969, "logps/chosen": -0.9971933364868164, "logps/rejected": -1.0213407278060913, "loss": 1.0744, "odds_ratio_loss": 0.7721298933029175, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.09971933811903, "rewards/margins": 0.0024147380609065294, "rewards/rejected": -0.1021340861916542, "sft_loss": 0.9971933364868164, "step": 230 }, { "epoch": 0.3879571630632451, "grad_norm": 0.2967057526111603, "learning_rate": 4.796245056894273e-06, "logits/chosen": -0.5198571085929871, "logits/rejected": -0.4987764358520508, "logps/chosen": -0.9578666687011719, "logps/rejected": -1.0644018650054932, "loss": 1.0315, "odds_ratio_loss": 0.7367077469825745, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0957866758108139, "rewards/margins": 0.01065351627767086, "rewards/rejected": -0.10644018650054932, "sft_loss": 0.9578666687011719, "step": 240 }, { "epoch": 0.404122044857547, "grad_norm": 0.336041659116745, "learning_rate": 4.779164105097148e-06, "logits/chosen": -0.4748106002807617, "logits/rejected": -0.44636374711990356, "logps/chosen": -0.9247462153434753, "logps/rejected": -1.1018692255020142, "loss": 0.9923, "odds_ratio_loss": 0.6758453845977783, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09247462451457977, "rewards/margins": 0.017712296918034554, "rewards/rejected": -0.11018691956996918, "sft_loss": 0.9247462153434753, "step": 250 }, { "epoch": 0.42028692665184886, "grad_norm": 0.5222122669219971, "learning_rate": 4.761428679387373e-06, "logits/chosen": -0.46434497833251953, "logits/rejected": -0.4350043833255768, "logps/chosen": -0.8905488848686218, "logps/rejected": -1.0182609558105469, "loss": 0.9591, "odds_ratio_loss": 0.6853379011154175, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0890548899769783, "rewards/margins": 0.01277120690792799, "rewards/rejected": -0.10182609409093857, "sft_loss": 0.8905488848686218, "step": 260 }, { "epoch": 0.4364518084461507, "grad_norm": 0.5936411023139954, "learning_rate": 4.7430438725853515e-06, "logits/chosen": -0.48627519607543945, "logits/rejected": -0.4379982352256775, "logps/chosen": -0.9183929562568665, "logps/rejected": -1.1679961681365967, "loss": 0.984, "odds_ratio_loss": 0.6556900143623352, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09183929860591888, "rewards/margins": 0.024960322305560112, "rewards/rejected": -0.11679961532354355, "sft_loss": 0.9183929562568665, "step": 270 }, { "epoch": 0.4526166902404526, "grad_norm": 0.46239179372787476, "learning_rate": 4.724014963984669e-06, "logits/chosen": -0.4012899398803711, "logits/rejected": -0.411139577627182, "logps/chosen": -1.008721947669983, "logps/rejected": -1.2014849185943604, "loss": 1.0765, "odds_ratio_loss": 0.6780184507369995, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10087219625711441, "rewards/margins": 0.01927630603313446, "rewards/rejected": -0.12014850229024887, "sft_loss": 1.008721947669983, "step": 280 }, { "epoch": 0.4687815720347545, "grad_norm": 0.5760877132415771, "learning_rate": 4.704347417836116e-06, "logits/chosen": -0.4533885419368744, "logits/rejected": -0.46080097556114197, "logps/chosen": -0.9372620582580566, "logps/rejected": -1.1106752157211304, "loss": 1.0089, "odds_ratio_loss": 0.716440737247467, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09372620284557343, "rewards/margins": 0.01734132692217827, "rewards/rejected": -0.1110675185918808, "sft_loss": 0.9372620582580566, "step": 290 }, { "epoch": 0.4849464538290564, "grad_norm": 0.44260743260383606, "learning_rate": 4.684046881778603e-06, "logits/chosen": -0.5344091653823853, "logits/rejected": -0.49474531412124634, "logps/chosen": -0.9150590896606445, "logps/rejected": -1.0017120838165283, "loss": 0.9833, "odds_ratio_loss": 0.6827279329299927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09150592237710953, "rewards/margins": 0.008665294386446476, "rewards/rejected": -0.10017120838165283, "sft_loss": 0.9150590896606445, "step": 300 }, { "epoch": 0.5011113356233583, "grad_norm": 0.3225099742412567, "learning_rate": 4.663119185217409e-06, "logits/chosen": -0.43460625410079956, "logits/rejected": -0.4127863049507141, "logps/chosen": -0.8891846537590027, "logps/rejected": -1.0905497074127197, "loss": 0.954, "odds_ratio_loss": 0.6476849913597107, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08891846239566803, "rewards/margins": 0.020136509090662003, "rewards/rejected": -0.10905496776103973, "sft_loss": 0.8891846537590027, "step": 310 }, { "epoch": 0.5172762174176602, "grad_norm": 0.3512892723083496, "learning_rate": 4.641570337650232e-06, "logits/chosen": -0.43388432264328003, "logits/rejected": -0.39495667815208435, "logps/chosen": -0.8790934681892395, "logps/rejected": -0.9963566064834595, "loss": 0.9498, "odds_ratio_loss": 0.7069565057754517, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08790934085845947, "rewards/margins": 0.011726310476660728, "rewards/rejected": -0.09963564574718475, "sft_loss": 0.8790934681892395, "step": 320 }, { "epoch": 0.533441099211962, "grad_norm": 0.3520517349243164, "learning_rate": 4.61940652694154e-06, "logits/chosen": -0.45831650495529175, "logits/rejected": -0.4600452780723572, "logps/chosen": -0.9612126350402832, "logps/rejected": -1.0601940155029297, "loss": 1.0373, "odds_ratio_loss": 0.7606214880943298, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09612125903367996, "rewards/margins": 0.00989813357591629, "rewards/rejected": -0.10601940006017685, "sft_loss": 0.9612126350402832, "step": 330 }, { "epoch": 0.5496059810062639, "grad_norm": 0.42445889115333557, "learning_rate": 4.596634117545689e-06, "logits/chosen": -0.3920242190361023, "logits/rejected": -0.41387075185775757, "logps/chosen": -0.9238036274909973, "logps/rejected": -1.0761339664459229, "loss": 0.9917, "odds_ratio_loss": 0.6789978742599487, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09238035976886749, "rewards/margins": 0.015233027748763561, "rewards/rejected": -0.10761336982250214, "sft_loss": 0.9238036274909973, "step": 340 }, { "epoch": 0.5657708628005658, "grad_norm": 0.3377890884876251, "learning_rate": 4.573259648679335e-06, "logits/chosen": -0.39150765538215637, "logits/rejected": -0.4451742172241211, "logps/chosen": -0.9269700050354004, "logps/rejected": -1.077823281288147, "loss": 0.9987, "odds_ratio_loss": 0.7173791527748108, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09269699454307556, "rewards/margins": 0.01508533675223589, "rewards/rejected": -0.10778234153985977, "sft_loss": 0.9269700050354004, "step": 350 }, { "epoch": 0.5819357445948676, "grad_norm": 0.9352906942367554, "learning_rate": 4.549289832443663e-06, "logits/chosen": -0.39780086278915405, "logits/rejected": -0.3602847754955292, "logps/chosen": -0.9020577669143677, "logps/rejected": -1.0630056858062744, "loss": 0.9737, "odds_ratio_loss": 0.7168340682983398, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09020576626062393, "rewards/margins": 0.01609480008482933, "rewards/rejected": -0.10630057752132416, "sft_loss": 0.9020577669143677, "step": 360 }, { "epoch": 0.5981006263891695, "grad_norm": 0.3642963469028473, "learning_rate": 4.524731551896978e-06, "logits/chosen": -0.4040652811527252, "logits/rejected": -0.39201897382736206, "logps/chosen": -0.822562038898468, "logps/rejected": -0.9484196901321411, "loss": 0.8918, "odds_ratio_loss": 0.6919523477554321, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08225620537996292, "rewards/margins": 0.012585763819515705, "rewards/rejected": -0.09484197199344635, "sft_loss": 0.822562038898468, "step": 370 }, { "epoch": 0.6142655081834714, "grad_norm": 0.9358541965484619, "learning_rate": 4.4995918590781925e-06, "logits/chosen": -0.41558751463890076, "logits/rejected": -0.39345669746398926, "logps/chosen": -0.9379288554191589, "logps/rejected": -1.0011296272277832, "loss": 1.0132, "odds_ratio_loss": 0.7530064582824707, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0937928855419159, "rewards/margins": 0.006320066750049591, "rewards/rejected": -0.10011295974254608, "sft_loss": 0.9379288554191589, "step": 380 }, { "epoch": 0.6304303899777733, "grad_norm": 0.42754364013671875, "learning_rate": 4.473877972981797e-06, "logits/chosen": -0.4294399321079254, "logits/rejected": -0.48693591356277466, "logps/chosen": -0.9050455093383789, "logps/rejected": -1.0990797281265259, "loss": 0.9681, "odds_ratio_loss": 0.6305026412010193, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09050454199314117, "rewards/margins": 0.019403431564569473, "rewards/rejected": -0.10990796983242035, "sft_loss": 0.9050455093383789, "step": 390 }, { "epoch": 0.6465952717720752, "grad_norm": 0.3870018422603607, "learning_rate": 4.447597277484894e-06, "logits/chosen": -0.41894254088401794, "logits/rejected": -0.3863012492656708, "logps/chosen": -0.9011236429214478, "logps/rejected": -1.011643648147583, "loss": 0.971, "odds_ratio_loss": 0.6992276906967163, "rewards/accuracies": 0.5, "rewards/chosen": -0.0901123657822609, "rewards/margins": 0.011052015237510204, "rewards/rejected": -0.10116437822580338, "sft_loss": 0.9011236429214478, "step": 400 }, { "epoch": 0.6627601535663771, "grad_norm": 0.6716357469558716, "learning_rate": 4.42075731922687e-06, "logits/chosen": -0.381665974855423, "logits/rejected": -0.40627461671829224, "logps/chosen": -0.9860145449638367, "logps/rejected": -1.0734965801239014, "loss": 1.0559, "odds_ratio_loss": 0.6987608671188354, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09860144555568695, "rewards/margins": 0.008748206309974194, "rewards/rejected": -0.10734964907169342, "sft_loss": 0.9860145449638367, "step": 410 }, { "epoch": 0.6789250353606789, "grad_norm": 0.4379284083843231, "learning_rate": 4.3933658054423465e-06, "logits/chosen": -0.42450767755508423, "logits/rejected": -0.4302968978881836, "logps/chosen": -0.8682054281234741, "logps/rejected": -1.0158107280731201, "loss": 0.9348, "odds_ratio_loss": 0.6656124591827393, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0868205577135086, "rewards/margins": 0.014760518446564674, "rewards/rejected": -0.10158105939626694, "sft_loss": 0.8682054281234741, "step": 420 }, { "epoch": 0.6950899171549808, "grad_norm": 0.4341568052768707, "learning_rate": 4.365430601748003e-06, "logits/chosen": -0.3941816985607147, "logits/rejected": -0.349882036447525, "logps/chosen": -0.9646803140640259, "logps/rejected": -1.0113680362701416, "loss": 1.0372, "odds_ratio_loss": 0.7253597974777222, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09646803140640259, "rewards/margins": 0.004668788518756628, "rewards/rejected": -0.10113681852817535, "sft_loss": 0.9646803140640259, "step": 430 }, { "epoch": 0.7112547989492827, "grad_norm": 1.7109006643295288, "learning_rate": 4.336959729883925e-06, "logits/chosen": -0.37049371004104614, "logits/rejected": -0.3737342953681946, "logps/chosen": -0.9116461873054504, "logps/rejected": -0.9422439336776733, "loss": 0.9849, "odds_ratio_loss": 0.7329493165016174, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09116461873054504, "rewards/margins": 0.0030597783625125885, "rewards/rejected": -0.09422439336776733, "sft_loss": 0.9116461873054504, "step": 440 }, { "epoch": 0.7274196807435845, "grad_norm": 0.4295767843723297, "learning_rate": 4.307961365410118e-06, "logits/chosen": -0.46054011583328247, "logits/rejected": -0.4506424069404602, "logps/chosen": -0.904135525226593, "logps/rejected": -0.965890109539032, "loss": 0.9756, "odds_ratio_loss": 0.7150284051895142, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09041355550289154, "rewards/margins": 0.0061754509806632996, "rewards/rejected": -0.09658900648355484, "sft_loss": 0.904135525226593, "step": 450 }, { "epoch": 0.7435845625378864, "grad_norm": 0.7776443958282471, "learning_rate": 4.278443835358854e-06, "logits/chosen": -0.3951818645000458, "logits/rejected": -0.4040835499763489, "logps/chosen": -0.8823555707931519, "logps/rejected": -1.1062017679214478, "loss": 0.9449, "odds_ratio_loss": 0.6257806420326233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08823557198047638, "rewards/margins": 0.022384602576494217, "rewards/rejected": -0.1106201782822609, "sft_loss": 0.8823555707931519, "step": 460 }, { "epoch": 0.7597494443321883, "grad_norm": 0.37953025102615356, "learning_rate": 4.248415615843523e-06, "logits/chosen": -0.376980721950531, "logits/rejected": -0.40178006887435913, "logps/chosen": -0.9119707345962524, "logps/rejected": -0.9874213933944702, "loss": 0.9817, "odds_ratio_loss": 0.6976627111434937, "rewards/accuracies": 0.5, "rewards/chosen": -0.09119707345962524, "rewards/margins": 0.007545073516666889, "rewards/rejected": -0.09874214231967926, "sft_loss": 0.9119707345962524, "step": 470 }, { "epoch": 0.7759143261264903, "grad_norm": 0.5314805507659912, "learning_rate": 4.217885329624666e-06, "logits/chosen": -0.3499462604522705, "logits/rejected": -0.33436357975006104, "logps/chosen": -0.876055121421814, "logps/rejected": -1.064893126487732, "loss": 0.9413, "odds_ratio_loss": 0.6526578068733215, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08760551363229752, "rewards/margins": 0.018883811309933662, "rewards/rejected": -0.10648931562900543, "sft_loss": 0.876055121421814, "step": 480 }, { "epoch": 0.7920792079207921, "grad_norm": 0.40282461047172546, "learning_rate": 4.186861743633911e-06, "logits/chosen": -0.41591471433639526, "logits/rejected": -0.4058813154697418, "logps/chosen": -0.8972100019454956, "logps/rejected": -1.093335509300232, "loss": 0.9699, "odds_ratio_loss": 0.7265552282333374, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08972100913524628, "rewards/margins": 0.01961255446076393, "rewards/rejected": -0.10933355987071991, "sft_loss": 0.8972100019454956, "step": 490 }, { "epoch": 0.808244089715094, "grad_norm": 0.43431738018989563, "learning_rate": 4.155353766456497e-06, "logits/chosen": -0.30508697032928467, "logits/rejected": -0.3136020302772522, "logps/chosen": -0.9303945302963257, "logps/rejected": -1.0141643285751343, "loss": 1.0008, "odds_ratio_loss": 0.7037394046783447, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09303945302963257, "rewards/margins": 0.00837697833776474, "rewards/rejected": -0.1014164462685585, "sft_loss": 0.9303945302963257, "step": 500 }, { "epoch": 0.808244089715094, "eval_logits/chosen": -0.3878052830696106, "eval_logits/rejected": -0.3689490258693695, "eval_logps/chosen": -0.9066087007522583, "eval_logps/rejected": -1.0192701816558838, "eval_loss": 0.9776538014411926, "eval_odds_ratio_loss": 0.710451602935791, "eval_rewards/accuracies": 0.5054545402526855, "eval_rewards/chosen": -0.09066087007522583, "eval_rewards/margins": 0.011266152374446392, "eval_rewards/rejected": -0.1019270196557045, "eval_runtime": 192.2826, "eval_samples_per_second": 5.721, "eval_sft_loss": 0.9066087007522583, "eval_steps_per_second": 2.86, "step": 500 }, { "epoch": 0.8244089715093958, "grad_norm": 0.3983856737613678, "learning_rate": 4.123370445773134e-06, "logits/chosen": -0.344710111618042, "logits/rejected": -0.3169902563095093, "logps/chosen": -0.8998648524284363, "logps/rejected": -0.9106130599975586, "loss": 0.975, "odds_ratio_loss": 0.7513402700424194, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08998648822307587, "rewards/margins": 0.0010748239001259208, "rewards/rejected": -0.0910613164305687, "sft_loss": 0.8998648524284363, "step": 510 }, { "epoch": 0.8405738533036977, "grad_norm": 1.2135205268859863, "learning_rate": 4.090920965761906e-06, "logits/chosen": -0.3448580205440521, "logits/rejected": -0.3548375964164734, "logps/chosen": -0.9812738299369812, "logps/rejected": -1.0694336891174316, "loss": 1.0535, "odds_ratio_loss": 0.7224698662757874, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09812740236520767, "rewards/margins": 0.00881598424166441, "rewards/rejected": -0.10694338381290436, "sft_loss": 0.9812738299369812, "step": 520 }, { "epoch": 0.8567387350979996, "grad_norm": 0.9130859375, "learning_rate": 4.058014644460991e-06, "logits/chosen": -0.34060588479042053, "logits/rejected": -0.3562433123588562, "logps/chosen": -0.9648042917251587, "logps/rejected": -1.0603010654449463, "loss": 1.032, "odds_ratio_loss": 0.6720489859580994, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09648042917251587, "rewards/margins": 0.009549676440656185, "rewards/rejected": -0.10603010654449463, "sft_loss": 0.9648042917251587, "step": 530 }, { "epoch": 0.8729036168923014, "grad_norm": 0.6945879459381104, "learning_rate": 4.024660931092939e-06, "logits/chosen": -0.39998704195022583, "logits/rejected": -0.39360350370407104, "logps/chosen": -0.8902137875556946, "logps/rejected": -1.0513432025909424, "loss": 0.9562, "odds_ratio_loss": 0.6595617532730103, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08902137726545334, "rewards/margins": 0.016112947836518288, "rewards/rejected": -0.10513432323932648, "sft_loss": 0.8902137875556946, "step": 540 }, { "epoch": 0.8890684986866033, "grad_norm": 0.45378220081329346, "learning_rate": 3.990869403351272e-06, "logits/chosen": -0.3531869053840637, "logits/rejected": -0.38131508231163025, "logps/chosen": -0.9068384170532227, "logps/rejected": -1.065394639968872, "loss": 0.9704, "odds_ratio_loss": 0.635545015335083, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09068384021520615, "rewards/margins": 0.01585562154650688, "rewards/rejected": -0.10653946548700333, "sft_loss": 0.9068384170532227, "step": 550 }, { "epoch": 0.9052333804809052, "grad_norm": 0.5512678623199463, "learning_rate": 3.956649764650206e-06, "logits/chosen": -0.29515427350997925, "logits/rejected": -0.31435275077819824, "logps/chosen": -0.9203943014144897, "logps/rejected": -1.0603986978530884, "loss": 0.9918, "odds_ratio_loss": 0.7142159938812256, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09203943610191345, "rewards/margins": 0.014000418595969677, "rewards/rejected": -0.106039859354496, "sft_loss": 0.9203943014144897, "step": 560 }, { "epoch": 0.9213982622752072, "grad_norm": 0.5750080347061157, "learning_rate": 3.92201184133826e-06, "logits/chosen": -0.3182484209537506, "logits/rejected": -0.3164721131324768, "logps/chosen": -0.8570343255996704, "logps/rejected": -1.0225125551223755, "loss": 0.922, "odds_ratio_loss": 0.6495530009269714, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08570344746112823, "rewards/margins": 0.01654782146215439, "rewards/rejected": -0.10225125402212143, "sft_loss": 0.8570343255996704, "step": 570 }, { "epoch": 0.937563144069509, "grad_norm": 0.5823240876197815, "learning_rate": 3.886965579876572e-06, "logits/chosen": -0.307335764169693, "logits/rejected": -0.331511914730072, "logps/chosen": -0.8535898923873901, "logps/rejected": -0.9173160791397095, "loss": 0.9234, "odds_ratio_loss": 0.6983198523521423, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08535899966955185, "rewards/margins": 0.006372606847435236, "rewards/rejected": -0.09173160046339035, "sft_loss": 0.8535898923873901, "step": 580 }, { "epoch": 0.9537280258638109, "grad_norm": 0.3793308734893799, "learning_rate": 3.851521043982716e-06, "logits/chosen": -0.3546546399593353, "logits/rejected": -0.3105318248271942, "logps/chosen": -0.9257644414901733, "logps/rejected": -0.994279682636261, "loss": 0.9977, "odds_ratio_loss": 0.7192004919052124, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09257644414901733, "rewards/margins": 0.006851526442915201, "rewards/rejected": -0.0994279757142067, "sft_loss": 0.9257644414901733, "step": 590 }, { "epoch": 0.9698929076581128, "grad_norm": 0.5789406895637512, "learning_rate": 3.81568841174086e-06, "logits/chosen": -0.39430108666419983, "logits/rejected": -0.38088011741638184, "logps/chosen": -0.8874362111091614, "logps/rejected": -1.0097267627716064, "loss": 0.9592, "odds_ratio_loss": 0.7179639935493469, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08874362707138062, "rewards/margins": 0.012229054234921932, "rewards/rejected": -0.10097268968820572, "sft_loss": 0.8874362111091614, "step": 600 }, { "epoch": 0.9860577894524146, "grad_norm": 0.4505593478679657, "learning_rate": 3.7794779726790664e-06, "logits/chosen": -0.4197085499763489, "logits/rejected": -0.3544057607650757, "logps/chosen": -0.8556501269340515, "logps/rejected": -0.9688836336135864, "loss": 0.9233, "odds_ratio_loss": 0.6760933995246887, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0855650082230568, "rewards/margins": 0.011323352344334126, "rewards/rejected": -0.09688836336135864, "sft_loss": 0.8556501269340515, "step": 610 }, { "epoch": 1.0022226712467166, "grad_norm": 0.41950830817222595, "learning_rate": 3.7429001248146096e-06, "logits/chosen": -0.3560163080692291, "logits/rejected": -0.32193905115127563, "logps/chosen": -0.8660818934440613, "logps/rejected": -1.0638062953948975, "loss": 0.9302, "odds_ratio_loss": 0.6412297487258911, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08660819381475449, "rewards/margins": 0.019772443920373917, "rewards/rejected": -0.10638062655925751, "sft_loss": 0.8660818934440613, "step": 620 }, { "epoch": 1.0183875530410185, "grad_norm": 0.30259978771209717, "learning_rate": 3.7059653716681227e-06, "logits/chosen": -0.3218996524810791, "logits/rejected": -0.3514016568660736, "logps/chosen": -0.9751222729682922, "logps/rejected": -1.1278547048568726, "loss": 1.046, "odds_ratio_loss": 0.7084661722183228, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09751223772764206, "rewards/margins": 0.015273240394890308, "rewards/rejected": -0.11278548091650009, "sft_loss": 0.9751222729682922, "step": 630 }, { "epoch": 1.0345524348353203, "grad_norm": 1.449523687362671, "learning_rate": 3.668684319247463e-06, "logits/chosen": -0.3402321934700012, "logits/rejected": -0.3320569396018982, "logps/chosen": -0.8782706260681152, "logps/rejected": -1.0504738092422485, "loss": 0.9434, "odds_ratio_loss": 0.651136040687561, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08782706409692764, "rewards/margins": 0.01722031459212303, "rewards/rejected": -0.10504738241434097, "sft_loss": 0.8782706260681152, "step": 640 }, { "epoch": 1.0507173166296222, "grad_norm": 0.36652296781539917, "learning_rate": 3.6310676730021373e-06, "logits/chosen": -0.3392433524131775, "logits/rejected": -0.3268556296825409, "logps/chosen": -0.8789156079292297, "logps/rejected": -0.9153023958206177, "loss": 0.9515, "odds_ratio_loss": 0.7262720465660095, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08789155632257462, "rewards/margins": 0.0036386798601597548, "rewards/rejected": -0.09153024852275848, "sft_loss": 0.8789156079292297, "step": 650 }, { "epoch": 1.066882198423924, "grad_norm": 0.42644253373146057, "learning_rate": 3.593126234749178e-06, "logits/chosen": -0.35958123207092285, "logits/rejected": -0.33439984917640686, "logps/chosen": -0.9317266345024109, "logps/rejected": -0.9812437891960144, "loss": 1.004, "odds_ratio_loss": 0.7226861119270325, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09317266196012497, "rewards/margins": 0.0049517154693603516, "rewards/rejected": -0.09812436997890472, "sft_loss": 0.9317266345024109, "step": 660 }, { "epoch": 1.083047080218226, "grad_norm": 0.5300435423851013, "learning_rate": 3.554870899571343e-06, "logits/chosen": -0.4070967137813568, "logits/rejected": -0.38338038325309753, "logps/chosen": -0.9088705778121948, "logps/rejected": -1.0065948963165283, "loss": 0.9774, "odds_ratio_loss": 0.6850352883338928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09088706225156784, "rewards/margins": 0.009772435761988163, "rewards/rejected": -0.10065948963165283, "sft_loss": 0.9088705778121948, "step": 670 }, { "epoch": 1.0992119620125278, "grad_norm": 1.5718979835510254, "learning_rate": 3.5163126526885373e-06, "logits/chosen": -0.3708317279815674, "logits/rejected": -0.3510357737541199, "logps/chosen": -0.8702448606491089, "logps/rejected": -0.9972399473190308, "loss": 0.9409, "odds_ratio_loss": 0.7065256834030151, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08702448755502701, "rewards/margins": 0.012699509970843792, "rewards/rejected": -0.09972399473190308, "sft_loss": 0.8702448606491089, "step": 680 }, { "epoch": 1.1153768438068297, "grad_norm": 0.31913694739341736, "learning_rate": 3.4774625663033484e-06, "logits/chosen": -0.39085036516189575, "logits/rejected": -0.37611085176467896, "logps/chosen": -0.8731836080551147, "logps/rejected": -0.9660570025444031, "loss": 0.9427, "odds_ratio_loss": 0.6954530477523804, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08731836825609207, "rewards/margins": 0.009287341497838497, "rewards/rejected": -0.09660570323467255, "sft_loss": 0.8731836080551147, "step": 690 }, { "epoch": 1.1315417256011315, "grad_norm": 0.5645192265510559, "learning_rate": 3.4383317964216067e-06, "logits/chosen": -0.3893832564353943, "logits/rejected": -0.3442583680152893, "logps/chosen": -0.870397686958313, "logps/rejected": -0.9214354753494263, "loss": 0.9448, "odds_ratio_loss": 0.7436445355415344, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0870397686958313, "rewards/margins": 0.005103783216327429, "rewards/rejected": -0.09214354306459427, "sft_loss": 0.870397686958313, "step": 700 }, { "epoch": 1.1477066073954334, "grad_norm": 0.7822654247283936, "learning_rate": 3.398931579648877e-06, "logits/chosen": -0.3577522039413452, "logits/rejected": -0.2890363931655884, "logps/chosen": -0.9082385301589966, "logps/rejected": -1.1010273694992065, "loss": 0.9792, "odds_ratio_loss": 0.7092560529708862, "rewards/accuracies": 0.5, "rewards/chosen": -0.09082385897636414, "rewards/margins": 0.019278880208730698, "rewards/rejected": -0.11010273545980453, "sft_loss": 0.9082385301589966, "step": 710 }, { "epoch": 1.1638714891897353, "grad_norm": 0.6916553974151611, "learning_rate": 3.359273229963813e-06, "logits/chosen": -0.33050891757011414, "logits/rejected": -0.33249133825302124, "logps/chosen": -0.8524163961410522, "logps/rejected": -0.9603297114372253, "loss": 0.9215, "odds_ratio_loss": 0.6913267374038696, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0852416455745697, "rewards/margins": 0.010791336186230183, "rewards/rejected": -0.09603297710418701, "sft_loss": 0.8524163961410522, "step": 720 }, { "epoch": 1.1800363709840371, "grad_norm": 0.36541640758514404, "learning_rate": 3.319368135469285e-06, "logits/chosen": -0.34484899044036865, "logits/rejected": -0.3120992183685303, "logps/chosen": -0.8964350819587708, "logps/rejected": -1.0409529209136963, "loss": 0.9665, "odds_ratio_loss": 0.7009326219558716, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08964350074529648, "rewards/margins": 0.014451777562499046, "rewards/rejected": -0.10409528017044067, "sft_loss": 0.8964350819587708, "step": 730 }, { "epoch": 1.196201252778339, "grad_norm": 0.5928468704223633, "learning_rate": 3.279227755122228e-06, "logits/chosen": -0.359285831451416, "logits/rejected": -0.3708931505680084, "logps/chosen": -0.817459225654602, "logps/rejected": -1.1048064231872559, "loss": 0.8791, "odds_ratio_loss": 0.6168545484542847, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0817459225654602, "rewards/margins": 0.028734717518091202, "rewards/rejected": -0.1104806438088417, "sft_loss": 0.817459225654602, "step": 740 }, { "epoch": 1.2123661345726409, "grad_norm": 0.8944354057312012, "learning_rate": 3.2388636154431417e-06, "logits/chosen": -0.32971471548080444, "logits/rejected": -0.3240662217140198, "logps/chosen": -0.9531005024909973, "logps/rejected": -1.1055543422698975, "loss": 1.0252, "odds_ratio_loss": 0.7207925319671631, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09531004726886749, "rewards/margins": 0.015245395712554455, "rewards/rejected": -0.11055544763803482, "sft_loss": 0.9531005024909973, "step": 750 }, { "epoch": 1.2285310163669427, "grad_norm": 0.5451232194900513, "learning_rate": 3.198287307206192e-06, "logits/chosen": -0.3906642198562622, "logits/rejected": -0.36378178000450134, "logps/chosen": -0.909538745880127, "logps/rejected": -1.005489706993103, "loss": 0.9791, "odds_ratio_loss": 0.6954682469367981, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09095387905836105, "rewards/margins": 0.009595084004104137, "rewards/rejected": -0.10054896771907806, "sft_loss": 0.909538745880127, "step": 760 }, { "epoch": 1.2446958981612446, "grad_norm": 0.3986392617225647, "learning_rate": 3.157510482110856e-06, "logits/chosen": -0.31712478399276733, "logits/rejected": -0.3332034647464752, "logps/chosen": -0.8950090408325195, "logps/rejected": -0.9677726626396179, "loss": 0.9687, "odds_ratio_loss": 0.7365735173225403, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08950088918209076, "rewards/margins": 0.007276373915374279, "rewards/rejected": -0.09677727520465851, "sft_loss": 0.8950090408325195, "step": 770 }, { "epoch": 1.2608607799555465, "grad_norm": 0.9783799648284912, "learning_rate": 3.116544849436077e-06, "logits/chosen": -0.3367740213871002, "logits/rejected": -0.3552953600883484, "logps/chosen": -0.9589813351631165, "logps/rejected": -1.1763808727264404, "loss": 1.0263, "odds_ratio_loss": 0.6732120513916016, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09589814394712448, "rewards/margins": 0.02173994854092598, "rewards/rejected": -0.11763808876276016, "sft_loss": 0.9589813351631165, "step": 780 }, { "epoch": 1.2770256617498483, "grad_norm": 0.3939819931983948, "learning_rate": 3.0754021726778848e-06, "logits/chosen": -0.3505743741989136, "logits/rejected": -0.37322431802749634, "logps/chosen": -0.83990079164505, "logps/rejected": -1.0232980251312256, "loss": 0.9049, "odds_ratio_loss": 0.6501890420913696, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08399007469415665, "rewards/margins": 0.018339723348617554, "rewards/rejected": -0.1023297905921936, "sft_loss": 0.83990079164505, "step": 790 }, { "epoch": 1.2931905435441502, "grad_norm": 0.35344642400741577, "learning_rate": 3.0340942661714463e-06, "logits/chosen": -0.3435738980770111, "logits/rejected": -0.36761245131492615, "logps/chosen": -0.9316965341567993, "logps/rejected": -1.0095479488372803, "loss": 1.003, "odds_ratio_loss": 0.7125651836395264, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09316965192556381, "rewards/margins": 0.007785154972225428, "rewards/rejected": -0.1009548082947731, "sft_loss": 0.9316965341567993, "step": 800 }, { "epoch": 1.3093554253384523, "grad_norm": 0.4086878001689911, "learning_rate": 2.992632991698512e-06, "logits/chosen": -0.39886465668678284, "logits/rejected": -0.3849073350429535, "logps/chosen": -0.9022181630134583, "logps/rejected": -1.0039399862289429, "loss": 0.9729, "odds_ratio_loss": 0.7066690325737, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09022180736064911, "rewards/margins": 0.010172189213335514, "rewards/rejected": -0.10039399564266205, "sft_loss": 0.9022181630134583, "step": 810 }, { "epoch": 1.3255203071327541, "grad_norm": 0.45464497804641724, "learning_rate": 2.9510302550812537e-06, "logits/chosen": -0.3623855710029602, "logits/rejected": -0.31726986169815063, "logps/chosen": -0.8218330144882202, "logps/rejected": -1.0319081544876099, "loss": 0.8851, "odds_ratio_loss": 0.6329900026321411, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08218331634998322, "rewards/margins": 0.02100750431418419, "rewards/rejected": -0.1031908169388771, "sft_loss": 0.8218330144882202, "step": 820 }, { "epoch": 1.341685188927056, "grad_norm": 1.1504096984863281, "learning_rate": 2.9092980027634325e-06, "logits/chosen": -0.38953226804733276, "logits/rejected": -0.3612954914569855, "logps/chosen": -0.8214972615242004, "logps/rejected": -0.9684427976608276, "loss": 0.8864, "odds_ratio_loss": 0.6492589712142944, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08214972913265228, "rewards/margins": 0.014694547280669212, "rewards/rejected": -0.09684427082538605, "sft_loss": 0.8214972615242004, "step": 830 }, { "epoch": 1.3578500707213579, "grad_norm": 0.33391210436820984, "learning_rate": 2.867448218379927e-06, "logits/chosen": -0.3767167627811432, "logits/rejected": -0.3566213548183441, "logps/chosen": -0.9622126817703247, "logps/rejected": -1.030574083328247, "loss": 1.0363, "odds_ratio_loss": 0.7405400276184082, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09622127562761307, "rewards/margins": 0.006836143787950277, "rewards/rejected": -0.10305740684270859, "sft_loss": 0.9622126817703247, "step": 840 }, { "epoch": 1.3740149525156597, "grad_norm": 1.2477465867996216, "learning_rate": 2.825492919315559e-06, "logits/chosen": -0.3341541886329651, "logits/rejected": -0.28563547134399414, "logps/chosen": -0.9898349046707153, "logps/rejected": -0.9626699686050415, "loss": 1.0687, "odds_ratio_loss": 0.7890844345092773, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09898348897695541, "rewards/margins": -0.0027165021747350693, "rewards/rejected": -0.0962669849395752, "sft_loss": 0.9898349046707153, "step": 850 }, { "epoch": 1.3901798343099616, "grad_norm": 0.37100037932395935, "learning_rate": 2.7834441532542482e-06, "logits/chosen": -0.3620319366455078, "logits/rejected": -0.3429003357887268, "logps/chosen": -0.8693292737007141, "logps/rejected": -0.991874098777771, "loss": 0.9379, "odds_ratio_loss": 0.6856324076652527, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08693292737007141, "rewards/margins": 0.012254483997821808, "rewards/rejected": -0.09918741136789322, "sft_loss": 0.8693292737007141, "step": 860 }, { "epoch": 1.4063447161042635, "grad_norm": 1.2096267938613892, "learning_rate": 2.74131399471945e-06, "logits/chosen": -0.3446846306324005, "logits/rejected": -0.3061850666999817, "logps/chosen": -0.9667361974716187, "logps/rejected": -1.053593397140503, "loss": 1.0382, "odds_ratio_loss": 0.714438796043396, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0966736227273941, "rewards/margins": 0.008685723878443241, "rewards/rejected": -0.10535935312509537, "sft_loss": 0.9667361974716187, "step": 870 }, { "epoch": 1.4225095978985653, "grad_norm": 0.47893857955932617, "learning_rate": 2.6991145416068947e-06, "logits/chosen": -0.3955840468406677, "logits/rejected": -0.31594154238700867, "logps/chosen": -0.9019123315811157, "logps/rejected": -0.9488536715507507, "loss": 0.9734, "odds_ratio_loss": 0.7147491574287415, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09019123762845993, "rewards/margins": 0.0046941377222537994, "rewards/rejected": -0.09488537907600403, "sft_loss": 0.9019123315811157, "step": 880 }, { "epoch": 1.4386744796928672, "grad_norm": 0.2868447005748749, "learning_rate": 2.6568579117106143e-06, "logits/chosen": -0.4024140238761902, "logits/rejected": -0.4033503532409668, "logps/chosen": -0.8388016819953918, "logps/rejected": -0.9728044271469116, "loss": 0.9081, "odds_ratio_loss": 0.6926370859146118, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08388017117977142, "rewards/margins": 0.013400280848145485, "rewards/rejected": -0.09728045761585236, "sft_loss": 0.8388016819953918, "step": 890 }, { "epoch": 1.454839361487169, "grad_norm": 0.24462518095970154, "learning_rate": 2.6145562392432544e-06, "logits/chosen": -0.3949779272079468, "logits/rejected": -0.39668601751327515, "logps/chosen": -0.8613153696060181, "logps/rejected": -0.9795036315917969, "loss": 0.9305, "odds_ratio_loss": 0.6919496059417725, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08613153547048569, "rewards/margins": 0.011818833649158478, "rewards/rejected": -0.09795036166906357, "sft_loss": 0.8613153696060181, "step": 900 }, { "epoch": 1.471004243281471, "grad_norm": 0.5152093768119812, "learning_rate": 2.5722216713516682e-06, "logits/chosen": -0.42058199644088745, "logits/rejected": -0.38909250497817993, "logps/chosen": -0.8609904050827026, "logps/rejected": -0.9690335988998413, "loss": 0.9318, "odds_ratio_loss": 0.7082633972167969, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0860990509390831, "rewards/margins": 0.010804320685565472, "rewards/rejected": -0.09690337628126144, "sft_loss": 0.8609904050827026, "step": 910 }, { "epoch": 1.4871691250757728, "grad_norm": 0.5419692397117615, "learning_rate": 2.5298663646288064e-06, "logits/chosen": -0.35978519916534424, "logits/rejected": -0.35384541749954224, "logps/chosen": -0.8710163235664368, "logps/rejected": -1.0426474809646606, "loss": 0.9373, "odds_ratio_loss": 0.6623716354370117, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08710163086652756, "rewards/margins": 0.017163105309009552, "rewards/rejected": -0.10426473617553711, "sft_loss": 0.8710163235664368, "step": 920 }, { "epoch": 1.503334006870075, "grad_norm": 3.1488473415374756, "learning_rate": 2.487502481622879e-06, "logits/chosen": -0.4146711230278015, "logits/rejected": -0.40715789794921875, "logps/chosen": -0.9579635858535767, "logps/rejected": -1.0180439949035645, "loss": 1.0298, "odds_ratio_loss": 0.718089759349823, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0957963615655899, "rewards/margins": 0.006008026655763388, "rewards/rejected": -0.10180439800024033, "sft_loss": 0.9579635858535767, "step": 930 }, { "epoch": 1.5194988886643768, "grad_norm": 0.6520385146141052, "learning_rate": 2.4451421873448253e-06, "logits/chosen": -0.3981381952762604, "logits/rejected": -0.33850008249282837, "logps/chosen": -0.9044814109802246, "logps/rejected": -0.9930024147033691, "loss": 0.9767, "odds_ratio_loss": 0.7225072979927063, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09044814109802246, "rewards/margins": 0.008852103725075722, "rewards/rejected": -0.09930024296045303, "sft_loss": 0.9044814109802246, "step": 940 }, { "epoch": 1.5356637704586786, "grad_norm": 0.5775251984596252, "learning_rate": 2.40279764577506e-06, "logits/chosen": -0.36691075563430786, "logits/rejected": -0.31715118885040283, "logps/chosen": -0.9193195104598999, "logps/rejected": -0.9655280113220215, "loss": 0.9919, "odds_ratio_loss": 0.7258428931236267, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09193196147680283, "rewards/margins": 0.004620848223567009, "rewards/rejected": -0.09655280411243439, "sft_loss": 0.9193195104598999, "step": 950 }, { "epoch": 1.5518286522529805, "grad_norm": 0.4706912636756897, "learning_rate": 2.3604810163705242e-06, "logits/chosen": -0.3801175355911255, "logits/rejected": -0.34497779607772827, "logps/chosen": -0.8502659797668457, "logps/rejected": -0.9808200597763062, "loss": 0.9153, "odds_ratio_loss": 0.6503497362136841, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08502660691738129, "rewards/margins": 0.013055416755378246, "rewards/rejected": -0.09808202087879181, "sft_loss": 0.8502659797668457, "step": 960 }, { "epoch": 1.5679935340472824, "grad_norm": 0.8772755265235901, "learning_rate": 2.3182044505730364e-06, "logits/chosen": -0.3701505661010742, "logits/rejected": -0.3588781952857971, "logps/chosen": -0.8278260231018066, "logps/rejected": -0.9880140423774719, "loss": 0.8943, "odds_ratio_loss": 0.6643026471138, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08278260380029678, "rewards/margins": 0.016018804162740707, "rewards/rejected": -0.09880141168832779, "sft_loss": 0.8278260231018066, "step": 970 }, { "epoch": 1.5841584158415842, "grad_norm": 0.5644322633743286, "learning_rate": 2.275980088319941e-06, "logits/chosen": -0.37429267168045044, "logits/rejected": -0.38965049386024475, "logps/chosen": -0.830912709236145, "logps/rejected": -0.931898295879364, "loss": 0.901, "odds_ratio_loss": 0.7011361122131348, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08309127390384674, "rewards/margins": 0.010098553262650967, "rewards/rejected": -0.09318983554840088, "sft_loss": 0.830912709236145, "step": 980 }, { "epoch": 1.600323297635886, "grad_norm": 0.7188877463340759, "learning_rate": 2.2338200545580577e-06, "logits/chosen": -0.387838214635849, "logits/rejected": -0.3446332514286041, "logps/chosen": -0.8468879461288452, "logps/rejected": -1.0357553958892822, "loss": 0.9171, "odds_ratio_loss": 0.7018327713012695, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08468880504369736, "rewards/margins": 0.018886741250753403, "rewards/rejected": -0.10357554256916046, "sft_loss": 0.8468879461288452, "step": 990 }, { "epoch": 1.616488179430188, "grad_norm": 0.40455734729766846, "learning_rate": 2.191736455761947e-06, "logits/chosen": -0.32430940866470337, "logits/rejected": -0.3191392719745636, "logps/chosen": -0.7817317247390747, "logps/rejected": -0.8739973306655884, "loss": 0.8458, "odds_ratio_loss": 0.6406995058059692, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07817317545413971, "rewards/margins": 0.009226562455296516, "rewards/rejected": -0.08739973604679108, "sft_loss": 0.7817317247390747, "step": 1000 }, { "epoch": 1.616488179430188, "eval_logits/chosen": -0.3771926760673523, "eval_logits/rejected": -0.3578239679336548, "eval_logps/chosen": -0.8850269317626953, "eval_logps/rejected": -0.9999891519546509, "eval_loss": 0.9560017585754395, "eval_odds_ratio_loss": 0.7097483277320862, "eval_rewards/accuracies": 0.5190908908843994, "eval_rewards/chosen": -0.08850269019603729, "eval_rewards/margins": 0.0114962263032794, "eval_rewards/rejected": -0.09999892115592957, "eval_runtime": 192.1461, "eval_samples_per_second": 5.725, "eval_sft_loss": 0.8850269317626953, "eval_steps_per_second": 2.862, "step": 1000 }, { "epoch": 1.6326530612244898, "grad_norm": 0.3581576347351074, "learning_rate": 2.1497413764574673e-06, "logits/chosen": -0.31272074580192566, "logits/rejected": -0.33244556188583374, "logps/chosen": -0.9109123349189758, "logps/rejected": -1.0614047050476074, "loss": 0.9769, "odds_ratio_loss": 0.6601108908653259, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09109123051166534, "rewards/margins": 0.01504923403263092, "rewards/rejected": -0.10614047199487686, "sft_loss": 0.9109123349189758, "step": 1010 }, { "epoch": 1.6488179430187917, "grad_norm": 1.0781522989273071, "learning_rate": 2.1078468757516395e-06, "logits/chosen": -0.3577747941017151, "logits/rejected": -0.372037798166275, "logps/chosen": -0.8666743040084839, "logps/rejected": -0.9286467432975769, "loss": 0.943, "odds_ratio_loss": 0.7631633877754211, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08666743338108063, "rewards/margins": 0.006197246722877026, "rewards/rejected": -0.09286467730998993, "sft_loss": 0.8666743040084839, "step": 1020 }, { "epoch": 1.6649828248130936, "grad_norm": 0.4093440771102905, "learning_rate": 2.0660649838698145e-06, "logits/chosen": -0.24239635467529297, "logits/rejected": -0.2550283670425415, "logps/chosen": -0.8779211044311523, "logps/rejected": -1.028240442276001, "loss": 0.9471, "odds_ratio_loss": 0.691811203956604, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08779212832450867, "rewards/margins": 0.015031938441097736, "rewards/rejected": -0.10282406955957413, "sft_loss": 0.8779211044311523, "step": 1030 }, { "epoch": 1.6811477066073954, "grad_norm": 0.4143465459346771, "learning_rate": 2.0244076987011284e-06, "logits/chosen": -0.320882648229599, "logits/rejected": -0.35348570346832275, "logps/chosen": -0.9102975726127625, "logps/rejected": -1.0311200618743896, "loss": 0.9776, "odds_ratio_loss": 0.6728986501693726, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09102976322174072, "rewards/margins": 0.012082245200872421, "rewards/rejected": -0.10311201959848404, "sft_loss": 0.9102975726127625, "step": 1040 }, { "epoch": 1.6973125884016973, "grad_norm": 0.4322679340839386, "learning_rate": 1.982886982353251e-06, "logits/chosen": -0.33857375383377075, "logits/rejected": -0.38647031784057617, "logps/chosen": -0.8801182508468628, "logps/rejected": -1.0462461709976196, "loss": 0.9472, "odds_ratio_loss": 0.6703814268112183, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08801182359457016, "rewards/margins": 0.016612788662314415, "rewards/rejected": -0.10462461411952972, "sft_loss": 0.8801182508468628, "step": 1050 }, { "epoch": 1.7134774701959992, "grad_norm": 0.40310564637184143, "learning_rate": 1.941514757717392e-06, "logits/chosen": -0.3961712718009949, "logits/rejected": -0.3599357604980469, "logps/chosen": -0.857568621635437, "logps/rejected": -1.0133601427078247, "loss": 0.921, "odds_ratio_loss": 0.6347678899765015, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08575686812400818, "rewards/margins": 0.015579144470393658, "rewards/rejected": -0.10133601725101471, "sft_loss": 0.857568621635437, "step": 1060 }, { "epoch": 1.729642351990301, "grad_norm": 0.5565314888954163, "learning_rate": 1.9003029050445953e-06, "logits/chosen": -0.3478461802005768, "logits/rejected": -0.3207647204399109, "logps/chosen": -0.9041654467582703, "logps/rejected": -0.99024897813797, "loss": 0.9734, "odds_ratio_loss": 0.6924456357955933, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0904165506362915, "rewards/margins": 0.008608358912169933, "rewards/rejected": -0.09902490675449371, "sft_loss": 0.9041654467582703, "step": 1070 }, { "epoch": 1.745807233784603, "grad_norm": 0.4490904211997986, "learning_rate": 1.8592632585342523e-06, "logits/chosen": -0.36072981357574463, "logits/rejected": -0.3492718040943146, "logps/chosen": -0.8714792132377625, "logps/rejected": -1.010517954826355, "loss": 0.9396, "odds_ratio_loss": 0.6810620427131653, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08714792132377625, "rewards/margins": 0.013903876766562462, "rewards/rejected": -0.10105180740356445, "sft_loss": 0.8714792132377625, "step": 1080 }, { "epoch": 1.7619721155789048, "grad_norm": 0.71334308385849, "learning_rate": 1.8184076029358527e-06, "logits/chosen": -0.3724268078804016, "logits/rejected": -0.40728870034217834, "logps/chosen": -0.8329513669013977, "logps/rejected": -0.8585556745529175, "loss": 0.9053, "odds_ratio_loss": 0.723603367805481, "rewards/accuracies": 0.5, "rewards/chosen": -0.08329514414072037, "rewards/margins": 0.002560428809374571, "rewards/rejected": -0.08585558086633682, "sft_loss": 0.8329513669013977, "step": 1090 }, { "epoch": 1.7781369973732066, "grad_norm": 0.38024160265922546, "learning_rate": 1.7777476701649318e-06, "logits/chosen": -0.4104040563106537, "logits/rejected": -0.40031394362449646, "logps/chosen": -0.9076647758483887, "logps/rejected": -1.019285798072815, "loss": 0.9752, "odds_ratio_loss": 0.6755737662315369, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09076648205518723, "rewards/margins": 0.011162097565829754, "rewards/rejected": -0.10192857682704926, "sft_loss": 0.9076647758483887, "step": 1100 }, { "epoch": 1.7943018791675085, "grad_norm": 0.433108389377594, "learning_rate": 1.7372951359341925e-06, "logits/chosen": -0.35082167387008667, "logits/rejected": -0.3622151017189026, "logps/chosen": -0.8306609988212585, "logps/rejected": -0.9346961975097656, "loss": 0.8994, "odds_ratio_loss": 0.6869168281555176, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08306611329317093, "rewards/margins": 0.010403511114418507, "rewards/rejected": -0.09346961975097656, "sft_loss": 0.8306609988212585, "step": 1110 }, { "epoch": 1.8104667609618104, "grad_norm": 1.0182783603668213, "learning_rate": 1.6970616164007547e-06, "logits/chosen": -0.4078885614871979, "logits/rejected": -0.43148526549339294, "logps/chosen": -0.8258237838745117, "logps/rejected": -0.9274940490722656, "loss": 0.8967, "odds_ratio_loss": 0.7091785073280334, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08258237689733505, "rewards/margins": 0.010167025960981846, "rewards/rejected": -0.09274940937757492, "sft_loss": 0.8258237838745117, "step": 1120 }, { "epoch": 1.8266316427561122, "grad_norm": 1.0357805490493774, "learning_rate": 1.6570586648305276e-06, "logits/chosen": -0.4377085268497467, "logits/rejected": -0.407601922750473, "logps/chosen": -0.8756824731826782, "logps/rejected": -1.0340659618377686, "loss": 0.9437, "odds_ratio_loss": 0.6799197793006897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0875682383775711, "rewards/margins": 0.01583835855126381, "rewards/rejected": -0.10340659320354462, "sft_loss": 0.8756824731826782, "step": 1130 }, { "epoch": 1.842796524550414, "grad_norm": 0.4784797430038452, "learning_rate": 1.6172977682806151e-06, "logits/chosen": -0.3374441862106323, "logits/rejected": -0.2926723062992096, "logps/chosen": -0.8671070337295532, "logps/rejected": -1.0173355340957642, "loss": 0.9326, "odds_ratio_loss": 0.6546159982681274, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08671069890260696, "rewards/margins": 0.015022864565253258, "rewards/rejected": -0.10173355042934418, "sft_loss": 0.8671070337295532, "step": 1140 }, { "epoch": 1.858961406344716, "grad_norm": 0.5492507219314575, "learning_rate": 1.5777903443007586e-06, "logits/chosen": -0.3145988881587982, "logits/rejected": -0.42871540784835815, "logps/chosen": -0.8989070057868958, "logps/rejected": -1.0172455310821533, "loss": 0.9689, "odds_ratio_loss": 0.6998150944709778, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08989070355892181, "rewards/margins": 0.011833854019641876, "rewards/rejected": -0.1017245501279831, "sft_loss": 0.8989070057868958, "step": 1150 }, { "epoch": 1.8751262881390178, "grad_norm": 0.4275898039340973, "learning_rate": 1.5385477376547226e-06, "logits/chosen": -0.3347630202770233, "logits/rejected": -0.34142249822616577, "logps/chosen": -0.9212555885314941, "logps/rejected": -1.0021544694900513, "loss": 0.9893, "odds_ratio_loss": 0.679952085018158, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0921255499124527, "rewards/margins": 0.008089900948107243, "rewards/rejected": -0.10021545737981796, "sft_loss": 0.9212555885314941, "step": 1160 }, { "epoch": 1.89129116993332, "grad_norm": 0.5769237875938416, "learning_rate": 1.4995812170625845e-06, "logits/chosen": -0.3509088456630707, "logits/rejected": -0.35828500986099243, "logps/chosen": -0.8898354768753052, "logps/rejected": -1.1126220226287842, "loss": 0.9543, "odds_ratio_loss": 0.6445311307907104, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08898355811834335, "rewards/margins": 0.02227865532040596, "rewards/rejected": -0.11126221716403961, "sft_loss": 0.8898354768753052, "step": 1170 }, { "epoch": 1.9074560517276218, "grad_norm": 0.9893414974212646, "learning_rate": 1.4609019719648666e-06, "logits/chosen": -0.34388267993927, "logits/rejected": -0.34255415201187134, "logps/chosen": -0.9129988551139832, "logps/rejected": -1.0511752367019653, "loss": 0.9778, "odds_ratio_loss": 0.6484531760215759, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0912998765707016, "rewards/margins": 0.013817653059959412, "rewards/rejected": -0.10511753708124161, "sft_loss": 0.9129988551139832, "step": 1180 }, { "epoch": 1.9236209335219236, "grad_norm": 0.8161694407463074, "learning_rate": 1.42252110930943e-06, "logits/chosen": -0.3889426589012146, "logits/rejected": -0.37780189514160156, "logps/chosen": -0.8312114477157593, "logps/rejected": -0.9597098231315613, "loss": 0.8972, "odds_ratio_loss": 0.6594355702400208, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08312113583087921, "rewards/margins": 0.01284984964877367, "rewards/rejected": -0.0959709957242012, "sft_loss": 0.8312114477157593, "step": 1190 }, { "epoch": 1.9397858153162255, "grad_norm": 0.6737188100814819, "learning_rate": 1.3844496503620493e-06, "logits/chosen": -0.34721988439559937, "logits/rejected": -0.29065969586372375, "logps/chosen": -0.8556321263313293, "logps/rejected": -0.9435693621635437, "loss": 0.9217, "odds_ratio_loss": 0.6608615517616272, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08556319773197174, "rewards/margins": 0.008793738670647144, "rewards/rejected": -0.09435693919658661, "sft_loss": 0.8556321263313293, "step": 1200 }, { "epoch": 1.9559506971105274, "grad_norm": 1.0895054340362549, "learning_rate": 1.3466985275416081e-06, "logits/chosen": -0.38311949372291565, "logits/rejected": -0.440490186214447, "logps/chosen": -0.9350228309631348, "logps/rejected": -1.0175323486328125, "loss": 1.0086, "odds_ratio_loss": 0.7355881929397583, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09350229054689407, "rewards/margins": 0.008250946179032326, "rewards/rejected": -0.10175323486328125, "sft_loss": 0.9350228309631348, "step": 1210 }, { "epoch": 1.9721155789048292, "grad_norm": 0.7546266913414001, "learning_rate": 1.309278581280791e-06, "logits/chosen": -0.32461339235305786, "logits/rejected": -0.38296985626220703, "logps/chosen": -0.825161337852478, "logps/rejected": -1.007612943649292, "loss": 0.8897, "odds_ratio_loss": 0.6452582478523254, "rewards/accuracies": 0.59375, "rewards/chosen": -0.082516148686409, "rewards/margins": 0.018245156854391098, "rewards/rejected": -0.1007612943649292, "sft_loss": 0.825161337852478, "step": 1220 }, { "epoch": 1.9882804606991311, "grad_norm": 0.30651387572288513, "learning_rate": 1.272200556913199e-06, "logits/chosen": -0.34240493178367615, "logits/rejected": -0.33365195989608765, "logps/chosen": -0.9005836248397827, "logps/rejected": -1.0132153034210205, "loss": 0.9729, "odds_ratio_loss": 0.722726583480835, "rewards/accuracies": 0.5, "rewards/chosen": -0.09005837142467499, "rewards/margins": 0.011263175867497921, "rewards/rejected": -0.10132155567407608, "sft_loss": 0.9005836248397827, "step": 1230 }, { "epoch": 2.004445342493433, "grad_norm": 0.6191690564155579, "learning_rate": 1.2354751015877698e-06, "logits/chosen": -0.3653295636177063, "logits/rejected": -0.3104439675807953, "logps/chosen": -0.8316798210144043, "logps/rejected": -1.0361697673797607, "loss": 0.8947, "odds_ratio_loss": 0.6298761963844299, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08316798508167267, "rewards/margins": 0.02044900692999363, "rewards/rejected": -0.10361699759960175, "sft_loss": 0.8316798210144043, "step": 1240 }, { "epoch": 2.020610224287735, "grad_norm": 0.752289354801178, "learning_rate": 1.1991127612113945e-06, "logits/chosen": -0.3582732379436493, "logits/rejected": -0.3034323751926422, "logps/chosen": -0.8952615857124329, "logps/rejected": -1.0291544198989868, "loss": 0.9609, "odds_ratio_loss": 0.6566318869590759, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08952615410089493, "rewards/margins": 0.013389283791184425, "rewards/rejected": -0.1029154434800148, "sft_loss": 0.8952615857124329, "step": 1250 }, { "epoch": 2.036775106082037, "grad_norm": 0.5910158753395081, "learning_rate": 1.1631239774206035e-06, "logits/chosen": -0.36862578988075256, "logits/rejected": -0.3653218150138855, "logps/chosen": -0.8613477945327759, "logps/rejected": -0.9755401611328125, "loss": 0.9325, "odds_ratio_loss": 0.7117538452148438, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08613476902246475, "rewards/margins": 0.011419234797358513, "rewards/rejected": -0.09755401313304901, "sft_loss": 0.8613477945327759, "step": 1260 }, { "epoch": 2.052939987876339, "grad_norm": 0.5977714657783508, "learning_rate": 1.1275190845831978e-06, "logits/chosen": -0.35793787240982056, "logits/rejected": -0.3579494059085846, "logps/chosen": -0.8839446902275085, "logps/rejected": -1.0555723905563354, "loss": 0.9484, "odds_ratio_loss": 0.6443823575973511, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08839447796344757, "rewards/margins": 0.01716277375817299, "rewards/rejected": -0.10555724799633026, "sft_loss": 0.8839446902275085, "step": 1270 }, { "epoch": 2.0691048696706407, "grad_norm": 0.4356369078159332, "learning_rate": 1.0923083068306778e-06, "logits/chosen": -0.2889194190502167, "logits/rejected": -0.39258915185928345, "logps/chosen": -0.8745051622390747, "logps/rejected": -1.061402678489685, "loss": 0.94, "odds_ratio_loss": 0.6551867723464966, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08745051920413971, "rewards/margins": 0.018689759075641632, "rewards/rejected": -0.10614027827978134, "sft_loss": 0.8745051622390747, "step": 1280 }, { "epoch": 2.0852697514649425, "grad_norm": 0.2981340289115906, "learning_rate": 1.0575017551223348e-06, "logits/chosen": -0.39015138149261475, "logits/rejected": -0.40903449058532715, "logps/chosen": -0.7750725746154785, "logps/rejected": -0.9115964770317078, "loss": 0.8412, "odds_ratio_loss": 0.6609454154968262, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07750725001096725, "rewards/margins": 0.013652404770255089, "rewards/rejected": -0.09115965664386749, "sft_loss": 0.7750725746154785, "step": 1290 }, { "epoch": 2.1014346332592444, "grad_norm": 0.39186251163482666, "learning_rate": 1.023109424341833e-06, "logits/chosen": -0.3986419141292572, "logits/rejected": -0.36254242062568665, "logps/chosen": -0.8747810125350952, "logps/rejected": -0.9971181750297546, "loss": 0.9444, "odds_ratio_loss": 0.6959220170974731, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08747810870409012, "rewards/margins": 0.012233709916472435, "rewards/rejected": -0.0997118204832077, "sft_loss": 0.8747810125350952, "step": 1300 }, { "epoch": 2.1175995150535463, "grad_norm": 0.4826388359069824, "learning_rate": 9.891411904271273e-07, "logits/chosen": -0.3570977747440338, "logits/rejected": -0.34066206216812134, "logps/chosen": -0.8385666608810425, "logps/rejected": -0.9865023493766785, "loss": 0.9076, "odds_ratio_loss": 0.6902373433113098, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08385667204856873, "rewards/margins": 0.014793576672673225, "rewards/rejected": -0.0986502468585968, "sft_loss": 0.8385666608810425, "step": 1310 }, { "epoch": 2.133764396847848, "grad_norm": 0.3553561866283417, "learning_rate": 9.556068075345363e-07, "logits/chosen": -0.28917670249938965, "logits/rejected": -0.3470838665962219, "logps/chosen": -0.8463741540908813, "logps/rejected": -0.9492172002792358, "loss": 0.9162, "odds_ratio_loss": 0.6985523104667664, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08463741838932037, "rewards/margins": 0.01028431672602892, "rewards/rejected": -0.09492173045873642, "sft_loss": 0.8463741540908813, "step": 1320 }, { "epoch": 2.14992927864215, "grad_norm": 0.3664523959159851, "learning_rate": 9.225159052377838e-07, "logits/chosen": -0.3276691436767578, "logits/rejected": -0.3102811872959137, "logps/chosen": -0.9000975489616394, "logps/rejected": -1.0900113582611084, "loss": 0.9658, "odds_ratio_loss": 0.6572277545928955, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09000976383686066, "rewards/margins": 0.018991392105817795, "rewards/rejected": -0.10900114476680756, "sft_loss": 0.9000975489616394, "step": 1330 }, { "epoch": 2.166094160436452, "grad_norm": 0.5697169899940491, "learning_rate": 8.898779857628184e-07, "logits/chosen": -0.35697469115257263, "logits/rejected": -0.29451218247413635, "logps/chosen": -0.7642744779586792, "logps/rejected": -0.8856114149093628, "loss": 0.8306, "odds_ratio_loss": 0.6628420948982239, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07642744481563568, "rewards/margins": 0.012133700773119926, "rewards/rejected": -0.08856116235256195, "sft_loss": 0.7642744779586792, "step": 1340 }, { "epoch": 2.1822590422307537, "grad_norm": 1.7151192426681519, "learning_rate": 8.577024212591975e-07, "logits/chosen": -0.29253047704696655, "logits/rejected": -0.3413800001144409, "logps/chosen": -0.8930098414421082, "logps/rejected": -0.9748668670654297, "loss": 0.9639, "odds_ratio_loss": 0.708949089050293, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0893009752035141, "rewards/margins": 0.008185721933841705, "rewards/rejected": -0.09748668968677521, "sft_loss": 0.8930098414421082, "step": 1350 }, { "epoch": 2.1984239240250556, "grad_norm": 0.49061620235443115, "learning_rate": 8.259984511088276e-07, "logits/chosen": -0.3223104476928711, "logits/rejected": -0.29760584235191345, "logps/chosen": -0.8736541867256165, "logps/rejected": -0.9874069094657898, "loss": 0.9451, "odds_ratio_loss": 0.7148812413215637, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08736542612314224, "rewards/margins": 0.011375268921256065, "rewards/rejected": -0.09874069690704346, "sft_loss": 0.8736541867256165, "step": 1360 }, { "epoch": 2.2145888058193575, "grad_norm": 0.33556151390075684, "learning_rate": 7.947751792728237e-07, "logits/chosen": -0.3239595890045166, "logits/rejected": -0.34610220789909363, "logps/chosen": -0.8864496946334839, "logps/rejected": -1.0747450590133667, "loss": 0.9559, "odds_ratio_loss": 0.694658637046814, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08864498138427734, "rewards/margins": 0.018829550594091415, "rewards/rejected": -0.10747452825307846, "sft_loss": 0.8864496946334839, "step": 1370 }, { "epoch": 2.2307536876136593, "grad_norm": 0.5993340611457825, "learning_rate": 7.640415716772626e-07, "logits/chosen": -0.3385930359363556, "logits/rejected": -0.31589871644973755, "logps/chosen": -0.8884540796279907, "logps/rejected": -1.0432296991348267, "loss": 0.9579, "odds_ratio_loss": 0.6948095560073853, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08884540945291519, "rewards/margins": 0.015477565117180347, "rewards/rejected": -0.10432296991348267, "sft_loss": 0.8884540796279907, "step": 1380 }, { "epoch": 2.246918569407961, "grad_norm": 0.4777003228664398, "learning_rate": 7.338064536385722e-07, "logits/chosen": -0.3243527412414551, "logits/rejected": -0.3211807608604431, "logps/chosen": -0.8481816053390503, "logps/rejected": -1.0429704189300537, "loss": 0.9136, "odds_ratio_loss": 0.6539761424064636, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08481816202402115, "rewards/margins": 0.019478868693113327, "rewards/rejected": -0.10429704189300537, "sft_loss": 0.8481816053390503, "step": 1390 }, { "epoch": 2.263083451202263, "grad_norm": 0.6625237464904785, "learning_rate": 7.040785073292883e-07, "logits/chosen": -0.39626187086105347, "logits/rejected": -0.3658468425273895, "logps/chosen": -0.9418588876724243, "logps/rejected": -1.017301321029663, "loss": 1.0184, "odds_ratio_loss": 0.7650783658027649, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09418588131666183, "rewards/margins": 0.007544253021478653, "rewards/rejected": -0.10173014551401138, "sft_loss": 0.9418588876724243, "step": 1400 }, { "epoch": 2.279248332996565, "grad_norm": 0.5683190226554871, "learning_rate": 6.748662692849297e-07, "logits/chosen": -0.2916708290576935, "logits/rejected": -0.289817750453949, "logps/chosen": -0.8634734153747559, "logps/rejected": -1.1026208400726318, "loss": 0.9275, "odds_ratio_loss": 0.6400235295295715, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08634734898805618, "rewards/margins": 0.02391473576426506, "rewards/rejected": -0.11026208102703094, "sft_loss": 0.8634734153747559, "step": 1410 }, { "epoch": 2.295413214790867, "grad_norm": 1.625442624092102, "learning_rate": 6.46178127952686e-07, "logits/chosen": -0.35586509108543396, "logits/rejected": -0.35335296392440796, "logps/chosen": -0.8400161862373352, "logps/rejected": -0.9910812377929688, "loss": 0.9025, "odds_ratio_loss": 0.6247957348823547, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08400160819292068, "rewards/margins": 0.015106521546840668, "rewards/rejected": -0.09910812973976135, "sft_loss": 0.8400161862373352, "step": 1420 }, { "epoch": 2.3115780965851687, "grad_norm": 0.46490368247032166, "learning_rate": 6.180223212826289e-07, "logits/chosen": -0.33770841360092163, "logits/rejected": -0.37226027250289917, "logps/chosen": -0.858726978302002, "logps/rejected": -0.9763249158859253, "loss": 0.9249, "odds_ratio_loss": 0.6612924933433533, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08587270230054855, "rewards/margins": 0.011759791523218155, "rewards/rejected": -0.097632497549057, "sft_loss": 0.858726978302002, "step": 1430 }, { "epoch": 2.3277429783794705, "grad_norm": 0.25405463576316833, "learning_rate": 5.904069343621443e-07, "logits/chosen": -0.3201651871204376, "logits/rejected": -0.34286874532699585, "logps/chosen": -0.9113739132881165, "logps/rejected": -1.0487134456634521, "loss": 0.9766, "odds_ratio_loss": 0.652290403842926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09113740175962448, "rewards/margins": 0.013733962550759315, "rewards/rejected": -0.10487135499715805, "sft_loss": 0.9113739132881165, "step": 1440 }, { "epoch": 2.3439078601737724, "grad_norm": 0.5318045020103455, "learning_rate": 5.633398970942544e-07, "logits/chosen": -0.32512596249580383, "logits/rejected": -0.2820747494697571, "logps/chosen": -0.8218180537223816, "logps/rejected": -0.9094691276550293, "loss": 0.8927, "odds_ratio_loss": 0.7083881497383118, "rewards/accuracies": 0.5, "rewards/chosen": -0.08218181133270264, "rewards/margins": 0.008765103295445442, "rewards/rejected": -0.09094691276550293, "sft_loss": 0.8218180537223816, "step": 1450 }, { "epoch": 2.3600727419680743, "grad_norm": 0.6843146681785583, "learning_rate": 5.368289819205069e-07, "logits/chosen": -0.39002543687820435, "logits/rejected": -0.376250684261322, "logps/chosen": -0.7933530211448669, "logps/rejected": -0.9611787796020508, "loss": 0.8585, "odds_ratio_loss": 0.6519256234169006, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0793353021144867, "rewards/margins": 0.016782574355602264, "rewards/rejected": -0.09611787647008896, "sft_loss": 0.7933530211448669, "step": 1460 }, { "epoch": 2.376237623762376, "grad_norm": 0.3784586787223816, "learning_rate": 5.108818015890785e-07, "logits/chosen": -0.3249315917491913, "logits/rejected": -0.30507951974868774, "logps/chosen": -0.8853880167007446, "logps/rejected": -1.0341455936431885, "loss": 0.9531, "odds_ratio_loss": 0.6767874956130981, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08853879570960999, "rewards/margins": 0.014875771477818489, "rewards/rejected": -0.10341457277536392, "sft_loss": 0.8853880167007446, "step": 1470 }, { "epoch": 2.392402505556678, "grad_norm": 0.5850736498832703, "learning_rate": 4.855058069687291e-07, "logits/chosen": -0.4515988230705261, "logits/rejected": -0.4501380920410156, "logps/chosen": -0.8440315127372742, "logps/rejected": -0.978651225566864, "loss": 0.9111, "odds_ratio_loss": 0.6708062887191772, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0844031572341919, "rewards/margins": 0.013461967930197716, "rewards/rejected": -0.09786512702703476, "sft_loss": 0.8440315127372742, "step": 1480 }, { "epoch": 2.40856738735098, "grad_norm": 0.4087739884853363, "learning_rate": 4.607082849092523e-07, "logits/chosen": -0.3892877697944641, "logits/rejected": -0.4075300097465515, "logps/chosen": -0.9417757987976074, "logps/rejected": -1.0281052589416504, "loss": 1.0107, "odds_ratio_loss": 0.6892626881599426, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09417758882045746, "rewards/margins": 0.00863293744623661, "rewards/rejected": -0.10281052440404892, "sft_loss": 0.9417757987976074, "step": 1490 }, { "epoch": 2.4247322691452817, "grad_norm": 1.142304539680481, "learning_rate": 4.3649635614901405e-07, "logits/chosen": -0.39748096466064453, "logits/rejected": -0.2932053208351135, "logps/chosen": -0.8502078056335449, "logps/rejected": -0.883902907371521, "loss": 0.9219, "odds_ratio_loss": 0.7170731425285339, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08502078056335449, "rewards/margins": 0.003369513200595975, "rewards/rejected": -0.0883902907371521, "sft_loss": 0.8502078056335449, "step": 1500 }, { "epoch": 2.4247322691452817, "eval_logits/chosen": -0.3775150775909424, "eval_logits/rejected": -0.3581116795539856, "eval_logps/chosen": -0.8786855936050415, "eval_logps/rejected": -0.9948004484176636, "eval_loss": 0.9497246742248535, "eval_odds_ratio_loss": 0.7103896737098694, "eval_rewards/accuracies": 0.5163636207580566, "eval_rewards/chosen": -0.0878685712814331, "eval_rewards/margins": 0.011611479334533215, "eval_rewards/rejected": -0.0994800478219986, "eval_runtime": 192.2752, "eval_samples_per_second": 5.721, "eval_sft_loss": 0.8786855936050415, "eval_steps_per_second": 2.86, "step": 1500 }, { "epoch": 2.4408971509395836, "grad_norm": 0.3841034770011902, "learning_rate": 4.128769732701973e-07, "logits/chosen": -0.36835092306137085, "logits/rejected": -0.4074084758758545, "logps/chosen": -0.8371820449829102, "logps/rejected": -0.9595246315002441, "loss": 0.9062, "odds_ratio_loss": 0.6903966665267944, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0837181955575943, "rewards/margins": 0.012234264984726906, "rewards/rejected": -0.09595246613025665, "sft_loss": 0.8371820449829102, "step": 1510 }, { "epoch": 2.4570620327338855, "grad_norm": 0.6487218737602234, "learning_rate": 3.8985691870233046e-07, "logits/chosen": -0.36084288358688354, "logits/rejected": -0.35909101366996765, "logps/chosen": -0.8767590522766113, "logps/rejected": -0.9904271364212036, "loss": 0.9487, "odds_ratio_loss": 0.7190364599227905, "rewards/accuracies": 0.5, "rewards/chosen": -0.08767590671777725, "rewards/margins": 0.011366801336407661, "rewards/rejected": -0.09904270619153976, "sft_loss": 0.8767590522766113, "step": 1520 }, { "epoch": 2.4732269145281873, "grad_norm": 0.726983904838562, "learning_rate": 3.6744280277467904e-07, "logits/chosen": -0.3547779619693756, "logits/rejected": -0.37871819734573364, "logps/chosen": -0.8915858268737793, "logps/rejected": -1.0086140632629395, "loss": 0.9661, "odds_ratio_loss": 0.7449706792831421, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08915858715772629, "rewards/margins": 0.01170281507074833, "rewards/rejected": -0.10086140781641006, "sft_loss": 0.8915858268737793, "step": 1530 }, { "epoch": 2.489391796322489, "grad_norm": 0.6208191514015198, "learning_rate": 3.456410618180503e-07, "logits/chosen": -0.46183329820632935, "logits/rejected": -0.3973988890647888, "logps/chosen": -0.7950559258460999, "logps/rejected": -1.0139881372451782, "loss": 0.8596, "odds_ratio_loss": 0.6458045244216919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07950559258460999, "rewards/margins": 0.021893223747611046, "rewards/rejected": -0.10139881074428558, "sft_loss": 0.7950559258460999, "step": 1540 }, { "epoch": 2.5055566781167915, "grad_norm": 0.40934354066848755, "learning_rate": 3.244579563165753e-07, "logits/chosen": -0.3711478114128113, "logits/rejected": -0.3300473093986511, "logps/chosen": -0.8490577936172485, "logps/rejected": -1.0569615364074707, "loss": 0.9137, "odds_ratio_loss": 0.6463108062744141, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08490578085184097, "rewards/margins": 0.020790381357073784, "rewards/rejected": -0.10569615662097931, "sft_loss": 0.8490577936172485, "step": 1550 }, { "epoch": 2.521721559911093, "grad_norm": 0.4264324903488159, "learning_rate": 3.038995691099697e-07, "logits/chosen": -0.35405951738357544, "logits/rejected": -0.3723445534706116, "logps/chosen": -0.8575676083564758, "logps/rejected": -1.0358964204788208, "loss": 0.9267, "odds_ratio_loss": 0.6915205717086792, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08575676381587982, "rewards/margins": 0.017832884564995766, "rewards/rejected": -0.10358965396881104, "sft_loss": 0.8575676083564758, "step": 1560 }, { "epoch": 2.5378864417053952, "grad_norm": 0.5124202370643616, "learning_rate": 2.839718036468192e-07, "logits/chosen": -0.39767321944236755, "logits/rejected": -0.361719012260437, "logps/chosen": -0.9866407513618469, "logps/rejected": -1.0687347650527954, "loss": 1.0574, "odds_ratio_loss": 0.7079859972000122, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09866407513618469, "rewards/margins": 0.00820938404649496, "rewards/rejected": -0.10687346756458282, "sft_loss": 0.9866407513618469, "step": 1570 }, { "epoch": 2.5540513234996967, "grad_norm": 0.6700158715248108, "learning_rate": 2.646803822893723e-07, "logits/chosen": -0.34473222494125366, "logits/rejected": -0.339333713054657, "logps/chosen": -0.9860366582870483, "logps/rejected": -1.0728685855865479, "loss": 1.0579, "odds_ratio_loss": 0.7182521224021912, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09860367327928543, "rewards/margins": 0.008683168329298496, "rewards/rejected": -0.10728684812784195, "sft_loss": 0.9860366582870483, "step": 1580 }, { "epoch": 2.570216205293999, "grad_norm": 0.4476275146007538, "learning_rate": 2.460308446703341e-07, "logits/chosen": -0.37150639295578003, "logits/rejected": -0.3977029621601105, "logps/chosen": -0.8994391560554504, "logps/rejected": -0.9403126835823059, "loss": 0.9704, "odds_ratio_loss": 0.7100769877433777, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08994391560554504, "rewards/margins": 0.004087349865585566, "rewards/rejected": -0.09403126686811447, "sft_loss": 0.8994391560554504, "step": 1590 }, { "epoch": 2.5863810870883004, "grad_norm": 0.8473093509674072, "learning_rate": 2.2802854610213143e-07, "logits/chosen": -0.38676199316978455, "logits/rejected": -0.3973104655742645, "logps/chosen": -0.8438700437545776, "logps/rejected": -1.018701434135437, "loss": 0.9107, "odds_ratio_loss": 0.6678277850151062, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08438700437545776, "rewards/margins": 0.017483150586485863, "rewards/rejected": -0.10187015682458878, "sft_loss": 0.8438700437545776, "step": 1600 }, { "epoch": 2.6025459688826027, "grad_norm": 1.2318559885025024, "learning_rate": 2.106786560391072e-07, "logits/chosen": -0.41062861680984497, "logits/rejected": -0.3663537800312042, "logps/chosen": -0.9180322885513306, "logps/rejected": -0.9797943830490112, "loss": 0.9881, "odds_ratio_loss": 0.7011545300483704, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09180323779582977, "rewards/margins": 0.006176213268190622, "rewards/rejected": -0.09797944128513336, "sft_loss": 0.9180322885513306, "step": 1610 }, { "epoch": 2.6187108506769046, "grad_norm": 1.8344284296035767, "learning_rate": 1.9398615659308255e-07, "logits/chosen": -0.3516565263271332, "logits/rejected": -0.3090236485004425, "logps/chosen": -0.8868433833122253, "logps/rejected": -0.9610105752944946, "loss": 0.9563, "odds_ratio_loss": 0.6944981813430786, "rewards/accuracies": 0.5, "rewards/chosen": -0.08868434280157089, "rewards/margins": 0.007416720036417246, "rewards/rejected": -0.0961010605096817, "sft_loss": 0.8868433833122253, "step": 1620 }, { "epoch": 2.6348757324712064, "grad_norm": 3.686185359954834, "learning_rate": 1.7795584110272184e-07, "logits/chosen": -0.33260416984558105, "logits/rejected": -0.32040587067604065, "logps/chosen": -0.9077906608581543, "logps/rejected": -1.0257583856582642, "loss": 0.9756, "odds_ratio_loss": 0.6781536340713501, "rewards/accuracies": 0.5, "rewards/chosen": -0.09077905863523483, "rewards/margins": 0.01179676502943039, "rewards/rejected": -0.10257583856582642, "sft_loss": 0.9077906608581543, "step": 1630 }, { "epoch": 2.6510406142655083, "grad_norm": 0.7552462220191956, "learning_rate": 1.6259231275709636e-07, "logits/chosen": -0.32405030727386475, "logits/rejected": -0.3262009024620056, "logps/chosen": -0.8568581342697144, "logps/rejected": -0.9373190999031067, "loss": 0.9294, "odds_ratio_loss": 0.7254046201705933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.08568581938743591, "rewards/margins": 0.008046089671552181, "rewards/rejected": -0.09373190253973007, "sft_loss": 0.8568581342697144, "step": 1640 }, { "epoch": 2.66720549605981, "grad_norm": 0.45023104548454285, "learning_rate": 1.478999832738548e-07, "logits/chosen": -0.34250158071517944, "logits/rejected": -0.34709858894348145, "logps/chosen": -0.8354190587997437, "logps/rejected": -0.9979323148727417, "loss": 0.9021, "odds_ratio_loss": 0.6672018766403198, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08354191482067108, "rewards/margins": 0.01625131070613861, "rewards/rejected": -0.09979323297739029, "sft_loss": 0.8354190587997437, "step": 1650 }, { "epoch": 2.683370377854112, "grad_norm": 0.6760185956954956, "learning_rate": 1.338830716323769e-07, "logits/chosen": -0.34901902079582214, "logits/rejected": -0.352342426776886, "logps/chosen": -0.8232784271240234, "logps/rejected": -0.9058715105056763, "loss": 0.8916, "odds_ratio_loss": 0.6835728883743286, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08232785016298294, "rewards/margins": 0.008259310387074947, "rewards/rejected": -0.09058715403079987, "sft_loss": 0.8232784271240234, "step": 1660 }, { "epoch": 2.699535259648414, "grad_norm": 0.9901576638221741, "learning_rate": 1.205456028622723e-07, "logits/chosen": -0.3495160639286041, "logits/rejected": -0.35691842436790466, "logps/chosen": -0.8500292897224426, "logps/rejected": -1.0147500038146973, "loss": 0.9171, "odds_ratio_loss": 0.6710700988769531, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08500292897224426, "rewards/margins": 0.016472063958644867, "rewards/rejected": -0.10147500038146973, "sft_loss": 0.8500292897224426, "step": 1670 }, { "epoch": 2.7157001414427158, "grad_norm": 0.29376673698425293, "learning_rate": 1.0789140688756805e-07, "logits/chosen": -0.2777409255504608, "logits/rejected": -0.30515843629837036, "logps/chosen": -0.8388081789016724, "logps/rejected": -1.004902720451355, "loss": 0.9016, "odds_ratio_loss": 0.6277891397476196, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08388081192970276, "rewards/margins": 0.016609463840723038, "rewards/rejected": -0.10049028694629669, "sft_loss": 0.8388081789016724, "step": 1680 }, { "epoch": 2.7318650232370176, "grad_norm": 1.1649651527404785, "learning_rate": 9.592411742693098e-08, "logits/chosen": -0.3376592993736267, "logits/rejected": -0.33899828791618347, "logps/chosen": -0.8941831588745117, "logps/rejected": -0.9593558311462402, "loss": 0.9688, "odds_ratio_loss": 0.7464134693145752, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08941832929849625, "rewards/margins": 0.00651725847274065, "rewards/rejected": -0.09593559056520462, "sft_loss": 0.8941831588745117, "step": 1690 }, { "epoch": 2.7480299050313195, "grad_norm": 0.365510493516922, "learning_rate": 8.464717095022168e-08, "logits/chosen": -0.26350411772727966, "logits/rejected": -0.3258097767829895, "logps/chosen": -0.8289276957511902, "logps/rejected": -0.9933468103408813, "loss": 0.894, "odds_ratio_loss": 0.6506984829902649, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0828927755355835, "rewards/margins": 0.01644190214574337, "rewards/rejected": -0.09933467954397202, "sft_loss": 0.8289276957511902, "step": 1700 }, { "epoch": 2.7641947868256214, "grad_norm": 0.860230565071106, "learning_rate": 7.406380569169841e-08, "logits/chosen": -0.35509008169174194, "logits/rejected": -0.3218967318534851, "logps/chosen": -0.9126371145248413, "logps/rejected": -0.8999163508415222, "loss": 0.9886, "odds_ratio_loss": 0.759522020816803, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.09126370400190353, "rewards/margins": -0.0012720691738650203, "rewards/rejected": -0.08999162912368774, "sft_loss": 0.9126371145248413, "step": 1710 }, { "epoch": 2.7803596686199232, "grad_norm": 2.069009780883789, "learning_rate": 6.417706072013808e-08, "logits/chosen": -0.3513588011264801, "logits/rejected": -0.31902140378952026, "logps/chosen": -0.8999738693237305, "logps/rejected": -0.9839135408401489, "loss": 0.9715, "odds_ratio_loss": 0.7152166366577148, "rewards/accuracies": 0.5, "rewards/chosen": -0.08999738842248917, "rewards/margins": 0.008393971249461174, "rewards/rejected": -0.09839136153459549, "sft_loss": 0.8999738693237305, "step": 1720 }, { "epoch": 2.796524550414225, "grad_norm": 0.59537672996521, "learning_rate": 5.498977506615294e-08, "logits/chosen": -0.33539581298828125, "logits/rejected": -0.36086633801460266, "logps/chosen": -0.8895516395568848, "logps/rejected": -0.9674522280693054, "loss": 0.9602, "odds_ratio_loss": 0.706065833568573, "rewards/accuracies": 0.5, "rewards/chosen": -0.08895515650510788, "rewards/margins": 0.007790066301822662, "rewards/rejected": -0.09674523025751114, "sft_loss": 0.8895516395568848, "step": 1730 }, { "epoch": 2.812689432208527, "grad_norm": 0.4070757031440735, "learning_rate": 4.6504586906947756e-08, "logits/chosen": -0.3671857714653015, "logits/rejected": -0.36166203022003174, "logps/chosen": -0.9486915469169617, "logps/rejected": -0.9999829530715942, "loss": 1.0182, "odds_ratio_loss": 0.6954110860824585, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09486915171146393, "rewards/margins": 0.005129144061356783, "rewards/rejected": -0.09999830275774002, "sft_loss": 0.9486915469169617, "step": 1740 }, { "epoch": 2.828854314002829, "grad_norm": 1.650687336921692, "learning_rate": 3.8723932808754914e-08, "logits/chosen": -0.2851547300815582, "logits/rejected": -0.2857135236263275, "logps/chosen": -0.9708272814750671, "logps/rejected": -0.9912136197090149, "loss": 1.0459, "odds_ratio_loss": 0.7506999969482422, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09708271920681, "rewards/margins": 0.0020386301912367344, "rewards/rejected": -0.0991213470697403, "sft_loss": 0.9708272814750671, "step": 1750 }, { "epoch": 2.8450191957971307, "grad_norm": 0.9035086035728455, "learning_rate": 3.1650047027158014e-08, "logits/chosen": -0.3378879427909851, "logits/rejected": -0.31768563389778137, "logps/chosen": -0.863334059715271, "logps/rejected": -0.9955730438232422, "loss": 0.9285, "odds_ratio_loss": 0.6513949632644653, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08633340895175934, "rewards/margins": 0.013223896734416485, "rewards/rejected": -0.0995573028922081, "sft_loss": 0.863334059715271, "step": 1760 }, { "epoch": 2.8611840775914326, "grad_norm": 0.3864952623844147, "learning_rate": 2.5284960865517848e-08, "logits/chosen": -0.39154380559921265, "logits/rejected": -0.34484562277793884, "logps/chosen": -0.82793790102005, "logps/rejected": -1.0070700645446777, "loss": 0.8928, "odds_ratio_loss": 0.6486603021621704, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08279379457235336, "rewards/margins": 0.0179132129997015, "rewards/rejected": -0.10070700943470001, "sft_loss": 0.82793790102005, "step": 1770 }, { "epoch": 2.8773489593857344, "grad_norm": 0.4017253518104553, "learning_rate": 1.9630502091670388e-08, "logits/chosen": -0.3473368287086487, "logits/rejected": -0.37853848934173584, "logps/chosen": -0.8299247622489929, "logps/rejected": -0.996843695640564, "loss": 0.8926, "odds_ratio_loss": 0.6264339685440063, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08299248665571213, "rewards/margins": 0.016691887751221657, "rewards/rejected": -0.09968437254428864, "sft_loss": 0.8299247622489929, "step": 1780 }, { "epoch": 2.8935138411800363, "grad_norm": 0.7657872438430786, "learning_rate": 1.4688294413074677e-08, "logits/chosen": -0.3813559114933014, "logits/rejected": -0.34783899784088135, "logps/chosen": -0.802249550819397, "logps/rejected": -0.9486366510391235, "loss": 0.8723, "odds_ratio_loss": 0.7008516788482666, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08022496104240417, "rewards/margins": 0.014638709835708141, "rewards/rejected": -0.09486366808414459, "sft_loss": 0.802249550819397, "step": 1790 }, { "epoch": 2.909678722974338, "grad_norm": 0.2962876558303833, "learning_rate": 1.0459757010556626e-08, "logits/chosen": -0.4134625494480133, "logits/rejected": -0.394450843334198, "logps/chosen": -0.8447575569152832, "logps/rejected": -0.919145405292511, "loss": 0.9156, "odds_ratio_loss": 0.7088185548782349, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08447576314210892, "rewards/margins": 0.007438770029693842, "rewards/rejected": -0.09191453456878662, "sft_loss": 0.8447575569152832, "step": 1800 }, { "epoch": 2.92584360476864, "grad_norm": 0.4512230455875397, "learning_rate": 6.94610413078306e-09, "logits/chosen": -0.4446278512477875, "logits/rejected": -0.37901362776756287, "logps/chosen": -0.8928766250610352, "logps/rejected": -1.078958511352539, "loss": 0.9617, "odds_ratio_loss": 0.6879509091377258, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0892876610159874, "rewards/margins": 0.01860819011926651, "rewards/rejected": -0.1078958511352539, "sft_loss": 0.8928766250610352, "step": 1810 }, { "epoch": 2.942008486562942, "grad_norm": 0.2540852427482605, "learning_rate": 4.14834473758563e-09, "logits/chosen": -0.4007115364074707, "logits/rejected": -0.3911517858505249, "logps/chosen": -0.8001864552497864, "logps/rejected": -1.0187556743621826, "loss": 0.8634, "odds_ratio_loss": 0.6319615244865417, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08001864701509476, "rewards/margins": 0.021856937557458878, "rewards/rejected": -0.10187558084726334, "sft_loss": 0.8001864552497864, "step": 1820 }, { "epoch": 2.9581733683572438, "grad_norm": 0.4121166467666626, "learning_rate": 2.067282222230349e-09, "logits/chosen": -0.3413907587528229, "logits/rejected": -0.278145968914032, "logps/chosen": -0.8189884424209595, "logps/rejected": -1.0053989887237549, "loss": 0.881, "odds_ratio_loss": 0.620233416557312, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08189885318279266, "rewards/margins": 0.018641049042344093, "rewards/rejected": -0.10053990036249161, "sft_loss": 0.8189884424209595, "step": 1830 }, { "epoch": 2.9743382501515456, "grad_norm": 3.4636123180389404, "learning_rate": 7.035141727212979e-10, "logits/chosen": -0.3847911059856415, "logits/rejected": -0.34176406264305115, "logps/chosen": -0.8342105746269226, "logps/rejected": -0.9381749033927917, "loss": 0.9018, "odds_ratio_loss": 0.6754266023635864, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08342105895280838, "rewards/margins": 0.010396432131528854, "rewards/rejected": -0.09381748735904694, "sft_loss": 0.8342105746269226, "step": 1840 }, { "epoch": 2.9905031319458475, "grad_norm": 1.2374101877212524, "learning_rate": 5.743220219761592e-11, "logits/chosen": -0.33420827984809875, "logits/rejected": -0.3142699599266052, "logps/chosen": -1.0187790393829346, "logps/rejected": -1.029541015625, "loss": 1.097, "odds_ratio_loss": 0.782578706741333, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.10187790542840958, "rewards/margins": 0.0010761909652501345, "rewards/rejected": -0.10295410454273224, "sft_loss": 1.0187790393829346, "step": 1850 }, { "epoch": 2.9969690846635686, "step": 1854, "total_flos": 1.9948570754930442e+18, "train_loss": 0.9750770799807618, "train_runtime": 17949.5667, "train_samples_per_second": 1.654, "train_steps_per_second": 0.103 } ], "logging_steps": 10, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.9948570754930442e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }