diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7063 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9981298423724285, + "eval_steps": 500, + "global_step": 467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021373230029388193, + "grad_norm": 3.1614959239959717, + "learning_rate": 2.127659574468085e-08, + "logits/chosen": -1.1381689310073853, + "logits/rejected": -0.9913416504859924, + "logps/chosen": -0.2839311957359314, + "logps/rejected": -0.2955534756183624, + "loss": 1.608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7098279595375061, + "rewards/margins": 0.029055748134851456, + "rewards/rejected": -0.7388837337493896, + "step": 1 + }, + { + "epoch": 0.004274646005877639, + "grad_norm": 7.915106773376465, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": -1.0311710834503174, + "logits/rejected": -0.8901023864746094, + "logps/chosen": -0.24952735006809235, + "logps/rejected": -0.24253402650356293, + "loss": 1.6086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6238184571266174, + "rewards/margins": -0.01748332567512989, + "rewards/rejected": -0.6063351035118103, + "step": 2 + }, + { + "epoch": 0.006411969008816457, + "grad_norm": 7.10002326965332, + "learning_rate": 6.382978723404254e-08, + "logits/chosen": -0.9257642030715942, + "logits/rejected": -0.8286958932876587, + "logps/chosen": -0.2627497911453247, + "logps/rejected": -0.2670031189918518, + "loss": 1.6177, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6568744778633118, + "rewards/margins": 0.010633318684995174, + "rewards/rejected": -0.6675078272819519, + "step": 3 + }, + { + "epoch": 0.008549292011755277, + "grad_norm": 4.93231201171875, + "learning_rate": 8.51063829787234e-08, + "logits/chosen": -0.8513132929801941, + "logits/rejected": -0.7548086643218994, + "logps/chosen": -0.2775232195854187, + "logps/rejected": -0.26499998569488525, + "loss": 1.6472, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6938079595565796, + "rewards/margins": -0.03130800649523735, + "rewards/rejected": -0.6625000238418579, + "step": 4 + }, + { + "epoch": 0.010686615014694095, + "grad_norm": 7.3820319175720215, + "learning_rate": 1.0638297872340425e-07, + "logits/chosen": -1.1639097929000854, + "logits/rejected": -1.2206344604492188, + "logps/chosen": -0.28828343749046326, + "logps/rejected": -0.29974380135536194, + "loss": 1.6096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7207085490226746, + "rewards/margins": 0.028650924563407898, + "rewards/rejected": -0.7493594884872437, + "step": 5 + }, + { + "epoch": 0.012823938017632914, + "grad_norm": 4.583745002746582, + "learning_rate": 1.2765957446808508e-07, + "logits/chosen": -1.078372597694397, + "logits/rejected": -1.0467870235443115, + "logps/chosen": -0.2583101987838745, + "logps/rejected": -0.27993106842041016, + "loss": 1.6052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.645775556564331, + "rewards/margins": 0.054052069783210754, + "rewards/rejected": -0.6998275518417358, + "step": 6 + }, + { + "epoch": 0.014961261020571734, + "grad_norm": 7.005415916442871, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": -0.7785229682922363, + "logits/rejected": -0.7655000686645508, + "logps/chosen": -0.2553212642669678, + "logps/rejected": -0.24673190712928772, + "loss": 1.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6383031606674194, + "rewards/margins": -0.021473374217748642, + "rewards/rejected": -0.6168297529220581, + "step": 7 + }, + { + "epoch": 0.017098584023510555, + "grad_norm": 3.668518304824829, + "learning_rate": 1.702127659574468e-07, + "logits/chosen": -1.0131795406341553, + "logits/rejected": -1.0326677560806274, + "logps/chosen": -0.24508661031723022, + "logps/rejected": -0.26711565256118774, + "loss": 1.5828, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6127164959907532, + "rewards/margins": 0.0550725981593132, + "rewards/rejected": -0.667789101600647, + "step": 8 + }, + { + "epoch": 0.01923590702644937, + "grad_norm": 3.8814697265625, + "learning_rate": 1.9148936170212765e-07, + "logits/chosen": -0.9995537996292114, + "logits/rejected": -0.8747727274894714, + "logps/chosen": -0.2774883508682251, + "logps/rejected": -0.25593557953834534, + "loss": 1.6179, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.693720817565918, + "rewards/margins": -0.05388186126947403, + "rewards/rejected": -0.6398389935493469, + "step": 9 + }, + { + "epoch": 0.02137323002938819, + "grad_norm": 7.050100803375244, + "learning_rate": 2.127659574468085e-07, + "logits/chosen": -1.0684819221496582, + "logits/rejected": -0.9995761513710022, + "logps/chosen": -0.3254011273384094, + "logps/rejected": -0.31700819730758667, + "loss": 1.6215, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8135027885437012, + "rewards/margins": -0.020982395857572556, + "rewards/rejected": -0.7925204038619995, + "step": 10 + }, + { + "epoch": 0.02351055303232701, + "grad_norm": 6.378317356109619, + "learning_rate": 2.3404255319148937e-07, + "logits/chosen": -0.960267186164856, + "logits/rejected": -0.8816654086112976, + "logps/chosen": -0.28366273641586304, + "logps/rejected": -0.2510431110858917, + "loss": 1.6101, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.70915687084198, + "rewards/margins": -0.0815490186214447, + "rewards/rejected": -0.6276078224182129, + "step": 11 + }, + { + "epoch": 0.02564787603526583, + "grad_norm": 2.851073980331421, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": -0.9916080832481384, + "logits/rejected": -1.0304456949234009, + "logps/chosen": -0.2935165464878082, + "logps/rejected": -0.26734259724617004, + "loss": 1.6198, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7337914705276489, + "rewards/margins": -0.06543491035699844, + "rewards/rejected": -0.6683565378189087, + "step": 12 + }, + { + "epoch": 0.027785199038204648, + "grad_norm": 6.843414306640625, + "learning_rate": 2.7659574468085106e-07, + "logits/chosen": -0.865218997001648, + "logits/rejected": -0.8769045472145081, + "logps/chosen": -0.3183431327342987, + "logps/rejected": -0.31171083450317383, + "loss": 1.6, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7958577275276184, + "rewards/margins": -0.016580628231167793, + "rewards/rejected": -0.7792771458625793, + "step": 13 + }, + { + "epoch": 0.029922522041143467, + "grad_norm": 5.762980937957764, + "learning_rate": 2.978723404255319e-07, + "logits/chosen": -0.8773849606513977, + "logits/rejected": -0.8515525460243225, + "logps/chosen": -0.31379514932632446, + "logps/rejected": -0.29587048292160034, + "loss": 1.624, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7844878435134888, + "rewards/margins": -0.04481163248419762, + "rewards/rejected": -0.7396762371063232, + "step": 14 + }, + { + "epoch": 0.03205984504408229, + "grad_norm": 5.013691425323486, + "learning_rate": 3.1914893617021275e-07, + "logits/chosen": -1.0915770530700684, + "logits/rejected": -0.9239784479141235, + "logps/chosen": -0.2713887393474579, + "logps/rejected": -0.2902137339115143, + "loss": 1.5727, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6784718632698059, + "rewards/margins": 0.047062430530786514, + "rewards/rejected": -0.7255342602729797, + "step": 15 + }, + { + "epoch": 0.03419716804702111, + "grad_norm": 4.163066387176514, + "learning_rate": 3.404255319148936e-07, + "logits/chosen": -0.9121224880218506, + "logits/rejected": -0.8888986706733704, + "logps/chosen": -0.2657870948314667, + "logps/rejected": -0.28352442383766174, + "loss": 1.6097, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6644677519798279, + "rewards/margins": 0.04434328153729439, + "rewards/rejected": -0.7088110446929932, + "step": 16 + }, + { + "epoch": 0.03633449104995993, + "grad_norm": 7.174858570098877, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": -0.7462605834007263, + "logits/rejected": -0.7856797575950623, + "logps/chosen": -0.27784857153892517, + "logps/rejected": -0.2904101610183716, + "loss": 1.5875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6946213841438293, + "rewards/margins": 0.031404003500938416, + "rewards/rejected": -0.7260254621505737, + "step": 17 + }, + { + "epoch": 0.03847181405289874, + "grad_norm": 4.658429145812988, + "learning_rate": 3.829787234042553e-07, + "logits/chosen": -1.097366213798523, + "logits/rejected": -1.181847333908081, + "logps/chosen": -0.2530037760734558, + "logps/rejected": -0.2565597891807556, + "loss": 1.6262, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6325094699859619, + "rewards/margins": 0.008889976888895035, + "rewards/rejected": -0.6413994431495667, + "step": 18 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 6.409156799316406, + "learning_rate": 4.0425531914893614e-07, + "logits/chosen": -1.1247719526290894, + "logits/rejected": -1.0388509035110474, + "logps/chosen": -0.2717263698577881, + "logps/rejected": -0.28198879957199097, + "loss": 1.6261, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6793159246444702, + "rewards/margins": 0.02565601095557213, + "rewards/rejected": -0.704971969127655, + "step": 19 + }, + { + "epoch": 0.04274646005877638, + "grad_norm": 4.293993949890137, + "learning_rate": 4.25531914893617e-07, + "logits/chosen": -1.0168251991271973, + "logits/rejected": -1.0034692287445068, + "logps/chosen": -0.24037577211856842, + "logps/rejected": -0.23601552844047546, + "loss": 1.618, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6009395122528076, + "rewards/margins": -0.01090063713490963, + "rewards/rejected": -0.5900388360023499, + "step": 20 + }, + { + "epoch": 0.0448837830617152, + "grad_norm": 10.061914443969727, + "learning_rate": 4.4680851063829783e-07, + "logits/chosen": -1.1607167720794678, + "logits/rejected": -1.0358400344848633, + "logps/chosen": -0.3354223072528839, + "logps/rejected": -0.27505114674568176, + "loss": 1.6588, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8385557532310486, + "rewards/margins": -0.1509278416633606, + "rewards/rejected": -0.6876278519630432, + "step": 21 + }, + { + "epoch": 0.04702110606465402, + "grad_norm": 2.9843106269836426, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": -1.0770177841186523, + "logits/rejected": -1.0807621479034424, + "logps/chosen": -0.26381832361221313, + "logps/rejected": -0.2722492814064026, + "loss": 1.6007, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6595457792282104, + "rewards/margins": 0.02107742801308632, + "rewards/rejected": -0.6806231737136841, + "step": 22 + }, + { + "epoch": 0.04915842906759284, + "grad_norm": 6.568375110626221, + "learning_rate": 4.893617021276595e-07, + "logits/chosen": -0.8165597915649414, + "logits/rejected": -0.9183764457702637, + "logps/chosen": -0.2863512933254242, + "logps/rejected": -0.29046231508255005, + "loss": 1.567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7158782482147217, + "rewards/margins": 0.010277565568685532, + "rewards/rejected": -0.7261557579040527, + "step": 23 + }, + { + "epoch": 0.05129575207053166, + "grad_norm": 6.657776355743408, + "learning_rate": 5.106382978723403e-07, + "logits/chosen": -1.026950716972351, + "logits/rejected": -1.0364724397659302, + "logps/chosen": -0.2393265664577484, + "logps/rejected": -0.2672955393791199, + "loss": 1.5978, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5983164310455322, + "rewards/margins": 0.06992244720458984, + "rewards/rejected": -0.6682388782501221, + "step": 24 + }, + { + "epoch": 0.053433075073470476, + "grad_norm": 4.221382141113281, + "learning_rate": 5.319148936170212e-07, + "logits/chosen": -1.0916444063186646, + "logits/rejected": -0.9851425886154175, + "logps/chosen": -0.2879200279712677, + "logps/rejected": -0.32731401920318604, + "loss": 1.5978, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7198000550270081, + "rewards/margins": 0.09848497807979584, + "rewards/rejected": -0.8182849884033203, + "step": 25 + }, + { + "epoch": 0.055570398076409296, + "grad_norm": 4.939723968505859, + "learning_rate": 5.531914893617021e-07, + "logits/chosen": -1.227979063987732, + "logits/rejected": -1.0881894826889038, + "logps/chosen": -0.3044770658016205, + "logps/rejected": -0.29919499158859253, + "loss": 1.6223, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7611927390098572, + "rewards/margins": -0.013205248862504959, + "rewards/rejected": -0.7479875087738037, + "step": 26 + }, + { + "epoch": 0.057707721079348115, + "grad_norm": 4.797338485717773, + "learning_rate": 5.74468085106383e-07, + "logits/chosen": -0.9999684691429138, + "logits/rejected": -0.9719603061676025, + "logps/chosen": -0.2676877975463867, + "logps/rejected": -0.2837084233760834, + "loss": 1.6208, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6692195534706116, + "rewards/margins": 0.040051497519016266, + "rewards/rejected": -0.7092710137367249, + "step": 27 + }, + { + "epoch": 0.059845044082286934, + "grad_norm": 5.77163028717041, + "learning_rate": 5.957446808510638e-07, + "logits/chosen": -0.8914034366607666, + "logits/rejected": -0.9402093291282654, + "logps/chosen": -0.2679407596588135, + "logps/rejected": -0.2759988307952881, + "loss": 1.618, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6698518991470337, + "rewards/margins": 0.020145151764154434, + "rewards/rejected": -0.6899970173835754, + "step": 28 + }, + { + "epoch": 0.061982367085225754, + "grad_norm": 4.006846904754639, + "learning_rate": 6.170212765957446e-07, + "logits/chosen": -1.0984641313552856, + "logits/rejected": -1.0189847946166992, + "logps/chosen": -0.26423007249832153, + "logps/rejected": -0.2647170126438141, + "loss": 1.6184, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.660575270652771, + "rewards/margins": 0.001217234879732132, + "rewards/rejected": -0.6617924571037292, + "step": 29 + }, + { + "epoch": 0.06411969008816458, + "grad_norm": 7.086247444152832, + "learning_rate": 6.382978723404255e-07, + "logits/chosen": -1.1409313678741455, + "logits/rejected": -0.911389172077179, + "logps/chosen": -0.27589118480682373, + "logps/rejected": -0.25251269340515137, + "loss": 1.6156, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6897279024124146, + "rewards/margins": -0.05844618380069733, + "rewards/rejected": -0.6312817335128784, + "step": 30 + }, + { + "epoch": 0.06625701309110339, + "grad_norm": 5.860540866851807, + "learning_rate": 6.595744680851063e-07, + "logits/chosen": -1.0596109628677368, + "logits/rejected": -0.9196721315383911, + "logps/chosen": -0.2705378532409668, + "logps/rejected": -0.32445117831230164, + "loss": 1.5929, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6763445734977722, + "rewards/margins": 0.13478338718414307, + "rewards/rejected": -0.8111280202865601, + "step": 31 + }, + { + "epoch": 0.06839433609404222, + "grad_norm": 9.763710021972656, + "learning_rate": 6.808510638297872e-07, + "logits/chosen": -0.8758100867271423, + "logits/rejected": -0.8177347183227539, + "logps/chosen": -0.26637426018714905, + "logps/rejected": -0.29131248593330383, + "loss": 1.5963, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6659356355667114, + "rewards/margins": 0.06234561279416084, + "rewards/rejected": -0.7282812595367432, + "step": 32 + }, + { + "epoch": 0.07053165909698103, + "grad_norm": 5.033257484436035, + "learning_rate": 7.021276595744681e-07, + "logits/chosen": -1.0621229410171509, + "logits/rejected": -0.9115914702415466, + "logps/chosen": -0.30500558018684387, + "logps/rejected": -0.2638895809650421, + "loss": 1.6286, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7625139951705933, + "rewards/margins": -0.10279002785682678, + "rewards/rejected": -0.6597238779067993, + "step": 33 + }, + { + "epoch": 0.07266898209991986, + "grad_norm": 6.103218078613281, + "learning_rate": 7.23404255319149e-07, + "logits/chosen": -1.0077497959136963, + "logits/rejected": -0.9838278889656067, + "logps/chosen": -0.29778575897216797, + "logps/rejected": -0.3331536054611206, + "loss": 1.5943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7444643378257751, + "rewards/margins": 0.08841972053050995, + "rewards/rejected": -0.8328840732574463, + "step": 34 + }, + { + "epoch": 0.07480630510285867, + "grad_norm": 19.696596145629883, + "learning_rate": 7.446808510638297e-07, + "logits/chosen": -1.0255154371261597, + "logits/rejected": -1.0052193403244019, + "logps/chosen": -0.2500300705432892, + "logps/rejected": -0.284759521484375, + "loss": 1.6112, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6250751614570618, + "rewards/margins": 0.0868237167596817, + "rewards/rejected": -0.7118988633155823, + "step": 35 + }, + { + "epoch": 0.07694362810579748, + "grad_norm": 8.413592338562012, + "learning_rate": 7.659574468085106e-07, + "logits/chosen": -0.7250477075576782, + "logits/rejected": -0.5996040105819702, + "logps/chosen": -0.27290984988212585, + "logps/rejected": -0.29640769958496094, + "loss": 1.5972, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.682274580001831, + "rewards/margins": 0.05874472111463547, + "rewards/rejected": -0.7410193085670471, + "step": 36 + }, + { + "epoch": 0.07908095110873631, + "grad_norm": 5.761231899261475, + "learning_rate": 7.872340425531915e-07, + "logits/chosen": -0.9720395803451538, + "logits/rejected": -0.9105511903762817, + "logps/chosen": -0.27565720677375793, + "logps/rejected": -0.2629649341106415, + "loss": 1.6134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6891430616378784, + "rewards/margins": -0.03173065185546875, + "rewards/rejected": -0.6574123501777649, + "step": 37 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 11.410785675048828, + "learning_rate": 8.085106382978723e-07, + "logits/chosen": -0.8644598722457886, + "logits/rejected": -0.9072043895721436, + "logps/chosen": -0.3011826276779175, + "logps/rejected": -0.29128536581993103, + "loss": 1.6565, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7529566287994385, + "rewards/margins": -0.024743108078837395, + "rewards/rejected": -0.7282134890556335, + "step": 38 + }, + { + "epoch": 0.08335559711461395, + "grad_norm": 6.3143696784973145, + "learning_rate": 8.297872340425532e-07, + "logits/chosen": -1.0631940364837646, + "logits/rejected": -1.1349154710769653, + "logps/chosen": -0.256188303232193, + "logps/rejected": -0.27545157074928284, + "loss": 1.5869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6404708027839661, + "rewards/margins": 0.048158105462789536, + "rewards/rejected": -0.6886288523674011, + "step": 39 + }, + { + "epoch": 0.08549292011755276, + "grad_norm": 4.655215740203857, + "learning_rate": 8.51063829787234e-07, + "logits/chosen": -1.0722713470458984, + "logits/rejected": -1.090339183807373, + "logps/chosen": -0.2882351577281952, + "logps/rejected": -0.29485568404197693, + "loss": 1.6342, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7205879092216492, + "rewards/margins": 0.01655130460858345, + "rewards/rejected": -0.7371392846107483, + "step": 40 + }, + { + "epoch": 0.08763024312049159, + "grad_norm": 10.655563354492188, + "learning_rate": 8.723404255319149e-07, + "logits/chosen": -1.0175336599349976, + "logits/rejected": -0.878496527671814, + "logps/chosen": -0.31010520458221436, + "logps/rejected": -0.33926013112068176, + "loss": 1.5843, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7752628922462463, + "rewards/margins": 0.07288742810487747, + "rewards/rejected": -0.8481503129005432, + "step": 41 + }, + { + "epoch": 0.0897675661234304, + "grad_norm": 6.481273174285889, + "learning_rate": 8.936170212765957e-07, + "logits/chosen": -1.056595802307129, + "logits/rejected": -0.9111218452453613, + "logps/chosen": -0.28797075152397156, + "logps/rejected": -0.3270872235298157, + "loss": 1.6069, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7199268937110901, + "rewards/margins": 0.0977911651134491, + "rewards/rejected": -0.8177180290222168, + "step": 42 + }, + { + "epoch": 0.09190488912636922, + "grad_norm": 6.524636745452881, + "learning_rate": 9.148936170212766e-07, + "logits/chosen": -0.7545532584190369, + "logits/rejected": -0.8094898462295532, + "logps/chosen": -0.35466429591178894, + "logps/rejected": -0.28596031665802, + "loss": 1.6206, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8866608738899231, + "rewards/margins": -0.17176005244255066, + "rewards/rejected": -0.7149007320404053, + "step": 43 + }, + { + "epoch": 0.09404221212930804, + "grad_norm": 15.254437446594238, + "learning_rate": 9.361702127659575e-07, + "logits/chosen": -0.909596860408783, + "logits/rejected": -0.8520998954772949, + "logps/chosen": -0.41655728220939636, + "logps/rejected": -0.4312165379524231, + "loss": 1.5864, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0413931608200073, + "rewards/margins": 0.03664811700582504, + "rewards/rejected": -1.0780413150787354, + "step": 44 + }, + { + "epoch": 0.09617953513224686, + "grad_norm": 3.75406551361084, + "learning_rate": 9.574468085106384e-07, + "logits/chosen": -0.9761062264442444, + "logits/rejected": -0.8622941374778748, + "logps/chosen": -0.31024113297462463, + "logps/rejected": -0.29050612449645996, + "loss": 1.6133, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7756028175354004, + "rewards/margins": -0.0493374839425087, + "rewards/rejected": -0.7262653708457947, + "step": 45 + }, + { + "epoch": 0.09831685813518568, + "grad_norm": 4.00916051864624, + "learning_rate": 9.78723404255319e-07, + "logits/chosen": -0.8587102890014648, + "logits/rejected": -0.9007890820503235, + "logps/chosen": -0.2729268968105316, + "logps/rejected": -0.28749462962150574, + "loss": 1.6015, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6823172569274902, + "rewards/margins": 0.036419324576854706, + "rewards/rejected": -0.7187365889549255, + "step": 46 + }, + { + "epoch": 0.1004541811381245, + "grad_norm": 6.543757438659668, + "learning_rate": 1e-06, + "logits/chosen": -0.9861698150634766, + "logits/rejected": -1.0527900457382202, + "logps/chosen": -0.261736124753952, + "logps/rejected": -0.30613070726394653, + "loss": 1.5661, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6543402671813965, + "rewards/margins": 0.11098647862672806, + "rewards/rejected": -0.7653267979621887, + "step": 47 + }, + { + "epoch": 0.10259150414106331, + "grad_norm": 4.550771713256836, + "learning_rate": 9.999860125306348e-07, + "logits/chosen": -0.765229344367981, + "logits/rejected": -0.9412630796432495, + "logps/chosen": -0.30382850766181946, + "logps/rejected": -0.3371865153312683, + "loss": 1.6164, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7595713138580322, + "rewards/margins": 0.08339503407478333, + "rewards/rejected": -0.8429663181304932, + "step": 48 + }, + { + "epoch": 0.10472882714400214, + "grad_norm": 3.9836678504943848, + "learning_rate": 9.999440509051367e-07, + "logits/chosen": -1.0350228548049927, + "logits/rejected": -0.8914788961410522, + "logps/chosen": -0.31590813398361206, + "logps/rejected": -0.34474560618400574, + "loss": 1.5679, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7897703647613525, + "rewards/margins": 0.07209358364343643, + "rewards/rejected": -0.861863911151886, + "step": 49 + }, + { + "epoch": 0.10686615014694095, + "grad_norm": 4.762299537658691, + "learning_rate": 9.998741174712533e-07, + "logits/chosen": -0.99764084815979, + "logits/rejected": -0.9120419025421143, + "logps/chosen": -0.32988840341567993, + "logps/rejected": -0.37920716404914856, + "loss": 1.5959, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8247209787368774, + "rewards/margins": 0.12329696863889694, + "rewards/rejected": -0.9480178356170654, + "step": 50 + }, + { + "epoch": 0.10900347314987978, + "grad_norm": 4.453758239746094, + "learning_rate": 9.997762161417517e-07, + "logits/chosen": -0.7613246440887451, + "logits/rejected": -0.7792637348175049, + "logps/chosen": -0.2225038707256317, + "logps/rejected": -0.2605491876602173, + "loss": 1.5962, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5562596321105957, + "rewards/margins": 0.09511324763298035, + "rewards/rejected": -0.6513729095458984, + "step": 51 + }, + { + "epoch": 0.11114079615281859, + "grad_norm": 5.08111047744751, + "learning_rate": 9.996503523941992e-07, + "logits/chosen": -0.9457738399505615, + "logits/rejected": -1.0001921653747559, + "logps/chosen": -0.30008092522621155, + "logps/rejected": -0.33674395084381104, + "loss": 1.5988, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7502023577690125, + "rewards/margins": 0.09165750443935394, + "rewards/rejected": -0.8418598175048828, + "step": 52 + }, + { + "epoch": 0.11327811915575742, + "grad_norm": 19.494726181030273, + "learning_rate": 9.994965332706572e-07, + "logits/chosen": -0.9162960052490234, + "logits/rejected": -0.8188440799713135, + "logps/chosen": -0.28970617055892944, + "logps/rejected": -0.31052201986312866, + "loss": 1.6028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.724265456199646, + "rewards/margins": 0.05203955993056297, + "rewards/rejected": -0.7763050198554993, + "step": 53 + }, + { + "epoch": 0.11541544215869623, + "grad_norm": 4.843199729919434, + "learning_rate": 9.99314767377287e-07, + "logits/chosen": -1.0256187915802002, + "logits/rejected": -0.9491410851478577, + "logps/chosen": -0.2758142650127411, + "logps/rejected": -0.30541473627090454, + "loss": 1.5812, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6895356178283691, + "rewards/margins": 0.07400117814540863, + "rewards/rejected": -0.7635368704795837, + "step": 54 + }, + { + "epoch": 0.11755276516163506, + "grad_norm": 4.721622943878174, + "learning_rate": 9.991050648838675e-07, + "logits/chosen": -1.0089685916900635, + "logits/rejected": -0.8246462345123291, + "logps/chosen": -0.2628995478153229, + "logps/rejected": -0.26574474573135376, + "loss": 1.624, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.657248854637146, + "rewards/margins": 0.007113074883818626, + "rewards/rejected": -0.6643618941307068, + "step": 55 + }, + { + "epoch": 0.11969008816457387, + "grad_norm": 17.51543426513672, + "learning_rate": 9.98867437523228e-07, + "logits/chosen": -0.8552293181419373, + "logits/rejected": -0.8588695526123047, + "logps/chosen": -0.3222196102142334, + "logps/rejected": -0.3183228075504303, + "loss": 1.5887, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8055489659309387, + "rewards/margins": -0.009741947054862976, + "rewards/rejected": -0.7958070039749146, + "step": 56 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 4.062877178192139, + "learning_rate": 9.986018985905899e-07, + "logits/chosen": -0.941673219203949, + "logits/rejected": -0.9555226564407349, + "logps/chosen": -0.245195671916008, + "logps/rejected": -0.28370341658592224, + "loss": 1.6183, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6129892468452454, + "rewards/margins": 0.09626930952072144, + "rewards/rejected": -0.709258496761322, + "step": 57 + }, + { + "epoch": 0.12396473417045151, + "grad_norm": 4.429107189178467, + "learning_rate": 9.983084629428244e-07, + "logits/chosen": -1.1889485120773315, + "logits/rejected": -1.0553314685821533, + "logps/chosen": -0.34671419858932495, + "logps/rejected": -0.3131285011768341, + "loss": 1.6173, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8667855262756348, + "rewards/margins": -0.0839642882347107, + "rewards/rejected": -0.7828212976455688, + "step": 58 + }, + { + "epoch": 0.12610205717339032, + "grad_norm": 5.847541332244873, + "learning_rate": 9.979871469976195e-07, + "logits/chosen": -0.8485190868377686, + "logits/rejected": -0.8948504328727722, + "logps/chosen": -0.23155483603477478, + "logps/rejected": -0.2654906213283539, + "loss": 1.6077, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5788871049880981, + "rewards/margins": 0.08483947813510895, + "rewards/rejected": -0.6637265682220459, + "step": 59 + }, + { + "epoch": 0.12823938017632916, + "grad_norm": 4.097656726837158, + "learning_rate": 9.97637968732563e-07, + "logits/chosen": -0.963527500629425, + "logits/rejected": -0.9208120703697205, + "logps/chosen": -0.28316250443458557, + "logps/rejected": -0.28164517879486084, + "loss": 1.582, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7079063057899475, + "rewards/margins": -0.0037933755666017532, + "rewards/rejected": -0.7041129469871521, + "step": 60 + }, + { + "epoch": 0.13037670317926797, + "grad_norm": 12.278409004211426, + "learning_rate": 9.972609476841365e-07, + "logits/chosen": -0.8986336588859558, + "logits/rejected": -0.9140303134918213, + "logps/chosen": -0.2507745921611786, + "logps/rejected": -0.25815582275390625, + "loss": 1.5789, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6269363164901733, + "rewards/margins": 0.018453147262334824, + "rewards/rejected": -0.6453895568847656, + "step": 61 + }, + { + "epoch": 0.13251402618220678, + "grad_norm": 8.683540344238281, + "learning_rate": 9.968561049466213e-07, + "logits/chosen": -0.9374087452888489, + "logits/rejected": -1.089040994644165, + "logps/chosen": -0.26520273089408875, + "logps/rejected": -0.2902393937110901, + "loss": 1.6468, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6630067825317383, + "rewards/margins": 0.06259168684482574, + "rewards/rejected": -0.7255985140800476, + "step": 62 + }, + { + "epoch": 0.1346513491851456, + "grad_norm": 4.520249843597412, + "learning_rate": 9.964234631709185e-07, + "logits/chosen": -0.9794274568557739, + "logits/rejected": -1.1607794761657715, + "logps/chosen": -0.2808447480201721, + "logps/rejected": -0.305492103099823, + "loss": 1.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7021118402481079, + "rewards/margins": 0.06161835044622421, + "rewards/rejected": -0.7637301683425903, + "step": 63 + }, + { + "epoch": 0.13678867218808444, + "grad_norm": 4.595892906188965, + "learning_rate": 9.959630465632831e-07, + "logits/chosen": -0.9844315052032471, + "logits/rejected": -0.8630974888801575, + "logps/chosen": -0.37489163875579834, + "logps/rejected": -0.2756366729736328, + "loss": 1.652, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9372289180755615, + "rewards/margins": -0.24813732504844666, + "rewards/rejected": -0.689091682434082, + "step": 64 + }, + { + "epoch": 0.13892599519102325, + "grad_norm": 8.821249008178711, + "learning_rate": 9.954748808839674e-07, + "logits/chosen": -1.018763780593872, + "logits/rejected": -1.0369865894317627, + "logps/chosen": -0.2777743935585022, + "logps/rejected": -0.2588828206062317, + "loss": 1.6033, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6944360136985779, + "rewards/margins": -0.047228869050741196, + "rewards/rejected": -0.6472070813179016, + "step": 65 + }, + { + "epoch": 0.14106331819396206, + "grad_norm": 3.644948959350586, + "learning_rate": 9.949589934457814e-07, + "logits/chosen": -1.0388680696487427, + "logits/rejected": -1.0096322298049927, + "logps/chosen": -0.3784153163433075, + "logps/rejected": -0.4555840790271759, + "loss": 1.5949, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9460383057594299, + "rewards/margins": 0.19292199611663818, + "rewards/rejected": -1.1389602422714233, + "step": 66 + }, + { + "epoch": 0.14320064119690087, + "grad_norm": 3.4237723350524902, + "learning_rate": 9.944154131125642e-07, + "logits/chosen": -0.9876866936683655, + "logits/rejected": -1.1050664186477661, + "logps/chosen": -0.6056898236274719, + "logps/rejected": -0.5897864699363708, + "loss": 1.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5142244100570679, + "rewards/margins": -0.03975825011730194, + "rewards/rejected": -1.4744662046432495, + "step": 67 + }, + { + "epoch": 0.14533796419983971, + "grad_norm": 4.325132846832275, + "learning_rate": 9.938441702975689e-07, + "logits/chosen": -1.0274345874786377, + "logits/rejected": -0.9806603789329529, + "logps/chosen": -0.38565298914909363, + "logps/rejected": -0.34850040078163147, + "loss": 1.6198, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9641326069831848, + "rewards/margins": -0.09288151562213898, + "rewards/rejected": -0.8712509870529175, + "step": 68 + }, + { + "epoch": 0.14747528720277853, + "grad_norm": 7.449862480163574, + "learning_rate": 9.932452969617607e-07, + "logits/chosen": -0.9729686975479126, + "logits/rejected": -0.8644128441810608, + "logps/chosen": -0.3008931577205658, + "logps/rejected": -0.4920026361942291, + "loss": 1.5732, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7522329092025757, + "rewards/margins": 0.47777363657951355, + "rewards/rejected": -1.2300065755844116, + "step": 69 + }, + { + "epoch": 0.14961261020571734, + "grad_norm": 4.059993743896484, + "learning_rate": 9.926188266120295e-07, + "logits/chosen": -0.9896879196166992, + "logits/rejected": -0.9051238894462585, + "logps/chosen": -0.30066195130348206, + "logps/rejected": -0.25544169545173645, + "loss": 1.7178, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.751654863357544, + "rewards/margins": -0.11305058002471924, + "rewards/rejected": -0.6386042833328247, + "step": 70 + }, + { + "epoch": 0.15174993320865615, + "grad_norm": 4.462944507598877, + "learning_rate": 9.919647942993147e-07, + "logits/chosen": -1.0837461948394775, + "logits/rejected": -1.0208004713058472, + "logps/chosen": -0.33584755659103394, + "logps/rejected": -0.49902307987213135, + "loss": 1.6114, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8396189212799072, + "rewards/margins": 0.40793871879577637, + "rewards/rejected": -1.2475576400756836, + "step": 71 + }, + { + "epoch": 0.15388725621159496, + "grad_norm": 3.49957275390625, + "learning_rate": 9.912832366166441e-07, + "logits/chosen": -1.057317852973938, + "logits/rejected": -1.04885995388031, + "logps/chosen": -0.27743101119995117, + "logps/rejected": -0.2855708599090576, + "loss": 1.5374, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6935775279998779, + "rewards/margins": 0.02034958079457283, + "rewards/rejected": -0.7139270901679993, + "step": 72 + }, + { + "epoch": 0.1560245792145338, + "grad_norm": 6.884411811828613, + "learning_rate": 9.905741916970863e-07, + "logits/chosen": -0.7818067073822021, + "logits/rejected": -0.791998028755188, + "logps/chosen": -0.44259050488471985, + "logps/rejected": -0.6377058029174805, + "loss": 1.5573, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1064761877059937, + "rewards/margins": 0.4877880811691284, + "rewards/rejected": -1.594264268875122, + "step": 73 + }, + { + "epoch": 0.15816190221747262, + "grad_norm": 25.20929718017578, + "learning_rate": 9.898376992116177e-07, + "logits/chosen": -0.9175440073013306, + "logits/rejected": -0.9666653275489807, + "logps/chosen": -0.3093183934688568, + "logps/rejected": -0.29086652398109436, + "loss": 1.6364, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7732959985733032, + "rewards/margins": -0.04612968489527702, + "rewards/rejected": -0.7271662950515747, + "step": 74 + }, + { + "epoch": 0.16029922522041143, + "grad_norm": 7.956474781036377, + "learning_rate": 9.890738003669027e-07, + "logits/chosen": -1.12689208984375, + "logits/rejected": -1.0710712671279907, + "logps/chosen": -0.30693331360816956, + "logps/rejected": -0.39591145515441895, + "loss": 1.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7673332691192627, + "rewards/margins": 0.22244536876678467, + "rewards/rejected": -0.9897785186767578, + "step": 75 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 5.171202659606934, + "learning_rate": 9.882825379029882e-07, + "logits/chosen": -1.2351804971694946, + "logits/rejected": -1.2987933158874512, + "logps/chosen": -0.26825714111328125, + "logps/rejected": -0.3104426860809326, + "loss": 1.633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6706428527832031, + "rewards/margins": 0.1054638922214508, + "rewards/rejected": -0.7761067152023315, + "step": 76 + }, + { + "epoch": 0.16457387122628908, + "grad_norm": 4.766253471374512, + "learning_rate": 9.874639560909118e-07, + "logits/chosen": -0.9250165224075317, + "logits/rejected": -0.8637949228286743, + "logps/chosen": -0.3042350709438324, + "logps/rejected": -0.37980011105537415, + "loss": 1.5511, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.760587751865387, + "rewards/margins": 0.18891258537769318, + "rewards/rejected": -0.949500322341919, + "step": 77 + }, + { + "epoch": 0.1667111942292279, + "grad_norm": 10.560595512390137, + "learning_rate": 9.866181007302256e-07, + "logits/chosen": -1.0456985235214233, + "logits/rejected": -1.0086830854415894, + "logps/chosen": -0.3072804808616638, + "logps/rejected": -0.2886378765106201, + "loss": 1.5779, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7682012915611267, + "rewards/margins": -0.046606533229351044, + "rewards/rejected": -0.7215947508811951, + "step": 78 + }, + { + "epoch": 0.1688485172321667, + "grad_norm": 6.801359176635742, + "learning_rate": 9.857450191464337e-07, + "logits/chosen": -0.8191673159599304, + "logits/rejected": -0.9271634817123413, + "logps/chosen": -0.38711708784103394, + "logps/rejected": -0.6279405951499939, + "loss": 1.533, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9677926898002625, + "rewards/margins": 0.602059006690979, + "rewards/rejected": -1.5698516368865967, + "step": 79 + }, + { + "epoch": 0.17098584023510552, + "grad_norm": 9.583541870117188, + "learning_rate": 9.848447601883433e-07, + "logits/chosen": -1.2207063436508179, + "logits/rejected": -1.0649924278259277, + "logps/chosen": -0.39880311489105225, + "logps/rejected": -0.33250871300697327, + "loss": 1.5893, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9970077276229858, + "rewards/margins": -0.16573598980903625, + "rewards/rejected": -0.831271767616272, + "step": 80 + }, + { + "epoch": 0.17312316323804436, + "grad_norm": 8.752836227416992, + "learning_rate": 9.839173742253334e-07, + "logits/chosen": -1.181122899055481, + "logits/rejected": -1.1287283897399902, + "logps/chosen": -0.3108143210411072, + "logps/rejected": -0.38757073879241943, + "loss": 1.586, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7770357728004456, + "rewards/margins": 0.19189102947711945, + "rewards/rejected": -0.9689267873764038, + "step": 81 + }, + { + "epoch": 0.17526048624098317, + "grad_norm": 9.698758125305176, + "learning_rate": 9.82962913144534e-07, + "logits/chosen": -1.0705887079238892, + "logits/rejected": -0.8216981887817383, + "logps/chosen": -0.34870025515556335, + "logps/rejected": -0.339597225189209, + "loss": 1.5169, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8717506527900696, + "rewards/margins": -0.022757653146982193, + "rewards/rejected": -0.8489930629730225, + "step": 82 + }, + { + "epoch": 0.17739780924392198, + "grad_norm": 3.818091869354248, + "learning_rate": 9.819814303479267e-07, + "logits/chosen": -0.8575353026390076, + "logits/rejected": -0.873344898223877, + "logps/chosen": -0.28072673082351685, + "logps/rejected": -0.28653833270072937, + "loss": 1.6003, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7018167972564697, + "rewards/margins": 0.014529004693031311, + "rewards/rejected": -0.7163459062576294, + "step": 83 + }, + { + "epoch": 0.1795351322468608, + "grad_norm": 8.025263786315918, + "learning_rate": 9.80972980749353e-07, + "logits/chosen": -0.886692225933075, + "logits/rejected": -0.9297844171524048, + "logps/chosen": -0.3129986524581909, + "logps/rejected": -0.30843472480773926, + "loss": 1.6748, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7824965715408325, + "rewards/margins": -0.011409677565097809, + "rewards/rejected": -0.7710868716239929, + "step": 84 + }, + { + "epoch": 0.18167245524979964, + "grad_norm": 3.9557864665985107, + "learning_rate": 9.799376207714444e-07, + "logits/chosen": -0.9471138715744019, + "logits/rejected": -0.9405109882354736, + "logps/chosen": -0.2820201814174652, + "logps/rejected": -0.3643006980419159, + "loss": 1.5473, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7050503492355347, + "rewards/margins": 0.20570139586925507, + "rewards/rejected": -0.9107517600059509, + "step": 85 + }, + { + "epoch": 0.18380977825273845, + "grad_norm": 3.63122820854187, + "learning_rate": 9.788754083424652e-07, + "logits/chosen": -0.7530944347381592, + "logits/rejected": -0.7066674828529358, + "logps/chosen": -0.29479140043258667, + "logps/rejected": -0.35999545454978943, + "loss": 1.5651, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7369785904884338, + "rewards/margins": 0.16301003098487854, + "rewards/rejected": -0.8999886512756348, + "step": 86 + }, + { + "epoch": 0.18594710125567726, + "grad_norm": 5.069854259490967, + "learning_rate": 9.777864028930705e-07, + "logits/chosen": -0.9427808523178101, + "logits/rejected": -0.8523333668708801, + "logps/chosen": -0.3117142617702484, + "logps/rejected": -0.35370469093322754, + "loss": 1.5806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7792856097221375, + "rewards/margins": 0.10497600585222244, + "rewards/rejected": -0.8842616677284241, + "step": 87 + }, + { + "epoch": 0.18808442425861607, + "grad_norm": 9.928412437438965, + "learning_rate": 9.766706653529812e-07, + "logits/chosen": -0.9436711072921753, + "logits/rejected": -0.8924795985221863, + "logps/chosen": -0.29760316014289856, + "logps/rejected": -0.6252572536468506, + "loss": 1.5214, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7440078854560852, + "rewards/margins": 0.8191351890563965, + "rewards/rejected": -1.5631431341171265, + "step": 88 + }, + { + "epoch": 0.1902217472615549, + "grad_norm": 5.750680446624756, + "learning_rate": 9.755282581475767e-07, + "logits/chosen": -1.0517209768295288, + "logits/rejected": -1.055407166481018, + "logps/chosen": -0.2816842794418335, + "logps/rejected": -0.3634580373764038, + "loss": 1.6133, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7042107582092285, + "rewards/margins": 0.20443443953990936, + "rewards/rejected": -0.9086451530456543, + "step": 89 + }, + { + "epoch": 0.19235907026449373, + "grad_norm": 4.923956871032715, + "learning_rate": 9.743592451943998e-07, + "logits/chosen": -1.0395478010177612, + "logits/rejected": -0.9153550863265991, + "logps/chosen": -0.28231117129325867, + "logps/rejected": -0.35972079634666443, + "loss": 1.5655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7057778239250183, + "rewards/margins": 0.19352424144744873, + "rewards/rejected": -0.8993021249771118, + "step": 90 + }, + { + "epoch": 0.19449639326743254, + "grad_norm": 6.7967634201049805, + "learning_rate": 9.73163691899582e-07, + "logits/chosen": -1.0115331411361694, + "logits/rejected": -1.0663210153579712, + "logps/chosen": -0.2789320647716522, + "logps/rejected": -0.3152926564216614, + "loss": 1.5292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.697330117225647, + "rewards/margins": 0.09090138971805573, + "rewards/rejected": -0.788231611251831, + "step": 91 + }, + { + "epoch": 0.19663371627037135, + "grad_norm": 5.242959022521973, + "learning_rate": 9.719416651541837e-07, + "logits/chosen": -1.1914602518081665, + "logits/rejected": -1.237089991569519, + "logps/chosen": -0.3149641156196594, + "logps/rejected": -0.4288913309574127, + "loss": 1.5711, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.787410318851471, + "rewards/margins": 0.28481805324554443, + "rewards/rejected": -1.0722283124923706, + "step": 92 + }, + { + "epoch": 0.1987710392733102, + "grad_norm": 7.285670280456543, + "learning_rate": 9.706932333304517e-07, + "logits/chosen": -1.0469316244125366, + "logits/rejected": -1.016736388206482, + "logps/chosen": -0.32207292318344116, + "logps/rejected": -0.33990058302879333, + "loss": 1.5721, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8051823377609253, + "rewards/margins": 0.04456908628344536, + "rewards/rejected": -0.8497514128684998, + "step": 93 + }, + { + "epoch": 0.200908362276249, + "grad_norm": 5.663970947265625, + "learning_rate": 9.694184662779929e-07, + "logits/chosen": -0.83307284116745, + "logits/rejected": -0.8582972288131714, + "logps/chosen": -0.29542282223701477, + "logps/rejected": -0.32081592082977295, + "loss": 1.5804, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7385570406913757, + "rewards/margins": 0.06348275393247604, + "rewards/rejected": -0.8020397424697876, + "step": 94 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 5.039822578430176, + "learning_rate": 9.681174353198686e-07, + "logits/chosen": -1.19273042678833, + "logits/rejected": -1.1986303329467773, + "logps/chosen": -0.4751918315887451, + "logps/rejected": -0.42982685565948486, + "loss": 1.5248, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1879795789718628, + "rewards/margins": -0.11341254413127899, + "rewards/rejected": -1.0745670795440674, + "step": 95 + }, + { + "epoch": 0.20518300828212663, + "grad_norm": 2.779069423675537, + "learning_rate": 9.667902132486008e-07, + "logits/chosen": -1.0337846279144287, + "logits/rejected": -0.978040337562561, + "logps/chosen": -0.3200725317001343, + "logps/rejected": -0.3141818940639496, + "loss": 1.5839, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8001812696456909, + "rewards/margins": -0.014726534485816956, + "rewards/rejected": -0.7854547500610352, + "step": 96 + }, + { + "epoch": 0.20732033128506547, + "grad_norm": 15.442702293395996, + "learning_rate": 9.65436874322102e-07, + "logits/chosen": -1.0476655960083008, + "logits/rejected": -1.1004842519760132, + "logps/chosen": -0.3345443606376648, + "logps/rejected": -0.34287557005882263, + "loss": 1.6456, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8363610506057739, + "rewards/margins": 0.020828042179346085, + "rewards/rejected": -0.8571889400482178, + "step": 97 + }, + { + "epoch": 0.20945765428800428, + "grad_norm": 3.940214157104492, + "learning_rate": 9.640574942595194e-07, + "logits/chosen": -1.1419146060943604, + "logits/rejected": -1.018257737159729, + "logps/chosen": -0.3185333013534546, + "logps/rejected": -0.32143470644950867, + "loss": 1.5681, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7963333129882812, + "rewards/margins": 0.007253464311361313, + "rewards/rejected": -0.8035867214202881, + "step": 98 + }, + { + "epoch": 0.2115949772909431, + "grad_norm": 10.583402633666992, + "learning_rate": 9.626521502369983e-07, + "logits/chosen": -1.102158784866333, + "logits/rejected": -0.9770756959915161, + "logps/chosen": -0.3017268180847168, + "logps/rejected": -0.3784370422363281, + "loss": 1.5746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.754317045211792, + "rewards/margins": 0.19177556037902832, + "rewards/rejected": -0.9460926055908203, + "step": 99 + }, + { + "epoch": 0.2137323002938819, + "grad_norm": 4.430131435394287, + "learning_rate": 9.612209208833646e-07, + "logits/chosen": -1.2631207704544067, + "logits/rejected": -1.2441554069519043, + "logps/chosen": -0.4847136437892914, + "logps/rejected": -0.38210996985435486, + "loss": 1.6529, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2117840051651, + "rewards/margins": -0.25650906562805176, + "rewards/rejected": -0.9552749395370483, + "step": 100 + }, + { + "epoch": 0.21586962329682075, + "grad_norm": 15.133020401000977, + "learning_rate": 9.597638862757253e-07, + "logits/chosen": -1.0296027660369873, + "logits/rejected": -1.0020573139190674, + "logps/chosen": -0.32216939330101013, + "logps/rejected": -0.4231048822402954, + "loss": 1.5492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8054234981536865, + "rewards/margins": 0.2523386776447296, + "rewards/rejected": -1.0577621459960938, + "step": 101 + }, + { + "epoch": 0.21800694629975956, + "grad_norm": 6.248828887939453, + "learning_rate": 9.58281127934988e-07, + "logits/chosen": -1.0385347604751587, + "logits/rejected": -1.007280945777893, + "logps/chosen": -0.3016239106655121, + "logps/rejected": -0.3423752188682556, + "loss": 1.553, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7540597319602966, + "rewards/margins": 0.10187835991382599, + "rewards/rejected": -0.8559381365776062, + "step": 102 + }, + { + "epoch": 0.22014426930269837, + "grad_norm": 9.791584968566895, + "learning_rate": 9.567727288213004e-07, + "logits/chosen": -1.0230026245117188, + "logits/rejected": -0.9750150442123413, + "logps/chosen": -0.27773773670196533, + "logps/rejected": -0.32571396231651306, + "loss": 1.5772, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6943443417549133, + "rewards/margins": 0.11994057148694992, + "rewards/rejected": -0.8142849206924438, + "step": 103 + }, + { + "epoch": 0.22228159230563718, + "grad_norm": 4.045163631439209, + "learning_rate": 9.552387733294078e-07, + "logits/chosen": -1.1475125551223755, + "logits/rejected": -0.9796882271766663, + "logps/chosen": -0.309417724609375, + "logps/rejected": -0.3697960674762726, + "loss": 1.5822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7735442519187927, + "rewards/margins": 0.15094594657421112, + "rewards/rejected": -0.924490213394165, + "step": 104 + }, + { + "epoch": 0.224418915308576, + "grad_norm": 5.604581832885742, + "learning_rate": 9.536793472839324e-07, + "logits/chosen": -1.047705888748169, + "logits/rejected": -1.0832607746124268, + "logps/chosen": -0.32854345440864563, + "logps/rejected": -0.43526744842529297, + "loss": 1.6265, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8213586211204529, + "rewards/margins": 0.26680994033813477, + "rewards/rejected": -1.0881686210632324, + "step": 105 + }, + { + "epoch": 0.22655623831151483, + "grad_norm": 3.9951133728027344, + "learning_rate": 9.520945379345699e-07, + "logits/chosen": -0.9593117237091064, + "logits/rejected": -0.8573833107948303, + "logps/chosen": -0.4196315109729767, + "logps/rejected": -0.35942524671554565, + "loss": 1.5512, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0490788221359253, + "rewards/margins": -0.15051564574241638, + "rewards/rejected": -0.8985630869865417, + "step": 106 + }, + { + "epoch": 0.22869356131445365, + "grad_norm": 8.718377113342285, + "learning_rate": 9.504844339512094e-07, + "logits/chosen": -0.8320434093475342, + "logits/rejected": -0.7738104462623596, + "logps/chosen": -0.31670740246772766, + "logps/rejected": -0.32797324657440186, + "loss": 1.5849, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7917685508728027, + "rewards/margins": 0.028164558112621307, + "rewards/rejected": -0.8199330568313599, + "step": 107 + }, + { + "epoch": 0.23083088431739246, + "grad_norm": 3.695155382156372, + "learning_rate": 9.488491254189716e-07, + "logits/chosen": -0.8110507130622864, + "logits/rejected": -0.7713247537612915, + "logps/chosen": -0.35122472047805786, + "logps/rejected": -0.43228679895401, + "loss": 1.6413, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8780617713928223, + "rewards/margins": 0.20265522599220276, + "rewards/rejected": -1.0807169675827026, + "step": 108 + }, + { + "epoch": 0.23296820732033127, + "grad_norm": 6.846314430236816, + "learning_rate": 9.471887038331684e-07, + "logits/chosen": -0.9511977434158325, + "logits/rejected": -1.0073680877685547, + "logps/chosen": -0.3145419657230377, + "logps/rejected": -0.34916961193084717, + "loss": 1.5855, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7863549590110779, + "rewards/margins": 0.08656906336545944, + "rewards/rejected": -0.8729239702224731, + "step": 109 + }, + { + "epoch": 0.2351055303232701, + "grad_norm": 6.659543037414551, + "learning_rate": 9.455032620941839e-07, + "logits/chosen": -1.2216616868972778, + "logits/rejected": -1.1229971647262573, + "logps/chosen": -0.35468706488609314, + "logps/rejected": -0.5576741695404053, + "loss": 1.5417, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8867175579071045, + "rewards/margins": 0.5074679851531982, + "rewards/rejected": -1.3941854238510132, + "step": 110 + }, + { + "epoch": 0.23724285332620892, + "grad_norm": 3.1694774627685547, + "learning_rate": 9.43792894502277e-07, + "logits/chosen": -0.8341147303581238, + "logits/rejected": -0.9695035219192505, + "logps/chosen": -0.32898104190826416, + "logps/rejected": -0.36305293440818787, + "loss": 1.5743, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8224526643753052, + "rewards/margins": 0.08517970144748688, + "rewards/rejected": -0.9076323509216309, + "step": 111 + }, + { + "epoch": 0.23938017632914774, + "grad_norm": 3.8049800395965576, + "learning_rate": 9.420576967523048e-07, + "logits/chosen": -0.8800846338272095, + "logits/rejected": -0.902974009513855, + "logps/chosen": -0.30384454131126404, + "logps/rejected": -0.33347606658935547, + "loss": 1.5319, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7596113681793213, + "rewards/margins": 0.07407879829406738, + "rewards/rejected": -0.8336902260780334, + "step": 112 + }, + { + "epoch": 0.24151749933208655, + "grad_norm": 6.169259548187256, + "learning_rate": 9.402977659283689e-07, + "logits/chosen": -0.9351974725723267, + "logits/rejected": -0.8290128707885742, + "logps/chosen": -0.33734872937202454, + "logps/rejected": -0.4412783086299896, + "loss": 1.4782, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8433719277381897, + "rewards/margins": 0.2598239779472351, + "rewards/rejected": -1.1031959056854248, + "step": 113 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 2.9584643840789795, + "learning_rate": 9.385132004983832e-07, + "logits/chosen": -1.0792999267578125, + "logits/rejected": -1.1223125457763672, + "logps/chosen": -0.36617720127105713, + "logps/rejected": -0.4576934576034546, + "loss": 1.5475, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9154431223869324, + "rewards/margins": 0.22879061102867126, + "rewards/rejected": -1.1442335844039917, + "step": 114 + }, + { + "epoch": 0.2457921453379642, + "grad_norm": 9.862983703613281, + "learning_rate": 9.367041003085648e-07, + "logits/chosen": -1.1801700592041016, + "logits/rejected": -1.075859785079956, + "logps/chosen": -0.3576958477497101, + "logps/rejected": -0.2694101333618164, + "loss": 1.6048, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.894239604473114, + "rewards/margins": -0.22071433067321777, + "rewards/rejected": -0.6735252737998962, + "step": 115 + }, + { + "epoch": 0.24792946834090301, + "grad_norm": 4.047823905944824, + "learning_rate": 9.348705665778477e-07, + "logits/chosen": -1.1844444274902344, + "logits/rejected": -1.1176280975341797, + "logps/chosen": -0.3374484181404114, + "logps/rejected": -0.3525134325027466, + "loss": 1.5249, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8436209559440613, + "rewards/margins": 0.03766253590583801, + "rewards/rejected": -0.8812835812568665, + "step": 116 + }, + { + "epoch": 0.25006679134384185, + "grad_norm": 7.761148929595947, + "learning_rate": 9.330127018922193e-07, + "logits/chosen": -1.1179434061050415, + "logits/rejected": -1.0968945026397705, + "logps/chosen": -0.38077402114868164, + "logps/rejected": -0.42625805735588074, + "loss": 1.5768, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9519350528717041, + "rewards/margins": 0.11371012032032013, + "rewards/rejected": -1.0656450986862183, + "step": 117 + }, + { + "epoch": 0.25220411434678064, + "grad_norm": 2.0378005504608154, + "learning_rate": 9.311306101989812e-07, + "logits/chosen": -0.970659613609314, + "logits/rejected": -1.0176433324813843, + "logps/chosen": -0.3453654944896698, + "logps/rejected": -0.32289958000183105, + "loss": 1.5867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8634137511253357, + "rewards/margins": -0.056164830923080444, + "rewards/rejected": -0.8072489500045776, + "step": 118 + }, + { + "epoch": 0.2543414373497195, + "grad_norm": 7.108942031860352, + "learning_rate": 9.29224396800933e-07, + "logits/chosen": -1.1055564880371094, + "logits/rejected": -1.001468539237976, + "logps/chosen": -0.33387163281440735, + "logps/rejected": -0.3175872564315796, + "loss": 1.643, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.834679126739502, + "rewards/margins": -0.04071101173758507, + "rewards/rejected": -0.7939680814743042, + "step": 119 + }, + { + "epoch": 0.2564787603526583, + "grad_norm": 5.642314434051514, + "learning_rate": 9.272941683504808e-07, + "logits/chosen": -0.9950228333473206, + "logits/rejected": -0.9583395719528198, + "logps/chosen": -0.2971169650554657, + "logps/rejected": -0.4023984670639038, + "loss": 1.5387, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7427924275398254, + "rewards/margins": 0.2632036805152893, + "rewards/rejected": -1.0059961080551147, + "step": 120 + }, + { + "epoch": 0.2586160833555971, + "grad_norm": 6.028919219970703, + "learning_rate": 9.253400328436698e-07, + "logits/chosen": -1.2061349153518677, + "logits/rejected": -1.1848161220550537, + "logps/chosen": -0.5155015587806702, + "logps/rejected": -0.6664289236068726, + "loss": 1.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2887539863586426, + "rewards/margins": 0.3773185610771179, + "rewards/rejected": -1.6660724878311157, + "step": 121 + }, + { + "epoch": 0.26075340635853594, + "grad_norm": 8.189311027526855, + "learning_rate": 9.233620996141421e-07, + "logits/chosen": -1.0705747604370117, + "logits/rejected": -1.0496965646743774, + "logps/chosen": -0.4411606192588806, + "logps/rejected": -0.4381504952907562, + "loss": 1.7164, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.102901577949524, + "rewards/margins": -0.007525326684117317, + "rewards/rejected": -1.0953762531280518, + "step": 122 + }, + { + "epoch": 0.26289072936147473, + "grad_norm": 8.586820602416992, + "learning_rate": 9.213604793270196e-07, + "logits/chosen": -1.1702253818511963, + "logits/rejected": -1.166388750076294, + "logps/chosen": -0.3666711151599884, + "logps/rejected": -0.3490923047065735, + "loss": 1.6561, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9166778326034546, + "rewards/margins": -0.04394708573818207, + "rewards/rejected": -0.8727307319641113, + "step": 123 + }, + { + "epoch": 0.26502805236441357, + "grad_norm": 6.50911808013916, + "learning_rate": 9.19335283972712e-07, + "logits/chosen": -1.0470083951950073, + "logits/rejected": -1.0692466497421265, + "logps/chosen": -0.4158693253993988, + "logps/rejected": -0.5129181742668152, + "loss": 1.6143, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0396732091903687, + "rewards/margins": 0.24262209236621857, + "rewards/rejected": -1.2822954654693604, + "step": 124 + }, + { + "epoch": 0.2671653753673524, + "grad_norm": 10.414886474609375, + "learning_rate": 9.172866268606513e-07, + "logits/chosen": -1.0535264015197754, + "logits/rejected": -1.0312259197235107, + "logps/chosen": -0.8472201824188232, + "logps/rejected": -0.8241742849349976, + "loss": 1.6177, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1180503368377686, + "rewards/margins": -0.05761456489562988, + "rewards/rejected": -2.0604357719421387, + "step": 125 + }, + { + "epoch": 0.2693026983702912, + "grad_norm": 4.565366744995117, + "learning_rate": 9.152146226129518e-07, + "logits/chosen": -1.0052158832550049, + "logits/rejected": -1.1335080862045288, + "logps/chosen": -0.36246898770332336, + "logps/rejected": -0.5093556046485901, + "loss": 1.5169, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9061724543571472, + "rewards/margins": 0.36721646785736084, + "rewards/rejected": -1.2733888626098633, + "step": 126 + }, + { + "epoch": 0.27144002137323003, + "grad_norm": 6.2105021476745605, + "learning_rate": 9.131193871579974e-07, + "logits/chosen": -1.1646008491516113, + "logits/rejected": -1.1527330875396729, + "logps/chosen": -0.40890955924987793, + "logps/rejected": -0.5555287599563599, + "loss": 1.5485, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0222738981246948, + "rewards/margins": 0.36654818058013916, + "rewards/rejected": -1.388822078704834, + "step": 127 + }, + { + "epoch": 0.2735773443761689, + "grad_norm": 11.547090530395508, + "learning_rate": 9.11001037723955e-07, + "logits/chosen": -0.9489108324050903, + "logits/rejected": -0.9458054304122925, + "logps/chosen": -0.3760164678096771, + "logps/rejected": -0.4494747519493103, + "loss": 1.5733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.940041184425354, + "rewards/margins": 0.1836456060409546, + "rewards/rejected": -1.1236867904663086, + "step": 128 + }, + { + "epoch": 0.27571466737910766, + "grad_norm": 5.789463043212891, + "learning_rate": 9.088596928322157e-07, + "logits/chosen": -1.2436193227767944, + "logits/rejected": -1.1626383066177368, + "logps/chosen": -0.33822306990623474, + "logps/rejected": -0.3627959191799164, + "loss": 1.5896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8455577492713928, + "rewards/margins": 0.06143195554614067, + "rewards/rejected": -0.9069896936416626, + "step": 129 + }, + { + "epoch": 0.2778519903820465, + "grad_norm": 4.706400394439697, + "learning_rate": 9.066954722907638e-07, + "logits/chosen": -1.1288872957229614, + "logits/rejected": -1.2139040231704712, + "logps/chosen": -0.4233206808567047, + "logps/rejected": -0.49574923515319824, + "loss": 1.5678, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0583016872406006, + "rewards/margins": 0.18107128143310547, + "rewards/rejected": -1.2393728494644165, + "step": 130 + }, + { + "epoch": 0.2799893133849853, + "grad_norm": 3.986043930053711, + "learning_rate": 9.045084971874737e-07, + "logits/chosen": -1.0815867185592651, + "logits/rejected": -1.079985499382019, + "logps/chosen": -0.45107316970825195, + "logps/rejected": -0.4917645752429962, + "loss": 1.5342, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1276829242706299, + "rewards/margins": 0.10172851383686066, + "rewards/rejected": -1.2294114828109741, + "step": 131 + }, + { + "epoch": 0.2821266363879241, + "grad_norm": 9.682952880859375, + "learning_rate": 9.022988898833342e-07, + "logits/chosen": -0.9553432464599609, + "logits/rejected": -0.882793664932251, + "logps/chosen": -0.29320845007896423, + "logps/rejected": -0.3070768415927887, + "loss": 1.6546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7330211400985718, + "rewards/margins": 0.03467091917991638, + "rewards/rejected": -0.7676920890808105, + "step": 132 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 3.61600923538208, + "learning_rate": 9.000667740056032e-07, + "logits/chosen": -1.1807315349578857, + "logits/rejected": -1.174274206161499, + "logps/chosen": -0.28325143456459045, + "logps/rejected": -0.3123588562011719, + "loss": 1.5242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7081285715103149, + "rewards/margins": 0.07276856154203415, + "rewards/rejected": -0.7808971405029297, + "step": 133 + }, + { + "epoch": 0.28640128239380175, + "grad_norm": 3.5451016426086426, + "learning_rate": 8.978122744408905e-07, + "logits/chosen": -0.9926080703735352, + "logits/rejected": -0.9227127432823181, + "logps/chosen": -0.36737683415412903, + "logps/rejected": -0.48489928245544434, + "loss": 1.5447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9184421300888062, + "rewards/margins": 0.29380616545677185, + "rewards/rejected": -1.2122482061386108, + "step": 134 + }, + { + "epoch": 0.2885386053967406, + "grad_norm": 7.112710952758789, + "learning_rate": 8.955355173281707e-07, + "logits/chosen": -1.156009554862976, + "logits/rejected": -1.0397788286209106, + "logps/chosen": -0.42765456438064575, + "logps/rejected": -0.33751052618026733, + "loss": 1.494, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.069136381149292, + "rewards/margins": -0.22536009550094604, + "rewards/rejected": -0.8437763452529907, + "step": 135 + }, + { + "epoch": 0.29067592839967943, + "grad_norm": 3.9115548133850098, + "learning_rate": 8.932366300517249e-07, + "logits/chosen": -1.0605583190917969, + "logits/rejected": -1.0287466049194336, + "logps/chosen": -0.429559588432312, + "logps/rejected": -0.5004155039787292, + "loss": 1.51, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0738990306854248, + "rewards/margins": 0.17713983356952667, + "rewards/rejected": -1.251038908958435, + "step": 136 + }, + { + "epoch": 0.2928132514026182, + "grad_norm": 5.627295017242432, + "learning_rate": 8.909157412340149e-07, + "logits/chosen": -1.058396816253662, + "logits/rejected": -1.0246150493621826, + "logps/chosen": -0.36611852049827576, + "logps/rejected": -0.3920373022556305, + "loss": 1.5874, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9152963161468506, + "rewards/margins": 0.06479701399803162, + "rewards/rejected": -0.9800933003425598, + "step": 137 + }, + { + "epoch": 0.29495057440555705, + "grad_norm": 10.381692886352539, + "learning_rate": 8.885729807284854e-07, + "logits/chosen": -0.9971895217895508, + "logits/rejected": -1.0381770133972168, + "logps/chosen": -0.3338506519794464, + "logps/rejected": -0.3878532648086548, + "loss": 1.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8346267342567444, + "rewards/margins": 0.13500656187534332, + "rewards/rejected": -0.9696332216262817, + "step": 138 + }, + { + "epoch": 0.29708789740849584, + "grad_norm": 3.448967933654785, + "learning_rate": 8.862084796122997e-07, + "logits/chosen": -1.0352673530578613, + "logits/rejected": -0.9383904933929443, + "logps/chosen": -0.3496807813644409, + "logps/rejected": -0.3880283236503601, + "loss": 1.533, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8742019534111023, + "rewards/margins": 0.09586883336305618, + "rewards/rejected": -0.9700708389282227, + "step": 139 + }, + { + "epoch": 0.2992252204114347, + "grad_norm": 7.80885648727417, + "learning_rate": 8.838223701790055e-07, + "logits/chosen": -1.027004599571228, + "logits/rejected": -1.0879004001617432, + "logps/chosen": -0.3734505772590637, + "logps/rejected": -0.4567010700702667, + "loss": 1.5578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9336264729499817, + "rewards/margins": 0.2081262618303299, + "rewards/rejected": -1.1417527198791504, + "step": 140 + }, + { + "epoch": 0.3013625434143735, + "grad_norm": 4.153167247772217, + "learning_rate": 8.814147859311332e-07, + "logits/chosen": -0.9403591156005859, + "logits/rejected": -0.9337973594665527, + "logps/chosen": -0.473748117685318, + "logps/rejected": -0.4763396084308624, + "loss": 1.5861, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1843702793121338, + "rewards/margins": 0.006478846073150635, + "rewards/rejected": -1.1908490657806396, + "step": 141 + }, + { + "epoch": 0.3034998664173123, + "grad_norm": 6.681884288787842, + "learning_rate": 8.789858615727264e-07, + "logits/chosen": -1.2222094535827637, + "logits/rejected": -1.1642351150512695, + "logps/chosen": -0.37209561467170715, + "logps/rejected": -0.3360154330730438, + "loss": 1.5777, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9302390217781067, + "rewards/margins": -0.09020054340362549, + "rewards/rejected": -0.840038537979126, + "step": 142 + }, + { + "epoch": 0.30563718942025114, + "grad_norm": 4.460201740264893, + "learning_rate": 8.765357330018055e-07, + "logits/chosen": -1.1878383159637451, + "logits/rejected": -1.1819963455200195, + "logps/chosen": -0.39119935035705566, + "logps/rejected": -0.5452548265457153, + "loss": 1.5387, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9779983758926392, + "rewards/margins": 0.3851388394832611, + "rewards/rejected": -1.3631370067596436, + "step": 143 + }, + { + "epoch": 0.3077745124231899, + "grad_norm": 9.836545944213867, + "learning_rate": 8.740645373027634e-07, + "logits/chosen": -0.8851446509361267, + "logits/rejected": -0.8251763582229614, + "logps/chosen": -0.29692715406417847, + "logps/rejected": -0.3346107602119446, + "loss": 1.5638, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.742317795753479, + "rewards/margins": 0.09420904517173767, + "rewards/rejected": -0.8365269303321838, + "step": 144 + }, + { + "epoch": 0.30991183542612877, + "grad_norm": 6.311913013458252, + "learning_rate": 8.71572412738697e-07, + "logits/chosen": -1.0461251735687256, + "logits/rejected": -1.1545249223709106, + "logps/chosen": -0.3259159028530121, + "logps/rejected": -0.40478914976119995, + "loss": 1.5561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8147897720336914, + "rewards/margins": 0.19718317687511444, + "rewards/rejected": -1.0119729042053223, + "step": 145 + }, + { + "epoch": 0.3120491584290676, + "grad_norm": 3.136129379272461, + "learning_rate": 8.690594987436704e-07, + "logits/chosen": -0.9903483390808105, + "logits/rejected": -0.9337629675865173, + "logps/chosen": -0.6526888012886047, + "logps/rejected": -0.8471518754959106, + "loss": 1.5544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6317218542099, + "rewards/margins": 0.4861578047275543, + "rewards/rejected": -2.117879629135132, + "step": 146 + }, + { + "epoch": 0.3141864814320064, + "grad_norm": 4.960554599761963, + "learning_rate": 8.66525935914913e-07, + "logits/chosen": -0.7758511900901794, + "logits/rejected": -0.8652888536453247, + "logps/chosen": -0.3104845881462097, + "logps/rejected": -0.3932048976421356, + "loss": 1.5272, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7762114405632019, + "rewards/margins": 0.20680075883865356, + "rewards/rejected": -0.9830121994018555, + "step": 147 + }, + { + "epoch": 0.31632380443494523, + "grad_norm": 6.391895771026611, + "learning_rate": 8.639718660049554e-07, + "logits/chosen": -0.988434374332428, + "logits/rejected": -0.9107407331466675, + "logps/chosen": -0.40984153747558594, + "logps/rejected": -0.46740108728408813, + "loss": 1.5732, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0246037244796753, + "rewards/margins": 0.14389894902706146, + "rewards/rejected": -1.168502688407898, + "step": 148 + }, + { + "epoch": 0.3184611274378841, + "grad_norm": 5.031012535095215, + "learning_rate": 8.613974319136957e-07, + "logits/chosen": -1.0476540327072144, + "logits/rejected": -1.2046645879745483, + "logps/chosen": -0.4192398190498352, + "logps/rejected": -0.551673173904419, + "loss": 1.5839, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0480995178222656, + "rewards/margins": 0.33108341693878174, + "rewards/rejected": -1.379183053970337, + "step": 149 + }, + { + "epoch": 0.32059845044082286, + "grad_norm": 4.456223011016846, + "learning_rate": 8.588027776804058e-07, + "logits/chosen": -1.0767971277236938, + "logits/rejected": -0.9776477217674255, + "logps/chosen": -0.3873605728149414, + "logps/rejected": -0.41790682077407837, + "loss": 1.5818, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9684014320373535, + "rewards/margins": 0.07636569440364838, + "rewards/rejected": -1.044767141342163, + "step": 150 + }, + { + "epoch": 0.3227357734437617, + "grad_norm": 4.048471927642822, + "learning_rate": 8.561880484756724e-07, + "logits/chosen": -1.0004703998565674, + "logits/rejected": -1.0351473093032837, + "logps/chosen": -0.3233994245529175, + "logps/rejected": -0.6644845008850098, + "loss": 1.5014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8084985017776489, + "rewards/margins": 0.8527127504348755, + "rewards/rejected": -1.6612112522125244, + "step": 151 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 4.682628154754639, + "learning_rate": 8.535533905932737e-07, + "logits/chosen": -1.0235004425048828, + "logits/rejected": -0.9808617234230042, + "logps/chosen": -0.34479135274887085, + "logps/rejected": -0.3569888770580292, + "loss": 1.5905, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8619784712791443, + "rewards/margins": 0.03049362078309059, + "rewards/rejected": -0.8924720287322998, + "step": 152 + }, + { + "epoch": 0.3270104194496393, + "grad_norm": 3.4882867336273193, + "learning_rate": 8.508989514419958e-07, + "logits/chosen": -1.0409698486328125, + "logits/rejected": -0.8566800951957703, + "logps/chosen": -0.3815579414367676, + "logps/rejected": -0.5463601350784302, + "loss": 1.5394, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.953894853591919, + "rewards/margins": 0.4120054841041565, + "rewards/rejected": -1.3659002780914307, + "step": 153 + }, + { + "epoch": 0.32914774245257816, + "grad_norm": 7.419760227203369, + "learning_rate": 8.482248795373835e-07, + "logits/chosen": -1.1312333345413208, + "logits/rejected": -1.1359437704086304, + "logps/chosen": -0.4473015069961548, + "logps/rejected": -0.4743323028087616, + "loss": 1.563, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1182537078857422, + "rewards/margins": 0.06757716834545135, + "rewards/rejected": -1.1858309507369995, + "step": 154 + }, + { + "epoch": 0.33128506545551695, + "grad_norm": 5.330379962921143, + "learning_rate": 8.455313244934324e-07, + "logits/chosen": -1.0080257654190063, + "logits/rejected": -0.9858949780464172, + "logps/chosen": -0.39618024230003357, + "logps/rejected": -0.39888572692871094, + "loss": 1.5041, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9904506206512451, + "rewards/margins": 0.006763685494661331, + "rewards/rejected": -0.9972144365310669, + "step": 155 + }, + { + "epoch": 0.3334223884584558, + "grad_norm": 7.0863518714904785, + "learning_rate": 8.428184370142171e-07, + "logits/chosen": -1.1136622428894043, + "logits/rejected": -1.1101243495941162, + "logps/chosen": -0.33028310537338257, + "logps/rejected": -0.3945181369781494, + "loss": 1.4914, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8257076740264893, + "rewards/margins": 0.16058766841888428, + "rewards/rejected": -0.9862953424453735, + "step": 156 + }, + { + "epoch": 0.3355597114613946, + "grad_norm": 23.59922218322754, + "learning_rate": 8.400863688854596e-07, + "logits/chosen": -1.038999319076538, + "logits/rejected": -1.1245837211608887, + "logps/chosen": -0.4650399386882782, + "logps/rejected": -0.651479959487915, + "loss": 1.6086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1625999212265015, + "rewards/margins": 0.46609991788864136, + "rewards/rejected": -1.628699779510498, + "step": 157 + }, + { + "epoch": 0.3376970344643334, + "grad_norm": 7.815871715545654, + "learning_rate": 8.373352729660372e-07, + "logits/chosen": -0.8359465599060059, + "logits/rejected": -0.9364669919013977, + "logps/chosen": -0.3437475562095642, + "logps/rejected": -0.42519667744636536, + "loss": 1.6168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8593689203262329, + "rewards/margins": 0.2036227285861969, + "rewards/rejected": -1.0629916191101074, + "step": 158 + }, + { + "epoch": 0.33983435746727225, + "grad_norm": 5.957912445068359, + "learning_rate": 8.34565303179429e-07, + "logits/chosen": -0.9782019257545471, + "logits/rejected": -0.9117124080657959, + "logps/chosen": -0.35762521624565125, + "logps/rejected": -0.3135136663913727, + "loss": 1.6004, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8940630555152893, + "rewards/margins": -0.11027887463569641, + "rewards/rejected": -0.7837841510772705, + "step": 159 + }, + { + "epoch": 0.34197168047021104, + "grad_norm": 5.102325916290283, + "learning_rate": 8.317766145051057e-07, + "logits/chosen": -1.137721061706543, + "logits/rejected": -0.9964947700500488, + "logps/chosen": -0.28172147274017334, + "logps/rejected": -0.2709539830684662, + "loss": 1.5036, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7043037414550781, + "rewards/margins": -0.02691873162984848, + "rewards/rejected": -0.6773849725723267, + "step": 160 + }, + { + "epoch": 0.3441090034731499, + "grad_norm": 12.086938858032227, + "learning_rate": 8.289693629698563e-07, + "logits/chosen": -1.0542073249816895, + "logits/rejected": -1.0224194526672363, + "logps/chosen": -0.49710360169410706, + "logps/rejected": -0.5311066508293152, + "loss": 1.4818, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2427589893341064, + "rewards/margins": 0.08500773459672928, + "rewards/rejected": -1.3277666568756104, + "step": 161 + }, + { + "epoch": 0.3462463264760887, + "grad_norm": 4.8447089195251465, + "learning_rate": 8.261437056390606e-07, + "logits/chosen": -0.9805575609207153, + "logits/rejected": -0.940955638885498, + "logps/chosen": -0.3420554995536804, + "logps/rejected": -0.32786181569099426, + "loss": 1.5357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8551387190818787, + "rewards/margins": -0.03548429161310196, + "rewards/rejected": -0.8196544647216797, + "step": 162 + }, + { + "epoch": 0.3483836494790275, + "grad_norm": 7.347986221313477, + "learning_rate": 8.232998006078997e-07, + "logits/chosen": -1.0599087476730347, + "logits/rejected": -1.0014201402664185, + "logps/chosen": -0.36005347967147827, + "logps/rejected": -0.3714003264904022, + "loss": 1.5657, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9001337289810181, + "rewards/margins": 0.028367016464471817, + "rewards/rejected": -0.928500771522522, + "step": 163 + }, + { + "epoch": 0.35052097248196634, + "grad_norm": 5.851932048797607, + "learning_rate": 8.20437806992512e-07, + "logits/chosen": -1.1148821115493774, + "logits/rejected": -1.01279616355896, + "logps/chosen": -0.4296346604824066, + "logps/rejected": -0.4424145221710205, + "loss": 1.5853, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0740867853164673, + "rewards/margins": 0.03194954991340637, + "rewards/rejected": -1.1060361862182617, + "step": 164 + }, + { + "epoch": 0.3526582954849052, + "grad_norm": 4.438355922698975, + "learning_rate": 8.175578849210894e-07, + "logits/chosen": -1.0889073610305786, + "logits/rejected": -0.9964409470558167, + "logps/chosen": -0.4054264426231384, + "logps/rejected": -0.6688516736030579, + "loss": 1.5004, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0135661363601685, + "rewards/margins": 0.6585631370544434, + "rewards/rejected": -1.6721292734146118, + "step": 165 + }, + { + "epoch": 0.35479561848784397, + "grad_norm": 10.995403289794922, + "learning_rate": 8.146601955249187e-07, + "logits/chosen": -1.0129978656768799, + "logits/rejected": -1.0595769882202148, + "logps/chosen": -0.4223453998565674, + "logps/rejected": -0.49435415863990784, + "loss": 1.5537, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.055863380432129, + "rewards/margins": 0.1800219863653183, + "rewards/rejected": -1.2358853816986084, + "step": 166 + }, + { + "epoch": 0.3569329414907828, + "grad_norm": 5.968749046325684, + "learning_rate": 8.117449009293668e-07, + "logits/chosen": -0.9696434736251831, + "logits/rejected": -0.9188713431358337, + "logps/chosen": -0.3301633894443512, + "logps/rejected": -0.5673984289169312, + "loss": 1.5133, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8254085183143616, + "rewards/margins": 0.5930875539779663, + "rewards/rejected": -1.4184958934783936, + "step": 167 + }, + { + "epoch": 0.3590702644937216, + "grad_norm": 5.678130626678467, + "learning_rate": 8.088121642448089e-07, + "logits/chosen": -1.0743975639343262, + "logits/rejected": -1.1318800449371338, + "logps/chosen": -0.5486389398574829, + "logps/rejected": -0.498245507478714, + "loss": 1.5696, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3715972900390625, + "rewards/margins": -0.1259835809469223, + "rewards/rejected": -1.2456138134002686, + "step": 168 + }, + { + "epoch": 0.36120758749666043, + "grad_norm": 4.451059818267822, + "learning_rate": 8.058621495575031e-07, + "logits/chosen": -0.9348894953727722, + "logits/rejected": -0.9463680386543274, + "logps/chosen": -0.45605984330177307, + "logps/rejected": -0.5887668132781982, + "loss": 1.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1401495933532715, + "rewards/margins": 0.3317675292491913, + "rewards/rejected": -1.4719170331954956, + "step": 169 + }, + { + "epoch": 0.36334491049959927, + "grad_norm": 4.130940914154053, + "learning_rate": 8.028950219204099e-07, + "logits/chosen": -1.0746759176254272, + "logits/rejected": -1.0093308687210083, + "logps/chosen": -0.3871830701828003, + "logps/rejected": -0.6328672170639038, + "loss": 1.537, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9679576754570007, + "rewards/margins": 0.6142103672027588, + "rewards/rejected": -1.5821679830551147, + "step": 170 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 4.6137824058532715, + "learning_rate": 7.99910947343957e-07, + "logits/chosen": -1.15254807472229, + "logits/rejected": -1.2261916399002075, + "logps/chosen": -0.42545759677886963, + "logps/rejected": -0.834107518196106, + "loss": 1.415, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0636440515518188, + "rewards/margins": 1.0216246843338013, + "rewards/rejected": -2.08526873588562, + "step": 171 + }, + { + "epoch": 0.3676195565054769, + "grad_norm": 4.724302291870117, + "learning_rate": 7.969100927867507e-07, + "logits/chosen": -1.178253412246704, + "logits/rejected": -1.1223783493041992, + "logps/chosen": -0.4991285800933838, + "logps/rejected": -0.5416950583457947, + "loss": 1.4574, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.24782133102417, + "rewards/margins": 0.10641634464263916, + "rewards/rejected": -1.354237675666809, + "step": 172 + }, + { + "epoch": 0.36975687950841574, + "grad_norm": 6.497979640960693, + "learning_rate": 7.938926261462365e-07, + "logits/chosen": -0.8218459486961365, + "logits/rejected": -0.7933223843574524, + "logps/chosen": -0.7412954568862915, + "logps/rejected": -0.8470227718353271, + "loss": 1.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.853238582611084, + "rewards/margins": 0.2643181085586548, + "rewards/rejected": -2.117556571960449, + "step": 173 + }, + { + "epoch": 0.3718942025113545, + "grad_norm": 7.887294292449951, + "learning_rate": 7.908587162493028e-07, + "logits/chosen": -1.0390602350234985, + "logits/rejected": -0.9814541935920715, + "logps/chosen": -0.3448028862476349, + "logps/rejected": -0.37495365738868713, + "loss": 1.5008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.862007200717926, + "rewards/margins": 0.07537690550088882, + "rewards/rejected": -0.9373840689659119, + "step": 174 + }, + { + "epoch": 0.37403152551429336, + "grad_norm": 6.731198310852051, + "learning_rate": 7.878085328428368e-07, + "logits/chosen": -0.9766249656677246, + "logits/rejected": -1.0798594951629639, + "logps/chosen": -0.3443922698497772, + "logps/rejected": -0.4274132549762726, + "loss": 1.5293, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8609806299209595, + "rewards/margins": 0.20755250751972198, + "rewards/rejected": -1.068533182144165, + "step": 175 + }, + { + "epoch": 0.37616884851723215, + "grad_norm": 6.996652603149414, + "learning_rate": 7.84742246584226e-07, + "logits/chosen": -1.1186981201171875, + "logits/rejected": -1.1801022291183472, + "logps/chosen": -0.375766396522522, + "logps/rejected": -0.4341758191585541, + "loss": 1.5967, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9394159317016602, + "rewards/margins": 0.14602357149124146, + "rewards/rejected": -1.0854394435882568, + "step": 176 + }, + { + "epoch": 0.378306171520171, + "grad_norm": 4.98004150390625, + "learning_rate": 7.81660029031811e-07, + "logits/chosen": -1.0316510200500488, + "logits/rejected": -1.0762914419174194, + "logps/chosen": -0.5189335346221924, + "logps/rejected": -0.5200464725494385, + "loss": 1.5431, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2973339557647705, + "rewards/margins": 0.0027822405099868774, + "rewards/rejected": -1.3001161813735962, + "step": 177 + }, + { + "epoch": 0.3804434945231098, + "grad_norm": 5.705168724060059, + "learning_rate": 7.785620526352861e-07, + "logits/chosen": -1.0669925212860107, + "logits/rejected": -1.1380958557128906, + "logps/chosen": -0.682020902633667, + "logps/rejected": -1.2245979309082031, + "loss": 1.5066, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.705052375793457, + "rewards/margins": 1.3564426898956299, + "rewards/rejected": -3.061494827270508, + "step": 178 + }, + { + "epoch": 0.3825808175260486, + "grad_norm": 7.848588466644287, + "learning_rate": 7.754484907260512e-07, + "logits/chosen": -1.2138592004776, + "logits/rejected": -1.069401502609253, + "logps/chosen": -0.49140703678131104, + "logps/rejected": -0.5626181364059448, + "loss": 1.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2285176515579224, + "rewards/margins": 0.1780276894569397, + "rewards/rejected": -1.4065454006195068, + "step": 179 + }, + { + "epoch": 0.38471814052898745, + "grad_norm": 4.452049732208252, + "learning_rate": 7.723195175075135e-07, + "logits/chosen": -0.9280627369880676, + "logits/rejected": -0.9011946320533752, + "logps/chosen": -0.5324864387512207, + "logps/rejected": -0.7281622886657715, + "loss": 1.5208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3312160968780518, + "rewards/margins": 0.48918962478637695, + "rewards/rejected": -1.8204057216644287, + "step": 180 + }, + { + "epoch": 0.38685546353192624, + "grad_norm": 6.341145038604736, + "learning_rate": 7.691753080453411e-07, + "logits/chosen": -0.9016430974006653, + "logits/rejected": -0.9220845103263855, + "logps/chosen": -0.42495110630989075, + "logps/rejected": -0.7001734972000122, + "loss": 1.5602, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0623778104782104, + "rewards/margins": 0.6880559325218201, + "rewards/rejected": -1.7504336833953857, + "step": 181 + }, + { + "epoch": 0.3889927865348651, + "grad_norm": 8.017127990722656, + "learning_rate": 7.660160382576683e-07, + "logits/chosen": -1.072341799736023, + "logits/rejected": -1.0071234703063965, + "logps/chosen": -0.3740030825138092, + "logps/rejected": -0.42607858777046204, + "loss": 1.5577, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9350078105926514, + "rewards/margins": 0.1301887333393097, + "rewards/rejected": -1.0651965141296387, + "step": 182 + }, + { + "epoch": 0.3911301095378039, + "grad_norm": 8.377826690673828, + "learning_rate": 7.628418849052523e-07, + "logits/chosen": -1.172293782234192, + "logits/rejected": -1.061620831489563, + "logps/chosen": -0.4983487129211426, + "logps/rejected": -0.44917386770248413, + "loss": 1.6102, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2458717823028564, + "rewards/margins": -0.12293709814548492, + "rewards/rejected": -1.1229346990585327, + "step": 183 + }, + { + "epoch": 0.3932674325407427, + "grad_norm": 5.036559581756592, + "learning_rate": 7.596530255815845e-07, + "logits/chosen": -1.2493027448654175, + "logits/rejected": -1.249387264251709, + "logps/chosen": -0.3506978154182434, + "logps/rejected": -0.7018638849258423, + "loss": 1.5607, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8767446279525757, + "rewards/margins": 0.8779150247573853, + "rewards/rejected": -1.754659652709961, + "step": 184 + }, + { + "epoch": 0.39540475554368154, + "grad_norm": 5.925335884094238, + "learning_rate": 7.564496387029531e-07, + "logits/chosen": -1.0624215602874756, + "logits/rejected": -1.0493366718292236, + "logps/chosen": -0.5904009342193604, + "logps/rejected": -0.8243600130081177, + "loss": 1.5543, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4760024547576904, + "rewards/margins": 0.5848975777626038, + "rewards/rejected": -2.0608999729156494, + "step": 185 + }, + { + "epoch": 0.3975420785466204, + "grad_norm": 5.356679916381836, + "learning_rate": 7.532319034984614e-07, + "logits/chosen": -1.097068190574646, + "logits/rejected": -1.1813206672668457, + "logps/chosen": -0.6806610822677612, + "logps/rejected": -1.157942533493042, + "loss": 1.595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7016526460647583, + "rewards/margins": 1.1932036876678467, + "rewards/rejected": -2.8948559761047363, + "step": 186 + }, + { + "epoch": 0.39967940154955917, + "grad_norm": 16.624237060546875, + "learning_rate": 7.5e-07, + "logits/chosen": -1.0386743545532227, + "logits/rejected": -0.9917205572128296, + "logps/chosen": -0.46858224272727966, + "logps/rejected": -0.5119574666023254, + "loss": 1.5693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1714555025100708, + "rewards/margins": 0.10843798518180847, + "rewards/rejected": -1.2798936367034912, + "step": 187 + }, + { + "epoch": 0.401816724552498, + "grad_norm": 4.841832160949707, + "learning_rate": 7.467541090321733e-07, + "logits/chosen": -1.1640836000442505, + "logits/rejected": -1.1297622919082642, + "logps/chosen": -0.4247099757194519, + "logps/rejected": -0.44790923595428467, + "loss": 1.4403, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0617749691009521, + "rewards/margins": 0.05799813196063042, + "rewards/rejected": -1.1197729110717773, + "step": 188 + }, + { + "epoch": 0.4039540475554368, + "grad_norm": 4.527658939361572, + "learning_rate": 7.434944122021836e-07, + "logits/chosen": -1.0958904027938843, + "logits/rejected": -1.0618705749511719, + "logps/chosen": -0.492468923330307, + "logps/rejected": -0.7371240854263306, + "loss": 1.4343, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2311724424362183, + "rewards/margins": 0.6116377115249634, + "rewards/rejected": -1.8428101539611816, + "step": 189 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 5.381464004516602, + "learning_rate": 7.402210918896689e-07, + "logits/chosen": -1.1516568660736084, + "logits/rejected": -1.075246810913086, + "logps/chosen": -0.40711236000061035, + "logps/rejected": -0.6080544590950012, + "loss": 1.4249, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0177809000015259, + "rewards/margins": 0.5023550987243652, + "rewards/rejected": -1.5201361179351807, + "step": 190 + }, + { + "epoch": 0.40822869356131447, + "grad_norm": 3.7837634086608887, + "learning_rate": 7.369343312364993e-07, + "logits/chosen": -0.8244751691818237, + "logits/rejected": -0.9799928069114685, + "logps/chosen": -0.3934876620769501, + "logps/rejected": -0.8731093406677246, + "loss": 1.4991, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9837191104888916, + "rewards/margins": 1.19905424118042, + "rewards/rejected": -2.1827735900878906, + "step": 191 + }, + { + "epoch": 0.41036601656425326, + "grad_norm": 15.497288703918457, + "learning_rate": 7.33634314136531e-07, + "logits/chosen": -0.9831448793411255, + "logits/rejected": -0.9924254417419434, + "logps/chosen": -0.4264066815376282, + "logps/rejected": -0.47621241211891174, + "loss": 1.5327, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.066016674041748, + "rewards/margins": 0.12451447546482086, + "rewards/rejected": -1.1905312538146973, + "step": 192 + }, + { + "epoch": 0.4125033395671921, + "grad_norm": 8.750000953674316, + "learning_rate": 7.303212252253161e-07, + "logits/chosen": -1.0581997632980347, + "logits/rejected": -0.9458177089691162, + "logps/chosen": -0.44811898469924927, + "logps/rejected": -0.5640084147453308, + "loss": 1.515, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1202974319458008, + "rewards/margins": 0.28972357511520386, + "rewards/rejected": -1.4100210666656494, + "step": 193 + }, + { + "epoch": 0.41464066257013094, + "grad_norm": 6.45157527923584, + "learning_rate": 7.269952498697734e-07, + "logits/chosen": -1.1394294500350952, + "logits/rejected": -1.0299630165100098, + "logps/chosen": -0.5914661884307861, + "logps/rejected": -0.7591831684112549, + "loss": 1.4898, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4786653518676758, + "rewards/margins": 0.41929247975349426, + "rewards/rejected": -1.8979580402374268, + "step": 194 + }, + { + "epoch": 0.4167779855730697, + "grad_norm": 12.109285354614258, + "learning_rate": 7.236565741578162e-07, + "logits/chosen": -1.0763144493103027, + "logits/rejected": -1.0646532773971558, + "logps/chosen": -0.6971418261528015, + "logps/rejected": -0.7001355886459351, + "loss": 1.6446, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7428545951843262, + "rewards/margins": 0.0074843429028987885, + "rewards/rejected": -1.7503387928009033, + "step": 195 + }, + { + "epoch": 0.41891530857600856, + "grad_norm": 5.321496486663818, + "learning_rate": 7.203053848879418e-07, + "logits/chosen": -1.0298616886138916, + "logits/rejected": -1.0654942989349365, + "logps/chosen": -0.3775436580181122, + "logps/rejected": -0.4247080981731415, + "loss": 1.5052, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9438591599464417, + "rewards/margins": 0.11791113764047623, + "rewards/rejected": -1.0617702007293701, + "step": 196 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 5.436696529388428, + "learning_rate": 7.16941869558779e-07, + "logits/chosen": -0.8212717771530151, + "logits/rejected": -0.8311895728111267, + "logps/chosen": -0.3471302390098572, + "logps/rejected": -0.38963112235069275, + "loss": 1.4919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8678255677223206, + "rewards/margins": 0.10625223070383072, + "rewards/rejected": -0.9740778803825378, + "step": 197 + }, + { + "epoch": 0.4231899545818862, + "grad_norm": 4.8039374351501465, + "learning_rate": 7.135662163585984e-07, + "logits/chosen": -1.043779969215393, + "logits/rejected": -1.0483262538909912, + "logps/chosen": -0.4499708116054535, + "logps/rejected": -0.7638281583786011, + "loss": 1.4337, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.124927043914795, + "rewards/margins": 0.7846433520317078, + "rewards/rejected": -1.9095702171325684, + "step": 198 + }, + { + "epoch": 0.425327277584825, + "grad_norm": 16.792325973510742, + "learning_rate": 7.101786141547828e-07, + "logits/chosen": -1.0792505741119385, + "logits/rejected": -1.0246937274932861, + "logps/chosen": -0.5006421804428101, + "logps/rejected": -0.7959640026092529, + "loss": 1.6284, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.25160551071167, + "rewards/margins": 0.7383045554161072, + "rewards/rejected": -1.9899098873138428, + "step": 199 + }, + { + "epoch": 0.4274646005877638, + "grad_norm": 3.4819066524505615, + "learning_rate": 7.067792524832603e-07, + "logits/chosen": -1.146781086921692, + "logits/rejected": -1.1264899969100952, + "logps/chosen": -0.41286712884902954, + "logps/rejected": -0.5021071434020996, + "loss": 1.5828, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0321677923202515, + "rewards/margins": 0.22309982776641846, + "rewards/rejected": -1.25526762008667, + "step": 200 + }, + { + "epoch": 0.42960192359070265, + "grad_norm": 4.699824810028076, + "learning_rate": 7.033683215379002e-07, + "logits/chosen": -1.204376459121704, + "logits/rejected": -1.0951697826385498, + "logps/chosen": -0.5037363171577454, + "logps/rejected": -0.5278567671775818, + "loss": 1.4982, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.259340763092041, + "rewards/margins": 0.060300953686237335, + "rewards/rejected": -1.3196419477462769, + "step": 201 + }, + { + "epoch": 0.4317392465936415, + "grad_norm": 6.670827865600586, + "learning_rate": 6.999460121598704e-07, + "logits/chosen": -1.240894079208374, + "logits/rejected": -1.1903066635131836, + "logps/chosen": -0.43070024251937866, + "logps/rejected": -0.5488986968994141, + "loss": 1.4348, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.076750636100769, + "rewards/margins": 0.29549601674079895, + "rewards/rejected": -1.3722467422485352, + "step": 202 + }, + { + "epoch": 0.4338765695965803, + "grad_norm": 5.494087219238281, + "learning_rate": 6.965125158269618e-07, + "logits/chosen": -0.9625495672225952, + "logits/rejected": -1.0179930925369263, + "logps/chosen": -0.46157702803611755, + "logps/rejected": -0.46555888652801514, + "loss": 1.5279, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.153942584991455, + "rewards/margins": 0.009954705834388733, + "rewards/rejected": -1.1638972759246826, + "step": 203 + }, + { + "epoch": 0.4360138925995191, + "grad_norm": 2.9398574829101562, + "learning_rate": 6.93068024642873e-07, + "logits/chosen": -1.0253064632415771, + "logits/rejected": -1.0069971084594727, + "logps/chosen": -0.7158012390136719, + "logps/rejected": -1.0074169635772705, + "loss": 1.4758, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7895030975341797, + "rewards/margins": 0.7290392518043518, + "rewards/rejected": -2.5185422897338867, + "step": 204 + }, + { + "epoch": 0.4381512156024579, + "grad_norm": 19.90937614440918, + "learning_rate": 6.896127313264642e-07, + "logits/chosen": -1.1298977136611938, + "logits/rejected": -1.038535475730896, + "logps/chosen": -0.4616938531398773, + "logps/rejected": -0.5090115070343018, + "loss": 1.5697, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1542346477508545, + "rewards/margins": 0.11829426884651184, + "rewards/rejected": -1.272528886795044, + "step": 205 + }, + { + "epoch": 0.44028853860539674, + "grad_norm": 8.115141868591309, + "learning_rate": 6.861468292009726e-07, + "logits/chosen": -1.1551835536956787, + "logits/rejected": -1.0220582485198975, + "logps/chosen": -0.5703631639480591, + "logps/rejected": -0.639542818069458, + "loss": 1.5053, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.4259079694747925, + "rewards/margins": 0.17294909060001373, + "rewards/rejected": -1.5988571643829346, + "step": 206 + }, + { + "epoch": 0.4424258616083356, + "grad_norm": 4.750410079956055, + "learning_rate": 6.826705121831976e-07, + "logits/chosen": -1.0615158081054688, + "logits/rejected": -1.0426111221313477, + "logps/chosen": -0.4923417270183563, + "logps/rejected": -0.7326016426086426, + "loss": 1.6027, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2308542728424072, + "rewards/margins": 0.6006497144699097, + "rewards/rejected": -1.8315041065216064, + "step": 207 + }, + { + "epoch": 0.44456318461127436, + "grad_norm": 10.167664527893066, + "learning_rate": 6.7918397477265e-07, + "logits/chosen": -0.9964532852172852, + "logits/rejected": -0.9452884793281555, + "logps/chosen": -0.44161027669906616, + "logps/rejected": -0.46314936876296997, + "loss": 1.4903, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1040257215499878, + "rewards/margins": 0.05384783446788788, + "rewards/rejected": -1.157873511314392, + "step": 208 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 6.646627426147461, + "learning_rate": 6.756874120406714e-07, + "logits/chosen": -1.2888025045394897, + "logits/rejected": -1.1944079399108887, + "logps/chosen": -0.5191195607185364, + "logps/rejected": -0.5014922022819519, + "loss": 1.5891, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.297798991203308, + "rewards/margins": -0.04406837746500969, + "rewards/rejected": -1.2537304162979126, + "step": 209 + }, + { + "epoch": 0.448837830617152, + "grad_norm": 14.135600090026855, + "learning_rate": 6.721810196195174e-07, + "logits/chosen": -1.063171148300171, + "logits/rejected": -0.992850124835968, + "logps/chosen": -0.6121366024017334, + "logps/rejected": -0.8903471231460571, + "loss": 1.5107, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.530341625213623, + "rewards/margins": 0.695526123046875, + "rewards/rejected": -2.225867748260498, + "step": 210 + }, + { + "epoch": 0.45097515362009083, + "grad_norm": 5.86182165145874, + "learning_rate": 6.68664993691415e-07, + "logits/chosen": -1.0612831115722656, + "logits/rejected": -0.9907495975494385, + "logps/chosen": -0.5007533431053162, + "logps/rejected": -0.558498203754425, + "loss": 1.4862, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2518832683563232, + "rewards/margins": 0.144362211227417, + "rewards/rejected": -1.3962455987930298, + "step": 211 + }, + { + "epoch": 0.45311247662302967, + "grad_norm": 6.808183670043945, + "learning_rate": 6.651395309775836e-07, + "logits/chosen": -1.0104148387908936, + "logits/rejected": -0.9466644525527954, + "logps/chosen": -0.5144920349121094, + "logps/rejected": -0.7595266103744507, + "loss": 1.5023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.286230206489563, + "rewards/margins": 0.612586259841919, + "rewards/rejected": -1.898816466331482, + "step": 212 + }, + { + "epoch": 0.45524979962596845, + "grad_norm": 10.564079284667969, + "learning_rate": 6.6160482872723e-07, + "logits/chosen": -0.9594148397445679, + "logits/rejected": -0.6907047629356384, + "logps/chosen": -0.5221553444862366, + "logps/rejected": -0.5170431733131409, + "loss": 1.3874, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.305388331413269, + "rewards/margins": -0.012780493125319481, + "rewards/rejected": -1.2926077842712402, + "step": 213 + }, + { + "epoch": 0.4573871226289073, + "grad_norm": 3.8123362064361572, + "learning_rate": 6.580610847065123e-07, + "logits/chosen": -1.187415599822998, + "logits/rejected": -1.2069289684295654, + "logps/chosen": -0.5202743411064148, + "logps/rejected": -0.7161551117897034, + "loss": 1.4731, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3006858825683594, + "rewards/margins": 0.4897017776966095, + "rewards/rejected": -1.790387749671936, + "step": 214 + }, + { + "epoch": 0.45952444563184613, + "grad_norm": 5.941867351531982, + "learning_rate": 6.545084971874736e-07, + "logits/chosen": -1.1221826076507568, + "logits/rejected": -1.225363850593567, + "logps/chosen": -0.5203535556793213, + "logps/rejected": -0.721761167049408, + "loss": 1.5282, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3008837699890137, + "rewards/margins": 0.5035191178321838, + "rewards/rejected": -1.8044028282165527, + "step": 215 + }, + { + "epoch": 0.4616617686347849, + "grad_norm": 9.767511367797852, + "learning_rate": 6.509472649369509e-07, + "logits/chosen": -1.0308146476745605, + "logits/rejected": -1.0452611446380615, + "logps/chosen": -0.5595332980155945, + "logps/rejected": -0.8848905563354492, + "loss": 1.4983, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3988330364227295, + "rewards/margins": 0.8133932948112488, + "rewards/rejected": -2.212226152420044, + "step": 216 + }, + { + "epoch": 0.46379909163772376, + "grad_norm": 10.346195220947266, + "learning_rate": 6.473775872054521e-07, + "logits/chosen": -1.065643072128296, + "logits/rejected": -1.0063228607177734, + "logps/chosen": -0.6225321888923645, + "logps/rejected": -0.695502758026123, + "loss": 1.579, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5563305616378784, + "rewards/margins": 0.18242624402046204, + "rewards/rejected": -1.7387568950653076, + "step": 217 + }, + { + "epoch": 0.46593641464066254, + "grad_norm": 5.300914764404297, + "learning_rate": 6.437996637160086e-07, + "logits/chosen": -1.1427751779556274, + "logits/rejected": -1.0561769008636475, + "logps/chosen": -0.34371453523635864, + "logps/rejected": -0.5036041140556335, + "loss": 1.569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8592862486839294, + "rewards/margins": 0.39972397685050964, + "rewards/rejected": -1.2590101957321167, + "step": 218 + }, + { + "epoch": 0.4680737376436014, + "grad_norm": 4.065801620483398, + "learning_rate": 6.402136946530014e-07, + "logits/chosen": -0.9067018032073975, + "logits/rejected": -0.8860527873039246, + "logps/chosen": -0.4043842852115631, + "logps/rejected": -0.37620705366134644, + "loss": 1.5412, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0109608173370361, + "rewards/margins": -0.07044325768947601, + "rewards/rejected": -0.9405175447463989, + "step": 219 + }, + { + "epoch": 0.4702110606465402, + "grad_norm": 7.65117073059082, + "learning_rate": 6.3661988065096e-07, + "logits/chosen": -1.2153977155685425, + "logits/rejected": -1.1215893030166626, + "logps/chosen": -0.7252041101455688, + "logps/rejected": -0.8050605058670044, + "loss": 1.5311, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.813010334968567, + "rewards/margins": 0.19964104890823364, + "rewards/rejected": -2.0126514434814453, + "step": 220 + }, + { + "epoch": 0.472348383649479, + "grad_norm": 11.2428560256958, + "learning_rate": 6.330184227833375e-07, + "logits/chosen": -0.9373965859413147, + "logits/rejected": -0.9961149096488953, + "logps/chosen": -0.35726746916770935, + "logps/rejected": -0.3844991624355316, + "loss": 1.6773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.893168568611145, + "rewards/margins": 0.06807927042245865, + "rewards/rejected": -0.9612478613853455, + "step": 221 + }, + { + "epoch": 0.47448570665241785, + "grad_norm": 4.207274913787842, + "learning_rate": 6.294095225512604e-07, + "logits/chosen": -1.1654491424560547, + "logits/rejected": -0.9704052805900574, + "logps/chosen": -0.5772050023078918, + "logps/rejected": -0.5813780426979065, + "loss": 1.5306, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4430124759674072, + "rewards/margins": 0.010432573035359383, + "rewards/rejected": -1.4534451961517334, + "step": 222 + }, + { + "epoch": 0.4766230296553567, + "grad_norm": 9.872684478759766, + "learning_rate": 6.257933818722542e-07, + "logits/chosen": -1.1065782308578491, + "logits/rejected": -1.0201388597488403, + "logps/chosen": -0.45536190271377563, + "logps/rejected": -0.501686155796051, + "loss": 1.6091, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1384047269821167, + "rewards/margins": 0.1158108189702034, + "rewards/rejected": -1.2542154788970947, + "step": 223 + }, + { + "epoch": 0.4787603526582955, + "grad_norm": 4.423720359802246, + "learning_rate": 6.22170203068947e-07, + "logits/chosen": -1.0781972408294678, + "logits/rejected": -1.1105687618255615, + "logps/chosen": -0.4524337351322174, + "logps/rejected": -0.6186787486076355, + "loss": 1.4693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1310843229293823, + "rewards/margins": 0.41561245918273926, + "rewards/rejected": -1.5466969013214111, + "step": 224 + }, + { + "epoch": 0.4808976756612343, + "grad_norm": 4.056224822998047, + "learning_rate": 6.185401888577487e-07, + "logits/chosen": -0.9637656211853027, + "logits/rejected": -1.049843668937683, + "logps/chosen": -0.4497312009334564, + "logps/rejected": -0.48656415939331055, + "loss": 1.5703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1243281364440918, + "rewards/margins": 0.09208241105079651, + "rewards/rejected": -1.2164103984832764, + "step": 225 + }, + { + "epoch": 0.4830349986641731, + "grad_norm": 7.398245334625244, + "learning_rate": 6.149035423375098e-07, + "logits/chosen": -0.9626678228378296, + "logits/rejected": -0.9362674355506897, + "logps/chosen": -0.6995605230331421, + "logps/rejected": -0.7957741022109985, + "loss": 1.4827, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7489013671875, + "rewards/margins": 0.24053387343883514, + "rewards/rejected": -1.9894351959228516, + "step": 226 + }, + { + "epoch": 0.48517232166711194, + "grad_norm": 6.272643089294434, + "learning_rate": 6.112604669781572e-07, + "logits/chosen": -1.1032443046569824, + "logits/rejected": -0.9800142049789429, + "logps/chosen": -0.46503975987434387, + "logps/rejected": -0.6332381963729858, + "loss": 1.478, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1625994443893433, + "rewards/margins": 0.42049604654312134, + "rewards/rejected": -1.5830953121185303, + "step": 227 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 8.032245635986328, + "learning_rate": 6.07611166609311e-07, + "logits/chosen": -1.2633821964263916, + "logits/rejected": -1.1391656398773193, + "logps/chosen": -0.442541241645813, + "logps/rejected": -0.7329965829849243, + "loss": 1.5334, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1063531637191772, + "rewards/margins": 0.7261385917663574, + "rewards/rejected": -1.8324915170669556, + "step": 228 + }, + { + "epoch": 0.48944696767298956, + "grad_norm": 3.549194097518921, + "learning_rate": 6.039558454088795e-07, + "logits/chosen": -1.1071525812149048, + "logits/rejected": -0.9752082228660583, + "logps/chosen": -0.7183459401130676, + "logps/rejected": -0.5012776255607605, + "loss": 1.5703, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.7958645820617676, + "rewards/margins": -0.542670726776123, + "rewards/rejected": -1.2531940937042236, + "step": 229 + }, + { + "epoch": 0.4915842906759284, + "grad_norm": 7.271309852600098, + "learning_rate": 6.002947078916364e-07, + "logits/chosen": -1.1285755634307861, + "logits/rejected": -1.0309889316558838, + "logps/chosen": -0.5892479419708252, + "logps/rejected": -0.8529112339019775, + "loss": 1.5424, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4731197357177734, + "rewards/margins": 0.65915846824646, + "rewards/rejected": -2.1322782039642334, + "step": 230 + }, + { + "epoch": 0.49372161367886724, + "grad_norm": 9.371764183044434, + "learning_rate": 5.966279588977766e-07, + "logits/chosen": -1.1868503093719482, + "logits/rejected": -1.0965262651443481, + "logps/chosen": -0.5913974642753601, + "logps/rejected": -0.5998914837837219, + "loss": 1.5586, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4784936904907227, + "rewards/margins": 0.02123492956161499, + "rewards/rejected": -1.4997284412384033, + "step": 231 + }, + { + "epoch": 0.49585893668180603, + "grad_norm": 3.9326915740966797, + "learning_rate": 5.929558035814574e-07, + "logits/chosen": -1.0558724403381348, + "logits/rejected": -1.0998754501342773, + "logps/chosen": -0.5341198444366455, + "logps/rejected": -0.6374375820159912, + "loss": 1.4501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3352996110916138, + "rewards/margins": 0.25829440355300903, + "rewards/rejected": -1.593593955039978, + "step": 232 + }, + { + "epoch": 0.49799625968474487, + "grad_norm": 5.740602016448975, + "learning_rate": 5.892784473993183e-07, + "logits/chosen": -0.9249231815338135, + "logits/rejected": -0.9186200499534607, + "logps/chosen": -0.5150828957557678, + "logps/rejected": -0.7307932376861572, + "loss": 1.5555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2877074480056763, + "rewards/margins": 0.5392757654190063, + "rewards/rejected": -1.826983094215393, + "step": 233 + }, + { + "epoch": 0.5001335826876837, + "grad_norm": 4.63511323928833, + "learning_rate": 5.855960960989876e-07, + "logits/chosen": -1.0853294134140015, + "logits/rejected": -0.9906125068664551, + "logps/chosen": -0.6735414266586304, + "logps/rejected": -0.6355391144752502, + "loss": 1.5485, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6838535070419312, + "rewards/margins": -0.09500567615032196, + "rewards/rejected": -1.5888478755950928, + "step": 234 + }, + { + "epoch": 0.5022709056906225, + "grad_norm": 4.9034810066223145, + "learning_rate": 5.819089557075688e-07, + "logits/chosen": -1.1883198022842407, + "logits/rejected": -1.111728310585022, + "logps/chosen": -0.5791587233543396, + "logps/rejected": -0.5992032289505005, + "loss": 1.4893, + "rewards/accuracies": 0.1875, + "rewards/chosen": -1.4478968381881714, + "rewards/margins": 0.05011126026511192, + "rewards/rejected": -1.498008131980896, + "step": 235 + }, + { + "epoch": 0.5044082286935613, + "grad_norm": 4.951679706573486, + "learning_rate": 5.782172325201155e-07, + "logits/chosen": -1.099345088005066, + "logits/rejected": -0.9732553362846375, + "logps/chosen": -0.41670212149620056, + "logps/rejected": -0.49779534339904785, + "loss": 1.4412, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.041755199432373, + "rewards/margins": 0.2027330994606018, + "rewards/rejected": -1.2444884777069092, + "step": 236 + }, + { + "epoch": 0.5065455516965002, + "grad_norm": 5.122452259063721, + "learning_rate": 5.745211330880872e-07, + "logits/chosen": -1.1886011362075806, + "logits/rejected": -1.0731868743896484, + "logps/chosen": -0.5531087517738342, + "logps/rejected": -0.5820314884185791, + "loss": 1.4998, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3827718496322632, + "rewards/margins": 0.07230678200721741, + "rewards/rejected": -1.4550786018371582, + "step": 237 + }, + { + "epoch": 0.508682874699439, + "grad_norm": 5.16255521774292, + "learning_rate": 5.708208642077945e-07, + "logits/chosen": -0.9164212942123413, + "logits/rejected": -0.9998050332069397, + "logps/chosen": -0.4382399022579193, + "logps/rejected": -0.526303768157959, + "loss": 1.4613, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0955997705459595, + "rewards/margins": 0.22015975415706635, + "rewards/rejected": -1.315759539604187, + "step": 238 + }, + { + "epoch": 0.5108201977023777, + "grad_norm": 5.4947052001953125, + "learning_rate": 5.671166329088277e-07, + "logits/chosen": -1.2119641304016113, + "logits/rejected": -1.2597813606262207, + "logps/chosen": -0.5000596046447754, + "logps/rejected": -0.6806124448776245, + "loss": 1.49, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2501490116119385, + "rewards/margins": 0.4513818621635437, + "rewards/rejected": -1.701530933380127, + "step": 239 + }, + { + "epoch": 0.5129575207053166, + "grad_norm": 6.559435844421387, + "learning_rate": 5.634086464424742e-07, + "logits/chosen": -1.217972755432129, + "logits/rejected": -1.1487551927566528, + "logps/chosen": -0.49298563599586487, + "logps/rejected": -0.6521138548851013, + "loss": 1.5189, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.232464075088501, + "rewards/margins": 0.3978206217288971, + "rewards/rejected": -1.6302846670150757, + "step": 240 + }, + { + "epoch": 0.5150948437082554, + "grad_norm": 11.107866287231445, + "learning_rate": 5.596971122701221e-07, + "logits/chosen": -1.1741474866867065, + "logits/rejected": -1.1197490692138672, + "logps/chosen": -0.3616866171360016, + "logps/rejected": -0.4305798411369324, + "loss": 1.5347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9042165875434875, + "rewards/margins": 0.17223304510116577, + "rewards/rejected": -1.0764496326446533, + "step": 241 + }, + { + "epoch": 0.5172321667111942, + "grad_norm": 6.960956573486328, + "learning_rate": 5.559822380516539e-07, + "logits/chosen": -1.2302253246307373, + "logits/rejected": -1.075247883796692, + "logps/chosen": -0.4363064169883728, + "logps/rejected": -0.31692779064178467, + "loss": 1.5663, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.090766191482544, + "rewards/margins": -0.29844653606414795, + "rewards/rejected": -0.7923195958137512, + "step": 242 + }, + { + "epoch": 0.5193694897141331, + "grad_norm": 6.810981273651123, + "learning_rate": 5.522642316338268e-07, + "logits/chosen": -1.0663301944732666, + "logits/rejected": -1.1324117183685303, + "logps/chosen": -0.4831632673740387, + "logps/rejected": -0.7216737270355225, + "loss": 1.4868, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.207908272743225, + "rewards/margins": 0.596276044845581, + "rewards/rejected": -1.8041841983795166, + "step": 243 + }, + { + "epoch": 0.5215068127170719, + "grad_norm": 5.594532489776611, + "learning_rate": 5.48543301038644e-07, + "logits/chosen": -1.1370617151260376, + "logits/rejected": -1.1158241033554077, + "logps/chosen": -0.49420881271362305, + "logps/rejected": -0.6353664398193359, + "loss": 1.4837, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2355221509933472, + "rewards/margins": 0.35289400815963745, + "rewards/rejected": -1.5884160995483398, + "step": 244 + }, + { + "epoch": 0.5236441357200107, + "grad_norm": 5.150038719177246, + "learning_rate": 5.448196544517167e-07, + "logits/chosen": -0.9938709139823914, + "logits/rejected": -0.9924468398094177, + "logps/chosen": -0.46965187788009644, + "logps/rejected": -0.5854384899139404, + "loss": 1.4984, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.174129843711853, + "rewards/margins": 0.28946635127067566, + "rewards/rejected": -1.4635961055755615, + "step": 245 + }, + { + "epoch": 0.5257814587229495, + "grad_norm": 5.775374412536621, + "learning_rate": 5.410935002106152e-07, + "logits/chosen": -1.2858668565750122, + "logits/rejected": -1.339996099472046, + "logps/chosen": -0.5900648832321167, + "logps/rejected": -0.743665337562561, + "loss": 1.5673, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.475162148475647, + "rewards/margins": 0.38400113582611084, + "rewards/rejected": -1.8591632843017578, + "step": 246 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 6.673636436462402, + "learning_rate": 5.373650467932121e-07, + "logits/chosen": -0.9241991639137268, + "logits/rejected": -0.9246317744255066, + "logps/chosen": -0.7461893558502197, + "logps/rejected": -1.305277705192566, + "loss": 1.5427, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8654735088348389, + "rewards/margins": 1.3977206945419312, + "rewards/rejected": -3.2631943225860596, + "step": 247 + }, + { + "epoch": 0.5300561047288271, + "grad_norm": 10.964753150939941, + "learning_rate": 5.336345028060199e-07, + "logits/chosen": -1.1549595594406128, + "logits/rejected": -1.0779494047164917, + "logps/chosen": -0.4138936996459961, + "logps/rejected": -0.5448856353759766, + "loss": 1.5164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0347342491149902, + "rewards/margins": 0.32747986912727356, + "rewards/rejected": -1.362214207649231, + "step": 248 + }, + { + "epoch": 0.5321934277317659, + "grad_norm": 7.340090751647949, + "learning_rate": 5.299020769725171e-07, + "logits/chosen": -1.2774559259414673, + "logits/rejected": -1.161049723625183, + "logps/chosen": -0.5907987356185913, + "logps/rejected": -0.784879207611084, + "loss": 1.5591, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4769967794418335, + "rewards/margins": 0.48520126938819885, + "rewards/rejected": -1.96219801902771, + "step": 249 + }, + { + "epoch": 0.5343307507347048, + "grad_norm": 9.162212371826172, + "learning_rate": 5.26167978121472e-07, + "logits/chosen": -0.9980146884918213, + "logits/rejected": -0.9839775562286377, + "logps/chosen": -0.5775759816169739, + "logps/rejected": -0.9316012859344482, + "loss": 1.5055, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4439398050308228, + "rewards/margins": 0.8850634694099426, + "rewards/rejected": -2.32900333404541, + "step": 250 + }, + { + "epoch": 0.5364680737376436, + "grad_norm": 6.221843242645264, + "learning_rate": 5.224324151752575e-07, + "logits/chosen": -1.1261907815933228, + "logits/rejected": -1.119322657585144, + "logps/chosen": -0.6372994184494019, + "logps/rejected": -0.7105351686477661, + "loss": 1.5162, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5932482481002808, + "rewards/margins": 0.1830897331237793, + "rewards/rejected": -1.7763381004333496, + "step": 251 + }, + { + "epoch": 0.5386053967405824, + "grad_norm": 7.211760997772217, + "learning_rate": 5.18695597138163e-07, + "logits/chosen": -1.0601727962493896, + "logits/rejected": -1.0903403759002686, + "logps/chosen": -0.6433329582214355, + "logps/rejected": -0.7051988840103149, + "loss": 1.5038, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6083323955535889, + "rewards/margins": 0.15466496348381042, + "rewards/rejected": -1.7629972696304321, + "step": 252 + }, + { + "epoch": 0.5407427197435213, + "grad_norm": 11.955366134643555, + "learning_rate": 5.149577330846992e-07, + "logits/chosen": -0.9905640482902527, + "logits/rejected": -0.9827526807785034, + "logps/chosen": -0.460682213306427, + "logps/rejected": -0.5131441950798035, + "loss": 1.5193, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.1517056226730347, + "rewards/margins": 0.13115480542182922, + "rewards/rejected": -1.282860279083252, + "step": 253 + }, + { + "epoch": 0.5428800427464601, + "grad_norm": 7.990007400512695, + "learning_rate": 5.112190321479025e-07, + "logits/chosen": -1.0986907482147217, + "logits/rejected": -1.047619342803955, + "logps/chosen": -0.42801418900489807, + "logps/rejected": -0.4528927206993103, + "loss": 1.5259, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.070035457611084, + "rewards/margins": 0.062196291983127594, + "rewards/rejected": -1.1322317123413086, + "step": 254 + }, + { + "epoch": 0.5450173657493989, + "grad_norm": 12.59540843963623, + "learning_rate": 5.074797035076318e-07, + "logits/chosen": -1.088548183441162, + "logits/rejected": -1.094880223274231, + "logps/chosen": -0.44651007652282715, + "logps/rejected": -0.6774522066116333, + "loss": 1.4684, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1162753105163574, + "rewards/margins": 0.577355146408081, + "rewards/rejected": -1.6936304569244385, + "step": 255 + }, + { + "epoch": 0.5471546887523377, + "grad_norm": 8.063529968261719, + "learning_rate": 5.037399563788664e-07, + "logits/chosen": -1.0905590057373047, + "logits/rejected": -1.0908201932907104, + "logps/chosen": -0.571644127368927, + "logps/rejected": -1.0555922985076904, + "loss": 1.4844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.429110050201416, + "rewards/margins": 1.2098705768585205, + "rewards/rejected": -2.6389808654785156, + "step": 256 + }, + { + "epoch": 0.5492920117552765, + "grad_norm": 4.676781177520752, + "learning_rate": 5e-07, + "logits/chosen": -0.9753708839416504, + "logits/rejected": -0.8780065774917603, + "logps/chosen": -0.4151499271392822, + "logps/rejected": -0.6088293790817261, + "loss": 1.5588, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0378748178482056, + "rewards/margins": 0.4841986298561096, + "rewards/rejected": -1.5220733880996704, + "step": 257 + }, + { + "epoch": 0.5514293347582153, + "grad_norm": 5.617074489593506, + "learning_rate": 4.962600436211335e-07, + "logits/chosen": -1.0411148071289062, + "logits/rejected": -1.0378780364990234, + "logps/chosen": -0.4362824261188507, + "logps/rejected": -0.6974292993545532, + "loss": 1.5426, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0907059907913208, + "rewards/margins": 0.6528674960136414, + "rewards/rejected": -1.743573546409607, + "step": 258 + }, + { + "epoch": 0.5535666577611541, + "grad_norm": 4.551398754119873, + "learning_rate": 4.925202964923683e-07, + "logits/chosen": -1.2078725099563599, + "logits/rejected": -1.1912176609039307, + "logps/chosen": -0.6027969121932983, + "logps/rejected": -0.6982436180114746, + "loss": 1.4439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.506992220878601, + "rewards/margins": 0.23861676454544067, + "rewards/rejected": -1.7456090450286865, + "step": 259 + }, + { + "epoch": 0.555703980764093, + "grad_norm": 7.812004566192627, + "learning_rate": 4.887809678520975e-07, + "logits/chosen": -0.9420537948608398, + "logits/rejected": -0.9832956790924072, + "logps/chosen": -0.4232637584209442, + "logps/rejected": -0.5239140391349792, + "loss": 1.531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.058159351348877, + "rewards/margins": 0.2516256868839264, + "rewards/rejected": -1.30978524684906, + "step": 260 + }, + { + "epoch": 0.5578413037670318, + "grad_norm": 6.302420139312744, + "learning_rate": 4.850422669153009e-07, + "logits/chosen": -0.898323655128479, + "logits/rejected": -0.8499814867973328, + "logps/chosen": -0.4516730308532715, + "logps/rejected": -0.48423612117767334, + "loss": 1.5732, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1291825771331787, + "rewards/margins": 0.08140772581100464, + "rewards/rejected": -1.2105903625488281, + "step": 261 + }, + { + "epoch": 0.5599786267699706, + "grad_norm": 4.167732238769531, + "learning_rate": 4.813044028618372e-07, + "logits/chosen": -1.16361403465271, + "logits/rejected": -1.165586233139038, + "logps/chosen": -0.7509727478027344, + "logps/rejected": -1.160327672958374, + "loss": 1.531, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8774319887161255, + "rewards/margins": 1.0233874320983887, + "rewards/rejected": -2.9008195400238037, + "step": 262 + }, + { + "epoch": 0.5621159497729095, + "grad_norm": 3.631260633468628, + "learning_rate": 4.775675848247427e-07, + "logits/chosen": -1.097744107246399, + "logits/rejected": -1.1007401943206787, + "logps/chosen": -0.6823064088821411, + "logps/rejected": -0.7260396480560303, + "loss": 1.4614, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.705765962600708, + "rewards/margins": 0.10933306813240051, + "rewards/rejected": -1.8150990009307861, + "step": 263 + }, + { + "epoch": 0.5642532727758482, + "grad_norm": 10.018603324890137, + "learning_rate": 4.7383202187852804e-07, + "logits/chosen": -1.0766931772232056, + "logits/rejected": -1.059554934501648, + "logps/chosen": -0.592974066734314, + "logps/rejected": -0.5639240741729736, + "loss": 1.6513, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.4824351072311401, + "rewards/margins": -0.07262498140335083, + "rewards/rejected": -1.4098100662231445, + "step": 264 + }, + { + "epoch": 0.566390595778787, + "grad_norm": 10.986533164978027, + "learning_rate": 4.700979230274829e-07, + "logits/chosen": -1.1942667961120605, + "logits/rejected": -1.2509047985076904, + "logps/chosen": -0.5944747924804688, + "logps/rejected": -0.7951716184616089, + "loss": 1.5124, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.486187219619751, + "rewards/margins": 0.5017418265342712, + "rewards/rejected": -1.987929105758667, + "step": 265 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 7.592911720275879, + "learning_rate": 4.6636549719398016e-07, + "logits/chosen": -1.0581759214401245, + "logits/rejected": -1.1627366542816162, + "logps/chosen": -0.4522143006324768, + "logps/rejected": -0.7358817458152771, + "loss": 1.518, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1305358409881592, + "rewards/margins": 0.7091686129570007, + "rewards/rejected": -1.8397043943405151, + "step": 266 + }, + { + "epoch": 0.5706652417846647, + "grad_norm": 6.3432488441467285, + "learning_rate": 4.626349532067879e-07, + "logits/chosen": -1.2379893064498901, + "logits/rejected": -1.2148290872573853, + "logps/chosen": -0.6970117092132568, + "logps/rejected": -0.6927489638328552, + "loss": 1.5069, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7425293922424316, + "rewards/margins": -0.010656729340553284, + "rewards/rejected": -1.73187255859375, + "step": 267 + }, + { + "epoch": 0.5728025647876035, + "grad_norm": 17.149002075195312, + "learning_rate": 4.5890649978938487e-07, + "logits/chosen": -1.0269464254379272, + "logits/rejected": -0.992821216583252, + "logps/chosen": -0.5960928201675415, + "logps/rejected": -0.9721090793609619, + "loss": 1.4951, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.490232229232788, + "rewards/margins": 0.940040647983551, + "rewards/rejected": -2.4302728176116943, + "step": 268 + }, + { + "epoch": 0.5749398877905424, + "grad_norm": 14.956734657287598, + "learning_rate": 4.5518034554828327e-07, + "logits/chosen": -1.1823533773422241, + "logits/rejected": -1.1165564060211182, + "logps/chosen": -0.6763143539428711, + "logps/rejected": -0.6308431029319763, + "loss": 1.6765, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.6907857656478882, + "rewards/margins": -0.11367809772491455, + "rewards/rejected": -1.5771077871322632, + "step": 269 + }, + { + "epoch": 0.5770772107934812, + "grad_norm": 10.486448287963867, + "learning_rate": 4.514566989613559e-07, + "logits/chosen": -0.9095754027366638, + "logits/rejected": -0.9553719162940979, + "logps/chosen": -0.4061740040779114, + "logps/rejected": -0.5486086010932922, + "loss": 1.6421, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0154350996017456, + "rewards/margins": 0.3560864329338074, + "rewards/rejected": -1.3715215921401978, + "step": 270 + }, + { + "epoch": 0.57921453379642, + "grad_norm": 6.875598907470703, + "learning_rate": 4.477357683661733e-07, + "logits/chosen": -1.016709327697754, + "logits/rejected": -1.0593208074569702, + "logps/chosen": -0.9499566555023193, + "logps/rejected": -1.4164179563522339, + "loss": 1.4581, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.374891519546509, + "rewards/margins": 1.1661533117294312, + "rewards/rejected": -3.5410449504852295, + "step": 271 + }, + { + "epoch": 0.5813518567993589, + "grad_norm": 4.006314754486084, + "learning_rate": 4.4401776194834603e-07, + "logits/chosen": -1.007668375968933, + "logits/rejected": -0.886061429977417, + "logps/chosen": -0.8610125780105591, + "logps/rejected": -0.9118555188179016, + "loss": 1.5326, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.152531385421753, + "rewards/margins": 0.12710730731487274, + "rewards/rejected": -2.2796387672424316, + "step": 272 + }, + { + "epoch": 0.5834891798022976, + "grad_norm": 5.665445804595947, + "learning_rate": 4.403028877298779e-07, + "logits/chosen": -0.9741629362106323, + "logits/rejected": -0.9361596703529358, + "logps/chosen": -0.6918255090713501, + "logps/rejected": -0.7569224834442139, + "loss": 1.4643, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.729563593864441, + "rewards/margins": 0.16274265944957733, + "rewards/rejected": -1.8923062086105347, + "step": 273 + }, + { + "epoch": 0.5856265028052364, + "grad_norm": 6.989828109741211, + "learning_rate": 4.3659135355752593e-07, + "logits/chosen": -0.9656753540039062, + "logits/rejected": -0.9632652997970581, + "logps/chosen": -0.719750702381134, + "logps/rejected": -0.7859822511672974, + "loss": 1.6282, + "rewards/accuracies": 0.1875, + "rewards/chosen": -1.7993768453598022, + "rewards/margins": 0.16557902097702026, + "rewards/rejected": -1.9649556875228882, + "step": 274 + }, + { + "epoch": 0.5877638258081752, + "grad_norm": 13.183172225952148, + "learning_rate": 4.328833670911724e-07, + "logits/chosen": -0.7260496616363525, + "logits/rejected": -0.6252482533454895, + "logps/chosen": -0.7889982461929321, + "logps/rejected": -0.7441343069076538, + "loss": 1.5726, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9724955558776855, + "rewards/margins": -0.11215980350971222, + "rewards/rejected": -1.8603358268737793, + "step": 275 + }, + { + "epoch": 0.5899011488111141, + "grad_norm": 4.624598979949951, + "learning_rate": 4.2917913579220553e-07, + "logits/chosen": -1.2080626487731934, + "logits/rejected": -1.0251206159591675, + "logps/chosen": -0.6163781881332397, + "logps/rejected": -0.5924357175827026, + "loss": 1.5521, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.5409454107284546, + "rewards/margins": -0.05985613912343979, + "rewards/rejected": -1.4810893535614014, + "step": 276 + }, + { + "epoch": 0.5920384718140529, + "grad_norm": 5.583042144775391, + "learning_rate": 4.254788669119127e-07, + "logits/chosen": -1.2352536916732788, + "logits/rejected": -1.1895160675048828, + "logps/chosen": -0.6844548583030701, + "logps/rejected": -1.2121189832687378, + "loss": 1.5148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.711137056350708, + "rewards/margins": 1.3191602230072021, + "rewards/rejected": -3.0302975177764893, + "step": 277 + }, + { + "epoch": 0.5941757948169917, + "grad_norm": 7.106680393218994, + "learning_rate": 4.2178276747988444e-07, + "logits/chosen": -1.0981608629226685, + "logits/rejected": -1.097581148147583, + "logps/chosen": -0.7495326995849609, + "logps/rejected": -0.9760425090789795, + "loss": 1.556, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8738317489624023, + "rewards/margins": 0.5662744641304016, + "rewards/rejected": -2.440106153488159, + "step": 278 + }, + { + "epoch": 0.5963131178199306, + "grad_norm": 18.446353912353516, + "learning_rate": 4.180910442924311e-07, + "logits/chosen": -1.1512420177459717, + "logits/rejected": -1.117828607559204, + "logps/chosen": -0.7383052110671997, + "logps/rejected": -0.5387950539588928, + "loss": 1.5904, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.845763087272644, + "rewards/margins": -0.49877557158470154, + "rewards/rejected": -1.3469874858856201, + "step": 279 + }, + { + "epoch": 0.5984504408228694, + "grad_norm": 7.653874397277832, + "learning_rate": 4.144039039010124e-07, + "logits/chosen": -0.9732711315155029, + "logits/rejected": -0.9874970316886902, + "logps/chosen": -0.5405304431915283, + "logps/rejected": -0.5719125866889954, + "loss": 1.4652, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3513259887695312, + "rewards/margins": 0.07845549285411835, + "rewards/rejected": -1.429781436920166, + "step": 280 + }, + { + "epoch": 0.6005877638258081, + "grad_norm": 21.542905807495117, + "learning_rate": 4.107215526006817e-07, + "logits/chosen": -0.9731748700141907, + "logits/rejected": -0.8227044939994812, + "logps/chosen": -0.7879041433334351, + "logps/rejected": -1.0974005460739136, + "loss": 1.5757, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9697604179382324, + "rewards/margins": 0.773740828037262, + "rewards/rejected": -2.7435011863708496, + "step": 281 + }, + { + "epoch": 0.602725086828747, + "grad_norm": 23.93320655822754, + "learning_rate": 4.070441964185427e-07, + "logits/chosen": -0.95560622215271, + "logits/rejected": -0.8649751543998718, + "logps/chosen": -0.7008312940597534, + "logps/rejected": -0.8380212187767029, + "loss": 1.4773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7520781755447388, + "rewards/margins": 0.34297484159469604, + "rewards/rejected": -2.09505295753479, + "step": 282 + }, + { + "epoch": 0.6048624098316858, + "grad_norm": 8.638965606689453, + "learning_rate": 4.0337204110222347e-07, + "logits/chosen": -1.095788598060608, + "logits/rejected": -1.0814367532730103, + "logps/chosen": -1.298113226890564, + "logps/rejected": -0.9762169718742371, + "loss": 1.6721, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.2452828884124756, + "rewards/margins": -0.804740309715271, + "rewards/rejected": -2.440542459487915, + "step": 283 + }, + { + "epoch": 0.6069997328346246, + "grad_norm": 10.137685775756836, + "learning_rate": 3.997052921083636e-07, + "logits/chosen": -0.9736285209655762, + "logits/rejected": -0.9580354690551758, + "logps/chosen": -0.760413408279419, + "logps/rejected": -0.6742191910743713, + "loss": 1.6567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9010334014892578, + "rewards/margins": -0.2154853194952011, + "rewards/rejected": -1.6855480670928955, + "step": 284 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 9.106868743896484, + "learning_rate": 3.960441545911204e-07, + "logits/chosen": -1.0179665088653564, + "logits/rejected": -1.0651259422302246, + "logps/chosen": -0.6087648272514343, + "logps/rejected": -0.8786913156509399, + "loss": 1.4928, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5219119787216187, + "rewards/margins": 0.6748162508010864, + "rewards/rejected": -2.196728467941284, + "step": 285 + }, + { + "epoch": 0.6112743788405023, + "grad_norm": 26.156328201293945, + "learning_rate": 3.92388833390689e-07, + "logits/chosen": -0.7999076843261719, + "logits/rejected": -0.8813593983650208, + "logps/chosen": -0.564832329750061, + "logps/rejected": -0.6178359985351562, + "loss": 1.5268, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4120807647705078, + "rewards/margins": 0.13250917196273804, + "rewards/rejected": -1.544589877128601, + "step": 286 + }, + { + "epoch": 0.6134117018434411, + "grad_norm": 6.264410018920898, + "learning_rate": 3.8873953302184283e-07, + "logits/chosen": -0.9414565563201904, + "logits/rejected": -0.8855915069580078, + "logps/chosen": -0.5107077360153198, + "logps/rejected": -0.4826885461807251, + "loss": 1.6146, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2767692804336548, + "rewards/margins": -0.0700480192899704, + "rewards/rejected": -1.206721305847168, + "step": 287 + }, + { + "epoch": 0.6155490248463799, + "grad_norm": 4.398477554321289, + "learning_rate": 3.8509645766249034e-07, + "logits/chosen": -1.208186388015747, + "logits/rejected": -1.1303879022598267, + "logps/chosen": -0.6073251962661743, + "logps/rejected": -0.7171632647514343, + "loss": 1.4436, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.518312931060791, + "rewards/margins": 0.27459537982940674, + "rewards/rejected": -1.7929084300994873, + "step": 288 + }, + { + "epoch": 0.6176863478493188, + "grad_norm": 7.238714218139648, + "learning_rate": 3.814598111422513e-07, + "logits/chosen": -1.1826701164245605, + "logits/rejected": -1.10812246799469, + "logps/chosen": -0.5312564373016357, + "logps/rejected": -0.8379625678062439, + "loss": 1.4728, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3281410932540894, + "rewards/margins": 0.7667654752731323, + "rewards/rejected": -2.0949063301086426, + "step": 289 + }, + { + "epoch": 0.6198236708522575, + "grad_norm": 9.654561042785645, + "learning_rate": 3.778297969310529e-07, + "logits/chosen": -1.042212724685669, + "logits/rejected": -1.0525404214859009, + "logps/chosen": -0.48385101556777954, + "logps/rejected": -0.5251243114471436, + "loss": 1.5846, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2096275091171265, + "rewards/margins": 0.10318319499492645, + "rewards/rejected": -1.3128107786178589, + "step": 290 + }, + { + "epoch": 0.6219609938551963, + "grad_norm": 7.571155071258545, + "learning_rate": 3.742066181277457e-07, + "logits/chosen": -1.013511061668396, + "logits/rejected": -1.0030831098556519, + "logps/chosen": -0.518464207649231, + "logps/rejected": -0.6727429628372192, + "loss": 1.4901, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2961605787277222, + "rewards/margins": 0.3856966495513916, + "rewards/rejected": -1.6818573474884033, + "step": 291 + }, + { + "epoch": 0.6240983168581352, + "grad_norm": 14.75028133392334, + "learning_rate": 3.7059047744873955e-07, + "logits/chosen": -1.0597176551818848, + "logits/rejected": -1.0602428913116455, + "logps/chosen": -0.406850129365921, + "logps/rejected": -0.6300184726715088, + "loss": 1.5866, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0171253681182861, + "rewards/margins": 0.5579207539558411, + "rewards/rejected": -1.5750460624694824, + "step": 292 + }, + { + "epoch": 0.626235639861074, + "grad_norm": 15.745006561279297, + "learning_rate": 3.669815772166625e-07, + "logits/chosen": -1.2174315452575684, + "logits/rejected": -1.1674164533615112, + "logps/chosen": -0.7850491404533386, + "logps/rejected": -0.519411027431488, + "loss": 1.6349, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.9626227617263794, + "rewards/margins": -0.6640951633453369, + "rewards/rejected": -1.298527717590332, + "step": 293 + }, + { + "epoch": 0.6283729628640128, + "grad_norm": 5.319033622741699, + "learning_rate": 3.6338011934904e-07, + "logits/chosen": -1.2410860061645508, + "logits/rejected": -1.2556794881820679, + "logps/chosen": -0.5623264908790588, + "logps/rejected": -0.6723726987838745, + "loss": 1.5504, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4058163166046143, + "rewards/margins": 0.27511537075042725, + "rewards/rejected": -1.680931568145752, + "step": 294 + }, + { + "epoch": 0.6305102858669517, + "grad_norm": 6.130387783050537, + "learning_rate": 3.5978630534699865e-07, + "logits/chosen": -1.0128402709960938, + "logits/rejected": -0.9643303155899048, + "logps/chosen": -0.7096951007843018, + "logps/rejected": -0.9845619797706604, + "loss": 1.4712, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7742375135421753, + "rewards/margins": 0.6871672868728638, + "rewards/rejected": -2.461405038833618, + "step": 295 + }, + { + "epoch": 0.6326476088698905, + "grad_norm": 5.171699047088623, + "learning_rate": 3.562003362839914e-07, + "logits/chosen": -0.9853148460388184, + "logits/rejected": -0.9575502872467041, + "logps/chosen": -0.5504204034805298, + "logps/rejected": -0.6407392621040344, + "loss": 1.4703, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3760509490966797, + "rewards/margins": 0.22579708695411682, + "rewards/rejected": -1.6018481254577637, + "step": 296 + }, + { + "epoch": 0.6347849318728293, + "grad_norm": 6.909521102905273, + "learning_rate": 3.526224127945478e-07, + "logits/chosen": -1.0654948949813843, + "logits/rejected": -0.9859604239463806, + "logps/chosen": -0.49610620737075806, + "logps/rejected": -0.657010018825531, + "loss": 1.5083, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2402657270431519, + "rewards/margins": 0.40225934982299805, + "rewards/rejected": -1.6425249576568604, + "step": 297 + }, + { + "epoch": 0.6369222548757681, + "grad_norm": 8.302495956420898, + "learning_rate": 3.49052735063049e-07, + "logits/chosen": -0.9159524440765381, + "logits/rejected": -0.9115422964096069, + "logps/chosen": -0.47944843769073486, + "logps/rejected": -0.8350514769554138, + "loss": 1.4795, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.198621153831482, + "rewards/margins": 0.8890076875686646, + "rewards/rejected": -2.0876286029815674, + "step": 298 + }, + { + "epoch": 0.6390595778787069, + "grad_norm": 8.518818855285645, + "learning_rate": 3.454915028125263e-07, + "logits/chosen": -1.1150455474853516, + "logits/rejected": -1.0280684232711792, + "logps/chosen": -0.6259050965309143, + "logps/rejected": -0.7404144406318665, + "loss": 1.6097, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5647625923156738, + "rewards/margins": 0.2862735688686371, + "rewards/rejected": -1.8510361909866333, + "step": 299 + }, + { + "epoch": 0.6411969008816457, + "grad_norm": 5.814731597900391, + "learning_rate": 3.4193891529348795e-07, + "logits/chosen": -0.8910290002822876, + "logits/rejected": -0.887702465057373, + "logps/chosen": -0.5074589848518372, + "logps/rejected": -0.5351721048355103, + "loss": 1.5944, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2686476707458496, + "rewards/margins": 0.06928272545337677, + "rewards/rejected": -1.3379302024841309, + "step": 300 + }, + { + "epoch": 0.6433342238845846, + "grad_norm": 6.027761459350586, + "learning_rate": 3.3839517127277004e-07, + "logits/chosen": -0.9173066020011902, + "logits/rejected": -0.8492714762687683, + "logps/chosen": -0.5284210443496704, + "logps/rejected": -0.9472272992134094, + "loss": 1.4989, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3210527896881104, + "rewards/margins": 1.04701566696167, + "rewards/rejected": -2.368068218231201, + "step": 301 + }, + { + "epoch": 0.6454715468875234, + "grad_norm": 14.567846298217773, + "learning_rate": 3.348604690224166e-07, + "logits/chosen": -1.006145715713501, + "logits/rejected": -1.1280263662338257, + "logps/chosen": -0.552309513092041, + "logps/rejected": -0.8673663139343262, + "loss": 1.4528, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3807737827301025, + "rewards/margins": 0.7876418232917786, + "rewards/rejected": -2.1684157848358154, + "step": 302 + }, + { + "epoch": 0.6476088698904622, + "grad_norm": 6.112685203552246, + "learning_rate": 3.31335006308585e-07, + "logits/chosen": -1.1263082027435303, + "logits/rejected": -1.1103566884994507, + "logps/chosen": -0.5413545966148376, + "logps/rejected": -0.5674502849578857, + "loss": 1.5207, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3533867597579956, + "rewards/margins": 0.06523902714252472, + "rewards/rejected": -1.4186255931854248, + "step": 303 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 8.962607383728027, + "learning_rate": 3.2781898038048237e-07, + "logits/chosen": -1.2004753351211548, + "logits/rejected": -1.023838758468628, + "logps/chosen": -0.4371573030948639, + "logps/rejected": -0.4363090991973877, + "loss": 1.5006, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.092893362045288, + "rewards/margins": -0.0021204352378845215, + "rewards/rejected": -1.0907728672027588, + "step": 304 + }, + { + "epoch": 0.6518835158963399, + "grad_norm": 4.282663822174072, + "learning_rate": 3.243125879593286e-07, + "logits/chosen": -1.0454260110855103, + "logits/rejected": -1.0447431802749634, + "logps/chosen": -0.624133288860321, + "logps/rejected": -0.8872759342193604, + "loss": 1.4943, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5603333711624146, + "rewards/margins": 0.6578565239906311, + "rewards/rejected": -2.2181897163391113, + "step": 305 + }, + { + "epoch": 0.6540208388992786, + "grad_norm": 5.838572978973389, + "learning_rate": 3.2081602522734985e-07, + "logits/chosen": -1.0571078062057495, + "logits/rejected": -1.1394490003585815, + "logps/chosen": -0.5408369302749634, + "logps/rejected": -0.632168710231781, + "loss": 1.4793, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3520923852920532, + "rewards/margins": 0.22832949459552765, + "rewards/rejected": -1.580422043800354, + "step": 306 + }, + { + "epoch": 0.6561581619022174, + "grad_norm": 8.524316787719727, + "learning_rate": 3.173294878168025e-07, + "logits/chosen": -1.079796552658081, + "logits/rejected": -1.080225944519043, + "logps/chosen": -0.5492815971374512, + "logps/rejected": -0.7810524702072144, + "loss": 1.5853, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.373204231262207, + "rewards/margins": 0.5794269442558289, + "rewards/rejected": -1.9526311159133911, + "step": 307 + }, + { + "epoch": 0.6582954849051563, + "grad_norm": 4.74807071685791, + "learning_rate": 3.138531707990274e-07, + "logits/chosen": -0.903827428817749, + "logits/rejected": -1.0083296298980713, + "logps/chosen": -0.5046001672744751, + "logps/rejected": -0.6112987995147705, + "loss": 1.4597, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2615002393722534, + "rewards/margins": 0.2667466402053833, + "rewards/rejected": -1.5282469987869263, + "step": 308 + }, + { + "epoch": 0.6604328079080951, + "grad_norm": 5.756943702697754, + "learning_rate": 3.1038726867353583e-07, + "logits/chosen": -1.2377115488052368, + "logits/rejected": -1.2392162084579468, + "logps/chosen": -0.5622467398643494, + "logps/rejected": -0.7166787385940552, + "loss": 1.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4056168794631958, + "rewards/margins": 0.3860801160335541, + "rewards/rejected": -1.7916970252990723, + "step": 309 + }, + { + "epoch": 0.6625701309110339, + "grad_norm": 10.2656831741333, + "learning_rate": 3.069319753571269e-07, + "logits/chosen": -0.8976331949234009, + "logits/rejected": -0.9083345532417297, + "logps/chosen": -0.45293816924095154, + "logps/rejected": -0.57112717628479, + "loss": 1.6145, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1323453187942505, + "rewards/margins": 0.29547253251075745, + "rewards/rejected": -1.427817940711975, + "step": 310 + }, + { + "epoch": 0.6647074539139728, + "grad_norm": 9.763343811035156, + "learning_rate": 3.034874841730382e-07, + "logits/chosen": -1.4269440174102783, + "logits/rejected": -1.2256879806518555, + "logps/chosen": -0.48573237657546997, + "logps/rejected": -0.7592737674713135, + "loss": 1.5495, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.214331030845642, + "rewards/margins": 0.6838533878326416, + "rewards/rejected": -1.8981844186782837, + "step": 311 + }, + { + "epoch": 0.6668447769169116, + "grad_norm": 5.243341445922852, + "learning_rate": 3.000539878401296e-07, + "logits/chosen": -1.1378743648529053, + "logits/rejected": -1.1113413572311401, + "logps/chosen": -0.5226577520370483, + "logps/rejected": -0.47466573119163513, + "loss": 1.538, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3066444396972656, + "rewards/margins": -0.11998005956411362, + "rewards/rejected": -1.186664342880249, + "step": 312 + }, + { + "epoch": 0.6689820999198504, + "grad_norm": 8.089217185974121, + "learning_rate": 2.9663167846209996e-07, + "logits/chosen": -1.2200907468795776, + "logits/rejected": -1.200815200805664, + "logps/chosen": -0.5275171399116516, + "logps/rejected": -0.6412789821624756, + "loss": 1.4825, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3187928199768066, + "rewards/margins": 0.28440454602241516, + "rewards/rejected": -1.6031973361968994, + "step": 313 + }, + { + "epoch": 0.6711194229227893, + "grad_norm": 5.0957417488098145, + "learning_rate": 2.9322074751673974e-07, + "logits/chosen": -1.0893193483352661, + "logits/rejected": -1.135164499282837, + "logps/chosen": -0.573096752166748, + "logps/rejected": -0.6840115785598755, + "loss": 1.5689, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4327419996261597, + "rewards/margins": 0.27728694677352905, + "rewards/rejected": -1.710028886795044, + "step": 314 + }, + { + "epoch": 0.673256745925728, + "grad_norm": 6.795984745025635, + "learning_rate": 2.898213858452173e-07, + "logits/chosen": -1.0072894096374512, + "logits/rejected": -0.949766993522644, + "logps/chosen": -0.6433035135269165, + "logps/rejected": -0.7086694836616516, + "loss": 1.5963, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6082587242126465, + "rewards/margins": 0.16341499984264374, + "rewards/rejected": -1.7716736793518066, + "step": 315 + }, + { + "epoch": 0.6753940689286668, + "grad_norm": 5.6735053062438965, + "learning_rate": 2.864337836414018e-07, + "logits/chosen": -0.9407764077186584, + "logits/rejected": -1.0202523469924927, + "logps/chosen": -0.47564423084259033, + "logps/rejected": -0.8013235330581665, + "loss": 1.5025, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1891103982925415, + "rewards/margins": 0.8141986131668091, + "rewards/rejected": -2.0033090114593506, + "step": 316 + }, + { + "epoch": 0.6775313919316056, + "grad_norm": 4.4380059242248535, + "learning_rate": 2.8305813044122093e-07, + "logits/chosen": -1.073567271232605, + "logits/rejected": -1.010934591293335, + "logps/chosen": -0.412260502576828, + "logps/rejected": -0.5673598051071167, + "loss": 1.4443, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.030651330947876, + "rewards/margins": 0.38774824142456055, + "rewards/rejected": -1.4183995723724365, + "step": 317 + }, + { + "epoch": 0.6796687149345445, + "grad_norm": 11.309561729431152, + "learning_rate": 2.7969461511205806e-07, + "logits/chosen": -1.146227478981018, + "logits/rejected": -1.0426236391067505, + "logps/chosen": -0.766873300075531, + "logps/rejected": -0.9035167694091797, + "loss": 1.4907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9171831607818604, + "rewards/margins": 0.34160885214805603, + "rewards/rejected": -2.258791923522949, + "step": 318 + }, + { + "epoch": 0.6818060379374833, + "grad_norm": 6.63892126083374, + "learning_rate": 2.763434258421836e-07, + "logits/chosen": -1.126558542251587, + "logits/rejected": -1.1528400182724, + "logps/chosen": -0.5386890172958374, + "logps/rejected": -0.9026880264282227, + "loss": 1.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3467226028442383, + "rewards/margins": 0.9099973440170288, + "rewards/rejected": -2.2567198276519775, + "step": 319 + }, + { + "epoch": 0.6839433609404221, + "grad_norm": 5.889532566070557, + "learning_rate": 2.730047501302266e-07, + "logits/chosen": -1.1312450170516968, + "logits/rejected": -1.0725492238998413, + "logps/chosen": -0.4850122928619385, + "logps/rejected": -0.5434764623641968, + "loss": 1.5617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2125308513641357, + "rewards/margins": 0.14616040885448456, + "rewards/rejected": -1.3586912155151367, + "step": 320 + }, + { + "epoch": 0.686080683943361, + "grad_norm": 4.353449821472168, + "learning_rate": 2.696787747746839e-07, + "logits/chosen": -0.8624957203865051, + "logits/rejected": -0.8295655846595764, + "logps/chosen": -0.40189000964164734, + "logps/rejected": -0.47676557302474976, + "loss": 1.4684, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0047250986099243, + "rewards/margins": 0.18718883395195007, + "rewards/rejected": -1.1919138431549072, + "step": 321 + }, + { + "epoch": 0.6882180069462998, + "grad_norm": 9.558797836303711, + "learning_rate": 2.6636568586346897e-07, + "logits/chosen": -0.9744606614112854, + "logits/rejected": -0.9377647042274475, + "logps/chosen": -0.5311119556427002, + "logps/rejected": -0.5584622621536255, + "loss": 1.5027, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3277798891067505, + "rewards/margins": 0.06837557256221771, + "rewards/rejected": -1.3961554765701294, + "step": 322 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 5.22324275970459, + "learning_rate": 2.6306566876350067e-07, + "logits/chosen": -1.044816017150879, + "logits/rejected": -1.068107008934021, + "logps/chosen": -0.5444616079330444, + "logps/rejected": -0.6575828790664673, + "loss": 1.5442, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3611540794372559, + "rewards/margins": 0.2828032672405243, + "rewards/rejected": -1.6439573764801025, + "step": 323 + }, + { + "epoch": 0.6924926529521774, + "grad_norm": 5.757807731628418, + "learning_rate": 2.597789081103313e-07, + "logits/chosen": -1.0435606241226196, + "logits/rejected": -1.078909158706665, + "logps/chosen": -0.4910418689250946, + "logps/rejected": -0.6150948405265808, + "loss": 1.5089, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.227604627609253, + "rewards/margins": 0.3101324737071991, + "rewards/rejected": -1.5377371311187744, + "step": 324 + }, + { + "epoch": 0.6946299759551162, + "grad_norm": 5.650123119354248, + "learning_rate": 2.5650558779781635e-07, + "logits/chosen": -0.9889379739761353, + "logits/rejected": -1.084942102432251, + "logps/chosen": -0.5245276689529419, + "logps/rejected": -0.564633846282959, + "loss": 1.4216, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.311319351196289, + "rewards/margins": 0.10026533901691437, + "rewards/rejected": -1.4115846157073975, + "step": 325 + }, + { + "epoch": 0.696767298958055, + "grad_norm": 5.227182865142822, + "learning_rate": 2.5324589096782656e-07, + "logits/chosen": -1.1572788953781128, + "logits/rejected": -0.9472928047180176, + "logps/chosen": -0.6365960836410522, + "logps/rejected": -0.8092702031135559, + "loss": 1.5535, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.5914900302886963, + "rewards/margins": 0.43168535828590393, + "rewards/rejected": -2.0231754779815674, + "step": 326 + }, + { + "epoch": 0.6989046219609939, + "grad_norm": 9.626143455505371, + "learning_rate": 2.500000000000001e-07, + "logits/chosen": -0.9521257281303406, + "logits/rejected": -1.0960386991500854, + "logps/chosen": -0.5820955634117126, + "logps/rejected": -0.8665407299995422, + "loss": 1.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.455238938331604, + "rewards/margins": 0.7111131548881531, + "rewards/rejected": -2.1663520336151123, + "step": 327 + }, + { + "epoch": 0.7010419449639327, + "grad_norm": 8.55288028717041, + "learning_rate": 2.467680965015387e-07, + "logits/chosen": -1.1446974277496338, + "logits/rejected": -1.0407241582870483, + "logps/chosen": -0.579289972782135, + "logps/rejected": -0.6028792858123779, + "loss": 1.5148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4482249021530151, + "rewards/margins": 0.058973364531993866, + "rewards/rejected": -1.5071980953216553, + "step": 328 + }, + { + "epoch": 0.7031792679668715, + "grad_norm": 9.617560386657715, + "learning_rate": 2.435503612970469e-07, + "logits/chosen": -1.177872657775879, + "logits/rejected": -1.0678505897521973, + "logps/chosen": -0.5095410346984863, + "logps/rejected": -0.559869110584259, + "loss": 1.4961, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2738525867462158, + "rewards/margins": 0.12582018971443176, + "rewards/rejected": -1.3996728658676147, + "step": 329 + }, + { + "epoch": 0.7053165909698104, + "grad_norm": 6.568669319152832, + "learning_rate": 2.403469744184154e-07, + "logits/chosen": -1.017891764640808, + "logits/rejected": -1.0297455787658691, + "logps/chosen": -0.552146852016449, + "logps/rejected": -1.3806557655334473, + "loss": 1.3951, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3803672790527344, + "rewards/margins": 2.071272373199463, + "rewards/rejected": -3.4516396522521973, + "step": 330 + }, + { + "epoch": 0.7074539139727491, + "grad_norm": 5.693061828613281, + "learning_rate": 2.371581150947476e-07, + "logits/chosen": -1.121291160583496, + "logits/rejected": -1.0978386402130127, + "logps/chosen": -0.7388095855712891, + "logps/rejected": -0.7743204236030579, + "loss": 1.4663, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8470239639282227, + "rewards/margins": 0.08877717703580856, + "rewards/rejected": -1.9358010292053223, + "step": 331 + }, + { + "epoch": 0.7095912369756879, + "grad_norm": 6.6337151527404785, + "learning_rate": 2.3398396174233176e-07, + "logits/chosen": -1.0051803588867188, + "logits/rejected": -1.1046079397201538, + "logps/chosen": -0.5532403588294983, + "logps/rejected": -0.8187743425369263, + "loss": 1.547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.383100986480713, + "rewards/margins": 0.6638349294662476, + "rewards/rejected": -2.046935796737671, + "step": 332 + }, + { + "epoch": 0.7117285599786267, + "grad_norm": 7.117689609527588, + "learning_rate": 2.3082469195465893e-07, + "logits/chosen": -1.0851317644119263, + "logits/rejected": -1.0983937978744507, + "logps/chosen": -0.4725731611251831, + "logps/rejected": -0.5834302306175232, + "loss": 1.3948, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.181432843208313, + "rewards/margins": 0.2771427631378174, + "rewards/rejected": -1.4585756063461304, + "step": 333 + }, + { + "epoch": 0.7138658829815656, + "grad_norm": 6.473294258117676, + "learning_rate": 2.2768048249248644e-07, + "logits/chosen": -0.9775727987289429, + "logits/rejected": -0.9463880658149719, + "logps/chosen": -0.6236757040023804, + "logps/rejected": -0.6484177112579346, + "loss": 1.512, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.5591893196105957, + "rewards/margins": 0.061854973435401917, + "rewards/rejected": -1.6210441589355469, + "step": 334 + }, + { + "epoch": 0.7160032059845044, + "grad_norm": 5.626604080200195, + "learning_rate": 2.2455150927394878e-07, + "logits/chosen": -1.044307827949524, + "logits/rejected": -1.1238620281219482, + "logps/chosen": -0.5363501906394958, + "logps/rejected": -0.6088119745254517, + "loss": 1.5171, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.340875506401062, + "rewards/margins": 0.18115444481372833, + "rewards/rejected": -1.5220298767089844, + "step": 335 + }, + { + "epoch": 0.7181405289874432, + "grad_norm": 7.3833513259887695, + "learning_rate": 2.2143794736471388e-07, + "logits/chosen": -1.03908371925354, + "logits/rejected": -1.1508772373199463, + "logps/chosen": -0.46650949120521545, + "logps/rejected": -0.8443108797073364, + "loss": 1.4405, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.166273593902588, + "rewards/margins": 0.944503664970398, + "rewards/rejected": -2.1107773780822754, + "step": 336 + }, + { + "epoch": 0.7202778519903821, + "grad_norm": 6.9946184158325195, + "learning_rate": 2.1833997096818895e-07, + "logits/chosen": -1.017868161201477, + "logits/rejected": -0.9449481964111328, + "logps/chosen": -0.5905267000198364, + "logps/rejected": -0.6067604422569275, + "loss": 1.6098, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4763169288635254, + "rewards/margins": 0.04058411717414856, + "rewards/rejected": -1.5169010162353516, + "step": 337 + }, + { + "epoch": 0.7224151749933209, + "grad_norm": 6.809935569763184, + "learning_rate": 2.1525775341577402e-07, + "logits/chosen": -0.874271035194397, + "logits/rejected": -0.7729781270027161, + "logps/chosen": -0.42772772908210754, + "logps/rejected": -0.5716719627380371, + "loss": 1.3901, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0693193674087524, + "rewards/margins": 0.35986068844795227, + "rewards/rejected": -1.4291800260543823, + "step": 338 + }, + { + "epoch": 0.7245524979962596, + "grad_norm": 5.1241021156311035, + "learning_rate": 2.121914671571633e-07, + "logits/chosen": -1.1260241270065308, + "logits/rejected": -1.147855520248413, + "logps/chosen": -0.6018639206886292, + "logps/rejected": -0.8497560024261475, + "loss": 1.574, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5046600103378296, + "rewards/margins": 0.6197301149368286, + "rewards/rejected": -2.124390125274658, + "step": 339 + }, + { + "epoch": 0.7266898209991985, + "grad_norm": 11.285929679870605, + "learning_rate": 2.0914128375069722e-07, + "logits/chosen": -1.0443346500396729, + "logits/rejected": -0.9825816750526428, + "logps/chosen": -0.5199556350708008, + "logps/rejected": -0.5952978134155273, + "loss": 1.5966, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.299889087677002, + "rewards/margins": 0.1883554458618164, + "rewards/rejected": -1.4882445335388184, + "step": 340 + }, + { + "epoch": 0.7288271440021373, + "grad_norm": 6.03069543838501, + "learning_rate": 2.0610737385376348e-07, + "logits/chosen": -1.0298320055007935, + "logits/rejected": -0.9670383930206299, + "logps/chosen": -0.5948835015296936, + "logps/rejected": -0.7645187377929688, + "loss": 1.4949, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.4872087240219116, + "rewards/margins": 0.42408809065818787, + "rewards/rejected": -1.9112968444824219, + "step": 341 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 8.48175048828125, + "learning_rate": 2.0308990721324926e-07, + "logits/chosen": -0.7453219294548035, + "logits/rejected": -0.7619246244430542, + "logps/chosen": -0.4561096131801605, + "logps/rejected": -0.5257768034934998, + "loss": 1.477, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1402740478515625, + "rewards/margins": 0.17416785657405853, + "rewards/rejected": -1.3144419193267822, + "step": 342 + }, + { + "epoch": 0.733101790008015, + "grad_norm": 4.618956089019775, + "learning_rate": 2.0008905265604315e-07, + "logits/chosen": -1.058908462524414, + "logits/rejected": -1.0160064697265625, + "logps/chosen": -0.7591407895088196, + "logps/rejected": -0.741392195224762, + "loss": 1.4273, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8978519439697266, + "rewards/margins": -0.04437139630317688, + "rewards/rejected": -1.853480339050293, + "step": 343 + }, + { + "epoch": 0.7352391130109538, + "grad_norm": 4.5929155349731445, + "learning_rate": 1.971049780795901e-07, + "logits/chosen": -1.0634278059005737, + "logits/rejected": -1.0275781154632568, + "logps/chosen": -0.6927691102027893, + "logps/rejected": -0.7188006639480591, + "loss": 1.4542, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.7319227457046509, + "rewards/margins": 0.06507880985736847, + "rewards/rejected": -1.7970017194747925, + "step": 344 + }, + { + "epoch": 0.7373764360138926, + "grad_norm": 7.66823673248291, + "learning_rate": 1.9413785044249676e-07, + "logits/chosen": -1.0500935316085815, + "logits/rejected": -0.9972812533378601, + "logps/chosen": -0.6515956521034241, + "logps/rejected": -0.7696123123168945, + "loss": 1.509, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.6289891004562378, + "rewards/margins": 0.29504185914993286, + "rewards/rejected": -1.9240310192108154, + "step": 345 + }, + { + "epoch": 0.7395137590168315, + "grad_norm": 13.36498737335205, + "learning_rate": 1.9118783575519109e-07, + "logits/chosen": -1.2645362615585327, + "logits/rejected": -1.1548783779144287, + "logps/chosen": -0.6232104301452637, + "logps/rejected": -0.6466085910797119, + "loss": 1.5575, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5580260753631592, + "rewards/margins": 0.058495476841926575, + "rewards/rejected": -1.6165215969085693, + "step": 346 + }, + { + "epoch": 0.7416510820197703, + "grad_norm": 6.211390018463135, + "learning_rate": 1.8825509907063326e-07, + "logits/chosen": -0.987378716468811, + "logits/rejected": -0.9803218841552734, + "logps/chosen": -0.6375169157981873, + "logps/rejected": -0.8728115558624268, + "loss": 1.518, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.593792200088501, + "rewards/margins": 0.5882365107536316, + "rewards/rejected": -2.1820287704467773, + "step": 347 + }, + { + "epoch": 0.743788405022709, + "grad_norm": 7.908841609954834, + "learning_rate": 1.8533980447508135e-07, + "logits/chosen": -0.9738492965698242, + "logits/rejected": -1.0749965906143188, + "logps/chosen": -0.43783146142959595, + "logps/rejected": -0.7721287608146667, + "loss": 1.4855, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.094578742980957, + "rewards/margins": 0.8357431888580322, + "rewards/rejected": -1.9303218126296997, + "step": 348 + }, + { + "epoch": 0.7459257280256478, + "grad_norm": 4.574754238128662, + "learning_rate": 1.824421150789106e-07, + "logits/chosen": -1.07004714012146, + "logits/rejected": -1.0875684022903442, + "logps/chosen": -0.5463494062423706, + "logps/rejected": -0.5291048288345337, + "loss": 1.5418, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.3658735752105713, + "rewards/margins": -0.04311151057481766, + "rewards/rejected": -1.3227618932724, + "step": 349 + }, + { + "epoch": 0.7480630510285867, + "grad_norm": 7.958865642547607, + "learning_rate": 1.7956219300748792e-07, + "logits/chosen": -1.0490962266921997, + "logits/rejected": -1.0383286476135254, + "logps/chosen": -0.5858332514762878, + "logps/rejected": -0.6578047275543213, + "loss": 1.5021, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.464583158493042, + "rewards/margins": 0.1799287348985672, + "rewards/rejected": -1.6445120573043823, + "step": 350 + }, + { + "epoch": 0.7502003740315255, + "grad_norm": 6.273632526397705, + "learning_rate": 1.7670019939210023e-07, + "logits/chosen": -1.12949538230896, + "logits/rejected": -1.0367292165756226, + "logps/chosen": -0.4628137946128845, + "logps/rejected": -0.4637930393218994, + "loss": 1.4832, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1570343971252441, + "rewards/margins": 0.00244816392660141, + "rewards/rejected": -1.1594825983047485, + "step": 351 + }, + { + "epoch": 0.7523376970344643, + "grad_norm": 5.706187725067139, + "learning_rate": 1.7385629436093956e-07, + "logits/chosen": -1.0996813774108887, + "logits/rejected": -1.0577008724212646, + "logps/chosen": -0.7115722298622131, + "logps/rejected": -0.7016228437423706, + "loss": 1.5491, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7789306640625, + "rewards/margins": -0.024873413145542145, + "rewards/rejected": -1.7540570497512817, + "step": 352 + }, + { + "epoch": 0.7544750200374032, + "grad_norm": 9.662369728088379, + "learning_rate": 1.710306370301437e-07, + "logits/chosen": -1.0781750679016113, + "logits/rejected": -0.9331997036933899, + "logps/chosen": -0.5081960558891296, + "logps/rejected": -0.5510014295578003, + "loss": 1.5991, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.270490050315857, + "rewards/margins": 0.10701363533735275, + "rewards/rejected": -1.3775036334991455, + "step": 353 + }, + { + "epoch": 0.756612343040342, + "grad_norm": 4.557639122009277, + "learning_rate": 1.6822338549489446e-07, + "logits/chosen": -1.024949550628662, + "logits/rejected": -1.026584267616272, + "logps/chosen": -0.46658557653427124, + "logps/rejected": -0.6099130511283875, + "loss": 1.453, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1664639711380005, + "rewards/margins": 0.3583187162876129, + "rewards/rejected": -1.524782657623291, + "step": 354 + }, + { + "epoch": 0.7587496660432808, + "grad_norm": 6.783056259155273, + "learning_rate": 1.6543469682057104e-07, + "logits/chosen": -1.0237020254135132, + "logits/rejected": -1.01318359375, + "logps/chosen": -0.5265443921089172, + "logps/rejected": -0.5865951776504517, + "loss": 1.4962, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3163609504699707, + "rewards/margins": 0.15012702345848083, + "rewards/rejected": -1.466488003730774, + "step": 355 + }, + { + "epoch": 0.7608869890462197, + "grad_norm": 8.898886680603027, + "learning_rate": 1.6266472703396284e-07, + "logits/chosen": -1.1975302696228027, + "logits/rejected": -1.0899324417114258, + "logps/chosen": -0.6463479399681091, + "logps/rejected": -1.0257855653762817, + "loss": 1.4989, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6158698797225952, + "rewards/margins": 0.9485940337181091, + "rewards/rejected": -2.5644638538360596, + "step": 356 + }, + { + "epoch": 0.7630243120491584, + "grad_norm": 11.671951293945312, + "learning_rate": 1.599136311145402e-07, + "logits/chosen": -1.1834355592727661, + "logits/rejected": -1.1248061656951904, + "logps/chosen": -0.7515526413917542, + "logps/rejected": -0.8097901344299316, + "loss": 1.5244, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.878881573677063, + "rewards/margins": 0.1455937623977661, + "rewards/rejected": -2.024475336074829, + "step": 357 + }, + { + "epoch": 0.7651616350520972, + "grad_norm": 5.740311622619629, + "learning_rate": 1.5718156298578288e-07, + "logits/chosen": -1.0501099824905396, + "logits/rejected": -1.0831395387649536, + "logps/chosen": -0.4516759216785431, + "logps/rejected": -0.547008752822876, + "loss": 1.4943, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1291898488998413, + "rewards/margins": 0.2383321225643158, + "rewards/rejected": -1.3675217628479004, + "step": 358 + }, + { + "epoch": 0.7672989580550361, + "grad_norm": 8.075958251953125, + "learning_rate": 1.5446867550656767e-07, + "logits/chosen": -1.0409198999404907, + "logits/rejected": -1.2201882600784302, + "logps/chosen": -0.620537519454956, + "logps/rejected": -1.1603175401687622, + "loss": 1.4869, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5513437986373901, + "rewards/margins": 1.3494499921798706, + "rewards/rejected": -2.90079402923584, + "step": 359 + }, + { + "epoch": 0.7694362810579749, + "grad_norm": 5.148620128631592, + "learning_rate": 1.5177512046261666e-07, + "logits/chosen": -0.7799254655838013, + "logits/rejected": -0.8649179935455322, + "logps/chosen": -0.5494642853736877, + "logps/rejected": -0.8911303281784058, + "loss": 1.4401, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3736608028411865, + "rewards/margins": 0.8541650772094727, + "rewards/rejected": -2.227825880050659, + "step": 360 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 5.721947193145752, + "learning_rate": 1.4910104855800426e-07, + "logits/chosen": -1.1887235641479492, + "logits/rejected": -1.0089201927185059, + "logps/chosen": -0.5180599689483643, + "logps/rejected": -0.49416518211364746, + "loss": 1.608, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2951500415802002, + "rewards/margins": -0.05973710119724274, + "rewards/rejected": -1.2354129552841187, + "step": 361 + }, + { + "epoch": 0.7737109270638525, + "grad_norm": 5.896087169647217, + "learning_rate": 1.4644660940672627e-07, + "logits/chosen": -1.0971053838729858, + "logits/rejected": -1.021689772605896, + "logps/chosen": -0.4189419448375702, + "logps/rejected": -0.5696815848350525, + "loss": 1.4472, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.047354817390442, + "rewards/margins": 0.37684905529022217, + "rewards/rejected": -1.424203872680664, + "step": 362 + }, + { + "epoch": 0.7758482500667914, + "grad_norm": 8.45847225189209, + "learning_rate": 1.4381195152432769e-07, + "logits/chosen": -0.8624813556671143, + "logits/rejected": -0.8767452239990234, + "logps/chosen": -0.615929126739502, + "logps/rejected": -0.6732630729675293, + "loss": 1.4672, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5398229360580444, + "rewards/margins": 0.14333483576774597, + "rewards/rejected": -1.6831578016281128, + "step": 363 + }, + { + "epoch": 0.7779855730697302, + "grad_norm": 6.6143927574157715, + "learning_rate": 1.4119722231959403e-07, + "logits/chosen": -1.0300284624099731, + "logits/rejected": -1.08317232131958, + "logps/chosen": -0.5381268858909607, + "logps/rejected": -0.5894446969032288, + "loss": 1.5538, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3453172445297241, + "rewards/margins": 0.12829461693763733, + "rewards/rejected": -1.473611831665039, + "step": 364 + }, + { + "epoch": 0.7801228960726689, + "grad_norm": 6.556136608123779, + "learning_rate": 1.3860256808630427e-07, + "logits/chosen": -1.0168136358261108, + "logits/rejected": -1.0091896057128906, + "logps/chosen": -0.8003075122833252, + "logps/rejected": -0.7712854146957397, + "loss": 1.4586, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0007686614990234, + "rewards/margins": -0.07255513966083527, + "rewards/rejected": -1.9282134771347046, + "step": 365 + }, + { + "epoch": 0.7822602190756078, + "grad_norm": 4.969550609588623, + "learning_rate": 1.3602813399504458e-07, + "logits/chosen": -1.1460951566696167, + "logits/rejected": -1.0588514804840088, + "logps/chosen": -0.6409847736358643, + "logps/rejected": -0.6086816787719727, + "loss": 1.5951, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.6024619340896606, + "rewards/margins": -0.08075767010450363, + "rewards/rejected": -1.5217043161392212, + "step": 366 + }, + { + "epoch": 0.7843975420785466, + "grad_norm": 7.199689865112305, + "learning_rate": 1.3347406408508694e-07, + "logits/chosen": -1.2313283681869507, + "logits/rejected": -1.2684742212295532, + "logps/chosen": -0.60216224193573, + "logps/rejected": -0.8374693393707275, + "loss": 1.4751, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5054056644439697, + "rewards/margins": 0.5882677435874939, + "rewards/rejected": -2.0936732292175293, + "step": 367 + }, + { + "epoch": 0.7865348650814854, + "grad_norm": 8.949746131896973, + "learning_rate": 1.3094050125632972e-07, + "logits/chosen": -0.9064935445785522, + "logits/rejected": -0.9444929957389832, + "logps/chosen": -0.45091521739959717, + "logps/rejected": -0.5422642230987549, + "loss": 1.4476, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1272881031036377, + "rewards/margins": 0.22837252914905548, + "rewards/rejected": -1.3556605577468872, + "step": 368 + }, + { + "epoch": 0.7886721880844243, + "grad_norm": 4.023352146148682, + "learning_rate": 1.284275872613028e-07, + "logits/chosen": -1.0363601446151733, + "logits/rejected": -0.9367664456367493, + "logps/chosen": -0.4743805229663849, + "logps/rejected": -0.448375940322876, + "loss": 1.5308, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1859513521194458, + "rewards/margins": -0.06501153111457825, + "rewards/rejected": -1.12093985080719, + "step": 369 + }, + { + "epoch": 0.7908095110873631, + "grad_norm": 9.706334114074707, + "learning_rate": 1.2593546269723647e-07, + "logits/chosen": -1.0525028705596924, + "logits/rejected": -1.0606484413146973, + "logps/chosen": -0.5636960864067078, + "logps/rejected": -0.6165448427200317, + "loss": 1.5128, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4092402458190918, + "rewards/margins": 0.13212202489376068, + "rewards/rejected": -1.5413621664047241, + "step": 370 + }, + { + "epoch": 0.7929468340903019, + "grad_norm": 11.990076065063477, + "learning_rate": 1.2346426699819456e-07, + "logits/chosen": -1.1552562713623047, + "logits/rejected": -1.0523865222930908, + "logps/chosen": -0.615561842918396, + "logps/rejected": -0.8749452233314514, + "loss": 1.4922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5389046669006348, + "rewards/margins": 0.6484581232070923, + "rewards/rejected": -2.1873626708984375, + "step": 371 + }, + { + "epoch": 0.7950841570932408, + "grad_norm": 4.068089485168457, + "learning_rate": 1.2101413842727343e-07, + "logits/chosen": -1.1697150468826294, + "logits/rejected": -1.2012141942977905, + "logps/chosen": -0.5214999914169312, + "logps/rejected": -0.6672156453132629, + "loss": 1.4757, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3037500381469727, + "rewards/margins": 0.3642891049385071, + "rewards/rejected": -1.668039083480835, + "step": 372 + }, + { + "epoch": 0.7972214800961795, + "grad_norm": 8.035099983215332, + "learning_rate": 1.1858521406886674e-07, + "logits/chosen": -0.7799820899963379, + "logits/rejected": -0.8855147361755371, + "logps/chosen": -0.4354492723941803, + "logps/rejected": -0.508285641670227, + "loss": 1.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0886231660842896, + "rewards/margins": 0.18209105730056763, + "rewards/rejected": -1.270714282989502, + "step": 373 + }, + { + "epoch": 0.7993588030991183, + "grad_norm": 11.540173530578613, + "learning_rate": 1.1617762982099444e-07, + "logits/chosen": -1.2532049417495728, + "logits/rejected": -1.293265700340271, + "logps/chosen": -0.6085352897644043, + "logps/rejected": -0.6329178810119629, + "loss": 1.5638, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5213382244110107, + "rewards/margins": 0.06095648184418678, + "rewards/rejected": -1.5822947025299072, + "step": 374 + }, + { + "epoch": 0.8014961261020572, + "grad_norm": 6.516201019287109, + "learning_rate": 1.1379152038770029e-07, + "logits/chosen": -0.9992303848266602, + "logits/rejected": -1.0280938148498535, + "logps/chosen": -0.5312505960464478, + "logps/rejected": -0.7041913866996765, + "loss": 1.5746, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3281266689300537, + "rewards/margins": 0.432352215051651, + "rewards/rejected": -1.7604787349700928, + "step": 375 + }, + { + "epoch": 0.803633449104996, + "grad_norm": 9.061089515686035, + "learning_rate": 1.1142701927151454e-07, + "logits/chosen": -1.1462222337722778, + "logits/rejected": -0.9769701361656189, + "logps/chosen": -0.5302731990814209, + "logps/rejected": -0.4824178218841553, + "loss": 1.5691, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3256831169128418, + "rewards/margins": -0.11963848769664764, + "rewards/rejected": -1.2060445547103882, + "step": 376 + }, + { + "epoch": 0.8057707721079348, + "grad_norm": 15.69382381439209, + "learning_rate": 1.090842587659851e-07, + "logits/chosen": -1.1206320524215698, + "logits/rejected": -1.034964919090271, + "logps/chosen": -0.6023938655853271, + "logps/rejected": -0.5703404545783997, + "loss": 1.534, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5059847831726074, + "rewards/margins": -0.08013379573822021, + "rewards/rejected": -1.4258509874343872, + "step": 377 + }, + { + "epoch": 0.8079080951108736, + "grad_norm": 6.683454990386963, + "learning_rate": 1.0676336994827512e-07, + "logits/chosen": -1.093888282775879, + "logits/rejected": -1.2383679151535034, + "logps/chosen": -0.38456153869628906, + "logps/rejected": -0.6876378655433655, + "loss": 1.5004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9614039063453674, + "rewards/margins": 0.7576908469200134, + "rewards/rejected": -1.7190947532653809, + "step": 378 + }, + { + "epoch": 0.8100454181138125, + "grad_norm": 6.571846961975098, + "learning_rate": 1.044644826718295e-07, + "logits/chosen": -1.2148207426071167, + "logits/rejected": -1.1005587577819824, + "logps/chosen": -0.7769785523414612, + "logps/rejected": -1.3037354946136475, + "loss": 1.4388, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9424464702606201, + "rewards/margins": 1.316892147064209, + "rewards/rejected": -3.259338617324829, + "step": 379 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 4.527327537536621, + "learning_rate": 1.0218772555910954e-07, + "logits/chosen": -1.1576111316680908, + "logits/rejected": -1.1151889562606812, + "logps/chosen": -0.6413387656211853, + "logps/rejected": -0.7667942643165588, + "loss": 1.466, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.603346824645996, + "rewards/margins": 0.31363871693611145, + "rewards/rejected": -1.9169857501983643, + "step": 380 + }, + { + "epoch": 0.81432006411969, + "grad_norm": 4.658472061157227, + "learning_rate": 9.99332259943969e-08, + "logits/chosen": -0.8665165305137634, + "logits/rejected": -0.8802721500396729, + "logps/chosen": -0.5357838869094849, + "logps/rejected": -0.9002077579498291, + "loss": 1.4755, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3394598960876465, + "rewards/margins": 0.9110593199729919, + "rewards/rejected": -2.250519275665283, + "step": 381 + }, + { + "epoch": 0.8164573871226289, + "grad_norm": 7.652352333068848, + "learning_rate": 9.770111011666582e-08, + "logits/chosen": -1.3007032871246338, + "logits/rejected": -1.2247170209884644, + "logps/chosen": -0.5824642181396484, + "logps/rejected": -0.6378771662712097, + "loss": 1.5725, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.456160545349121, + "rewards/margins": 0.13853231072425842, + "rewards/rejected": -1.5946928262710571, + "step": 382 + }, + { + "epoch": 0.8185947101255677, + "grad_norm": 6.376747131347656, + "learning_rate": 9.549150281252632e-08, + "logits/chosen": -1.0382481813430786, + "logits/rejected": -0.9399144053459167, + "logps/chosen": -0.5589631199836731, + "logps/rejected": -0.6459823846817017, + "loss": 1.4609, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3974076509475708, + "rewards/margins": 0.2175484150648117, + "rewards/rejected": -1.6149561405181885, + "step": 383 + }, + { + "epoch": 0.8207320331285065, + "grad_norm": 11.018497467041016, + "learning_rate": 9.330452770923603e-08, + "logits/chosen": -1.2223553657531738, + "logits/rejected": -1.238554835319519, + "logps/chosen": -0.5341588258743286, + "logps/rejected": -0.7176991701126099, + "loss": 1.4492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3353970050811768, + "rewards/margins": 0.4588509202003479, + "rewards/rejected": -1.7942478656768799, + "step": 384 + }, + { + "epoch": 0.8228693561314454, + "grad_norm": 4.911256313323975, + "learning_rate": 9.114030716778432e-08, + "logits/chosen": -0.9694303274154663, + "logits/rejected": -1.0394315719604492, + "logps/chosen": -0.4685894250869751, + "logps/rejected": -0.6834438443183899, + "loss": 1.4237, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1714736223220825, + "rewards/margins": 0.537135899066925, + "rewards/rejected": -1.7086098194122314, + "step": 385 + }, + { + "epoch": 0.8250066791343842, + "grad_norm": 11.421992301940918, + "learning_rate": 8.899896227604508e-08, + "logits/chosen": -1.1830335855484009, + "logits/rejected": -1.157397747039795, + "logps/chosen": -0.39773958921432495, + "logps/rejected": -0.46552199125289917, + "loss": 1.4859, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9943490028381348, + "rewards/margins": 0.1694560945034027, + "rewards/rejected": -1.1638050079345703, + "step": 386 + }, + { + "epoch": 0.827144002137323, + "grad_norm": 6.942233085632324, + "learning_rate": 8.688061284200265e-08, + "logits/chosen": -1.0675994157791138, + "logits/rejected": -1.0512460470199585, + "logps/chosen": -0.5208725929260254, + "logps/rejected": -0.6654115319252014, + "loss": 1.3969, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.302181363105774, + "rewards/margins": 0.3613475561141968, + "rewards/rejected": -1.6635288000106812, + "step": 387 + }, + { + "epoch": 0.8292813251402619, + "grad_norm": 4.8206377029418945, + "learning_rate": 8.478537738704811e-08, + "logits/chosen": -1.1155000925064087, + "logits/rejected": -1.0786762237548828, + "logps/chosen": -0.5618028044700623, + "logps/rejected": -0.7909839749336243, + "loss": 1.4737, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.404506802558899, + "rewards/margins": 0.5729530453681946, + "rewards/rejected": -1.9774597883224487, + "step": 388 + }, + { + "epoch": 0.8314186481432007, + "grad_norm": 7.505739212036133, + "learning_rate": 8.271337313934867e-08, + "logits/chosen": -1.2557884454727173, + "logits/rejected": -1.2007073163986206, + "logps/chosen": -0.5404600501060486, + "logps/rejected": -0.6826989650726318, + "loss": 1.5093, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3511501550674438, + "rewards/margins": 0.35559743642807007, + "rewards/rejected": -1.7067475318908691, + "step": 389 + }, + { + "epoch": 0.8335559711461394, + "grad_norm": 6.654153347015381, + "learning_rate": 8.066471602728803e-08, + "logits/chosen": -1.155353307723999, + "logits/rejected": -1.1592519283294678, + "logps/chosen": -0.6218467950820923, + "logps/rejected": -0.9340539574623108, + "loss": 1.4667, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5546170473098755, + "rewards/margins": 0.7805179953575134, + "rewards/rejected": -2.335134983062744, + "step": 390 + }, + { + "epoch": 0.8356932941490782, + "grad_norm": 5.7856011390686035, + "learning_rate": 7.863952067298041e-08, + "logits/chosen": -0.8018568754196167, + "logits/rejected": -0.7033423185348511, + "logps/chosen": -0.4321405291557312, + "logps/rejected": -0.49103930592536926, + "loss": 1.4969, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0803513526916504, + "rewards/margins": 0.1472470909357071, + "rewards/rejected": -1.2275984287261963, + "step": 391 + }, + { + "epoch": 0.8378306171520171, + "grad_norm": 4.890209197998047, + "learning_rate": 7.663790038585794e-08, + "logits/chosen": -1.1529498100280762, + "logits/rejected": -1.1529746055603027, + "logps/chosen": -0.5385481119155884, + "logps/rejected": -0.8943907022476196, + "loss": 1.4222, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3463702201843262, + "rewards/margins": 0.8896064758300781, + "rewards/rejected": -2.2359766960144043, + "step": 392 + }, + { + "epoch": 0.8399679401549559, + "grad_norm": 7.706808567047119, + "learning_rate": 7.465996715633027e-08, + "logits/chosen": -1.055177927017212, + "logits/rejected": -1.0194472074508667, + "logps/chosen": -0.6158031821250916, + "logps/rejected": -0.6109176278114319, + "loss": 1.5822, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5395078659057617, + "rewards/margins": -0.012213785201311111, + "rewards/rejected": -1.527294397354126, + "step": 393 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 5.577699184417725, + "learning_rate": 7.270583164951926e-08, + "logits/chosen": -0.9695263504981995, + "logits/rejected": -0.8854237794876099, + "logps/chosen": -0.6802433133125305, + "logps/rejected": -0.6754633188247681, + "loss": 1.4759, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.700608253479004, + "rewards/margins": -0.011950075626373291, + "rewards/rejected": -1.6886582374572754, + "step": 394 + }, + { + "epoch": 0.8442425861608336, + "grad_norm": 4.328030586242676, + "learning_rate": 7.077560319906694e-08, + "logits/chosen": -0.9989792108535767, + "logits/rejected": -0.8419893980026245, + "logps/chosen": -0.4701617658138275, + "logps/rejected": -0.4230220913887024, + "loss": 1.5796, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.17540442943573, + "rewards/margins": -0.11784917116165161, + "rewards/rejected": -1.0575551986694336, + "step": 395 + }, + { + "epoch": 0.8463799091637724, + "grad_norm": 7.562992572784424, + "learning_rate": 6.886938980101869e-08, + "logits/chosen": -1.193518042564392, + "logits/rejected": -1.1734905242919922, + "logps/chosen": -0.6230531930923462, + "logps/rejected": -0.735701322555542, + "loss": 1.52, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5576331615447998, + "rewards/margins": 0.281620055437088, + "rewards/rejected": -1.8392531871795654, + "step": 396 + }, + { + "epoch": 0.8485172321667112, + "grad_norm": 3.633221387863159, + "learning_rate": 6.698729810778064e-08, + "logits/chosen": -1.1731326580047607, + "logits/rejected": -1.0659189224243164, + "logps/chosen": -0.5739960670471191, + "logps/rejected": -0.941279947757721, + "loss": 1.5875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4349900484085083, + "rewards/margins": 0.9182097911834717, + "rewards/rejected": -2.3531999588012695, + "step": 397 + }, + { + "epoch": 0.85065455516965, + "grad_norm": 9.181096076965332, + "learning_rate": 6.512943342215232e-08, + "logits/chosen": -0.9327074885368347, + "logits/rejected": -0.9438521265983582, + "logps/chosen": -0.6690176725387573, + "logps/rejected": -0.7583512663841248, + "loss": 1.5198, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.672544240951538, + "rewards/margins": 0.22333401441574097, + "rewards/rejected": -1.8958781957626343, + "step": 398 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 5.558097839355469, + "learning_rate": 6.329589969143517e-08, + "logits/chosen": -0.8800374269485474, + "logits/rejected": -0.8219183683395386, + "logps/chosen": -0.5247557759284973, + "logps/rejected": -0.6464003324508667, + "loss": 1.4196, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3118892908096313, + "rewards/margins": 0.30411145091056824, + "rewards/rejected": -1.616000771522522, + "step": 399 + }, + { + "epoch": 0.8549292011755276, + "grad_norm": 3.8024463653564453, + "learning_rate": 6.148679950161672e-08, + "logits/chosen": -0.9593957662582397, + "logits/rejected": -0.9404938817024231, + "logps/chosen": -0.7068220973014832, + "logps/rejected": -0.7800813913345337, + "loss": 1.4438, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7670555114746094, + "rewards/margins": 0.1831480711698532, + "rewards/rejected": -1.9502032995224, + "step": 400 + }, + { + "epoch": 0.8570665241784665, + "grad_norm": 30.781763076782227, + "learning_rate": 5.9702234071631e-08, + "logits/chosen": -0.9686922430992126, + "logits/rejected": -0.9655731320381165, + "logps/chosen": -0.5401520729064941, + "logps/rejected": -0.5928196310997009, + "loss": 1.5804, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.350380301475525, + "rewards/margins": 0.13166896998882294, + "rewards/rejected": -1.4820491075515747, + "step": 401 + }, + { + "epoch": 0.8592038471814053, + "grad_norm": 14.888254165649414, + "learning_rate": 5.794230324769517e-08, + "logits/chosen": -1.049428105354309, + "logits/rejected": -0.9786369800567627, + "logps/chosen": -0.6207393407821655, + "logps/rejected": -0.659958004951477, + "loss": 1.517, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5518485307693481, + "rewards/margins": 0.09804654121398926, + "rewards/rejected": -1.6498949527740479, + "step": 402 + }, + { + "epoch": 0.8613411701843441, + "grad_norm": 5.892756938934326, + "learning_rate": 5.620710549772295e-08, + "logits/chosen": -1.0603619813919067, + "logits/rejected": -1.0535688400268555, + "logps/chosen": -0.6214827299118042, + "logps/rejected": -0.7300854921340942, + "loss": 1.5846, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5537068843841553, + "rewards/margins": 0.2715071439743042, + "rewards/rejected": -1.82521390914917, + "step": 403 + }, + { + "epoch": 0.863478493187283, + "grad_norm": 6.201269626617432, + "learning_rate": 5.44967379058161e-08, + "logits/chosen": -1.0106871128082275, + "logits/rejected": -0.8725728988647461, + "logps/chosen": -0.41202932596206665, + "logps/rejected": -0.657246470451355, + "loss": 1.5421, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0300732851028442, + "rewards/margins": 0.6130428910255432, + "rewards/rejected": -1.6431162357330322, + "step": 404 + }, + { + "epoch": 0.8656158161902218, + "grad_norm": 8.44877815246582, + "learning_rate": 5.2811296166831666e-08, + "logits/chosen": -1.1635454893112183, + "logits/rejected": -1.0440311431884766, + "logps/chosen": -0.8248953223228455, + "logps/rejected": -0.709219217300415, + "loss": 1.5559, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.0622382164001465, + "rewards/margins": -0.2891903817653656, + "rewards/rejected": -1.7730481624603271, + "step": 405 + }, + { + "epoch": 0.8677531391931605, + "grad_norm": 5.078684329986572, + "learning_rate": 5.11508745810284e-08, + "logits/chosen": -1.1656029224395752, + "logits/rejected": -1.0778682231903076, + "logps/chosen": -0.512388288974762, + "logps/rejected": -0.5113582015037537, + "loss": 1.485, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.280970811843872, + "rewards/margins": -0.0025753136724233627, + "rewards/rejected": -1.2783952951431274, + "step": 406 + }, + { + "epoch": 0.8698904621960993, + "grad_norm": 6.306398868560791, + "learning_rate": 4.951556604879048e-08, + "logits/chosen": -1.1278554201126099, + "logits/rejected": -1.149449348449707, + "logps/chosen": -0.5426372289657593, + "logps/rejected": -0.8084475994110107, + "loss": 1.5014, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.356593132019043, + "rewards/margins": 0.6645258665084839, + "rewards/rejected": -2.0211191177368164, + "step": 407 + }, + { + "epoch": 0.8720277851990382, + "grad_norm": 9.290410995483398, + "learning_rate": 4.7905462065429946e-08, + "logits/chosen": -0.8804436922073364, + "logits/rejected": -0.856006383895874, + "logps/chosen": -0.41280895471572876, + "logps/rejected": -0.8294933438301086, + "loss": 1.4497, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0320223569869995, + "rewards/margins": 1.0417108535766602, + "rewards/rejected": -2.07373309135437, + "step": 408 + }, + { + "epoch": 0.874165108201977, + "grad_norm": 11.550924301147461, + "learning_rate": 4.6320652716067555e-08, + "logits/chosen": -1.0321075916290283, + "logits/rejected": -0.9579042792320251, + "logps/chosen": -0.8112492561340332, + "logps/rejected": -1.1215100288391113, + "loss": 1.4411, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.028123140335083, + "rewards/margins": 0.7756521701812744, + "rewards/rejected": -2.8037750720977783, + "step": 409 + }, + { + "epoch": 0.8763024312049158, + "grad_norm": 7.996156215667725, + "learning_rate": 4.4761226670592066e-08, + "logits/chosen": -0.8929157853126526, + "logits/rejected": -0.9894087910652161, + "logps/chosen": -0.5036193132400513, + "logps/rejected": -0.4987982213497162, + "loss": 1.6218, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2590482234954834, + "rewards/margins": -0.012052726000547409, + "rewards/rejected": -1.246995449066162, + "step": 410 + }, + { + "epoch": 0.8784397542078547, + "grad_norm": 6.660581588745117, + "learning_rate": 4.322727117869951e-08, + "logits/chosen": -0.9380248188972473, + "logits/rejected": -0.9620922207832336, + "logps/chosen": -0.41842323541641235, + "logps/rejected": -0.47635918855667114, + "loss": 1.4806, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0460580587387085, + "rewards/margins": 0.1448398381471634, + "rewards/rejected": -1.1908979415893555, + "step": 411 + }, + { + "epoch": 0.8805770772107935, + "grad_norm": 13.565817832946777, + "learning_rate": 4.17188720650119e-08, + "logits/chosen": -0.928549587726593, + "logits/rejected": -0.8930441737174988, + "logps/chosen": -0.49229955673217773, + "logps/rejected": -0.6658884286880493, + "loss": 1.484, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2307488918304443, + "rewards/margins": 0.43397217988967896, + "rewards/rejected": -1.664721131324768, + "step": 412 + }, + { + "epoch": 0.8827144002137323, + "grad_norm": 4.4607954025268555, + "learning_rate": 4.023611372427471e-08, + "logits/chosen": -0.9794715642929077, + "logits/rejected": -0.901313304901123, + "logps/chosen": -0.4945882260799408, + "logps/rejected": -0.4721212387084961, + "loss": 1.5612, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2364706993103027, + "rewards/margins": -0.056167569011449814, + "rewards/rejected": -1.1803030967712402, + "step": 413 + }, + { + "epoch": 0.8848517232166712, + "grad_norm": 4.917453289031982, + "learning_rate": 3.877907911663542e-08, + "logits/chosen": -1.101521372795105, + "logits/rejected": -1.0795665979385376, + "logps/chosen": -0.4808153808116913, + "logps/rejected": -0.5762184262275696, + "loss": 1.4704, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2020386457443237, + "rewards/margins": 0.23850753903388977, + "rewards/rejected": -1.4405461549758911, + "step": 414 + }, + { + "epoch": 0.88698904621961, + "grad_norm": 16.45602798461914, + "learning_rate": 3.734784976300165e-08, + "logits/chosen": -1.0213111639022827, + "logits/rejected": -1.037355661392212, + "logps/chosen": -0.7609426975250244, + "logps/rejected": -0.9708657264709473, + "loss": 1.474, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9023566246032715, + "rewards/margins": 0.5248077511787415, + "rewards/rejected": -2.427164316177368, + "step": 415 + }, + { + "epoch": 0.8891263692225487, + "grad_norm": 5.8920817375183105, + "learning_rate": 3.594250574048058e-08, + "logits/chosen": -1.0958168506622314, + "logits/rejected": -1.1276291608810425, + "logps/chosen": -0.7167930006980896, + "logps/rejected": -0.9383392930030823, + "loss": 1.4605, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.791982650756836, + "rewards/margins": 0.5538656711578369, + "rewards/rejected": -2.345848321914673, + "step": 416 + }, + { + "epoch": 0.8912636922254876, + "grad_norm": 15.450013160705566, + "learning_rate": 3.456312567789793e-08, + "logits/chosen": -0.956957995891571, + "logits/rejected": -0.9777557253837585, + "logps/chosen": -0.6754245162010193, + "logps/rejected": -0.6781172752380371, + "loss": 1.6078, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.688561201095581, + "rewards/margins": 0.006731957197189331, + "rewards/rejected": -1.6952930688858032, + "step": 417 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 8.894111633300781, + "learning_rate": 3.3209786751399184e-08, + "logits/chosen": -1.1309380531311035, + "logits/rejected": -1.0127419233322144, + "logps/chosen": -0.5305185914039612, + "logps/rejected": -0.5370005369186401, + "loss": 1.5549, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.326296329498291, + "rewards/margins": 0.016204845160245895, + "rewards/rejected": -1.342501163482666, + "step": 418 + }, + { + "epoch": 0.8955383382313652, + "grad_norm": 4.215895175933838, + "learning_rate": 3.188256468013139e-08, + "logits/chosen": -1.168836236000061, + "logits/rejected": -1.26181960105896, + "logps/chosen": -0.536091685295105, + "logps/rejected": -0.77155601978302, + "loss": 1.4393, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3402291536331177, + "rewards/margins": 0.5886609554290771, + "rewards/rejected": -1.9288899898529053, + "step": 419 + }, + { + "epoch": 0.897675661234304, + "grad_norm": 9.309587478637695, + "learning_rate": 3.058153372200695e-08, + "logits/chosen": -0.9437912106513977, + "logits/rejected": -0.8996111154556274, + "logps/chosen": -0.398969829082489, + "logps/rejected": -0.4293844699859619, + "loss": 1.5813, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9974247217178345, + "rewards/margins": 0.07603646069765091, + "rewards/rejected": -1.0734611749649048, + "step": 420 + }, + { + "epoch": 0.8998129842372429, + "grad_norm": 9.308667182922363, + "learning_rate": 2.9306766669548457e-08, + "logits/chosen": -1.051709532737732, + "logits/rejected": -1.0886300802230835, + "logps/chosen": -0.6116673946380615, + "logps/rejected": -0.8640693426132202, + "loss": 1.4563, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5291682481765747, + "rewards/margins": 0.6310049295425415, + "rewards/rejected": -2.1601734161376953, + "step": 421 + }, + { + "epoch": 0.9019503072401817, + "grad_norm": 6.306589126586914, + "learning_rate": 2.805833484581621e-08, + "logits/chosen": -1.1781387329101562, + "logits/rejected": -1.1265491247177124, + "logps/chosen": -0.5683103203773499, + "logps/rejected": -0.6715315580368042, + "loss": 1.5119, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4207758903503418, + "rewards/margins": 0.2580530643463135, + "rewards/rejected": -1.6788289546966553, + "step": 422 + }, + { + "epoch": 0.9040876302431204, + "grad_norm": 6.854678153991699, + "learning_rate": 2.6836308100417872e-08, + "logits/chosen": -1.0027626752853394, + "logits/rejected": -1.021713137626648, + "logps/chosen": -0.6026887893676758, + "logps/rejected": -0.685580313205719, + "loss": 1.5287, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5067217350006104, + "rewards/margins": 0.207228884100914, + "rewards/rejected": -1.7139506340026855, + "step": 423 + }, + { + "epoch": 0.9062249532460593, + "grad_norm": 4.880378723144531, + "learning_rate": 2.5640754805600128e-08, + "logits/chosen": -0.9207165837287903, + "logits/rejected": -0.7281609177589417, + "logps/chosen": -0.48149988055229187, + "logps/rejected": -1.1003289222717285, + "loss": 1.4379, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.203749656677246, + "rewards/margins": 1.5470730066299438, + "rewards/rejected": -2.7508223056793213, + "step": 424 + }, + { + "epoch": 0.9083622762489981, + "grad_norm": 7.307305335998535, + "learning_rate": 2.4471741852423233e-08, + "logits/chosen": -0.970041811466217, + "logits/rejected": -0.9423936605453491, + "logps/chosen": -0.518881618976593, + "logps/rejected": -0.7002631425857544, + "loss": 1.5549, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2972040176391602, + "rewards/margins": 0.45345383882522583, + "rewards/rejected": -1.7506577968597412, + "step": 425 + }, + { + "epoch": 0.9104995992519369, + "grad_norm": 4.65291166305542, + "learning_rate": 2.3329334647018694e-08, + "logits/chosen": -1.0624005794525146, + "logits/rejected": -0.9302091598510742, + "logps/chosen": -0.4433947205543518, + "logps/rejected": -0.4743190407752991, + "loss": 1.5049, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1084867715835571, + "rewards/margins": 0.07731082290410995, + "rewards/rejected": -1.1857975721359253, + "step": 426 + }, + { + "epoch": 0.9126369222548758, + "grad_norm": 7.424281120300293, + "learning_rate": 2.2213597106929605e-08, + "logits/chosen": -0.9913086891174316, + "logits/rejected": -0.9474751949310303, + "logps/chosen": -0.46963855624198914, + "logps/rejected": -0.558479905128479, + "loss": 1.5135, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1740963459014893, + "rewards/margins": 0.22210323810577393, + "rewards/rejected": -1.3961995840072632, + "step": 427 + }, + { + "epoch": 0.9147742452578146, + "grad_norm": 6.601301193237305, + "learning_rate": 2.1124591657534774e-08, + "logits/chosen": -1.0710477828979492, + "logits/rejected": -1.064525842666626, + "logps/chosen": -0.52205491065979, + "logps/rejected": -0.6624395847320557, + "loss": 1.4596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3051373958587646, + "rewards/margins": 0.35096171498298645, + "rewards/rejected": -1.6560990810394287, + "step": 428 + }, + { + "epoch": 0.9169115682607534, + "grad_norm": 6.440933704376221, + "learning_rate": 2.0062379228555525e-08, + "logits/chosen": -0.9846968054771423, + "logits/rejected": -0.9657022356987, + "logps/chosen": -0.7298382520675659, + "logps/rejected": -0.8390324115753174, + "loss": 1.4462, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8245956897735596, + "rewards/margins": 0.2729852795600891, + "rewards/rejected": -2.097581148147583, + "step": 429 + }, + { + "epoch": 0.9190488912636923, + "grad_norm": 6.04892635345459, + "learning_rate": 1.9027019250647036e-08, + "logits/chosen": -0.970207691192627, + "logits/rejected": -0.9668480157852173, + "logps/chosen": -0.45557963848114014, + "logps/rejected": -0.5887754559516907, + "loss": 1.4271, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1389491558074951, + "rewards/margins": 0.332989364862442, + "rewards/rejected": -1.4719386100769043, + "step": 430 + }, + { + "epoch": 0.921186214266631, + "grad_norm": 8.778804779052734, + "learning_rate": 1.8018569652073378e-08, + "logits/chosen": -0.9118247032165527, + "logits/rejected": -0.944421648979187, + "logps/chosen": -0.607579231262207, + "logps/rejected": -0.9298698902130127, + "loss": 1.4657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5189483165740967, + "rewards/margins": 0.8057265281677246, + "rewards/rejected": -2.3246748447418213, + "step": 431 + }, + { + "epoch": 0.9233235372695698, + "grad_norm": 4.861128330230713, + "learning_rate": 1.7037086855465898e-08, + "logits/chosen": -1.0250306129455566, + "logits/rejected": -1.0059497356414795, + "logps/chosen": -0.5032894015312195, + "logps/rejected": -0.8505322933197021, + "loss": 1.4828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2582234144210815, + "rewards/margins": 0.8681074976921082, + "rewards/rejected": -2.126330852508545, + "step": 432 + }, + { + "epoch": 0.9254608602725087, + "grad_norm": 8.301541328430176, + "learning_rate": 1.6082625774666792e-08, + "logits/chosen": -1.0359551906585693, + "logits/rejected": -0.9847449660301208, + "logps/chosen": -0.47514110803604126, + "logps/rejected": -0.5986137986183167, + "loss": 1.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1878528594970703, + "rewards/margins": 0.3086817264556885, + "rewards/rejected": -1.4965344667434692, + "step": 433 + }, + { + "epoch": 0.9275981832754475, + "grad_norm": 11.128478050231934, + "learning_rate": 1.5155239811656562e-08, + "logits/chosen": -1.0488301515579224, + "logits/rejected": -0.9755731225013733, + "logps/chosen": -0.45144030451774597, + "logps/rejected": -0.4765854477882385, + "loss": 1.4288, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1286007165908813, + "rewards/margins": 0.06286279857158661, + "rewards/rejected": -1.1914634704589844, + "step": 434 + }, + { + "epoch": 0.9297355062783863, + "grad_norm": 7.042767524719238, + "learning_rate": 1.4254980853566246e-08, + "logits/chosen": -1.1306668519973755, + "logits/rejected": -1.0746090412139893, + "logps/chosen": -0.6658691763877869, + "logps/rejected": -0.8278828263282776, + "loss": 1.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6646732091903687, + "rewards/margins": 0.40503403544425964, + "rewards/rejected": -2.069707155227661, + "step": 435 + }, + { + "epoch": 0.9318728292813251, + "grad_norm": 6.516074180603027, + "learning_rate": 1.3381899269774289e-08, + "logits/chosen": -1.160095453262329, + "logits/rejected": -1.1833710670471191, + "logps/chosen": -0.869493842124939, + "logps/rejected": -0.9384310245513916, + "loss": 1.512, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.173734426498413, + "rewards/margins": 0.17234301567077637, + "rewards/rejected": -2.3460774421691895, + "step": 436 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 5.304040431976318, + "learning_rate": 1.253604390908819e-08, + "logits/chosen": -0.9810507297515869, + "logits/rejected": -0.9905288219451904, + "logps/chosen": -0.43267419934272766, + "logps/rejected": -0.44727227091789246, + "loss": 1.5062, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0816855430603027, + "rewards/margins": 0.03649521246552467, + "rewards/rejected": -1.118180751800537, + "step": 437 + }, + { + "epoch": 0.9361474752872028, + "grad_norm": 7.976593971252441, + "learning_rate": 1.1717462097011855e-08, + "logits/chosen": -1.1212152242660522, + "logits/rejected": -1.1439484357833862, + "logps/chosen": -0.46723484992980957, + "logps/rejected": -0.6574444770812988, + "loss": 1.4262, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1680870056152344, + "rewards/margins": 0.4755241572856903, + "rewards/rejected": -1.6436113119125366, + "step": 438 + }, + { + "epoch": 0.9382847982901416, + "grad_norm": 11.167140007019043, + "learning_rate": 1.0926199633097154e-08, + "logits/chosen": -1.108040690422058, + "logits/rejected": -1.049588680267334, + "logps/chosen": -0.41383224725723267, + "logps/rejected": -0.7777174711227417, + "loss": 1.5436, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0345805883407593, + "rewards/margins": 0.9097130298614502, + "rewards/rejected": -1.94429349899292, + "step": 439 + }, + { + "epoch": 0.9404221212930804, + "grad_norm": 4.780636787414551, + "learning_rate": 1.016230078838226e-08, + "logits/chosen": -0.9971197843551636, + "logits/rejected": -0.906645655632019, + "logps/chosen": -0.4516942501068115, + "logps/rejected": -0.5613692402839661, + "loss": 1.5136, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1292357444763184, + "rewards/margins": 0.2741874158382416, + "rewards/rejected": -1.4034231901168823, + "step": 440 + }, + { + "epoch": 0.9425594442960192, + "grad_norm": 5.779046535491943, + "learning_rate": 9.425808302913728e-09, + "logits/chosen": -0.9931034445762634, + "logits/rejected": -0.9382791519165039, + "logps/chosen": -0.4940332770347595, + "logps/rejected": -0.5491997003555298, + "loss": 1.454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2350831031799316, + "rewards/margins": 0.13791611790657043, + "rewards/rejected": -1.3729993104934692, + "step": 441 + }, + { + "epoch": 0.944696767298958, + "grad_norm": 4.751095771789551, + "learning_rate": 8.716763383355862e-09, + "logits/chosen": -0.9679308533668518, + "logits/rejected": -0.9130020141601562, + "logps/chosen": -0.39680802822113037, + "logps/rejected": -0.45057764649391174, + "loss": 1.4856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9920201301574707, + "rewards/margins": 0.13442395627498627, + "rewards/rejected": -1.1264441013336182, + "step": 442 + }, + { + "epoch": 0.9468340903018969, + "grad_norm": 7.848532199859619, + "learning_rate": 8.035205700685165e-09, + "logits/chosen": -1.0988008975982666, + "logits/rejected": -1.0214226245880127, + "logps/chosen": -0.5310029983520508, + "logps/rejected": -0.46627891063690186, + "loss": 1.5292, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3275076150894165, + "rewards/margins": -0.1618102788925171, + "rewards/rejected": -1.1656973361968994, + "step": 443 + }, + { + "epoch": 0.9489714133048357, + "grad_norm": 20.29781723022461, + "learning_rate": 7.381173387970397e-09, + "logits/chosen": -1.1159013509750366, + "logits/rejected": -1.024048089981079, + "logps/chosen": -0.504087507724762, + "logps/rejected": -0.4435691833496094, + "loss": 1.4914, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2602187395095825, + "rewards/margins": -0.1512957364320755, + "rewards/rejected": -1.108923077583313, + "step": 444 + }, + { + "epoch": 0.9511087363077745, + "grad_norm": 4.859796524047852, + "learning_rate": 6.754703038239329e-09, + "logits/chosen": -1.0576136112213135, + "logits/rejected": -1.0677433013916016, + "logps/chosen": -0.5914698243141174, + "logps/rejected": -0.6684586405754089, + "loss": 1.5836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4786746501922607, + "rewards/margins": 0.19247180223464966, + "rewards/rejected": -1.6711465120315552, + "step": 445 + }, + { + "epoch": 0.9532460593107134, + "grad_norm": 7.60525369644165, + "learning_rate": 6.15582970243117e-09, + "logits/chosen": -1.094986915588379, + "logits/rejected": -1.052059292793274, + "logps/chosen": -0.6280755996704102, + "logps/rejected": -0.7196720838546753, + "loss": 1.4718, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.570189118385315, + "rewards/margins": 0.22899119555950165, + "rewards/rejected": -1.799180269241333, + "step": 446 + }, + { + "epoch": 0.9553833823136522, + "grad_norm": 5.170019149780273, + "learning_rate": 5.5845868874357385e-09, + "logits/chosen": -0.925014853477478, + "logits/rejected": -0.9863495826721191, + "logps/chosen": -0.46621960401535034, + "logps/rejected": -0.4518744945526123, + "loss": 1.4332, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1655490398406982, + "rewards/margins": -0.0358627624809742, + "rewards/rejected": -1.1296862363815308, + "step": 447 + }, + { + "epoch": 0.957520705316591, + "grad_norm": 4.508576393127441, + "learning_rate": 5.0410065542185184e-09, + "logits/chosen": -1.0208594799041748, + "logits/rejected": -0.9924222826957703, + "logps/chosen": -0.45458984375, + "logps/rejected": -0.6638749241828918, + "loss": 1.4904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1364747285842896, + "rewards/margins": 0.5232126116752625, + "rewards/rejected": -1.6596875190734863, + "step": 448 + }, + { + "epoch": 0.9596580283195298, + "grad_norm": 6.562451362609863, + "learning_rate": 4.5251191160326495e-09, + "logits/chosen": -1.1275100708007812, + "logits/rejected": -1.124595046043396, + "logps/chosen": -0.8562003970146179, + "logps/rejected": -1.0813689231872559, + "loss": 1.5302, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.140500783920288, + "rewards/margins": 0.5629212260246277, + "rewards/rejected": -2.7034220695495605, + "step": 449 + }, + { + "epoch": 0.9617953513224686, + "grad_norm": 13.340657234191895, + "learning_rate": 4.036953436716895e-09, + "logits/chosen": -0.8729239702224731, + "logits/rejected": -0.9192299842834473, + "logps/chosen": -0.6237303614616394, + "logps/rejected": -0.7104634046554565, + "loss": 1.5047, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.559325933456421, + "rewards/margins": 0.21683269739151, + "rewards/rejected": -1.7761585712432861, + "step": 450 + }, + { + "epoch": 0.9639326743254074, + "grad_norm": 10.217489242553711, + "learning_rate": 3.5765368290813223e-09, + "logits/chosen": -1.0247201919555664, + "logits/rejected": -1.038936734199524, + "logps/chosen": -0.6359574794769287, + "logps/rejected": -0.6257486939430237, + "loss": 1.5193, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5898935794830322, + "rewards/margins": -0.0255217757076025, + "rewards/rejected": -1.5643717050552368, + "step": 451 + }, + { + "epoch": 0.9660699973283462, + "grad_norm": 10.847533226013184, + "learning_rate": 3.1438950533786977e-09, + "logits/chosen": -1.0954786539077759, + "logits/rejected": -1.102776288986206, + "logps/chosen": -0.5472733378410339, + "logps/rejected": -0.6362195611000061, + "loss": 1.5412, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3681833744049072, + "rewards/margins": 0.22236546874046326, + "rewards/rejected": -1.5905488729476929, + "step": 452 + }, + { + "epoch": 0.9682073203312851, + "grad_norm": 6.472089767456055, + "learning_rate": 2.739052315863355e-09, + "logits/chosen": -1.1393368244171143, + "logits/rejected": -1.0646111965179443, + "logps/chosen": -0.7743910551071167, + "logps/rejected": -0.7680612802505493, + "loss": 1.633, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9359774589538574, + "rewards/margins": -0.01582423597574234, + "rewards/rejected": -1.9201533794403076, + "step": 453 + }, + { + "epoch": 0.9703446433342239, + "grad_norm": 7.718911170959473, + "learning_rate": 2.3620312674367816e-09, + "logits/chosen": -1.0543270111083984, + "logits/rejected": -1.031882643699646, + "logps/chosen": -0.6770450472831726, + "logps/rejected": -0.675162672996521, + "loss": 1.4921, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.692612648010254, + "rewards/margins": -0.004705796018242836, + "rewards/rejected": -1.6879067420959473, + "step": 454 + }, + { + "epoch": 0.9724819663371627, + "grad_norm": 6.906121253967285, + "learning_rate": 2.0128530023804656e-09, + "logits/chosen": -1.0301321744918823, + "logits/rejected": -0.9676041007041931, + "logps/chosen": -0.610703706741333, + "logps/rejected": -0.576894223690033, + "loss": 1.5818, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.526759386062622, + "rewards/margins": -0.0845237672328949, + "rewards/rejected": -1.4422357082366943, + "step": 455 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 6.149905204772949, + "learning_rate": 1.6915370571756181e-09, + "logits/chosen": -0.9057004451751709, + "logits/rejected": -1.0025300979614258, + "logps/chosen": -0.6752175688743591, + "logps/rejected": -1.0043590068817139, + "loss": 1.4284, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6880438327789307, + "rewards/margins": 0.8228534460067749, + "rewards/rejected": -2.510897397994995, + "step": 456 + }, + { + "epoch": 0.9767566123430403, + "grad_norm": 8.217068672180176, + "learning_rate": 1.3981014094099353e-09, + "logits/chosen": -1.186248540878296, + "logits/rejected": -1.3456785678863525, + "logps/chosen": -0.8704374432563782, + "logps/rejected": -0.86018306016922, + "loss": 1.6605, + "rewards/accuracies": 0.4375, + "rewards/chosen": -2.176093578338623, + "rewards/margins": -0.02563604712486267, + "rewards/rejected": -2.1504576206207275, + "step": 457 + }, + { + "epoch": 0.9788939353459791, + "grad_norm": 4.915297508239746, + "learning_rate": 1.1325624767719588e-09, + "logits/chosen": -1.0867304801940918, + "logits/rejected": -1.0485161542892456, + "logps/chosen": -0.5887439846992493, + "logps/rejected": -0.789469838142395, + "loss": 1.5245, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4718600511550903, + "rewards/margins": 0.5018147826194763, + "rewards/rejected": -1.9736747741699219, + "step": 458 + }, + { + "epoch": 0.981031258348918, + "grad_norm": 6.181196689605713, + "learning_rate": 8.949351161324225e-10, + "logits/chosen": -1.0734320878982544, + "logits/rejected": -1.1374140977859497, + "logps/chosen": -0.8127413392066956, + "logps/rejected": -0.9145556688308716, + "loss": 1.384, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.031853675842285, + "rewards/margins": 0.2545357644557953, + "rewards/rejected": -2.286389112472534, + "step": 459 + }, + { + "epoch": 0.9831685813518568, + "grad_norm": 8.571794509887695, + "learning_rate": 6.852326227130833e-10, + "logits/chosen": -0.8654438257217407, + "logits/rejected": -0.7517091631889343, + "logps/chosen": -0.43124887347221375, + "logps/rejected": -0.5231287479400635, + "loss": 1.4778, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0781221389770508, + "rewards/margins": 0.22969970107078552, + "rewards/rejected": -1.3078218698501587, + "step": 460 + }, + { + "epoch": 0.9853059043547956, + "grad_norm": 6.118160724639893, + "learning_rate": 5.034667293427053e-10, + "logits/chosen": -1.136691689491272, + "logits/rejected": -1.0298508405685425, + "logps/chosen": -0.5144182443618774, + "logps/rejected": -0.5670837759971619, + "loss": 1.4558, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2860456705093384, + "rewards/margins": 0.13166388869285583, + "rewards/rejected": -1.417709469795227, + "step": 461 + }, + { + "epoch": 0.9874432273577345, + "grad_norm": 5.212719440460205, + "learning_rate": 3.4964760580069585e-10, + "logits/chosen": -1.0757759809494019, + "logits/rejected": -1.0036826133728027, + "logps/chosen": -0.4357612431049347, + "logps/rejected": -0.8209949731826782, + "loss": 1.3862, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0894030332565308, + "rewards/margins": 0.9630845785140991, + "rewards/rejected": -2.052487373352051, + "step": 462 + }, + { + "epoch": 0.9895805503606733, + "grad_norm": 4.922801971435547, + "learning_rate": 2.2378385824833866e-10, + "logits/chosen": -1.1101552248001099, + "logits/rejected": -1.023587942123413, + "logps/chosen": -0.5474473237991333, + "logps/rejected": -0.6908574104309082, + "loss": 1.4534, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.368618369102478, + "rewards/margins": 0.35852521657943726, + "rewards/rejected": -1.7271435260772705, + "step": 463 + }, + { + "epoch": 0.9917178733636121, + "grad_norm": 5.8598952293396, + "learning_rate": 1.2588252874673466e-10, + "logits/chosen": -1.0550298690795898, + "logits/rejected": -1.0083141326904297, + "logps/chosen": -0.419270783662796, + "logps/rejected": -0.5867845416069031, + "loss": 1.4247, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.048176884651184, + "rewards/margins": 0.41878432035446167, + "rewards/rejected": -1.4669612646102905, + "step": 464 + }, + { + "epoch": 0.9938551963665508, + "grad_norm": 4.032866477966309, + "learning_rate": 5.594909486328348e-11, + "logits/chosen": -1.215372920036316, + "logits/rejected": -1.0714528560638428, + "logps/chosen": -0.42737770080566406, + "logps/rejected": -0.5524105429649353, + "loss": 1.5264, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0684442520141602, + "rewards/margins": 0.31258225440979004, + "rewards/rejected": -1.3810263872146606, + "step": 465 + }, + { + "epoch": 0.9959925193694897, + "grad_norm": 7.993014812469482, + "learning_rate": 1.3987469365095429e-11, + "logits/chosen": -0.9364662766456604, + "logits/rejected": -0.8720629811286926, + "logps/chosen": -0.5930238962173462, + "logps/rejected": -0.5850106477737427, + "loss": 1.5437, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4825596809387207, + "rewards/margins": -0.02003306895494461, + "rewards/rejected": -1.462526798248291, + "step": 466 + }, + { + "epoch": 0.9981298423724285, + "grad_norm": 7.496538162231445, + "learning_rate": 0.0, + "logits/chosen": -0.8774080276489258, + "logits/rejected": -0.8403609395027161, + "logps/chosen": -0.4634042978286743, + "logps/rejected": -0.4342804551124573, + "loss": 1.502, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.158510684967041, + "rewards/margins": -0.07280956953763962, + "rewards/rejected": -1.0857011079788208, + "step": 467 + }, + { + "epoch": 0.9981298423724285, + "eval_logits/chosen": -1.2647873163223267, + "eval_logits/rejected": -1.2294323444366455, + "eval_logps/chosen": -0.5171914100646973, + "eval_logps/rejected": -0.6633016467094421, + "eval_loss": 1.490802526473999, + "eval_rewards/accuracies": 0.600806474685669, + "eval_rewards/chosen": -1.2929786443710327, + "eval_rewards/margins": 0.3652755916118622, + "eval_rewards/rejected": -1.6582541465759277, + "eval_runtime": 77.7277, + "eval_samples_per_second": 25.229, + "eval_steps_per_second": 0.798, + "step": 467 + }, + { + "epoch": 0.9981298423724285, + "step": 467, + "total_flos": 0.0, + "train_loss": 1.540159606576221, + "train_runtime": 15000.5444, + "train_samples_per_second": 3.992, + "train_steps_per_second": 0.031 + } + ], + "logging_steps": 1, + "max_steps": 467, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 32, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}