simpo-baseline-1e-7 / trainer_state.json
ZefanW's picture
Model save
a1fd88a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9981298423724285,
"eval_steps": 500,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021373230029388193,
"grad_norm": 3.1614959239959717,
"learning_rate": 2.127659574468085e-08,
"logits/chosen": -1.1381689310073853,
"logits/rejected": -0.9913416504859924,
"logps/chosen": -0.2839311957359314,
"logps/rejected": -0.2955534756183624,
"loss": 1.608,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7098279595375061,
"rewards/margins": 0.029055748134851456,
"rewards/rejected": -0.7388837337493896,
"step": 1
},
{
"epoch": 0.004274646005877639,
"grad_norm": 7.915106773376465,
"learning_rate": 4.25531914893617e-08,
"logits/chosen": -1.0311710834503174,
"logits/rejected": -0.8901023864746094,
"logps/chosen": -0.24952735006809235,
"logps/rejected": -0.24253402650356293,
"loss": 1.6086,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6238184571266174,
"rewards/margins": -0.01748332567512989,
"rewards/rejected": -0.6063351035118103,
"step": 2
},
{
"epoch": 0.006411969008816457,
"grad_norm": 7.10002326965332,
"learning_rate": 6.382978723404254e-08,
"logits/chosen": -0.9257642030715942,
"logits/rejected": -0.8286958932876587,
"logps/chosen": -0.2627497911453247,
"logps/rejected": -0.2670031189918518,
"loss": 1.6177,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6568744778633118,
"rewards/margins": 0.010633318684995174,
"rewards/rejected": -0.6675078272819519,
"step": 3
},
{
"epoch": 0.008549292011755277,
"grad_norm": 4.93231201171875,
"learning_rate": 8.51063829787234e-08,
"logits/chosen": -0.8513132929801941,
"logits/rejected": -0.7548086643218994,
"logps/chosen": -0.2775232195854187,
"logps/rejected": -0.26499998569488525,
"loss": 1.6472,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6938079595565796,
"rewards/margins": -0.03130800649523735,
"rewards/rejected": -0.6625000238418579,
"step": 4
},
{
"epoch": 0.010686615014694095,
"grad_norm": 7.3820319175720215,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.1639097929000854,
"logits/rejected": -1.2206344604492188,
"logps/chosen": -0.28828343749046326,
"logps/rejected": -0.29974380135536194,
"loss": 1.6096,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7207085490226746,
"rewards/margins": 0.028650924563407898,
"rewards/rejected": -0.7493594884872437,
"step": 5
},
{
"epoch": 0.012823938017632914,
"grad_norm": 4.583745002746582,
"learning_rate": 1.2765957446808508e-07,
"logits/chosen": -1.078372597694397,
"logits/rejected": -1.0467870235443115,
"logps/chosen": -0.2583101987838745,
"logps/rejected": -0.27993106842041016,
"loss": 1.6052,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.645775556564331,
"rewards/margins": 0.054052069783210754,
"rewards/rejected": -0.6998275518417358,
"step": 6
},
{
"epoch": 0.014961261020571734,
"grad_norm": 7.005415916442871,
"learning_rate": 1.4893617021276595e-07,
"logits/chosen": -0.7785229682922363,
"logits/rejected": -0.7655000686645508,
"logps/chosen": -0.2553212642669678,
"logps/rejected": -0.24673190712928772,
"loss": 1.6078,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6383031606674194,
"rewards/margins": -0.021473374217748642,
"rewards/rejected": -0.6168297529220581,
"step": 7
},
{
"epoch": 0.017098584023510555,
"grad_norm": 3.668518304824829,
"learning_rate": 1.702127659574468e-07,
"logits/chosen": -1.0131795406341553,
"logits/rejected": -1.0326677560806274,
"logps/chosen": -0.24508661031723022,
"logps/rejected": -0.26711565256118774,
"loss": 1.5828,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6127164959907532,
"rewards/margins": 0.0550725981593132,
"rewards/rejected": -0.667789101600647,
"step": 8
},
{
"epoch": 0.01923590702644937,
"grad_norm": 3.8814697265625,
"learning_rate": 1.9148936170212765e-07,
"logits/chosen": -0.9995537996292114,
"logits/rejected": -0.8747727274894714,
"logps/chosen": -0.2774883508682251,
"logps/rejected": -0.25593557953834534,
"loss": 1.6179,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.693720817565918,
"rewards/margins": -0.05388186126947403,
"rewards/rejected": -0.6398389935493469,
"step": 9
},
{
"epoch": 0.02137323002938819,
"grad_norm": 7.050100803375244,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0684819221496582,
"logits/rejected": -0.9995761513710022,
"logps/chosen": -0.3254011273384094,
"logps/rejected": -0.31700819730758667,
"loss": 1.6215,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.8135027885437012,
"rewards/margins": -0.020982395857572556,
"rewards/rejected": -0.7925204038619995,
"step": 10
},
{
"epoch": 0.02351055303232701,
"grad_norm": 6.378317356109619,
"learning_rate": 2.3404255319148937e-07,
"logits/chosen": -0.960267186164856,
"logits/rejected": -0.8816654086112976,
"logps/chosen": -0.28366273641586304,
"logps/rejected": -0.2510431110858917,
"loss": 1.6101,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.70915687084198,
"rewards/margins": -0.0815490186214447,
"rewards/rejected": -0.6276078224182129,
"step": 11
},
{
"epoch": 0.02564787603526583,
"grad_norm": 2.851073980331421,
"learning_rate": 2.5531914893617016e-07,
"logits/chosen": -0.9916080832481384,
"logits/rejected": -1.0304456949234009,
"logps/chosen": -0.2935165464878082,
"logps/rejected": -0.26734259724617004,
"loss": 1.6198,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7337914705276489,
"rewards/margins": -0.06543491035699844,
"rewards/rejected": -0.6683565378189087,
"step": 12
},
{
"epoch": 0.027785199038204648,
"grad_norm": 6.843414306640625,
"learning_rate": 2.7659574468085106e-07,
"logits/chosen": -0.865218997001648,
"logits/rejected": -0.8769045472145081,
"logps/chosen": -0.3183431327342987,
"logps/rejected": -0.31171083450317383,
"loss": 1.6,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7958577275276184,
"rewards/margins": -0.016580628231167793,
"rewards/rejected": -0.7792771458625793,
"step": 13
},
{
"epoch": 0.029922522041143467,
"grad_norm": 5.762980937957764,
"learning_rate": 2.978723404255319e-07,
"logits/chosen": -0.8773849606513977,
"logits/rejected": -0.8515525460243225,
"logps/chosen": -0.31379514932632446,
"logps/rejected": -0.29587048292160034,
"loss": 1.624,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7844878435134888,
"rewards/margins": -0.04481163248419762,
"rewards/rejected": -0.7396762371063232,
"step": 14
},
{
"epoch": 0.03205984504408229,
"grad_norm": 5.013691425323486,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -1.0915770530700684,
"logits/rejected": -0.9239784479141235,
"logps/chosen": -0.2713887393474579,
"logps/rejected": -0.2902137339115143,
"loss": 1.5727,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6784718632698059,
"rewards/margins": 0.047062430530786514,
"rewards/rejected": -0.7255342602729797,
"step": 15
},
{
"epoch": 0.03419716804702111,
"grad_norm": 4.163066387176514,
"learning_rate": 3.404255319148936e-07,
"logits/chosen": -0.9121224880218506,
"logits/rejected": -0.8888986706733704,
"logps/chosen": -0.2657870948314667,
"logps/rejected": -0.28352442383766174,
"loss": 1.6097,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6644677519798279,
"rewards/margins": 0.04434328153729439,
"rewards/rejected": -0.7088110446929932,
"step": 16
},
{
"epoch": 0.03633449104995993,
"grad_norm": 7.174858570098877,
"learning_rate": 3.617021276595745e-07,
"logits/chosen": -0.7462605834007263,
"logits/rejected": -0.7856797575950623,
"logps/chosen": -0.27784857153892517,
"logps/rejected": -0.2904101610183716,
"loss": 1.5875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6946213841438293,
"rewards/margins": 0.031404003500938416,
"rewards/rejected": -0.7260254621505737,
"step": 17
},
{
"epoch": 0.03847181405289874,
"grad_norm": 4.658429145812988,
"learning_rate": 3.829787234042553e-07,
"logits/chosen": -1.097366213798523,
"logits/rejected": -1.181847333908081,
"logps/chosen": -0.2530037760734558,
"logps/rejected": -0.2565597891807556,
"loss": 1.6262,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6325094699859619,
"rewards/margins": 0.008889976888895035,
"rewards/rejected": -0.6413994431495667,
"step": 18
},
{
"epoch": 0.04060913705583756,
"grad_norm": 6.409156799316406,
"learning_rate": 4.0425531914893614e-07,
"logits/chosen": -1.1247719526290894,
"logits/rejected": -1.0388509035110474,
"logps/chosen": -0.2717263698577881,
"logps/rejected": -0.28198879957199097,
"loss": 1.6261,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6793159246444702,
"rewards/margins": 0.02565601095557213,
"rewards/rejected": -0.704971969127655,
"step": 19
},
{
"epoch": 0.04274646005877638,
"grad_norm": 4.293993949890137,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -1.0168251991271973,
"logits/rejected": -1.0034692287445068,
"logps/chosen": -0.24037577211856842,
"logps/rejected": -0.23601552844047546,
"loss": 1.618,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6009395122528076,
"rewards/margins": -0.01090063713490963,
"rewards/rejected": -0.5900388360023499,
"step": 20
},
{
"epoch": 0.0448837830617152,
"grad_norm": 10.061914443969727,
"learning_rate": 4.4680851063829783e-07,
"logits/chosen": -1.1607167720794678,
"logits/rejected": -1.0358400344848633,
"logps/chosen": -0.3354223072528839,
"logps/rejected": -0.27505114674568176,
"loss": 1.6588,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.8385557532310486,
"rewards/margins": -0.1509278416633606,
"rewards/rejected": -0.6876278519630432,
"step": 21
},
{
"epoch": 0.04702110606465402,
"grad_norm": 2.9843106269836426,
"learning_rate": 4.6808510638297873e-07,
"logits/chosen": -1.0770177841186523,
"logits/rejected": -1.0807621479034424,
"logps/chosen": -0.26381832361221313,
"logps/rejected": -0.2722492814064026,
"loss": 1.6007,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6595457792282104,
"rewards/margins": 0.02107742801308632,
"rewards/rejected": -0.6806231737136841,
"step": 22
},
{
"epoch": 0.04915842906759284,
"grad_norm": 6.568375110626221,
"learning_rate": 4.893617021276595e-07,
"logits/chosen": -0.8165597915649414,
"logits/rejected": -0.9183764457702637,
"logps/chosen": -0.2863512933254242,
"logps/rejected": -0.29046231508255005,
"loss": 1.567,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7158782482147217,
"rewards/margins": 0.010277565568685532,
"rewards/rejected": -0.7261557579040527,
"step": 23
},
{
"epoch": 0.05129575207053166,
"grad_norm": 6.657776355743408,
"learning_rate": 5.106382978723403e-07,
"logits/chosen": -1.026950716972351,
"logits/rejected": -1.0364724397659302,
"logps/chosen": -0.2393265664577484,
"logps/rejected": -0.2672955393791199,
"loss": 1.5978,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5983164310455322,
"rewards/margins": 0.06992244720458984,
"rewards/rejected": -0.6682388782501221,
"step": 24
},
{
"epoch": 0.053433075073470476,
"grad_norm": 4.221382141113281,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.0916444063186646,
"logits/rejected": -0.9851425886154175,
"logps/chosen": -0.2879200279712677,
"logps/rejected": -0.32731401920318604,
"loss": 1.5978,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7198000550270081,
"rewards/margins": 0.09848497807979584,
"rewards/rejected": -0.8182849884033203,
"step": 25
},
{
"epoch": 0.055570398076409296,
"grad_norm": 4.939723968505859,
"learning_rate": 5.531914893617021e-07,
"logits/chosen": -1.227979063987732,
"logits/rejected": -1.0881894826889038,
"logps/chosen": -0.3044770658016205,
"logps/rejected": -0.29919499158859253,
"loss": 1.6223,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7611927390098572,
"rewards/margins": -0.013205248862504959,
"rewards/rejected": -0.7479875087738037,
"step": 26
},
{
"epoch": 0.057707721079348115,
"grad_norm": 4.797338485717773,
"learning_rate": 5.74468085106383e-07,
"logits/chosen": -0.9999684691429138,
"logits/rejected": -0.9719603061676025,
"logps/chosen": -0.2676877975463867,
"logps/rejected": -0.2837084233760834,
"loss": 1.6208,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6692195534706116,
"rewards/margins": 0.040051497519016266,
"rewards/rejected": -0.7092710137367249,
"step": 27
},
{
"epoch": 0.059845044082286934,
"grad_norm": 5.77163028717041,
"learning_rate": 5.957446808510638e-07,
"logits/chosen": -0.8914034366607666,
"logits/rejected": -0.9402093291282654,
"logps/chosen": -0.2679407596588135,
"logps/rejected": -0.2759988307952881,
"loss": 1.618,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6698518991470337,
"rewards/margins": 0.020145151764154434,
"rewards/rejected": -0.6899970173835754,
"step": 28
},
{
"epoch": 0.061982367085225754,
"grad_norm": 4.006846904754639,
"learning_rate": 6.170212765957446e-07,
"logits/chosen": -1.0984641313552856,
"logits/rejected": -1.0189847946166992,
"logps/chosen": -0.26423007249832153,
"logps/rejected": -0.2647170126438141,
"loss": 1.6184,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.660575270652771,
"rewards/margins": 0.001217234879732132,
"rewards/rejected": -0.6617924571037292,
"step": 29
},
{
"epoch": 0.06411969008816458,
"grad_norm": 7.086247444152832,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -1.1409313678741455,
"logits/rejected": -0.911389172077179,
"logps/chosen": -0.27589118480682373,
"logps/rejected": -0.25251269340515137,
"loss": 1.6156,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6897279024124146,
"rewards/margins": -0.05844618380069733,
"rewards/rejected": -0.6312817335128784,
"step": 30
},
{
"epoch": 0.06625701309110339,
"grad_norm": 5.860540866851807,
"learning_rate": 6.595744680851063e-07,
"logits/chosen": -1.0596109628677368,
"logits/rejected": -0.9196721315383911,
"logps/chosen": -0.2705378532409668,
"logps/rejected": -0.32445117831230164,
"loss": 1.5929,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6763445734977722,
"rewards/margins": 0.13478338718414307,
"rewards/rejected": -0.8111280202865601,
"step": 31
},
{
"epoch": 0.06839433609404222,
"grad_norm": 9.763710021972656,
"learning_rate": 6.808510638297872e-07,
"logits/chosen": -0.8758100867271423,
"logits/rejected": -0.8177347183227539,
"logps/chosen": -0.26637426018714905,
"logps/rejected": -0.29131248593330383,
"loss": 1.5963,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6659356355667114,
"rewards/margins": 0.06234561279416084,
"rewards/rejected": -0.7282812595367432,
"step": 32
},
{
"epoch": 0.07053165909698103,
"grad_norm": 5.033257484436035,
"learning_rate": 7.021276595744681e-07,
"logits/chosen": -1.0621229410171509,
"logits/rejected": -0.9115914702415466,
"logps/chosen": -0.30500558018684387,
"logps/rejected": -0.2638895809650421,
"loss": 1.6286,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7625139951705933,
"rewards/margins": -0.10279002785682678,
"rewards/rejected": -0.6597238779067993,
"step": 33
},
{
"epoch": 0.07266898209991986,
"grad_norm": 6.103218078613281,
"learning_rate": 7.23404255319149e-07,
"logits/chosen": -1.0077497959136963,
"logits/rejected": -0.9838278889656067,
"logps/chosen": -0.29778575897216797,
"logps/rejected": -0.3331536054611206,
"loss": 1.5943,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7444643378257751,
"rewards/margins": 0.08841972053050995,
"rewards/rejected": -0.8328840732574463,
"step": 34
},
{
"epoch": 0.07480630510285867,
"grad_norm": 19.696596145629883,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.0255154371261597,
"logits/rejected": -1.0052193403244019,
"logps/chosen": -0.2500300705432892,
"logps/rejected": -0.284759521484375,
"loss": 1.6112,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6250751614570618,
"rewards/margins": 0.0868237167596817,
"rewards/rejected": -0.7118988633155823,
"step": 35
},
{
"epoch": 0.07694362810579748,
"grad_norm": 8.413592338562012,
"learning_rate": 7.659574468085106e-07,
"logits/chosen": -0.7250477075576782,
"logits/rejected": -0.5996040105819702,
"logps/chosen": -0.27290984988212585,
"logps/rejected": -0.29640769958496094,
"loss": 1.5972,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.682274580001831,
"rewards/margins": 0.05874472111463547,
"rewards/rejected": -0.7410193085670471,
"step": 36
},
{
"epoch": 0.07908095110873631,
"grad_norm": 5.761231899261475,
"learning_rate": 7.872340425531915e-07,
"logits/chosen": -0.9720395803451538,
"logits/rejected": -0.9105511903762817,
"logps/chosen": -0.27565720677375793,
"logps/rejected": -0.2629649341106415,
"loss": 1.6134,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6891430616378784,
"rewards/margins": -0.03173065185546875,
"rewards/rejected": -0.6574123501777649,
"step": 37
},
{
"epoch": 0.08121827411167512,
"grad_norm": 11.410785675048828,
"learning_rate": 8.085106382978723e-07,
"logits/chosen": -0.8644598722457886,
"logits/rejected": -0.9072043895721436,
"logps/chosen": -0.3011826276779175,
"logps/rejected": -0.29128536581993103,
"loss": 1.6565,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7529566287994385,
"rewards/margins": -0.024743108078837395,
"rewards/rejected": -0.7282134890556335,
"step": 38
},
{
"epoch": 0.08335559711461395,
"grad_norm": 6.3143696784973145,
"learning_rate": 8.297872340425532e-07,
"logits/chosen": -1.0631940364837646,
"logits/rejected": -1.1349154710769653,
"logps/chosen": -0.256188303232193,
"logps/rejected": -0.27545157074928284,
"loss": 1.5869,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6404708027839661,
"rewards/margins": 0.048158105462789536,
"rewards/rejected": -0.6886288523674011,
"step": 39
},
{
"epoch": 0.08549292011755276,
"grad_norm": 4.655215740203857,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -1.0722713470458984,
"logits/rejected": -1.090339183807373,
"logps/chosen": -0.2882351577281952,
"logps/rejected": -0.29485568404197693,
"loss": 1.6342,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7205879092216492,
"rewards/margins": 0.01655130460858345,
"rewards/rejected": -0.7371392846107483,
"step": 40
},
{
"epoch": 0.08763024312049159,
"grad_norm": 10.655563354492188,
"learning_rate": 8.723404255319149e-07,
"logits/chosen": -1.0175336599349976,
"logits/rejected": -0.878496527671814,
"logps/chosen": -0.31010520458221436,
"logps/rejected": -0.33926013112068176,
"loss": 1.5843,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7752628922462463,
"rewards/margins": 0.07288742810487747,
"rewards/rejected": -0.8481503129005432,
"step": 41
},
{
"epoch": 0.0897675661234304,
"grad_norm": 6.481273174285889,
"learning_rate": 8.936170212765957e-07,
"logits/chosen": -1.056595802307129,
"logits/rejected": -0.9111218452453613,
"logps/chosen": -0.28797075152397156,
"logps/rejected": -0.3270872235298157,
"loss": 1.6069,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7199268937110901,
"rewards/margins": 0.0977911651134491,
"rewards/rejected": -0.8177180290222168,
"step": 42
},
{
"epoch": 0.09190488912636922,
"grad_norm": 6.524636745452881,
"learning_rate": 9.148936170212766e-07,
"logits/chosen": -0.7545532584190369,
"logits/rejected": -0.8094898462295532,
"logps/chosen": -0.35466429591178894,
"logps/rejected": -0.28596031665802,
"loss": 1.6206,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.8866608738899231,
"rewards/margins": -0.17176005244255066,
"rewards/rejected": -0.7149007320404053,
"step": 43
},
{
"epoch": 0.09404221212930804,
"grad_norm": 15.254437446594238,
"learning_rate": 9.361702127659575e-07,
"logits/chosen": -0.909596860408783,
"logits/rejected": -0.8520998954772949,
"logps/chosen": -0.41655728220939636,
"logps/rejected": -0.4312165379524231,
"loss": 1.5864,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0413931608200073,
"rewards/margins": 0.03664811700582504,
"rewards/rejected": -1.0780413150787354,
"step": 44
},
{
"epoch": 0.09617953513224686,
"grad_norm": 3.75406551361084,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.9761062264442444,
"logits/rejected": -0.8622941374778748,
"logps/chosen": -0.31024113297462463,
"logps/rejected": -0.29050612449645996,
"loss": 1.6133,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7756028175354004,
"rewards/margins": -0.0493374839425087,
"rewards/rejected": -0.7262653708457947,
"step": 45
},
{
"epoch": 0.09831685813518568,
"grad_norm": 4.00916051864624,
"learning_rate": 9.78723404255319e-07,
"logits/chosen": -0.8587102890014648,
"logits/rejected": -0.9007890820503235,
"logps/chosen": -0.2729268968105316,
"logps/rejected": -0.28749462962150574,
"loss": 1.6015,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6823172569274902,
"rewards/margins": 0.036419324576854706,
"rewards/rejected": -0.7187365889549255,
"step": 46
},
{
"epoch": 0.1004541811381245,
"grad_norm": 6.543757438659668,
"learning_rate": 1e-06,
"logits/chosen": -0.9861698150634766,
"logits/rejected": -1.0527900457382202,
"logps/chosen": -0.261736124753952,
"logps/rejected": -0.30613070726394653,
"loss": 1.5661,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6543402671813965,
"rewards/margins": 0.11098647862672806,
"rewards/rejected": -0.7653267979621887,
"step": 47
},
{
"epoch": 0.10259150414106331,
"grad_norm": 4.550771713256836,
"learning_rate": 9.999860125306348e-07,
"logits/chosen": -0.765229344367981,
"logits/rejected": -0.9412630796432495,
"logps/chosen": -0.30382850766181946,
"logps/rejected": -0.3371865153312683,
"loss": 1.6164,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7595713138580322,
"rewards/margins": 0.08339503407478333,
"rewards/rejected": -0.8429663181304932,
"step": 48
},
{
"epoch": 0.10472882714400214,
"grad_norm": 3.9836678504943848,
"learning_rate": 9.999440509051367e-07,
"logits/chosen": -1.0350228548049927,
"logits/rejected": -0.8914788961410522,
"logps/chosen": -0.31590813398361206,
"logps/rejected": -0.34474560618400574,
"loss": 1.5679,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7897703647613525,
"rewards/margins": 0.07209358364343643,
"rewards/rejected": -0.861863911151886,
"step": 49
},
{
"epoch": 0.10686615014694095,
"grad_norm": 4.762299537658691,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -0.99764084815979,
"logits/rejected": -0.9120419025421143,
"logps/chosen": -0.32988840341567993,
"logps/rejected": -0.37920716404914856,
"loss": 1.5959,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8247209787368774,
"rewards/margins": 0.12329696863889694,
"rewards/rejected": -0.9480178356170654,
"step": 50
},
{
"epoch": 0.10900347314987978,
"grad_norm": 4.453758239746094,
"learning_rate": 9.997762161417517e-07,
"logits/chosen": -0.7613246440887451,
"logits/rejected": -0.7792637348175049,
"logps/chosen": -0.2225038707256317,
"logps/rejected": -0.2605491876602173,
"loss": 1.5962,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5562596321105957,
"rewards/margins": 0.09511324763298035,
"rewards/rejected": -0.6513729095458984,
"step": 51
},
{
"epoch": 0.11114079615281859,
"grad_norm": 5.08111047744751,
"learning_rate": 9.996503523941992e-07,
"logits/chosen": -0.9457738399505615,
"logits/rejected": -1.0001921653747559,
"logps/chosen": -0.30008092522621155,
"logps/rejected": -0.33674395084381104,
"loss": 1.5988,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7502023577690125,
"rewards/margins": 0.09165750443935394,
"rewards/rejected": -0.8418598175048828,
"step": 52
},
{
"epoch": 0.11327811915575742,
"grad_norm": 19.494726181030273,
"learning_rate": 9.994965332706572e-07,
"logits/chosen": -0.9162960052490234,
"logits/rejected": -0.8188440799713135,
"logps/chosen": -0.28970617055892944,
"logps/rejected": -0.31052201986312866,
"loss": 1.6028,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.724265456199646,
"rewards/margins": 0.05203955993056297,
"rewards/rejected": -0.7763050198554993,
"step": 53
},
{
"epoch": 0.11541544215869623,
"grad_norm": 4.843199729919434,
"learning_rate": 9.99314767377287e-07,
"logits/chosen": -1.0256187915802002,
"logits/rejected": -0.9491410851478577,
"logps/chosen": -0.2758142650127411,
"logps/rejected": -0.30541473627090454,
"loss": 1.5812,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6895356178283691,
"rewards/margins": 0.07400117814540863,
"rewards/rejected": -0.7635368704795837,
"step": 54
},
{
"epoch": 0.11755276516163506,
"grad_norm": 4.721622943878174,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -1.0089685916900635,
"logits/rejected": -0.8246462345123291,
"logps/chosen": -0.2628995478153229,
"logps/rejected": -0.26574474573135376,
"loss": 1.624,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.657248854637146,
"rewards/margins": 0.007113074883818626,
"rewards/rejected": -0.6643618941307068,
"step": 55
},
{
"epoch": 0.11969008816457387,
"grad_norm": 17.51543426513672,
"learning_rate": 9.98867437523228e-07,
"logits/chosen": -0.8552293181419373,
"logits/rejected": -0.8588695526123047,
"logps/chosen": -0.3222196102142334,
"logps/rejected": -0.3183228075504303,
"loss": 1.5887,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8055489659309387,
"rewards/margins": -0.009741947054862976,
"rewards/rejected": -0.7958070039749146,
"step": 56
},
{
"epoch": 0.1218274111675127,
"grad_norm": 4.062877178192139,
"learning_rate": 9.986018985905899e-07,
"logits/chosen": -0.941673219203949,
"logits/rejected": -0.9555226564407349,
"logps/chosen": -0.245195671916008,
"logps/rejected": -0.28370341658592224,
"loss": 1.6183,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6129892468452454,
"rewards/margins": 0.09626930952072144,
"rewards/rejected": -0.709258496761322,
"step": 57
},
{
"epoch": 0.12396473417045151,
"grad_norm": 4.429107189178467,
"learning_rate": 9.983084629428244e-07,
"logits/chosen": -1.1889485120773315,
"logits/rejected": -1.0553314685821533,
"logps/chosen": -0.34671419858932495,
"logps/rejected": -0.3131285011768341,
"loss": 1.6173,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8667855262756348,
"rewards/margins": -0.0839642882347107,
"rewards/rejected": -0.7828212976455688,
"step": 58
},
{
"epoch": 0.12610205717339032,
"grad_norm": 5.847541332244873,
"learning_rate": 9.979871469976195e-07,
"logits/chosen": -0.8485190868377686,
"logits/rejected": -0.8948504328727722,
"logps/chosen": -0.23155483603477478,
"logps/rejected": -0.2654906213283539,
"loss": 1.6077,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5788871049880981,
"rewards/margins": 0.08483947813510895,
"rewards/rejected": -0.6637265682220459,
"step": 59
},
{
"epoch": 0.12823938017632916,
"grad_norm": 4.097656726837158,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -0.963527500629425,
"logits/rejected": -0.9208120703697205,
"logps/chosen": -0.28316250443458557,
"logps/rejected": -0.28164517879486084,
"loss": 1.582,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7079063057899475,
"rewards/margins": -0.0037933755666017532,
"rewards/rejected": -0.7041129469871521,
"step": 60
},
{
"epoch": 0.13037670317926797,
"grad_norm": 12.278409004211426,
"learning_rate": 9.972609476841365e-07,
"logits/chosen": -0.8986336588859558,
"logits/rejected": -0.9140303134918213,
"logps/chosen": -0.2507745921611786,
"logps/rejected": -0.25815582275390625,
"loss": 1.5789,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6269363164901733,
"rewards/margins": 0.018453147262334824,
"rewards/rejected": -0.6453895568847656,
"step": 61
},
{
"epoch": 0.13251402618220678,
"grad_norm": 8.683540344238281,
"learning_rate": 9.968561049466213e-07,
"logits/chosen": -0.9374087452888489,
"logits/rejected": -1.089040994644165,
"logps/chosen": -0.26520273089408875,
"logps/rejected": -0.2902393937110901,
"loss": 1.6468,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6630067825317383,
"rewards/margins": 0.06259168684482574,
"rewards/rejected": -0.7255985140800476,
"step": 62
},
{
"epoch": 0.1346513491851456,
"grad_norm": 4.520249843597412,
"learning_rate": 9.964234631709185e-07,
"logits/chosen": -0.9794274568557739,
"logits/rejected": -1.1607794761657715,
"logps/chosen": -0.2808447480201721,
"logps/rejected": -0.305492103099823,
"loss": 1.6685,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7021118402481079,
"rewards/margins": 0.06161835044622421,
"rewards/rejected": -0.7637301683425903,
"step": 63
},
{
"epoch": 0.13678867218808444,
"grad_norm": 4.595892906188965,
"learning_rate": 9.959630465632831e-07,
"logits/chosen": -0.9844315052032471,
"logits/rejected": -0.8630974888801575,
"logps/chosen": -0.37489163875579834,
"logps/rejected": -0.2756366729736328,
"loss": 1.652,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.9372289180755615,
"rewards/margins": -0.24813732504844666,
"rewards/rejected": -0.689091682434082,
"step": 64
},
{
"epoch": 0.13892599519102325,
"grad_norm": 8.821249008178711,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -1.018763780593872,
"logits/rejected": -1.0369865894317627,
"logps/chosen": -0.2777743935585022,
"logps/rejected": -0.2588828206062317,
"loss": 1.6033,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6944360136985779,
"rewards/margins": -0.047228869050741196,
"rewards/rejected": -0.6472070813179016,
"step": 65
},
{
"epoch": 0.14106331819396206,
"grad_norm": 3.644948959350586,
"learning_rate": 9.949589934457814e-07,
"logits/chosen": -1.0388680696487427,
"logits/rejected": -1.0096322298049927,
"logps/chosen": -0.3784153163433075,
"logps/rejected": -0.4555840790271759,
"loss": 1.5949,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9460383057594299,
"rewards/margins": 0.19292199611663818,
"rewards/rejected": -1.1389602422714233,
"step": 66
},
{
"epoch": 0.14320064119690087,
"grad_norm": 3.4237723350524902,
"learning_rate": 9.944154131125642e-07,
"logits/chosen": -0.9876866936683655,
"logits/rejected": -1.1050664186477661,
"logps/chosen": -0.6056898236274719,
"logps/rejected": -0.5897864699363708,
"loss": 1.6075,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5142244100570679,
"rewards/margins": -0.03975825011730194,
"rewards/rejected": -1.4744662046432495,
"step": 67
},
{
"epoch": 0.14533796419983971,
"grad_norm": 4.325132846832275,
"learning_rate": 9.938441702975689e-07,
"logits/chosen": -1.0274345874786377,
"logits/rejected": -0.9806603789329529,
"logps/chosen": -0.38565298914909363,
"logps/rejected": -0.34850040078163147,
"loss": 1.6198,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.9641326069831848,
"rewards/margins": -0.09288151562213898,
"rewards/rejected": -0.8712509870529175,
"step": 68
},
{
"epoch": 0.14747528720277853,
"grad_norm": 7.449862480163574,
"learning_rate": 9.932452969617607e-07,
"logits/chosen": -0.9729686975479126,
"logits/rejected": -0.8644128441810608,
"logps/chosen": -0.3008931577205658,
"logps/rejected": -0.4920026361942291,
"loss": 1.5732,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7522329092025757,
"rewards/margins": 0.47777363657951355,
"rewards/rejected": -1.2300065755844116,
"step": 69
},
{
"epoch": 0.14961261020571734,
"grad_norm": 4.059993743896484,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -0.9896879196166992,
"logits/rejected": -0.9051238894462585,
"logps/chosen": -0.30066195130348206,
"logps/rejected": -0.25544169545173645,
"loss": 1.7178,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.751654863357544,
"rewards/margins": -0.11305058002471924,
"rewards/rejected": -0.6386042833328247,
"step": 70
},
{
"epoch": 0.15174993320865615,
"grad_norm": 4.462944507598877,
"learning_rate": 9.919647942993147e-07,
"logits/chosen": -1.0837461948394775,
"logits/rejected": -1.0208004713058472,
"logps/chosen": -0.33584755659103394,
"logps/rejected": -0.49902307987213135,
"loss": 1.6114,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8396189212799072,
"rewards/margins": 0.40793871879577637,
"rewards/rejected": -1.2475576400756836,
"step": 71
},
{
"epoch": 0.15388725621159496,
"grad_norm": 3.49957275390625,
"learning_rate": 9.912832366166441e-07,
"logits/chosen": -1.057317852973938,
"logits/rejected": -1.04885995388031,
"logps/chosen": -0.27743101119995117,
"logps/rejected": -0.2855708599090576,
"loss": 1.5374,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6935775279998779,
"rewards/margins": 0.02034958079457283,
"rewards/rejected": -0.7139270901679993,
"step": 72
},
{
"epoch": 0.1560245792145338,
"grad_norm": 6.884411811828613,
"learning_rate": 9.905741916970863e-07,
"logits/chosen": -0.7818067073822021,
"logits/rejected": -0.791998028755188,
"logps/chosen": -0.44259050488471985,
"logps/rejected": -0.6377058029174805,
"loss": 1.5573,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1064761877059937,
"rewards/margins": 0.4877880811691284,
"rewards/rejected": -1.594264268875122,
"step": 73
},
{
"epoch": 0.15816190221747262,
"grad_norm": 25.20929718017578,
"learning_rate": 9.898376992116177e-07,
"logits/chosen": -0.9175440073013306,
"logits/rejected": -0.9666653275489807,
"logps/chosen": -0.3093183934688568,
"logps/rejected": -0.29086652398109436,
"loss": 1.6364,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7732959985733032,
"rewards/margins": -0.04612968489527702,
"rewards/rejected": -0.7271662950515747,
"step": 74
},
{
"epoch": 0.16029922522041143,
"grad_norm": 7.956474781036377,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -1.12689208984375,
"logits/rejected": -1.0710712671279907,
"logps/chosen": -0.30693331360816956,
"logps/rejected": -0.39591145515441895,
"loss": 1.4775,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7673332691192627,
"rewards/margins": 0.22244536876678467,
"rewards/rejected": -0.9897785186767578,
"step": 75
},
{
"epoch": 0.16243654822335024,
"grad_norm": 5.171202659606934,
"learning_rate": 9.882825379029882e-07,
"logits/chosen": -1.2351804971694946,
"logits/rejected": -1.2987933158874512,
"logps/chosen": -0.26825714111328125,
"logps/rejected": -0.3104426860809326,
"loss": 1.633,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6706428527832031,
"rewards/margins": 0.1054638922214508,
"rewards/rejected": -0.7761067152023315,
"step": 76
},
{
"epoch": 0.16457387122628908,
"grad_norm": 4.766253471374512,
"learning_rate": 9.874639560909118e-07,
"logits/chosen": -0.9250165224075317,
"logits/rejected": -0.8637949228286743,
"logps/chosen": -0.3042350709438324,
"logps/rejected": -0.37980011105537415,
"loss": 1.5511,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.760587751865387,
"rewards/margins": 0.18891258537769318,
"rewards/rejected": -0.949500322341919,
"step": 77
},
{
"epoch": 0.1667111942292279,
"grad_norm": 10.560595512390137,
"learning_rate": 9.866181007302256e-07,
"logits/chosen": -1.0456985235214233,
"logits/rejected": -1.0086830854415894,
"logps/chosen": -0.3072804808616638,
"logps/rejected": -0.2886378765106201,
"loss": 1.5779,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7682012915611267,
"rewards/margins": -0.046606533229351044,
"rewards/rejected": -0.7215947508811951,
"step": 78
},
{
"epoch": 0.1688485172321667,
"grad_norm": 6.801359176635742,
"learning_rate": 9.857450191464337e-07,
"logits/chosen": -0.8191673159599304,
"logits/rejected": -0.9271634817123413,
"logps/chosen": -0.38711708784103394,
"logps/rejected": -0.6279405951499939,
"loss": 1.533,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9677926898002625,
"rewards/margins": 0.602059006690979,
"rewards/rejected": -1.5698516368865967,
"step": 79
},
{
"epoch": 0.17098584023510552,
"grad_norm": 9.583541870117188,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -1.2207063436508179,
"logits/rejected": -1.0649924278259277,
"logps/chosen": -0.39880311489105225,
"logps/rejected": -0.33250871300697327,
"loss": 1.5893,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9970077276229858,
"rewards/margins": -0.16573598980903625,
"rewards/rejected": -0.831271767616272,
"step": 80
},
{
"epoch": 0.17312316323804436,
"grad_norm": 8.752836227416992,
"learning_rate": 9.839173742253334e-07,
"logits/chosen": -1.181122899055481,
"logits/rejected": -1.1287283897399902,
"logps/chosen": -0.3108143210411072,
"logps/rejected": -0.38757073879241943,
"loss": 1.586,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7770357728004456,
"rewards/margins": 0.19189102947711945,
"rewards/rejected": -0.9689267873764038,
"step": 81
},
{
"epoch": 0.17526048624098317,
"grad_norm": 9.698758125305176,
"learning_rate": 9.82962913144534e-07,
"logits/chosen": -1.0705887079238892,
"logits/rejected": -0.8216981887817383,
"logps/chosen": -0.34870025515556335,
"logps/rejected": -0.339597225189209,
"loss": 1.5169,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8717506527900696,
"rewards/margins": -0.022757653146982193,
"rewards/rejected": -0.8489930629730225,
"step": 82
},
{
"epoch": 0.17739780924392198,
"grad_norm": 3.818091869354248,
"learning_rate": 9.819814303479267e-07,
"logits/chosen": -0.8575353026390076,
"logits/rejected": -0.873344898223877,
"logps/chosen": -0.28072673082351685,
"logps/rejected": -0.28653833270072937,
"loss": 1.6003,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7018167972564697,
"rewards/margins": 0.014529004693031311,
"rewards/rejected": -0.7163459062576294,
"step": 83
},
{
"epoch": 0.1795351322468608,
"grad_norm": 8.025263786315918,
"learning_rate": 9.80972980749353e-07,
"logits/chosen": -0.886692225933075,
"logits/rejected": -0.9297844171524048,
"logps/chosen": -0.3129986524581909,
"logps/rejected": -0.30843472480773926,
"loss": 1.6748,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7824965715408325,
"rewards/margins": -0.011409677565097809,
"rewards/rejected": -0.7710868716239929,
"step": 84
},
{
"epoch": 0.18167245524979964,
"grad_norm": 3.9557864665985107,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9471138715744019,
"logits/rejected": -0.9405109882354736,
"logps/chosen": -0.2820201814174652,
"logps/rejected": -0.3643006980419159,
"loss": 1.5473,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7050503492355347,
"rewards/margins": 0.20570139586925507,
"rewards/rejected": -0.9107517600059509,
"step": 85
},
{
"epoch": 0.18380977825273845,
"grad_norm": 3.63122820854187,
"learning_rate": 9.788754083424652e-07,
"logits/chosen": -0.7530944347381592,
"logits/rejected": -0.7066674828529358,
"logps/chosen": -0.29479140043258667,
"logps/rejected": -0.35999545454978943,
"loss": 1.5651,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7369785904884338,
"rewards/margins": 0.16301003098487854,
"rewards/rejected": -0.8999886512756348,
"step": 86
},
{
"epoch": 0.18594710125567726,
"grad_norm": 5.069854259490967,
"learning_rate": 9.777864028930705e-07,
"logits/chosen": -0.9427808523178101,
"logits/rejected": -0.8523333668708801,
"logps/chosen": -0.3117142617702484,
"logps/rejected": -0.35370469093322754,
"loss": 1.5806,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7792856097221375,
"rewards/margins": 0.10497600585222244,
"rewards/rejected": -0.8842616677284241,
"step": 87
},
{
"epoch": 0.18808442425861607,
"grad_norm": 9.928412437438965,
"learning_rate": 9.766706653529812e-07,
"logits/chosen": -0.9436711072921753,
"logits/rejected": -0.8924795985221863,
"logps/chosen": -0.29760316014289856,
"logps/rejected": -0.6252572536468506,
"loss": 1.5214,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7440078854560852,
"rewards/margins": 0.8191351890563965,
"rewards/rejected": -1.5631431341171265,
"step": 88
},
{
"epoch": 0.1902217472615549,
"grad_norm": 5.750680446624756,
"learning_rate": 9.755282581475767e-07,
"logits/chosen": -1.0517209768295288,
"logits/rejected": -1.055407166481018,
"logps/chosen": -0.2816842794418335,
"logps/rejected": -0.3634580373764038,
"loss": 1.6133,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7042107582092285,
"rewards/margins": 0.20443443953990936,
"rewards/rejected": -0.9086451530456543,
"step": 89
},
{
"epoch": 0.19235907026449373,
"grad_norm": 4.923956871032715,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -1.0395478010177612,
"logits/rejected": -0.9153550863265991,
"logps/chosen": -0.28231117129325867,
"logps/rejected": -0.35972079634666443,
"loss": 1.5655,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7057778239250183,
"rewards/margins": 0.19352424144744873,
"rewards/rejected": -0.8993021249771118,
"step": 90
},
{
"epoch": 0.19449639326743254,
"grad_norm": 6.7967634201049805,
"learning_rate": 9.73163691899582e-07,
"logits/chosen": -1.0115331411361694,
"logits/rejected": -1.0663210153579712,
"logps/chosen": -0.2789320647716522,
"logps/rejected": -0.3152926564216614,
"loss": 1.5292,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.697330117225647,
"rewards/margins": 0.09090138971805573,
"rewards/rejected": -0.788231611251831,
"step": 91
},
{
"epoch": 0.19663371627037135,
"grad_norm": 5.242959022521973,
"learning_rate": 9.719416651541837e-07,
"logits/chosen": -1.1914602518081665,
"logits/rejected": -1.237089991569519,
"logps/chosen": -0.3149641156196594,
"logps/rejected": -0.4288913309574127,
"loss": 1.5711,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.787410318851471,
"rewards/margins": 0.28481805324554443,
"rewards/rejected": -1.0722283124923706,
"step": 92
},
{
"epoch": 0.1987710392733102,
"grad_norm": 7.285670280456543,
"learning_rate": 9.706932333304517e-07,
"logits/chosen": -1.0469316244125366,
"logits/rejected": -1.016736388206482,
"logps/chosen": -0.32207292318344116,
"logps/rejected": -0.33990058302879333,
"loss": 1.5721,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8051823377609253,
"rewards/margins": 0.04456908628344536,
"rewards/rejected": -0.8497514128684998,
"step": 93
},
{
"epoch": 0.200908362276249,
"grad_norm": 5.663970947265625,
"learning_rate": 9.694184662779929e-07,
"logits/chosen": -0.83307284116745,
"logits/rejected": -0.8582972288131714,
"logps/chosen": -0.29542282223701477,
"logps/rejected": -0.32081592082977295,
"loss": 1.5804,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7385570406913757,
"rewards/margins": 0.06348275393247604,
"rewards/rejected": -0.8020397424697876,
"step": 94
},
{
"epoch": 0.20304568527918782,
"grad_norm": 5.039822578430176,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.19273042678833,
"logits/rejected": -1.1986303329467773,
"logps/chosen": -0.4751918315887451,
"logps/rejected": -0.42982685565948486,
"loss": 1.5248,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1879795789718628,
"rewards/margins": -0.11341254413127899,
"rewards/rejected": -1.0745670795440674,
"step": 95
},
{
"epoch": 0.20518300828212663,
"grad_norm": 2.779069423675537,
"learning_rate": 9.667902132486008e-07,
"logits/chosen": -1.0337846279144287,
"logits/rejected": -0.978040337562561,
"logps/chosen": -0.3200725317001343,
"logps/rejected": -0.3141818940639496,
"loss": 1.5839,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8001812696456909,
"rewards/margins": -0.014726534485816956,
"rewards/rejected": -0.7854547500610352,
"step": 96
},
{
"epoch": 0.20732033128506547,
"grad_norm": 15.442702293395996,
"learning_rate": 9.65436874322102e-07,
"logits/chosen": -1.0476655960083008,
"logits/rejected": -1.1004842519760132,
"logps/chosen": -0.3345443606376648,
"logps/rejected": -0.34287557005882263,
"loss": 1.6456,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8363610506057739,
"rewards/margins": 0.020828042179346085,
"rewards/rejected": -0.8571889400482178,
"step": 97
},
{
"epoch": 0.20945765428800428,
"grad_norm": 3.940214157104492,
"learning_rate": 9.640574942595194e-07,
"logits/chosen": -1.1419146060943604,
"logits/rejected": -1.018257737159729,
"logps/chosen": -0.3185333013534546,
"logps/rejected": -0.32143470644950867,
"loss": 1.5681,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7963333129882812,
"rewards/margins": 0.007253464311361313,
"rewards/rejected": -0.8035867214202881,
"step": 98
},
{
"epoch": 0.2115949772909431,
"grad_norm": 10.583402633666992,
"learning_rate": 9.626521502369983e-07,
"logits/chosen": -1.102158784866333,
"logits/rejected": -0.9770756959915161,
"logps/chosen": -0.3017268180847168,
"logps/rejected": -0.3784370422363281,
"loss": 1.5746,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.754317045211792,
"rewards/margins": 0.19177556037902832,
"rewards/rejected": -0.9460926055908203,
"step": 99
},
{
"epoch": 0.2137323002938819,
"grad_norm": 4.430131435394287,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -1.2631207704544067,
"logits/rejected": -1.2441554069519043,
"logps/chosen": -0.4847136437892914,
"logps/rejected": -0.38210996985435486,
"loss": 1.6529,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2117840051651,
"rewards/margins": -0.25650906562805176,
"rewards/rejected": -0.9552749395370483,
"step": 100
},
{
"epoch": 0.21586962329682075,
"grad_norm": 15.133020401000977,
"learning_rate": 9.597638862757253e-07,
"logits/chosen": -1.0296027660369873,
"logits/rejected": -1.0020573139190674,
"logps/chosen": -0.32216939330101013,
"logps/rejected": -0.4231048822402954,
"loss": 1.5492,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8054234981536865,
"rewards/margins": 0.2523386776447296,
"rewards/rejected": -1.0577621459960938,
"step": 101
},
{
"epoch": 0.21800694629975956,
"grad_norm": 6.248828887939453,
"learning_rate": 9.58281127934988e-07,
"logits/chosen": -1.0385347604751587,
"logits/rejected": -1.007280945777893,
"logps/chosen": -0.3016239106655121,
"logps/rejected": -0.3423752188682556,
"loss": 1.553,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7540597319602966,
"rewards/margins": 0.10187835991382599,
"rewards/rejected": -0.8559381365776062,
"step": 102
},
{
"epoch": 0.22014426930269837,
"grad_norm": 9.791584968566895,
"learning_rate": 9.567727288213004e-07,
"logits/chosen": -1.0230026245117188,
"logits/rejected": -0.9750150442123413,
"logps/chosen": -0.27773773670196533,
"logps/rejected": -0.32571396231651306,
"loss": 1.5772,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6943443417549133,
"rewards/margins": 0.11994057148694992,
"rewards/rejected": -0.8142849206924438,
"step": 103
},
{
"epoch": 0.22228159230563718,
"grad_norm": 4.045163631439209,
"learning_rate": 9.552387733294078e-07,
"logits/chosen": -1.1475125551223755,
"logits/rejected": -0.9796882271766663,
"logps/chosen": -0.309417724609375,
"logps/rejected": -0.3697960674762726,
"loss": 1.5822,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7735442519187927,
"rewards/margins": 0.15094594657421112,
"rewards/rejected": -0.924490213394165,
"step": 104
},
{
"epoch": 0.224418915308576,
"grad_norm": 5.604581832885742,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -1.047705888748169,
"logits/rejected": -1.0832607746124268,
"logps/chosen": -0.32854345440864563,
"logps/rejected": -0.43526744842529297,
"loss": 1.6265,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8213586211204529,
"rewards/margins": 0.26680994033813477,
"rewards/rejected": -1.0881686210632324,
"step": 105
},
{
"epoch": 0.22655623831151483,
"grad_norm": 3.9951133728027344,
"learning_rate": 9.520945379345699e-07,
"logits/chosen": -0.9593117237091064,
"logits/rejected": -0.8573833107948303,
"logps/chosen": -0.4196315109729767,
"logps/rejected": -0.35942524671554565,
"loss": 1.5512,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.0490788221359253,
"rewards/margins": -0.15051564574241638,
"rewards/rejected": -0.8985630869865417,
"step": 106
},
{
"epoch": 0.22869356131445365,
"grad_norm": 8.718377113342285,
"learning_rate": 9.504844339512094e-07,
"logits/chosen": -0.8320434093475342,
"logits/rejected": -0.7738104462623596,
"logps/chosen": -0.31670740246772766,
"logps/rejected": -0.32797324657440186,
"loss": 1.5849,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7917685508728027,
"rewards/margins": 0.028164558112621307,
"rewards/rejected": -0.8199330568313599,
"step": 107
},
{
"epoch": 0.23083088431739246,
"grad_norm": 3.695155382156372,
"learning_rate": 9.488491254189716e-07,
"logits/chosen": -0.8110507130622864,
"logits/rejected": -0.7713247537612915,
"logps/chosen": -0.35122472047805786,
"logps/rejected": -0.43228679895401,
"loss": 1.6413,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8780617713928223,
"rewards/margins": 0.20265522599220276,
"rewards/rejected": -1.0807169675827026,
"step": 108
},
{
"epoch": 0.23296820732033127,
"grad_norm": 6.846314430236816,
"learning_rate": 9.471887038331684e-07,
"logits/chosen": -0.9511977434158325,
"logits/rejected": -1.0073680877685547,
"logps/chosen": -0.3145419657230377,
"logps/rejected": -0.34916961193084717,
"loss": 1.5855,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7863549590110779,
"rewards/margins": 0.08656906336545944,
"rewards/rejected": -0.8729239702224731,
"step": 109
},
{
"epoch": 0.2351055303232701,
"grad_norm": 6.659543037414551,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -1.2216616868972778,
"logits/rejected": -1.1229971647262573,
"logps/chosen": -0.35468706488609314,
"logps/rejected": -0.5576741695404053,
"loss": 1.5417,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8867175579071045,
"rewards/margins": 0.5074679851531982,
"rewards/rejected": -1.3941854238510132,
"step": 110
},
{
"epoch": 0.23724285332620892,
"grad_norm": 3.1694774627685547,
"learning_rate": 9.43792894502277e-07,
"logits/chosen": -0.8341147303581238,
"logits/rejected": -0.9695035219192505,
"logps/chosen": -0.32898104190826416,
"logps/rejected": -0.36305293440818787,
"loss": 1.5743,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8224526643753052,
"rewards/margins": 0.08517970144748688,
"rewards/rejected": -0.9076323509216309,
"step": 111
},
{
"epoch": 0.23938017632914774,
"grad_norm": 3.8049800395965576,
"learning_rate": 9.420576967523048e-07,
"logits/chosen": -0.8800846338272095,
"logits/rejected": -0.902974009513855,
"logps/chosen": -0.30384454131126404,
"logps/rejected": -0.33347606658935547,
"loss": 1.5319,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7596113681793213,
"rewards/margins": 0.07407879829406738,
"rewards/rejected": -0.8336902260780334,
"step": 112
},
{
"epoch": 0.24151749933208655,
"grad_norm": 6.169259548187256,
"learning_rate": 9.402977659283689e-07,
"logits/chosen": -0.9351974725723267,
"logits/rejected": -0.8290128707885742,
"logps/chosen": -0.33734872937202454,
"logps/rejected": -0.4412783086299896,
"loss": 1.4782,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8433719277381897,
"rewards/margins": 0.2598239779472351,
"rewards/rejected": -1.1031959056854248,
"step": 113
},
{
"epoch": 0.2436548223350254,
"grad_norm": 2.9584643840789795,
"learning_rate": 9.385132004983832e-07,
"logits/chosen": -1.0792999267578125,
"logits/rejected": -1.1223125457763672,
"logps/chosen": -0.36617720127105713,
"logps/rejected": -0.4576934576034546,
"loss": 1.5475,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9154431223869324,
"rewards/margins": 0.22879061102867126,
"rewards/rejected": -1.1442335844039917,
"step": 114
},
{
"epoch": 0.2457921453379642,
"grad_norm": 9.862983703613281,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -1.1801700592041016,
"logits/rejected": -1.075859785079956,
"logps/chosen": -0.3576958477497101,
"logps/rejected": -0.2694101333618164,
"loss": 1.6048,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.894239604473114,
"rewards/margins": -0.22071433067321777,
"rewards/rejected": -0.6735252737998962,
"step": 115
},
{
"epoch": 0.24792946834090301,
"grad_norm": 4.047823905944824,
"learning_rate": 9.348705665778477e-07,
"logits/chosen": -1.1844444274902344,
"logits/rejected": -1.1176280975341797,
"logps/chosen": -0.3374484181404114,
"logps/rejected": -0.3525134325027466,
"loss": 1.5249,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8436209559440613,
"rewards/margins": 0.03766253590583801,
"rewards/rejected": -0.8812835812568665,
"step": 116
},
{
"epoch": 0.25006679134384185,
"grad_norm": 7.761148929595947,
"learning_rate": 9.330127018922193e-07,
"logits/chosen": -1.1179434061050415,
"logits/rejected": -1.0968945026397705,
"logps/chosen": -0.38077402114868164,
"logps/rejected": -0.42625805735588074,
"loss": 1.5768,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9519350528717041,
"rewards/margins": 0.11371012032032013,
"rewards/rejected": -1.0656450986862183,
"step": 117
},
{
"epoch": 0.25220411434678064,
"grad_norm": 2.0378005504608154,
"learning_rate": 9.311306101989812e-07,
"logits/chosen": -0.970659613609314,
"logits/rejected": -1.0176433324813843,
"logps/chosen": -0.3453654944896698,
"logps/rejected": -0.32289958000183105,
"loss": 1.5867,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8634137511253357,
"rewards/margins": -0.056164830923080444,
"rewards/rejected": -0.8072489500045776,
"step": 118
},
{
"epoch": 0.2543414373497195,
"grad_norm": 7.108942031860352,
"learning_rate": 9.29224396800933e-07,
"logits/chosen": -1.1055564880371094,
"logits/rejected": -1.001468539237976,
"logps/chosen": -0.33387163281440735,
"logps/rejected": -0.3175872564315796,
"loss": 1.643,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.834679126739502,
"rewards/margins": -0.04071101173758507,
"rewards/rejected": -0.7939680814743042,
"step": 119
},
{
"epoch": 0.2564787603526583,
"grad_norm": 5.642314434051514,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.9950228333473206,
"logits/rejected": -0.9583395719528198,
"logps/chosen": -0.2971169650554657,
"logps/rejected": -0.4023984670639038,
"loss": 1.5387,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7427924275398254,
"rewards/margins": 0.2632036805152893,
"rewards/rejected": -1.0059961080551147,
"step": 120
},
{
"epoch": 0.2586160833555971,
"grad_norm": 6.028919219970703,
"learning_rate": 9.253400328436698e-07,
"logits/chosen": -1.2061349153518677,
"logits/rejected": -1.1848161220550537,
"logps/chosen": -0.5155015587806702,
"logps/rejected": -0.6664289236068726,
"loss": 1.5549,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2887539863586426,
"rewards/margins": 0.3773185610771179,
"rewards/rejected": -1.6660724878311157,
"step": 121
},
{
"epoch": 0.26075340635853594,
"grad_norm": 8.189311027526855,
"learning_rate": 9.233620996141421e-07,
"logits/chosen": -1.0705747604370117,
"logits/rejected": -1.0496965646743774,
"logps/chosen": -0.4411606192588806,
"logps/rejected": -0.4381504952907562,
"loss": 1.7164,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.102901577949524,
"rewards/margins": -0.007525326684117317,
"rewards/rejected": -1.0953762531280518,
"step": 122
},
{
"epoch": 0.26289072936147473,
"grad_norm": 8.586820602416992,
"learning_rate": 9.213604793270196e-07,
"logits/chosen": -1.1702253818511963,
"logits/rejected": -1.166388750076294,
"logps/chosen": -0.3666711151599884,
"logps/rejected": -0.3490923047065735,
"loss": 1.6561,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9166778326034546,
"rewards/margins": -0.04394708573818207,
"rewards/rejected": -0.8727307319641113,
"step": 123
},
{
"epoch": 0.26502805236441357,
"grad_norm": 6.50911808013916,
"learning_rate": 9.19335283972712e-07,
"logits/chosen": -1.0470083951950073,
"logits/rejected": -1.0692466497421265,
"logps/chosen": -0.4158693253993988,
"logps/rejected": -0.5129181742668152,
"loss": 1.6143,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0396732091903687,
"rewards/margins": 0.24262209236621857,
"rewards/rejected": -1.2822954654693604,
"step": 124
},
{
"epoch": 0.2671653753673524,
"grad_norm": 10.414886474609375,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.0535264015197754,
"logits/rejected": -1.0312259197235107,
"logps/chosen": -0.8472201824188232,
"logps/rejected": -0.8241742849349976,
"loss": 1.6177,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1180503368377686,
"rewards/margins": -0.05761456489562988,
"rewards/rejected": -2.0604357719421387,
"step": 125
},
{
"epoch": 0.2693026983702912,
"grad_norm": 4.565366744995117,
"learning_rate": 9.152146226129518e-07,
"logits/chosen": -1.0052158832550049,
"logits/rejected": -1.1335080862045288,
"logps/chosen": -0.36246898770332336,
"logps/rejected": -0.5093556046485901,
"loss": 1.5169,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9061724543571472,
"rewards/margins": 0.36721646785736084,
"rewards/rejected": -1.2733888626098633,
"step": 126
},
{
"epoch": 0.27144002137323003,
"grad_norm": 6.2105021476745605,
"learning_rate": 9.131193871579974e-07,
"logits/chosen": -1.1646008491516113,
"logits/rejected": -1.1527330875396729,
"logps/chosen": -0.40890955924987793,
"logps/rejected": -0.5555287599563599,
"loss": 1.5485,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.0222738981246948,
"rewards/margins": 0.36654818058013916,
"rewards/rejected": -1.388822078704834,
"step": 127
},
{
"epoch": 0.2735773443761689,
"grad_norm": 11.547090530395508,
"learning_rate": 9.11001037723955e-07,
"logits/chosen": -0.9489108324050903,
"logits/rejected": -0.9458054304122925,
"logps/chosen": -0.3760164678096771,
"logps/rejected": -0.4494747519493103,
"loss": 1.5733,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.940041184425354,
"rewards/margins": 0.1836456060409546,
"rewards/rejected": -1.1236867904663086,
"step": 128
},
{
"epoch": 0.27571466737910766,
"grad_norm": 5.789463043212891,
"learning_rate": 9.088596928322157e-07,
"logits/chosen": -1.2436193227767944,
"logits/rejected": -1.1626383066177368,
"logps/chosen": -0.33822306990623474,
"logps/rejected": -0.3627959191799164,
"loss": 1.5896,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8455577492713928,
"rewards/margins": 0.06143195554614067,
"rewards/rejected": -0.9069896936416626,
"step": 129
},
{
"epoch": 0.2778519903820465,
"grad_norm": 4.706400394439697,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.1288872957229614,
"logits/rejected": -1.2139040231704712,
"logps/chosen": -0.4233206808567047,
"logps/rejected": -0.49574923515319824,
"loss": 1.5678,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0583016872406006,
"rewards/margins": 0.18107128143310547,
"rewards/rejected": -1.2393728494644165,
"step": 130
},
{
"epoch": 0.2799893133849853,
"grad_norm": 3.986043930053711,
"learning_rate": 9.045084971874737e-07,
"logits/chosen": -1.0815867185592651,
"logits/rejected": -1.079985499382019,
"logps/chosen": -0.45107316970825195,
"logps/rejected": -0.4917645752429962,
"loss": 1.5342,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1276829242706299,
"rewards/margins": 0.10172851383686066,
"rewards/rejected": -1.2294114828109741,
"step": 131
},
{
"epoch": 0.2821266363879241,
"grad_norm": 9.682952880859375,
"learning_rate": 9.022988898833342e-07,
"logits/chosen": -0.9553432464599609,
"logits/rejected": -0.882793664932251,
"logps/chosen": -0.29320845007896423,
"logps/rejected": -0.3070768415927887,
"loss": 1.6546,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7330211400985718,
"rewards/margins": 0.03467091917991638,
"rewards/rejected": -0.7676920890808105,
"step": 132
},
{
"epoch": 0.28426395939086296,
"grad_norm": 3.61600923538208,
"learning_rate": 9.000667740056032e-07,
"logits/chosen": -1.1807315349578857,
"logits/rejected": -1.174274206161499,
"logps/chosen": -0.28325143456459045,
"logps/rejected": -0.3123588562011719,
"loss": 1.5242,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7081285715103149,
"rewards/margins": 0.07276856154203415,
"rewards/rejected": -0.7808971405029297,
"step": 133
},
{
"epoch": 0.28640128239380175,
"grad_norm": 3.5451016426086426,
"learning_rate": 8.978122744408905e-07,
"logits/chosen": -0.9926080703735352,
"logits/rejected": -0.9227127432823181,
"logps/chosen": -0.36737683415412903,
"logps/rejected": -0.48489928245544434,
"loss": 1.5447,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9184421300888062,
"rewards/margins": 0.29380616545677185,
"rewards/rejected": -1.2122482061386108,
"step": 134
},
{
"epoch": 0.2885386053967406,
"grad_norm": 7.112710952758789,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -1.156009554862976,
"logits/rejected": -1.0397788286209106,
"logps/chosen": -0.42765456438064575,
"logps/rejected": -0.33751052618026733,
"loss": 1.494,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.069136381149292,
"rewards/margins": -0.22536009550094604,
"rewards/rejected": -0.8437763452529907,
"step": 135
},
{
"epoch": 0.29067592839967943,
"grad_norm": 3.9115548133850098,
"learning_rate": 8.932366300517249e-07,
"logits/chosen": -1.0605583190917969,
"logits/rejected": -1.0287466049194336,
"logps/chosen": -0.429559588432312,
"logps/rejected": -0.5004155039787292,
"loss": 1.51,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0738990306854248,
"rewards/margins": 0.17713983356952667,
"rewards/rejected": -1.251038908958435,
"step": 136
},
{
"epoch": 0.2928132514026182,
"grad_norm": 5.627295017242432,
"learning_rate": 8.909157412340149e-07,
"logits/chosen": -1.058396816253662,
"logits/rejected": -1.0246150493621826,
"logps/chosen": -0.36611852049827576,
"logps/rejected": -0.3920373022556305,
"loss": 1.5874,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9152963161468506,
"rewards/margins": 0.06479701399803162,
"rewards/rejected": -0.9800933003425598,
"step": 137
},
{
"epoch": 0.29495057440555705,
"grad_norm": 10.381692886352539,
"learning_rate": 8.885729807284854e-07,
"logits/chosen": -0.9971895217895508,
"logits/rejected": -1.0381770133972168,
"logps/chosen": -0.3338506519794464,
"logps/rejected": -0.3878532648086548,
"loss": 1.6782,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8346267342567444,
"rewards/margins": 0.13500656187534332,
"rewards/rejected": -0.9696332216262817,
"step": 138
},
{
"epoch": 0.29708789740849584,
"grad_norm": 3.448967933654785,
"learning_rate": 8.862084796122997e-07,
"logits/chosen": -1.0352673530578613,
"logits/rejected": -0.9383904933929443,
"logps/chosen": -0.3496807813644409,
"logps/rejected": -0.3880283236503601,
"loss": 1.533,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8742019534111023,
"rewards/margins": 0.09586883336305618,
"rewards/rejected": -0.9700708389282227,
"step": 139
},
{
"epoch": 0.2992252204114347,
"grad_norm": 7.80885648727417,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -1.027004599571228,
"logits/rejected": -1.0879004001617432,
"logps/chosen": -0.3734505772590637,
"logps/rejected": -0.4567010700702667,
"loss": 1.5578,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9336264729499817,
"rewards/margins": 0.2081262618303299,
"rewards/rejected": -1.1417527198791504,
"step": 140
},
{
"epoch": 0.3013625434143735,
"grad_norm": 4.153167247772217,
"learning_rate": 8.814147859311332e-07,
"logits/chosen": -0.9403591156005859,
"logits/rejected": -0.9337973594665527,
"logps/chosen": -0.473748117685318,
"logps/rejected": -0.4763396084308624,
"loss": 1.5861,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1843702793121338,
"rewards/margins": 0.006478846073150635,
"rewards/rejected": -1.1908490657806396,
"step": 141
},
{
"epoch": 0.3034998664173123,
"grad_norm": 6.681884288787842,
"learning_rate": 8.789858615727264e-07,
"logits/chosen": -1.2222094535827637,
"logits/rejected": -1.1642351150512695,
"logps/chosen": -0.37209561467170715,
"logps/rejected": -0.3360154330730438,
"loss": 1.5777,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9302390217781067,
"rewards/margins": -0.09020054340362549,
"rewards/rejected": -0.840038537979126,
"step": 142
},
{
"epoch": 0.30563718942025114,
"grad_norm": 4.460201740264893,
"learning_rate": 8.765357330018055e-07,
"logits/chosen": -1.1878383159637451,
"logits/rejected": -1.1819963455200195,
"logps/chosen": -0.39119935035705566,
"logps/rejected": -0.5452548265457153,
"loss": 1.5387,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9779983758926392,
"rewards/margins": 0.3851388394832611,
"rewards/rejected": -1.3631370067596436,
"step": 143
},
{
"epoch": 0.3077745124231899,
"grad_norm": 9.836545944213867,
"learning_rate": 8.740645373027634e-07,
"logits/chosen": -0.8851446509361267,
"logits/rejected": -0.8251763582229614,
"logps/chosen": -0.29692715406417847,
"logps/rejected": -0.3346107602119446,
"loss": 1.5638,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.742317795753479,
"rewards/margins": 0.09420904517173767,
"rewards/rejected": -0.8365269303321838,
"step": 144
},
{
"epoch": 0.30991183542612877,
"grad_norm": 6.311913013458252,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -1.0461251735687256,
"logits/rejected": -1.1545249223709106,
"logps/chosen": -0.3259159028530121,
"logps/rejected": -0.40478914976119995,
"loss": 1.5561,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8147897720336914,
"rewards/margins": 0.19718317687511444,
"rewards/rejected": -1.0119729042053223,
"step": 145
},
{
"epoch": 0.3120491584290676,
"grad_norm": 3.136129379272461,
"learning_rate": 8.690594987436704e-07,
"logits/chosen": -0.9903483390808105,
"logits/rejected": -0.9337629675865173,
"logps/chosen": -0.6526888012886047,
"logps/rejected": -0.8471518754959106,
"loss": 1.5544,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6317218542099,
"rewards/margins": 0.4861578047275543,
"rewards/rejected": -2.117879629135132,
"step": 146
},
{
"epoch": 0.3141864814320064,
"grad_norm": 4.960554599761963,
"learning_rate": 8.66525935914913e-07,
"logits/chosen": -0.7758511900901794,
"logits/rejected": -0.8652888536453247,
"logps/chosen": -0.3104845881462097,
"logps/rejected": -0.3932048976421356,
"loss": 1.5272,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7762114405632019,
"rewards/margins": 0.20680075883865356,
"rewards/rejected": -0.9830121994018555,
"step": 147
},
{
"epoch": 0.31632380443494523,
"grad_norm": 6.391895771026611,
"learning_rate": 8.639718660049554e-07,
"logits/chosen": -0.988434374332428,
"logits/rejected": -0.9107407331466675,
"logps/chosen": -0.40984153747558594,
"logps/rejected": -0.46740108728408813,
"loss": 1.5732,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0246037244796753,
"rewards/margins": 0.14389894902706146,
"rewards/rejected": -1.168502688407898,
"step": 148
},
{
"epoch": 0.3184611274378841,
"grad_norm": 5.031012535095215,
"learning_rate": 8.613974319136957e-07,
"logits/chosen": -1.0476540327072144,
"logits/rejected": -1.2046645879745483,
"logps/chosen": -0.4192398190498352,
"logps/rejected": -0.551673173904419,
"loss": 1.5839,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0480995178222656,
"rewards/margins": 0.33108341693878174,
"rewards/rejected": -1.379183053970337,
"step": 149
},
{
"epoch": 0.32059845044082286,
"grad_norm": 4.456223011016846,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -1.0767971277236938,
"logits/rejected": -0.9776477217674255,
"logps/chosen": -0.3873605728149414,
"logps/rejected": -0.41790682077407837,
"loss": 1.5818,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9684014320373535,
"rewards/margins": 0.07636569440364838,
"rewards/rejected": -1.044767141342163,
"step": 150
},
{
"epoch": 0.3227357734437617,
"grad_norm": 4.048471927642822,
"learning_rate": 8.561880484756724e-07,
"logits/chosen": -1.0004703998565674,
"logits/rejected": -1.0351473093032837,
"logps/chosen": -0.3233994245529175,
"logps/rejected": -0.6644845008850098,
"loss": 1.5014,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8084985017776489,
"rewards/margins": 0.8527127504348755,
"rewards/rejected": -1.6612112522125244,
"step": 151
},
{
"epoch": 0.3248730964467005,
"grad_norm": 4.682628154754639,
"learning_rate": 8.535533905932737e-07,
"logits/chosen": -1.0235004425048828,
"logits/rejected": -0.9808617234230042,
"logps/chosen": -0.34479135274887085,
"logps/rejected": -0.3569888770580292,
"loss": 1.5905,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8619784712791443,
"rewards/margins": 0.03049362078309059,
"rewards/rejected": -0.8924720287322998,
"step": 152
},
{
"epoch": 0.3270104194496393,
"grad_norm": 3.4882867336273193,
"learning_rate": 8.508989514419958e-07,
"logits/chosen": -1.0409698486328125,
"logits/rejected": -0.8566800951957703,
"logps/chosen": -0.3815579414367676,
"logps/rejected": -0.5463601350784302,
"loss": 1.5394,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.953894853591919,
"rewards/margins": 0.4120054841041565,
"rewards/rejected": -1.3659002780914307,
"step": 153
},
{
"epoch": 0.32914774245257816,
"grad_norm": 7.419760227203369,
"learning_rate": 8.482248795373835e-07,
"logits/chosen": -1.1312333345413208,
"logits/rejected": -1.1359437704086304,
"logps/chosen": -0.4473015069961548,
"logps/rejected": -0.4743323028087616,
"loss": 1.563,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1182537078857422,
"rewards/margins": 0.06757716834545135,
"rewards/rejected": -1.1858309507369995,
"step": 154
},
{
"epoch": 0.33128506545551695,
"grad_norm": 5.330379962921143,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.0080257654190063,
"logits/rejected": -0.9858949780464172,
"logps/chosen": -0.39618024230003357,
"logps/rejected": -0.39888572692871094,
"loss": 1.5041,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9904506206512451,
"rewards/margins": 0.006763685494661331,
"rewards/rejected": -0.9972144365310669,
"step": 155
},
{
"epoch": 0.3334223884584558,
"grad_norm": 7.0863518714904785,
"learning_rate": 8.428184370142171e-07,
"logits/chosen": -1.1136622428894043,
"logits/rejected": -1.1101243495941162,
"logps/chosen": -0.33028310537338257,
"logps/rejected": -0.3945181369781494,
"loss": 1.4914,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8257076740264893,
"rewards/margins": 0.16058766841888428,
"rewards/rejected": -0.9862953424453735,
"step": 156
},
{
"epoch": 0.3355597114613946,
"grad_norm": 23.59922218322754,
"learning_rate": 8.400863688854596e-07,
"logits/chosen": -1.038999319076538,
"logits/rejected": -1.1245837211608887,
"logps/chosen": -0.4650399386882782,
"logps/rejected": -0.651479959487915,
"loss": 1.6086,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1625999212265015,
"rewards/margins": 0.46609991788864136,
"rewards/rejected": -1.628699779510498,
"step": 157
},
{
"epoch": 0.3376970344643334,
"grad_norm": 7.815871715545654,
"learning_rate": 8.373352729660372e-07,
"logits/chosen": -0.8359465599060059,
"logits/rejected": -0.9364669919013977,
"logps/chosen": -0.3437475562095642,
"logps/rejected": -0.42519667744636536,
"loss": 1.6168,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8593689203262329,
"rewards/margins": 0.2036227285861969,
"rewards/rejected": -1.0629916191101074,
"step": 158
},
{
"epoch": 0.33983435746727225,
"grad_norm": 5.957912445068359,
"learning_rate": 8.34565303179429e-07,
"logits/chosen": -0.9782019257545471,
"logits/rejected": -0.9117124080657959,
"logps/chosen": -0.35762521624565125,
"logps/rejected": -0.3135136663913727,
"loss": 1.6004,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8940630555152893,
"rewards/margins": -0.11027887463569641,
"rewards/rejected": -0.7837841510772705,
"step": 159
},
{
"epoch": 0.34197168047021104,
"grad_norm": 5.102325916290283,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.137721061706543,
"logits/rejected": -0.9964947700500488,
"logps/chosen": -0.28172147274017334,
"logps/rejected": -0.2709539830684662,
"loss": 1.5036,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7043037414550781,
"rewards/margins": -0.02691873162984848,
"rewards/rejected": -0.6773849725723267,
"step": 160
},
{
"epoch": 0.3441090034731499,
"grad_norm": 12.086938858032227,
"learning_rate": 8.289693629698563e-07,
"logits/chosen": -1.0542073249816895,
"logits/rejected": -1.0224194526672363,
"logps/chosen": -0.49710360169410706,
"logps/rejected": -0.5311066508293152,
"loss": 1.4818,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2427589893341064,
"rewards/margins": 0.08500773459672928,
"rewards/rejected": -1.3277666568756104,
"step": 161
},
{
"epoch": 0.3462463264760887,
"grad_norm": 4.8447089195251465,
"learning_rate": 8.261437056390606e-07,
"logits/chosen": -0.9805575609207153,
"logits/rejected": -0.940955638885498,
"logps/chosen": -0.3420554995536804,
"logps/rejected": -0.32786181569099426,
"loss": 1.5357,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8551387190818787,
"rewards/margins": -0.03548429161310196,
"rewards/rejected": -0.8196544647216797,
"step": 162
},
{
"epoch": 0.3483836494790275,
"grad_norm": 7.347986221313477,
"learning_rate": 8.232998006078997e-07,
"logits/chosen": -1.0599087476730347,
"logits/rejected": -1.0014201402664185,
"logps/chosen": -0.36005347967147827,
"logps/rejected": -0.3714003264904022,
"loss": 1.5657,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9001337289810181,
"rewards/margins": 0.028367016464471817,
"rewards/rejected": -0.928500771522522,
"step": 163
},
{
"epoch": 0.35052097248196634,
"grad_norm": 5.851932048797607,
"learning_rate": 8.20437806992512e-07,
"logits/chosen": -1.1148821115493774,
"logits/rejected": -1.01279616355896,
"logps/chosen": -0.4296346604824066,
"logps/rejected": -0.4424145221710205,
"loss": 1.5853,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0740867853164673,
"rewards/margins": 0.03194954991340637,
"rewards/rejected": -1.1060361862182617,
"step": 164
},
{
"epoch": 0.3526582954849052,
"grad_norm": 4.438355922698975,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.0889073610305786,
"logits/rejected": -0.9964409470558167,
"logps/chosen": -0.4054264426231384,
"logps/rejected": -0.6688516736030579,
"loss": 1.5004,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0135661363601685,
"rewards/margins": 0.6585631370544434,
"rewards/rejected": -1.6721292734146118,
"step": 165
},
{
"epoch": 0.35479561848784397,
"grad_norm": 10.995403289794922,
"learning_rate": 8.146601955249187e-07,
"logits/chosen": -1.0129978656768799,
"logits/rejected": -1.0595769882202148,
"logps/chosen": -0.4223453998565674,
"logps/rejected": -0.49435415863990784,
"loss": 1.5537,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.055863380432129,
"rewards/margins": 0.1800219863653183,
"rewards/rejected": -1.2358853816986084,
"step": 166
},
{
"epoch": 0.3569329414907828,
"grad_norm": 5.968749046325684,
"learning_rate": 8.117449009293668e-07,
"logits/chosen": -0.9696434736251831,
"logits/rejected": -0.9188713431358337,
"logps/chosen": -0.3301633894443512,
"logps/rejected": -0.5673984289169312,
"loss": 1.5133,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8254085183143616,
"rewards/margins": 0.5930875539779663,
"rewards/rejected": -1.4184958934783936,
"step": 167
},
{
"epoch": 0.3590702644937216,
"grad_norm": 5.678130626678467,
"learning_rate": 8.088121642448089e-07,
"logits/chosen": -1.0743975639343262,
"logits/rejected": -1.1318800449371338,
"logps/chosen": -0.5486389398574829,
"logps/rejected": -0.498245507478714,
"loss": 1.5696,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3715972900390625,
"rewards/margins": -0.1259835809469223,
"rewards/rejected": -1.2456138134002686,
"step": 168
},
{
"epoch": 0.36120758749666043,
"grad_norm": 4.451059818267822,
"learning_rate": 8.058621495575031e-07,
"logits/chosen": -0.9348894953727722,
"logits/rejected": -0.9463680386543274,
"logps/chosen": -0.45605984330177307,
"logps/rejected": -0.5887668132781982,
"loss": 1.4619,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1401495933532715,
"rewards/margins": 0.3317675292491913,
"rewards/rejected": -1.4719170331954956,
"step": 169
},
{
"epoch": 0.36334491049959927,
"grad_norm": 4.130940914154053,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.0746759176254272,
"logits/rejected": -1.0093308687210083,
"logps/chosen": -0.3871830701828003,
"logps/rejected": -0.6328672170639038,
"loss": 1.537,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9679576754570007,
"rewards/margins": 0.6142103672027588,
"rewards/rejected": -1.5821679830551147,
"step": 170
},
{
"epoch": 0.36548223350253806,
"grad_norm": 4.6137824058532715,
"learning_rate": 7.99910947343957e-07,
"logits/chosen": -1.15254807472229,
"logits/rejected": -1.2261916399002075,
"logps/chosen": -0.42545759677886963,
"logps/rejected": -0.834107518196106,
"loss": 1.415,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0636440515518188,
"rewards/margins": 1.0216246843338013,
"rewards/rejected": -2.08526873588562,
"step": 171
},
{
"epoch": 0.3676195565054769,
"grad_norm": 4.724302291870117,
"learning_rate": 7.969100927867507e-07,
"logits/chosen": -1.178253412246704,
"logits/rejected": -1.1223783493041992,
"logps/chosen": -0.4991285800933838,
"logps/rejected": -0.5416950583457947,
"loss": 1.4574,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.24782133102417,
"rewards/margins": 0.10641634464263916,
"rewards/rejected": -1.354237675666809,
"step": 172
},
{
"epoch": 0.36975687950841574,
"grad_norm": 6.497979640960693,
"learning_rate": 7.938926261462365e-07,
"logits/chosen": -0.8218459486961365,
"logits/rejected": -0.7933223843574524,
"logps/chosen": -0.7412954568862915,
"logps/rejected": -0.8470227718353271,
"loss": 1.5134,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.853238582611084,
"rewards/margins": 0.2643181085586548,
"rewards/rejected": -2.117556571960449,
"step": 173
},
{
"epoch": 0.3718942025113545,
"grad_norm": 7.887294292449951,
"learning_rate": 7.908587162493028e-07,
"logits/chosen": -1.0390602350234985,
"logits/rejected": -0.9814541935920715,
"logps/chosen": -0.3448028862476349,
"logps/rejected": -0.37495365738868713,
"loss": 1.5008,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.862007200717926,
"rewards/margins": 0.07537690550088882,
"rewards/rejected": -0.9373840689659119,
"step": 174
},
{
"epoch": 0.37403152551429336,
"grad_norm": 6.731198310852051,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -0.9766249656677246,
"logits/rejected": -1.0798594951629639,
"logps/chosen": -0.3443922698497772,
"logps/rejected": -0.4274132549762726,
"loss": 1.5293,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8609806299209595,
"rewards/margins": 0.20755250751972198,
"rewards/rejected": -1.068533182144165,
"step": 175
},
{
"epoch": 0.37616884851723215,
"grad_norm": 6.996652603149414,
"learning_rate": 7.84742246584226e-07,
"logits/chosen": -1.1186981201171875,
"logits/rejected": -1.1801022291183472,
"logps/chosen": -0.375766396522522,
"logps/rejected": -0.4341758191585541,
"loss": 1.5967,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9394159317016602,
"rewards/margins": 0.14602357149124146,
"rewards/rejected": -1.0854394435882568,
"step": 176
},
{
"epoch": 0.378306171520171,
"grad_norm": 4.98004150390625,
"learning_rate": 7.81660029031811e-07,
"logits/chosen": -1.0316510200500488,
"logits/rejected": -1.0762914419174194,
"logps/chosen": -0.5189335346221924,
"logps/rejected": -0.5200464725494385,
"loss": 1.5431,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2973339557647705,
"rewards/margins": 0.0027822405099868774,
"rewards/rejected": -1.3001161813735962,
"step": 177
},
{
"epoch": 0.3804434945231098,
"grad_norm": 5.705168724060059,
"learning_rate": 7.785620526352861e-07,
"logits/chosen": -1.0669925212860107,
"logits/rejected": -1.1380958557128906,
"logps/chosen": -0.682020902633667,
"logps/rejected": -1.2245979309082031,
"loss": 1.5066,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.705052375793457,
"rewards/margins": 1.3564426898956299,
"rewards/rejected": -3.061494827270508,
"step": 178
},
{
"epoch": 0.3825808175260486,
"grad_norm": 7.848588466644287,
"learning_rate": 7.754484907260512e-07,
"logits/chosen": -1.2138592004776,
"logits/rejected": -1.069401502609253,
"logps/chosen": -0.49140703678131104,
"logps/rejected": -0.5626181364059448,
"loss": 1.5185,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2285176515579224,
"rewards/margins": 0.1780276894569397,
"rewards/rejected": -1.4065454006195068,
"step": 179
},
{
"epoch": 0.38471814052898745,
"grad_norm": 4.452049732208252,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -0.9280627369880676,
"logits/rejected": -0.9011946320533752,
"logps/chosen": -0.5324864387512207,
"logps/rejected": -0.7281622886657715,
"loss": 1.5208,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3312160968780518,
"rewards/margins": 0.48918962478637695,
"rewards/rejected": -1.8204057216644287,
"step": 180
},
{
"epoch": 0.38685546353192624,
"grad_norm": 6.341145038604736,
"learning_rate": 7.691753080453411e-07,
"logits/chosen": -0.9016430974006653,
"logits/rejected": -0.9220845103263855,
"logps/chosen": -0.42495110630989075,
"logps/rejected": -0.7001734972000122,
"loss": 1.5602,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0623778104782104,
"rewards/margins": 0.6880559325218201,
"rewards/rejected": -1.7504336833953857,
"step": 181
},
{
"epoch": 0.3889927865348651,
"grad_norm": 8.017127990722656,
"learning_rate": 7.660160382576683e-07,
"logits/chosen": -1.072341799736023,
"logits/rejected": -1.0071234703063965,
"logps/chosen": -0.3740030825138092,
"logps/rejected": -0.42607858777046204,
"loss": 1.5577,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9350078105926514,
"rewards/margins": 0.1301887333393097,
"rewards/rejected": -1.0651965141296387,
"step": 182
},
{
"epoch": 0.3911301095378039,
"grad_norm": 8.377826690673828,
"learning_rate": 7.628418849052523e-07,
"logits/chosen": -1.172293782234192,
"logits/rejected": -1.061620831489563,
"logps/chosen": -0.4983487129211426,
"logps/rejected": -0.44917386770248413,
"loss": 1.6102,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2458717823028564,
"rewards/margins": -0.12293709814548492,
"rewards/rejected": -1.1229346990585327,
"step": 183
},
{
"epoch": 0.3932674325407427,
"grad_norm": 5.036559581756592,
"learning_rate": 7.596530255815845e-07,
"logits/chosen": -1.2493027448654175,
"logits/rejected": -1.249387264251709,
"logps/chosen": -0.3506978154182434,
"logps/rejected": -0.7018638849258423,
"loss": 1.5607,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8767446279525757,
"rewards/margins": 0.8779150247573853,
"rewards/rejected": -1.754659652709961,
"step": 184
},
{
"epoch": 0.39540475554368154,
"grad_norm": 5.925335884094238,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.0624215602874756,
"logits/rejected": -1.0493366718292236,
"logps/chosen": -0.5904009342193604,
"logps/rejected": -0.8243600130081177,
"loss": 1.5543,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4760024547576904,
"rewards/margins": 0.5848975777626038,
"rewards/rejected": -2.0608999729156494,
"step": 185
},
{
"epoch": 0.3975420785466204,
"grad_norm": 5.356679916381836,
"learning_rate": 7.532319034984614e-07,
"logits/chosen": -1.097068190574646,
"logits/rejected": -1.1813206672668457,
"logps/chosen": -0.6806610822677612,
"logps/rejected": -1.157942533493042,
"loss": 1.595,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7016526460647583,
"rewards/margins": 1.1932036876678467,
"rewards/rejected": -2.8948559761047363,
"step": 186
},
{
"epoch": 0.39967940154955917,
"grad_norm": 16.624237060546875,
"learning_rate": 7.5e-07,
"logits/chosen": -1.0386743545532227,
"logits/rejected": -0.9917205572128296,
"logps/chosen": -0.46858224272727966,
"logps/rejected": -0.5119574666023254,
"loss": 1.5693,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1714555025100708,
"rewards/margins": 0.10843798518180847,
"rewards/rejected": -1.2798936367034912,
"step": 187
},
{
"epoch": 0.401816724552498,
"grad_norm": 4.841832160949707,
"learning_rate": 7.467541090321733e-07,
"logits/chosen": -1.1640836000442505,
"logits/rejected": -1.1297622919082642,
"logps/chosen": -0.4247099757194519,
"logps/rejected": -0.44790923595428467,
"loss": 1.4403,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0617749691009521,
"rewards/margins": 0.05799813196063042,
"rewards/rejected": -1.1197729110717773,
"step": 188
},
{
"epoch": 0.4039540475554368,
"grad_norm": 4.527658939361572,
"learning_rate": 7.434944122021836e-07,
"logits/chosen": -1.0958904027938843,
"logits/rejected": -1.0618705749511719,
"logps/chosen": -0.492468923330307,
"logps/rejected": -0.7371240854263306,
"loss": 1.4343,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2311724424362183,
"rewards/margins": 0.6116377115249634,
"rewards/rejected": -1.8428101539611816,
"step": 189
},
{
"epoch": 0.40609137055837563,
"grad_norm": 5.381464004516602,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -1.1516568660736084,
"logits/rejected": -1.075246810913086,
"logps/chosen": -0.40711236000061035,
"logps/rejected": -0.6080544590950012,
"loss": 1.4249,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0177809000015259,
"rewards/margins": 0.5023550987243652,
"rewards/rejected": -1.5201361179351807,
"step": 190
},
{
"epoch": 0.40822869356131447,
"grad_norm": 3.7837634086608887,
"learning_rate": 7.369343312364993e-07,
"logits/chosen": -0.8244751691818237,
"logits/rejected": -0.9799928069114685,
"logps/chosen": -0.3934876620769501,
"logps/rejected": -0.8731093406677246,
"loss": 1.4991,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9837191104888916,
"rewards/margins": 1.19905424118042,
"rewards/rejected": -2.1827735900878906,
"step": 191
},
{
"epoch": 0.41036601656425326,
"grad_norm": 15.497288703918457,
"learning_rate": 7.33634314136531e-07,
"logits/chosen": -0.9831448793411255,
"logits/rejected": -0.9924254417419434,
"logps/chosen": -0.4264066815376282,
"logps/rejected": -0.47621241211891174,
"loss": 1.5327,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.066016674041748,
"rewards/margins": 0.12451447546482086,
"rewards/rejected": -1.1905312538146973,
"step": 192
},
{
"epoch": 0.4125033395671921,
"grad_norm": 8.750000953674316,
"learning_rate": 7.303212252253161e-07,
"logits/chosen": -1.0581997632980347,
"logits/rejected": -0.9458177089691162,
"logps/chosen": -0.44811898469924927,
"logps/rejected": -0.5640084147453308,
"loss": 1.515,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1202974319458008,
"rewards/margins": 0.28972357511520386,
"rewards/rejected": -1.4100210666656494,
"step": 193
},
{
"epoch": 0.41464066257013094,
"grad_norm": 6.45157527923584,
"learning_rate": 7.269952498697734e-07,
"logits/chosen": -1.1394294500350952,
"logits/rejected": -1.0299630165100098,
"logps/chosen": -0.5914661884307861,
"logps/rejected": -0.7591831684112549,
"loss": 1.4898,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4786653518676758,
"rewards/margins": 0.41929247975349426,
"rewards/rejected": -1.8979580402374268,
"step": 194
},
{
"epoch": 0.4167779855730697,
"grad_norm": 12.109285354614258,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -1.0763144493103027,
"logits/rejected": -1.0646532773971558,
"logps/chosen": -0.6971418261528015,
"logps/rejected": -0.7001355886459351,
"loss": 1.6446,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7428545951843262,
"rewards/margins": 0.0074843429028987885,
"rewards/rejected": -1.7503387928009033,
"step": 195
},
{
"epoch": 0.41891530857600856,
"grad_norm": 5.321496486663818,
"learning_rate": 7.203053848879418e-07,
"logits/chosen": -1.0298616886138916,
"logits/rejected": -1.0654942989349365,
"logps/chosen": -0.3775436580181122,
"logps/rejected": -0.4247080981731415,
"loss": 1.5052,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9438591599464417,
"rewards/margins": 0.11791113764047623,
"rewards/rejected": -1.0617702007293701,
"step": 196
},
{
"epoch": 0.42105263157894735,
"grad_norm": 5.436696529388428,
"learning_rate": 7.16941869558779e-07,
"logits/chosen": -0.8212717771530151,
"logits/rejected": -0.8311895728111267,
"logps/chosen": -0.3471302390098572,
"logps/rejected": -0.38963112235069275,
"loss": 1.4919,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8678255677223206,
"rewards/margins": 0.10625223070383072,
"rewards/rejected": -0.9740778803825378,
"step": 197
},
{
"epoch": 0.4231899545818862,
"grad_norm": 4.8039374351501465,
"learning_rate": 7.135662163585984e-07,
"logits/chosen": -1.043779969215393,
"logits/rejected": -1.0483262538909912,
"logps/chosen": -0.4499708116054535,
"logps/rejected": -0.7638281583786011,
"loss": 1.4337,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.124927043914795,
"rewards/margins": 0.7846433520317078,
"rewards/rejected": -1.9095702171325684,
"step": 198
},
{
"epoch": 0.425327277584825,
"grad_norm": 16.792325973510742,
"learning_rate": 7.101786141547828e-07,
"logits/chosen": -1.0792505741119385,
"logits/rejected": -1.0246937274932861,
"logps/chosen": -0.5006421804428101,
"logps/rejected": -0.7959640026092529,
"loss": 1.6284,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.25160551071167,
"rewards/margins": 0.7383045554161072,
"rewards/rejected": -1.9899098873138428,
"step": 199
},
{
"epoch": 0.4274646005877638,
"grad_norm": 3.4819066524505615,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -1.146781086921692,
"logits/rejected": -1.1264899969100952,
"logps/chosen": -0.41286712884902954,
"logps/rejected": -0.5021071434020996,
"loss": 1.5828,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0321677923202515,
"rewards/margins": 0.22309982776641846,
"rewards/rejected": -1.25526762008667,
"step": 200
},
{
"epoch": 0.42960192359070265,
"grad_norm": 4.699824810028076,
"learning_rate": 7.033683215379002e-07,
"logits/chosen": -1.204376459121704,
"logits/rejected": -1.0951697826385498,
"logps/chosen": -0.5037363171577454,
"logps/rejected": -0.5278567671775818,
"loss": 1.4982,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.259340763092041,
"rewards/margins": 0.060300953686237335,
"rewards/rejected": -1.3196419477462769,
"step": 201
},
{
"epoch": 0.4317392465936415,
"grad_norm": 6.670827865600586,
"learning_rate": 6.999460121598704e-07,
"logits/chosen": -1.240894079208374,
"logits/rejected": -1.1903066635131836,
"logps/chosen": -0.43070024251937866,
"logps/rejected": -0.5488986968994141,
"loss": 1.4348,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.076750636100769,
"rewards/margins": 0.29549601674079895,
"rewards/rejected": -1.3722467422485352,
"step": 202
},
{
"epoch": 0.4338765695965803,
"grad_norm": 5.494087219238281,
"learning_rate": 6.965125158269618e-07,
"logits/chosen": -0.9625495672225952,
"logits/rejected": -1.0179930925369263,
"logps/chosen": -0.46157702803611755,
"logps/rejected": -0.46555888652801514,
"loss": 1.5279,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.153942584991455,
"rewards/margins": 0.009954705834388733,
"rewards/rejected": -1.1638972759246826,
"step": 203
},
{
"epoch": 0.4360138925995191,
"grad_norm": 2.9398574829101562,
"learning_rate": 6.93068024642873e-07,
"logits/chosen": -1.0253064632415771,
"logits/rejected": -1.0069971084594727,
"logps/chosen": -0.7158012390136719,
"logps/rejected": -1.0074169635772705,
"loss": 1.4758,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7895030975341797,
"rewards/margins": 0.7290392518043518,
"rewards/rejected": -2.5185422897338867,
"step": 204
},
{
"epoch": 0.4381512156024579,
"grad_norm": 19.90937614440918,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.1298977136611938,
"logits/rejected": -1.038535475730896,
"logps/chosen": -0.4616938531398773,
"logps/rejected": -0.5090115070343018,
"loss": 1.5697,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1542346477508545,
"rewards/margins": 0.11829426884651184,
"rewards/rejected": -1.272528886795044,
"step": 205
},
{
"epoch": 0.44028853860539674,
"grad_norm": 8.115141868591309,
"learning_rate": 6.861468292009726e-07,
"logits/chosen": -1.1551835536956787,
"logits/rejected": -1.0220582485198975,
"logps/chosen": -0.5703631639480591,
"logps/rejected": -0.639542818069458,
"loss": 1.5053,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.4259079694747925,
"rewards/margins": 0.17294909060001373,
"rewards/rejected": -1.5988571643829346,
"step": 206
},
{
"epoch": 0.4424258616083356,
"grad_norm": 4.750410079956055,
"learning_rate": 6.826705121831976e-07,
"logits/chosen": -1.0615158081054688,
"logits/rejected": -1.0426111221313477,
"logps/chosen": -0.4923417270183563,
"logps/rejected": -0.7326016426086426,
"loss": 1.6027,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2308542728424072,
"rewards/margins": 0.6006497144699097,
"rewards/rejected": -1.8315041065216064,
"step": 207
},
{
"epoch": 0.44456318461127436,
"grad_norm": 10.167664527893066,
"learning_rate": 6.7918397477265e-07,
"logits/chosen": -0.9964532852172852,
"logits/rejected": -0.9452884793281555,
"logps/chosen": -0.44161027669906616,
"logps/rejected": -0.46314936876296997,
"loss": 1.4903,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1040257215499878,
"rewards/margins": 0.05384783446788788,
"rewards/rejected": -1.157873511314392,
"step": 208
},
{
"epoch": 0.4467005076142132,
"grad_norm": 6.646627426147461,
"learning_rate": 6.756874120406714e-07,
"logits/chosen": -1.2888025045394897,
"logits/rejected": -1.1944079399108887,
"logps/chosen": -0.5191195607185364,
"logps/rejected": -0.5014922022819519,
"loss": 1.5891,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.297798991203308,
"rewards/margins": -0.04406837746500969,
"rewards/rejected": -1.2537304162979126,
"step": 209
},
{
"epoch": 0.448837830617152,
"grad_norm": 14.135600090026855,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.063171148300171,
"logits/rejected": -0.992850124835968,
"logps/chosen": -0.6121366024017334,
"logps/rejected": -0.8903471231460571,
"loss": 1.5107,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.530341625213623,
"rewards/margins": 0.695526123046875,
"rewards/rejected": -2.225867748260498,
"step": 210
},
{
"epoch": 0.45097515362009083,
"grad_norm": 5.86182165145874,
"learning_rate": 6.68664993691415e-07,
"logits/chosen": -1.0612831115722656,
"logits/rejected": -0.9907495975494385,
"logps/chosen": -0.5007533431053162,
"logps/rejected": -0.558498203754425,
"loss": 1.4862,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2518832683563232,
"rewards/margins": 0.144362211227417,
"rewards/rejected": -1.3962455987930298,
"step": 211
},
{
"epoch": 0.45311247662302967,
"grad_norm": 6.808183670043945,
"learning_rate": 6.651395309775836e-07,
"logits/chosen": -1.0104148387908936,
"logits/rejected": -0.9466644525527954,
"logps/chosen": -0.5144920349121094,
"logps/rejected": -0.7595266103744507,
"loss": 1.5023,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.286230206489563,
"rewards/margins": 0.612586259841919,
"rewards/rejected": -1.898816466331482,
"step": 212
},
{
"epoch": 0.45524979962596845,
"grad_norm": 10.564079284667969,
"learning_rate": 6.6160482872723e-07,
"logits/chosen": -0.9594148397445679,
"logits/rejected": -0.6907047629356384,
"logps/chosen": -0.5221553444862366,
"logps/rejected": -0.5170431733131409,
"loss": 1.3874,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.305388331413269,
"rewards/margins": -0.012780493125319481,
"rewards/rejected": -1.2926077842712402,
"step": 213
},
{
"epoch": 0.4573871226289073,
"grad_norm": 3.8123362064361572,
"learning_rate": 6.580610847065123e-07,
"logits/chosen": -1.187415599822998,
"logits/rejected": -1.2069289684295654,
"logps/chosen": -0.5202743411064148,
"logps/rejected": -0.7161551117897034,
"loss": 1.4731,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3006858825683594,
"rewards/margins": 0.4897017776966095,
"rewards/rejected": -1.790387749671936,
"step": 214
},
{
"epoch": 0.45952444563184613,
"grad_norm": 5.941867351531982,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -1.1221826076507568,
"logits/rejected": -1.225363850593567,
"logps/chosen": -0.5203535556793213,
"logps/rejected": -0.721761167049408,
"loss": 1.5282,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3008837699890137,
"rewards/margins": 0.5035191178321838,
"rewards/rejected": -1.8044028282165527,
"step": 215
},
{
"epoch": 0.4616617686347849,
"grad_norm": 9.767511367797852,
"learning_rate": 6.509472649369509e-07,
"logits/chosen": -1.0308146476745605,
"logits/rejected": -1.0452611446380615,
"logps/chosen": -0.5595332980155945,
"logps/rejected": -0.8848905563354492,
"loss": 1.4983,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3988330364227295,
"rewards/margins": 0.8133932948112488,
"rewards/rejected": -2.212226152420044,
"step": 216
},
{
"epoch": 0.46379909163772376,
"grad_norm": 10.346195220947266,
"learning_rate": 6.473775872054521e-07,
"logits/chosen": -1.065643072128296,
"logits/rejected": -1.0063228607177734,
"logps/chosen": -0.6225321888923645,
"logps/rejected": -0.695502758026123,
"loss": 1.579,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5563305616378784,
"rewards/margins": 0.18242624402046204,
"rewards/rejected": -1.7387568950653076,
"step": 217
},
{
"epoch": 0.46593641464066254,
"grad_norm": 5.300914764404297,
"learning_rate": 6.437996637160086e-07,
"logits/chosen": -1.1427751779556274,
"logits/rejected": -1.0561769008636475,
"logps/chosen": -0.34371453523635864,
"logps/rejected": -0.5036041140556335,
"loss": 1.569,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8592862486839294,
"rewards/margins": 0.39972397685050964,
"rewards/rejected": -1.2590101957321167,
"step": 218
},
{
"epoch": 0.4680737376436014,
"grad_norm": 4.065801620483398,
"learning_rate": 6.402136946530014e-07,
"logits/chosen": -0.9067018032073975,
"logits/rejected": -0.8860527873039246,
"logps/chosen": -0.4043842852115631,
"logps/rejected": -0.37620705366134644,
"loss": 1.5412,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0109608173370361,
"rewards/margins": -0.07044325768947601,
"rewards/rejected": -0.9405175447463989,
"step": 219
},
{
"epoch": 0.4702110606465402,
"grad_norm": 7.65117073059082,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.2153977155685425,
"logits/rejected": -1.1215893030166626,
"logps/chosen": -0.7252041101455688,
"logps/rejected": -0.8050605058670044,
"loss": 1.5311,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.813010334968567,
"rewards/margins": 0.19964104890823364,
"rewards/rejected": -2.0126514434814453,
"step": 220
},
{
"epoch": 0.472348383649479,
"grad_norm": 11.2428560256958,
"learning_rate": 6.330184227833375e-07,
"logits/chosen": -0.9373965859413147,
"logits/rejected": -0.9961149096488953,
"logps/chosen": -0.35726746916770935,
"logps/rejected": -0.3844991624355316,
"loss": 1.6773,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.893168568611145,
"rewards/margins": 0.06807927042245865,
"rewards/rejected": -0.9612478613853455,
"step": 221
},
{
"epoch": 0.47448570665241785,
"grad_norm": 4.207274913787842,
"learning_rate": 6.294095225512604e-07,
"logits/chosen": -1.1654491424560547,
"logits/rejected": -0.9704052805900574,
"logps/chosen": -0.5772050023078918,
"logps/rejected": -0.5813780426979065,
"loss": 1.5306,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4430124759674072,
"rewards/margins": 0.010432573035359383,
"rewards/rejected": -1.4534451961517334,
"step": 222
},
{
"epoch": 0.4766230296553567,
"grad_norm": 9.872684478759766,
"learning_rate": 6.257933818722542e-07,
"logits/chosen": -1.1065782308578491,
"logits/rejected": -1.0201388597488403,
"logps/chosen": -0.45536190271377563,
"logps/rejected": -0.501686155796051,
"loss": 1.6091,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1384047269821167,
"rewards/margins": 0.1158108189702034,
"rewards/rejected": -1.2542154788970947,
"step": 223
},
{
"epoch": 0.4787603526582955,
"grad_norm": 4.423720359802246,
"learning_rate": 6.22170203068947e-07,
"logits/chosen": -1.0781972408294678,
"logits/rejected": -1.1105687618255615,
"logps/chosen": -0.4524337351322174,
"logps/rejected": -0.6186787486076355,
"loss": 1.4693,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1310843229293823,
"rewards/margins": 0.41561245918273926,
"rewards/rejected": -1.5466969013214111,
"step": 224
},
{
"epoch": 0.4808976756612343,
"grad_norm": 4.056224822998047,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -0.9637656211853027,
"logits/rejected": -1.049843668937683,
"logps/chosen": -0.4497312009334564,
"logps/rejected": -0.48656415939331055,
"loss": 1.5703,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1243281364440918,
"rewards/margins": 0.09208241105079651,
"rewards/rejected": -1.2164103984832764,
"step": 225
},
{
"epoch": 0.4830349986641731,
"grad_norm": 7.398245334625244,
"learning_rate": 6.149035423375098e-07,
"logits/chosen": -0.9626678228378296,
"logits/rejected": -0.9362674355506897,
"logps/chosen": -0.6995605230331421,
"logps/rejected": -0.7957741022109985,
"loss": 1.4827,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7489013671875,
"rewards/margins": 0.24053387343883514,
"rewards/rejected": -1.9894351959228516,
"step": 226
},
{
"epoch": 0.48517232166711194,
"grad_norm": 6.272643089294434,
"learning_rate": 6.112604669781572e-07,
"logits/chosen": -1.1032443046569824,
"logits/rejected": -0.9800142049789429,
"logps/chosen": -0.46503975987434387,
"logps/rejected": -0.6332381963729858,
"loss": 1.478,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1625994443893433,
"rewards/margins": 0.42049604654312134,
"rewards/rejected": -1.5830953121185303,
"step": 227
},
{
"epoch": 0.4873096446700508,
"grad_norm": 8.032245635986328,
"learning_rate": 6.07611166609311e-07,
"logits/chosen": -1.2633821964263916,
"logits/rejected": -1.1391656398773193,
"logps/chosen": -0.442541241645813,
"logps/rejected": -0.7329965829849243,
"loss": 1.5334,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1063531637191772,
"rewards/margins": 0.7261385917663574,
"rewards/rejected": -1.8324915170669556,
"step": 228
},
{
"epoch": 0.48944696767298956,
"grad_norm": 3.549194097518921,
"learning_rate": 6.039558454088795e-07,
"logits/chosen": -1.1071525812149048,
"logits/rejected": -0.9752082228660583,
"logps/chosen": -0.7183459401130676,
"logps/rejected": -0.5012776255607605,
"loss": 1.5703,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.7958645820617676,
"rewards/margins": -0.542670726776123,
"rewards/rejected": -1.2531940937042236,
"step": 229
},
{
"epoch": 0.4915842906759284,
"grad_norm": 7.271309852600098,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.1285755634307861,
"logits/rejected": -1.0309889316558838,
"logps/chosen": -0.5892479419708252,
"logps/rejected": -0.8529112339019775,
"loss": 1.5424,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4731197357177734,
"rewards/margins": 0.65915846824646,
"rewards/rejected": -2.1322782039642334,
"step": 230
},
{
"epoch": 0.49372161367886724,
"grad_norm": 9.371764183044434,
"learning_rate": 5.966279588977766e-07,
"logits/chosen": -1.1868503093719482,
"logits/rejected": -1.0965262651443481,
"logps/chosen": -0.5913974642753601,
"logps/rejected": -0.5998914837837219,
"loss": 1.5586,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4784936904907227,
"rewards/margins": 0.02123492956161499,
"rewards/rejected": -1.4997284412384033,
"step": 231
},
{
"epoch": 0.49585893668180603,
"grad_norm": 3.9326915740966797,
"learning_rate": 5.929558035814574e-07,
"logits/chosen": -1.0558724403381348,
"logits/rejected": -1.0998754501342773,
"logps/chosen": -0.5341198444366455,
"logps/rejected": -0.6374375820159912,
"loss": 1.4501,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3352996110916138,
"rewards/margins": 0.25829440355300903,
"rewards/rejected": -1.593593955039978,
"step": 232
},
{
"epoch": 0.49799625968474487,
"grad_norm": 5.740602016448975,
"learning_rate": 5.892784473993183e-07,
"logits/chosen": -0.9249231815338135,
"logits/rejected": -0.9186200499534607,
"logps/chosen": -0.5150828957557678,
"logps/rejected": -0.7307932376861572,
"loss": 1.5555,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2877074480056763,
"rewards/margins": 0.5392757654190063,
"rewards/rejected": -1.826983094215393,
"step": 233
},
{
"epoch": 0.5001335826876837,
"grad_norm": 4.63511323928833,
"learning_rate": 5.855960960989876e-07,
"logits/chosen": -1.0853294134140015,
"logits/rejected": -0.9906125068664551,
"logps/chosen": -0.6735414266586304,
"logps/rejected": -0.6355391144752502,
"loss": 1.5485,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.6838535070419312,
"rewards/margins": -0.09500567615032196,
"rewards/rejected": -1.5888478755950928,
"step": 234
},
{
"epoch": 0.5022709056906225,
"grad_norm": 4.9034810066223145,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.1883198022842407,
"logits/rejected": -1.111728310585022,
"logps/chosen": -0.5791587233543396,
"logps/rejected": -0.5992032289505005,
"loss": 1.4893,
"rewards/accuracies": 0.1875,
"rewards/chosen": -1.4478968381881714,
"rewards/margins": 0.05011126026511192,
"rewards/rejected": -1.498008131980896,
"step": 235
},
{
"epoch": 0.5044082286935613,
"grad_norm": 4.951679706573486,
"learning_rate": 5.782172325201155e-07,
"logits/chosen": -1.099345088005066,
"logits/rejected": -0.9732553362846375,
"logps/chosen": -0.41670212149620056,
"logps/rejected": -0.49779534339904785,
"loss": 1.4412,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.041755199432373,
"rewards/margins": 0.2027330994606018,
"rewards/rejected": -1.2444884777069092,
"step": 236
},
{
"epoch": 0.5065455516965002,
"grad_norm": 5.122452259063721,
"learning_rate": 5.745211330880872e-07,
"logits/chosen": -1.1886011362075806,
"logits/rejected": -1.0731868743896484,
"logps/chosen": -0.5531087517738342,
"logps/rejected": -0.5820314884185791,
"loss": 1.4998,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3827718496322632,
"rewards/margins": 0.07230678200721741,
"rewards/rejected": -1.4550786018371582,
"step": 237
},
{
"epoch": 0.508682874699439,
"grad_norm": 5.16255521774292,
"learning_rate": 5.708208642077945e-07,
"logits/chosen": -0.9164212942123413,
"logits/rejected": -0.9998050332069397,
"logps/chosen": -0.4382399022579193,
"logps/rejected": -0.526303768157959,
"loss": 1.4613,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0955997705459595,
"rewards/margins": 0.22015975415706635,
"rewards/rejected": -1.315759539604187,
"step": 238
},
{
"epoch": 0.5108201977023777,
"grad_norm": 5.4947052001953125,
"learning_rate": 5.671166329088277e-07,
"logits/chosen": -1.2119641304016113,
"logits/rejected": -1.2597813606262207,
"logps/chosen": -0.5000596046447754,
"logps/rejected": -0.6806124448776245,
"loss": 1.49,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2501490116119385,
"rewards/margins": 0.4513818621635437,
"rewards/rejected": -1.701530933380127,
"step": 239
},
{
"epoch": 0.5129575207053166,
"grad_norm": 6.559435844421387,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.217972755432129,
"logits/rejected": -1.1487551927566528,
"logps/chosen": -0.49298563599586487,
"logps/rejected": -0.6521138548851013,
"loss": 1.5189,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.232464075088501,
"rewards/margins": 0.3978206217288971,
"rewards/rejected": -1.6302846670150757,
"step": 240
},
{
"epoch": 0.5150948437082554,
"grad_norm": 11.107866287231445,
"learning_rate": 5.596971122701221e-07,
"logits/chosen": -1.1741474866867065,
"logits/rejected": -1.1197490692138672,
"logps/chosen": -0.3616866171360016,
"logps/rejected": -0.4305798411369324,
"loss": 1.5347,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9042165875434875,
"rewards/margins": 0.17223304510116577,
"rewards/rejected": -1.0764496326446533,
"step": 241
},
{
"epoch": 0.5172321667111942,
"grad_norm": 6.960956573486328,
"learning_rate": 5.559822380516539e-07,
"logits/chosen": -1.2302253246307373,
"logits/rejected": -1.075247883796692,
"logps/chosen": -0.4363064169883728,
"logps/rejected": -0.31692779064178467,
"loss": 1.5663,
"rewards/accuracies": 0.25,
"rewards/chosen": -1.090766191482544,
"rewards/margins": -0.29844653606414795,
"rewards/rejected": -0.7923195958137512,
"step": 242
},
{
"epoch": 0.5193694897141331,
"grad_norm": 6.810981273651123,
"learning_rate": 5.522642316338268e-07,
"logits/chosen": -1.0663301944732666,
"logits/rejected": -1.1324117183685303,
"logps/chosen": -0.4831632673740387,
"logps/rejected": -0.7216737270355225,
"loss": 1.4868,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.207908272743225,
"rewards/margins": 0.596276044845581,
"rewards/rejected": -1.8041841983795166,
"step": 243
},
{
"epoch": 0.5215068127170719,
"grad_norm": 5.594532489776611,
"learning_rate": 5.48543301038644e-07,
"logits/chosen": -1.1370617151260376,
"logits/rejected": -1.1158241033554077,
"logps/chosen": -0.49420881271362305,
"logps/rejected": -0.6353664398193359,
"loss": 1.4837,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2355221509933472,
"rewards/margins": 0.35289400815963745,
"rewards/rejected": -1.5884160995483398,
"step": 244
},
{
"epoch": 0.5236441357200107,
"grad_norm": 5.150038719177246,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -0.9938709139823914,
"logits/rejected": -0.9924468398094177,
"logps/chosen": -0.46965187788009644,
"logps/rejected": -0.5854384899139404,
"loss": 1.4984,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.174129843711853,
"rewards/margins": 0.28946635127067566,
"rewards/rejected": -1.4635961055755615,
"step": 245
},
{
"epoch": 0.5257814587229495,
"grad_norm": 5.775374412536621,
"learning_rate": 5.410935002106152e-07,
"logits/chosen": -1.2858668565750122,
"logits/rejected": -1.339996099472046,
"logps/chosen": -0.5900648832321167,
"logps/rejected": -0.743665337562561,
"loss": 1.5673,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.475162148475647,
"rewards/margins": 0.38400113582611084,
"rewards/rejected": -1.8591632843017578,
"step": 246
},
{
"epoch": 0.5279187817258884,
"grad_norm": 6.673636436462402,
"learning_rate": 5.373650467932121e-07,
"logits/chosen": -0.9241991639137268,
"logits/rejected": -0.9246317744255066,
"logps/chosen": -0.7461893558502197,
"logps/rejected": -1.305277705192566,
"loss": 1.5427,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8654735088348389,
"rewards/margins": 1.3977206945419312,
"rewards/rejected": -3.2631943225860596,
"step": 247
},
{
"epoch": 0.5300561047288271,
"grad_norm": 10.964753150939941,
"learning_rate": 5.336345028060199e-07,
"logits/chosen": -1.1549595594406128,
"logits/rejected": -1.0779494047164917,
"logps/chosen": -0.4138936996459961,
"logps/rejected": -0.5448856353759766,
"loss": 1.5164,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0347342491149902,
"rewards/margins": 0.32747986912727356,
"rewards/rejected": -1.362214207649231,
"step": 248
},
{
"epoch": 0.5321934277317659,
"grad_norm": 7.340090751647949,
"learning_rate": 5.299020769725171e-07,
"logits/chosen": -1.2774559259414673,
"logits/rejected": -1.161049723625183,
"logps/chosen": -0.5907987356185913,
"logps/rejected": -0.784879207611084,
"loss": 1.5591,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4769967794418335,
"rewards/margins": 0.48520126938819885,
"rewards/rejected": -1.96219801902771,
"step": 249
},
{
"epoch": 0.5343307507347048,
"grad_norm": 9.162212371826172,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -0.9980146884918213,
"logits/rejected": -0.9839775562286377,
"logps/chosen": -0.5775759816169739,
"logps/rejected": -0.9316012859344482,
"loss": 1.5055,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4439398050308228,
"rewards/margins": 0.8850634694099426,
"rewards/rejected": -2.32900333404541,
"step": 250
},
{
"epoch": 0.5364680737376436,
"grad_norm": 6.221843242645264,
"learning_rate": 5.224324151752575e-07,
"logits/chosen": -1.1261907815933228,
"logits/rejected": -1.119322657585144,
"logps/chosen": -0.6372994184494019,
"logps/rejected": -0.7105351686477661,
"loss": 1.5162,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5932482481002808,
"rewards/margins": 0.1830897331237793,
"rewards/rejected": -1.7763381004333496,
"step": 251
},
{
"epoch": 0.5386053967405824,
"grad_norm": 7.211760997772217,
"learning_rate": 5.18695597138163e-07,
"logits/chosen": -1.0601727962493896,
"logits/rejected": -1.0903403759002686,
"logps/chosen": -0.6433329582214355,
"logps/rejected": -0.7051988840103149,
"loss": 1.5038,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6083323955535889,
"rewards/margins": 0.15466496348381042,
"rewards/rejected": -1.7629972696304321,
"step": 252
},
{
"epoch": 0.5407427197435213,
"grad_norm": 11.955366134643555,
"learning_rate": 5.149577330846992e-07,
"logits/chosen": -0.9905640482902527,
"logits/rejected": -0.9827526807785034,
"logps/chosen": -0.460682213306427,
"logps/rejected": -0.5131441950798035,
"loss": 1.5193,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.1517056226730347,
"rewards/margins": 0.13115480542182922,
"rewards/rejected": -1.282860279083252,
"step": 253
},
{
"epoch": 0.5428800427464601,
"grad_norm": 7.990007400512695,
"learning_rate": 5.112190321479025e-07,
"logits/chosen": -1.0986907482147217,
"logits/rejected": -1.047619342803955,
"logps/chosen": -0.42801418900489807,
"logps/rejected": -0.4528927206993103,
"loss": 1.5259,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.070035457611084,
"rewards/margins": 0.062196291983127594,
"rewards/rejected": -1.1322317123413086,
"step": 254
},
{
"epoch": 0.5450173657493989,
"grad_norm": 12.59540843963623,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.088548183441162,
"logits/rejected": -1.094880223274231,
"logps/chosen": -0.44651007652282715,
"logps/rejected": -0.6774522066116333,
"loss": 1.4684,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1162753105163574,
"rewards/margins": 0.577355146408081,
"rewards/rejected": -1.6936304569244385,
"step": 255
},
{
"epoch": 0.5471546887523377,
"grad_norm": 8.063529968261719,
"learning_rate": 5.037399563788664e-07,
"logits/chosen": -1.0905590057373047,
"logits/rejected": -1.0908201932907104,
"logps/chosen": -0.571644127368927,
"logps/rejected": -1.0555922985076904,
"loss": 1.4844,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.429110050201416,
"rewards/margins": 1.2098705768585205,
"rewards/rejected": -2.6389808654785156,
"step": 256
},
{
"epoch": 0.5492920117552765,
"grad_norm": 4.676781177520752,
"learning_rate": 5e-07,
"logits/chosen": -0.9753708839416504,
"logits/rejected": -0.8780065774917603,
"logps/chosen": -0.4151499271392822,
"logps/rejected": -0.6088293790817261,
"loss": 1.5588,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0378748178482056,
"rewards/margins": 0.4841986298561096,
"rewards/rejected": -1.5220733880996704,
"step": 257
},
{
"epoch": 0.5514293347582153,
"grad_norm": 5.617074489593506,
"learning_rate": 4.962600436211335e-07,
"logits/chosen": -1.0411148071289062,
"logits/rejected": -1.0378780364990234,
"logps/chosen": -0.4362824261188507,
"logps/rejected": -0.6974292993545532,
"loss": 1.5426,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0907059907913208,
"rewards/margins": 0.6528674960136414,
"rewards/rejected": -1.743573546409607,
"step": 258
},
{
"epoch": 0.5535666577611541,
"grad_norm": 4.551398754119873,
"learning_rate": 4.925202964923683e-07,
"logits/chosen": -1.2078725099563599,
"logits/rejected": -1.1912176609039307,
"logps/chosen": -0.6027969121932983,
"logps/rejected": -0.6982436180114746,
"loss": 1.4439,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.506992220878601,
"rewards/margins": 0.23861676454544067,
"rewards/rejected": -1.7456090450286865,
"step": 259
},
{
"epoch": 0.555703980764093,
"grad_norm": 7.812004566192627,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -0.9420537948608398,
"logits/rejected": -0.9832956790924072,
"logps/chosen": -0.4232637584209442,
"logps/rejected": -0.5239140391349792,
"loss": 1.531,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.058159351348877,
"rewards/margins": 0.2516256868839264,
"rewards/rejected": -1.30978524684906,
"step": 260
},
{
"epoch": 0.5578413037670318,
"grad_norm": 6.302420139312744,
"learning_rate": 4.850422669153009e-07,
"logits/chosen": -0.898323655128479,
"logits/rejected": -0.8499814867973328,
"logps/chosen": -0.4516730308532715,
"logps/rejected": -0.48423612117767334,
"loss": 1.5732,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1291825771331787,
"rewards/margins": 0.08140772581100464,
"rewards/rejected": -1.2105903625488281,
"step": 261
},
{
"epoch": 0.5599786267699706,
"grad_norm": 4.167732238769531,
"learning_rate": 4.813044028618372e-07,
"logits/chosen": -1.16361403465271,
"logits/rejected": -1.165586233139038,
"logps/chosen": -0.7509727478027344,
"logps/rejected": -1.160327672958374,
"loss": 1.531,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8774319887161255,
"rewards/margins": 1.0233874320983887,
"rewards/rejected": -2.9008195400238037,
"step": 262
},
{
"epoch": 0.5621159497729095,
"grad_norm": 3.631260633468628,
"learning_rate": 4.775675848247427e-07,
"logits/chosen": -1.097744107246399,
"logits/rejected": -1.1007401943206787,
"logps/chosen": -0.6823064088821411,
"logps/rejected": -0.7260396480560303,
"loss": 1.4614,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.705765962600708,
"rewards/margins": 0.10933306813240051,
"rewards/rejected": -1.8150990009307861,
"step": 263
},
{
"epoch": 0.5642532727758482,
"grad_norm": 10.018603324890137,
"learning_rate": 4.7383202187852804e-07,
"logits/chosen": -1.0766931772232056,
"logits/rejected": -1.059554934501648,
"logps/chosen": -0.592974066734314,
"logps/rejected": -0.5639240741729736,
"loss": 1.6513,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.4824351072311401,
"rewards/margins": -0.07262498140335083,
"rewards/rejected": -1.4098100662231445,
"step": 264
},
{
"epoch": 0.566390595778787,
"grad_norm": 10.986533164978027,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.1942667961120605,
"logits/rejected": -1.2509047985076904,
"logps/chosen": -0.5944747924804688,
"logps/rejected": -0.7951716184616089,
"loss": 1.5124,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.486187219619751,
"rewards/margins": 0.5017418265342712,
"rewards/rejected": -1.987929105758667,
"step": 265
},
{
"epoch": 0.5685279187817259,
"grad_norm": 7.592911720275879,
"learning_rate": 4.6636549719398016e-07,
"logits/chosen": -1.0581759214401245,
"logits/rejected": -1.1627366542816162,
"logps/chosen": -0.4522143006324768,
"logps/rejected": -0.7358817458152771,
"loss": 1.518,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1305358409881592,
"rewards/margins": 0.7091686129570007,
"rewards/rejected": -1.8397043943405151,
"step": 266
},
{
"epoch": 0.5706652417846647,
"grad_norm": 6.3432488441467285,
"learning_rate": 4.626349532067879e-07,
"logits/chosen": -1.2379893064498901,
"logits/rejected": -1.2148290872573853,
"logps/chosen": -0.6970117092132568,
"logps/rejected": -0.6927489638328552,
"loss": 1.5069,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7425293922424316,
"rewards/margins": -0.010656729340553284,
"rewards/rejected": -1.73187255859375,
"step": 267
},
{
"epoch": 0.5728025647876035,
"grad_norm": 17.149002075195312,
"learning_rate": 4.5890649978938487e-07,
"logits/chosen": -1.0269464254379272,
"logits/rejected": -0.992821216583252,
"logps/chosen": -0.5960928201675415,
"logps/rejected": -0.9721090793609619,
"loss": 1.4951,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.490232229232788,
"rewards/margins": 0.940040647983551,
"rewards/rejected": -2.4302728176116943,
"step": 268
},
{
"epoch": 0.5749398877905424,
"grad_norm": 14.956734657287598,
"learning_rate": 4.5518034554828327e-07,
"logits/chosen": -1.1823533773422241,
"logits/rejected": -1.1165564060211182,
"logps/chosen": -0.6763143539428711,
"logps/rejected": -0.6308431029319763,
"loss": 1.6765,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.6907857656478882,
"rewards/margins": -0.11367809772491455,
"rewards/rejected": -1.5771077871322632,
"step": 269
},
{
"epoch": 0.5770772107934812,
"grad_norm": 10.486448287963867,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -0.9095754027366638,
"logits/rejected": -0.9553719162940979,
"logps/chosen": -0.4061740040779114,
"logps/rejected": -0.5486086010932922,
"loss": 1.6421,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0154350996017456,
"rewards/margins": 0.3560864329338074,
"rewards/rejected": -1.3715215921401978,
"step": 270
},
{
"epoch": 0.57921453379642,
"grad_norm": 6.875598907470703,
"learning_rate": 4.477357683661733e-07,
"logits/chosen": -1.016709327697754,
"logits/rejected": -1.0593208074569702,
"logps/chosen": -0.9499566555023193,
"logps/rejected": -1.4164179563522339,
"loss": 1.4581,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.374891519546509,
"rewards/margins": 1.1661533117294312,
"rewards/rejected": -3.5410449504852295,
"step": 271
},
{
"epoch": 0.5813518567993589,
"grad_norm": 4.006314754486084,
"learning_rate": 4.4401776194834603e-07,
"logits/chosen": -1.007668375968933,
"logits/rejected": -0.886061429977417,
"logps/chosen": -0.8610125780105591,
"logps/rejected": -0.9118555188179016,
"loss": 1.5326,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.152531385421753,
"rewards/margins": 0.12710730731487274,
"rewards/rejected": -2.2796387672424316,
"step": 272
},
{
"epoch": 0.5834891798022976,
"grad_norm": 5.665445804595947,
"learning_rate": 4.403028877298779e-07,
"logits/chosen": -0.9741629362106323,
"logits/rejected": -0.9361596703529358,
"logps/chosen": -0.6918255090713501,
"logps/rejected": -0.7569224834442139,
"loss": 1.4643,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.729563593864441,
"rewards/margins": 0.16274265944957733,
"rewards/rejected": -1.8923062086105347,
"step": 273
},
{
"epoch": 0.5856265028052364,
"grad_norm": 6.989828109741211,
"learning_rate": 4.3659135355752593e-07,
"logits/chosen": -0.9656753540039062,
"logits/rejected": -0.9632652997970581,
"logps/chosen": -0.719750702381134,
"logps/rejected": -0.7859822511672974,
"loss": 1.6282,
"rewards/accuracies": 0.1875,
"rewards/chosen": -1.7993768453598022,
"rewards/margins": 0.16557902097702026,
"rewards/rejected": -1.9649556875228882,
"step": 274
},
{
"epoch": 0.5877638258081752,
"grad_norm": 13.183172225952148,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -0.7260496616363525,
"logits/rejected": -0.6252482533454895,
"logps/chosen": -0.7889982461929321,
"logps/rejected": -0.7441343069076538,
"loss": 1.5726,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.9724955558776855,
"rewards/margins": -0.11215980350971222,
"rewards/rejected": -1.8603358268737793,
"step": 275
},
{
"epoch": 0.5899011488111141,
"grad_norm": 4.624598979949951,
"learning_rate": 4.2917913579220553e-07,
"logits/chosen": -1.2080626487731934,
"logits/rejected": -1.0251206159591675,
"logps/chosen": -0.6163781881332397,
"logps/rejected": -0.5924357175827026,
"loss": 1.5521,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.5409454107284546,
"rewards/margins": -0.05985613912343979,
"rewards/rejected": -1.4810893535614014,
"step": 276
},
{
"epoch": 0.5920384718140529,
"grad_norm": 5.583042144775391,
"learning_rate": 4.254788669119127e-07,
"logits/chosen": -1.2352536916732788,
"logits/rejected": -1.1895160675048828,
"logps/chosen": -0.6844548583030701,
"logps/rejected": -1.2121189832687378,
"loss": 1.5148,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.711137056350708,
"rewards/margins": 1.3191602230072021,
"rewards/rejected": -3.0302975177764893,
"step": 277
},
{
"epoch": 0.5941757948169917,
"grad_norm": 7.106680393218994,
"learning_rate": 4.2178276747988444e-07,
"logits/chosen": -1.0981608629226685,
"logits/rejected": -1.097581148147583,
"logps/chosen": -0.7495326995849609,
"logps/rejected": -0.9760425090789795,
"loss": 1.556,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8738317489624023,
"rewards/margins": 0.5662744641304016,
"rewards/rejected": -2.440106153488159,
"step": 278
},
{
"epoch": 0.5963131178199306,
"grad_norm": 18.446353912353516,
"learning_rate": 4.180910442924311e-07,
"logits/chosen": -1.1512420177459717,
"logits/rejected": -1.117828607559204,
"logps/chosen": -0.7383052110671997,
"logps/rejected": -0.5387950539588928,
"loss": 1.5904,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.845763087272644,
"rewards/margins": -0.49877557158470154,
"rewards/rejected": -1.3469874858856201,
"step": 279
},
{
"epoch": 0.5984504408228694,
"grad_norm": 7.653874397277832,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -0.9732711315155029,
"logits/rejected": -0.9874970316886902,
"logps/chosen": -0.5405304431915283,
"logps/rejected": -0.5719125866889954,
"loss": 1.4652,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.3513259887695312,
"rewards/margins": 0.07845549285411835,
"rewards/rejected": -1.429781436920166,
"step": 280
},
{
"epoch": 0.6005877638258081,
"grad_norm": 21.542905807495117,
"learning_rate": 4.107215526006817e-07,
"logits/chosen": -0.9731748700141907,
"logits/rejected": -0.8227044939994812,
"logps/chosen": -0.7879041433334351,
"logps/rejected": -1.0974005460739136,
"loss": 1.5757,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9697604179382324,
"rewards/margins": 0.773740828037262,
"rewards/rejected": -2.7435011863708496,
"step": 281
},
{
"epoch": 0.602725086828747,
"grad_norm": 23.93320655822754,
"learning_rate": 4.070441964185427e-07,
"logits/chosen": -0.95560622215271,
"logits/rejected": -0.8649751543998718,
"logps/chosen": -0.7008312940597534,
"logps/rejected": -0.8380212187767029,
"loss": 1.4773,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.7520781755447388,
"rewards/margins": 0.34297484159469604,
"rewards/rejected": -2.09505295753479,
"step": 282
},
{
"epoch": 0.6048624098316858,
"grad_norm": 8.638965606689453,
"learning_rate": 4.0337204110222347e-07,
"logits/chosen": -1.095788598060608,
"logits/rejected": -1.0814367532730103,
"logps/chosen": -1.298113226890564,
"logps/rejected": -0.9762169718742371,
"loss": 1.6721,
"rewards/accuracies": 0.5,
"rewards/chosen": -3.2452828884124756,
"rewards/margins": -0.804740309715271,
"rewards/rejected": -2.440542459487915,
"step": 283
},
{
"epoch": 0.6069997328346246,
"grad_norm": 10.137685775756836,
"learning_rate": 3.997052921083636e-07,
"logits/chosen": -0.9736285209655762,
"logits/rejected": -0.9580354690551758,
"logps/chosen": -0.760413408279419,
"logps/rejected": -0.6742191910743713,
"loss": 1.6567,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9010334014892578,
"rewards/margins": -0.2154853194952011,
"rewards/rejected": -1.6855480670928955,
"step": 284
},
{
"epoch": 0.6091370558375635,
"grad_norm": 9.106868743896484,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.0179665088653564,
"logits/rejected": -1.0651259422302246,
"logps/chosen": -0.6087648272514343,
"logps/rejected": -0.8786913156509399,
"loss": 1.4928,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5219119787216187,
"rewards/margins": 0.6748162508010864,
"rewards/rejected": -2.196728467941284,
"step": 285
},
{
"epoch": 0.6112743788405023,
"grad_norm": 26.156328201293945,
"learning_rate": 3.92388833390689e-07,
"logits/chosen": -0.7999076843261719,
"logits/rejected": -0.8813593983650208,
"logps/chosen": -0.564832329750061,
"logps/rejected": -0.6178359985351562,
"loss": 1.5268,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4120807647705078,
"rewards/margins": 0.13250917196273804,
"rewards/rejected": -1.544589877128601,
"step": 286
},
{
"epoch": 0.6134117018434411,
"grad_norm": 6.264410018920898,
"learning_rate": 3.8873953302184283e-07,
"logits/chosen": -0.9414565563201904,
"logits/rejected": -0.8855915069580078,
"logps/chosen": -0.5107077360153198,
"logps/rejected": -0.4826885461807251,
"loss": 1.6146,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2767692804336548,
"rewards/margins": -0.0700480192899704,
"rewards/rejected": -1.206721305847168,
"step": 287
},
{
"epoch": 0.6155490248463799,
"grad_norm": 4.398477554321289,
"learning_rate": 3.8509645766249034e-07,
"logits/chosen": -1.208186388015747,
"logits/rejected": -1.1303879022598267,
"logps/chosen": -0.6073251962661743,
"logps/rejected": -0.7171632647514343,
"loss": 1.4436,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.518312931060791,
"rewards/margins": 0.27459537982940674,
"rewards/rejected": -1.7929084300994873,
"step": 288
},
{
"epoch": 0.6176863478493188,
"grad_norm": 7.238714218139648,
"learning_rate": 3.814598111422513e-07,
"logits/chosen": -1.1826701164245605,
"logits/rejected": -1.10812246799469,
"logps/chosen": -0.5312564373016357,
"logps/rejected": -0.8379625678062439,
"loss": 1.4728,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3281410932540894,
"rewards/margins": 0.7667654752731323,
"rewards/rejected": -2.0949063301086426,
"step": 289
},
{
"epoch": 0.6198236708522575,
"grad_norm": 9.654561042785645,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.042212724685669,
"logits/rejected": -1.0525404214859009,
"logps/chosen": -0.48385101556777954,
"logps/rejected": -0.5251243114471436,
"loss": 1.5846,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2096275091171265,
"rewards/margins": 0.10318319499492645,
"rewards/rejected": -1.3128107786178589,
"step": 290
},
{
"epoch": 0.6219609938551963,
"grad_norm": 7.571155071258545,
"learning_rate": 3.742066181277457e-07,
"logits/chosen": -1.013511061668396,
"logits/rejected": -1.0030831098556519,
"logps/chosen": -0.518464207649231,
"logps/rejected": -0.6727429628372192,
"loss": 1.4901,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2961605787277222,
"rewards/margins": 0.3856966495513916,
"rewards/rejected": -1.6818573474884033,
"step": 291
},
{
"epoch": 0.6240983168581352,
"grad_norm": 14.75028133392334,
"learning_rate": 3.7059047744873955e-07,
"logits/chosen": -1.0597176551818848,
"logits/rejected": -1.0602428913116455,
"logps/chosen": -0.406850129365921,
"logps/rejected": -0.6300184726715088,
"loss": 1.5866,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0171253681182861,
"rewards/margins": 0.5579207539558411,
"rewards/rejected": -1.5750460624694824,
"step": 292
},
{
"epoch": 0.626235639861074,
"grad_norm": 15.745006561279297,
"learning_rate": 3.669815772166625e-07,
"logits/chosen": -1.2174315452575684,
"logits/rejected": -1.1674164533615112,
"logps/chosen": -0.7850491404533386,
"logps/rejected": -0.519411027431488,
"loss": 1.6349,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.9626227617263794,
"rewards/margins": -0.6640951633453369,
"rewards/rejected": -1.298527717590332,
"step": 293
},
{
"epoch": 0.6283729628640128,
"grad_norm": 5.319033622741699,
"learning_rate": 3.6338011934904e-07,
"logits/chosen": -1.2410860061645508,
"logits/rejected": -1.2556794881820679,
"logps/chosen": -0.5623264908790588,
"logps/rejected": -0.6723726987838745,
"loss": 1.5504,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4058163166046143,
"rewards/margins": 0.27511537075042725,
"rewards/rejected": -1.680931568145752,
"step": 294
},
{
"epoch": 0.6305102858669517,
"grad_norm": 6.130387783050537,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.0128402709960938,
"logits/rejected": -0.9643303155899048,
"logps/chosen": -0.7096951007843018,
"logps/rejected": -0.9845619797706604,
"loss": 1.4712,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.7742375135421753,
"rewards/margins": 0.6871672868728638,
"rewards/rejected": -2.461405038833618,
"step": 295
},
{
"epoch": 0.6326476088698905,
"grad_norm": 5.171699047088623,
"learning_rate": 3.562003362839914e-07,
"logits/chosen": -0.9853148460388184,
"logits/rejected": -0.9575502872467041,
"logps/chosen": -0.5504204034805298,
"logps/rejected": -0.6407392621040344,
"loss": 1.4703,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3760509490966797,
"rewards/margins": 0.22579708695411682,
"rewards/rejected": -1.6018481254577637,
"step": 296
},
{
"epoch": 0.6347849318728293,
"grad_norm": 6.909521102905273,
"learning_rate": 3.526224127945478e-07,
"logits/chosen": -1.0654948949813843,
"logits/rejected": -0.9859604239463806,
"logps/chosen": -0.49610620737075806,
"logps/rejected": -0.657010018825531,
"loss": 1.5083,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2402657270431519,
"rewards/margins": 0.40225934982299805,
"rewards/rejected": -1.6425249576568604,
"step": 297
},
{
"epoch": 0.6369222548757681,
"grad_norm": 8.302495956420898,
"learning_rate": 3.49052735063049e-07,
"logits/chosen": -0.9159524440765381,
"logits/rejected": -0.9115422964096069,
"logps/chosen": -0.47944843769073486,
"logps/rejected": -0.8350514769554138,
"loss": 1.4795,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.198621153831482,
"rewards/margins": 0.8890076875686646,
"rewards/rejected": -2.0876286029815674,
"step": 298
},
{
"epoch": 0.6390595778787069,
"grad_norm": 8.518818855285645,
"learning_rate": 3.454915028125263e-07,
"logits/chosen": -1.1150455474853516,
"logits/rejected": -1.0280684232711792,
"logps/chosen": -0.6259050965309143,
"logps/rejected": -0.7404144406318665,
"loss": 1.6097,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5647625923156738,
"rewards/margins": 0.2862735688686371,
"rewards/rejected": -1.8510361909866333,
"step": 299
},
{
"epoch": 0.6411969008816457,
"grad_norm": 5.814731597900391,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -0.8910290002822876,
"logits/rejected": -0.887702465057373,
"logps/chosen": -0.5074589848518372,
"logps/rejected": -0.5351721048355103,
"loss": 1.5944,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2686476707458496,
"rewards/margins": 0.06928272545337677,
"rewards/rejected": -1.3379302024841309,
"step": 300
},
{
"epoch": 0.6433342238845846,
"grad_norm": 6.027761459350586,
"learning_rate": 3.3839517127277004e-07,
"logits/chosen": -0.9173066020011902,
"logits/rejected": -0.8492714762687683,
"logps/chosen": -0.5284210443496704,
"logps/rejected": -0.9472272992134094,
"loss": 1.4989,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3210527896881104,
"rewards/margins": 1.04701566696167,
"rewards/rejected": -2.368068218231201,
"step": 301
},
{
"epoch": 0.6454715468875234,
"grad_norm": 14.567846298217773,
"learning_rate": 3.348604690224166e-07,
"logits/chosen": -1.006145715713501,
"logits/rejected": -1.1280263662338257,
"logps/chosen": -0.552309513092041,
"logps/rejected": -0.8673663139343262,
"loss": 1.4528,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3807737827301025,
"rewards/margins": 0.7876418232917786,
"rewards/rejected": -2.1684157848358154,
"step": 302
},
{
"epoch": 0.6476088698904622,
"grad_norm": 6.112685203552246,
"learning_rate": 3.31335006308585e-07,
"logits/chosen": -1.1263082027435303,
"logits/rejected": -1.1103566884994507,
"logps/chosen": -0.5413545966148376,
"logps/rejected": -0.5674502849578857,
"loss": 1.5207,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3533867597579956,
"rewards/margins": 0.06523902714252472,
"rewards/rejected": -1.4186255931854248,
"step": 303
},
{
"epoch": 0.649746192893401,
"grad_norm": 8.962607383728027,
"learning_rate": 3.2781898038048237e-07,
"logits/chosen": -1.2004753351211548,
"logits/rejected": -1.023838758468628,
"logps/chosen": -0.4371573030948639,
"logps/rejected": -0.4363090991973877,
"loss": 1.5006,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.092893362045288,
"rewards/margins": -0.0021204352378845215,
"rewards/rejected": -1.0907728672027588,
"step": 304
},
{
"epoch": 0.6518835158963399,
"grad_norm": 4.282663822174072,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.0454260110855103,
"logits/rejected": -1.0447431802749634,
"logps/chosen": -0.624133288860321,
"logps/rejected": -0.8872759342193604,
"loss": 1.4943,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5603333711624146,
"rewards/margins": 0.6578565239906311,
"rewards/rejected": -2.2181897163391113,
"step": 305
},
{
"epoch": 0.6540208388992786,
"grad_norm": 5.838572978973389,
"learning_rate": 3.2081602522734985e-07,
"logits/chosen": -1.0571078062057495,
"logits/rejected": -1.1394490003585815,
"logps/chosen": -0.5408369302749634,
"logps/rejected": -0.632168710231781,
"loss": 1.4793,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3520923852920532,
"rewards/margins": 0.22832949459552765,
"rewards/rejected": -1.580422043800354,
"step": 306
},
{
"epoch": 0.6561581619022174,
"grad_norm": 8.524316787719727,
"learning_rate": 3.173294878168025e-07,
"logits/chosen": -1.079796552658081,
"logits/rejected": -1.080225944519043,
"logps/chosen": -0.5492815971374512,
"logps/rejected": -0.7810524702072144,
"loss": 1.5853,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.373204231262207,
"rewards/margins": 0.5794269442558289,
"rewards/rejected": -1.9526311159133911,
"step": 307
},
{
"epoch": 0.6582954849051563,
"grad_norm": 4.74807071685791,
"learning_rate": 3.138531707990274e-07,
"logits/chosen": -0.903827428817749,
"logits/rejected": -1.0083296298980713,
"logps/chosen": -0.5046001672744751,
"logps/rejected": -0.6112987995147705,
"loss": 1.4597,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2615002393722534,
"rewards/margins": 0.2667466402053833,
"rewards/rejected": -1.5282469987869263,
"step": 308
},
{
"epoch": 0.6604328079080951,
"grad_norm": 5.756943702697754,
"learning_rate": 3.1038726867353583e-07,
"logits/chosen": -1.2377115488052368,
"logits/rejected": -1.2392162084579468,
"logps/chosen": -0.5622467398643494,
"logps/rejected": -0.7166787385940552,
"loss": 1.643,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4056168794631958,
"rewards/margins": 0.3860801160335541,
"rewards/rejected": -1.7916970252990723,
"step": 309
},
{
"epoch": 0.6625701309110339,
"grad_norm": 10.2656831741333,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -0.8976331949234009,
"logits/rejected": -0.9083345532417297,
"logps/chosen": -0.45293816924095154,
"logps/rejected": -0.57112717628479,
"loss": 1.6145,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1323453187942505,
"rewards/margins": 0.29547253251075745,
"rewards/rejected": -1.427817940711975,
"step": 310
},
{
"epoch": 0.6647074539139728,
"grad_norm": 9.763343811035156,
"learning_rate": 3.034874841730382e-07,
"logits/chosen": -1.4269440174102783,
"logits/rejected": -1.2256879806518555,
"logps/chosen": -0.48573237657546997,
"logps/rejected": -0.7592737674713135,
"loss": 1.5495,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.214331030845642,
"rewards/margins": 0.6838533878326416,
"rewards/rejected": -1.8981844186782837,
"step": 311
},
{
"epoch": 0.6668447769169116,
"grad_norm": 5.243341445922852,
"learning_rate": 3.000539878401296e-07,
"logits/chosen": -1.1378743648529053,
"logits/rejected": -1.1113413572311401,
"logps/chosen": -0.5226577520370483,
"logps/rejected": -0.47466573119163513,
"loss": 1.538,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3066444396972656,
"rewards/margins": -0.11998005956411362,
"rewards/rejected": -1.186664342880249,
"step": 312
},
{
"epoch": 0.6689820999198504,
"grad_norm": 8.089217185974121,
"learning_rate": 2.9663167846209996e-07,
"logits/chosen": -1.2200907468795776,
"logits/rejected": -1.200815200805664,
"logps/chosen": -0.5275171399116516,
"logps/rejected": -0.6412789821624756,
"loss": 1.4825,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3187928199768066,
"rewards/margins": 0.28440454602241516,
"rewards/rejected": -1.6031973361968994,
"step": 313
},
{
"epoch": 0.6711194229227893,
"grad_norm": 5.0957417488098145,
"learning_rate": 2.9322074751673974e-07,
"logits/chosen": -1.0893193483352661,
"logits/rejected": -1.135164499282837,
"logps/chosen": -0.573096752166748,
"logps/rejected": -0.6840115785598755,
"loss": 1.5689,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.4327419996261597,
"rewards/margins": 0.27728694677352905,
"rewards/rejected": -1.710028886795044,
"step": 314
},
{
"epoch": 0.673256745925728,
"grad_norm": 6.795984745025635,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.0072894096374512,
"logits/rejected": -0.949766993522644,
"logps/chosen": -0.6433035135269165,
"logps/rejected": -0.7086694836616516,
"loss": 1.5963,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6082587242126465,
"rewards/margins": 0.16341499984264374,
"rewards/rejected": -1.7716736793518066,
"step": 315
},
{
"epoch": 0.6753940689286668,
"grad_norm": 5.6735053062438965,
"learning_rate": 2.864337836414018e-07,
"logits/chosen": -0.9407764077186584,
"logits/rejected": -1.0202523469924927,
"logps/chosen": -0.47564423084259033,
"logps/rejected": -0.8013235330581665,
"loss": 1.5025,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1891103982925415,
"rewards/margins": 0.8141986131668091,
"rewards/rejected": -2.0033090114593506,
"step": 316
},
{
"epoch": 0.6775313919316056,
"grad_norm": 4.4380059242248535,
"learning_rate": 2.8305813044122093e-07,
"logits/chosen": -1.073567271232605,
"logits/rejected": -1.010934591293335,
"logps/chosen": -0.412260502576828,
"logps/rejected": -0.5673598051071167,
"loss": 1.4443,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.030651330947876,
"rewards/margins": 0.38774824142456055,
"rewards/rejected": -1.4183995723724365,
"step": 317
},
{
"epoch": 0.6796687149345445,
"grad_norm": 11.309561729431152,
"learning_rate": 2.7969461511205806e-07,
"logits/chosen": -1.146227478981018,
"logits/rejected": -1.0426236391067505,
"logps/chosen": -0.766873300075531,
"logps/rejected": -0.9035167694091797,
"loss": 1.4907,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9171831607818604,
"rewards/margins": 0.34160885214805603,
"rewards/rejected": -2.258791923522949,
"step": 318
},
{
"epoch": 0.6818060379374833,
"grad_norm": 6.63892126083374,
"learning_rate": 2.763434258421836e-07,
"logits/chosen": -1.126558542251587,
"logits/rejected": -1.1528400182724,
"logps/chosen": -0.5386890172958374,
"logps/rejected": -0.9026880264282227,
"loss": 1.5645,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3467226028442383,
"rewards/margins": 0.9099973440170288,
"rewards/rejected": -2.2567198276519775,
"step": 319
},
{
"epoch": 0.6839433609404221,
"grad_norm": 5.889532566070557,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.1312450170516968,
"logits/rejected": -1.0725492238998413,
"logps/chosen": -0.4850122928619385,
"logps/rejected": -0.5434764623641968,
"loss": 1.5617,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2125308513641357,
"rewards/margins": 0.14616040885448456,
"rewards/rejected": -1.3586912155151367,
"step": 320
},
{
"epoch": 0.686080683943361,
"grad_norm": 4.353449821472168,
"learning_rate": 2.696787747746839e-07,
"logits/chosen": -0.8624957203865051,
"logits/rejected": -0.8295655846595764,
"logps/chosen": -0.40189000964164734,
"logps/rejected": -0.47676557302474976,
"loss": 1.4684,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0047250986099243,
"rewards/margins": 0.18718883395195007,
"rewards/rejected": -1.1919138431549072,
"step": 321
},
{
"epoch": 0.6882180069462998,
"grad_norm": 9.558797836303711,
"learning_rate": 2.6636568586346897e-07,
"logits/chosen": -0.9744606614112854,
"logits/rejected": -0.9377647042274475,
"logps/chosen": -0.5311119556427002,
"logps/rejected": -0.5584622621536255,
"loss": 1.5027,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.3277798891067505,
"rewards/margins": 0.06837557256221771,
"rewards/rejected": -1.3961554765701294,
"step": 322
},
{
"epoch": 0.6903553299492385,
"grad_norm": 5.22324275970459,
"learning_rate": 2.6306566876350067e-07,
"logits/chosen": -1.044816017150879,
"logits/rejected": -1.068107008934021,
"logps/chosen": -0.5444616079330444,
"logps/rejected": -0.6575828790664673,
"loss": 1.5442,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3611540794372559,
"rewards/margins": 0.2828032672405243,
"rewards/rejected": -1.6439573764801025,
"step": 323
},
{
"epoch": 0.6924926529521774,
"grad_norm": 5.757807731628418,
"learning_rate": 2.597789081103313e-07,
"logits/chosen": -1.0435606241226196,
"logits/rejected": -1.078909158706665,
"logps/chosen": -0.4910418689250946,
"logps/rejected": -0.6150948405265808,
"loss": 1.5089,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.227604627609253,
"rewards/margins": 0.3101324737071991,
"rewards/rejected": -1.5377371311187744,
"step": 324
},
{
"epoch": 0.6946299759551162,
"grad_norm": 5.650123119354248,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -0.9889379739761353,
"logits/rejected": -1.084942102432251,
"logps/chosen": -0.5245276689529419,
"logps/rejected": -0.564633846282959,
"loss": 1.4216,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.311319351196289,
"rewards/margins": 0.10026533901691437,
"rewards/rejected": -1.4115846157073975,
"step": 325
},
{
"epoch": 0.696767298958055,
"grad_norm": 5.227182865142822,
"learning_rate": 2.5324589096782656e-07,
"logits/chosen": -1.1572788953781128,
"logits/rejected": -0.9472928047180176,
"logps/chosen": -0.6365960836410522,
"logps/rejected": -0.8092702031135559,
"loss": 1.5535,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.5914900302886963,
"rewards/margins": 0.43168535828590393,
"rewards/rejected": -2.0231754779815674,
"step": 326
},
{
"epoch": 0.6989046219609939,
"grad_norm": 9.626143455505371,
"learning_rate": 2.500000000000001e-07,
"logits/chosen": -0.9521257281303406,
"logits/rejected": -1.0960386991500854,
"logps/chosen": -0.5820955634117126,
"logps/rejected": -0.8665407299995422,
"loss": 1.5325,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.455238938331604,
"rewards/margins": 0.7111131548881531,
"rewards/rejected": -2.1663520336151123,
"step": 327
},
{
"epoch": 0.7010419449639327,
"grad_norm": 8.55288028717041,
"learning_rate": 2.467680965015387e-07,
"logits/chosen": -1.1446974277496338,
"logits/rejected": -1.0407241582870483,
"logps/chosen": -0.579289972782135,
"logps/rejected": -0.6028792858123779,
"loss": 1.5148,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4482249021530151,
"rewards/margins": 0.058973364531993866,
"rewards/rejected": -1.5071980953216553,
"step": 328
},
{
"epoch": 0.7031792679668715,
"grad_norm": 9.617560386657715,
"learning_rate": 2.435503612970469e-07,
"logits/chosen": -1.177872657775879,
"logits/rejected": -1.0678505897521973,
"logps/chosen": -0.5095410346984863,
"logps/rejected": -0.559869110584259,
"loss": 1.4961,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2738525867462158,
"rewards/margins": 0.12582018971443176,
"rewards/rejected": -1.3996728658676147,
"step": 329
},
{
"epoch": 0.7053165909698104,
"grad_norm": 6.568669319152832,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.017891764640808,
"logits/rejected": -1.0297455787658691,
"logps/chosen": -0.552146852016449,
"logps/rejected": -1.3806557655334473,
"loss": 1.3951,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3803672790527344,
"rewards/margins": 2.071272373199463,
"rewards/rejected": -3.4516396522521973,
"step": 330
},
{
"epoch": 0.7074539139727491,
"grad_norm": 5.693061828613281,
"learning_rate": 2.371581150947476e-07,
"logits/chosen": -1.121291160583496,
"logits/rejected": -1.0978386402130127,
"logps/chosen": -0.7388095855712891,
"logps/rejected": -0.7743204236030579,
"loss": 1.4663,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.8470239639282227,
"rewards/margins": 0.08877717703580856,
"rewards/rejected": -1.9358010292053223,
"step": 331
},
{
"epoch": 0.7095912369756879,
"grad_norm": 6.6337151527404785,
"learning_rate": 2.3398396174233176e-07,
"logits/chosen": -1.0051803588867188,
"logits/rejected": -1.1046079397201538,
"logps/chosen": -0.5532403588294983,
"logps/rejected": -0.8187743425369263,
"loss": 1.547,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.383100986480713,
"rewards/margins": 0.6638349294662476,
"rewards/rejected": -2.046935796737671,
"step": 332
},
{
"epoch": 0.7117285599786267,
"grad_norm": 7.117689609527588,
"learning_rate": 2.3082469195465893e-07,
"logits/chosen": -1.0851317644119263,
"logits/rejected": -1.0983937978744507,
"logps/chosen": -0.4725731611251831,
"logps/rejected": -0.5834302306175232,
"loss": 1.3948,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.181432843208313,
"rewards/margins": 0.2771427631378174,
"rewards/rejected": -1.4585756063461304,
"step": 333
},
{
"epoch": 0.7138658829815656,
"grad_norm": 6.473294258117676,
"learning_rate": 2.2768048249248644e-07,
"logits/chosen": -0.9775727987289429,
"logits/rejected": -0.9463880658149719,
"logps/chosen": -0.6236757040023804,
"logps/rejected": -0.6484177112579346,
"loss": 1.512,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.5591893196105957,
"rewards/margins": 0.061854973435401917,
"rewards/rejected": -1.6210441589355469,
"step": 334
},
{
"epoch": 0.7160032059845044,
"grad_norm": 5.626604080200195,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.044307827949524,
"logits/rejected": -1.1238620281219482,
"logps/chosen": -0.5363501906394958,
"logps/rejected": -0.6088119745254517,
"loss": 1.5171,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.340875506401062,
"rewards/margins": 0.18115444481372833,
"rewards/rejected": -1.5220298767089844,
"step": 335
},
{
"epoch": 0.7181405289874432,
"grad_norm": 7.3833513259887695,
"learning_rate": 2.2143794736471388e-07,
"logits/chosen": -1.03908371925354,
"logits/rejected": -1.1508772373199463,
"logps/chosen": -0.46650949120521545,
"logps/rejected": -0.8443108797073364,
"loss": 1.4405,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.166273593902588,
"rewards/margins": 0.944503664970398,
"rewards/rejected": -2.1107773780822754,
"step": 336
},
{
"epoch": 0.7202778519903821,
"grad_norm": 6.9946184158325195,
"learning_rate": 2.1833997096818895e-07,
"logits/chosen": -1.017868161201477,
"logits/rejected": -0.9449481964111328,
"logps/chosen": -0.5905267000198364,
"logps/rejected": -0.6067604422569275,
"loss": 1.6098,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4763169288635254,
"rewards/margins": 0.04058411717414856,
"rewards/rejected": -1.5169010162353516,
"step": 337
},
{
"epoch": 0.7224151749933209,
"grad_norm": 6.809935569763184,
"learning_rate": 2.1525775341577402e-07,
"logits/chosen": -0.874271035194397,
"logits/rejected": -0.7729781270027161,
"logps/chosen": -0.42772772908210754,
"logps/rejected": -0.5716719627380371,
"loss": 1.3901,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0693193674087524,
"rewards/margins": 0.35986068844795227,
"rewards/rejected": -1.4291800260543823,
"step": 338
},
{
"epoch": 0.7245524979962596,
"grad_norm": 5.1241021156311035,
"learning_rate": 2.121914671571633e-07,
"logits/chosen": -1.1260241270065308,
"logits/rejected": -1.147855520248413,
"logps/chosen": -0.6018639206886292,
"logps/rejected": -0.8497560024261475,
"loss": 1.574,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5046600103378296,
"rewards/margins": 0.6197301149368286,
"rewards/rejected": -2.124390125274658,
"step": 339
},
{
"epoch": 0.7266898209991985,
"grad_norm": 11.285929679870605,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.0443346500396729,
"logits/rejected": -0.9825816750526428,
"logps/chosen": -0.5199556350708008,
"logps/rejected": -0.5952978134155273,
"loss": 1.5966,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.299889087677002,
"rewards/margins": 0.1883554458618164,
"rewards/rejected": -1.4882445335388184,
"step": 340
},
{
"epoch": 0.7288271440021373,
"grad_norm": 6.03069543838501,
"learning_rate": 2.0610737385376348e-07,
"logits/chosen": -1.0298320055007935,
"logits/rejected": -0.9670383930206299,
"logps/chosen": -0.5948835015296936,
"logps/rejected": -0.7645187377929688,
"loss": 1.4949,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.4872087240219116,
"rewards/margins": 0.42408809065818787,
"rewards/rejected": -1.9112968444824219,
"step": 341
},
{
"epoch": 0.7309644670050761,
"grad_norm": 8.48175048828125,
"learning_rate": 2.0308990721324926e-07,
"logits/chosen": -0.7453219294548035,
"logits/rejected": -0.7619246244430542,
"logps/chosen": -0.4561096131801605,
"logps/rejected": -0.5257768034934998,
"loss": 1.477,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1402740478515625,
"rewards/margins": 0.17416785657405853,
"rewards/rejected": -1.3144419193267822,
"step": 342
},
{
"epoch": 0.733101790008015,
"grad_norm": 4.618956089019775,
"learning_rate": 2.0008905265604315e-07,
"logits/chosen": -1.058908462524414,
"logits/rejected": -1.0160064697265625,
"logps/chosen": -0.7591407895088196,
"logps/rejected": -0.741392195224762,
"loss": 1.4273,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.8978519439697266,
"rewards/margins": -0.04437139630317688,
"rewards/rejected": -1.853480339050293,
"step": 343
},
{
"epoch": 0.7352391130109538,
"grad_norm": 4.5929155349731445,
"learning_rate": 1.971049780795901e-07,
"logits/chosen": -1.0634278059005737,
"logits/rejected": -1.0275781154632568,
"logps/chosen": -0.6927691102027893,
"logps/rejected": -0.7188006639480591,
"loss": 1.4542,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.7319227457046509,
"rewards/margins": 0.06507880985736847,
"rewards/rejected": -1.7970017194747925,
"step": 344
},
{
"epoch": 0.7373764360138926,
"grad_norm": 7.66823673248291,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.0500935316085815,
"logits/rejected": -0.9972812533378601,
"logps/chosen": -0.6515956521034241,
"logps/rejected": -0.7696123123168945,
"loss": 1.509,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.6289891004562378,
"rewards/margins": 0.29504185914993286,
"rewards/rejected": -1.9240310192108154,
"step": 345
},
{
"epoch": 0.7395137590168315,
"grad_norm": 13.36498737335205,
"learning_rate": 1.9118783575519109e-07,
"logits/chosen": -1.2645362615585327,
"logits/rejected": -1.1548783779144287,
"logps/chosen": -0.6232104301452637,
"logps/rejected": -0.6466085910797119,
"loss": 1.5575,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5580260753631592,
"rewards/margins": 0.058495476841926575,
"rewards/rejected": -1.6165215969085693,
"step": 346
},
{
"epoch": 0.7416510820197703,
"grad_norm": 6.211390018463135,
"learning_rate": 1.8825509907063326e-07,
"logits/chosen": -0.987378716468811,
"logits/rejected": -0.9803218841552734,
"logps/chosen": -0.6375169157981873,
"logps/rejected": -0.8728115558624268,
"loss": 1.518,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.593792200088501,
"rewards/margins": 0.5882365107536316,
"rewards/rejected": -2.1820287704467773,
"step": 347
},
{
"epoch": 0.743788405022709,
"grad_norm": 7.908841609954834,
"learning_rate": 1.8533980447508135e-07,
"logits/chosen": -0.9738492965698242,
"logits/rejected": -1.0749965906143188,
"logps/chosen": -0.43783146142959595,
"logps/rejected": -0.7721287608146667,
"loss": 1.4855,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.094578742980957,
"rewards/margins": 0.8357431888580322,
"rewards/rejected": -1.9303218126296997,
"step": 348
},
{
"epoch": 0.7459257280256478,
"grad_norm": 4.574754238128662,
"learning_rate": 1.824421150789106e-07,
"logits/chosen": -1.07004714012146,
"logits/rejected": -1.0875684022903442,
"logps/chosen": -0.5463494062423706,
"logps/rejected": -0.5291048288345337,
"loss": 1.5418,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.3658735752105713,
"rewards/margins": -0.04311151057481766,
"rewards/rejected": -1.3227618932724,
"step": 349
},
{
"epoch": 0.7480630510285867,
"grad_norm": 7.958865642547607,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.0490962266921997,
"logits/rejected": -1.0383286476135254,
"logps/chosen": -0.5858332514762878,
"logps/rejected": -0.6578047275543213,
"loss": 1.5021,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.464583158493042,
"rewards/margins": 0.1799287348985672,
"rewards/rejected": -1.6445120573043823,
"step": 350
},
{
"epoch": 0.7502003740315255,
"grad_norm": 6.273632526397705,
"learning_rate": 1.7670019939210023e-07,
"logits/chosen": -1.12949538230896,
"logits/rejected": -1.0367292165756226,
"logps/chosen": -0.4628137946128845,
"logps/rejected": -0.4637930393218994,
"loss": 1.4832,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1570343971252441,
"rewards/margins": 0.00244816392660141,
"rewards/rejected": -1.1594825983047485,
"step": 351
},
{
"epoch": 0.7523376970344643,
"grad_norm": 5.706187725067139,
"learning_rate": 1.7385629436093956e-07,
"logits/chosen": -1.0996813774108887,
"logits/rejected": -1.0577008724212646,
"logps/chosen": -0.7115722298622131,
"logps/rejected": -0.7016228437423706,
"loss": 1.5491,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7789306640625,
"rewards/margins": -0.024873413145542145,
"rewards/rejected": -1.7540570497512817,
"step": 352
},
{
"epoch": 0.7544750200374032,
"grad_norm": 9.662369728088379,
"learning_rate": 1.710306370301437e-07,
"logits/chosen": -1.0781750679016113,
"logits/rejected": -0.9331997036933899,
"logps/chosen": -0.5081960558891296,
"logps/rejected": -0.5510014295578003,
"loss": 1.5991,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.270490050315857,
"rewards/margins": 0.10701363533735275,
"rewards/rejected": -1.3775036334991455,
"step": 353
},
{
"epoch": 0.756612343040342,
"grad_norm": 4.557639122009277,
"learning_rate": 1.6822338549489446e-07,
"logits/chosen": -1.024949550628662,
"logits/rejected": -1.026584267616272,
"logps/chosen": -0.46658557653427124,
"logps/rejected": -0.6099130511283875,
"loss": 1.453,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1664639711380005,
"rewards/margins": 0.3583187162876129,
"rewards/rejected": -1.524782657623291,
"step": 354
},
{
"epoch": 0.7587496660432808,
"grad_norm": 6.783056259155273,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.0237020254135132,
"logits/rejected": -1.01318359375,
"logps/chosen": -0.5265443921089172,
"logps/rejected": -0.5865951776504517,
"loss": 1.4962,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.3163609504699707,
"rewards/margins": 0.15012702345848083,
"rewards/rejected": -1.466488003730774,
"step": 355
},
{
"epoch": 0.7608869890462197,
"grad_norm": 8.898886680603027,
"learning_rate": 1.6266472703396284e-07,
"logits/chosen": -1.1975302696228027,
"logits/rejected": -1.0899324417114258,
"logps/chosen": -0.6463479399681091,
"logps/rejected": -1.0257855653762817,
"loss": 1.4989,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6158698797225952,
"rewards/margins": 0.9485940337181091,
"rewards/rejected": -2.5644638538360596,
"step": 356
},
{
"epoch": 0.7630243120491584,
"grad_norm": 11.671951293945312,
"learning_rate": 1.599136311145402e-07,
"logits/chosen": -1.1834355592727661,
"logits/rejected": -1.1248061656951904,
"logps/chosen": -0.7515526413917542,
"logps/rejected": -0.8097901344299316,
"loss": 1.5244,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.878881573677063,
"rewards/margins": 0.1455937623977661,
"rewards/rejected": -2.024475336074829,
"step": 357
},
{
"epoch": 0.7651616350520972,
"grad_norm": 5.740311622619629,
"learning_rate": 1.5718156298578288e-07,
"logits/chosen": -1.0501099824905396,
"logits/rejected": -1.0831395387649536,
"logps/chosen": -0.4516759216785431,
"logps/rejected": -0.547008752822876,
"loss": 1.4943,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1291898488998413,
"rewards/margins": 0.2383321225643158,
"rewards/rejected": -1.3675217628479004,
"step": 358
},
{
"epoch": 0.7672989580550361,
"grad_norm": 8.075958251953125,
"learning_rate": 1.5446867550656767e-07,
"logits/chosen": -1.0409198999404907,
"logits/rejected": -1.2201882600784302,
"logps/chosen": -0.620537519454956,
"logps/rejected": -1.1603175401687622,
"loss": 1.4869,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5513437986373901,
"rewards/margins": 1.3494499921798706,
"rewards/rejected": -2.90079402923584,
"step": 359
},
{
"epoch": 0.7694362810579749,
"grad_norm": 5.148620128631592,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -0.7799254655838013,
"logits/rejected": -0.8649179935455322,
"logps/chosen": -0.5494642853736877,
"logps/rejected": -0.8911303281784058,
"loss": 1.4401,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3736608028411865,
"rewards/margins": 0.8541650772094727,
"rewards/rejected": -2.227825880050659,
"step": 360
},
{
"epoch": 0.7715736040609137,
"grad_norm": 5.721947193145752,
"learning_rate": 1.4910104855800426e-07,
"logits/chosen": -1.1887235641479492,
"logits/rejected": -1.0089201927185059,
"logps/chosen": -0.5180599689483643,
"logps/rejected": -0.49416518211364746,
"loss": 1.608,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2951500415802002,
"rewards/margins": -0.05973710119724274,
"rewards/rejected": -1.2354129552841187,
"step": 361
},
{
"epoch": 0.7737109270638525,
"grad_norm": 5.896087169647217,
"learning_rate": 1.4644660940672627e-07,
"logits/chosen": -1.0971053838729858,
"logits/rejected": -1.021689772605896,
"logps/chosen": -0.4189419448375702,
"logps/rejected": -0.5696815848350525,
"loss": 1.4472,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.047354817390442,
"rewards/margins": 0.37684905529022217,
"rewards/rejected": -1.424203872680664,
"step": 362
},
{
"epoch": 0.7758482500667914,
"grad_norm": 8.45847225189209,
"learning_rate": 1.4381195152432769e-07,
"logits/chosen": -0.8624813556671143,
"logits/rejected": -0.8767452239990234,
"logps/chosen": -0.615929126739502,
"logps/rejected": -0.6732630729675293,
"loss": 1.4672,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5398229360580444,
"rewards/margins": 0.14333483576774597,
"rewards/rejected": -1.6831578016281128,
"step": 363
},
{
"epoch": 0.7779855730697302,
"grad_norm": 6.6143927574157715,
"learning_rate": 1.4119722231959403e-07,
"logits/chosen": -1.0300284624099731,
"logits/rejected": -1.08317232131958,
"logps/chosen": -0.5381268858909607,
"logps/rejected": -0.5894446969032288,
"loss": 1.5538,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3453172445297241,
"rewards/margins": 0.12829461693763733,
"rewards/rejected": -1.473611831665039,
"step": 364
},
{
"epoch": 0.7801228960726689,
"grad_norm": 6.556136608123779,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.0168136358261108,
"logits/rejected": -1.0091896057128906,
"logps/chosen": -0.8003075122833252,
"logps/rejected": -0.7712854146957397,
"loss": 1.4586,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.0007686614990234,
"rewards/margins": -0.07255513966083527,
"rewards/rejected": -1.9282134771347046,
"step": 365
},
{
"epoch": 0.7822602190756078,
"grad_norm": 4.969550609588623,
"learning_rate": 1.3602813399504458e-07,
"logits/chosen": -1.1460951566696167,
"logits/rejected": -1.0588514804840088,
"logps/chosen": -0.6409847736358643,
"logps/rejected": -0.6086816787719727,
"loss": 1.5951,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.6024619340896606,
"rewards/margins": -0.08075767010450363,
"rewards/rejected": -1.5217043161392212,
"step": 366
},
{
"epoch": 0.7843975420785466,
"grad_norm": 7.199689865112305,
"learning_rate": 1.3347406408508694e-07,
"logits/chosen": -1.2313283681869507,
"logits/rejected": -1.2684742212295532,
"logps/chosen": -0.60216224193573,
"logps/rejected": -0.8374693393707275,
"loss": 1.4751,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5054056644439697,
"rewards/margins": 0.5882677435874939,
"rewards/rejected": -2.0936732292175293,
"step": 367
},
{
"epoch": 0.7865348650814854,
"grad_norm": 8.949746131896973,
"learning_rate": 1.3094050125632972e-07,
"logits/chosen": -0.9064935445785522,
"logits/rejected": -0.9444929957389832,
"logps/chosen": -0.45091521739959717,
"logps/rejected": -0.5422642230987549,
"loss": 1.4476,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1272881031036377,
"rewards/margins": 0.22837252914905548,
"rewards/rejected": -1.3556605577468872,
"step": 368
},
{
"epoch": 0.7886721880844243,
"grad_norm": 4.023352146148682,
"learning_rate": 1.284275872613028e-07,
"logits/chosen": -1.0363601446151733,
"logits/rejected": -0.9367664456367493,
"logps/chosen": -0.4743805229663849,
"logps/rejected": -0.448375940322876,
"loss": 1.5308,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.1859513521194458,
"rewards/margins": -0.06501153111457825,
"rewards/rejected": -1.12093985080719,
"step": 369
},
{
"epoch": 0.7908095110873631,
"grad_norm": 9.706334114074707,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.0525028705596924,
"logits/rejected": -1.0606484413146973,
"logps/chosen": -0.5636960864067078,
"logps/rejected": -0.6165448427200317,
"loss": 1.5128,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4092402458190918,
"rewards/margins": 0.13212202489376068,
"rewards/rejected": -1.5413621664047241,
"step": 370
},
{
"epoch": 0.7929468340903019,
"grad_norm": 11.990076065063477,
"learning_rate": 1.2346426699819456e-07,
"logits/chosen": -1.1552562713623047,
"logits/rejected": -1.0523865222930908,
"logps/chosen": -0.615561842918396,
"logps/rejected": -0.8749452233314514,
"loss": 1.4922,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5389046669006348,
"rewards/margins": 0.6484581232070923,
"rewards/rejected": -2.1873626708984375,
"step": 371
},
{
"epoch": 0.7950841570932408,
"grad_norm": 4.068089485168457,
"learning_rate": 1.2101413842727343e-07,
"logits/chosen": -1.1697150468826294,
"logits/rejected": -1.2012141942977905,
"logps/chosen": -0.5214999914169312,
"logps/rejected": -0.6672156453132629,
"loss": 1.4757,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3037500381469727,
"rewards/margins": 0.3642891049385071,
"rewards/rejected": -1.668039083480835,
"step": 372
},
{
"epoch": 0.7972214800961795,
"grad_norm": 8.035099983215332,
"learning_rate": 1.1858521406886674e-07,
"logits/chosen": -0.7799820899963379,
"logits/rejected": -0.8855147361755371,
"logps/chosen": -0.4354492723941803,
"logps/rejected": -0.508285641670227,
"loss": 1.5703,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0886231660842896,
"rewards/margins": 0.18209105730056763,
"rewards/rejected": -1.270714282989502,
"step": 373
},
{
"epoch": 0.7993588030991183,
"grad_norm": 11.540173530578613,
"learning_rate": 1.1617762982099444e-07,
"logits/chosen": -1.2532049417495728,
"logits/rejected": -1.293265700340271,
"logps/chosen": -0.6085352897644043,
"logps/rejected": -0.6329178810119629,
"loss": 1.5638,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5213382244110107,
"rewards/margins": 0.06095648184418678,
"rewards/rejected": -1.5822947025299072,
"step": 374
},
{
"epoch": 0.8014961261020572,
"grad_norm": 6.516201019287109,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -0.9992303848266602,
"logits/rejected": -1.0280938148498535,
"logps/chosen": -0.5312505960464478,
"logps/rejected": -0.7041913866996765,
"loss": 1.5746,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3281266689300537,
"rewards/margins": 0.432352215051651,
"rewards/rejected": -1.7604787349700928,
"step": 375
},
{
"epoch": 0.803633449104996,
"grad_norm": 9.061089515686035,
"learning_rate": 1.1142701927151454e-07,
"logits/chosen": -1.1462222337722778,
"logits/rejected": -0.9769701361656189,
"logps/chosen": -0.5302731990814209,
"logps/rejected": -0.4824178218841553,
"loss": 1.5691,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.3256831169128418,
"rewards/margins": -0.11963848769664764,
"rewards/rejected": -1.2060445547103882,
"step": 376
},
{
"epoch": 0.8057707721079348,
"grad_norm": 15.69382381439209,
"learning_rate": 1.090842587659851e-07,
"logits/chosen": -1.1206320524215698,
"logits/rejected": -1.034964919090271,
"logps/chosen": -0.6023938655853271,
"logps/rejected": -0.5703404545783997,
"loss": 1.534,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.5059847831726074,
"rewards/margins": -0.08013379573822021,
"rewards/rejected": -1.4258509874343872,
"step": 377
},
{
"epoch": 0.8079080951108736,
"grad_norm": 6.683454990386963,
"learning_rate": 1.0676336994827512e-07,
"logits/chosen": -1.093888282775879,
"logits/rejected": -1.2383679151535034,
"logps/chosen": -0.38456153869628906,
"logps/rejected": -0.6876378655433655,
"loss": 1.5004,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9614039063453674,
"rewards/margins": 0.7576908469200134,
"rewards/rejected": -1.7190947532653809,
"step": 378
},
{
"epoch": 0.8100454181138125,
"grad_norm": 6.571846961975098,
"learning_rate": 1.044644826718295e-07,
"logits/chosen": -1.2148207426071167,
"logits/rejected": -1.1005587577819824,
"logps/chosen": -0.7769785523414612,
"logps/rejected": -1.3037354946136475,
"loss": 1.4388,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9424464702606201,
"rewards/margins": 1.316892147064209,
"rewards/rejected": -3.259338617324829,
"step": 379
},
{
"epoch": 0.8121827411167513,
"grad_norm": 4.527327537536621,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.1576111316680908,
"logits/rejected": -1.1151889562606812,
"logps/chosen": -0.6413387656211853,
"logps/rejected": -0.7667942643165588,
"loss": 1.466,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.603346824645996,
"rewards/margins": 0.31363871693611145,
"rewards/rejected": -1.9169857501983643,
"step": 380
},
{
"epoch": 0.81432006411969,
"grad_norm": 4.658472061157227,
"learning_rate": 9.99332259943969e-08,
"logits/chosen": -0.8665165305137634,
"logits/rejected": -0.8802721500396729,
"logps/chosen": -0.5357838869094849,
"logps/rejected": -0.9002077579498291,
"loss": 1.4755,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3394598960876465,
"rewards/margins": 0.9110593199729919,
"rewards/rejected": -2.250519275665283,
"step": 381
},
{
"epoch": 0.8164573871226289,
"grad_norm": 7.652352333068848,
"learning_rate": 9.770111011666582e-08,
"logits/chosen": -1.3007032871246338,
"logits/rejected": -1.2247170209884644,
"logps/chosen": -0.5824642181396484,
"logps/rejected": -0.6378771662712097,
"loss": 1.5725,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.456160545349121,
"rewards/margins": 0.13853231072425842,
"rewards/rejected": -1.5946928262710571,
"step": 382
},
{
"epoch": 0.8185947101255677,
"grad_norm": 6.376747131347656,
"learning_rate": 9.549150281252632e-08,
"logits/chosen": -1.0382481813430786,
"logits/rejected": -0.9399144053459167,
"logps/chosen": -0.5589631199836731,
"logps/rejected": -0.6459823846817017,
"loss": 1.4609,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3974076509475708,
"rewards/margins": 0.2175484150648117,
"rewards/rejected": -1.6149561405181885,
"step": 383
},
{
"epoch": 0.8207320331285065,
"grad_norm": 11.018497467041016,
"learning_rate": 9.330452770923603e-08,
"logits/chosen": -1.2223553657531738,
"logits/rejected": -1.238554835319519,
"logps/chosen": -0.5341588258743286,
"logps/rejected": -0.7176991701126099,
"loss": 1.4492,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3353970050811768,
"rewards/margins": 0.4588509202003479,
"rewards/rejected": -1.7942478656768799,
"step": 384
},
{
"epoch": 0.8228693561314454,
"grad_norm": 4.911256313323975,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -0.9694303274154663,
"logits/rejected": -1.0394315719604492,
"logps/chosen": -0.4685894250869751,
"logps/rejected": -0.6834438443183899,
"loss": 1.4237,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1714736223220825,
"rewards/margins": 0.537135899066925,
"rewards/rejected": -1.7086098194122314,
"step": 385
},
{
"epoch": 0.8250066791343842,
"grad_norm": 11.421992301940918,
"learning_rate": 8.899896227604508e-08,
"logits/chosen": -1.1830335855484009,
"logits/rejected": -1.157397747039795,
"logps/chosen": -0.39773958921432495,
"logps/rejected": -0.46552199125289917,
"loss": 1.4859,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.9943490028381348,
"rewards/margins": 0.1694560945034027,
"rewards/rejected": -1.1638050079345703,
"step": 386
},
{
"epoch": 0.827144002137323,
"grad_norm": 6.942233085632324,
"learning_rate": 8.688061284200265e-08,
"logits/chosen": -1.0675994157791138,
"logits/rejected": -1.0512460470199585,
"logps/chosen": -0.5208725929260254,
"logps/rejected": -0.6654115319252014,
"loss": 1.3969,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.302181363105774,
"rewards/margins": 0.3613475561141968,
"rewards/rejected": -1.6635288000106812,
"step": 387
},
{
"epoch": 0.8292813251402619,
"grad_norm": 4.8206377029418945,
"learning_rate": 8.478537738704811e-08,
"logits/chosen": -1.1155000925064087,
"logits/rejected": -1.0786762237548828,
"logps/chosen": -0.5618028044700623,
"logps/rejected": -0.7909839749336243,
"loss": 1.4737,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.404506802558899,
"rewards/margins": 0.5729530453681946,
"rewards/rejected": -1.9774597883224487,
"step": 388
},
{
"epoch": 0.8314186481432007,
"grad_norm": 7.505739212036133,
"learning_rate": 8.271337313934867e-08,
"logits/chosen": -1.2557884454727173,
"logits/rejected": -1.2007073163986206,
"logps/chosen": -0.5404600501060486,
"logps/rejected": -0.6826989650726318,
"loss": 1.5093,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3511501550674438,
"rewards/margins": 0.35559743642807007,
"rewards/rejected": -1.7067475318908691,
"step": 389
},
{
"epoch": 0.8335559711461394,
"grad_norm": 6.654153347015381,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.155353307723999,
"logits/rejected": -1.1592519283294678,
"logps/chosen": -0.6218467950820923,
"logps/rejected": -0.9340539574623108,
"loss": 1.4667,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5546170473098755,
"rewards/margins": 0.7805179953575134,
"rewards/rejected": -2.335134983062744,
"step": 390
},
{
"epoch": 0.8356932941490782,
"grad_norm": 5.7856011390686035,
"learning_rate": 7.863952067298041e-08,
"logits/chosen": -0.8018568754196167,
"logits/rejected": -0.7033423185348511,
"logps/chosen": -0.4321405291557312,
"logps/rejected": -0.49103930592536926,
"loss": 1.4969,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0803513526916504,
"rewards/margins": 0.1472470909357071,
"rewards/rejected": -1.2275984287261963,
"step": 391
},
{
"epoch": 0.8378306171520171,
"grad_norm": 4.890209197998047,
"learning_rate": 7.663790038585794e-08,
"logits/chosen": -1.1529498100280762,
"logits/rejected": -1.1529746055603027,
"logps/chosen": -0.5385481119155884,
"logps/rejected": -0.8943907022476196,
"loss": 1.4222,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3463702201843262,
"rewards/margins": 0.8896064758300781,
"rewards/rejected": -2.2359766960144043,
"step": 392
},
{
"epoch": 0.8399679401549559,
"grad_norm": 7.706808567047119,
"learning_rate": 7.465996715633027e-08,
"logits/chosen": -1.055177927017212,
"logits/rejected": -1.0194472074508667,
"logps/chosen": -0.6158031821250916,
"logps/rejected": -0.6109176278114319,
"loss": 1.5822,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.5395078659057617,
"rewards/margins": -0.012213785201311111,
"rewards/rejected": -1.527294397354126,
"step": 393
},
{
"epoch": 0.8421052631578947,
"grad_norm": 5.577699184417725,
"learning_rate": 7.270583164951926e-08,
"logits/chosen": -0.9695263504981995,
"logits/rejected": -0.8854237794876099,
"logps/chosen": -0.6802433133125305,
"logps/rejected": -0.6754633188247681,
"loss": 1.4759,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.700608253479004,
"rewards/margins": -0.011950075626373291,
"rewards/rejected": -1.6886582374572754,
"step": 394
},
{
"epoch": 0.8442425861608336,
"grad_norm": 4.328030586242676,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -0.9989792108535767,
"logits/rejected": -0.8419893980026245,
"logps/chosen": -0.4701617658138275,
"logps/rejected": -0.4230220913887024,
"loss": 1.5796,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.17540442943573,
"rewards/margins": -0.11784917116165161,
"rewards/rejected": -1.0575551986694336,
"step": 395
},
{
"epoch": 0.8463799091637724,
"grad_norm": 7.562992572784424,
"learning_rate": 6.886938980101869e-08,
"logits/chosen": -1.193518042564392,
"logits/rejected": -1.1734905242919922,
"logps/chosen": -0.6230531930923462,
"logps/rejected": -0.735701322555542,
"loss": 1.52,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5576331615447998,
"rewards/margins": 0.281620055437088,
"rewards/rejected": -1.8392531871795654,
"step": 396
},
{
"epoch": 0.8485172321667112,
"grad_norm": 3.633221387863159,
"learning_rate": 6.698729810778064e-08,
"logits/chosen": -1.1731326580047607,
"logits/rejected": -1.0659189224243164,
"logps/chosen": -0.5739960670471191,
"logps/rejected": -0.941279947757721,
"loss": 1.5875,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4349900484085083,
"rewards/margins": 0.9182097911834717,
"rewards/rejected": -2.3531999588012695,
"step": 397
},
{
"epoch": 0.85065455516965,
"grad_norm": 9.181096076965332,
"learning_rate": 6.512943342215232e-08,
"logits/chosen": -0.9327074885368347,
"logits/rejected": -0.9438521265983582,
"logps/chosen": -0.6690176725387573,
"logps/rejected": -0.7583512663841248,
"loss": 1.5198,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.672544240951538,
"rewards/margins": 0.22333401441574097,
"rewards/rejected": -1.8958781957626343,
"step": 398
},
{
"epoch": 0.8527918781725888,
"grad_norm": 5.558097839355469,
"learning_rate": 6.329589969143517e-08,
"logits/chosen": -0.8800374269485474,
"logits/rejected": -0.8219183683395386,
"logps/chosen": -0.5247557759284973,
"logps/rejected": -0.6464003324508667,
"loss": 1.4196,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3118892908096313,
"rewards/margins": 0.30411145091056824,
"rewards/rejected": -1.616000771522522,
"step": 399
},
{
"epoch": 0.8549292011755276,
"grad_norm": 3.8024463653564453,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -0.9593957662582397,
"logits/rejected": -0.9404938817024231,
"logps/chosen": -0.7068220973014832,
"logps/rejected": -0.7800813913345337,
"loss": 1.4438,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7670555114746094,
"rewards/margins": 0.1831480711698532,
"rewards/rejected": -1.9502032995224,
"step": 400
},
{
"epoch": 0.8570665241784665,
"grad_norm": 30.781763076782227,
"learning_rate": 5.9702234071631e-08,
"logits/chosen": -0.9686922430992126,
"logits/rejected": -0.9655731320381165,
"logps/chosen": -0.5401520729064941,
"logps/rejected": -0.5928196310997009,
"loss": 1.5804,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.350380301475525,
"rewards/margins": 0.13166896998882294,
"rewards/rejected": -1.4820491075515747,
"step": 401
},
{
"epoch": 0.8592038471814053,
"grad_norm": 14.888254165649414,
"learning_rate": 5.794230324769517e-08,
"logits/chosen": -1.049428105354309,
"logits/rejected": -0.9786369800567627,
"logps/chosen": -0.6207393407821655,
"logps/rejected": -0.659958004951477,
"loss": 1.517,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5518485307693481,
"rewards/margins": 0.09804654121398926,
"rewards/rejected": -1.6498949527740479,
"step": 402
},
{
"epoch": 0.8613411701843441,
"grad_norm": 5.892756938934326,
"learning_rate": 5.620710549772295e-08,
"logits/chosen": -1.0603619813919067,
"logits/rejected": -1.0535688400268555,
"logps/chosen": -0.6214827299118042,
"logps/rejected": -0.7300854921340942,
"loss": 1.5846,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5537068843841553,
"rewards/margins": 0.2715071439743042,
"rewards/rejected": -1.82521390914917,
"step": 403
},
{
"epoch": 0.863478493187283,
"grad_norm": 6.201269626617432,
"learning_rate": 5.44967379058161e-08,
"logits/chosen": -1.0106871128082275,
"logits/rejected": -0.8725728988647461,
"logps/chosen": -0.41202932596206665,
"logps/rejected": -0.657246470451355,
"loss": 1.5421,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0300732851028442,
"rewards/margins": 0.6130428910255432,
"rewards/rejected": -1.6431162357330322,
"step": 404
},
{
"epoch": 0.8656158161902218,
"grad_norm": 8.44877815246582,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.1635454893112183,
"logits/rejected": -1.0440311431884766,
"logps/chosen": -0.8248953223228455,
"logps/rejected": -0.709219217300415,
"loss": 1.5559,
"rewards/accuracies": 0.375,
"rewards/chosen": -2.0622382164001465,
"rewards/margins": -0.2891903817653656,
"rewards/rejected": -1.7730481624603271,
"step": 405
},
{
"epoch": 0.8677531391931605,
"grad_norm": 5.078684329986572,
"learning_rate": 5.11508745810284e-08,
"logits/chosen": -1.1656029224395752,
"logits/rejected": -1.0778682231903076,
"logps/chosen": -0.512388288974762,
"logps/rejected": -0.5113582015037537,
"loss": 1.485,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.280970811843872,
"rewards/margins": -0.0025753136724233627,
"rewards/rejected": -1.2783952951431274,
"step": 406
},
{
"epoch": 0.8698904621960993,
"grad_norm": 6.306398868560791,
"learning_rate": 4.951556604879048e-08,
"logits/chosen": -1.1278554201126099,
"logits/rejected": -1.149449348449707,
"logps/chosen": -0.5426372289657593,
"logps/rejected": -0.8084475994110107,
"loss": 1.5014,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.356593132019043,
"rewards/margins": 0.6645258665084839,
"rewards/rejected": -2.0211191177368164,
"step": 407
},
{
"epoch": 0.8720277851990382,
"grad_norm": 9.290410995483398,
"learning_rate": 4.7905462065429946e-08,
"logits/chosen": -0.8804436922073364,
"logits/rejected": -0.856006383895874,
"logps/chosen": -0.41280895471572876,
"logps/rejected": -0.8294933438301086,
"loss": 1.4497,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0320223569869995,
"rewards/margins": 1.0417108535766602,
"rewards/rejected": -2.07373309135437,
"step": 408
},
{
"epoch": 0.874165108201977,
"grad_norm": 11.550924301147461,
"learning_rate": 4.6320652716067555e-08,
"logits/chosen": -1.0321075916290283,
"logits/rejected": -0.9579042792320251,
"logps/chosen": -0.8112492561340332,
"logps/rejected": -1.1215100288391113,
"loss": 1.4411,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.028123140335083,
"rewards/margins": 0.7756521701812744,
"rewards/rejected": -2.8037750720977783,
"step": 409
},
{
"epoch": 0.8763024312049158,
"grad_norm": 7.996156215667725,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -0.8929157853126526,
"logits/rejected": -0.9894087910652161,
"logps/chosen": -0.5036193132400513,
"logps/rejected": -0.4987982213497162,
"loss": 1.6218,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2590482234954834,
"rewards/margins": -0.012052726000547409,
"rewards/rejected": -1.246995449066162,
"step": 410
},
{
"epoch": 0.8784397542078547,
"grad_norm": 6.660581588745117,
"learning_rate": 4.322727117869951e-08,
"logits/chosen": -0.9380248188972473,
"logits/rejected": -0.9620922207832336,
"logps/chosen": -0.41842323541641235,
"logps/rejected": -0.47635918855667114,
"loss": 1.4806,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0460580587387085,
"rewards/margins": 0.1448398381471634,
"rewards/rejected": -1.1908979415893555,
"step": 411
},
{
"epoch": 0.8805770772107935,
"grad_norm": 13.565817832946777,
"learning_rate": 4.17188720650119e-08,
"logits/chosen": -0.928549587726593,
"logits/rejected": -0.8930441737174988,
"logps/chosen": -0.49229955673217773,
"logps/rejected": -0.6658884286880493,
"loss": 1.484,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2307488918304443,
"rewards/margins": 0.43397217988967896,
"rewards/rejected": -1.664721131324768,
"step": 412
},
{
"epoch": 0.8827144002137323,
"grad_norm": 4.4607954025268555,
"learning_rate": 4.023611372427471e-08,
"logits/chosen": -0.9794715642929077,
"logits/rejected": -0.901313304901123,
"logps/chosen": -0.4945882260799408,
"logps/rejected": -0.4721212387084961,
"loss": 1.5612,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.2364706993103027,
"rewards/margins": -0.056167569011449814,
"rewards/rejected": -1.1803030967712402,
"step": 413
},
{
"epoch": 0.8848517232166712,
"grad_norm": 4.917453289031982,
"learning_rate": 3.877907911663542e-08,
"logits/chosen": -1.101521372795105,
"logits/rejected": -1.0795665979385376,
"logps/chosen": -0.4808153808116913,
"logps/rejected": -0.5762184262275696,
"loss": 1.4704,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2020386457443237,
"rewards/margins": 0.23850753903388977,
"rewards/rejected": -1.4405461549758911,
"step": 414
},
{
"epoch": 0.88698904621961,
"grad_norm": 16.45602798461914,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.0213111639022827,
"logits/rejected": -1.037355661392212,
"logps/chosen": -0.7609426975250244,
"logps/rejected": -0.9708657264709473,
"loss": 1.474,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.9023566246032715,
"rewards/margins": 0.5248077511787415,
"rewards/rejected": -2.427164316177368,
"step": 415
},
{
"epoch": 0.8891263692225487,
"grad_norm": 5.8920817375183105,
"learning_rate": 3.594250574048058e-08,
"logits/chosen": -1.0958168506622314,
"logits/rejected": -1.1276291608810425,
"logps/chosen": -0.7167930006980896,
"logps/rejected": -0.9383392930030823,
"loss": 1.4605,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.791982650756836,
"rewards/margins": 0.5538656711578369,
"rewards/rejected": -2.345848321914673,
"step": 416
},
{
"epoch": 0.8912636922254876,
"grad_norm": 15.450013160705566,
"learning_rate": 3.456312567789793e-08,
"logits/chosen": -0.956957995891571,
"logits/rejected": -0.9777557253837585,
"logps/chosen": -0.6754245162010193,
"logps/rejected": -0.6781172752380371,
"loss": 1.6078,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.688561201095581,
"rewards/margins": 0.006731957197189331,
"rewards/rejected": -1.6952930688858032,
"step": 417
},
{
"epoch": 0.8934010152284264,
"grad_norm": 8.894111633300781,
"learning_rate": 3.3209786751399184e-08,
"logits/chosen": -1.1309380531311035,
"logits/rejected": -1.0127419233322144,
"logps/chosen": -0.5305185914039612,
"logps/rejected": -0.5370005369186401,
"loss": 1.5549,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.326296329498291,
"rewards/margins": 0.016204845160245895,
"rewards/rejected": -1.342501163482666,
"step": 418
},
{
"epoch": 0.8955383382313652,
"grad_norm": 4.215895175933838,
"learning_rate": 3.188256468013139e-08,
"logits/chosen": -1.168836236000061,
"logits/rejected": -1.26181960105896,
"logps/chosen": -0.536091685295105,
"logps/rejected": -0.77155601978302,
"loss": 1.4393,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3402291536331177,
"rewards/margins": 0.5886609554290771,
"rewards/rejected": -1.9288899898529053,
"step": 419
},
{
"epoch": 0.897675661234304,
"grad_norm": 9.309587478637695,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -0.9437912106513977,
"logits/rejected": -0.8996111154556274,
"logps/chosen": -0.398969829082489,
"logps/rejected": -0.4293844699859619,
"loss": 1.5813,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.9974247217178345,
"rewards/margins": 0.07603646069765091,
"rewards/rejected": -1.0734611749649048,
"step": 420
},
{
"epoch": 0.8998129842372429,
"grad_norm": 9.308667182922363,
"learning_rate": 2.9306766669548457e-08,
"logits/chosen": -1.051709532737732,
"logits/rejected": -1.0886300802230835,
"logps/chosen": -0.6116673946380615,
"logps/rejected": -0.8640693426132202,
"loss": 1.4563,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5291682481765747,
"rewards/margins": 0.6310049295425415,
"rewards/rejected": -2.1601734161376953,
"step": 421
},
{
"epoch": 0.9019503072401817,
"grad_norm": 6.306589126586914,
"learning_rate": 2.805833484581621e-08,
"logits/chosen": -1.1781387329101562,
"logits/rejected": -1.1265491247177124,
"logps/chosen": -0.5683103203773499,
"logps/rejected": -0.6715315580368042,
"loss": 1.5119,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4207758903503418,
"rewards/margins": 0.2580530643463135,
"rewards/rejected": -1.6788289546966553,
"step": 422
},
{
"epoch": 0.9040876302431204,
"grad_norm": 6.854678153991699,
"learning_rate": 2.6836308100417872e-08,
"logits/chosen": -1.0027626752853394,
"logits/rejected": -1.021713137626648,
"logps/chosen": -0.6026887893676758,
"logps/rejected": -0.685580313205719,
"loss": 1.5287,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5067217350006104,
"rewards/margins": 0.207228884100914,
"rewards/rejected": -1.7139506340026855,
"step": 423
},
{
"epoch": 0.9062249532460593,
"grad_norm": 4.880378723144531,
"learning_rate": 2.5640754805600128e-08,
"logits/chosen": -0.9207165837287903,
"logits/rejected": -0.7281609177589417,
"logps/chosen": -0.48149988055229187,
"logps/rejected": -1.1003289222717285,
"loss": 1.4379,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.203749656677246,
"rewards/margins": 1.5470730066299438,
"rewards/rejected": -2.7508223056793213,
"step": 424
},
{
"epoch": 0.9083622762489981,
"grad_norm": 7.307305335998535,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -0.970041811466217,
"logits/rejected": -0.9423936605453491,
"logps/chosen": -0.518881618976593,
"logps/rejected": -0.7002631425857544,
"loss": 1.5549,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2972040176391602,
"rewards/margins": 0.45345383882522583,
"rewards/rejected": -1.7506577968597412,
"step": 425
},
{
"epoch": 0.9104995992519369,
"grad_norm": 4.65291166305542,
"learning_rate": 2.3329334647018694e-08,
"logits/chosen": -1.0624005794525146,
"logits/rejected": -0.9302091598510742,
"logps/chosen": -0.4433947205543518,
"logps/rejected": -0.4743190407752991,
"loss": 1.5049,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1084867715835571,
"rewards/margins": 0.07731082290410995,
"rewards/rejected": -1.1857975721359253,
"step": 426
},
{
"epoch": 0.9126369222548758,
"grad_norm": 7.424281120300293,
"learning_rate": 2.2213597106929605e-08,
"logits/chosen": -0.9913086891174316,
"logits/rejected": -0.9474751949310303,
"logps/chosen": -0.46963855624198914,
"logps/rejected": -0.558479905128479,
"loss": 1.5135,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1740963459014893,
"rewards/margins": 0.22210323810577393,
"rewards/rejected": -1.3961995840072632,
"step": 427
},
{
"epoch": 0.9147742452578146,
"grad_norm": 6.601301193237305,
"learning_rate": 2.1124591657534774e-08,
"logits/chosen": -1.0710477828979492,
"logits/rejected": -1.064525842666626,
"logps/chosen": -0.52205491065979,
"logps/rejected": -0.6624395847320557,
"loss": 1.4596,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3051373958587646,
"rewards/margins": 0.35096171498298645,
"rewards/rejected": -1.6560990810394287,
"step": 428
},
{
"epoch": 0.9169115682607534,
"grad_norm": 6.440933704376221,
"learning_rate": 2.0062379228555525e-08,
"logits/chosen": -0.9846968054771423,
"logits/rejected": -0.9657022356987,
"logps/chosen": -0.7298382520675659,
"logps/rejected": -0.8390324115753174,
"loss": 1.4462,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8245956897735596,
"rewards/margins": 0.2729852795600891,
"rewards/rejected": -2.097581148147583,
"step": 429
},
{
"epoch": 0.9190488912636923,
"grad_norm": 6.04892635345459,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -0.970207691192627,
"logits/rejected": -0.9668480157852173,
"logps/chosen": -0.45557963848114014,
"logps/rejected": -0.5887754559516907,
"loss": 1.4271,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1389491558074951,
"rewards/margins": 0.332989364862442,
"rewards/rejected": -1.4719386100769043,
"step": 430
},
{
"epoch": 0.921186214266631,
"grad_norm": 8.778804779052734,
"learning_rate": 1.8018569652073378e-08,
"logits/chosen": -0.9118247032165527,
"logits/rejected": -0.944421648979187,
"logps/chosen": -0.607579231262207,
"logps/rejected": -0.9298698902130127,
"loss": 1.4657,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5189483165740967,
"rewards/margins": 0.8057265281677246,
"rewards/rejected": -2.3246748447418213,
"step": 431
},
{
"epoch": 0.9233235372695698,
"grad_norm": 4.861128330230713,
"learning_rate": 1.7037086855465898e-08,
"logits/chosen": -1.0250306129455566,
"logits/rejected": -1.0059497356414795,
"logps/chosen": -0.5032894015312195,
"logps/rejected": -0.8505322933197021,
"loss": 1.4828,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2582234144210815,
"rewards/margins": 0.8681074976921082,
"rewards/rejected": -2.126330852508545,
"step": 432
},
{
"epoch": 0.9254608602725087,
"grad_norm": 8.301541328430176,
"learning_rate": 1.6082625774666792e-08,
"logits/chosen": -1.0359551906585693,
"logits/rejected": -0.9847449660301208,
"logps/chosen": -0.47514110803604126,
"logps/rejected": -0.5986137986183167,
"loss": 1.4601,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1878528594970703,
"rewards/margins": 0.3086817264556885,
"rewards/rejected": -1.4965344667434692,
"step": 433
},
{
"epoch": 0.9275981832754475,
"grad_norm": 11.128478050231934,
"learning_rate": 1.5155239811656562e-08,
"logits/chosen": -1.0488301515579224,
"logits/rejected": -0.9755731225013733,
"logps/chosen": -0.45144030451774597,
"logps/rejected": -0.4765854477882385,
"loss": 1.4288,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1286007165908813,
"rewards/margins": 0.06286279857158661,
"rewards/rejected": -1.1914634704589844,
"step": 434
},
{
"epoch": 0.9297355062783863,
"grad_norm": 7.042767524719238,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.1306668519973755,
"logits/rejected": -1.0746090412139893,
"logps/chosen": -0.6658691763877869,
"logps/rejected": -0.8278828263282776,
"loss": 1.5199,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6646732091903687,
"rewards/margins": 0.40503403544425964,
"rewards/rejected": -2.069707155227661,
"step": 435
},
{
"epoch": 0.9318728292813251,
"grad_norm": 6.516074180603027,
"learning_rate": 1.3381899269774289e-08,
"logits/chosen": -1.160095453262329,
"logits/rejected": -1.1833710670471191,
"logps/chosen": -0.869493842124939,
"logps/rejected": -0.9384310245513916,
"loss": 1.512,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.173734426498413,
"rewards/margins": 0.17234301567077637,
"rewards/rejected": -2.3460774421691895,
"step": 436
},
{
"epoch": 0.934010152284264,
"grad_norm": 5.304040431976318,
"learning_rate": 1.253604390908819e-08,
"logits/chosen": -0.9810507297515869,
"logits/rejected": -0.9905288219451904,
"logps/chosen": -0.43267419934272766,
"logps/rejected": -0.44727227091789246,
"loss": 1.5062,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0816855430603027,
"rewards/margins": 0.03649521246552467,
"rewards/rejected": -1.118180751800537,
"step": 437
},
{
"epoch": 0.9361474752872028,
"grad_norm": 7.976593971252441,
"learning_rate": 1.1717462097011855e-08,
"logits/chosen": -1.1212152242660522,
"logits/rejected": -1.1439484357833862,
"logps/chosen": -0.46723484992980957,
"logps/rejected": -0.6574444770812988,
"loss": 1.4262,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1680870056152344,
"rewards/margins": 0.4755241572856903,
"rewards/rejected": -1.6436113119125366,
"step": 438
},
{
"epoch": 0.9382847982901416,
"grad_norm": 11.167140007019043,
"learning_rate": 1.0926199633097154e-08,
"logits/chosen": -1.108040690422058,
"logits/rejected": -1.049588680267334,
"logps/chosen": -0.41383224725723267,
"logps/rejected": -0.7777174711227417,
"loss": 1.5436,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0345805883407593,
"rewards/margins": 0.9097130298614502,
"rewards/rejected": -1.94429349899292,
"step": 439
},
{
"epoch": 0.9404221212930804,
"grad_norm": 4.780636787414551,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -0.9971197843551636,
"logits/rejected": -0.906645655632019,
"logps/chosen": -0.4516942501068115,
"logps/rejected": -0.5613692402839661,
"loss": 1.5136,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1292357444763184,
"rewards/margins": 0.2741874158382416,
"rewards/rejected": -1.4034231901168823,
"step": 440
},
{
"epoch": 0.9425594442960192,
"grad_norm": 5.779046535491943,
"learning_rate": 9.425808302913728e-09,
"logits/chosen": -0.9931034445762634,
"logits/rejected": -0.9382791519165039,
"logps/chosen": -0.4940332770347595,
"logps/rejected": -0.5491997003555298,
"loss": 1.454,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2350831031799316,
"rewards/margins": 0.13791611790657043,
"rewards/rejected": -1.3729993104934692,
"step": 441
},
{
"epoch": 0.944696767298958,
"grad_norm": 4.751095771789551,
"learning_rate": 8.716763383355862e-09,
"logits/chosen": -0.9679308533668518,
"logits/rejected": -0.9130020141601562,
"logps/chosen": -0.39680802822113037,
"logps/rejected": -0.45057764649391174,
"loss": 1.4856,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9920201301574707,
"rewards/margins": 0.13442395627498627,
"rewards/rejected": -1.1264441013336182,
"step": 442
},
{
"epoch": 0.9468340903018969,
"grad_norm": 7.848532199859619,
"learning_rate": 8.035205700685165e-09,
"logits/chosen": -1.0988008975982666,
"logits/rejected": -1.0214226245880127,
"logps/chosen": -0.5310029983520508,
"logps/rejected": -0.46627891063690186,
"loss": 1.5292,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.3275076150894165,
"rewards/margins": -0.1618102788925171,
"rewards/rejected": -1.1656973361968994,
"step": 443
},
{
"epoch": 0.9489714133048357,
"grad_norm": 20.29781723022461,
"learning_rate": 7.381173387970397e-09,
"logits/chosen": -1.1159013509750366,
"logits/rejected": -1.024048089981079,
"logps/chosen": -0.504087507724762,
"logps/rejected": -0.4435691833496094,
"loss": 1.4914,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2602187395095825,
"rewards/margins": -0.1512957364320755,
"rewards/rejected": -1.108923077583313,
"step": 444
},
{
"epoch": 0.9511087363077745,
"grad_norm": 4.859796524047852,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.0576136112213135,
"logits/rejected": -1.0677433013916016,
"logps/chosen": -0.5914698243141174,
"logps/rejected": -0.6684586405754089,
"loss": 1.5836,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4786746501922607,
"rewards/margins": 0.19247180223464966,
"rewards/rejected": -1.6711465120315552,
"step": 445
},
{
"epoch": 0.9532460593107134,
"grad_norm": 7.60525369644165,
"learning_rate": 6.15582970243117e-09,
"logits/chosen": -1.094986915588379,
"logits/rejected": -1.052059292793274,
"logps/chosen": -0.6280755996704102,
"logps/rejected": -0.7196720838546753,
"loss": 1.4718,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.570189118385315,
"rewards/margins": 0.22899119555950165,
"rewards/rejected": -1.799180269241333,
"step": 446
},
{
"epoch": 0.9553833823136522,
"grad_norm": 5.170019149780273,
"learning_rate": 5.5845868874357385e-09,
"logits/chosen": -0.925014853477478,
"logits/rejected": -0.9863495826721191,
"logps/chosen": -0.46621960401535034,
"logps/rejected": -0.4518744945526123,
"loss": 1.4332,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1655490398406982,
"rewards/margins": -0.0358627624809742,
"rewards/rejected": -1.1296862363815308,
"step": 447
},
{
"epoch": 0.957520705316591,
"grad_norm": 4.508576393127441,
"learning_rate": 5.0410065542185184e-09,
"logits/chosen": -1.0208594799041748,
"logits/rejected": -0.9924222826957703,
"logps/chosen": -0.45458984375,
"logps/rejected": -0.6638749241828918,
"loss": 1.4904,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1364747285842896,
"rewards/margins": 0.5232126116752625,
"rewards/rejected": -1.6596875190734863,
"step": 448
},
{
"epoch": 0.9596580283195298,
"grad_norm": 6.562451362609863,
"learning_rate": 4.5251191160326495e-09,
"logits/chosen": -1.1275100708007812,
"logits/rejected": -1.124595046043396,
"logps/chosen": -0.8562003970146179,
"logps/rejected": -1.0813689231872559,
"loss": 1.5302,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.140500783920288,
"rewards/margins": 0.5629212260246277,
"rewards/rejected": -2.7034220695495605,
"step": 449
},
{
"epoch": 0.9617953513224686,
"grad_norm": 13.340657234191895,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -0.8729239702224731,
"logits/rejected": -0.9192299842834473,
"logps/chosen": -0.6237303614616394,
"logps/rejected": -0.7104634046554565,
"loss": 1.5047,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.559325933456421,
"rewards/margins": 0.21683269739151,
"rewards/rejected": -1.7761585712432861,
"step": 450
},
{
"epoch": 0.9639326743254074,
"grad_norm": 10.217489242553711,
"learning_rate": 3.5765368290813223e-09,
"logits/chosen": -1.0247201919555664,
"logits/rejected": -1.038936734199524,
"logps/chosen": -0.6359574794769287,
"logps/rejected": -0.6257486939430237,
"loss": 1.5193,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.5898935794830322,
"rewards/margins": -0.0255217757076025,
"rewards/rejected": -1.5643717050552368,
"step": 451
},
{
"epoch": 0.9660699973283462,
"grad_norm": 10.847533226013184,
"learning_rate": 3.1438950533786977e-09,
"logits/chosen": -1.0954786539077759,
"logits/rejected": -1.102776288986206,
"logps/chosen": -0.5472733378410339,
"logps/rejected": -0.6362195611000061,
"loss": 1.5412,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.3681833744049072,
"rewards/margins": 0.22236546874046326,
"rewards/rejected": -1.5905488729476929,
"step": 452
},
{
"epoch": 0.9682073203312851,
"grad_norm": 6.472089767456055,
"learning_rate": 2.739052315863355e-09,
"logits/chosen": -1.1393368244171143,
"logits/rejected": -1.0646111965179443,
"logps/chosen": -0.7743910551071167,
"logps/rejected": -0.7680612802505493,
"loss": 1.633,
"rewards/accuracies": 0.25,
"rewards/chosen": -1.9359774589538574,
"rewards/margins": -0.01582423597574234,
"rewards/rejected": -1.9201533794403076,
"step": 453
},
{
"epoch": 0.9703446433342239,
"grad_norm": 7.718911170959473,
"learning_rate": 2.3620312674367816e-09,
"logits/chosen": -1.0543270111083984,
"logits/rejected": -1.031882643699646,
"logps/chosen": -0.6770450472831726,
"logps/rejected": -0.675162672996521,
"loss": 1.4921,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.692612648010254,
"rewards/margins": -0.004705796018242836,
"rewards/rejected": -1.6879067420959473,
"step": 454
},
{
"epoch": 0.9724819663371627,
"grad_norm": 6.906121253967285,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.0301321744918823,
"logits/rejected": -0.9676041007041931,
"logps/chosen": -0.610703706741333,
"logps/rejected": -0.576894223690033,
"loss": 1.5818,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.526759386062622,
"rewards/margins": -0.0845237672328949,
"rewards/rejected": -1.4422357082366943,
"step": 455
},
{
"epoch": 0.9746192893401016,
"grad_norm": 6.149905204772949,
"learning_rate": 1.6915370571756181e-09,
"logits/chosen": -0.9057004451751709,
"logits/rejected": -1.0025300979614258,
"logps/chosen": -0.6752175688743591,
"logps/rejected": -1.0043590068817139,
"loss": 1.4284,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6880438327789307,
"rewards/margins": 0.8228534460067749,
"rewards/rejected": -2.510897397994995,
"step": 456
},
{
"epoch": 0.9767566123430403,
"grad_norm": 8.217068672180176,
"learning_rate": 1.3981014094099353e-09,
"logits/chosen": -1.186248540878296,
"logits/rejected": -1.3456785678863525,
"logps/chosen": -0.8704374432563782,
"logps/rejected": -0.86018306016922,
"loss": 1.6605,
"rewards/accuracies": 0.4375,
"rewards/chosen": -2.176093578338623,
"rewards/margins": -0.02563604712486267,
"rewards/rejected": -2.1504576206207275,
"step": 457
},
{
"epoch": 0.9788939353459791,
"grad_norm": 4.915297508239746,
"learning_rate": 1.1325624767719588e-09,
"logits/chosen": -1.0867304801940918,
"logits/rejected": -1.0485161542892456,
"logps/chosen": -0.5887439846992493,
"logps/rejected": -0.789469838142395,
"loss": 1.5245,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4718600511550903,
"rewards/margins": 0.5018147826194763,
"rewards/rejected": -1.9736747741699219,
"step": 458
},
{
"epoch": 0.981031258348918,
"grad_norm": 6.181196689605713,
"learning_rate": 8.949351161324225e-10,
"logits/chosen": -1.0734320878982544,
"logits/rejected": -1.1374140977859497,
"logps/chosen": -0.8127413392066956,
"logps/rejected": -0.9145556688308716,
"loss": 1.384,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.031853675842285,
"rewards/margins": 0.2545357644557953,
"rewards/rejected": -2.286389112472534,
"step": 459
},
{
"epoch": 0.9831685813518568,
"grad_norm": 8.571794509887695,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -0.8654438257217407,
"logits/rejected": -0.7517091631889343,
"logps/chosen": -0.43124887347221375,
"logps/rejected": -0.5231287479400635,
"loss": 1.4778,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0781221389770508,
"rewards/margins": 0.22969970107078552,
"rewards/rejected": -1.3078218698501587,
"step": 460
},
{
"epoch": 0.9853059043547956,
"grad_norm": 6.118160724639893,
"learning_rate": 5.034667293427053e-10,
"logits/chosen": -1.136691689491272,
"logits/rejected": -1.0298508405685425,
"logps/chosen": -0.5144182443618774,
"logps/rejected": -0.5670837759971619,
"loss": 1.4558,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2860456705093384,
"rewards/margins": 0.13166388869285583,
"rewards/rejected": -1.417709469795227,
"step": 461
},
{
"epoch": 0.9874432273577345,
"grad_norm": 5.212719440460205,
"learning_rate": 3.4964760580069585e-10,
"logits/chosen": -1.0757759809494019,
"logits/rejected": -1.0036826133728027,
"logps/chosen": -0.4357612431049347,
"logps/rejected": -0.8209949731826782,
"loss": 1.3862,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0894030332565308,
"rewards/margins": 0.9630845785140991,
"rewards/rejected": -2.052487373352051,
"step": 462
},
{
"epoch": 0.9895805503606733,
"grad_norm": 4.922801971435547,
"learning_rate": 2.2378385824833866e-10,
"logits/chosen": -1.1101552248001099,
"logits/rejected": -1.023587942123413,
"logps/chosen": -0.5474473237991333,
"logps/rejected": -0.6908574104309082,
"loss": 1.4534,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.368618369102478,
"rewards/margins": 0.35852521657943726,
"rewards/rejected": -1.7271435260772705,
"step": 463
},
{
"epoch": 0.9917178733636121,
"grad_norm": 5.8598952293396,
"learning_rate": 1.2588252874673466e-10,
"logits/chosen": -1.0550298690795898,
"logits/rejected": -1.0083141326904297,
"logps/chosen": -0.419270783662796,
"logps/rejected": -0.5867845416069031,
"loss": 1.4247,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.048176884651184,
"rewards/margins": 0.41878432035446167,
"rewards/rejected": -1.4669612646102905,
"step": 464
},
{
"epoch": 0.9938551963665508,
"grad_norm": 4.032866477966309,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.215372920036316,
"logits/rejected": -1.0714528560638428,
"logps/chosen": -0.42737770080566406,
"logps/rejected": -0.5524105429649353,
"loss": 1.5264,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0684442520141602,
"rewards/margins": 0.31258225440979004,
"rewards/rejected": -1.3810263872146606,
"step": 465
},
{
"epoch": 0.9959925193694897,
"grad_norm": 7.993014812469482,
"learning_rate": 1.3987469365095429e-11,
"logits/chosen": -0.9364662766456604,
"logits/rejected": -0.8720629811286926,
"logps/chosen": -0.5930238962173462,
"logps/rejected": -0.5850106477737427,
"loss": 1.5437,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.4825596809387207,
"rewards/margins": -0.02003306895494461,
"rewards/rejected": -1.462526798248291,
"step": 466
},
{
"epoch": 0.9981298423724285,
"grad_norm": 7.496538162231445,
"learning_rate": 0.0,
"logits/chosen": -0.8774080276489258,
"logits/rejected": -0.8403609395027161,
"logps/chosen": -0.4634042978286743,
"logps/rejected": -0.4342804551124573,
"loss": 1.502,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.158510684967041,
"rewards/margins": -0.07280956953763962,
"rewards/rejected": -1.0857011079788208,
"step": 467
},
{
"epoch": 0.9981298423724285,
"eval_logits/chosen": -1.2647873163223267,
"eval_logits/rejected": -1.2294323444366455,
"eval_logps/chosen": -0.5171914100646973,
"eval_logps/rejected": -0.6633016467094421,
"eval_loss": 1.490802526473999,
"eval_rewards/accuracies": 0.600806474685669,
"eval_rewards/chosen": -1.2929786443710327,
"eval_rewards/margins": 0.3652755916118622,
"eval_rewards/rejected": -1.6582541465759277,
"eval_runtime": 77.7277,
"eval_samples_per_second": 25.229,
"eval_steps_per_second": 0.798,
"step": 467
},
{
"epoch": 0.9981298423724285,
"step": 467,
"total_flos": 0.0,
"train_loss": 1.540159606576221,
"train_runtime": 15000.5444,
"train_samples_per_second": 3.992,
"train_steps_per_second": 0.031
}
],
"logging_steps": 1,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 32,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}