qwen2.5-vl-3b-fft-fdpo / trainer_state.json
chancharikm's picture
Upload folder using huggingface_hub
c4e1fdf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.710508474576271,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013559322033898305,
"grad_norm": 154.68963322032346,
"learning_rate": 4.504504504504504e-08,
"logits/chosen": -1.6006476879119873,
"logits/rejected": -1.7503880262374878,
"logps/chosen": -136.20535278320312,
"logps/rejected": -650.806396484375,
"loss": 0.6902,
"rewards/accuracies": 0.40000003576278687,
"rewards/chosen": -0.002106652595102787,
"rewards/margins": 0.007925467565655708,
"rewards/rejected": -0.01003211922943592,
"step": 10
},
{
"epoch": 0.02711864406779661,
"grad_norm": 143.60255538961036,
"learning_rate": 9.009009009009008e-08,
"logits/chosen": -1.4859544038772583,
"logits/rejected": -1.6451891660690308,
"logps/chosen": -125.66300964355469,
"logps/rejected": -636.0386962890625,
"loss": 0.688,
"rewards/accuracies": 0.5374999642372131,
"rewards/chosen": 0.003618550719693303,
"rewards/margins": 0.012560315430164337,
"rewards/rejected": -0.008941764943301678,
"step": 20
},
{
"epoch": 0.04067796610169491,
"grad_norm": 146.5406135926217,
"learning_rate": 1.3513513513513515e-07,
"logits/chosen": -1.6544740200042725,
"logits/rejected": -1.831311583518982,
"logps/chosen": -134.34368896484375,
"logps/rejected": -654.5028686523438,
"loss": 0.672,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.009306993335485458,
"rewards/margins": 0.04563351720571518,
"rewards/rejected": -0.03632652387022972,
"step": 30
},
{
"epoch": 0.05423728813559322,
"grad_norm": 134.93316662114188,
"learning_rate": 1.8018018018018017e-07,
"logits/chosen": -1.6334645748138428,
"logits/rejected": -1.8082122802734375,
"logps/chosen": -150.40078735351562,
"logps/rejected": -714.775390625,
"loss": 0.5965,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.00473959231749177,
"rewards/margins": 0.20695891976356506,
"rewards/rejected": -0.20221932232379913,
"step": 40
},
{
"epoch": 0.06779661016949153,
"grad_norm": 85.1543469130844,
"learning_rate": 2.2522522522522522e-07,
"logits/chosen": -1.636574387550354,
"logits/rejected": -1.8049229383468628,
"logps/chosen": -142.1752471923828,
"logps/rejected": -667.3391723632812,
"loss": 0.4427,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.007638626731932163,
"rewards/margins": 0.6071017980575562,
"rewards/rejected": -0.5994631052017212,
"step": 50
},
{
"epoch": 0.08135593220338982,
"grad_norm": 55.45578483376317,
"learning_rate": 2.702702702702703e-07,
"logits/chosen": -1.7117058038711548,
"logits/rejected": -1.887060523033142,
"logps/chosen": -117.88133239746094,
"logps/rejected": -649.540283203125,
"loss": 0.2845,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0008934163488447666,
"rewards/margins": 1.142379879951477,
"rewards/rejected": -1.1414865255355835,
"step": 60
},
{
"epoch": 0.09491525423728814,
"grad_norm": 9.23309067720513,
"learning_rate": 3.153153153153153e-07,
"logits/chosen": -1.4379091262817383,
"logits/rejected": -1.6131579875946045,
"logps/chosen": -134.87896728515625,
"logps/rejected": -696.23779296875,
"loss": 0.0565,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.001966355135664344,
"rewards/margins": 3.1120121479034424,
"rewards/rejected": -3.113978624343872,
"step": 70
},
{
"epoch": 0.10847457627118644,
"grad_norm": 2.7786137414686887,
"learning_rate": 3.6036036036036033e-07,
"logits/chosen": -1.531922698020935,
"logits/rejected": -1.6740307807922363,
"logps/chosen": -130.48211669921875,
"logps/rejected": -688.8399047851562,
"loss": 0.0138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.001919470028951764,
"rewards/margins": 4.491945266723633,
"rewards/rejected": -4.490025997161865,
"step": 80
},
{
"epoch": 0.12203389830508475,
"grad_norm": 0.011215121612896709,
"learning_rate": 4.054054054054054e-07,
"logits/chosen": -1.5453845262527466,
"logits/rejected": -1.702711820602417,
"logps/chosen": -137.26138305664062,
"logps/rejected": -749.4444580078125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00951399840414524,
"rewards/margins": 8.38076400756836,
"rewards/rejected": -8.37125015258789,
"step": 90
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.002748606315174295,
"learning_rate": 4.5045045045045043e-07,
"logits/chosen": -1.6784169673919678,
"logits/rejected": -1.8867419958114624,
"logps/chosen": -134.819091796875,
"logps/rejected": -786.6911010742188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0101728904992342,
"rewards/margins": 12.241233825683594,
"rewards/rejected": -12.231060981750488,
"step": 100
},
{
"epoch": 0.13559322033898305,
"eval_logits/chosen": -1.5288071632385254,
"eval_logits/rejected": -1.6835100650787354,
"eval_logps/chosen": -129.43997192382812,
"eval_logps/rejected": -748.4630126953125,
"eval_loss": 3.7055913253425388e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.012690085917711258,
"eval_rewards/margins": 13.008617401123047,
"eval_rewards/rejected": -12.995927810668945,
"eval_runtime": 23.2529,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 1.075,
"step": 100
},
{
"epoch": 0.14915254237288136,
"grad_norm": 0.0011093150463870978,
"learning_rate": 4.954954954954955e-07,
"logits/chosen": -1.57046377658844,
"logits/rejected": -1.7331207990646362,
"logps/chosen": -127.67959594726562,
"logps/rejected": -752.2487182617188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.004207385703921318,
"rewards/margins": 13.458518981933594,
"rewards/rejected": -13.462725639343262,
"step": 110
},
{
"epoch": 0.16271186440677965,
"grad_norm": 0.0005762634187664357,
"learning_rate": 4.999773405362863e-07,
"logits/chosen": -1.6726722717285156,
"logits/rejected": -1.8734807968139648,
"logps/chosen": -143.523193359375,
"logps/rejected": -832.698974609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014902787283062935,
"rewards/margins": 13.953229904174805,
"rewards/rejected": -13.938325881958008,
"step": 120
},
{
"epoch": 0.17627118644067796,
"grad_norm": 0.0003332599393998685,
"learning_rate": 4.998990167994546e-07,
"logits/chosen": -1.6244195699691772,
"logits/rejected": -1.790928602218628,
"logps/chosen": -137.99049377441406,
"logps/rejected": -785.167236328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014037198387086391,
"rewards/margins": 13.896438598632812,
"rewards/rejected": -13.882402420043945,
"step": 130
},
{
"epoch": 0.18983050847457628,
"grad_norm": 0.0003146655208168566,
"learning_rate": 4.997647665674343e-07,
"logits/chosen": -1.6478142738342285,
"logits/rejected": -1.860294222831726,
"logps/chosen": -147.6448516845703,
"logps/rejected": -811.6314086914062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015513481572270393,
"rewards/margins": 14.067631721496582,
"rewards/rejected": -14.052118301391602,
"step": 140
},
{
"epoch": 0.2033898305084746,
"grad_norm": 0.0002489502956030974,
"learning_rate": 4.995746198849412e-07,
"logits/chosen": -1.5896422863006592,
"logits/rejected": -1.7557127475738525,
"logps/chosen": -140.1243896484375,
"logps/rejected": -822.0531616210938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02557023987174034,
"rewards/margins": 14.115985870361328,
"rewards/rejected": -14.090415954589844,
"step": 150
},
{
"epoch": 0.21694915254237288,
"grad_norm": 0.00037978880201729675,
"learning_rate": 4.993286193061145e-07,
"logits/chosen": -1.6275484561920166,
"logits/rejected": -1.8082895278930664,
"logps/chosen": -145.8615264892578,
"logps/rejected": -797.1973876953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005417819134891033,
"rewards/margins": 14.054203033447266,
"rewards/rejected": -14.048785209655762,
"step": 160
},
{
"epoch": 0.2305084745762712,
"grad_norm": 0.0011709678087963664,
"learning_rate": 4.99026819884993e-07,
"logits/chosen": -1.691068172454834,
"logits/rejected": -1.8546725511550903,
"logps/chosen": -140.33941650390625,
"logps/rejected": -833.8118896484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005253595765680075,
"rewards/margins": 14.147695541381836,
"rewards/rejected": -14.142441749572754,
"step": 170
},
{
"epoch": 0.2440677966101695,
"grad_norm": 0.0005430831103481389,
"learning_rate": 4.986692891631945e-07,
"logits/chosen": -1.654329776763916,
"logits/rejected": -1.8268946409225464,
"logps/chosen": -142.9085235595703,
"logps/rejected": -781.0303955078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004031305201351643,
"rewards/margins": 13.904911041259766,
"rewards/rejected": -13.900879859924316,
"step": 180
},
{
"epoch": 0.2576271186440678,
"grad_norm": 0.0002685256860972653,
"learning_rate": 4.982561071548001e-07,
"logits/chosen": -1.537014365196228,
"logits/rejected": -1.6596903800964355,
"logps/chosen": -123.82093811035156,
"logps/rejected": -741.5814208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018150925636291504,
"rewards/margins": 14.061857223510742,
"rewards/rejected": -14.043705940246582,
"step": 190
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.00047407562273882125,
"learning_rate": 4.977873663284474e-07,
"logits/chosen": -1.6226494312286377,
"logits/rejected": -1.8021833896636963,
"logps/chosen": -152.94691467285156,
"logps/rejected": -808.9371337890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021439315751194954,
"rewards/margins": 14.185613632202148,
"rewards/rejected": -14.164173126220703,
"step": 200
},
{
"epoch": 0.2711864406779661,
"eval_logits/chosen": -1.5458643436431885,
"eval_logits/rejected": -1.6861909627914429,
"eval_logps/chosen": -129.42483520507812,
"eval_logps/rejected": -757.507080078125,
"eval_loss": 1.5668128980905749e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.014203321188688278,
"eval_rewards/margins": 13.914541244506836,
"eval_rewards/rejected": -13.900337219238281,
"eval_runtime": 23.2677,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 1.074,
"step": 200
},
{
"epoch": 0.2847457627118644,
"grad_norm": 0.0005141024917889797,
"learning_rate": 4.972631715866361e-07,
"logits/chosen": -1.5850636959075928,
"logits/rejected": -1.7246123552322388,
"logps/chosen": -143.24696350097656,
"logps/rejected": -778.4476318359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008326123468577862,
"rewards/margins": 13.95406436920166,
"rewards/rejected": -13.945737838745117,
"step": 210
},
{
"epoch": 0.2983050847457627,
"grad_norm": 0.0006130760590228257,
"learning_rate": 4.966836402422515e-07,
"logits/chosen": -1.6016970872879028,
"logits/rejected": -1.7708076238632202,
"logps/chosen": -132.4145050048828,
"logps/rejected": -792.4443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017194140702486038,
"rewards/margins": 14.093986511230469,
"rewards/rejected": -14.07679271697998,
"step": 220
},
{
"epoch": 0.31186440677966104,
"grad_norm": 0.000828819568409969,
"learning_rate": 4.960489019923105e-07,
"logits/chosen": -1.679842233657837,
"logits/rejected": -1.8294236660003662,
"logps/chosen": -136.24644470214844,
"logps/rejected": -809.9606323242188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016033075749874115,
"rewards/margins": 14.131494522094727,
"rewards/rejected": -14.115462303161621,
"step": 230
},
{
"epoch": 0.3254237288135593,
"grad_norm": 0.0008292339546344823,
"learning_rate": 4.95359098888935e-07,
"logits/chosen": -1.6466869115829468,
"logits/rejected": -1.8071887493133545,
"logps/chosen": -144.3760986328125,
"logps/rejected": -816.1300659179688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00023554242216050625,
"rewards/margins": 14.113880157470703,
"rewards/rejected": -14.113645553588867,
"step": 240
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.0007326232553653241,
"learning_rate": 4.946143853075625e-07,
"logits/chosen": -1.6482115983963013,
"logits/rejected": -1.8068441152572632,
"logps/chosen": -136.09292602539062,
"logps/rejected": -786.30908203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0055378577671945095,
"rewards/margins": 14.151810646057129,
"rewards/rejected": -14.146272659301758,
"step": 250
},
{
"epoch": 0.3525423728813559,
"grad_norm": 0.0006201108834653548,
"learning_rate": 4.938149279123959e-07,
"logits/chosen": -1.614100694656372,
"logits/rejected": -1.7720319032669067,
"logps/chosen": -145.45883178710938,
"logps/rejected": -772.9180908203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.014799129217863083,
"rewards/margins": 13.972376823425293,
"rewards/rejected": -13.987175941467285,
"step": 260
},
{
"epoch": 0.36610169491525424,
"grad_norm": 0.0002824417146726921,
"learning_rate": 4.929609056191057e-07,
"logits/chosen": -1.6527501344680786,
"logits/rejected": -1.8346607685089111,
"logps/chosen": -147.9451904296875,
"logps/rejected": -845.679931640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.073175558820367e-05,
"rewards/margins": 14.214624404907227,
"rewards/rejected": -14.214543342590332,
"step": 270
},
{
"epoch": 0.37966101694915255,
"grad_norm": 0.0003459147020709031,
"learning_rate": 4.920525095547895e-07,
"logits/chosen": -1.700380802154541,
"logits/rejected": -1.8940513134002686,
"logps/chosen": -148.5547332763672,
"logps/rejected": -806.921142578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0022345068864524364,
"rewards/margins": 13.974288940429688,
"rewards/rejected": -13.976522445678711,
"step": 280
},
{
"epoch": 0.39322033898305087,
"grad_norm": 0.0007686863218550116,
"learning_rate": 4.910899430151973e-07,
"logits/chosen": -1.690828800201416,
"logits/rejected": -1.8689790964126587,
"logps/chosen": -127.06858825683594,
"logps/rejected": -781.9804077148438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014154733158648014,
"rewards/margins": 13.95429801940918,
"rewards/rejected": -13.940142631530762,
"step": 290
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.0007833273852497298,
"learning_rate": 4.900734214192358e-07,
"logits/chosen": -1.657442331314087,
"logits/rejected": -1.8555173873901367,
"logps/chosen": -142.8946075439453,
"logps/rejected": -833.9373168945312,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016108307987451553,
"rewards/margins": 14.204878807067871,
"rewards/rejected": -14.188769340515137,
"step": 300
},
{
"epoch": 0.4067796610169492,
"eval_logits/chosen": -1.5289416313171387,
"eval_logits/rejected": -1.6836309432983398,
"eval_logps/chosen": -129.3828582763672,
"eval_logps/rejected": -758.2346801757812,
"eval_loss": 1.4355653092934517e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.018402252346277237,
"eval_rewards/margins": 13.991499900817871,
"eval_rewards/rejected": -13.97309684753418,
"eval_runtime": 23.0859,
"eval_samples_per_second": 4.332,
"eval_steps_per_second": 1.083,
"step": 300
},
{
"epoch": 0.42033898305084744,
"grad_norm": 0.00021728835196828888,
"learning_rate": 4.890031722607576e-07,
"logits/chosen": -1.5837361812591553,
"logits/rejected": -1.7333931922912598,
"logps/chosen": -133.32366943359375,
"logps/rejected": -758.0372314453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0008433577604591846,
"rewards/margins": 14.209864616394043,
"rewards/rejected": -14.209020614624023,
"step": 310
},
{
"epoch": 0.43389830508474575,
"grad_norm": 0.00042770904659586716,
"learning_rate": 4.878794350576498e-07,
"logits/chosen": -1.577714443206787,
"logits/rejected": -1.759061336517334,
"logps/chosen": -128.18081665039062,
"logps/rejected": -749.0996704101562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006202464923262596,
"rewards/margins": 14.050898551940918,
"rewards/rejected": -14.044694900512695,
"step": 320
},
{
"epoch": 0.44745762711864406,
"grad_norm": 0.0005431267045930932,
"learning_rate": 4.867024612982295e-07,
"logits/chosen": -1.5479421615600586,
"logits/rejected": -1.6868603229522705,
"logps/chosen": -145.11839294433594,
"logps/rejected": -793.3886108398438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007111231796443462,
"rewards/margins": 14.120939254760742,
"rewards/rejected": -14.1138277053833,
"step": 330
},
{
"epoch": 0.4610169491525424,
"grad_norm": 0.0004452926543919869,
"learning_rate": 4.854725143849631e-07,
"logits/chosen": -1.5681132078170776,
"logits/rejected": -1.7265939712524414,
"logps/chosen": -135.96771240234375,
"logps/rejected": -776.887939453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01606500893831253,
"rewards/margins": 13.997052192687988,
"rewards/rejected": -13.980987548828125,
"step": 340
},
{
"epoch": 0.4745762711864407,
"grad_norm": 0.0003403651283500103,
"learning_rate": 4.841898695755167e-07,
"logits/chosen": -1.6001578569412231,
"logits/rejected": -1.7874952554702759,
"logps/chosen": -128.35435485839844,
"logps/rejected": -756.4078979492188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018694868311285973,
"rewards/margins": 14.069332122802734,
"rewards/rejected": -14.050636291503906,
"step": 350
},
{
"epoch": 0.488135593220339,
"grad_norm": 0.0005571850400383578,
"learning_rate": 4.828548139211545e-07,
"logits/chosen": -1.6838054656982422,
"logits/rejected": -1.8665508031845093,
"logps/chosen": -135.80624389648438,
"logps/rejected": -794.292724609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02806948870420456,
"rewards/margins": 14.219563484191895,
"rewards/rejected": -14.19149398803711,
"step": 360
},
{
"epoch": 0.5016949152542373,
"grad_norm": 0.0007950002555106897,
"learning_rate": 4.814676462024987e-07,
"logits/chosen": -1.657123327255249,
"logits/rejected": -1.8259265422821045,
"logps/chosen": -151.13162231445312,
"logps/rejected": -806.3003540039062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016831912100315094,
"rewards/margins": 14.099932670593262,
"rewards/rejected": -14.08310317993164,
"step": 370
},
{
"epoch": 0.5152542372881356,
"grad_norm": 0.0003242442253345165,
"learning_rate": 4.800286768626621e-07,
"logits/chosen": -1.657513976097107,
"logits/rejected": -1.8597712516784668,
"logps/chosen": -139.53028869628906,
"logps/rejected": -826.2210083007812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014754395000636578,
"rewards/margins": 14.141820907592773,
"rewards/rejected": -14.127065658569336,
"step": 380
},
{
"epoch": 0.5288135593220339,
"grad_norm": 0.0005090620291333299,
"learning_rate": 4.785382279377733e-07,
"logits/chosen": -1.6210606098175049,
"logits/rejected": -1.8241839408874512,
"logps/chosen": -143.5242462158203,
"logps/rejected": -836.617431640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02124781347811222,
"rewards/margins": 14.220711708068848,
"rewards/rejected": -14.199464797973633,
"step": 390
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.000825922097218788,
"learning_rate": 4.769966329849054e-07,
"logits/chosen": -1.6811935901641846,
"logits/rejected": -1.8629109859466553,
"logps/chosen": -150.13429260253906,
"logps/rejected": -827.9759521484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0035764453932642937,
"rewards/margins": 14.165657997131348,
"rewards/rejected": -14.162080764770508,
"step": 400
},
{
"epoch": 0.5423728813559322,
"eval_logits/chosen": -1.5275362730026245,
"eval_logits/rejected": -1.6860315799713135,
"eval_logps/chosen": -129.4043426513672,
"eval_logps/rejected": -758.570556640625,
"eval_loss": 1.3993409311297e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.016253653913736343,
"eval_rewards/margins": 14.022944450378418,
"eval_rewards/rejected": -14.006690979003906,
"eval_runtime": 23.1703,
"eval_samples_per_second": 4.316,
"eval_steps_per_second": 1.079,
"step": 400
},
{
"epoch": 0.5559322033898305,
"grad_norm": 0.0007397757326606043,
"learning_rate": 4.7540423700742726e-07,
"logits/chosen": -1.6916306018829346,
"logits/rejected": -1.8490281105041504,
"logps/chosen": -149.16802978515625,
"logps/rejected": -804.88525390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008371539413928986,
"rewards/margins": 14.190282821655273,
"rewards/rejected": -14.18191146850586,
"step": 410
},
{
"epoch": 0.5694915254237288,
"grad_norm": 0.0005353994450256941,
"learning_rate": 4.7376139637779354e-07,
"logits/chosen": -1.6099956035614014,
"logits/rejected": -1.7850123643875122,
"logps/chosen": -122.01039123535156,
"logps/rejected": -747.48974609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017607325688004494,
"rewards/margins": 14.0084867477417,
"rewards/rejected": -13.99087905883789,
"step": 420
},
{
"epoch": 0.5830508474576271,
"grad_norm": 0.00048794620873617576,
"learning_rate": 4.7206847875778913e-07,
"logits/chosen": -1.6391756534576416,
"logits/rejected": -1.8201954364776611,
"logps/chosen": -131.18875122070312,
"logps/rejected": -759.473388671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01886049658060074,
"rewards/margins": 14.103827476501465,
"rewards/rejected": -14.084967613220215,
"step": 430
},
{
"epoch": 0.5966101694915255,
"grad_norm": 0.00044451168413258085,
"learning_rate": 4.70325863016248e-07,
"logits/chosen": -1.678541660308838,
"logits/rejected": -1.8704532384872437,
"logps/chosen": -144.02674865722656,
"logps/rejected": -808.55859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015889765694737434,
"rewards/margins": 14.10900592803955,
"rewards/rejected": -14.093117713928223,
"step": 440
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.00038795072502636444,
"learning_rate": 4.68533939144264e-07,
"logits/chosen": -1.6224497556686401,
"logits/rejected": -1.7962324619293213,
"logps/chosen": -140.4901123046875,
"logps/rejected": -825.280517578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005872346460819244,
"rewards/margins": 14.365765571594238,
"rewards/rejected": -14.359892845153809,
"step": 450
},
{
"epoch": 0.6237288135593221,
"grad_norm": 0.00038646259276520423,
"learning_rate": 4.6669310816791184e-07,
"logits/chosen": -1.7635605335235596,
"logits/rejected": -1.951772928237915,
"logps/chosen": -136.94752502441406,
"logps/rejected": -807.515869140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0075184679590165615,
"rewards/margins": 14.222395896911621,
"rewards/rejected": -14.214877128601074,
"step": 460
},
{
"epoch": 0.6372881355932203,
"grad_norm": 0.0003666927496908433,
"learning_rate": 4.6480378205849926e-07,
"logits/chosen": -1.6419250965118408,
"logits/rejected": -1.7922152280807495,
"logps/chosen": -130.02239990234375,
"logps/rejected": -759.2348022460938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014397606253623962,
"rewards/margins": 14.155649185180664,
"rewards/rejected": -14.141252517700195,
"step": 470
},
{
"epoch": 0.6508474576271186,
"grad_norm": 0.00046994952764087256,
"learning_rate": 4.6286638364036905e-07,
"logits/chosen": -1.6200618743896484,
"logits/rejected": -1.797215461730957,
"logps/chosen": -130.50233459472656,
"logps/rejected": -829.3895263671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02378329634666443,
"rewards/margins": 14.199002265930176,
"rewards/rejected": -14.17521858215332,
"step": 480
},
{
"epoch": 0.6644067796610169,
"grad_norm": 0.00029735714895166485,
"learning_rate": 4.6088134649627284e-07,
"logits/chosen": -1.5747830867767334,
"logits/rejected": -1.753875494003296,
"logps/chosen": -138.4869842529297,
"logps/rejected": -819.8696899414062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010798036120831966,
"rewards/margins": 14.393420219421387,
"rewards/rejected": -14.382623672485352,
"step": 490
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.00035937901840089274,
"learning_rate": 4.5884911487033665e-07,
"logits/chosen": -1.6192841529846191,
"logits/rejected": -1.7925256490707397,
"logps/chosen": -143.53257751464844,
"logps/rejected": -787.250244140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016855549067258835,
"rewards/margins": 14.227276802062988,
"rewards/rejected": -14.210421562194824,
"step": 500
},
{
"epoch": 0.6779661016949152,
"eval_logits/chosen": -1.5297393798828125,
"eval_logits/rejected": -1.6854040622711182,
"eval_logps/chosen": -129.43572998046875,
"eval_logps/rejected": -759.3566284179688,
"eval_loss": 1.3430080798570998e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.01311517134308815,
"eval_rewards/margins": 14.098403930664062,
"eval_rewards/rejected": -14.085289001464844,
"eval_runtime": 23.0065,
"eval_samples_per_second": 4.347,
"eval_steps_per_second": 1.087,
"step": 500
},
{
"epoch": 0.6915254237288135,
"grad_norm": 0.00020815218989513314,
"learning_rate": 4.567701435686404e-07,
"logits/chosen": -1.6033257246017456,
"logits/rejected": -1.7678654193878174,
"logps/chosen": -143.47254943847656,
"logps/rejected": -833.3479614257812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018440861254930496,
"rewards/margins": 14.311261177062988,
"rewards/rejected": -14.292821884155273,
"step": 510
},
{
"epoch": 0.7050847457627119,
"grad_norm": 0.00044702973171401343,
"learning_rate": 4.5464489785743454e-07,
"logits/chosen": -1.588118076324463,
"logits/rejected": -1.7491432428359985,
"logps/chosen": -121.74967193603516,
"logps/rejected": -763.8538818359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021985743194818497,
"rewards/margins": 14.20384693145752,
"rewards/rejected": -14.181861877441406,
"step": 520
},
{
"epoch": 0.7186440677966102,
"grad_norm": 0.00048331111256865464,
"learning_rate": 4.5247385335901457e-07,
"logits/chosen": -1.62116277217865,
"logits/rejected": -1.798133134841919,
"logps/chosen": -144.86624145507812,
"logps/rejected": -807.5030517578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006796732544898987,
"rewards/margins": 14.287379264831543,
"rewards/rejected": -14.280582427978516,
"step": 530
},
{
"epoch": 0.7322033898305085,
"grad_norm": 0.0004276849725743657,
"learning_rate": 4.5025749594527895e-07,
"logits/chosen": -1.5937074422836304,
"logits/rejected": -1.766377329826355,
"logps/chosen": -130.42138671875,
"logps/rejected": -770.9468994140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011606786400079727,
"rewards/margins": 14.063494682312012,
"rewards/rejected": -14.051887512207031,
"step": 540
},
{
"epoch": 0.7457627118644068,
"grad_norm": 0.00047701124764661757,
"learning_rate": 4.4799632162899236e-07,
"logits/chosen": -1.586263656616211,
"logits/rejected": -1.7527462244033813,
"logps/chosen": -138.99436950683594,
"logps/rejected": -832.9317626953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01648543030023575,
"rewards/margins": 14.375706672668457,
"rewards/rejected": -14.359220504760742,
"step": 550
},
{
"epoch": 0.7593220338983051,
"grad_norm": 0.0005331903399639826,
"learning_rate": 4.456908364527802e-07,
"logits/chosen": -1.6967836618423462,
"logits/rejected": -1.8608123064041138,
"logps/chosen": -135.91598510742188,
"logps/rejected": -800.591796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020324843004345894,
"rewards/margins": 14.120574951171875,
"rewards/rejected": -14.100250244140625,
"step": 560
},
{
"epoch": 0.7728813559322034,
"grad_norm": 0.000533186088613475,
"learning_rate": 4.433415563758778e-07,
"logits/chosen": -1.6483113765716553,
"logits/rejected": -1.8176215887069702,
"logps/chosen": -138.11244201660156,
"logps/rejected": -791.1510620117188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01274613942950964,
"rewards/margins": 14.025276184082031,
"rewards/rejected": -14.012529373168945,
"step": 570
},
{
"epoch": 0.7864406779661017,
"grad_norm": 0.0003330593802523546,
"learning_rate": 4.409490071586606e-07,
"logits/chosen": -1.58778715133667,
"logits/rejected": -1.7679705619812012,
"logps/chosen": -143.45578002929688,
"logps/rejected": -871.753173828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004776511341333389,
"rewards/margins": 14.490696907043457,
"rewards/rejected": -14.485920906066895,
"step": 580
},
{
"epoch": 0.8,
"grad_norm": 0.0004356549740596174,
"learning_rate": 4.38513724244981e-07,
"logits/chosen": -1.677300214767456,
"logits/rejected": -1.823121428489685,
"logps/chosen": -130.93118286132812,
"logps/rejected": -767.2311401367188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01857886277139187,
"rewards/margins": 14.194330215454102,
"rewards/rejected": -14.175750732421875,
"step": 590
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.0006738152861953452,
"learning_rate": 4.360362526423382e-07,
"logits/chosen": -1.6017038822174072,
"logits/rejected": -1.7450367212295532,
"logps/chosen": -134.8054656982422,
"logps/rejected": -829.2752075195312,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023766428232192993,
"rewards/margins": 14.333120346069336,
"rewards/rejected": -14.309354782104492,
"step": 600
},
{
"epoch": 0.8135593220338984,
"eval_logits/chosen": -1.535141944885254,
"eval_logits/rejected": -1.6854389905929565,
"eval_logps/chosen": -129.3685302734375,
"eval_logps/rejected": -759.9224853515625,
"eval_loss": 1.2519085430540144e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.019834483042359352,
"eval_rewards/margins": 14.161703109741211,
"eval_rewards/rejected": -14.14186954498291,
"eval_runtime": 23.2942,
"eval_samples_per_second": 4.293,
"eval_steps_per_second": 1.073,
"step": 600
},
{
"epoch": 0.8271186440677966,
"grad_norm": 0.00028376978865923564,
"learning_rate": 4.3351714679990706e-07,
"logits/chosen": -1.5073317289352417,
"logits/rejected": -1.630771517753601,
"logps/chosen": -108.68913269042969,
"logps/rejected": -755.2454223632812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021552443504333496,
"rewards/margins": 14.418371200561523,
"rewards/rejected": -14.396819114685059,
"step": 610
},
{
"epoch": 0.8406779661016949,
"grad_norm": 0.00048164470828510026,
"learning_rate": 4.3095697048445447e-07,
"logits/chosen": -1.6341415643692017,
"logits/rejected": -1.7738165855407715,
"logps/chosen": -140.3278045654297,
"logps/rejected": -794.1578369140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0028394628316164017,
"rewards/margins": 14.288480758666992,
"rewards/rejected": -14.28564167022705,
"step": 620
},
{
"epoch": 0.8542372881355932,
"grad_norm": 0.00046143931913359906,
"learning_rate": 4.283562966541707e-07,
"logits/chosen": -1.5982298851013184,
"logits/rejected": -1.7638392448425293,
"logps/chosen": -141.6815185546875,
"logps/rejected": -788.85888671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018078766763210297,
"rewards/margins": 14.164934158325195,
"rewards/rejected": -14.146854400634766,
"step": 630
},
{
"epoch": 0.8677966101694915,
"grad_norm": 0.00032792057687413,
"learning_rate": 4.25715707330443e-07,
"logits/chosen": -1.5992499589920044,
"logits/rejected": -1.7798690795898438,
"logps/chosen": -128.59031677246094,
"logps/rejected": -792.5185546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009904641658067703,
"rewards/margins": 14.5419340133667,
"rewards/rejected": -14.532029151916504,
"step": 640
},
{
"epoch": 0.8813559322033898,
"grad_norm": 0.00021017148877000384,
"learning_rate": 4.2303579346760173e-07,
"logits/chosen": -1.613771915435791,
"logits/rejected": -1.801413655281067,
"logps/chosen": -143.33572387695312,
"logps/rejected": -811.8010864257812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015471378341317177,
"rewards/margins": 14.446039199829102,
"rewards/rejected": -14.430566787719727,
"step": 650
},
{
"epoch": 0.8949152542372881,
"grad_norm": 0.0003556697469879977,
"learning_rate": 4.2031715482066655e-07,
"logits/chosen": -1.585208535194397,
"logits/rejected": -1.7628443241119385,
"logps/chosen": -133.2225799560547,
"logps/rejected": -783.7073974609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01283181644976139,
"rewards/margins": 14.336665153503418,
"rewards/rejected": -14.323833465576172,
"step": 660
},
{
"epoch": 0.9084745762711864,
"grad_norm": 0.00032469878909931687,
"learning_rate": 4.1756039981112373e-07,
"logits/chosen": -1.5636094808578491,
"logits/rejected": -1.7128851413726807,
"logps/chosen": -138.43899536132812,
"logps/rejected": -804.6480712890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008499560877680779,
"rewards/margins": 14.382369995117188,
"rewards/rejected": -14.373868942260742,
"step": 670
},
{
"epoch": 0.9220338983050848,
"grad_norm": 0.0002716865446675816,
"learning_rate": 4.147661453907635e-07,
"logits/chosen": -1.6562086343765259,
"logits/rejected": -1.8350610733032227,
"logps/chosen": -153.1056365966797,
"logps/rejected": -784.3397216796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011557278223335743,
"rewards/margins": 14.29238510131836,
"rewards/rejected": -14.280828475952148,
"step": 680
},
{
"epoch": 0.9355932203389831,
"grad_norm": 0.00021878390732588127,
"learning_rate": 4.1193501690360834e-07,
"logits/chosen": -1.582091212272644,
"logits/rejected": -1.7254104614257812,
"logps/chosen": -141.56887817382812,
"logps/rejected": -816.9537353515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.025769073516130447,
"rewards/margins": 14.515641212463379,
"rewards/rejected": -14.489871978759766,
"step": 690
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.0006094805270326929,
"learning_rate": 4.0906764794596347e-07,
"logits/chosen": -1.5609385967254639,
"logits/rejected": -1.7095026969909668,
"logps/chosen": -142.61056518554688,
"logps/rejected": -779.0126953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011463040485978127,
"rewards/margins": 14.310674667358398,
"rewards/rejected": -14.299211502075195,
"step": 700
},
{
"epoch": 0.9491525423728814,
"eval_logits/chosen": -1.5444095134735107,
"eval_logits/rejected": -1.686043620109558,
"eval_logps/chosen": -129.370361328125,
"eval_logps/rejected": -760.5565795898438,
"eval_loss": 1.1656707101792563e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.019651925191283226,
"eval_rewards/margins": 14.224937438964844,
"eval_rewards/rejected": -14.205286026000977,
"eval_runtime": 23.3494,
"eval_samples_per_second": 4.283,
"eval_steps_per_second": 1.071,
"step": 700
},
{
"epoch": 0.9627118644067797,
"grad_norm": 0.0006059188782453061,
"learning_rate": 4.0616468022462013e-07,
"logits/chosen": -1.6205476522445679,
"logits/rejected": -1.785776138305664,
"logps/chosen": -137.9543914794922,
"logps/rejected": -812.373291015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015250766649842262,
"rewards/margins": 14.447315216064453,
"rewards/rejected": -14.432065963745117,
"step": 710
},
{
"epoch": 0.976271186440678,
"grad_norm": 0.00027120641837016974,
"learning_rate": 4.0322676341324414e-07,
"logits/chosen": -1.613734245300293,
"logits/rejected": -1.7981483936309814,
"logps/chosen": -133.99325561523438,
"logps/rejected": -817.4915771484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011865440756082535,
"rewards/margins": 14.308055877685547,
"rewards/rejected": -14.296191215515137,
"step": 720
},
{
"epoch": 0.9898305084745763,
"grad_norm": 0.000231523046440184,
"learning_rate": 4.002545550069808e-07,
"logits/chosen": -1.6496268510818481,
"logits/rejected": -1.841505527496338,
"logps/chosen": -140.759765625,
"logps/rejected": -784.6021118164062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014270871877670288,
"rewards/margins": 14.330395698547363,
"rewards/rejected": -14.31612491607666,
"step": 730
},
{
"epoch": 1.0027118644067796,
"grad_norm": 0.00021386925480897464,
"learning_rate": 3.972487201753106e-07,
"logits/chosen": -1.6074280738830566,
"logits/rejected": -1.7795652151107788,
"logps/chosen": -136.89691162109375,
"logps/rejected": -791.8516845703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0169425867497921,
"rewards/margins": 14.432229995727539,
"rewards/rejected": -14.415288925170898,
"step": 740
},
{
"epoch": 1.016271186440678,
"grad_norm": 0.00040145408076981584,
"learning_rate": 3.9420993161318615e-07,
"logits/chosen": -1.6055034399032593,
"logits/rejected": -1.77395761013031,
"logps/chosen": -134.6471405029297,
"logps/rejected": -791.5114135742188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014754224568605423,
"rewards/margins": 14.438673973083496,
"rewards/rejected": -14.423918724060059,
"step": 750
},
{
"epoch": 1.0298305084745762,
"grad_norm": 0.00025553044965608517,
"learning_rate": 3.911388693904854e-07,
"logits/chosen": -1.6558409929275513,
"logits/rejected": -1.8468029499053955,
"logps/chosen": -140.86526489257812,
"logps/rejected": -796.6107788085938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010989008471369743,
"rewards/margins": 14.3390531539917,
"rewards/rejected": -14.32806396484375,
"step": 760
},
{
"epoch": 1.0433898305084746,
"grad_norm": 0.00032980079352337916,
"learning_rate": 3.8803622079981496e-07,
"logits/chosen": -1.578873872756958,
"logits/rejected": -1.7349332571029663,
"logps/chosen": -130.14547729492188,
"logps/rejected": -764.939208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0018688759300857782,
"rewards/margins": 14.415413856506348,
"rewards/rejected": -14.413544654846191,
"step": 770
},
{
"epoch": 1.0569491525423729,
"grad_norm": 0.00031172746262580516,
"learning_rate": 3.8490268020269614e-07,
"logits/chosen": -1.6029382944107056,
"logits/rejected": -1.7419565916061401,
"logps/chosen": -140.53709411621094,
"logps/rejected": -811.80908203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016495605930685997,
"rewards/margins": 14.483687400817871,
"rewards/rejected": -14.467193603515625,
"step": 780
},
{
"epoch": 1.0705084745762712,
"grad_norm": 0.0003301712565175788,
"learning_rate": 3.817389488741694e-07,
"logits/chosen": -1.57521653175354,
"logits/rejected": -1.7471016645431519,
"logps/chosen": -133.0887451171875,
"logps/rejected": -820.1575927734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.027268720790743828,
"rewards/margins": 14.49877643585205,
"rewards/rejected": -14.471506118774414,
"step": 790
},
{
"epoch": 1.0840677966101695,
"grad_norm": 0.000600013552539676,
"learning_rate": 3.785457348458516e-07,
"logits/chosen": -1.6120831966400146,
"logits/rejected": -1.783928394317627,
"logps/chosen": -145.61685180664062,
"logps/rejected": -781.661376953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0003941003233194351,
"rewards/margins": 14.212892532348633,
"rewards/rejected": -14.21249771118164,
"step": 800
},
{
"epoch": 1.0840677966101695,
"eval_logits/chosen": -1.5446038246154785,
"eval_logits/rejected": -1.6855926513671875,
"eval_logps/chosen": -129.36080932617188,
"eval_logps/rejected": -761.1478881835938,
"eval_loss": 1.1145154985570116e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.020607449114322662,
"eval_rewards/margins": 14.285026550292969,
"eval_rewards/rejected": -14.264419555664062,
"eval_runtime": 23.5778,
"eval_samples_per_second": 4.241,
"eval_steps_per_second": 1.06,
"step": 800
},
{
"epoch": 1.0976271186440678,
"grad_norm": 0.00037026369248627336,
"learning_rate": 3.753237527474812e-07,
"logits/chosen": -1.5942147970199585,
"logits/rejected": -1.7750327587127686,
"logps/chosen": -137.2639923095703,
"logps/rejected": -795.8204956054688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018559224903583527,
"rewards/margins": 14.402730941772461,
"rewards/rejected": -14.384172439575195,
"step": 810
},
{
"epoch": 1.1111864406779661,
"grad_norm": 0.0002646065707675486,
"learning_rate": 3.7207372364698645e-07,
"logits/chosen": -1.6505929231643677,
"logits/rejected": -1.810257911682129,
"logps/chosen": -137.0296173095703,
"logps/rejected": -838.5280151367188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02975718304514885,
"rewards/margins": 14.56403923034668,
"rewards/rejected": -14.534282684326172,
"step": 820
},
{
"epoch": 1.1247457627118644,
"grad_norm": 0.0004516110377872378,
"learning_rate": 3.687963748891131e-07,
"logits/chosen": -1.5973923206329346,
"logits/rejected": -1.7931503057479858,
"logps/chosen": -148.9749755859375,
"logps/rejected": -849.8316650390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007867500185966492,
"rewards/margins": 14.508625030517578,
"rewards/rejected": -14.50075626373291,
"step": 830
},
{
"epoch": 1.1383050847457628,
"grad_norm": 0.000400542368638696,
"learning_rate": 3.6549243993264747e-07,
"logits/chosen": -1.645015001296997,
"logits/rejected": -1.8152064085006714,
"logps/chosen": -134.02035522460938,
"logps/rejected": -784.4261474609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00821724720299244,
"rewards/margins": 14.368156433105469,
"rewards/rejected": -14.359938621520996,
"step": 840
},
{
"epoch": 1.151864406779661,
"grad_norm": 0.0007766829343963702,
"learning_rate": 3.6216265818627066e-07,
"logits/chosen": -1.6092370748519897,
"logits/rejected": -1.7626619338989258,
"logps/chosen": -147.05447387695312,
"logps/rejected": -797.9004516601562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005066628567874432,
"rewards/margins": 14.369694709777832,
"rewards/rejected": -14.36462688446045,
"step": 850
},
{
"epoch": 1.1654237288135594,
"grad_norm": 0.00034472860967714935,
"learning_rate": 3.588077748430819e-07,
"logits/chosen": -1.53743577003479,
"logits/rejected": -1.6860629320144653,
"logps/chosen": -130.87753295898438,
"logps/rejected": -766.9425048828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017799021676182747,
"rewards/margins": 14.47961139678955,
"rewards/rejected": -14.461811065673828,
"step": 860
},
{
"epoch": 1.1789830508474577,
"grad_norm": 0.00026858839945804227,
"learning_rate": 3.554285407138269e-07,
"logits/chosen": -1.5462911128997803,
"logits/rejected": -1.691042423248291,
"logps/chosen": -129.9382781982422,
"logps/rejected": -737.553955078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0059865121729671955,
"rewards/margins": 14.388315200805664,
"rewards/rejected": -14.382328987121582,
"step": 870
},
{
"epoch": 1.192542372881356,
"grad_norm": 0.00031207053329295663,
"learning_rate": 3.5202571205886913e-07,
"logits/chosen": -1.6693938970565796,
"logits/rejected": -1.8746066093444824,
"logps/chosen": -148.41986083984375,
"logps/rejected": -816.6522827148438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023402290418744087,
"rewards/margins": 14.48322582244873,
"rewards/rejected": -14.459823608398438,
"step": 880
},
{
"epoch": 1.2061016949152543,
"grad_norm": 0.0004730088609444601,
"learning_rate": 3.486000504189414e-07,
"logits/chosen": -1.5571904182434082,
"logits/rejected": -1.739476203918457,
"logps/chosen": -131.6705322265625,
"logps/rejected": -770.3902587890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011558154597878456,
"rewards/margins": 14.456482887268066,
"rewards/rejected": -14.444924354553223,
"step": 890
},
{
"epoch": 1.2196610169491526,
"grad_norm": 0.0004757504728478808,
"learning_rate": 3.4515232244471606e-07,
"logits/chosen": -1.4787291288375854,
"logits/rejected": -1.661275029182434,
"logps/chosen": -151.16482543945312,
"logps/rejected": -888.5305786132812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014035153202712536,
"rewards/margins": 14.75745964050293,
"rewards/rejected": -14.743424415588379,
"step": 900
},
{
"epoch": 1.2196610169491526,
"eval_logits/chosen": -1.5419373512268066,
"eval_logits/rejected": -1.685987949371338,
"eval_logps/chosen": -129.385986328125,
"eval_logps/rejected": -761.9303588867188,
"eval_loss": 1.042011717800051e-06,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.01808955892920494,
"eval_rewards/margins": 14.360756874084473,
"eval_rewards/rejected": -14.342667579650879,
"eval_runtime": 23.2568,
"eval_samples_per_second": 4.3,
"eval_steps_per_second": 1.075,
"step": 900
},
{
"epoch": 1.2332203389830507,
"grad_norm": 0.0003716686876859065,
"learning_rate": 3.41683299725231e-07,
"logits/chosen": -1.6072585582733154,
"logits/rejected": -1.7751410007476807,
"logps/chosen": -121.9486312866211,
"logps/rejected": -751.9805297851562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014703430235385895,
"rewards/margins": 14.331403732299805,
"rewards/rejected": -14.316699981689453,
"step": 910
},
{
"epoch": 1.2467796610169493,
"grad_norm": 0.00045583582930412936,
"learning_rate": 3.3819375861521116e-07,
"logits/chosen": -1.6996678113937378,
"logits/rejected": -1.8710010051727295,
"logps/chosen": -144.1748046875,
"logps/rejected": -781.7122192382812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00705701345577836,
"rewards/margins": 14.237908363342285,
"rewards/rejected": -14.244966506958008,
"step": 920
},
{
"epoch": 1.2603389830508474,
"grad_norm": 0.00027042786850133124,
"learning_rate": 3.346844800613229e-07,
"logits/chosen": -1.635947823524475,
"logits/rejected": -1.776626467704773,
"logps/chosen": -122.72543334960938,
"logps/rejected": -782.336669921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01217057928442955,
"rewards/margins": 14.620203971862793,
"rewards/rejected": -14.608033180236816,
"step": 930
},
{
"epoch": 1.2738983050847459,
"grad_norm": 0.00012613818860590913,
"learning_rate": 3.311562494274009e-07,
"logits/chosen": -1.5437029600143433,
"logits/rejected": -1.7053155899047852,
"logps/chosen": -146.37957763671875,
"logps/rejected": -853.6106567382812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013366146944463253,
"rewards/margins": 14.802824020385742,
"rewards/rejected": -14.789458274841309,
"step": 940
},
{
"epoch": 1.287457627118644,
"grad_norm": 0.0009108583927914208,
"learning_rate": 3.2760985631868716e-07,
"logits/chosen": -1.6577401161193848,
"logits/rejected": -1.8012009859085083,
"logps/chosen": -128.98199462890625,
"logps/rejected": -803.785400390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0071465689688920975,
"rewards/margins": 14.534771919250488,
"rewards/rejected": -14.527626037597656,
"step": 950
},
{
"epoch": 1.3010169491525423,
"grad_norm": 0.00018290415757236828,
"learning_rate": 3.240460944051194e-07,
"logits/chosen": -1.6779531240463257,
"logits/rejected": -1.866011142730713,
"logps/chosen": -139.95425415039062,
"logps/rejected": -777.5191650390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005402210168540478,
"rewards/margins": 14.424090385437012,
"rewards/rejected": -14.41868782043457,
"step": 960
},
{
"epoch": 1.3145762711864406,
"grad_norm": 0.00032737612531831567,
"learning_rate": 3.2046576124371106e-07,
"logits/chosen": -1.6980135440826416,
"logits/rejected": -1.8551394939422607,
"logps/chosen": -135.28433227539062,
"logps/rejected": -785.8731689453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01490587368607521,
"rewards/margins": 14.301523208618164,
"rewards/rejected": -14.286617279052734,
"step": 970
},
{
"epoch": 1.328135593220339,
"grad_norm": 0.00025153854347185144,
"learning_rate": 3.1686965810006104e-07,
"logits/chosen": -1.617310643196106,
"logits/rejected": -1.804062008857727,
"logps/chosen": -139.4668731689453,
"logps/rejected": -800.4359130859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021481037139892578,
"rewards/margins": 14.51413345336914,
"rewards/rejected": -14.492652893066406,
"step": 980
},
{
"epoch": 1.3416949152542372,
"grad_norm": 0.000131467090970899,
"learning_rate": 3.132585897690329e-07,
"logits/chosen": -1.6868890523910522,
"logits/rejected": -1.8882062435150146,
"logps/chosen": -145.70948791503906,
"logps/rejected": -857.4185180664062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008370953612029552,
"rewards/margins": 14.516642570495605,
"rewards/rejected": -14.508271217346191,
"step": 990
},
{
"epoch": 1.3552542372881355,
"grad_norm": 0.00031956784200775317,
"learning_rate": 3.096333643946452e-07,
"logits/chosen": -1.6276357173919678,
"logits/rejected": -1.8146514892578125,
"logps/chosen": -142.74488830566406,
"logps/rejected": -807.506103515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012263872660696507,
"rewards/margins": 14.435070037841797,
"rewards/rejected": -14.422805786132812,
"step": 1000
},
{
"epoch": 1.3552542372881355,
"eval_logits/chosen": -1.5454390048980713,
"eval_logits/rejected": -1.6858749389648438,
"eval_logps/chosen": -129.4228973388672,
"eval_logps/rejected": -762.3302612304688,
"eval_loss": 9.916642511598184e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.014398334547877312,
"eval_rewards/margins": 14.39704704284668,
"eval_rewards/rejected": -14.382649421691895,
"eval_runtime": 23.2909,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 1.073,
"step": 1000
},
{
"epoch": 1.3688135593220339,
"grad_norm": 0.0005110214664274667,
"learning_rate": 3.059947932892113e-07,
"logits/chosen": -1.581527590751648,
"logits/rejected": -1.741445541381836,
"logps/chosen": -130.16134643554688,
"logps/rejected": -802.2377319335938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008667388930916786,
"rewards/margins": 14.611343383789062,
"rewards/rejected": -14.602675437927246,
"step": 1010
},
{
"epoch": 1.3823728813559322,
"grad_norm": 0.000385109902014585,
"learning_rate": 3.0234369075177105e-07,
"logits/chosen": -1.6576886177062988,
"logits/rejected": -1.8101606369018555,
"logps/chosen": -139.96914672851562,
"logps/rejected": -817.3280639648438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023594459518790245,
"rewards/margins": 14.629423141479492,
"rewards/rejected": -14.605830192565918,
"step": 1020
},
{
"epoch": 1.3959322033898305,
"grad_norm": 0.00040091688342177467,
"learning_rate": 2.9868087388585344e-07,
"logits/chosen": -1.5842094421386719,
"logits/rejected": -1.7406607866287231,
"logps/chosen": -129.07803344726562,
"logps/rejected": -770.8359985351562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013609381392598152,
"rewards/margins": 14.5753812789917,
"rewards/rejected": -14.561771392822266,
"step": 1030
},
{
"epoch": 1.4094915254237288,
"grad_norm": 0.00026033035395961934,
"learning_rate": 2.950071624166115e-07,
"logits/chosen": -1.7157691717147827,
"logits/rejected": -1.9252738952636719,
"logps/chosen": -141.04685974121094,
"logps/rejected": -809.7179565429688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015553833916783333,
"rewards/margins": 14.591537475585938,
"rewards/rejected": -14.575984954833984,
"step": 1040
},
{
"epoch": 1.423050847457627,
"grad_norm": 0.00019886076594595082,
"learning_rate": 2.9132337850737127e-07,
"logits/chosen": -1.5866303443908691,
"logits/rejected": -1.745476245880127,
"logps/chosen": -136.6873779296875,
"logps/rejected": -779.5513916015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01606130599975586,
"rewards/margins": 14.531635284423828,
"rewards/rejected": -14.51557445526123,
"step": 1050
},
{
"epoch": 1.4366101694915254,
"grad_norm": 0.0003434669449456214,
"learning_rate": 2.8763034657563425e-07,
"logits/chosen": -1.594861626625061,
"logits/rejected": -1.7643938064575195,
"logps/chosen": -139.26364135742188,
"logps/rejected": -812.1429443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015910768881440163,
"rewards/margins": 14.620904922485352,
"rewards/rejected": -14.60499382019043,
"step": 1060
},
{
"epoch": 1.4501694915254237,
"grad_norm": 0.00024295622286491588,
"learning_rate": 2.839288931085761e-07,
"logits/chosen": -1.6374831199645996,
"logits/rejected": -1.8399394750595093,
"logps/chosen": -146.24412536621094,
"logps/rejected": -838.77587890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0162392258644104,
"rewards/margins": 14.61961555480957,
"rewards/rejected": -14.603377342224121,
"step": 1070
},
{
"epoch": 1.463728813559322,
"grad_norm": 0.00021624383711316104,
"learning_rate": 2.802198464780814e-07,
"logits/chosen": -1.6567904949188232,
"logits/rejected": -1.8536536693572998,
"logps/chosen": -140.39337158203125,
"logps/rejected": -793.81640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.022600781172513962,
"rewards/margins": 14.519584655761719,
"rewards/rejected": -14.496984481811523,
"step": 1080
},
{
"epoch": 1.4772881355932204,
"grad_norm": 0.00025925077468732225,
"learning_rate": 2.765040367553572e-07,
"logits/chosen": -1.6113574504852295,
"logits/rejected": -1.8030925989151,
"logps/chosen": -137.23849487304688,
"logps/rejected": -808.6797485351562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005808805115520954,
"rewards/margins": 14.623893737792969,
"rewards/rejected": -14.618083953857422,
"step": 1090
},
{
"epoch": 1.4908474576271187,
"grad_norm": 0.0003907081889108827,
"learning_rate": 2.727822955251663e-07,
"logits/chosen": -1.6336138248443604,
"logits/rejected": -1.7939205169677734,
"logps/chosen": -135.02374267578125,
"logps/rejected": -803.9449462890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01596558652818203,
"rewards/margins": 14.46599292755127,
"rewards/rejected": -14.450026512145996,
"step": 1100
},
{
"epoch": 1.4908474576271187,
"eval_logits/chosen": -1.5441298484802246,
"eval_logits/rejected": -1.6875685453414917,
"eval_logps/chosen": -129.44482421875,
"eval_logps/rejected": -762.7908325195312,
"eval_loss": 9.502831517238519e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.012204695492982864,
"eval_rewards/margins": 14.44091510772705,
"eval_rewards/rejected": -14.4287109375,
"eval_runtime": 23.159,
"eval_samples_per_second": 4.318,
"eval_steps_per_second": 1.079,
"step": 1100
},
{
"epoch": 1.504406779661017,
"grad_norm": 0.00022865739977937088,
"learning_rate": 2.6905545569972124e-07,
"logits/chosen": -1.5306434631347656,
"logits/rejected": -1.7007068395614624,
"logps/chosen": -141.87411499023438,
"logps/rejected": -800.179931640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020658917725086212,
"rewards/margins": 14.778863906860352,
"rewards/rejected": -14.758204460144043,
"step": 1110
},
{
"epoch": 1.5179661016949153,
"grad_norm": 0.00013390402806158302,
"learning_rate": 2.6532435133228176e-07,
"logits/chosen": -1.6342971324920654,
"logits/rejected": -1.7988243103027344,
"logps/chosen": -137.4864501953125,
"logps/rejected": -778.6915283203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011233944445848465,
"rewards/margins": 14.523719787597656,
"rewards/rejected": -14.51248550415039,
"step": 1120
},
{
"epoch": 1.5315254237288136,
"grad_norm": 0.0005026258076613325,
"learning_rate": 2.615898174304967e-07,
"logits/chosen": -1.6171948909759521,
"logits/rejected": -1.7615530490875244,
"logps/chosen": -157.30349731445312,
"logps/rejected": -839.1676635742188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011025247164070606,
"rewards/margins": 14.724686622619629,
"rewards/rejected": -14.713661193847656,
"step": 1130
},
{
"epoch": 1.5450847457627117,
"grad_norm": 0.00042097845856288896,
"learning_rate": 2.5785268976953204e-07,
"logits/chosen": -1.6210533380508423,
"logits/rejected": -1.776179552078247,
"logps/chosen": -138.84580993652344,
"logps/rejected": -801.5419921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006160397548228502,
"rewards/margins": 14.480558395385742,
"rewards/rejected": -14.474397659301758,
"step": 1140
},
{
"epoch": 1.5586440677966102,
"grad_norm": 0.00020478295097651352,
"learning_rate": 2.541138047050281e-07,
"logits/chosen": -1.5814204216003418,
"logits/rejected": -1.7641853094100952,
"logps/chosen": -137.55215454101562,
"logps/rejected": -799.6993408203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021200550720095634,
"rewards/margins": 14.847275733947754,
"rewards/rejected": -14.826072692871094,
"step": 1150
},
{
"epoch": 1.5722033898305083,
"grad_norm": 0.00023188287664193435,
"learning_rate": 2.5037399898592537e-07,
"logits/chosen": -1.5283310413360596,
"logits/rejected": -1.697199821472168,
"logps/chosen": -136.95700073242188,
"logps/rejected": -806.5762329101562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.025980135425925255,
"rewards/margins": 14.58230209350586,
"rewards/rejected": -14.55632209777832,
"step": 1160
},
{
"epoch": 1.5857627118644069,
"grad_norm": 0.0002882430862738881,
"learning_rate": 2.466341095672036e-07,
"logits/chosen": -1.5664174556732178,
"logits/rejected": -1.7281914949417114,
"logps/chosen": -140.96392822265625,
"logps/rejected": -860.5864868164062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015253797173500061,
"rewards/margins": 14.810201644897461,
"rewards/rejected": -14.794947624206543,
"step": 1170
},
{
"epoch": 1.599322033898305,
"grad_norm": 0.00032442091374497006,
"learning_rate": 2.428949734225744e-07,
"logits/chosen": -1.5551780462265015,
"logits/rejected": -1.7200207710266113,
"logps/chosen": -116.5546646118164,
"logps/rejected": -769.7291259765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020303260535001755,
"rewards/margins": 14.525774002075195,
"rewards/rejected": -14.505470275878906,
"step": 1180
},
{
"epoch": 1.6128813559322035,
"grad_norm": 0.00026348293289965986,
"learning_rate": 2.3915742735716914e-07,
"logits/chosen": -1.5562556982040405,
"logits/rejected": -1.721585988998413,
"logps/chosen": -132.5054931640625,
"logps/rejected": -780.1019287109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0179964117705822,
"rewards/margins": 14.633602142333984,
"rewards/rejected": -14.615605354309082,
"step": 1190
},
{
"epoch": 1.6264406779661016,
"grad_norm": 0.0004614000733649941,
"learning_rate": 2.3542230782026533e-07,
"logits/chosen": -1.6813287734985352,
"logits/rejected": -1.8313826322555542,
"logps/chosen": -139.7794189453125,
"logps/rejected": -794.0023803710938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018270863220095634,
"rewards/margins": 14.51429271697998,
"rewards/rejected": -14.496021270751953,
"step": 1200
},
{
"epoch": 1.6264406779661016,
"eval_logits/chosen": -1.5334513187408447,
"eval_logits/rejected": -1.6858574151992798,
"eval_logps/chosen": -129.38221740722656,
"eval_logps/rejected": -763.2522583007812,
"eval_loss": 8.998525800052448e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.01846611313521862,
"eval_rewards/margins": 14.493313789367676,
"eval_rewards/rejected": -14.474847793579102,
"eval_runtime": 23.3442,
"eval_samples_per_second": 4.284,
"eval_steps_per_second": 1.071,
"step": 1200
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.0002485397852219414,
"learning_rate": 2.3169045071809214e-07,
"logits/chosen": -1.6468091011047363,
"logits/rejected": -1.837449312210083,
"logps/chosen": -132.42532348632812,
"logps/rejected": -768.0850830078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.118643017951399e-05,
"rewards/margins": 14.453639030456543,
"rewards/rejected": -14.453720092773438,
"step": 1210
},
{
"epoch": 1.6535593220338982,
"grad_norm": 0.0002600309213234688,
"learning_rate": 2.279626912267576e-07,
"logits/chosen": -1.6016912460327148,
"logits/rejected": -1.7917726039886475,
"logps/chosen": -124.97222137451172,
"logps/rejected": -763.31396484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0005855015479028225,
"rewards/margins": 14.508626937866211,
"rewards/rejected": -14.509212493896484,
"step": 1220
},
{
"epoch": 1.6671186440677968,
"grad_norm": 0.0001866086825603447,
"learning_rate": 2.2423986360533944e-07,
"logits/chosen": -1.5999516248703003,
"logits/rejected": -1.7527964115142822,
"logps/chosen": -127.54315185546875,
"logps/rejected": -779.42626953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03687942773103714,
"rewards/margins": 14.615551948547363,
"rewards/rejected": -14.578672409057617,
"step": 1230
},
{
"epoch": 1.6806779661016948,
"grad_norm": 0.0001806454551986488,
"learning_rate": 2.2052280100918053e-07,
"logits/chosen": -1.6087480783462524,
"logits/rejected": -1.733353614807129,
"logps/chosen": -139.215576171875,
"logps/rejected": -828.4327392578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012436101213097572,
"rewards/margins": 14.784907341003418,
"rewards/rejected": -14.77247142791748,
"step": 1240
},
{
"epoch": 1.6942372881355934,
"grad_norm": 0.0003598095391952748,
"learning_rate": 2.1681233530343226e-07,
"logits/chosen": -1.6320774555206299,
"logits/rejected": -1.770911455154419,
"logps/chosen": -130.1085968017578,
"logps/rejected": -781.075927734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008734717965126038,
"rewards/margins": 14.645079612731934,
"rewards/rejected": -14.636345863342285,
"step": 1250
},
{
"epoch": 1.7077966101694915,
"grad_norm": 0.00021229397199613504,
"learning_rate": 2.131092968768856e-07,
"logits/chosen": -1.6914520263671875,
"logits/rejected": -1.8534152507781982,
"logps/chosen": -137.82667541503906,
"logps/rejected": -795.1356811523438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01237824559211731,
"rewards/margins": 14.534427642822266,
"rewards/rejected": -14.522048950195312,
"step": 1260
},
{
"epoch": 1.7213559322033898,
"grad_norm": 0.00034558435149114095,
"learning_rate": 2.094145144561334e-07,
"logits/chosen": -1.6149834394454956,
"logits/rejected": -1.781679391860962,
"logps/chosen": -135.41860961914062,
"logps/rejected": -788.1875610351562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007942800410091877,
"rewards/margins": 14.633131980895996,
"rewards/rejected": -14.625189781188965,
"step": 1270
},
{
"epoch": 1.734915254237288,
"grad_norm": 0.0003052738823642222,
"learning_rate": 2.057288149201042e-07,
"logits/chosen": -1.6356430053710938,
"logits/rejected": -1.7994301319122314,
"logps/chosen": -148.37130737304688,
"logps/rejected": -835.1704711914062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00246133329346776,
"rewards/margins": 14.854413032531738,
"rewards/rejected": -14.851951599121094,
"step": 1280
},
{
"epoch": 1.7484745762711864,
"grad_norm": 0.00021946391817271224,
"learning_rate": 2.0205302311501e-07,
"logits/chosen": -1.4806194305419922,
"logits/rejected": -1.6502048969268799,
"logps/chosen": -143.40792846679688,
"logps/rejected": -826.89990234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.022700820118188858,
"rewards/margins": 14.935534477233887,
"rewards/rejected": -14.912833213806152,
"step": 1290
},
{
"epoch": 1.7620338983050847,
"grad_norm": 0.00018901278388815272,
"learning_rate": 1.9838796166974835e-07,
"logits/chosen": -1.6020697355270386,
"logits/rejected": -1.7288063764572144,
"logps/chosen": -118.82467651367188,
"logps/rejected": -745.2700805664062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023728668689727783,
"rewards/margins": 14.655136108398438,
"rewards/rejected": -14.631406784057617,
"step": 1300
},
{
"epoch": 1.7620338983050847,
"eval_logits/chosen": -1.541303038597107,
"eval_logits/rejected": -1.6866235733032227,
"eval_logps/chosen": -129.41481018066406,
"eval_logps/rejected": -763.7774658203125,
"eval_loss": 8.536229643141269e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.01520751602947712,
"eval_rewards/margins": 14.542576789855957,
"eval_rewards/rejected": -14.527369499206543,
"eval_runtime": 23.2319,
"eval_samples_per_second": 4.304,
"eval_steps_per_second": 1.076,
"step": 1300
},
{
"epoch": 1.775593220338983,
"grad_norm": 0.00024516636429012606,
"learning_rate": 1.947344508118013e-07,
"logits/chosen": -1.7006336450576782,
"logits/rejected": -1.9313997030258179,
"logps/chosen": -135.2695770263672,
"logps/rejected": -803.58984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01268923282623291,
"rewards/margins": 14.559757232666016,
"rewards/rejected": -14.54706859588623,
"step": 1310
},
{
"epoch": 1.7891525423728813,
"grad_norm": 0.0002825945001585042,
"learning_rate": 1.9109330818367103e-07,
"logits/chosen": -1.6026619672775269,
"logits/rejected": -1.7476422786712646,
"logps/chosen": -138.92730712890625,
"logps/rejected": -774.2352294921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011833753436803818,
"rewards/margins": 14.747849464416504,
"rewards/rejected": -14.736017227172852,
"step": 1320
},
{
"epoch": 1.8027118644067797,
"grad_norm": 0.00023343486470064288,
"learning_rate": 1.8746534865989477e-07,
"logits/chosen": -1.6764352321624756,
"logits/rejected": -1.8599662780761719,
"logps/chosen": -136.24755859375,
"logps/rejected": -778.046630859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016610510647296906,
"rewards/margins": 14.57569694519043,
"rewards/rejected": -14.559085845947266,
"step": 1330
},
{
"epoch": 1.816271186440678,
"grad_norm": 0.00020349358848448975,
"learning_rate": 1.8385138416467886e-07,
"logits/chosen": -1.619372010231018,
"logits/rejected": -1.770355224609375,
"logps/chosen": -137.48997497558594,
"logps/rejected": -796.6224975585938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01347330305725336,
"rewards/margins": 14.743675231933594,
"rewards/rejected": -14.730201721191406,
"step": 1340
},
{
"epoch": 1.8298305084745763,
"grad_norm": 0.000259228080505575,
"learning_rate": 1.802522234901927e-07,
"logits/chosen": -1.6564035415649414,
"logits/rejected": -1.8519361019134521,
"logps/chosen": -129.4320068359375,
"logps/rejected": -815.1136474609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03748949244618416,
"rewards/margins": 14.78825855255127,
"rewards/rejected": -14.750767707824707,
"step": 1350
},
{
"epoch": 1.8433898305084746,
"grad_norm": 0.0002871334654289991,
"learning_rate": 1.7666867211556436e-07,
"logits/chosen": -1.6476807594299316,
"logits/rejected": -1.7843477725982666,
"logps/chosen": -132.54351806640625,
"logps/rejected": -779.9259643554688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004673462361097336,
"rewards/margins": 14.616876602172852,
"rewards/rejected": -14.612202644348145,
"step": 1360
},
{
"epoch": 1.856949152542373,
"grad_norm": 0.00022022136706817435,
"learning_rate": 1.7310153202661698e-07,
"logits/chosen": -1.5748957395553589,
"logits/rejected": -1.7366337776184082,
"logps/chosen": -142.26052856445312,
"logps/rejected": -840.873779296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01894519291818142,
"rewards/margins": 14.885005950927734,
"rewards/rejected": -14.866060256958008,
"step": 1370
},
{
"epoch": 1.8705084745762712,
"grad_norm": 0.0002676827155917166,
"learning_rate": 1.695516015363876e-07,
"logits/chosen": -1.6750328540802002,
"logits/rejected": -1.8339290618896484,
"logps/chosen": -144.47813415527344,
"logps/rejected": -832.0415649414062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023609986528754234,
"rewards/margins": 14.777373313903809,
"rewards/rejected": -14.753763198852539,
"step": 1380
},
{
"epoch": 1.8840677966101695,
"grad_norm": 0.00024546650545054666,
"learning_rate": 1.6601967510646718e-07,
"logits/chosen": -1.725512981414795,
"logits/rejected": -1.8677659034729004,
"logps/chosen": -147.13035583496094,
"logps/rejected": -842.0433349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013028274290263653,
"rewards/margins": 15.025127410888672,
"rewards/rejected": -15.012099266052246,
"step": 1390
},
{
"epoch": 1.8976271186440679,
"grad_norm": 0.00018426525023186526,
"learning_rate": 1.6250654316920325e-07,
"logits/chosen": -1.6960985660552979,
"logits/rejected": -1.874474287033081,
"logps/chosen": -138.58255004882812,
"logps/rejected": -803.3692016601562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012537289410829544,
"rewards/margins": 14.797500610351562,
"rewards/rejected": -14.784963607788086,
"step": 1400
},
{
"epoch": 1.8976271186440679,
"eval_logits/chosen": -1.5352431535720825,
"eval_logits/rejected": -1.686131238937378,
"eval_logps/chosen": -129.33590698242188,
"eval_logps/rejected": -764.0711059570312,
"eval_loss": 8.283525403385283e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.02309771627187729,
"eval_rewards/margins": 14.579834938049316,
"eval_rewards/rejected": -14.556737899780273,
"eval_runtime": 23.3395,
"eval_samples_per_second": 4.285,
"eval_steps_per_second": 1.071,
"step": 1400
},
{
"epoch": 1.911186440677966,
"grad_norm": 0.0002575697935659544,
"learning_rate": 1.5901299195080392e-07,
"logits/chosen": -1.6227922439575195,
"logits/rejected": -1.7947490215301514,
"logps/chosen": -130.98719787597656,
"logps/rejected": -791.5205078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00749362725764513,
"rewards/margins": 14.689105987548828,
"rewards/rejected": -14.681612014770508,
"step": 1410
},
{
"epoch": 1.9247457627118645,
"grad_norm": 0.00018017946205386775,
"learning_rate": 1.5553980329538323e-07,
"logits/chosen": -1.5999001264572144,
"logits/rejected": -1.7650830745697021,
"logps/chosen": -139.0697021484375,
"logps/rejected": -810.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010547073557972908,
"rewards/margins": 14.652795791625977,
"rewards/rejected": -14.642248153686523,
"step": 1420
},
{
"epoch": 1.9383050847457626,
"grad_norm": 0.0001479458221901959,
"learning_rate": 1.520877544899875e-07,
"logits/chosen": -1.6732416152954102,
"logits/rejected": -1.8719912767410278,
"logps/chosen": -154.01966857910156,
"logps/rejected": -817.1717529296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0030850698240101337,
"rewards/margins": 14.651540756225586,
"rewards/rejected": -14.654626846313477,
"step": 1430
},
{
"epoch": 1.951864406779661,
"grad_norm": 0.0002763487512077753,
"learning_rate": 1.4865761809064097e-07,
"logits/chosen": -1.6813652515411377,
"logits/rejected": -1.878997802734375,
"logps/chosen": -125.57931518554688,
"logps/rejected": -793.8370361328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015030990354716778,
"rewards/margins": 14.654011726379395,
"rewards/rejected": -14.638980865478516,
"step": 1440
},
{
"epoch": 1.9654237288135592,
"grad_norm": 0.00016799758819048652,
"learning_rate": 1.4525016174945103e-07,
"logits/chosen": -1.7774598598480225,
"logits/rejected": -1.9586834907531738,
"logps/chosen": -128.68084716796875,
"logps/rejected": -789.97021484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020244726911187172,
"rewards/margins": 14.630430221557617,
"rewards/rejected": -14.610186576843262,
"step": 1450
},
{
"epoch": 1.9789830508474577,
"grad_norm": 0.00023868564973718838,
"learning_rate": 1.4186614804280978e-07,
"logits/chosen": -1.6796725988388062,
"logits/rejected": -1.8473913669586182,
"logps/chosen": -146.37466430664062,
"logps/rejected": -850.5054931640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009393090382218361,
"rewards/margins": 14.856435775756836,
"rewards/rejected": -14.84704303741455,
"step": 1460
},
{
"epoch": 1.9925423728813558,
"grad_norm": 0.000159343350698008,
"learning_rate": 1.3850633430073286e-07,
"logits/chosen": -1.5762964487075806,
"logits/rejected": -1.7709100246429443,
"logps/chosen": -151.92115783691406,
"logps/rejected": -829.155029296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013924097642302513,
"rewards/margins": 14.711325645446777,
"rewards/rejected": -14.69740104675293,
"step": 1470
},
{
"epoch": 2.0054237288135592,
"grad_norm": 0.00024916837620644227,
"learning_rate": 1.3517147243737148e-07,
"logits/chosen": -1.5804271697998047,
"logits/rejected": -1.7189936637878418,
"logps/chosen": -127.54490661621094,
"logps/rejected": -792.7044067382812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021060097962617874,
"rewards/margins": 14.793603897094727,
"rewards/rejected": -14.772544860839844,
"step": 1480
},
{
"epoch": 2.0189830508474578,
"grad_norm": 0.00028667386075358764,
"learning_rate": 1.3186230878273653e-07,
"logits/chosen": -1.6898704767227173,
"logits/rejected": -1.8163692951202393,
"logps/chosen": -133.3569793701172,
"logps/rejected": -762.7427368164062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00874436367303133,
"rewards/margins": 14.47949504852295,
"rewards/rejected": -14.47075080871582,
"step": 1490
},
{
"epoch": 2.032542372881356,
"grad_norm": 0.00030745885090509886,
"learning_rate": 1.285795839156729e-07,
"logits/chosen": -1.6830826997756958,
"logits/rejected": -1.8209247589111328,
"logps/chosen": -137.370361328125,
"logps/rejected": -783.111083984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009560978040099144,
"rewards/margins": 14.544479370117188,
"rewards/rejected": -14.534917831420898,
"step": 1500
},
{
"epoch": 2.032542372881356,
"eval_logits/chosen": -1.5413455963134766,
"eval_logits/rejected": -1.6867271661758423,
"eval_logps/chosen": -129.37197875976562,
"eval_logps/rejected": -764.1895751953125,
"eval_loss": 8.282594876618532e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.01949072629213333,
"eval_rewards/margins": 14.588081359863281,
"eval_rewards/rejected": -14.56859016418457,
"eval_runtime": 23.3878,
"eval_samples_per_second": 4.276,
"eval_steps_per_second": 1.069,
"step": 1500
},
{
"epoch": 2.0461016949152544,
"grad_norm": 0.00027440430133344047,
"learning_rate": 1.2532403249812073e-07,
"logits/chosen": -1.629027247428894,
"logits/rejected": -1.7673338651657104,
"logps/chosen": -143.42172241210938,
"logps/rejected": -824.6468505859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004273869562894106,
"rewards/margins": 14.795296669006348,
"rewards/rejected": -14.791022300720215,
"step": 1510
},
{
"epoch": 2.0596610169491525,
"grad_norm": 0.00023891678563468625,
"learning_rate": 1.2209638311070024e-07,
"logits/chosen": -1.6091238260269165,
"logits/rejected": -1.7953072786331177,
"logps/chosen": -127.51914978027344,
"logps/rejected": -782.3043212890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0034638545475900173,
"rewards/margins": 14.720000267028809,
"rewards/rejected": -14.716536521911621,
"step": 1520
},
{
"epoch": 2.073220338983051,
"grad_norm": 0.00024388492083099195,
"learning_rate": 1.1889735808965853e-07,
"logits/chosen": -1.6395263671875,
"logits/rejected": -1.8181636333465576,
"logps/chosen": -135.23330688476562,
"logps/rejected": -783.9580078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009885641746222973,
"rewards/margins": 14.711644172668457,
"rewards/rejected": -14.701759338378906,
"step": 1530
},
{
"epoch": 2.086779661016949,
"grad_norm": 5.583927886360231e-05,
"learning_rate": 1.1572767336521322e-07,
"logits/chosen": -1.7426564693450928,
"logits/rejected": -1.9373841285705566,
"logps/chosen": -163.0467529296875,
"logps/rejected": -873.5623779296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01771201565861702,
"rewards/margins": 14.745927810668945,
"rewards/rejected": -14.728215217590332,
"step": 1540
},
{
"epoch": 2.1003389830508477,
"grad_norm": 0.00017823331489187885,
"learning_rate": 1.125880383013294e-07,
"logits/chosen": -1.5471131801605225,
"logits/rejected": -1.7288830280303955,
"logps/chosen": -122.73018646240234,
"logps/rejected": -782.6522216796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.029936904087662697,
"rewards/margins": 14.729475021362305,
"rewards/rejected": -14.699538230895996,
"step": 1550
},
{
"epoch": 2.1138983050847457,
"grad_norm": 0.0003091912161991017,
"learning_rate": 1.0947915553696741e-07,
"logits/chosen": -1.5830570459365845,
"logits/rejected": -1.7391778230667114,
"logps/chosen": -124.14356994628906,
"logps/rejected": -795.6461791992188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01131663378328085,
"rewards/margins": 14.700472831726074,
"rewards/rejected": -14.689155578613281,
"step": 1560
},
{
"epoch": 2.127457627118644,
"grad_norm": 0.00016494367259474433,
"learning_rate": 1.0640172082883377e-07,
"logits/chosen": -1.575786828994751,
"logits/rejected": -1.7403693199157715,
"logps/chosen": -143.2032470703125,
"logps/rejected": -797.1260986328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01606675237417221,
"rewards/margins": 14.676783561706543,
"rewards/rejected": -14.660717010498047,
"step": 1570
},
{
"epoch": 2.1410169491525424,
"grad_norm": 0.0001055398603476797,
"learning_rate": 1.0335642289567453e-07,
"logits/chosen": -1.5635545253753662,
"logits/rejected": -1.6949539184570312,
"logps/chosen": -137.00326538085938,
"logps/rejected": -795.691650390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017065811902284622,
"rewards/margins": 14.849153518676758,
"rewards/rejected": -14.832088470458984,
"step": 1580
},
{
"epoch": 2.154576271186441,
"grad_norm": 0.00015184404381463285,
"learning_rate": 1.003439432641412e-07,
"logits/chosen": -1.5640537738800049,
"logits/rejected": -1.718758225440979,
"logps/chosen": -145.822998046875,
"logps/rejected": -822.9422607421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.027230584993958473,
"rewards/margins": 14.735852241516113,
"rewards/rejected": -14.70862102508545,
"step": 1590
},
{
"epoch": 2.168135593220339,
"grad_norm": 0.00017944006421282506,
"learning_rate": 9.736495611626869e-08,
"logits/chosen": -1.7426433563232422,
"logits/rejected": -1.9045021533966064,
"logps/chosen": -136.1781463623047,
"logps/rejected": -826.3635864257812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.009065323509275913,
"rewards/margins": 14.765503883361816,
"rewards/rejected": -14.774569511413574,
"step": 1600
},
{
"epoch": 2.168135593220339,
"eval_logits/chosen": -1.5352487564086914,
"eval_logits/rejected": -1.6869568824768066,
"eval_logps/chosen": -129.40280151367188,
"eval_logps/rejected": -764.318359375,
"eval_loss": 8.185303954633127e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.016407504677772522,
"eval_rewards/margins": 14.597875595092773,
"eval_rewards/rejected": -14.58146858215332,
"eval_runtime": 23.0522,
"eval_samples_per_second": 4.338,
"eval_steps_per_second": 1.084,
"step": 1600
},
{
"epoch": 2.181694915254237,
"grad_norm": 0.0005351685788183639,
"learning_rate": 9.442012813859495e-08,
"logits/chosen": -1.7391018867492676,
"logits/rejected": -1.9293949604034424,
"logps/chosen": -147.13917541503906,
"logps/rejected": -805.6615600585938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.002429434796795249,
"rewards/margins": 14.607394218444824,
"rewards/rejected": -14.604965209960938,
"step": 1610
},
{
"epoch": 2.1952542372881356,
"grad_norm": 0.0001727788888810501,
"learning_rate": 9.151011837295967e-08,
"logits/chosen": -1.5907230377197266,
"logits/rejected": -1.779097318649292,
"logps/chosen": -135.67388916015625,
"logps/rejected": -828.772705078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02872363105416298,
"rewards/margins": 14.719782829284668,
"rewards/rejected": -14.691059112548828,
"step": 1620
},
{
"epoch": 2.2088135593220337,
"grad_norm": 0.00025741808212769845,
"learning_rate": 8.863557806901232e-08,
"logits/chosen": -1.8002914190292358,
"logits/rejected": -1.9408048391342163,
"logps/chosen": -132.03172302246094,
"logps/rejected": -784.93115234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011570243164896965,
"rewards/margins": 14.813987731933594,
"rewards/rejected": -14.80241584777832,
"step": 1630
},
{
"epoch": 2.2223728813559323,
"grad_norm": 0.00021793936297483956,
"learning_rate": 8.579715053846584e-08,
"logits/chosen": -1.5967586040496826,
"logits/rejected": -1.7385027408599854,
"logps/chosen": -154.05177307128906,
"logps/rejected": -800.445068359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005721386056393385,
"rewards/margins": 14.734086036682129,
"rewards/rejected": -14.728364944458008,
"step": 1640
},
{
"epoch": 2.2359322033898303,
"grad_norm": 0.0002391850164899579,
"learning_rate": 8.299547101112466e-08,
"logits/chosen": -1.6040098667144775,
"logits/rejected": -1.767350435256958,
"logps/chosen": -144.35110473632812,
"logps/rejected": -783.85107421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00950249470770359,
"rewards/margins": 14.55084228515625,
"rewards/rejected": -14.560342788696289,
"step": 1650
},
{
"epoch": 2.249491525423729,
"grad_norm": 0.00031210724967463475,
"learning_rate": 8.023116649272357e-08,
"logits/chosen": -1.6417827606201172,
"logits/rejected": -1.843612790107727,
"logps/chosen": -140.8085479736328,
"logps/rejected": -832.3277587890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01454589981585741,
"rewards/margins": 14.897970199584961,
"rewards/rejected": -14.883424758911133,
"step": 1660
},
{
"epoch": 2.263050847457627,
"grad_norm": 0.0001537621460843329,
"learning_rate": 7.750485562460529e-08,
"logits/chosen": -1.637737512588501,
"logits/rejected": -1.838837742805481,
"logps/chosen": -151.76361083984375,
"logps/rejected": -879.8406982421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.028323961421847343,
"rewards/margins": 15.002603530883789,
"rewards/rejected": -14.974279403686523,
"step": 1670
},
{
"epoch": 2.2766101694915255,
"grad_norm": 0.00014037312273128166,
"learning_rate": 7.48171485452716e-08,
"logits/chosen": -1.644343614578247,
"logits/rejected": -1.8316127061843872,
"logps/chosen": -148.27976989746094,
"logps/rejected": -807.6490478515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.002391491085290909,
"rewards/margins": 14.774595260620117,
"rewards/rejected": -14.776987075805664,
"step": 1680
},
{
"epoch": 2.2901694915254236,
"grad_norm": 0.0003568019109756037,
"learning_rate": 7.216864675383566e-08,
"logits/chosen": -1.7114001512527466,
"logits/rejected": -1.8946892023086548,
"logps/chosen": -136.16168212890625,
"logps/rejected": -822.518310546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0228084959089756,
"rewards/margins": 14.955730438232422,
"rewards/rejected": -14.93292236328125,
"step": 1690
},
{
"epoch": 2.303728813559322,
"grad_norm": 0.00020642497846486078,
"learning_rate": 6.955994297540946e-08,
"logits/chosen": -1.582494854927063,
"logits/rejected": -1.7430905103683472,
"logps/chosen": -127.01213073730469,
"logps/rejected": -749.1889038085938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004720491822808981,
"rewards/margins": 14.59855842590332,
"rewards/rejected": -14.593838691711426,
"step": 1700
},
{
"epoch": 2.303728813559322,
"eval_logits/chosen": -1.5455721616744995,
"eval_logits/rejected": -1.6867108345031738,
"eval_logps/chosen": -129.34518432617188,
"eval_logps/rejected": -764.3837890625,
"eval_loss": 8.27629833111132e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.022169284522533417,
"eval_rewards/margins": 14.610172271728516,
"eval_rewards/rejected": -14.588001251220703,
"eval_runtime": 23.2684,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 1.074,
"step": 1700
},
{
"epoch": 2.31728813559322,
"grad_norm": 0.0003301800906268837,
"learning_rate": 6.699162102845371e-08,
"logits/chosen": -1.6968494653701782,
"logits/rejected": -1.8610255718231201,
"logps/chosen": -144.05172729492188,
"logps/rejected": -792.3526000976562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0033090743236243725,
"rewards/margins": 14.642684936523438,
"rewards/rejected": -14.645994186401367,
"step": 1710
},
{
"epoch": 2.3308474576271188,
"grad_norm": 0.00022300666262349114,
"learning_rate": 6.446425569412145e-08,
"logits/chosen": -1.5895376205444336,
"logits/rejected": -1.7664618492126465,
"logps/chosen": -147.58518981933594,
"logps/rejected": -821.5425415039062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013544298708438873,
"rewards/margins": 14.664928436279297,
"rewards/rejected": -14.651383399963379,
"step": 1720
},
{
"epoch": 2.344406779661017,
"grad_norm": 0.0002689016484107736,
"learning_rate": 6.197841258762393e-08,
"logits/chosen": -1.6732720136642456,
"logits/rejected": -1.8248271942138672,
"logps/chosen": -143.9190216064453,
"logps/rejected": -798.0980224609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.002302667824551463,
"rewards/margins": 14.700139999389648,
"rewards/rejected": -14.70244312286377,
"step": 1730
},
{
"epoch": 2.3579661016949154,
"grad_norm": 0.00013823061030867806,
"learning_rate": 5.95346480316484e-08,
"logits/chosen": -1.6567462682724,
"logits/rejected": -1.8002218008041382,
"logps/chosen": -133.57330322265625,
"logps/rejected": -790.8118896484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01070284005254507,
"rewards/margins": 14.786917686462402,
"rewards/rejected": -14.776214599609375,
"step": 1740
},
{
"epoch": 2.3715254237288135,
"grad_norm": 0.00012708429822748938,
"learning_rate": 5.713350893185462e-08,
"logits/chosen": -1.540064811706543,
"logits/rejected": -1.6900475025177002,
"logps/chosen": -139.0050048828125,
"logps/rejected": -813.723388671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011193746700882912,
"rewards/margins": 14.780536651611328,
"rewards/rejected": -14.769343376159668,
"step": 1750
},
{
"epoch": 2.385084745762712,
"grad_norm": 0.00031972660791255883,
"learning_rate": 5.4775532654479703e-08,
"logits/chosen": -1.6927651166915894,
"logits/rejected": -1.8659099340438843,
"logps/chosen": -140.10357666015625,
"logps/rejected": -804.2578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.026886438950896263,
"rewards/margins": 14.513021469116211,
"rewards/rejected": -14.486133575439453,
"step": 1760
},
{
"epoch": 2.39864406779661,
"grad_norm": 0.0002306535610201857,
"learning_rate": 5.246124690607739e-08,
"logits/chosen": -1.597672939300537,
"logits/rejected": -1.7689580917358398,
"logps/chosen": -136.6164093017578,
"logps/rejected": -814.9773559570312,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015394443646073341,
"rewards/margins": 14.934272766113281,
"rewards/rejected": -14.918877601623535,
"step": 1770
},
{
"epoch": 2.4122033898305086,
"grad_norm": 0.00025827411726207306,
"learning_rate": 5.019116961541928e-08,
"logits/chosen": -1.6076412200927734,
"logits/rejected": -1.787865400314331,
"logps/chosen": -130.0896759033203,
"logps/rejected": -771.5728149414062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.018673747777938843,
"rewards/margins": 14.67123794555664,
"rewards/rejected": -14.652565002441406,
"step": 1780
},
{
"epoch": 2.4257627118644067,
"grad_norm": 0.0001607149373365815,
"learning_rate": 4.796580881758394e-08,
"logits/chosen": -1.5243449211120605,
"logits/rejected": -1.7028474807739258,
"logps/chosen": -135.9856719970703,
"logps/rejected": -788.3903198242188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.013456443324685097,
"rewards/margins": 14.943410873413086,
"rewards/rejected": -14.92995548248291,
"step": 1790
},
{
"epoch": 2.4393220338983053,
"grad_norm": 0.0008019666936544235,
"learning_rate": 4.5785662540261035e-08,
"logits/chosen": -1.5732166767120361,
"logits/rejected": -1.7345067262649536,
"logps/chosen": -123.94300842285156,
"logps/rejected": -809.0182495117188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009817011654376984,
"rewards/margins": 14.806741714477539,
"rewards/rejected": -14.796926498413086,
"step": 1800
},
{
"epoch": 2.4393220338983053,
"eval_logits/chosen": -1.5405449867248535,
"eval_logits/rejected": -1.6869794130325317,
"eval_logps/chosen": -129.44265747070312,
"eval_logps/rejected": -764.5172729492188,
"eval_loss": 8.009520229279588e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.012421955354511738,
"eval_rewards/margins": 14.613780975341797,
"eval_rewards/rejected": -14.601359367370605,
"eval_runtime": 23.2676,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 1.074,
"step": 1800
},
{
"epoch": 2.4528813559322034,
"grad_norm": 0.0005455920009654483,
"learning_rate": 4.365121869229399e-08,
"logits/chosen": -1.6582889556884766,
"logits/rejected": -1.8295698165893555,
"logps/chosen": -117.7739486694336,
"logps/rejected": -741.6177368164062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011549338698387146,
"rewards/margins": 14.508467674255371,
"rewards/rejected": -14.496919631958008,
"step": 1810
},
{
"epoch": 2.4664406779661014,
"grad_norm": 0.00023816862497805363,
"learning_rate": 4.1562954954488194e-08,
"logits/chosen": -1.6208901405334473,
"logits/rejected": -1.81270432472229,
"logps/chosen": -137.12130737304688,
"logps/rejected": -798.8505249023438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011716622859239578,
"rewards/margins": 14.657186508178711,
"rewards/rejected": -14.64547061920166,
"step": 1820
},
{
"epoch": 2.48,
"grad_norm": 0.0002564351546730618,
"learning_rate": 3.952133867270749e-08,
"logits/chosen": -1.6017396450042725,
"logits/rejected": -1.760289192199707,
"logps/chosen": -135.62570190429688,
"logps/rejected": -787.9927978515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021429751068353653,
"rewards/margins": 14.778738021850586,
"rewards/rejected": -14.757308006286621,
"step": 1830
},
{
"epoch": 2.4935593220338985,
"grad_norm": 0.00024431220204588973,
"learning_rate": 3.7526826753284055e-08,
"logits/chosen": -1.5733165740966797,
"logits/rejected": -1.734438419342041,
"logps/chosen": -135.39663696289062,
"logps/rejected": -781.9830322265625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021723030135035515,
"rewards/margins": 14.634653091430664,
"rewards/rejected": -14.612930297851562,
"step": 1840
},
{
"epoch": 2.5071186440677966,
"grad_norm": 0.0002804770855324591,
"learning_rate": 3.5579865560764086e-08,
"logits/chosen": -1.6363115310668945,
"logits/rejected": -1.8157660961151123,
"logps/chosen": -143.70252990722656,
"logps/rejected": -803.8099365234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008160289376974106,
"rewards/margins": 14.804468154907227,
"rewards/rejected": -14.796307563781738,
"step": 1850
},
{
"epoch": 2.5206779661016947,
"grad_norm": 0.00042995034846870784,
"learning_rate": 3.3680890818013506e-08,
"logits/chosen": -1.7073619365692139,
"logits/rejected": -1.8638834953308105,
"logps/chosen": -136.8450469970703,
"logps/rejected": -822.551025390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012727834284305573,
"rewards/margins": 14.761212348937988,
"rewards/rejected": -14.74848461151123,
"step": 1860
},
{
"epoch": 2.5342372881355932,
"grad_norm": 0.0005052071520552922,
"learning_rate": 3.183032750870443e-08,
"logits/chosen": -1.7014129161834717,
"logits/rejected": -1.890128493309021,
"logps/chosen": -144.991943359375,
"logps/rejected": -852.4500732421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.019090861082077026,
"rewards/margins": 14.904670715332031,
"rewards/rejected": -14.885579109191895,
"step": 1870
},
{
"epoch": 2.5477966101694918,
"grad_norm": 0.00029171939654467954,
"learning_rate": 3.002858978220535e-08,
"logits/chosen": -1.6215015649795532,
"logits/rejected": -1.8319523334503174,
"logps/chosen": -148.42018127441406,
"logps/rejected": -826.58349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017919525504112244,
"rewards/margins": 14.893685340881348,
"rewards/rejected": -14.875765800476074,
"step": 1880
},
{
"epoch": 2.56135593220339,
"grad_norm": 0.00014374560387721727,
"learning_rate": 2.8276080860896223e-08,
"logits/chosen": -1.5488464832305908,
"logits/rejected": -1.7413768768310547,
"logps/chosen": -148.57704162597656,
"logps/rejected": -838.784423828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015819568186998367,
"rewards/margins": 15.020793914794922,
"rewards/rejected": -15.004974365234375,
"step": 1890
},
{
"epoch": 2.574915254237288,
"grad_norm": 0.00013430221339783704,
"learning_rate": 2.65731929499286e-08,
"logits/chosen": -1.6086156368255615,
"logits/rejected": -1.7848008871078491,
"logps/chosen": -144.15988159179688,
"logps/rejected": -854.092529296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02485763281583786,
"rewards/margins": 14.896612167358398,
"rewards/rejected": -14.871755599975586,
"step": 1900
},
{
"epoch": 2.574915254237288,
"eval_logits/chosen": -1.5360183715820312,
"eval_logits/rejected": -1.686428189277649,
"eval_logps/chosen": -129.38560485839844,
"eval_logps/rejected": -764.64111328125,
"eval_loss": 7.962561880958674e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.018127169460058212,
"eval_rewards/margins": 14.63186264038086,
"eval_rewards/rejected": -14.613737106323242,
"eval_runtime": 23.1267,
"eval_samples_per_second": 4.324,
"eval_steps_per_second": 1.081,
"step": 1900
},
{
"epoch": 2.5884745762711865,
"grad_norm": 0.0003822317334269131,
"learning_rate": 2.492030714945162e-08,
"logits/chosen": -1.5995548963546753,
"logits/rejected": -1.76951003074646,
"logps/chosen": -135.77032470703125,
"logps/rejected": -790.2930908203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00021793553605675697,
"rewards/margins": 14.569367408752441,
"rewards/rejected": -14.569584846496582,
"step": 1910
},
{
"epoch": 2.6020338983050846,
"grad_norm": 0.000792494216087496,
"learning_rate": 2.3317793369322992e-08,
"logits/chosen": -1.6509454250335693,
"logits/rejected": -1.8071436882019043,
"logps/chosen": -132.31130981445312,
"logps/rejected": -799.173583984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.030081693083047867,
"rewards/margins": 14.740245819091797,
"rewards/rejected": -14.710164070129395,
"step": 1920
},
{
"epoch": 2.615593220338983,
"grad_norm": 0.0002708186052822774,
"learning_rate": 2.1766010246324795e-08,
"logits/chosen": -1.5743701457977295,
"logits/rejected": -1.7702776193618774,
"logps/chosen": -147.4547119140625,
"logps/rejected": -827.8794555664062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006160350050777197,
"rewards/margins": 14.79060173034668,
"rewards/rejected": -14.784441947937012,
"step": 1930
},
{
"epoch": 2.629152542372881,
"grad_norm": 0.00020442518354563292,
"learning_rate": 2.026530506390156e-08,
"logits/chosen": -1.5262815952301025,
"logits/rejected": -1.6566275358200073,
"logps/chosen": -147.53939819335938,
"logps/rejected": -822.6124267578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014457417652010918,
"rewards/margins": 14.995853424072266,
"rewards/rejected": -14.981395721435547,
"step": 1940
},
{
"epoch": 2.6427118644067797,
"grad_norm": 0.00014356861276548948,
"learning_rate": 1.8816013674439885e-08,
"logits/chosen": -1.6366736888885498,
"logits/rejected": -1.816946268081665,
"logps/chosen": -133.2000274658203,
"logps/rejected": -794.9268798828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0276030246168375,
"rewards/margins": 14.789546012878418,
"rewards/rejected": -14.761942863464355,
"step": 1950
},
{
"epoch": 2.656271186440678,
"grad_norm": 0.00018299695034949964,
"learning_rate": 1.741846042410533e-08,
"logits/chosen": -1.5427926778793335,
"logits/rejected": -1.7075831890106201,
"logps/chosen": -137.50941467285156,
"logps/rejected": -807.8783569335938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010896590538322926,
"rewards/margins": 14.576469421386719,
"rewards/rejected": -14.565572738647461,
"step": 1960
},
{
"epoch": 2.6698305084745764,
"grad_norm": 0.00016135810197605974,
"learning_rate": 1.607295808025558e-08,
"logits/chosen": -1.6068065166473389,
"logits/rejected": -1.7660378217697144,
"logps/chosen": -131.94476318359375,
"logps/rejected": -811.60791015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.019821301102638245,
"rewards/margins": 14.834625244140625,
"rewards/rejected": -14.814804077148438,
"step": 1970
},
{
"epoch": 2.6833898305084745,
"grad_norm": 0.00023692180234191582,
"learning_rate": 1.4779807761443635e-08,
"logits/chosen": -1.6394813060760498,
"logits/rejected": -1.8306336402893066,
"logps/chosen": -143.0768585205078,
"logps/rejected": -815.6318359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.021681049838662148,
"rewards/margins": 14.815585136413574,
"rewards/rejected": -14.793903350830078,
"step": 1980
},
{
"epoch": 2.696949152542373,
"grad_norm": 0.00026461222708204653,
"learning_rate": 1.353929887002897e-08,
"logits/chosen": -1.5212616920471191,
"logits/rejected": -1.702871561050415,
"logps/chosen": -137.1354522705078,
"logps/rejected": -811.0231323242188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.022010358050465584,
"rewards/margins": 14.848760604858398,
"rewards/rejected": -14.826749801635742,
"step": 1990
},
{
"epoch": 2.710508474576271,
"grad_norm": 0.00034035092300092515,
"learning_rate": 1.2351709027410145e-08,
"logits/chosen": -1.5257236957550049,
"logits/rejected": -1.6637029647827148,
"logps/chosen": -125.89049530029297,
"logps/rejected": -765.981201171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02222251147031784,
"rewards/margins": 14.72728157043457,
"rewards/rejected": -14.705059051513672,
"step": 2000
},
{
"epoch": 2.710508474576271,
"eval_logits/chosen": -1.5505026578903198,
"eval_logits/rejected": -1.686079740524292,
"eval_logps/chosen": -129.3984832763672,
"eval_logps/rejected": -764.6372680664062,
"eval_loss": 7.91568538716092e-07,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.016839873045682907,
"eval_rewards/margins": 14.630191802978516,
"eval_rewards/rejected": -14.613351821899414,
"eval_runtime": 23.1883,
"eval_samples_per_second": 4.313,
"eval_steps_per_second": 1.078,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2211,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 145012404715520.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}