{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.694915254237288e-09, "logits/chosen": -3.241600751876831, "logits/rejected": -2.8775925636291504, "logps/chosen": -233.8565673828125, "logps/rejected": -768.6746215820312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.6949152542372882e-08, "logits/chosen": -3.183110475540161, "logits/rejected": -3.2940425872802734, "logps/chosen": -479.7237548828125, "logps/rejected": -508.1022644042969, "loss": 0.6914, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.0011810138821601868, "rewards/margins": -0.001814355025999248, "rewards/rejected": 0.0006333404453471303, "step": 10 }, { "epoch": 0.01, "learning_rate": 3.3898305084745764e-08, "logits/chosen": -3.196739912033081, "logits/rejected": -3.3269875049591064, "logps/chosen": -235.5388946533203, "logps/rejected": -412.62353515625, "loss": 0.6541, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04803984612226486, "rewards/margins": 0.09699604660272598, "rewards/rejected": -0.048956211656332016, "step": 20 }, { "epoch": 0.01, "learning_rate": 5.0847457627118645e-08, "logits/chosen": -3.134932041168213, "logits/rejected": -3.2440593242645264, "logps/chosen": -389.2142639160156, "logps/rejected": -413.16461181640625, "loss": 0.5485, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0791018083691597, "rewards/margins": 0.25357091426849365, "rewards/rejected": -0.17446911334991455, "step": 30 }, { "epoch": 0.01, "learning_rate": 6.779661016949153e-08, "logits/chosen": -3.1792006492614746, "logits/rejected": -3.3407816886901855, "logps/chosen": -251.45498657226562, "logps/rejected": -452.0169982910156, "loss": 0.4281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3247135281562805, "rewards/margins": 0.7740644812583923, "rewards/rejected": -0.4493509829044342, "step": 40 }, { "epoch": 0.02, "learning_rate": 8.47457627118644e-08, "logits/chosen": -3.2240090370178223, "logits/rejected": -3.144906520843506, "logps/chosen": -191.65354919433594, "logps/rejected": -543.9041748046875, "loss": 0.3076, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7691439986228943, "rewards/margins": 1.466748595237732, "rewards/rejected": -0.6976046562194824, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.0169491525423729e-07, "logits/chosen": -3.2373671531677246, "logits/rejected": -3.348659038543701, "logps/chosen": -271.8299560546875, "logps/rejected": -320.3260803222656, "loss": 0.2537, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8796433210372925, "rewards/margins": 1.8711017370224, "rewards/rejected": -0.9914585947990417, "step": 60 }, { "epoch": 0.02, "learning_rate": 1.1864406779661017e-07, "logits/chosen": -3.1753342151641846, "logits/rejected": -3.235198974609375, "logps/chosen": -256.61114501953125, "logps/rejected": -491.931640625, "loss": 0.2074, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8632518649101257, "rewards/margins": 2.171854019165039, "rewards/rejected": -1.3086023330688477, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.3559322033898305e-07, "logits/chosen": -3.1588549613952637, "logits/rejected": -3.2258479595184326, "logps/chosen": -386.23126220703125, "logps/rejected": -396.27880859375, "loss": 0.1891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9385740160942078, "rewards/margins": 2.8416147232055664, "rewards/rejected": -1.9030405282974243, "step": 80 }, { "epoch": 0.03, "learning_rate": 1.5254237288135593e-07, "logits/chosen": -3.170820951461792, "logits/rejected": -3.3037006855010986, "logps/chosen": -204.9710235595703, "logps/rejected": -283.68133544921875, "loss": 0.1613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.398224115371704, "rewards/margins": 3.5001823902130127, "rewards/rejected": -2.1019580364227295, "step": 90 }, { "epoch": 0.03, "learning_rate": 1.694915254237288e-07, "logits/chosen": -3.1509833335876465, "logits/rejected": -3.2727856636047363, "logps/chosen": -182.78797912597656, "logps/rejected": -422.30426025390625, "loss": 0.1608, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3794167041778564, "rewards/margins": 4.1482086181640625, "rewards/rejected": -2.768791913986206, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -3.2044522762298584, "eval_logits/rejected": -3.2204079627990723, "eval_logps/chosen": -243.43142700195312, "eval_logps/rejected": -450.3221740722656, "eval_loss": 0.16541637480258942, "eval_rewards/accuracies": 0.9570707082748413, "eval_rewards/chosen": 1.2374151945114136, "eval_rewards/margins": 3.846338987350464, "eval_rewards/rejected": -2.6089231967926025, "eval_runtime": 452.6529, "eval_samples_per_second": 20.987, "eval_steps_per_second": 0.656, "step": 100 }, { "epoch": 0.04, "learning_rate": 1.8644067796610168e-07, "logits/chosen": -3.1778359413146973, "logits/rejected": -3.0348751544952393, "logps/chosen": -191.72665405273438, "logps/rejected": -667.9417724609375, "loss": 0.1325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5171259641647339, "rewards/margins": 3.9871017932891846, "rewards/rejected": -2.4699759483337402, "step": 110 }, { "epoch": 0.04, "learning_rate": 2.0338983050847458e-07, "logits/chosen": -3.21209979057312, "logits/rejected": -3.245016574859619, "logps/chosen": -206.03567504882812, "logps/rejected": -441.95758056640625, "loss": 0.1188, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7967685461044312, "rewards/margins": 4.877519607543945, "rewards/rejected": -3.0807507038116455, "step": 120 }, { "epoch": 0.04, "learning_rate": 2.2033898305084743e-07, "logits/chosen": -3.121410369873047, "logits/rejected": -3.2283928394317627, "logps/chosen": -255.36746215820312, "logps/rejected": -485.18017578125, "loss": 0.2426, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4470430612564087, "rewards/margins": 5.143087387084961, "rewards/rejected": -3.6960442066192627, "step": 130 }, { "epoch": 0.05, "learning_rate": 2.3728813559322033e-07, "logits/chosen": -3.1555418968200684, "logits/rejected": -3.1503801345825195, "logps/chosen": -167.48965454101562, "logps/rejected": -516.3311767578125, "loss": 0.1002, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1309149265289307, "rewards/margins": 4.941633701324463, "rewards/rejected": -3.810718059539795, "step": 140 }, { "epoch": 0.05, "learning_rate": 2.542372881355932e-07, "logits/chosen": -3.143369674682617, "logits/rejected": -3.172342538833618, "logps/chosen": -197.7274169921875, "logps/rejected": -469.4510803222656, "loss": 0.1209, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3236812353134155, "rewards/margins": 5.159438133239746, "rewards/rejected": -3.8357574939727783, "step": 150 }, { "epoch": 0.05, "learning_rate": 2.711864406779661e-07, "logits/chosen": -3.113320827484131, "logits/rejected": -3.179581642150879, "logps/chosen": -190.3457794189453, "logps/rejected": -422.5477600097656, "loss": 0.1031, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2049860954284668, "rewards/margins": 5.337760925292969, "rewards/rejected": -4.132774353027344, "step": 160 }, { "epoch": 0.06, "learning_rate": 2.88135593220339e-07, "logits/chosen": -3.1512656211853027, "logits/rejected": -3.2015151977539062, "logps/chosen": -291.2518615722656, "logps/rejected": -427.808837890625, "loss": 0.0989, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2022926807403564, "rewards/margins": 6.1538920402526855, "rewards/rejected": -4.951599597930908, "step": 170 }, { "epoch": 0.06, "learning_rate": 3.0508474576271186e-07, "logits/chosen": -3.0925116539001465, "logits/rejected": -3.1885197162628174, "logps/chosen": -316.8929138183594, "logps/rejected": -568.46435546875, "loss": 0.1677, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3121305704116821, "rewards/margins": 6.406019687652588, "rewards/rejected": -5.093889236450195, "step": 180 }, { "epoch": 0.06, "learning_rate": 3.220338983050847e-07, "logits/chosen": -3.191131830215454, "logits/rejected": -3.2551910877227783, "logps/chosen": -200.42120361328125, "logps/rejected": -473.00506591796875, "loss": 0.0712, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.504188060760498, "rewards/margins": 6.774598598480225, "rewards/rejected": -5.270411491394043, "step": 190 }, { "epoch": 0.07, "learning_rate": 3.389830508474576e-07, "logits/chosen": -3.1266510486602783, "logits/rejected": -3.2319884300231934, "logps/chosen": -254.8800811767578, "logps/rejected": -386.7709045410156, "loss": 0.1349, "rewards/accuracies": 0.9375, "rewards/chosen": 0.716153621673584, "rewards/margins": 6.651679992675781, "rewards/rejected": -5.935526371002197, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": -3.2216391563415527, "eval_logits/rejected": -3.1898040771484375, "eval_logps/chosen": -246.39938354492188, "eval_logps/rejected": -487.68365478515625, "eval_loss": 0.09607043862342834, "eval_rewards/accuracies": 0.9755892157554626, "eval_rewards/chosen": 0.9406206011772156, "eval_rewards/margins": 7.285686492919922, "eval_rewards/rejected": -6.345065593719482, "eval_runtime": 451.6853, "eval_samples_per_second": 21.032, "eval_steps_per_second": 0.658, "step": 200 }, { "epoch": 0.07, "learning_rate": 3.559322033898305e-07, "logits/chosen": -3.162513256072998, "logits/rejected": -3.0620617866516113, "logps/chosen": -185.5023651123047, "logps/rejected": -624.7681274414062, "loss": 0.1602, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2032901048660278, "rewards/margins": 7.2529778480529785, "rewards/rejected": -6.04968786239624, "step": 210 }, { "epoch": 0.07, "learning_rate": 3.7288135593220336e-07, "logits/chosen": -3.16162109375, "logits/rejected": -3.2060294151306152, "logps/chosen": -306.1747131347656, "logps/rejected": -416.8641662597656, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 1.0064268112182617, "rewards/margins": 7.897356986999512, "rewards/rejected": -6.890931129455566, "step": 220 }, { "epoch": 0.08, "learning_rate": 3.898305084745763e-07, "logits/chosen": -3.1517322063446045, "logits/rejected": -3.153977394104004, "logps/chosen": -265.21539306640625, "logps/rejected": -504.4422912597656, "loss": 0.0677, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7571786642074585, "rewards/margins": 7.3733367919921875, "rewards/rejected": -6.616157531738281, "step": 230 }, { "epoch": 0.08, "learning_rate": 4.0677966101694916e-07, "logits/chosen": -3.0925240516662598, "logits/rejected": -3.102187395095825, "logps/chosen": -250.0133514404297, "logps/rejected": -633.9852294921875, "loss": 0.0745, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.48636823892593384, "rewards/margins": 9.285165786743164, "rewards/rejected": -8.798797607421875, "step": 240 }, { "epoch": 0.08, "learning_rate": 4.23728813559322e-07, "logits/chosen": -3.176499843597412, "logits/rejected": -3.1582164764404297, "logps/chosen": -195.40415954589844, "logps/rejected": -477.47833251953125, "loss": 0.3177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0647075176239014, "rewards/margins": 8.443204879760742, "rewards/rejected": -7.378498077392578, "step": 250 }, { "epoch": 0.09, "learning_rate": 4.4067796610169486e-07, "logits/chosen": -3.1202080249786377, "logits/rejected": -3.175307273864746, "logps/chosen": -255.7227783203125, "logps/rejected": -521.5419311523438, "loss": 0.0741, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.017443299293518, "rewards/margins": 9.245269775390625, "rewards/rejected": -8.227825164794922, "step": 260 }, { "epoch": 0.09, "learning_rate": 4.576271186440678e-07, "logits/chosen": -3.1279916763305664, "logits/rejected": -3.19228196144104, "logps/chosen": -238.5277862548828, "logps/rejected": -373.4498596191406, "loss": 0.0622, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6001161336898804, "rewards/margins": 8.970643043518066, "rewards/rejected": -8.370526313781738, "step": 270 }, { "epoch": 0.1, "learning_rate": 4.7457627118644066e-07, "logits/chosen": -3.114872694015503, "logits/rejected": -3.2416419982910156, "logps/chosen": -214.206298828125, "logps/rejected": -393.0050354003906, "loss": 0.0953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1830778867006302, "rewards/margins": 10.744420051574707, "rewards/rejected": -10.561342239379883, "step": 280 }, { "epoch": 0.1, "learning_rate": 4.915254237288136e-07, "logits/chosen": -3.1324026584625244, "logits/rejected": -3.228682041168213, "logps/chosen": -186.04457092285156, "logps/rejected": -486.37286376953125, "loss": 0.0347, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.23065343499183655, "rewards/margins": 7.208334445953369, "rewards/rejected": -6.9776811599731445, "step": 290 }, { "epoch": 0.1, "learning_rate": 4.990555345674349e-07, "logits/chosen": -3.194873094558716, "logits/rejected": -3.2273032665252686, "logps/chosen": -205.6195831298828, "logps/rejected": -589.7122192382812, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 0.19616279006004333, "rewards/margins": 8.628320693969727, "rewards/rejected": -8.432156562805176, "step": 300 }, { "epoch": 0.1, "eval_logits/chosen": -3.2283244132995605, "eval_logits/rejected": -3.1999356746673584, "eval_logps/chosen": -258.00885009765625, "eval_logps/rejected": -516.9434204101562, "eval_loss": 0.10145324468612671, "eval_rewards/accuracies": 0.9840067625045776, "eval_rewards/chosen": -0.22032807767391205, "eval_rewards/margins": 9.050712585449219, "eval_rewards/rejected": -9.271040916442871, "eval_runtime": 451.8175, "eval_samples_per_second": 21.026, "eval_steps_per_second": 0.657, "step": 300 }, { "epoch": 0.11, "learning_rate": 4.971666037023044e-07, "logits/chosen": -3.2539610862731934, "logits/rejected": -3.189645290374756, "logps/chosen": -218.11282348632812, "logps/rejected": -482.13238525390625, "loss": 0.1651, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5336198210716248, "rewards/margins": 8.224891662597656, "rewards/rejected": -8.758512496948242, "step": 310 }, { "epoch": 0.11, "learning_rate": 4.952776728371742e-07, "logits/chosen": -3.1139378547668457, "logits/rejected": -3.1811156272888184, "logps/chosen": -395.83807373046875, "logps/rejected": -422.86126708984375, "loss": 0.0746, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5153623819351196, "rewards/margins": 8.82536506652832, "rewards/rejected": -9.340726852416992, "step": 320 }, { "epoch": 0.11, "learning_rate": 4.933887419720438e-07, "logits/chosen": -3.185594320297241, "logits/rejected": -3.141575574874878, "logps/chosen": -274.31475830078125, "logps/rejected": -605.5130004882812, "loss": 0.0585, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5278395414352417, "rewards/margins": 9.545147895812988, "rewards/rejected": -10.07298755645752, "step": 330 }, { "epoch": 0.12, "learning_rate": 4.914998111069135e-07, "logits/chosen": -3.2110514640808105, "logits/rejected": -3.252934217453003, "logps/chosen": -197.74810791015625, "logps/rejected": -527.8907470703125, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -0.15357597172260284, "rewards/margins": 10.004459381103516, "rewards/rejected": -10.158035278320312, "step": 340 }, { "epoch": 0.12, "learning_rate": 4.896108802417831e-07, "logits/chosen": -3.2196593284606934, "logits/rejected": -3.116806745529175, "logps/chosen": -252.00607299804688, "logps/rejected": -586.9210205078125, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -0.3348214626312256, "rewards/margins": 10.310604095458984, "rewards/rejected": -10.645425796508789, "step": 350 }, { "epoch": 0.12, "learning_rate": 4.877219493766528e-07, "logits/chosen": -3.2220287322998047, "logits/rejected": -3.3167202472686768, "logps/chosen": -195.5267333984375, "logps/rejected": -458.151123046875, "loss": 0.0394, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5834547877311707, "rewards/margins": 13.117718696594238, "rewards/rejected": -13.701173782348633, "step": 360 }, { "epoch": 0.13, "learning_rate": 4.858330185115224e-07, "logits/chosen": -3.207420825958252, "logits/rejected": -3.1515700817108154, "logps/chosen": -204.42086791992188, "logps/rejected": -775.6949462890625, "loss": 0.1166, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6016790270805359, "rewards/margins": 10.775850296020508, "rewards/rejected": -11.377527236938477, "step": 370 }, { "epoch": 0.13, "learning_rate": 4.839440876463921e-07, "logits/chosen": -3.261585235595703, "logits/rejected": -3.1481096744537354, "logps/chosen": -224.3874053955078, "logps/rejected": -682.3482055664062, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -0.30562129616737366, "rewards/margins": 11.442914962768555, "rewards/rejected": -11.748537063598633, "step": 380 }, { "epoch": 0.13, "learning_rate": 4.820551567812618e-07, "logits/chosen": -3.1784658432006836, "logits/rejected": -3.128221273422241, "logps/chosen": -276.6926574707031, "logps/rejected": -459.2041931152344, "loss": 0.0602, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6345301866531372, "rewards/margins": 11.814035415649414, "rewards/rejected": -12.448564529418945, "step": 390 }, { "epoch": 0.14, "learning_rate": 4.801662259161314e-07, "logits/chosen": -3.2648723125457764, "logits/rejected": -3.161604404449463, "logps/chosen": -217.42861938476562, "logps/rejected": -682.0848388671875, "loss": 0.0876, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6267936825752258, "rewards/margins": 13.1732177734375, "rewards/rejected": -13.800012588500977, "step": 400 }, { "epoch": 0.14, "eval_logits/chosen": -3.275331974029541, "eval_logits/rejected": -3.206566095352173, "eval_logps/chosen": -270.2173767089844, "eval_logps/rejected": -561.2250366210938, "eval_loss": 0.059712644666433334, "eval_rewards/accuracies": 0.9865319728851318, "eval_rewards/chosen": -1.4411805868148804, "eval_rewards/margins": 12.258034706115723, "eval_rewards/rejected": -13.699216842651367, "eval_runtime": 451.4664, "eval_samples_per_second": 21.043, "eval_steps_per_second": 0.658, "step": 400 }, { "epoch": 0.14, "learning_rate": 4.782772950510011e-07, "logits/chosen": -3.2593846321105957, "logits/rejected": -3.095431327819824, "logps/chosen": -216.7010498046875, "logps/rejected": -542.0743408203125, "loss": 0.0473, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9579986333847046, "rewards/margins": 10.643548965454102, "rewards/rejected": -11.601548194885254, "step": 410 }, { "epoch": 0.14, "learning_rate": 4.7638836418587073e-07, "logits/chosen": -3.1919872760772705, "logits/rejected": -3.1519837379455566, "logps/chosen": -222.6879425048828, "logps/rejected": -716.9598388671875, "loss": 0.0867, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.237994909286499, "rewards/margins": 13.0872220993042, "rewards/rejected": -14.325218200683594, "step": 420 }, { "epoch": 0.15, "learning_rate": 4.7449943332074044e-07, "logits/chosen": -3.1929614543914795, "logits/rejected": -3.092634677886963, "logps/chosen": -203.4615020751953, "logps/rejected": -651.5323486328125, "loss": 0.0356, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7688810229301453, "rewards/margins": 12.568059921264648, "rewards/rejected": -13.336939811706543, "step": 430 }, { "epoch": 0.15, "learning_rate": 4.7261050245561014e-07, "logits/chosen": -3.2217884063720703, "logits/rejected": -3.214984178543091, "logps/chosen": -209.8181915283203, "logps/rejected": -500.721923828125, "loss": 0.039, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.114246129989624, "rewards/margins": 12.425156593322754, "rewards/rejected": -13.539402961730957, "step": 440 }, { "epoch": 0.15, "learning_rate": 4.7072157159047975e-07, "logits/chosen": -3.191771984100342, "logits/rejected": -3.340414524078369, "logps/chosen": -251.2989501953125, "logps/rejected": -453.38812255859375, "loss": 0.0678, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3671858310699463, "rewards/margins": 13.762568473815918, "rewards/rejected": -15.129753112792969, "step": 450 }, { "epoch": 0.16, "learning_rate": 4.6883264072534946e-07, "logits/chosen": -3.229691982269287, "logits/rejected": -3.2845585346221924, "logps/chosen": -275.047119140625, "logps/rejected": -483.4437561035156, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -0.8710290789604187, "rewards/margins": 13.83751106262207, "rewards/rejected": -14.708539962768555, "step": 460 }, { "epoch": 0.16, "learning_rate": 4.6694370986021906e-07, "logits/chosen": -3.261676073074341, "logits/rejected": -3.308655261993408, "logps/chosen": -364.6744689941406, "logps/rejected": -348.94195556640625, "loss": 0.2232, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5191086530685425, "rewards/margins": 12.976631164550781, "rewards/rejected": -14.49573802947998, "step": 470 }, { "epoch": 0.16, "learning_rate": 4.6505477899508877e-07, "logits/chosen": -3.34987211227417, "logits/rejected": -3.1941416263580322, "logps/chosen": -215.4343719482422, "logps/rejected": -669.6368408203125, "loss": 0.1063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3562166690826416, "rewards/margins": 15.658292770385742, "rewards/rejected": -17.014511108398438, "step": 480 }, { "epoch": 0.17, "learning_rate": 4.631658481299584e-07, "logits/chosen": -3.346367359161377, "logits/rejected": -3.2884230613708496, "logps/chosen": -229.24880981445312, "logps/rejected": -600.4979858398438, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -0.6670745015144348, "rewards/margins": 18.63577651977539, "rewards/rejected": -19.3028507232666, "step": 490 }, { "epoch": 0.17, "learning_rate": 4.612769172648281e-07, "logits/chosen": -3.2286019325256348, "logits/rejected": -3.272526264190674, "logps/chosen": -331.59771728515625, "logps/rejected": -384.60089111328125, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 0.12641265988349915, "rewards/margins": 16.35049819946289, "rewards/rejected": -16.224084854125977, "step": 500 }, { "epoch": 0.17, "eval_logits/chosen": -3.2601242065429688, "eval_logits/rejected": -3.20926833152771, "eval_logps/chosen": -258.48223876953125, "eval_logps/rejected": -596.7301635742188, "eval_loss": 0.08744455128908157, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": -0.26766717433929443, "eval_rewards/margins": 16.982051849365234, "eval_rewards/rejected": -17.249719619750977, "eval_runtime": 452.2491, "eval_samples_per_second": 21.006, "eval_steps_per_second": 0.657, "step": 500 }, { "epoch": 0.17, "learning_rate": 4.5938798639969773e-07, "logits/chosen": -3.1900811195373535, "logits/rejected": -3.165273904800415, "logps/chosen": -310.04840087890625, "logps/rejected": -706.6512451171875, "loss": 0.0779, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.08271221071481705, "rewards/margins": 16.163110733032227, "rewards/rejected": -16.24582290649414, "step": 510 }, { "epoch": 0.18, "learning_rate": 4.574990555345674e-07, "logits/chosen": -3.233689546585083, "logits/rejected": -3.1376094818115234, "logps/chosen": -227.1941680908203, "logps/rejected": -826.2154541015625, "loss": 0.3105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.272916555404663, "rewards/margins": 36.48918151855469, "rewards/rejected": -37.76210021972656, "step": 520 }, { "epoch": 0.18, "learning_rate": 4.556101246694371e-07, "logits/chosen": -3.3540146350860596, "logits/rejected": -3.1351585388183594, "logps/chosen": -206.82461547851562, "logps/rejected": -887.9490966796875, "loss": 0.1344, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5983909368515015, "rewards/margins": 37.151283264160156, "rewards/rejected": -37.749671936035156, "step": 530 }, { "epoch": 0.18, "learning_rate": 4.5372119380430675e-07, "logits/chosen": -3.2952301502227783, "logits/rejected": -3.2790439128875732, "logps/chosen": -196.64913940429688, "logps/rejected": -452.2648010253906, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -0.035662006586790085, "rewards/margins": 12.354232788085938, "rewards/rejected": -12.389894485473633, "step": 540 }, { "epoch": 0.19, "learning_rate": 4.518322629391764e-07, "logits/chosen": -3.339197874069214, "logits/rejected": -3.2826008796691895, "logps/chosen": -215.4700469970703, "logps/rejected": -526.360595703125, "loss": 0.0365, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8227304220199585, "rewards/margins": 13.043925285339355, "rewards/rejected": -13.866655349731445, "step": 550 }, { "epoch": 0.19, "learning_rate": 4.4994333207404607e-07, "logits/chosen": -3.238922119140625, "logits/rejected": -3.2724907398223877, "logps/chosen": -336.5367126464844, "logps/rejected": -491.58416748046875, "loss": 0.0928, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3954570293426514, "rewards/margins": 12.810078620910645, "rewards/rejected": -14.205537796020508, "step": 560 }, { "epoch": 0.19, "learning_rate": 4.480544012089157e-07, "logits/chosen": -3.1199750900268555, "logits/rejected": -3.2144436836242676, "logps/chosen": -372.39630126953125, "logps/rejected": -409.29168701171875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -0.8765771985054016, "rewards/margins": 15.976984977722168, "rewards/rejected": -16.85356330871582, "step": 570 }, { "epoch": 0.2, "learning_rate": 4.461654703437854e-07, "logits/chosen": -3.1464171409606934, "logits/rejected": -3.1977782249450684, "logps/chosen": -329.33721923828125, "logps/rejected": -579.863525390625, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": -0.042082421481609344, "rewards/margins": 18.116785049438477, "rewards/rejected": -18.158864974975586, "step": 580 }, { "epoch": 0.2, "learning_rate": 4.442765394786551e-07, "logits/chosen": -3.2647204399108887, "logits/rejected": -3.173513889312744, "logps/chosen": -210.3753662109375, "logps/rejected": -446.36175537109375, "loss": 0.1479, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5811902284622192, "rewards/margins": 14.033243179321289, "rewards/rejected": -14.614431381225586, "step": 590 }, { "epoch": 0.2, "learning_rate": 4.423876086135247e-07, "logits/chosen": -3.1431689262390137, "logits/rejected": -3.110055685043335, "logps/chosen": -322.7793884277344, "logps/rejected": -717.8176879882812, "loss": 0.1206, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4665941298007965, "rewards/margins": 16.126140594482422, "rewards/rejected": -16.592737197875977, "step": 600 }, { "epoch": 0.2, "eval_logits/chosen": -3.2024269104003906, "eval_logits/rejected": -3.168919801712036, "eval_logps/chosen": -260.0578308105469, "eval_logps/rejected": -580.7472534179688, "eval_loss": 0.06855383515357971, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": -0.42522314190864563, "eval_rewards/margins": 15.226205825805664, "eval_rewards/rejected": -15.65142822265625, "eval_runtime": 462.2117, "eval_samples_per_second": 20.553, "eval_steps_per_second": 0.643, "step": 600 }, { "epoch": 0.21, "learning_rate": 4.404986777483944e-07, "logits/chosen": -3.0911104679107666, "logits/rejected": -3.2177834510803223, "logps/chosen": -253.5253448486328, "logps/rejected": -505.5723571777344, "loss": 0.0857, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4803188443183899, "rewards/margins": 14.278889656066895, "rewards/rejected": -14.759210586547852, "step": 610 }, { "epoch": 0.21, "learning_rate": 4.3860974688326405e-07, "logits/chosen": -3.17213773727417, "logits/rejected": -3.153881072998047, "logps/chosen": -341.3065185546875, "logps/rejected": -633.9520874023438, "loss": 0.0714, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8739015460014343, "rewards/margins": 14.758657455444336, "rewards/rejected": -15.632558822631836, "step": 620 }, { "epoch": 0.21, "learning_rate": 4.367208160181337e-07, "logits/chosen": -3.1356098651885986, "logits/rejected": -3.269990921020508, "logps/chosen": -300.65411376953125, "logps/rejected": -361.2610778808594, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": -0.36743250489234924, "rewards/margins": 14.380537033081055, "rewards/rejected": -14.747968673706055, "step": 630 }, { "epoch": 0.22, "learning_rate": 4.348318851530034e-07, "logits/chosen": -3.2255866527557373, "logits/rejected": -3.2463996410369873, "logps/chosen": -273.51776123046875, "logps/rejected": -633.8323364257812, "loss": 0.4839, "rewards/accuracies": 1.0, "rewards/chosen": -0.413738876581192, "rewards/margins": 17.93083381652832, "rewards/rejected": -18.344573974609375, "step": 640 }, { "epoch": 0.22, "learning_rate": 4.32942954287873e-07, "logits/chosen": -3.2475905418395996, "logits/rejected": -3.3233604431152344, "logps/chosen": -264.77996826171875, "logps/rejected": -494.20855712890625, "loss": 0.0405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.034877061843872, "rewards/margins": 19.03714942932129, "rewards/rejected": -20.072025299072266, "step": 650 }, { "epoch": 0.22, "learning_rate": 4.3105402342274273e-07, "logits/chosen": -3.2771410942077637, "logits/rejected": -3.288753032684326, "logps/chosen": -217.06277465820312, "logps/rejected": -617.0843505859375, "loss": 0.0595, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8464560508728027, "rewards/margins": 17.9014949798584, "rewards/rejected": -18.74795150756836, "step": 660 }, { "epoch": 0.23, "learning_rate": 4.2916509255761233e-07, "logits/chosen": -3.221287488937378, "logits/rejected": -3.186659812927246, "logps/chosen": -258.8490295410156, "logps/rejected": -667.2969970703125, "loss": 0.0473, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5467805862426758, "rewards/margins": 15.2776460647583, "rewards/rejected": -16.82442855834961, "step": 670 }, { "epoch": 0.23, "learning_rate": 4.2727616169248204e-07, "logits/chosen": -3.259873151779175, "logits/rejected": -3.214630126953125, "logps/chosen": -230.55038452148438, "logps/rejected": -636.394287109375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.225334644317627, "rewards/margins": 16.479169845581055, "rewards/rejected": -17.70450210571289, "step": 680 }, { "epoch": 0.23, "learning_rate": 4.253872308273517e-07, "logits/chosen": -3.2281653881073, "logits/rejected": -3.0544328689575195, "logps/chosen": -301.4876403808594, "logps/rejected": -634.31689453125, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3511894941329956, "rewards/margins": 17.707965850830078, "rewards/rejected": -18.05915641784668, "step": 690 }, { "epoch": 0.24, "learning_rate": 4.2349829996222135e-07, "logits/chosen": -3.2615528106689453, "logits/rejected": -3.146155834197998, "logps/chosen": -257.6978759765625, "logps/rejected": -651.7713012695312, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -0.4299692213535309, "rewards/margins": 16.86044692993164, "rewards/rejected": -17.290416717529297, "step": 700 }, { "epoch": 0.24, "eval_logits/chosen": -3.2957923412323, "eval_logits/rejected": -3.2304515838623047, "eval_logps/chosen": -262.88763427734375, "eval_logps/rejected": -599.524169921875, "eval_loss": 0.06300165504217148, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": -0.7082026600837708, "eval_rewards/margins": 16.8209228515625, "eval_rewards/rejected": -17.529123306274414, "eval_runtime": 459.6358, "eval_samples_per_second": 20.669, "eval_steps_per_second": 0.646, "step": 700 }, { "epoch": 0.24, "learning_rate": 4.2160936909709106e-07, "logits/chosen": -3.19771409034729, "logits/rejected": -3.223461866378784, "logps/chosen": -260.47796630859375, "logps/rejected": -475.68341064453125, "loss": 0.0625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8059867024421692, "rewards/margins": 14.349054336547852, "rewards/rejected": -15.155041694641113, "step": 710 }, { "epoch": 0.24, "learning_rate": 4.1972043823196066e-07, "logits/chosen": -3.252122402191162, "logits/rejected": -3.2310726642608643, "logps/chosen": -230.65432739257812, "logps/rejected": -538.1376953125, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": -1.229381799697876, "rewards/margins": 19.127460479736328, "rewards/rejected": -20.356840133666992, "step": 720 }, { "epoch": 0.25, "learning_rate": 4.1783150736683037e-07, "logits/chosen": -3.29115629196167, "logits/rejected": -3.1870217323303223, "logps/chosen": -289.22088623046875, "logps/rejected": -615.3138427734375, "loss": 0.0194, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0578917264938354, "rewards/margins": 16.45914649963379, "rewards/rejected": -17.51703643798828, "step": 730 }, { "epoch": 0.25, "learning_rate": 4.1594257650170003e-07, "logits/chosen": -3.2585701942443848, "logits/rejected": -3.274698257446289, "logps/chosen": -291.2822570800781, "logps/rejected": -534.2890625, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.3135229349136353, "rewards/margins": 16.959497451782227, "rewards/rejected": -18.273021697998047, "step": 740 }, { "epoch": 0.25, "learning_rate": 4.140536456365697e-07, "logits/chosen": -3.2825417518615723, "logits/rejected": -3.2054946422576904, "logps/chosen": -204.35067749023438, "logps/rejected": -632.234375, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -0.6541246771812439, "rewards/margins": 18.069482803344727, "rewards/rejected": -18.723609924316406, "step": 750 }, { "epoch": 0.26, "learning_rate": 4.1216471477143934e-07, "logits/chosen": -3.3405938148498535, "logits/rejected": -3.2676734924316406, "logps/chosen": -214.34793090820312, "logps/rejected": -491.17803955078125, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.9016032218933105, "rewards/margins": 17.351411819458008, "rewards/rejected": -18.253013610839844, "step": 760 }, { "epoch": 0.26, "learning_rate": 4.10275783906309e-07, "logits/chosen": -3.272359848022461, "logits/rejected": -3.3219470977783203, "logps/chosen": -323.8666687011719, "logps/rejected": -581.1951904296875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.8133894205093384, "rewards/margins": 15.973344802856445, "rewards/rejected": -16.786733627319336, "step": 770 }, { "epoch": 0.27, "learning_rate": 4.0838685304117865e-07, "logits/chosen": -3.353651762008667, "logits/rejected": -3.259639263153076, "logps/chosen": -336.8185729980469, "logps/rejected": -695.9039306640625, "loss": 0.2987, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0787861347198486, "rewards/margins": 17.26241683959961, "rewards/rejected": -18.34119987487793, "step": 780 }, { "epoch": 0.27, "learning_rate": 4.0649792217604836e-07, "logits/chosen": -3.4104390144348145, "logits/rejected": -3.359205961227417, "logps/chosen": -263.760498046875, "logps/rejected": -435.3912658691406, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.8803040385246277, "rewards/margins": 15.400744438171387, "rewards/rejected": -16.281049728393555, "step": 790 }, { "epoch": 0.27, "learning_rate": 4.04608991310918e-07, "logits/chosen": -3.4024837017059326, "logits/rejected": -3.3290324211120605, "logps/chosen": -274.2511291503906, "logps/rejected": -561.3245239257812, "loss": 0.0461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1979191303253174, "rewards/margins": 17.333105087280273, "rewards/rejected": -18.531024932861328, "step": 800 }, { "epoch": 0.27, "eval_logits/chosen": -3.5158255100250244, "eval_logits/rejected": -3.3936259746551514, "eval_logps/chosen": -268.3476867675781, "eval_logps/rejected": -636.7913818359375, "eval_loss": 0.034081265330314636, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": -1.2542104721069336, "eval_rewards/margins": 20.001632690429688, "eval_rewards/rejected": -21.255842208862305, "eval_runtime": 459.1675, "eval_samples_per_second": 20.69, "eval_steps_per_second": 0.647, "step": 800 }, { "epoch": 0.28, "learning_rate": 4.0272006044578767e-07, "logits/chosen": -3.4222965240478516, "logits/rejected": -3.3901965618133545, "logps/chosen": -271.127685546875, "logps/rejected": -702.6803588867188, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -0.3194767236709595, "rewards/margins": 20.698434829711914, "rewards/rejected": -21.01791000366211, "step": 810 }, { "epoch": 0.28, "learning_rate": 4.0083112958065733e-07, "logits/chosen": -3.3571228981018066, "logits/rejected": -3.378098964691162, "logps/chosen": -354.65338134765625, "logps/rejected": -460.4593811035156, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.2477954924106598, "rewards/margins": 16.9534854888916, "rewards/rejected": -16.705690383911133, "step": 820 }, { "epoch": 0.28, "learning_rate": 3.98942198715527e-07, "logits/chosen": -3.4696907997131348, "logits/rejected": -3.3343029022216797, "logps/chosen": -200.86773681640625, "logps/rejected": -668.7745361328125, "loss": 0.0249, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9370819926261902, "rewards/margins": 18.672664642333984, "rewards/rejected": -19.609745025634766, "step": 830 }, { "epoch": 0.29, "learning_rate": 3.970532678503967e-07, "logits/chosen": -3.428400754928589, "logits/rejected": -3.392636775970459, "logps/chosen": -278.60614013671875, "logps/rejected": -547.3959350585938, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -0.5795613527297974, "rewards/margins": 18.447797775268555, "rewards/rejected": -19.027359008789062, "step": 840 }, { "epoch": 0.29, "learning_rate": 3.951643369852663e-07, "logits/chosen": -3.3655319213867188, "logits/rejected": -3.2096476554870605, "logps/chosen": -277.06854248046875, "logps/rejected": -800.4194946289062, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -0.36317068338394165, "rewards/margins": 18.841623306274414, "rewards/rejected": -19.204795837402344, "step": 850 }, { "epoch": 0.29, "learning_rate": 3.93275406120136e-07, "logits/chosen": -3.3958353996276855, "logits/rejected": -3.364926815032959, "logps/chosen": -260.45867919921875, "logps/rejected": -638.11328125, "loss": 0.0255, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.29275327920913696, "rewards/margins": 20.899669647216797, "rewards/rejected": -21.192420959472656, "step": 860 }, { "epoch": 0.3, "learning_rate": 3.913864752550056e-07, "logits/chosen": -3.555651903152466, "logits/rejected": -3.362936019897461, "logps/chosen": -336.64990234375, "logps/rejected": -678.8165283203125, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 0.29712626338005066, "rewards/margins": 20.055971145629883, "rewards/rejected": -19.758846282958984, "step": 870 }, { "epoch": 0.3, "learning_rate": 3.894975443898753e-07, "logits/chosen": -3.5221924781799316, "logits/rejected": -3.4192261695861816, "logps/chosen": -184.4869384765625, "logps/rejected": -517.7220458984375, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 0.2702915668487549, "rewards/margins": 17.328258514404297, "rewards/rejected": -17.05797004699707, "step": 880 }, { "epoch": 0.3, "learning_rate": 3.87608613524745e-07, "logits/chosen": -3.3060905933380127, "logits/rejected": -3.3240628242492676, "logps/chosen": -249.1829833984375, "logps/rejected": -423.19183349609375, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 0.4387660622596741, "rewards/margins": 15.609105110168457, "rewards/rejected": -15.17033863067627, "step": 890 }, { "epoch": 0.31, "learning_rate": 3.857196826596146e-07, "logits/chosen": -3.432100296020508, "logits/rejected": -3.3346171379089355, "logps/chosen": -213.72622680664062, "logps/rejected": -522.6224365234375, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.36038094758987427, "rewards/margins": 17.75735855102539, "rewards/rejected": -17.39698028564453, "step": 900 }, { "epoch": 0.31, "eval_logits/chosen": -3.494093418121338, "eval_logits/rejected": -3.3745272159576416, "eval_logps/chosen": -252.02415466308594, "eval_logps/rejected": -596.7079467773438, "eval_loss": 0.029099902138113976, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.3781438171863556, "eval_rewards/margins": 17.625646591186523, "eval_rewards/rejected": -17.24750328063965, "eval_runtime": 458.9428, "eval_samples_per_second": 20.7, "eval_steps_per_second": 0.647, "step": 900 }, { "epoch": 0.31, "learning_rate": 3.8383075179448433e-07, "logits/chosen": -3.4732635021209717, "logits/rejected": -3.32080340385437, "logps/chosen": -299.28045654296875, "logps/rejected": -412.11737060546875, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 0.5949827432632446, "rewards/margins": 14.9349365234375, "rewards/rejected": -14.339953422546387, "step": 910 }, { "epoch": 0.31, "learning_rate": 3.8194182092935394e-07, "logits/chosen": -3.5183627605438232, "logits/rejected": -3.418959856033325, "logps/chosen": -206.648193359375, "logps/rejected": -540.2952880859375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.14846962690353394, "rewards/margins": 18.112279891967773, "rewards/rejected": -17.963809967041016, "step": 920 }, { "epoch": 0.32, "learning_rate": 3.8005289006422365e-07, "logits/chosen": -3.4474635124206543, "logits/rejected": -3.4216742515563965, "logps/chosen": -245.8963165283203, "logps/rejected": -637.1114501953125, "loss": 0.0291, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09156160801649094, "rewards/margins": 20.223173141479492, "rewards/rejected": -20.13161277770996, "step": 930 }, { "epoch": 0.32, "learning_rate": 3.7816395919909325e-07, "logits/chosen": -3.43304443359375, "logits/rejected": -3.3371405601501465, "logps/chosen": -205.36312866210938, "logps/rejected": -577.2255249023438, "loss": 0.0663, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1940477043390274, "rewards/margins": 17.12152099609375, "rewards/rejected": -17.31557273864746, "step": 940 }, { "epoch": 0.32, "learning_rate": 3.7627502833396296e-07, "logits/chosen": -3.4164633750915527, "logits/rejected": -3.334111452102661, "logps/chosen": -252.3224334716797, "logps/rejected": -522.1593017578125, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -0.06414094567298889, "rewards/margins": 16.263051986694336, "rewards/rejected": -16.327190399169922, "step": 950 }, { "epoch": 0.33, "learning_rate": 3.7438609746883267e-07, "logits/chosen": -3.3646836280822754, "logits/rejected": -3.2790799140930176, "logps/chosen": -303.45623779296875, "logps/rejected": -564.6026611328125, "loss": 0.138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1330849528312683, "rewards/margins": 15.452998161315918, "rewards/rejected": -15.319913864135742, "step": 960 }, { "epoch": 0.33, "learning_rate": 3.7249716660370227e-07, "logits/chosen": -3.5046660900115967, "logits/rejected": -3.309417724609375, "logps/chosen": -193.89120483398438, "logps/rejected": -745.4686279296875, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -0.28295058012008667, "rewards/margins": 19.531200408935547, "rewards/rejected": -19.814151763916016, "step": 970 }, { "epoch": 0.33, "learning_rate": 3.70608235738572e-07, "logits/chosen": -3.4429755210876465, "logits/rejected": -3.297156572341919, "logps/chosen": -311.4560852050781, "logps/rejected": -706.6754150390625, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 0.6654003858566284, "rewards/margins": 20.311656951904297, "rewards/rejected": -19.646259307861328, "step": 980 }, { "epoch": 0.34, "learning_rate": 3.687193048734416e-07, "logits/chosen": -3.448244094848633, "logits/rejected": -3.379380464553833, "logps/chosen": -186.96884155273438, "logps/rejected": -682.4005126953125, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 0.3271837532520294, "rewards/margins": 18.818578720092773, "rewards/rejected": -18.491397857666016, "step": 990 }, { "epoch": 0.34, "learning_rate": 3.668303740083113e-07, "logits/chosen": -3.448310136795044, "logits/rejected": -3.360579013824463, "logps/chosen": -269.36297607421875, "logps/rejected": -513.7430419921875, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 0.5543234944343567, "rewards/margins": 17.900325775146484, "rewards/rejected": -17.34600067138672, "step": 1000 }, { "epoch": 0.34, "eval_logits/chosen": -3.4703125953674316, "eval_logits/rejected": -3.3235578536987305, "eval_logps/chosen": -256.8191223144531, "eval_logps/rejected": -620.40966796875, "eval_loss": 0.024830101057887077, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.10135477781295776, "eval_rewards/margins": 19.51631736755371, "eval_rewards/rejected": -19.6176700592041, "eval_runtime": 459.9759, "eval_samples_per_second": 20.653, "eval_steps_per_second": 0.646, "step": 1000 }, { "epoch": 0.34, "learning_rate": 3.6494144314318094e-07, "logits/chosen": -3.414768695831299, "logits/rejected": -3.3657734394073486, "logps/chosen": -279.57891845703125, "logps/rejected": -491.0535583496094, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 0.23943224549293518, "rewards/margins": 14.393826484680176, "rewards/rejected": -14.154393196105957, "step": 1010 }, { "epoch": 0.35, "learning_rate": 3.630525122780506e-07, "logits/chosen": -3.4718196392059326, "logits/rejected": -3.3277339935302734, "logps/chosen": -190.89041137695312, "logps/rejected": -588.8604736328125, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": -0.5905485153198242, "rewards/margins": 20.05153465270996, "rewards/rejected": -20.642086029052734, "step": 1020 }, { "epoch": 0.35, "learning_rate": 3.6116358141292026e-07, "logits/chosen": -3.4760546684265137, "logits/rejected": -3.3435845375061035, "logps/chosen": -203.86050415039062, "logps/rejected": -592.036865234375, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7946874499320984, "rewards/margins": 24.075565338134766, "rewards/rejected": -24.87025260925293, "step": 1030 }, { "epoch": 0.35, "learning_rate": 3.592746505477899e-07, "logits/chosen": -3.3917622566223145, "logits/rejected": -3.2945053577423096, "logps/chosen": -364.39324951171875, "logps/rejected": -807.2418212890625, "loss": 0.0895, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8276251554489136, "rewards/margins": 28.354557037353516, "rewards/rejected": -29.182178497314453, "step": 1040 }, { "epoch": 0.36, "learning_rate": 3.573857196826596e-07, "logits/chosen": -3.579265594482422, "logits/rejected": -3.4756500720977783, "logps/chosen": -352.31451416015625, "logps/rejected": -720.6471557617188, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": -1.192226529121399, "rewards/margins": 35.108890533447266, "rewards/rejected": -36.301116943359375, "step": 1050 }, { "epoch": 0.36, "learning_rate": 3.554967888175293e-07, "logits/chosen": -3.6324493885040283, "logits/rejected": -3.4453964233398438, "logps/chosen": -275.65631103515625, "logps/rejected": -759.9483642578125, "loss": 0.0578, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.29346764087677, "rewards/margins": 34.53205490112305, "rewards/rejected": -35.825523376464844, "step": 1060 }, { "epoch": 0.36, "learning_rate": 3.5360785795239893e-07, "logits/chosen": -3.4921135902404785, "logits/rejected": -3.409318208694458, "logps/chosen": -326.6520080566406, "logps/rejected": -557.5526733398438, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -1.002267599105835, "rewards/margins": 22.072717666625977, "rewards/rejected": -23.07498550415039, "step": 1070 }, { "epoch": 0.37, "learning_rate": 3.517189270872686e-07, "logits/chosen": -3.4556102752685547, "logits/rejected": -3.42388916015625, "logps/chosen": -266.01983642578125, "logps/rejected": -539.4820556640625, "loss": 0.1283, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8836309313774109, "rewards/margins": 21.98915672302246, "rewards/rejected": -22.872791290283203, "step": 1080 }, { "epoch": 0.37, "learning_rate": 3.4982999622213824e-07, "logits/chosen": -3.318305253982544, "logits/rejected": -3.3859639167785645, "logps/chosen": -262.29730224609375, "logps/rejected": -423.01531982421875, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -0.06474297493696213, "rewards/margins": 20.654415130615234, "rewards/rejected": -20.719158172607422, "step": 1090 }, { "epoch": 0.37, "learning_rate": 3.479410653570079e-07, "logits/chosen": -3.4146296977996826, "logits/rejected": -3.3967928886413574, "logps/chosen": -263.7212219238281, "logps/rejected": -663.2433471679688, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 0.18020686507225037, "rewards/margins": 23.35451889038086, "rewards/rejected": -23.174312591552734, "step": 1100 }, { "epoch": 0.37, "eval_logits/chosen": -3.512702226638794, "eval_logits/rejected": -3.369966745376587, "eval_logps/chosen": -253.3647918701172, "eval_logps/rejected": -652.9178466796875, "eval_loss": 0.04759080708026886, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 0.24408026039600372, "eval_rewards/margins": 23.112565994262695, "eval_rewards/rejected": -22.868486404418945, "eval_runtime": 459.4583, "eval_samples_per_second": 20.677, "eval_steps_per_second": 0.646, "step": 1100 }, { "epoch": 0.38, "learning_rate": 3.460521344918776e-07, "logits/chosen": -3.4741005897521973, "logits/rejected": -3.3692924976348877, "logps/chosen": -177.2493896484375, "logps/rejected": -684.059326171875, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 0.10438014566898346, "rewards/margins": 19.24637794494629, "rewards/rejected": -19.141998291015625, "step": 1110 }, { "epoch": 0.38, "learning_rate": 3.441632036267472e-07, "logits/chosen": -3.519453525543213, "logits/rejected": -3.450646162033081, "logps/chosen": -206.3206787109375, "logps/rejected": -591.7532348632812, "loss": 0.1613, "rewards/accuracies": 1.0, "rewards/chosen": 0.19403085112571716, "rewards/margins": 18.250011444091797, "rewards/rejected": -18.055980682373047, "step": 1120 }, { "epoch": 0.38, "learning_rate": 3.422742727616169e-07, "logits/chosen": -3.4171173572540283, "logits/rejected": -3.402754545211792, "logps/chosen": -285.92596435546875, "logps/rejected": -420.0677795410156, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 0.29394811391830444, "rewards/margins": 17.166831970214844, "rewards/rejected": -16.872886657714844, "step": 1130 }, { "epoch": 0.39, "learning_rate": 3.403853418964866e-07, "logits/chosen": -3.395599842071533, "logits/rejected": -3.352531909942627, "logps/chosen": -267.9078674316406, "logps/rejected": -602.1204833984375, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.25522318482398987, "rewards/margins": 24.411819458007812, "rewards/rejected": -24.667041778564453, "step": 1140 }, { "epoch": 0.39, "learning_rate": 3.3849641103135623e-07, "logits/chosen": -3.4090888500213623, "logits/rejected": -3.3088366985321045, "logps/chosen": -302.02484130859375, "logps/rejected": -429.8524475097656, "loss": 0.078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5110098123550415, "rewards/margins": 17.73603057861328, "rewards/rejected": -18.247039794921875, "step": 1150 }, { "epoch": 0.39, "learning_rate": 3.3660748016622594e-07, "logits/chosen": -3.284252882003784, "logits/rejected": -3.364304304122925, "logps/chosen": -329.56622314453125, "logps/rejected": -570.0607299804688, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 0.14421747624874115, "rewards/margins": 23.145597457885742, "rewards/rejected": -23.00138282775879, "step": 1160 }, { "epoch": 0.4, "learning_rate": 3.3471854930109554e-07, "logits/chosen": -3.2979743480682373, "logits/rejected": -3.361307144165039, "logps/chosen": -186.98870849609375, "logps/rejected": -614.3134765625, "loss": 0.074, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3126475214958191, "rewards/margins": 21.26107406616211, "rewards/rejected": -20.948427200317383, "step": 1170 }, { "epoch": 0.4, "learning_rate": 3.3282961843596525e-07, "logits/chosen": -3.3583598136901855, "logits/rejected": -3.2492566108703613, "logps/chosen": -241.7067108154297, "logps/rejected": -792.7431640625, "loss": 0.0112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.38423070311546326, "rewards/margins": 24.471080780029297, "rewards/rejected": -24.086851119995117, "step": 1180 }, { "epoch": 0.4, "learning_rate": 3.3094068757083485e-07, "logits/chosen": -3.342761516571045, "logits/rejected": -3.278207778930664, "logps/chosen": -275.2469482421875, "logps/rejected": -528.4226684570312, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.5847635269165039, "rewards/margins": 23.485641479492188, "rewards/rejected": -22.900880813598633, "step": 1190 }, { "epoch": 0.41, "learning_rate": 3.2905175670570456e-07, "logits/chosen": -3.3539364337921143, "logits/rejected": -3.304426670074463, "logps/chosen": -338.57464599609375, "logps/rejected": -586.469970703125, "loss": 0.0153, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03903156518936157, "rewards/margins": 19.734745025634766, "rewards/rejected": -19.77377700805664, "step": 1200 }, { "epoch": 0.41, "eval_logits/chosen": -3.443300485610962, "eval_logits/rejected": -3.3281137943267822, "eval_logps/chosen": -253.46900939941406, "eval_logps/rejected": -634.9552612304688, "eval_loss": 0.03437602147459984, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.23365840315818787, "eval_rewards/margins": 21.305892944335938, "eval_rewards/rejected": -21.072233200073242, "eval_runtime": 460.0736, "eval_samples_per_second": 20.649, "eval_steps_per_second": 0.646, "step": 1200 }, { "epoch": 0.41, "learning_rate": 3.271628258405742e-07, "logits/chosen": -3.3404109477996826, "logits/rejected": -3.3278732299804688, "logps/chosen": -266.605712890625, "logps/rejected": -432.56243896484375, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 0.16924318671226501, "rewards/margins": 20.001480102539062, "rewards/rejected": -19.832239151000977, "step": 1210 }, { "epoch": 0.41, "learning_rate": 3.252738949754439e-07, "logits/chosen": -3.325308322906494, "logits/rejected": -3.2718875408172607, "logps/chosen": -239.7605743408203, "logps/rejected": -689.46240234375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 1.257117748260498, "rewards/margins": 22.868335723876953, "rewards/rejected": -21.611217498779297, "step": 1220 }, { "epoch": 0.42, "learning_rate": 3.233849641103136e-07, "logits/chosen": -3.409510850906372, "logits/rejected": -3.2908267974853516, "logps/chosen": -186.55523681640625, "logps/rejected": -838.28271484375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.19115009903907776, "rewards/margins": 21.021907806396484, "rewards/rejected": -20.830759048461914, "step": 1230 }, { "epoch": 0.42, "learning_rate": 3.214960332451832e-07, "logits/chosen": -3.3691024780273438, "logits/rejected": -3.2960734367370605, "logps/chosen": -276.49310302734375, "logps/rejected": -673.3426513671875, "loss": 0.027, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.20441074669361115, "rewards/margins": 21.827564239501953, "rewards/rejected": -22.03197479248047, "step": 1240 }, { "epoch": 0.42, "learning_rate": 3.196071023800529e-07, "logits/chosen": -3.3508994579315186, "logits/rejected": -3.2464585304260254, "logps/chosen": -262.32366943359375, "logps/rejected": -801.7515869140625, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.30456364154815674, "rewards/margins": 18.644428253173828, "rewards/rejected": -18.948993682861328, "step": 1250 }, { "epoch": 0.43, "learning_rate": 3.1771817151492255e-07, "logits/chosen": -3.3253204822540283, "logits/rejected": -3.182976245880127, "logps/chosen": -271.783203125, "logps/rejected": -792.1323852539062, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": 0.4825393557548523, "rewards/margins": 23.283037185668945, "rewards/rejected": -22.800498962402344, "step": 1260 }, { "epoch": 0.43, "learning_rate": 3.158292406497922e-07, "logits/chosen": -3.281102418899536, "logits/rejected": -3.305457353591919, "logps/chosen": -303.48016357421875, "logps/rejected": -653.7716674804688, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 0.34954649209976196, "rewards/margins": 18.922035217285156, "rewards/rejected": -18.57248306274414, "step": 1270 }, { "epoch": 0.44, "learning_rate": 3.1394030978466186e-07, "logits/chosen": -3.4600696563720703, "logits/rejected": -3.3820204734802246, "logps/chosen": -226.12631225585938, "logps/rejected": -523.9305419921875, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.39240407943725586, "rewards/margins": 21.62295913696289, "rewards/rejected": -22.015361785888672, "step": 1280 }, { "epoch": 0.44, "learning_rate": 3.120513789195315e-07, "logits/chosen": -3.4715068340301514, "logits/rejected": -3.343583583831787, "logps/chosen": -208.66445922851562, "logps/rejected": -671.6092529296875, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -0.2728857100009918, "rewards/margins": 23.407129287719727, "rewards/rejected": -23.68001365661621, "step": 1290 }, { "epoch": 0.44, "learning_rate": 3.1016244805440117e-07, "logits/chosen": -3.3316524028778076, "logits/rejected": -3.259505033493042, "logps/chosen": -252.8136749267578, "logps/rejected": -625.1239013671875, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": -0.11937759071588516, "rewards/margins": 17.959081649780273, "rewards/rejected": -18.078458786010742, "step": 1300 }, { "epoch": 0.44, "eval_logits/chosen": -3.2942728996276855, "eval_logits/rejected": -3.2086024284362793, "eval_logps/chosen": -251.94058227539062, "eval_logps/rejected": -619.3322143554688, "eval_loss": 0.03203802928328514, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 0.3865027129650116, "eval_rewards/margins": 19.89643096923828, "eval_rewards/rejected": -19.50992774963379, "eval_runtime": 459.9792, "eval_samples_per_second": 20.653, "eval_steps_per_second": 0.646, "step": 1300 }, { "epoch": 0.45, "learning_rate": 3.082735171892709e-07, "logits/chosen": -3.283964157104492, "logits/rejected": -3.3076889514923096, "logps/chosen": -200.6337432861328, "logps/rejected": -526.4237060546875, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": -0.0861460343003273, "rewards/margins": 19.039958953857422, "rewards/rejected": -19.126102447509766, "step": 1310 }, { "epoch": 0.45, "learning_rate": 3.0638458632414054e-07, "logits/chosen": -3.3101184368133545, "logits/rejected": -3.2691454887390137, "logps/chosen": -248.6476593017578, "logps/rejected": -730.8521728515625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.3345467150211334, "rewards/margins": 21.247516632080078, "rewards/rejected": -20.912973403930664, "step": 1320 }, { "epoch": 0.45, "learning_rate": 3.044956554590102e-07, "logits/chosen": -3.3269455432891846, "logits/rejected": -3.3831734657287598, "logps/chosen": -250.7469024658203, "logps/rejected": -490.6947326660156, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": -0.0001120984525186941, "rewards/margins": 21.17990493774414, "rewards/rejected": -21.180017471313477, "step": 1330 }, { "epoch": 0.46, "learning_rate": 3.0260672459387985e-07, "logits/chosen": -3.445746660232544, "logits/rejected": -3.3551487922668457, "logps/chosen": -194.33663940429688, "logps/rejected": -587.1473388671875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -0.2983397841453552, "rewards/margins": 21.926280975341797, "rewards/rejected": -22.224620819091797, "step": 1340 }, { "epoch": 0.46, "learning_rate": 3.007177937287495e-07, "logits/chosen": -3.3813576698303223, "logits/rejected": -3.2786917686462402, "logps/chosen": -200.80307006835938, "logps/rejected": -760.9759521484375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.03139156848192215, "rewards/margins": 25.731273651123047, "rewards/rejected": -25.762670516967773, "step": 1350 }, { "epoch": 0.46, "learning_rate": 2.988288628636192e-07, "logits/chosen": -3.431568145751953, "logits/rejected": -3.312471389770508, "logps/chosen": -261.0316162109375, "logps/rejected": -559.0184326171875, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 0.4650034010410309, "rewards/margins": 19.260265350341797, "rewards/rejected": -18.795259475708008, "step": 1360 }, { "epoch": 0.47, "learning_rate": 2.969399319984888e-07, "logits/chosen": -3.423460006713867, "logits/rejected": -3.2820401191711426, "logps/chosen": -222.3740692138672, "logps/rejected": -875.9083862304688, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -0.7400540709495544, "rewards/margins": 24.780017852783203, "rewards/rejected": -25.520071029663086, "step": 1370 }, { "epoch": 0.47, "learning_rate": 2.950510011333585e-07, "logits/chosen": -3.4326465129852295, "logits/rejected": -3.351288318634033, "logps/chosen": -222.06887817382812, "logps/rejected": -691.3297729492188, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 0.06572739779949188, "rewards/margins": 22.379207611083984, "rewards/rejected": -22.313480377197266, "step": 1380 }, { "epoch": 0.47, "learning_rate": 2.9316207026822813e-07, "logits/chosen": -3.4397430419921875, "logits/rejected": -3.294466733932495, "logps/chosen": -224.94287109375, "logps/rejected": -672.6677856445312, "loss": 0.0298, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.15702833235263824, "rewards/margins": 21.14691734313965, "rewards/rejected": -21.303943634033203, "step": 1390 }, { "epoch": 0.48, "learning_rate": 2.9127313940309784e-07, "logits/chosen": -3.423677921295166, "logits/rejected": -3.4165546894073486, "logps/chosen": -191.03762817382812, "logps/rejected": -612.9573364257812, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3937866985797882, "rewards/margins": 22.95120620727539, "rewards/rejected": -23.34499740600586, "step": 1400 }, { "epoch": 0.48, "eval_logits/chosen": -3.505523920059204, "eval_logits/rejected": -3.3688061237335205, "eval_logps/chosen": -259.40972900390625, "eval_logps/rejected": -670.28564453125, "eval_loss": 0.01642242632806301, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.36041346192359924, "eval_rewards/margins": 24.244855880737305, "eval_rewards/rejected": -24.605268478393555, "eval_runtime": 459.6598, "eval_samples_per_second": 20.667, "eval_steps_per_second": 0.646, "step": 1400 }, { "epoch": 0.48, "learning_rate": 2.8938420853796754e-07, "logits/chosen": -3.4179089069366455, "logits/rejected": -3.3702964782714844, "logps/chosen": -280.6731262207031, "logps/rejected": -737.249267578125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.025803815573453903, "rewards/margins": 28.388423919677734, "rewards/rejected": -28.414230346679688, "step": 1410 }, { "epoch": 0.48, "learning_rate": 2.8749527767283715e-07, "logits/chosen": -3.369783878326416, "logits/rejected": -3.376282215118408, "logps/chosen": -268.02752685546875, "logps/rejected": -681.7142944335938, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.40402960777282715, "rewards/margins": 21.377483367919922, "rewards/rejected": -21.78151512145996, "step": 1420 }, { "epoch": 0.49, "learning_rate": 2.8560634680770686e-07, "logits/chosen": -3.4302501678466797, "logits/rejected": -3.362534761428833, "logps/chosen": -268.420166015625, "logps/rejected": -660.0174560546875, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.9907222986221313, "rewards/margins": 23.22337532043457, "rewards/rejected": -24.214096069335938, "step": 1430 }, { "epoch": 0.49, "learning_rate": 2.8371741594257646e-07, "logits/chosen": -3.4184436798095703, "logits/rejected": -3.3809783458709717, "logps/chosen": -341.3996887207031, "logps/rejected": -584.3710327148438, "loss": 0.1015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.279394268989563, "rewards/margins": 25.102916717529297, "rewards/rejected": -25.382308959960938, "step": 1440 }, { "epoch": 0.49, "learning_rate": 2.8182848507744617e-07, "logits/chosen": -3.5210273265838623, "logits/rejected": -3.411952495574951, "logps/chosen": -221.7975311279297, "logps/rejected": -520.7232666015625, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.764832854270935, "rewards/margins": 22.8917236328125, "rewards/rejected": -24.65655517578125, "step": 1450 }, { "epoch": 0.5, "learning_rate": 2.799395542123158e-07, "logits/chosen": -3.3874526023864746, "logits/rejected": -3.3241126537323, "logps/chosen": -336.1128845214844, "logps/rejected": -680.0740356445312, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -0.9126793742179871, "rewards/margins": 21.444177627563477, "rewards/rejected": -22.35685920715332, "step": 1460 }, { "epoch": 0.5, "learning_rate": 2.780506233471855e-07, "logits/chosen": -3.438469409942627, "logits/rejected": -3.3389315605163574, "logps/chosen": -200.33642578125, "logps/rejected": -602.5469360351562, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0372685194015503, "rewards/margins": 25.210084915161133, "rewards/rejected": -26.247350692749023, "step": 1470 }, { "epoch": 0.5, "learning_rate": 2.7616169248205513e-07, "logits/chosen": -3.3702118396759033, "logits/rejected": -3.3400275707244873, "logps/chosen": -205.5651397705078, "logps/rejected": -772.1658935546875, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8077577352523804, "rewards/margins": 37.4057731628418, "rewards/rejected": -39.213531494140625, "step": 1480 }, { "epoch": 0.51, "learning_rate": 2.742727616169248e-07, "logits/chosen": -3.3958728313446045, "logits/rejected": -3.326359510421753, "logps/chosen": -254.0430450439453, "logps/rejected": -525.82568359375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.1838462352752686, "rewards/margins": 29.607269287109375, "rewards/rejected": -30.791118621826172, "step": 1490 }, { "epoch": 0.51, "learning_rate": 2.723838307517945e-07, "logits/chosen": -3.3379979133605957, "logits/rejected": -3.2290759086608887, "logps/chosen": -330.87176513671875, "logps/rejected": -727.2958984375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.1591452956199646, "rewards/margins": 32.132423400878906, "rewards/rejected": -32.29157257080078, "step": 1500 }, { "epoch": 0.51, "eval_logits/chosen": -3.4454987049102783, "eval_logits/rejected": -3.298642158508301, "eval_logps/chosen": -264.3898010253906, "eval_logps/rejected": -762.0860595703125, "eval_loss": 0.011499395594000816, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.8584219813346863, "eval_rewards/margins": 32.92688751220703, "eval_rewards/rejected": -33.785308837890625, "eval_runtime": 460.1896, "eval_samples_per_second": 20.644, "eval_steps_per_second": 0.645, "step": 1500 }, { "epoch": 0.51, "learning_rate": 2.7049489988666416e-07, "logits/chosen": -3.3881945610046387, "logits/rejected": -3.26824951171875, "logps/chosen": -310.400634765625, "logps/rejected": -629.2418212890625, "loss": 0.0191, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2945066690444946, "rewards/margins": 28.674571990966797, "rewards/rejected": -29.96907615661621, "step": 1510 }, { "epoch": 0.52, "learning_rate": 2.686059690215338e-07, "logits/chosen": -3.300244092941284, "logits/rejected": -3.251279830932617, "logps/chosen": -273.0030212402344, "logps/rejected": -812.864990234375, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -0.8532045483589172, "rewards/margins": 32.147369384765625, "rewards/rejected": -33.00057601928711, "step": 1520 }, { "epoch": 0.52, "learning_rate": 2.6671703815640347e-07, "logits/chosen": -3.333925247192383, "logits/rejected": -3.2999801635742188, "logps/chosen": -247.5169219970703, "logps/rejected": -541.9185791015625, "loss": 0.0593, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.754341721534729, "rewards/margins": 25.133520126342773, "rewards/rejected": -25.887863159179688, "step": 1530 }, { "epoch": 0.52, "learning_rate": 2.648281072912731e-07, "logits/chosen": -3.4177489280700684, "logits/rejected": -3.2177319526672363, "logps/chosen": -210.8187713623047, "logps/rejected": -762.6394653320312, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5407046675682068, "rewards/margins": 20.313051223754883, "rewards/rejected": -20.853755950927734, "step": 1540 }, { "epoch": 0.53, "learning_rate": 2.629391764261428e-07, "logits/chosen": -3.3744702339172363, "logits/rejected": -3.3114571571350098, "logps/chosen": -278.4391784667969, "logps/rejected": -541.5650634765625, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8419182896614075, "rewards/margins": 19.7562313079834, "rewards/rejected": -20.598148345947266, "step": 1550 }, { "epoch": 0.53, "learning_rate": 2.610502455610125e-07, "logits/chosen": -3.3633506298065186, "logits/rejected": -3.3126094341278076, "logps/chosen": -352.4827575683594, "logps/rejected": -738.1781005859375, "loss": 0.1372, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.72525554895401, "rewards/margins": 22.449813842773438, "rewards/rejected": -23.175067901611328, "step": 1560 }, { "epoch": 0.53, "learning_rate": 2.591613146958821e-07, "logits/chosen": -3.4918789863586426, "logits/rejected": -3.401181697845459, "logps/chosen": -248.28060913085938, "logps/rejected": -687.9749145507812, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.5774227380752563, "rewards/margins": 24.075176239013672, "rewards/rejected": -24.652597427368164, "step": 1570 }, { "epoch": 0.54, "learning_rate": 2.572723838307518e-07, "logits/chosen": -3.4203097820281982, "logits/rejected": -3.419062852859497, "logps/chosen": -262.29730224609375, "logps/rejected": -591.0722045898438, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -0.19718003273010254, "rewards/margins": 21.856969833374023, "rewards/rejected": -22.05415153503418, "step": 1580 }, { "epoch": 0.54, "learning_rate": 2.5538345296562145e-07, "logits/chosen": -3.4380905628204346, "logits/rejected": -3.3508377075195312, "logps/chosen": -261.0439147949219, "logps/rejected": -576.0928955078125, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -0.15919718146324158, "rewards/margins": 24.959590911865234, "rewards/rejected": -25.11878776550293, "step": 1590 }, { "epoch": 0.54, "learning_rate": 2.534945221004911e-07, "logits/chosen": -3.292537212371826, "logits/rejected": -3.3550610542297363, "logps/chosen": -380.9278869628906, "logps/rejected": -537.5498657226562, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.219398856163025, "rewards/margins": 21.093488693237305, "rewards/rejected": -19.874088287353516, "step": 1600 }, { "epoch": 0.54, "eval_logits/chosen": -3.481621742248535, "eval_logits/rejected": -3.3371667861938477, "eval_logps/chosen": -259.466796875, "eval_logps/rejected": -648.6592407226562, "eval_loss": 0.05247886851429939, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.3661208748817444, "eval_rewards/margins": 22.07651138305664, "eval_rewards/rejected": -22.442630767822266, "eval_runtime": 459.9365, "eval_samples_per_second": 20.655, "eval_steps_per_second": 0.646, "step": 1600 }, { "epoch": 0.55, "learning_rate": 2.516055912353608e-07, "logits/chosen": -3.3571033477783203, "logits/rejected": -3.2502875328063965, "logps/chosen": -277.83837890625, "logps/rejected": -724.0299072265625, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.8198422193527222, "rewards/margins": 20.712133407592773, "rewards/rejected": -21.53197479248047, "step": 1610 }, { "epoch": 0.55, "learning_rate": 2.497166603702304e-07, "logits/chosen": -3.3982932567596436, "logits/rejected": -3.273448944091797, "logps/chosen": -262.4102478027344, "logps/rejected": -759.910888671875, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -0.7558759450912476, "rewards/margins": 27.525121688842773, "rewards/rejected": -28.2810001373291, "step": 1620 }, { "epoch": 0.55, "learning_rate": 2.4782772950510013e-07, "logits/chosen": -3.3453495502471924, "logits/rejected": -3.254922389984131, "logps/chosen": -278.51287841796875, "logps/rejected": -550.3945922851562, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -0.517066478729248, "rewards/margins": 19.643749237060547, "rewards/rejected": -20.160816192626953, "step": 1630 }, { "epoch": 0.56, "learning_rate": 2.459387986399698e-07, "logits/chosen": -3.198303699493408, "logits/rejected": -3.2498583793640137, "logps/chosen": -262.79925537109375, "logps/rejected": -609.2022705078125, "loss": 0.0064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8170091509819031, "rewards/margins": 23.31692123413086, "rewards/rejected": -24.133930206298828, "step": 1640 }, { "epoch": 0.56, "learning_rate": 2.4404986777483944e-07, "logits/chosen": -3.2812728881835938, "logits/rejected": -3.2436320781707764, "logps/chosen": -318.45599365234375, "logps/rejected": -511.8076171875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 0.28894877433776855, "rewards/margins": 20.95669937133789, "rewards/rejected": -20.667749404907227, "step": 1650 }, { "epoch": 0.56, "learning_rate": 2.421609369097091e-07, "logits/chosen": -3.3857626914978027, "logits/rejected": -3.2701289653778076, "logps/chosen": -277.44281005859375, "logps/rejected": -628.5038452148438, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.5810535550117493, "rewards/margins": 24.878406524658203, "rewards/rejected": -25.459457397460938, "step": 1660 }, { "epoch": 0.57, "learning_rate": 2.4027200604457875e-07, "logits/chosen": -3.3911805152893066, "logits/rejected": -3.285238742828369, "logps/chosen": -245.50717163085938, "logps/rejected": -631.26611328125, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.19214244186878204, "rewards/margins": 23.272733688354492, "rewards/rejected": -23.46487808227539, "step": 1670 }, { "epoch": 0.57, "learning_rate": 2.383830751794484e-07, "logits/chosen": -3.3616461753845215, "logits/rejected": -3.2842185497283936, "logps/chosen": -346.10980224609375, "logps/rejected": -848.2496948242188, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.1335320770740509, "rewards/margins": 26.36749839782715, "rewards/rejected": -26.50103187561035, "step": 1680 }, { "epoch": 0.57, "learning_rate": 2.364941443143181e-07, "logits/chosen": -3.392210006713867, "logits/rejected": -3.366656541824341, "logps/chosen": -214.9364471435547, "logps/rejected": -542.2637939453125, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6650384664535522, "rewards/margins": 20.180553436279297, "rewards/rejected": -20.845592498779297, "step": 1690 }, { "epoch": 0.58, "learning_rate": 2.3460521344918775e-07, "logits/chosen": -3.4405808448791504, "logits/rejected": -3.241579055786133, "logps/chosen": -228.2866973876953, "logps/rejected": -684.4705810546875, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -0.8028446435928345, "rewards/margins": 30.044261932373047, "rewards/rejected": -30.847110748291016, "step": 1700 }, { "epoch": 0.58, "eval_logits/chosen": -3.4487576484680176, "eval_logits/rejected": -3.310159206390381, "eval_logps/chosen": -260.0583801269531, "eval_logps/rejected": -667.2958374023438, "eval_loss": 0.05137615278363228, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.42527905106544495, "eval_rewards/margins": 23.88100814819336, "eval_rewards/rejected": -24.30628776550293, "eval_runtime": 459.8654, "eval_samples_per_second": 20.658, "eval_steps_per_second": 0.646, "step": 1700 }, { "epoch": 0.58, "learning_rate": 2.327162825840574e-07, "logits/chosen": -3.330414295196533, "logits/rejected": -3.2635841369628906, "logps/chosen": -326.1940612792969, "logps/rejected": -795.8521728515625, "loss": 0.0068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.16869966685771942, "rewards/margins": 20.82526206970215, "rewards/rejected": -20.993961334228516, "step": 1710 }, { "epoch": 0.58, "learning_rate": 2.3082735171892708e-07, "logits/chosen": -3.359579086303711, "logits/rejected": -3.1651229858398438, "logps/chosen": -249.9299774169922, "logps/rejected": -1015.4552001953125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.624853789806366, "rewards/margins": 25.351341247558594, "rewards/rejected": -25.9761962890625, "step": 1720 }, { "epoch": 0.59, "learning_rate": 2.2893842085379674e-07, "logits/chosen": -3.4433200359344482, "logits/rejected": -3.279329776763916, "logps/chosen": -218.9633026123047, "logps/rejected": -560.0067138671875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.8118854761123657, "rewards/margins": 23.033185958862305, "rewards/rejected": -23.845069885253906, "step": 1730 }, { "epoch": 0.59, "learning_rate": 2.2704948998866642e-07, "logits/chosen": -3.2414298057556152, "logits/rejected": -3.2678425312042236, "logps/chosen": -395.7589111328125, "logps/rejected": -570.04345703125, "loss": 0.1014, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5239030122756958, "rewards/margins": 21.01217269897461, "rewards/rejected": -22.536075592041016, "step": 1740 }, { "epoch": 0.59, "learning_rate": 2.2516055912353608e-07, "logits/chosen": -3.3212122917175293, "logits/rejected": -3.2659897804260254, "logps/chosen": -303.695556640625, "logps/rejected": -606.518310546875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.16412094235420227, "rewards/margins": 30.60677146911621, "rewards/rejected": -30.442651748657227, "step": 1750 }, { "epoch": 0.6, "learning_rate": 2.2327162825840573e-07, "logits/chosen": -3.2955994606018066, "logits/rejected": -3.330519199371338, "logps/chosen": -313.94586181640625, "logps/rejected": -576.6971435546875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.5809284448623657, "rewards/margins": 21.45393180847168, "rewards/rejected": -22.034860610961914, "step": 1760 }, { "epoch": 0.6, "learning_rate": 2.213826973932754e-07, "logits/chosen": -3.3576037883758545, "logits/rejected": -3.1798653602600098, "logps/chosen": -205.84744262695312, "logps/rejected": -822.9035034179688, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8852649927139282, "rewards/margins": 27.4652156829834, "rewards/rejected": -28.350482940673828, "step": 1770 }, { "epoch": 0.61, "learning_rate": 2.1949376652814505e-07, "logits/chosen": -3.352653980255127, "logits/rejected": -3.3083279132843018, "logps/chosen": -281.2442932128906, "logps/rejected": -563.5089111328125, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.149827241897583, "rewards/margins": 20.38897132873535, "rewards/rejected": -21.53879737854004, "step": 1780 }, { "epoch": 0.61, "learning_rate": 2.176048356630147e-07, "logits/chosen": -3.310255765914917, "logits/rejected": -3.2874767780303955, "logps/chosen": -390.83807373046875, "logps/rejected": -587.5394897460938, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.459928035736084, "rewards/margins": 24.047969818115234, "rewards/rejected": -24.507898330688477, "step": 1790 }, { "epoch": 0.61, "learning_rate": 2.157159047978844e-07, "logits/chosen": -3.4679012298583984, "logits/rejected": -3.2024967670440674, "logps/chosen": -222.7393035888672, "logps/rejected": -737.6758422851562, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.8875459432601929, "rewards/margins": 23.48456573486328, "rewards/rejected": -24.372106552124023, "step": 1800 }, { "epoch": 0.61, "eval_logits/chosen": -3.415928602218628, "eval_logits/rejected": -3.2893950939178467, "eval_logps/chosen": -259.3686828613281, "eval_logps/rejected": -665.7247314453125, "eval_loss": 0.03561777248978615, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.3563132882118225, "eval_rewards/margins": 23.792869567871094, "eval_rewards/rejected": -24.149181365966797, "eval_runtime": 460.2637, "eval_samples_per_second": 20.64, "eval_steps_per_second": 0.645, "step": 1800 }, { "epoch": 0.62, "learning_rate": 2.1382697393275407e-07, "logits/chosen": -3.35868501663208, "logits/rejected": -3.294783353805542, "logps/chosen": -320.07965087890625, "logps/rejected": -738.1607666015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.3287244737148285, "rewards/margins": 26.954553604125977, "rewards/rejected": -27.283273696899414, "step": 1810 }, { "epoch": 0.62, "learning_rate": 2.1193804306762372e-07, "logits/chosen": -3.297001600265503, "logits/rejected": -3.2148499488830566, "logps/chosen": -269.9095153808594, "logps/rejected": -863.6824340820312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.7257425785064697, "rewards/margins": 20.9593448638916, "rewards/rejected": -21.685089111328125, "step": 1820 }, { "epoch": 0.62, "learning_rate": 2.1004911220249338e-07, "logits/chosen": -3.3534445762634277, "logits/rejected": -3.3217594623565674, "logps/chosen": -259.9757080078125, "logps/rejected": -613.6614990234375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.3644919991493225, "rewards/margins": 22.092815399169922, "rewards/rejected": -21.728322982788086, "step": 1830 }, { "epoch": 0.63, "learning_rate": 2.0816018133736303e-07, "logits/chosen": -3.335651397705078, "logits/rejected": -3.2650818824768066, "logps/chosen": -335.0670166015625, "logps/rejected": -782.08349609375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012598276371136308, "rewards/margins": 27.04427719116211, "rewards/rejected": -27.043018341064453, "step": 1840 }, { "epoch": 0.63, "learning_rate": 2.0627125047223271e-07, "logits/chosen": -3.357959032058716, "logits/rejected": -3.278252363204956, "logps/chosen": -266.217529296875, "logps/rejected": -641.0094604492188, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -0.38472968339920044, "rewards/margins": 23.43660545349121, "rewards/rejected": -23.821334838867188, "step": 1850 }, { "epoch": 0.63, "learning_rate": 2.0438231960710237e-07, "logits/chosen": -3.4566192626953125, "logits/rejected": -3.297550678253174, "logps/chosen": -265.8533630371094, "logps/rejected": -815.4766845703125, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.3703159093856812, "rewards/margins": 29.9644832611084, "rewards/rejected": -31.334796905517578, "step": 1860 }, { "epoch": 0.64, "learning_rate": 2.0249338874197203e-07, "logits/chosen": -3.4326648712158203, "logits/rejected": -3.4103198051452637, "logps/chosen": -268.6964416503906, "logps/rejected": -670.5267333984375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.2914245128631592, "rewards/margins": 30.470592498779297, "rewards/rejected": -31.762014389038086, "step": 1870 }, { "epoch": 0.64, "learning_rate": 2.0060445787684168e-07, "logits/chosen": -3.5192794799804688, "logits/rejected": -3.332420825958252, "logps/chosen": -209.93576049804688, "logps/rejected": -745.6237182617188, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.117185354232788, "rewards/margins": 33.509422302246094, "rewards/rejected": -34.62660598754883, "step": 1880 }, { "epoch": 0.64, "learning_rate": 1.9871552701171136e-07, "logits/chosen": -3.4282066822052, "logits/rejected": -3.368283748626709, "logps/chosen": -285.8004150390625, "logps/rejected": -652.3516235351562, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.227819561958313, "rewards/margins": 26.842388153076172, "rewards/rejected": -27.070209503173828, "step": 1890 }, { "epoch": 0.65, "learning_rate": 1.9682659614658105e-07, "logits/chosen": -3.4334263801574707, "logits/rejected": -3.336756467819214, "logps/chosen": -335.62725830078125, "logps/rejected": -702.3507080078125, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.38077887892723083, "rewards/margins": 29.044042587280273, "rewards/rejected": -29.424823760986328, "step": 1900 }, { "epoch": 0.65, "eval_logits/chosen": -3.534825086593628, "eval_logits/rejected": -3.3839564323425293, "eval_logps/chosen": -265.37176513671875, "eval_logps/rejected": -763.1902465820312, "eval_loss": 0.03813723102211952, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.9566193222999573, "eval_rewards/margins": 32.939109802246094, "eval_rewards/rejected": -33.895729064941406, "eval_runtime": 460.0454, "eval_samples_per_second": 20.65, "eval_steps_per_second": 0.646, "step": 1900 }, { "epoch": 0.65, "learning_rate": 1.949376652814507e-07, "logits/chosen": -3.396048069000244, "logits/rejected": -3.275059461593628, "logps/chosen": -252.13223266601562, "logps/rejected": -747.9521484375, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7216171026229858, "rewards/margins": 32.15826416015625, "rewards/rejected": -32.87987518310547, "step": 1910 }, { "epoch": 0.65, "learning_rate": 1.9304873441632036e-07, "logits/chosen": -3.5610382556915283, "logits/rejected": -3.372722625732422, "logps/chosen": -199.77662658691406, "logps/rejected": -584.327392578125, "loss": 0.2531, "rewards/accuracies": 1.0, "rewards/chosen": -1.1961396932601929, "rewards/margins": 29.65005874633789, "rewards/rejected": -30.8461971282959, "step": 1920 }, { "epoch": 0.66, "learning_rate": 1.9115980355119001e-07, "logits/chosen": -3.369832992553711, "logits/rejected": -3.314237117767334, "logps/chosen": -215.8191680908203, "logps/rejected": -815.2225952148438, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": -0.3732140064239502, "rewards/margins": 27.058151245117188, "rewards/rejected": -27.431365966796875, "step": 1930 }, { "epoch": 0.66, "learning_rate": 1.8927087268605967e-07, "logits/chosen": -3.4612975120544434, "logits/rejected": -3.288508176803589, "logps/chosen": -218.56201171875, "logps/rejected": -906.2979736328125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -1.040290117263794, "rewards/margins": 31.08365249633789, "rewards/rejected": -32.12394714355469, "step": 1940 }, { "epoch": 0.66, "learning_rate": 1.8738194182092935e-07, "logits/chosen": -3.348356246948242, "logits/rejected": -3.2337405681610107, "logps/chosen": -391.52935791015625, "logps/rejected": -651.825927734375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.7969802021980286, "rewards/margins": 28.728500366210938, "rewards/rejected": -27.9315185546875, "step": 1950 }, { "epoch": 0.67, "learning_rate": 1.85493010955799e-07, "logits/chosen": -3.409937620162964, "logits/rejected": -3.3914718627929688, "logps/chosen": -265.65386962890625, "logps/rejected": -596.263916015625, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -0.3935322165489197, "rewards/margins": 24.100194931030273, "rewards/rejected": -24.49372673034668, "step": 1960 }, { "epoch": 0.67, "learning_rate": 1.8360408009066866e-07, "logits/chosen": -3.3308022022247314, "logits/rejected": -3.2177085876464844, "logps/chosen": -282.54046630859375, "logps/rejected": -823.97412109375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.41234469413757324, "rewards/margins": 22.745586395263672, "rewards/rejected": -23.157930374145508, "step": 1970 }, { "epoch": 0.67, "learning_rate": 1.8171514922553835e-07, "logits/chosen": -3.446866512298584, "logits/rejected": -3.372515916824341, "logps/chosen": -276.74468994140625, "logps/rejected": -599.4273071289062, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.35287076234817505, "rewards/margins": 21.927663803100586, "rewards/rejected": -22.280534744262695, "step": 1980 }, { "epoch": 0.68, "learning_rate": 1.79826218360408e-07, "logits/chosen": -3.317155361175537, "logits/rejected": -3.3876793384552, "logps/chosen": -318.00860595703125, "logps/rejected": -510.11151123046875, "loss": 0.0041, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5086533427238464, "rewards/margins": 24.184043884277344, "rewards/rejected": -24.692697525024414, "step": 1990 }, { "epoch": 0.68, "learning_rate": 1.7793728749527768e-07, "logits/chosen": -3.3701987266540527, "logits/rejected": -3.286350727081299, "logps/chosen": -267.9636535644531, "logps/rejected": -580.6959838867188, "loss": 0.006, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3567093014717102, "rewards/margins": 24.048980712890625, "rewards/rejected": -24.40568733215332, "step": 2000 }, { "epoch": 0.68, "eval_logits/chosen": -3.4805188179016113, "eval_logits/rejected": -3.337117910385132, "eval_logps/chosen": -257.2082824707031, "eval_logps/rejected": -686.7160034179688, "eval_loss": 0.0072143604047596455, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.14026859402656555, "eval_rewards/margins": 26.10803985595703, "eval_rewards/rejected": -26.248306274414062, "eval_runtime": 460.6018, "eval_samples_per_second": 20.625, "eval_steps_per_second": 0.645, "step": 2000 }, { "epoch": 0.68, "learning_rate": 1.7604835663014734e-07, "logits/chosen": -3.408255100250244, "logits/rejected": -3.403702974319458, "logps/chosen": -203.19618225097656, "logps/rejected": -665.8862915039062, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -0.6091235280036926, "rewards/margins": 24.911001205444336, "rewards/rejected": -25.520126342773438, "step": 2010 }, { "epoch": 0.69, "learning_rate": 1.74159425765017e-07, "logits/chosen": -3.443638324737549, "logits/rejected": -3.3524250984191895, "logps/chosen": -209.58401489257812, "logps/rejected": -749.86181640625, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.21705660223960876, "rewards/margins": 27.513622283935547, "rewards/rejected": -27.730676651000977, "step": 2020 }, { "epoch": 0.69, "learning_rate": 1.7227049489988665e-07, "logits/chosen": -3.3942184448242188, "logits/rejected": -3.311061143875122, "logps/chosen": -317.77410888671875, "logps/rejected": -552.6033325195312, "loss": 0.014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5486845970153809, "rewards/margins": 22.28003692626953, "rewards/rejected": -22.82872200012207, "step": 2030 }, { "epoch": 0.69, "learning_rate": 1.703815640347563e-07, "logits/chosen": -3.5390067100524902, "logits/rejected": -3.3566482067108154, "logps/chosen": -201.36410522460938, "logps/rejected": -666.7164306640625, "loss": 0.0079, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4069572389125824, "rewards/margins": 28.751266479492188, "rewards/rejected": -29.158227920532227, "step": 2040 }, { "epoch": 0.7, "learning_rate": 1.6849263316962596e-07, "logits/chosen": -3.3728034496307373, "logits/rejected": -3.3503639698028564, "logps/chosen": -267.21881103515625, "logps/rejected": -626.498046875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.2131740152835846, "rewards/margins": 27.280838012695312, "rewards/rejected": -27.067663192749023, "step": 2050 }, { "epoch": 0.7, "learning_rate": 1.6660370230449564e-07, "logits/chosen": -3.4185855388641357, "logits/rejected": -3.3411049842834473, "logps/chosen": -272.4405212402344, "logps/rejected": -621.2442626953125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.10709935426712036, "rewards/margins": 26.78093910217285, "rewards/rejected": -26.673839569091797, "step": 2060 }, { "epoch": 0.7, "learning_rate": 1.6471477143936533e-07, "logits/chosen": -3.4380111694335938, "logits/rejected": -3.3767757415771484, "logps/chosen": -248.4059295654297, "logps/rejected": -633.5654296875, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 0.7223705649375916, "rewards/margins": 27.201065063476562, "rewards/rejected": -26.478694915771484, "step": 2070 }, { "epoch": 0.71, "learning_rate": 1.6282584057423498e-07, "logits/chosen": -3.524405002593994, "logits/rejected": -3.448660373687744, "logps/chosen": -197.21920776367188, "logps/rejected": -635.7800903320312, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -0.5619519948959351, "rewards/margins": 32.2112922668457, "rewards/rejected": -32.77324676513672, "step": 2080 }, { "epoch": 0.71, "learning_rate": 1.6093690970910464e-07, "logits/chosen": -3.3876919746398926, "logits/rejected": -3.3305823802948, "logps/chosen": -287.99945068359375, "logps/rejected": -709.9437255859375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.21997538208961487, "rewards/margins": 31.6516170501709, "rewards/rejected": -31.431640625, "step": 2090 }, { "epoch": 0.71, "learning_rate": 1.590479788439743e-07, "logits/chosen": -3.3942599296569824, "logits/rejected": -3.2963924407958984, "logps/chosen": -314.06060791015625, "logps/rejected": -908.1524658203125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.1970634162425995, "rewards/margins": 28.25901222229004, "rewards/rejected": -28.456073760986328, "step": 2100 }, { "epoch": 0.71, "eval_logits/chosen": -3.497438669204712, "eval_logits/rejected": -3.355656623840332, "eval_logps/chosen": -257.67596435546875, "eval_logps/rejected": -714.7033081054688, "eval_loss": 0.010176397860050201, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.18703816831111908, "eval_rewards/margins": 28.85999298095703, "eval_rewards/rejected": -29.047029495239258, "eval_runtime": 460.179, "eval_samples_per_second": 20.644, "eval_steps_per_second": 0.645, "step": 2100 }, { "epoch": 0.72, "learning_rate": 1.5715904797884398e-07, "logits/chosen": -3.422182559967041, "logits/rejected": -3.3635239601135254, "logps/chosen": -237.2140350341797, "logps/rejected": -596.4075317382812, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.42542657256126404, "rewards/margins": 25.331525802612305, "rewards/rejected": -25.7569522857666, "step": 2110 }, { "epoch": 0.72, "learning_rate": 1.5527011711371363e-07, "logits/chosen": -3.46155047416687, "logits/rejected": -3.287555694580078, "logps/chosen": -221.025634765625, "logps/rejected": -760.5899047851562, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.587006688117981, "rewards/margins": 23.792795181274414, "rewards/rejected": -24.379802703857422, "step": 2120 }, { "epoch": 0.72, "learning_rate": 1.533811862485833e-07, "logits/chosen": -3.4075939655303955, "logits/rejected": -3.2999377250671387, "logps/chosen": -274.0217590332031, "logps/rejected": -778.3746337890625, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0100147724151611, "rewards/margins": 29.110565185546875, "rewards/rejected": -30.120580673217773, "step": 2130 }, { "epoch": 0.73, "learning_rate": 1.5149225538345294e-07, "logits/chosen": -3.4074718952178955, "logits/rejected": -3.328861951828003, "logps/chosen": -234.0587921142578, "logps/rejected": -573.584228515625, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -0.8820997476577759, "rewards/margins": 25.29749870300293, "rewards/rejected": -26.17959976196289, "step": 2140 }, { "epoch": 0.73, "learning_rate": 1.496033245183226e-07, "logits/chosen": -3.433687925338745, "logits/rejected": -3.2161545753479004, "logps/chosen": -210.7549285888672, "logps/rejected": -769.3878173828125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.381535142660141, "rewards/margins": 30.15536117553711, "rewards/rejected": -30.536895751953125, "step": 2150 }, { "epoch": 0.73, "learning_rate": 1.477143936531923e-07, "logits/chosen": -3.316527843475342, "logits/rejected": -3.3601183891296387, "logps/chosen": -334.7134094238281, "logps/rejected": -479.4236755371094, "loss": 0.0101, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2200862169265747, "rewards/margins": 26.942337036132812, "rewards/rejected": -27.162424087524414, "step": 2160 }, { "epoch": 0.74, "learning_rate": 1.4582546278806196e-07, "logits/chosen": -3.372246503829956, "logits/rejected": -3.286343812942505, "logps/chosen": -269.2080993652344, "logps/rejected": -628.6188354492188, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.732342541217804, "rewards/margins": 29.969751358032227, "rewards/rejected": -30.702091217041016, "step": 2170 }, { "epoch": 0.74, "learning_rate": 1.4393653192293162e-07, "logits/chosen": -3.4120821952819824, "logits/rejected": -3.2668113708496094, "logps/chosen": -272.43658447265625, "logps/rejected": -670.0343627929688, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -0.10129819065332413, "rewards/margins": 27.09328842163086, "rewards/rejected": -27.194589614868164, "step": 2180 }, { "epoch": 0.74, "learning_rate": 1.4204760105780127e-07, "logits/chosen": -3.430737257003784, "logits/rejected": -3.3516769409179688, "logps/chosen": -270.27667236328125, "logps/rejected": -783.06884765625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.26623135805130005, "rewards/margins": 37.538352966308594, "rewards/rejected": -37.80458450317383, "step": 2190 }, { "epoch": 0.75, "learning_rate": 1.4015867019267093e-07, "logits/chosen": -3.218029499053955, "logits/rejected": -3.3251090049743652, "logps/chosen": -504.22454833984375, "logps/rejected": -690.3221435546875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.57887864112854, "rewards/margins": 31.29214096069336, "rewards/rejected": -31.871023178100586, "step": 2200 }, { "epoch": 0.75, "eval_logits/chosen": -3.5046215057373047, "eval_logits/rejected": -3.3551418781280518, "eval_logps/chosen": -260.6087341308594, "eval_logps/rejected": -723.0064086914062, "eval_loss": 0.00784117542207241, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.48031851649284363, "eval_rewards/margins": 29.39702606201172, "eval_rewards/rejected": -29.877344131469727, "eval_runtime": 460.3396, "eval_samples_per_second": 20.637, "eval_steps_per_second": 0.645, "step": 2200 }, { "epoch": 0.75, "learning_rate": 1.382697393275406e-07, "logits/chosen": -3.2150256633758545, "logits/rejected": -3.264436721801758, "logps/chosen": -429.79412841796875, "logps/rejected": -691.61767578125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.6222277283668518, "rewards/margins": 24.18454360961914, "rewards/rejected": -24.806772232055664, "step": 2210 }, { "epoch": 0.75, "learning_rate": 1.3638080846241027e-07, "logits/chosen": -3.491255283355713, "logits/rejected": -3.2765109539031982, "logps/chosen": -193.16854858398438, "logps/rejected": -634.7061767578125, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -0.7472248673439026, "rewards/margins": 27.92132568359375, "rewards/rejected": -28.66855239868164, "step": 2220 }, { "epoch": 0.76, "learning_rate": 1.3449187759727992e-07, "logits/chosen": -3.4371280670166016, "logits/rejected": -3.394758701324463, "logps/chosen": -200.95811462402344, "logps/rejected": -568.9024047851562, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.9408467411994934, "rewards/margins": 25.23788833618164, "rewards/rejected": -26.178735733032227, "step": 2230 }, { "epoch": 0.76, "learning_rate": 1.3260294673214958e-07, "logits/chosen": -3.448991060256958, "logits/rejected": -3.209871292114258, "logps/chosen": -215.184326171875, "logps/rejected": -723.2108154296875, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9981611371040344, "rewards/margins": 30.49850845336914, "rewards/rejected": -31.496667861938477, "step": 2240 }, { "epoch": 0.76, "learning_rate": 1.3071401586701926e-07, "logits/chosen": -3.442951202392578, "logits/rejected": -3.4533190727233887, "logps/chosen": -226.30252075195312, "logps/rejected": -531.5767822265625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.7389757633209229, "rewards/margins": 25.352436065673828, "rewards/rejected": -26.091405868530273, "step": 2250 }, { "epoch": 0.77, "learning_rate": 1.2882508500188894e-07, "logits/chosen": -3.467149257659912, "logits/rejected": -3.3945045471191406, "logps/chosen": -211.5034637451172, "logps/rejected": -699.1697387695312, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.5048588514328003, "rewards/margins": 28.00215721130371, "rewards/rejected": -28.50701904296875, "step": 2260 }, { "epoch": 0.77, "learning_rate": 1.269361541367586e-07, "logits/chosen": -3.3899827003479004, "logits/rejected": -3.4136524200439453, "logps/chosen": -312.1116943359375, "logps/rejected": -670.1107177734375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.6437836289405823, "rewards/margins": 30.72330665588379, "rewards/rejected": -31.367090225219727, "step": 2270 }, { "epoch": 0.77, "learning_rate": 1.2504722327162826e-07, "logits/chosen": -3.4552528858184814, "logits/rejected": -3.3502261638641357, "logps/chosen": -222.96029663085938, "logps/rejected": -634.1143798828125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.35589832067489624, "rewards/margins": 26.0993595123291, "rewards/rejected": -26.455257415771484, "step": 2280 }, { "epoch": 0.78, "learning_rate": 1.231582924064979e-07, "logits/chosen": -3.4500694274902344, "logits/rejected": -3.356391191482544, "logps/chosen": -226.2766876220703, "logps/rejected": -721.9498901367188, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.8104637265205383, "rewards/margins": 28.76261329650879, "rewards/rejected": -29.573078155517578, "step": 2290 }, { "epoch": 0.78, "learning_rate": 1.2126936154136757e-07, "logits/chosen": -3.4218978881835938, "logits/rejected": -3.3485636711120605, "logps/chosen": -249.322265625, "logps/rejected": -696.6746826171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3162160813808441, "rewards/margins": 24.947269439697266, "rewards/rejected": -25.263486862182617, "step": 2300 }, { "epoch": 0.78, "eval_logits/chosen": -3.4948203563690186, "eval_logits/rejected": -3.345940589904785, "eval_logps/chosen": -260.5770263671875, "eval_logps/rejected": -708.5813598632812, "eval_loss": 0.007498822640627623, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.4771437644958496, "eval_rewards/margins": 27.957693099975586, "eval_rewards/rejected": -28.434837341308594, "eval_runtime": 461.2562, "eval_samples_per_second": 20.596, "eval_steps_per_second": 0.644, "step": 2300 }, { "epoch": 0.79, "learning_rate": 1.1938043067623725e-07, "logits/chosen": -3.370896816253662, "logits/rejected": -3.3326168060302734, "logps/chosen": -347.14337158203125, "logps/rejected": -689.7429809570312, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.18211320042610168, "rewards/margins": 27.03213119506836, "rewards/rejected": -26.850021362304688, "step": 2310 }, { "epoch": 0.79, "learning_rate": 1.1749149981110692e-07, "logits/chosen": -3.3958702087402344, "logits/rejected": -3.369236707687378, "logps/chosen": -246.0237579345703, "logps/rejected": -561.996826171875, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.559197723865509, "rewards/margins": 20.26263427734375, "rewards/rejected": -20.82183074951172, "step": 2320 }, { "epoch": 0.79, "learning_rate": 1.1560256894597657e-07, "logits/chosen": -3.3493895530700684, "logits/rejected": -3.370333433151245, "logps/chosen": -424.63336181640625, "logps/rejected": -620.3344116210938, "loss": 0.0134, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.06108301132917404, "rewards/margins": 25.891162872314453, "rewards/rejected": -25.952245712280273, "step": 2330 }, { "epoch": 0.8, "learning_rate": 1.1371363808084623e-07, "logits/chosen": -3.4243323802948, "logits/rejected": -3.247948169708252, "logps/chosen": -254.7300262451172, "logps/rejected": -1042.1585693359375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.3867115378379822, "rewards/margins": 35.40499496459961, "rewards/rejected": -35.01828384399414, "step": 2340 }, { "epoch": 0.8, "learning_rate": 1.118247072157159e-07, "logits/chosen": -3.4594218730926514, "logits/rejected": -3.3651282787323, "logps/chosen": -242.59677124023438, "logps/rejected": -538.2279663085938, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.49601420760154724, "rewards/margins": 23.69955062866211, "rewards/rejected": -24.19556427001953, "step": 2350 }, { "epoch": 0.8, "learning_rate": 1.0993577635058557e-07, "logits/chosen": -3.511322498321533, "logits/rejected": -3.356180191040039, "logps/chosen": -196.69949340820312, "logps/rejected": -571.466796875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.02175600454211235, "rewards/margins": 25.043487548828125, "rewards/rejected": -25.021728515625, "step": 2360 }, { "epoch": 0.81, "learning_rate": 1.0804684548545522e-07, "logits/chosen": -3.4236786365509033, "logits/rejected": -3.338402509689331, "logps/chosen": -247.4124755859375, "logps/rejected": -755.9312133789062, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.05236620828509331, "rewards/margins": 24.615774154663086, "rewards/rejected": -24.668140411376953, "step": 2370 }, { "epoch": 0.81, "learning_rate": 1.0615791462032489e-07, "logits/chosen": -3.413336992263794, "logits/rejected": -3.3395888805389404, "logps/chosen": -274.93182373046875, "logps/rejected": -602.7622680664062, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.4528873562812805, "rewards/margins": 26.461483001708984, "rewards/rejected": -26.91436767578125, "step": 2380 }, { "epoch": 0.81, "learning_rate": 1.0426898375519455e-07, "logits/chosen": -3.4481735229492188, "logits/rejected": -3.3065543174743652, "logps/chosen": -273.5448913574219, "logps/rejected": -812.5321044921875, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": -0.46621403098106384, "rewards/margins": 29.433242797851562, "rewards/rejected": -29.899456024169922, "step": 2390 }, { "epoch": 0.82, "learning_rate": 1.0238005289006423e-07, "logits/chosen": -3.498518466949463, "logits/rejected": -3.352489948272705, "logps/chosen": -229.6658172607422, "logps/rejected": -674.809814453125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.506945788860321, "rewards/margins": 24.02678680419922, "rewards/rejected": -24.533733367919922, "step": 2400 }, { "epoch": 0.82, "eval_logits/chosen": -3.4949593544006348, "eval_logits/rejected": -3.3488945960998535, "eval_logps/chosen": -257.8039245605469, "eval_logps/rejected": -704.2630615234375, "eval_loss": 0.00467069773003459, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.19983212649822235, "eval_rewards/margins": 27.803180694580078, "eval_rewards/rejected": -28.00301170349121, "eval_runtime": 460.1772, "eval_samples_per_second": 20.644, "eval_steps_per_second": 0.645, "step": 2400 }, { "epoch": 0.82, "learning_rate": 1.0049112202493389e-07, "logits/chosen": -3.458670139312744, "logits/rejected": -3.34519624710083, "logps/chosen": -229.53012084960938, "logps/rejected": -653.47021484375, "loss": 0.9205, "rewards/accuracies": 1.0, "rewards/chosen": -0.46636518836021423, "rewards/margins": 31.985393524169922, "rewards/rejected": -32.45175552368164, "step": 2410 }, { "epoch": 0.82, "learning_rate": 9.860219115980354e-08, "logits/chosen": -3.354882001876831, "logits/rejected": -3.3362224102020264, "logps/chosen": -298.93890380859375, "logps/rejected": -575.4816284179688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.2504803538322449, "rewards/margins": 25.459463119506836, "rewards/rejected": -25.208984375, "step": 2420 }, { "epoch": 0.83, "learning_rate": 9.671326029467321e-08, "logits/chosen": -3.409818649291992, "logits/rejected": -3.294722080230713, "logps/chosen": -264.5761413574219, "logps/rejected": -753.7411499023438, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.6481723189353943, "rewards/margins": 29.749752044677734, "rewards/rejected": -30.39792251586914, "step": 2430 }, { "epoch": 0.83, "learning_rate": 9.482432942954287e-08, "logits/chosen": -3.398315906524658, "logits/rejected": -3.2785720825195312, "logps/chosen": -275.75506591796875, "logps/rejected": -881.2058715820312, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4502645432949066, "rewards/margins": 33.76361846923828, "rewards/rejected": -34.21388626098633, "step": 2440 }, { "epoch": 0.83, "learning_rate": 9.293539856441255e-08, "logits/chosen": -3.4217476844787598, "logits/rejected": -3.160505533218384, "logps/chosen": -211.6370849609375, "logps/rejected": -842.2235107421875, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -0.4303048253059387, "rewards/margins": 34.46601486206055, "rewards/rejected": -34.89632034301758, "step": 2450 }, { "epoch": 0.84, "learning_rate": 9.10464676992822e-08, "logits/chosen": -3.360055446624756, "logits/rejected": -3.2219321727752686, "logps/chosen": -197.63504028320312, "logps/rejected": -700.8560791015625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.5080915689468384, "rewards/margins": 28.26936912536621, "rewards/rejected": -28.7774600982666, "step": 2460 }, { "epoch": 0.84, "learning_rate": 8.915753683415186e-08, "logits/chosen": -3.324397325515747, "logits/rejected": -3.2371230125427246, "logps/chosen": -265.84197998046875, "logps/rejected": -706.1388549804688, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 0.3217003345489502, "rewards/margins": 31.357486724853516, "rewards/rejected": -31.035785675048828, "step": 2470 }, { "epoch": 0.84, "learning_rate": 8.726860596902153e-08, "logits/chosen": -3.324913501739502, "logits/rejected": -3.31086802482605, "logps/chosen": -289.2586364746094, "logps/rejected": -618.4796142578125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.31292805075645447, "rewards/margins": 25.791996002197266, "rewards/rejected": -25.47906494140625, "step": 2480 }, { "epoch": 0.85, "learning_rate": 8.53796751038912e-08, "logits/chosen": -3.358564853668213, "logits/rejected": -3.3472542762756348, "logps/chosen": -252.2576141357422, "logps/rejected": -531.1395263671875, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5012627840042114, "rewards/margins": 24.36186408996582, "rewards/rejected": -24.86312484741211, "step": 2490 }, { "epoch": 0.85, "learning_rate": 8.349074423876085e-08, "logits/chosen": -3.2991740703582764, "logits/rejected": -3.3889808654785156, "logps/chosen": -312.4256591796875, "logps/rejected": -605.0225830078125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.34426942467689514, "rewards/margins": 31.89084243774414, "rewards/rejected": -31.546573638916016, "step": 2500 }, { "epoch": 0.85, "eval_logits/chosen": -3.4427988529205322, "eval_logits/rejected": -3.3025379180908203, "eval_logps/chosen": -258.5765075683594, "eval_logps/rejected": -716.5906372070312, "eval_loss": 0.004813064821064472, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.2770873010158539, "eval_rewards/margins": 28.958681106567383, "eval_rewards/rejected": -29.23576545715332, "eval_runtime": 460.0317, "eval_samples_per_second": 20.651, "eval_steps_per_second": 0.646, "step": 2500 }, { "epoch": 0.85, "learning_rate": 8.160181337363052e-08, "logits/chosen": -3.3120341300964355, "logits/rejected": -3.2296226024627686, "logps/chosen": -280.31353759765625, "logps/rejected": -694.3499755859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.37215790152549744, "rewards/margins": 28.98830795288086, "rewards/rejected": -29.3604679107666, "step": 2510 }, { "epoch": 0.86, "learning_rate": 7.971288250850018e-08, "logits/chosen": -3.4131808280944824, "logits/rejected": -3.1928412914276123, "logps/chosen": -205.73983764648438, "logps/rejected": -883.18017578125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.5982518196105957, "rewards/margins": 27.798303604125977, "rewards/rejected": -28.39655113220215, "step": 2520 }, { "epoch": 0.86, "learning_rate": 7.782395164336985e-08, "logits/chosen": -3.383462429046631, "logits/rejected": -3.292088747024536, "logps/chosen": -337.71136474609375, "logps/rejected": -649.876220703125, "loss": 0.3124, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.490572690963745, "rewards/margins": 20.789615631103516, "rewards/rejected": -23.280189514160156, "step": 2530 }, { "epoch": 0.86, "learning_rate": 7.593502077823952e-08, "logits/chosen": -3.3705170154571533, "logits/rejected": -3.262524127960205, "logps/chosen": -210.7335205078125, "logps/rejected": -573.064453125, "loss": 0.2135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.23895084857940674, "rewards/margins": 23.423107147216797, "rewards/rejected": -23.662057876586914, "step": 2540 }, { "epoch": 0.87, "learning_rate": 7.404608991310917e-08, "logits/chosen": -3.3921730518341064, "logits/rejected": -3.3298392295837402, "logps/chosen": -256.29339599609375, "logps/rejected": -685.7628784179688, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.020406579598784447, "rewards/margins": 28.857458114624023, "rewards/rejected": -28.837055206298828, "step": 2550 }, { "epoch": 0.87, "learning_rate": 7.215715904797884e-08, "logits/chosen": -3.4744045734405518, "logits/rejected": -3.286008834838867, "logps/chosen": -214.1748046875, "logps/rejected": -831.2682495117188, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.8809574842453003, "rewards/margins": 33.92351150512695, "rewards/rejected": -34.804466247558594, "step": 2560 }, { "epoch": 0.87, "learning_rate": 7.02682281828485e-08, "logits/chosen": -3.3835952281951904, "logits/rejected": -3.3155086040496826, "logps/chosen": -275.273193359375, "logps/rejected": -767.3925170898438, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5842048525810242, "rewards/margins": 30.7226505279541, "rewards/rejected": -31.30685806274414, "step": 2570 }, { "epoch": 0.88, "learning_rate": 6.837929731771818e-08, "logits/chosen": -3.317415952682495, "logits/rejected": -3.3041164875030518, "logps/chosen": -258.95086669921875, "logps/rejected": -488.6026916503906, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.17078134417533875, "rewards/margins": 23.854570388793945, "rewards/rejected": -24.025352478027344, "step": 2580 }, { "epoch": 0.88, "learning_rate": 6.649036645258783e-08, "logits/chosen": -3.344968795776367, "logits/rejected": -3.2897567749023438, "logps/chosen": -314.9465026855469, "logps/rejected": -535.2317504882812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.07202013581991196, "rewards/margins": 28.91226577758789, "rewards/rejected": -28.984283447265625, "step": 2590 }, { "epoch": 0.88, "learning_rate": 6.460143558745749e-08, "logits/chosen": -3.419276475906372, "logits/rejected": -3.2689127922058105, "logps/chosen": -205.4881134033203, "logps/rejected": -798.7406005859375, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 0.08600559085607529, "rewards/margins": 38.14585494995117, "rewards/rejected": -38.059852600097656, "step": 2600 }, { "epoch": 0.88, "eval_logits/chosen": -3.4804773330688477, "eval_logits/rejected": -3.332041025161743, "eval_logps/chosen": -257.89501953125, "eval_logps/rejected": -720.7188720703125, "eval_loss": 0.004375319927930832, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.20894265174865723, "eval_rewards/margins": 29.43964385986328, "eval_rewards/rejected": -29.64858627319336, "eval_runtime": 460.0382, "eval_samples_per_second": 20.65, "eval_steps_per_second": 0.646, "step": 2600 }, { "epoch": 0.89, "learning_rate": 6.271250472232716e-08, "logits/chosen": -3.394615888595581, "logits/rejected": -3.2852377891540527, "logps/chosen": -266.36767578125, "logps/rejected": -717.4393310546875, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6554498672485352, "rewards/margins": 26.674755096435547, "rewards/rejected": -26.019306182861328, "step": 2610 }, { "epoch": 0.89, "learning_rate": 6.082357385719683e-08, "logits/chosen": -3.381989002227783, "logits/rejected": -3.3894596099853516, "logps/chosen": -342.04913330078125, "logps/rejected": -546.2716674804688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.6526740193367004, "rewards/margins": 21.150623321533203, "rewards/rejected": -21.803295135498047, "step": 2620 }, { "epoch": 0.89, "learning_rate": 5.893464299206649e-08, "logits/chosen": -3.4155497550964355, "logits/rejected": -3.2923424243927, "logps/chosen": -223.69140625, "logps/rejected": -731.6795043945312, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694123923778534, "rewards/margins": 28.902706146240234, "rewards/rejected": -29.27211570739746, "step": 2630 }, { "epoch": 0.9, "learning_rate": 5.704571212693615e-08, "logits/chosen": -3.330949306488037, "logits/rejected": -3.3261523246765137, "logps/chosen": -248.51528930664062, "logps/rejected": -587.5772705078125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.7009295225143433, "rewards/margins": 24.234630584716797, "rewards/rejected": -24.935558319091797, "step": 2640 }, { "epoch": 0.9, "learning_rate": 5.5156781261805816e-08, "logits/chosen": -3.4656805992126465, "logits/rejected": -3.3214333057403564, "logps/chosen": -191.90036010742188, "logps/rejected": -628.0974731445312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.088071584701538, "rewards/margins": 31.438037872314453, "rewards/rejected": -32.5261116027832, "step": 2650 }, { "epoch": 0.9, "learning_rate": 5.326785039667548e-08, "logits/chosen": -3.4162774085998535, "logits/rejected": -3.4043285846710205, "logps/chosen": -270.454345703125, "logps/rejected": -554.826171875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.00885864533483982, "rewards/margins": 29.386489868164062, "rewards/rejected": -29.395349502563477, "step": 2660 }, { "epoch": 0.91, "learning_rate": 5.137891953154514e-08, "logits/chosen": -3.5397841930389404, "logits/rejected": -3.299877166748047, "logps/chosen": -220.99899291992188, "logps/rejected": -832.1795654296875, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.49379825592041016, "rewards/margins": 30.90732192993164, "rewards/rejected": -31.4011173248291, "step": 2670 }, { "epoch": 0.91, "learning_rate": 4.948998866641481e-08, "logits/chosen": -3.422128200531006, "logits/rejected": -3.272432327270508, "logps/chosen": -266.85186767578125, "logps/rejected": -875.5877075195312, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.2771952152252197, "rewards/margins": 30.630329132080078, "rewards/rejected": -30.90752601623535, "step": 2680 }, { "epoch": 0.91, "learning_rate": 4.760105780128447e-08, "logits/chosen": -3.4342148303985596, "logits/rejected": -3.2355589866638184, "logps/chosen": -297.9632873535156, "logps/rejected": -929.0408935546875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.417285293340683, "rewards/margins": 34.31209182739258, "rewards/rejected": -34.729373931884766, "step": 2690 }, { "epoch": 0.92, "learning_rate": 4.5712126936154134e-08, "logits/chosen": -3.488246202468872, "logits/rejected": -3.3506011962890625, "logps/chosen": -173.14797973632812, "logps/rejected": -620.3333740234375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.3520640432834625, "rewards/margins": 29.699413299560547, "rewards/rejected": -30.05147933959961, "step": 2700 }, { "epoch": 0.92, "eval_logits/chosen": -3.482171058654785, "eval_logits/rejected": -3.3307948112487793, "eval_logps/chosen": -257.48095703125, "eval_logps/rejected": -726.024169921875, "eval_loss": 0.004117514006793499, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.16753698885440826, "eval_rewards/margins": 30.011581420898438, "eval_rewards/rejected": -30.179115295410156, "eval_runtime": 459.6071, "eval_samples_per_second": 20.67, "eval_steps_per_second": 0.646, "step": 2700 }, { "epoch": 0.92, "learning_rate": 4.3823196071023796e-08, "logits/chosen": -3.4520740509033203, "logits/rejected": -3.2903473377227783, "logps/chosen": -204.6480712890625, "logps/rejected": -706.2102661132812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4886886179447174, "rewards/margins": 32.48357009887695, "rewards/rejected": -32.972259521484375, "step": 2710 }, { "epoch": 0.92, "learning_rate": 4.1934265205893465e-08, "logits/chosen": -3.3444876670837402, "logits/rejected": -3.3331406116485596, "logps/chosen": -345.8191223144531, "logps/rejected": -787.9623413085938, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5544703006744385, "rewards/margins": 30.112964630126953, "rewards/rejected": -29.558496475219727, "step": 2720 }, { "epoch": 0.93, "learning_rate": 4.004533434076313e-08, "logits/chosen": -3.449005126953125, "logits/rejected": -3.3541347980499268, "logps/chosen": -205.72506713867188, "logps/rejected": -782.6888427734375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2865965962409973, "rewards/margins": 26.386667251586914, "rewards/rejected": -26.673263549804688, "step": 2730 }, { "epoch": 0.93, "learning_rate": 3.815640347563279e-08, "logits/chosen": -3.330857038497925, "logits/rejected": -3.237947463989258, "logps/chosen": -281.54742431640625, "logps/rejected": -995.6536865234375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.840170681476593, "rewards/margins": 30.559356689453125, "rewards/rejected": -31.399526596069336, "step": 2740 }, { "epoch": 0.93, "learning_rate": 3.626747261050245e-08, "logits/chosen": -3.4661128520965576, "logits/rejected": -3.30942964553833, "logps/chosen": -212.7915496826172, "logps/rejected": -827.4059448242188, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.515861988067627, "rewards/margins": 28.297542572021484, "rewards/rejected": -28.813405990600586, "step": 2750 }, { "epoch": 0.94, "learning_rate": 3.4378541745372115e-08, "logits/chosen": -3.424384355545044, "logits/rejected": -3.36513090133667, "logps/chosen": -288.58001708984375, "logps/rejected": -790.6275634765625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.2595888078212738, "rewards/margins": 32.224891662597656, "rewards/rejected": -31.965301513671875, "step": 2760 }, { "epoch": 0.94, "learning_rate": 3.2489610880241784e-08, "logits/chosen": -3.450979232788086, "logits/rejected": -3.3050060272216797, "logps/chosen": -198.88816833496094, "logps/rejected": -727.8658447265625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.09249986708164215, "rewards/margins": 30.053543090820312, "rewards/rejected": -29.96103858947754, "step": 2770 }, { "epoch": 0.94, "learning_rate": 3.0600680015111446e-08, "logits/chosen": -3.4271132946014404, "logits/rejected": -3.3997604846954346, "logps/chosen": -195.2742919921875, "logps/rejected": -512.84619140625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.01265418529510498, "rewards/margins": 30.05193519592285, "rewards/rejected": -30.064586639404297, "step": 2780 }, { "epoch": 0.95, "learning_rate": 2.871174914998111e-08, "logits/chosen": -3.450589656829834, "logits/rejected": -3.260341167449951, "logps/chosen": -221.8329315185547, "logps/rejected": -699.6900634765625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.08895153552293777, "rewards/margins": 34.512027740478516, "rewards/rejected": -34.423072814941406, "step": 2790 }, { "epoch": 0.95, "learning_rate": 2.682281828485077e-08, "logits/chosen": -3.405123472213745, "logits/rejected": -3.2812092304229736, "logps/chosen": -212.4518585205078, "logps/rejected": -927.1981201171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3205551505088806, "rewards/margins": 29.993661880493164, "rewards/rejected": -30.314218521118164, "step": 2800 }, { "epoch": 0.95, "eval_logits/chosen": -3.4751133918762207, "eval_logits/rejected": -3.3266823291778564, "eval_logps/chosen": -255.20126342773438, "eval_logps/rejected": -718.1399536132812, "eval_loss": 0.003812924027442932, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.06043152138590813, "eval_rewards/margins": 29.45113182067871, "eval_rewards/rejected": -29.390703201293945, "eval_runtime": 459.6851, "eval_samples_per_second": 20.666, "eval_steps_per_second": 0.646, "step": 2800 }, { "epoch": 0.96, "learning_rate": 2.4933887419720436e-08, "logits/chosen": -3.470411777496338, "logits/rejected": -3.371030330657959, "logps/chosen": -203.39833068847656, "logps/rejected": -554.4922485351562, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.4396284222602844, "rewards/margins": 28.658405303955078, "rewards/rejected": -29.0980281829834, "step": 2810 }, { "epoch": 0.96, "learning_rate": 2.30449565545901e-08, "logits/chosen": -3.403038740158081, "logits/rejected": -3.3339126110076904, "logps/chosen": -218.4235076904297, "logps/rejected": -501.0118713378906, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -0.3079354763031006, "rewards/margins": 22.659008026123047, "rewards/rejected": -22.96694564819336, "step": 2820 }, { "epoch": 0.96, "learning_rate": 2.1156025689459764e-08, "logits/chosen": -3.360138416290283, "logits/rejected": -3.204312562942505, "logps/chosen": -275.4970703125, "logps/rejected": -807.7415771484375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.563244104385376, "rewards/margins": 28.03116226196289, "rewards/rejected": -27.467914581298828, "step": 2830 }, { "epoch": 0.97, "learning_rate": 1.926709482432943e-08, "logits/chosen": -3.4086861610412598, "logits/rejected": -3.273714542388916, "logps/chosen": -241.6406707763672, "logps/rejected": -834.037109375, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5851477384567261, "rewards/margins": 38.93583679199219, "rewards/rejected": -38.350685119628906, "step": 2840 }, { "epoch": 0.97, "learning_rate": 1.7378163959199092e-08, "logits/chosen": -3.443998336791992, "logits/rejected": -3.288005828857422, "logps/chosen": -259.77630615234375, "logps/rejected": -637.4151611328125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.25292807817459106, "rewards/margins": 29.613880157470703, "rewards/rejected": -29.360952377319336, "step": 2850 }, { "epoch": 0.97, "learning_rate": 1.5489233094068758e-08, "logits/chosen": -3.4653220176696777, "logits/rejected": -3.3464725017547607, "logps/chosen": -222.8519287109375, "logps/rejected": -588.8685913085938, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.2632875442504883, "rewards/margins": 26.466629028320312, "rewards/rejected": -26.729915618896484, "step": 2860 }, { "epoch": 0.98, "learning_rate": 1.3600302228938419e-08, "logits/chosen": -3.376246929168701, "logits/rejected": -3.2897162437438965, "logps/chosen": -312.7208251953125, "logps/rejected": -748.8970336914062, "loss": 0.0038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.849084734916687, "rewards/margins": 28.013254165649414, "rewards/rejected": -27.164173126220703, "step": 2870 }, { "epoch": 0.98, "learning_rate": 1.1711371363808084e-08, "logits/chosen": -3.442333936691284, "logits/rejected": -3.225363254547119, "logps/chosen": -266.2436828613281, "logps/rejected": -922.4913330078125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.09482870250940323, "rewards/margins": 27.245941162109375, "rewards/rejected": -27.340768814086914, "step": 2880 }, { "epoch": 0.98, "learning_rate": 9.822440498677748e-09, "logits/chosen": -3.2727837562561035, "logits/rejected": -3.270648241043091, "logps/chosen": -396.17156982421875, "logps/rejected": -819.6370239257812, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2518737316131592, "rewards/margins": 33.05315017700195, "rewards/rejected": -32.80127716064453, "step": 2890 }, { "epoch": 0.99, "learning_rate": 7.933509633547412e-09, "logits/chosen": -3.4724514484405518, "logits/rejected": -3.2815041542053223, "logps/chosen": -211.4152374267578, "logps/rejected": -643.9974365234375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.16061940789222717, "rewards/margins": 27.16357421875, "rewards/rejected": -27.324193954467773, "step": 2900 }, { "epoch": 0.99, "eval_logits/chosen": -3.4722537994384766, "eval_logits/rejected": -3.3257179260253906, "eval_logps/chosen": -254.35963439941406, "eval_logps/rejected": -714.0263671875, "eval_loss": 0.003982194699347019, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.1445969194173813, "eval_rewards/margins": 29.123931884765625, "eval_rewards/rejected": -28.979337692260742, "eval_runtime": 459.5744, "eval_samples_per_second": 20.671, "eval_steps_per_second": 0.646, "step": 2900 }, { "epoch": 0.99, "learning_rate": 6.044578768417076e-09, "logits/chosen": -3.3734288215637207, "logits/rejected": -3.2156894207000732, "logps/chosen": -206.44552612304688, "logps/rejected": -797.2452392578125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.3702377378940582, "rewards/margins": 27.78318214416504, "rewards/rejected": -28.153417587280273, "step": 2910 }, { "epoch": 0.99, "learning_rate": 4.15564790328674e-09, "logits/chosen": -3.3592917919158936, "logits/rejected": -3.2777762413024902, "logps/chosen": -216.9835968017578, "logps/rejected": -824.8385620117188, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.20981796085834503, "rewards/margins": 28.391815185546875, "rewards/rejected": -28.601633071899414, "step": 2920 }, { "epoch": 1.0, "learning_rate": 2.2667170381564033e-09, "logits/chosen": -3.416182041168213, "logits/rejected": -3.359605312347412, "logps/chosen": -203.88467407226562, "logps/rejected": -526.6511840820312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.26971641182899475, "rewards/margins": 26.130962371826172, "rewards/rejected": -26.400676727294922, "step": 2930 }, { "epoch": 1.0, "learning_rate": 3.7778617302606723e-10, "logits/chosen": -3.350004196166992, "logits/rejected": -3.2843971252441406, "logps/chosen": -274.75946044921875, "logps/rejected": -673.1805419921875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.04258955270051956, "rewards/margins": 28.780603408813477, "rewards/rejected": -28.82318687438965, "step": 2940 }, { "epoch": 1.0, "step": 2942, "total_flos": 0.0, "train_loss": 0.0552501158515984, "train_runtime": 31216.715, "train_samples_per_second": 6.032, "train_steps_per_second": 0.094 } ], "logging_steps": 10, "max_steps": 2942, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }