{ "best_global_step": 664, "best_metric": 6.731750011444092, "best_model_checkpoint": "/tmp/svadugur/39816/informativity_and_cost_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=True-contexts=hard-39816/checkpoint-664", "epoch": 1.8043478260869565, "eval_steps": 83, "global_step": 747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.2093393802642822, "eval_logits/rejected": -2.1264047622680664, "eval_logps/chosen": -54.660919189453125, "eval_logps/rejected": -76.33399963378906, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 1037.2369, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.262, "step": 0 }, { "epoch": 0.0024154589371980675, "grad_norm": 3.0429580211639404, "learning_rate": 1e-06, "logits/chosen": -2.1798338890075684, "logits/rejected": -2.0687780380249023, "logps/chosen": -55.53208923339844, "logps/rejected": -70.28501892089844, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004830917874396135, "grad_norm": 3.15818190574646, "learning_rate": 9.997584541062802e-07, "logits/chosen": -2.185657024383545, "logits/rejected": -2.0796873569488525, "logps/chosen": -54.209293365478516, "logps/rejected": -74.5587387084961, "loss": 0.9987, "rewards/accuracies": 0.46875, "rewards/chosen": -0.006059063598513603, "rewards/margins": 0.0053293583914637566, "rewards/rejected": -0.011388419196009636, "step": 2 }, { "epoch": 0.007246376811594203, "grad_norm": 3.5606508255004883, "learning_rate": 9.995169082125602e-07, "logits/chosen": -2.174759864807129, "logits/rejected": -2.091489553451538, "logps/chosen": -54.30859375, "logps/rejected": -78.98703002929688, "loss": 1.0024, "rewards/accuracies": 0.5, "rewards/chosen": -0.0179147832095623, "rewards/margins": -0.009833194315433502, "rewards/rejected": -0.008081591688096523, "step": 3 }, { "epoch": 0.00966183574879227, "grad_norm": 2.852982521057129, "learning_rate": 9.992753623188407e-07, "logits/chosen": -2.116687536239624, "logits/rejected": -2.0002129077911377, "logps/chosen": -55.65742111206055, "logps/rejected": -76.23722076416016, "loss": 1.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007890520617365837, "rewards/margins": -0.00021605519577860832, "rewards/rejected": 0.008106577210128307, "step": 4 }, { "epoch": 0.012077294685990338, "grad_norm": 3.0592751502990723, "learning_rate": 9.990338164251207e-07, "logits/chosen": -2.1783370971679688, "logits/rejected": -2.0514919757843018, "logps/chosen": -58.191497802734375, "logps/rejected": -83.69381713867188, "loss": 1.0065, "rewards/accuracies": 0.3125, "rewards/chosen": -0.012840795330703259, "rewards/margins": -0.02606627717614174, "rewards/rejected": 0.013225484639406204, "step": 5 }, { "epoch": 0.014492753623188406, "grad_norm": 3.0722241401672363, "learning_rate": 9.98792270531401e-07, "logits/chosen": -2.2019009590148926, "logits/rejected": -2.1441385746002197, "logps/chosen": -56.08127975463867, "logps/rejected": -75.88961029052734, "loss": 0.991, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002174068707972765, "rewards/margins": 0.03619088977575302, "rewards/rejected": -0.0383649580180645, "step": 6 }, { "epoch": 0.016908212560386472, "grad_norm": 3.1828646659851074, "learning_rate": 9.985507246376812e-07, "logits/chosen": -2.1618258953094482, "logits/rejected": -2.079982280731201, "logps/chosen": -53.25628662109375, "logps/rejected": -70.75797271728516, "loss": 1.002, "rewards/accuracies": 0.46875, "rewards/chosen": -0.029145803302526474, "rewards/margins": -0.007991778664290905, "rewards/rejected": -0.021154023706912994, "step": 7 }, { "epoch": 0.01932367149758454, "grad_norm": 2.907177448272705, "learning_rate": 9.983091787439612e-07, "logits/chosen": -2.1674747467041016, "logits/rejected": -2.094683885574341, "logps/chosen": -55.59547805786133, "logps/rejected": -75.8181381225586, "loss": 0.9919, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0025108223780989647, "rewards/margins": 0.03235812112689018, "rewards/rejected": -0.03486894816160202, "step": 8 }, { "epoch": 0.021739130434782608, "grad_norm": 2.9329004287719727, "learning_rate": 9.980676328502414e-07, "logits/chosen": -2.200835943222046, "logits/rejected": -2.161302328109741, "logps/chosen": -58.444419860839844, "logps/rejected": -76.10246276855469, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": -0.016871558502316475, "rewards/margins": 0.005223977379500866, "rewards/rejected": -0.022095536813139915, "step": 9 }, { "epoch": 0.024154589371980676, "grad_norm": 3.0249266624450684, "learning_rate": 9.978260869565217e-07, "logits/chosen": -2.159430980682373, "logits/rejected": -2.0208024978637695, "logps/chosen": -58.294193267822266, "logps/rejected": -74.16793823242188, "loss": 1.0008, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008628785610198975, "rewards/margins": -0.003160417778417468, "rewards/rejected": -0.0054683685302734375, "step": 10 }, { "epoch": 0.026570048309178744, "grad_norm": 3.034073829650879, "learning_rate": 9.97584541062802e-07, "logits/chosen": -2.2328178882598877, "logits/rejected": -2.1740686893463135, "logps/chosen": -59.664695739746094, "logps/rejected": -78.3049087524414, "loss": 0.9981, "rewards/accuracies": 0.5, "rewards/chosen": -0.003595668589696288, "rewards/margins": 0.007823066785931587, "rewards/rejected": -0.011418737471103668, "step": 11 }, { "epoch": 0.028985507246376812, "grad_norm": 3.2965574264526367, "learning_rate": 9.973429951690821e-07, "logits/chosen": -2.1628873348236084, "logits/rejected": -2.0654044151306152, "logps/chosen": -56.73882293701172, "logps/rejected": -77.66434478759766, "loss": 0.9928, "rewards/accuracies": 0.625, "rewards/chosen": -0.024080883711576462, "rewards/margins": 0.02885613590478897, "rewards/rejected": -0.05293701961636543, "step": 12 }, { "epoch": 0.03140096618357488, "grad_norm": 2.8927173614501953, "learning_rate": 9.971014492753624e-07, "logits/chosen": -2.204869031906128, "logits/rejected": -2.1478219032287598, "logps/chosen": -57.5111198425293, "logps/rejected": -76.60840606689453, "loss": 0.9923, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0026124240830540657, "rewards/margins": 0.030973147600889206, "rewards/rejected": -0.028360724449157715, "step": 13 }, { "epoch": 0.033816425120772944, "grad_norm": 2.8865323066711426, "learning_rate": 9.968599033816424e-07, "logits/chosen": -2.251356840133667, "logits/rejected": -2.141745090484619, "logps/chosen": -58.431297302246094, "logps/rejected": -74.82398223876953, "loss": 0.9932, "rewards/accuracies": 0.625, "rewards/chosen": -0.029634952545166016, "rewards/margins": 0.02695634588599205, "rewards/rejected": -0.056591302156448364, "step": 14 }, { "epoch": 0.036231884057971016, "grad_norm": 3.24186372756958, "learning_rate": 9.966183574879226e-07, "logits/chosen": -2.1430516242980957, "logits/rejected": -2.0817880630493164, "logps/chosen": -60.609130859375, "logps/rejected": -78.17417907714844, "loss": 0.9995, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04622345045208931, "rewards/margins": 0.0019305117893964052, "rewards/rejected": -0.04815395548939705, "step": 15 }, { "epoch": 0.03864734299516908, "grad_norm": 3.7530367374420166, "learning_rate": 9.963768115942029e-07, "logits/chosen": -2.23286509513855, "logits/rejected": -2.095599889755249, "logps/chosen": -57.6390380859375, "logps/rejected": -75.51988983154297, "loss": 0.9912, "rewards/accuracies": 0.71875, "rewards/chosen": 0.007264888845384121, "rewards/margins": 0.03506787121295929, "rewards/rejected": -0.027802981436252594, "step": 16 }, { "epoch": 0.04106280193236715, "grad_norm": 3.217910051345825, "learning_rate": 9.96135265700483e-07, "logits/chosen": -2.0936472415924072, "logits/rejected": -1.9874910116195679, "logps/chosen": -56.1012077331543, "logps/rejected": -76.48030090332031, "loss": 1.0027, "rewards/accuracies": 0.46875, "rewards/chosen": -0.030757127329707146, "rewards/margins": -0.010928690433502197, "rewards/rejected": -0.019828438758850098, "step": 17 }, { "epoch": 0.043478260869565216, "grad_norm": 3.9939563274383545, "learning_rate": 9.958937198067634e-07, "logits/chosen": -2.1566176414489746, "logits/rejected": -2.064446449279785, "logps/chosen": -56.2030143737793, "logps/rejected": -74.75016784667969, "loss": 0.994, "rewards/accuracies": 0.5, "rewards/chosen": -0.04238457605242729, "rewards/margins": 0.023941900581121445, "rewards/rejected": -0.06632647663354874, "step": 18 }, { "epoch": 0.04589371980676329, "grad_norm": 3.9325368404388428, "learning_rate": 9.956521739130434e-07, "logits/chosen": -2.192810535430908, "logits/rejected": -2.068943977355957, "logps/chosen": -55.5321044921875, "logps/rejected": -71.56244659423828, "loss": 0.9946, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0008583534508943558, "rewards/margins": 0.02171945571899414, "rewards/rejected": -0.020861100405454636, "step": 19 }, { "epoch": 0.04830917874396135, "grad_norm": 3.350891590118408, "learning_rate": 9.954106280193236e-07, "logits/chosen": -2.1775379180908203, "logits/rejected": -2.0757737159729004, "logps/chosen": -54.31844711303711, "logps/rejected": -79.82599639892578, "loss": 0.9989, "rewards/accuracies": 0.5, "rewards/chosen": -0.030152268707752228, "rewards/margins": 0.004537244793027639, "rewards/rejected": -0.03468950837850571, "step": 20 }, { "epoch": 0.050724637681159424, "grad_norm": 3.503999948501587, "learning_rate": 9.951690821256039e-07, "logits/chosen": -2.1727867126464844, "logits/rejected": -2.052229881286621, "logps/chosen": -55.86065673828125, "logps/rejected": -77.97817993164062, "loss": 0.9928, "rewards/accuracies": 0.625, "rewards/chosen": -0.004577433690428734, "rewards/margins": 0.02873416244983673, "rewards/rejected": -0.033311594277620316, "step": 21 }, { "epoch": 0.05314009661835749, "grad_norm": 3.0702574253082275, "learning_rate": 9.949275362318839e-07, "logits/chosen": -2.112445116043091, "logits/rejected": -2.043785810470581, "logps/chosen": -50.93365478515625, "logps/rejected": -69.0789566040039, "loss": 0.9925, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002129757311195135, "rewards/margins": 0.02988673374056816, "rewards/rejected": -0.027756979689002037, "step": 22 }, { "epoch": 0.05555555555555555, "grad_norm": 3.078096389770508, "learning_rate": 9.946859903381643e-07, "logits/chosen": -2.163736581802368, "logits/rejected": -2.0748178958892822, "logps/chosen": -59.94377899169922, "logps/rejected": -81.28279113769531, "loss": 0.9923, "rewards/accuracies": 0.53125, "rewards/chosen": -0.048874400556087494, "rewards/margins": 0.03070862591266632, "rewards/rejected": -0.07958302646875381, "step": 23 }, { "epoch": 0.057971014492753624, "grad_norm": 2.9827322959899902, "learning_rate": 9.944444444444444e-07, "logits/chosen": -2.1157422065734863, "logits/rejected": -2.052717685699463, "logps/chosen": -55.268043518066406, "logps/rejected": -75.28286743164062, "loss": 0.9921, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02637934684753418, "rewards/margins": 0.03174954652786255, "rewards/rejected": -0.05812889337539673, "step": 24 }, { "epoch": 0.06038647342995169, "grad_norm": 7.797964096069336, "learning_rate": 9.942028985507246e-07, "logits/chosen": -2.175492286682129, "logits/rejected": -2.157451868057251, "logps/chosen": -57.29520034790039, "logps/rejected": -74.09068298339844, "loss": 0.9822, "rewards/accuracies": 0.84375, "rewards/chosen": -0.022821415215730667, "rewards/margins": 0.07154972851276398, "rewards/rejected": -0.09437114000320435, "step": 25 }, { "epoch": 0.06280193236714976, "grad_norm": 3.1010725498199463, "learning_rate": 9.939613526570048e-07, "logits/chosen": -2.2009546756744385, "logits/rejected": -2.0971322059631348, "logps/chosen": -56.14605712890625, "logps/rejected": -73.01823425292969, "loss": 0.9902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.053522758185863495, "rewards/margins": 0.039226334542036057, "rewards/rejected": -0.09274908155202866, "step": 26 }, { "epoch": 0.06521739130434782, "grad_norm": 3.188265323638916, "learning_rate": 9.937198067632849e-07, "logits/chosen": -2.2028567790985107, "logits/rejected": -2.1324918270111084, "logps/chosen": -55.90058898925781, "logps/rejected": -78.44905090332031, "loss": 0.9778, "rewards/accuracies": 0.75, "rewards/chosen": -0.002383911982178688, "rewards/margins": 0.08928676694631577, "rewards/rejected": -0.0916706770658493, "step": 27 }, { "epoch": 0.06763285024154589, "grad_norm": 2.8552348613739014, "learning_rate": 9.934782608695653e-07, "logits/chosen": -2.104480504989624, "logits/rejected": -2.041468620300293, "logps/chosen": -56.77963638305664, "logps/rejected": -72.64840698242188, "loss": 0.9891, "rewards/accuracies": 0.75, "rewards/chosen": -0.031476616859436035, "rewards/margins": 0.04379361867904663, "rewards/rejected": -0.07527023553848267, "step": 28 }, { "epoch": 0.07004830917874397, "grad_norm": 3.0251026153564453, "learning_rate": 9.932367149758453e-07, "logits/chosen": -2.147563934326172, "logits/rejected": -2.0573384761810303, "logps/chosen": -58.242591857910156, "logps/rejected": -74.96723937988281, "loss": 0.9859, "rewards/accuracies": 0.625, "rewards/chosen": -0.01550295390188694, "rewards/margins": 0.05679529532790184, "rewards/rejected": -0.07229825109243393, "step": 29 }, { "epoch": 0.07246376811594203, "grad_norm": 2.966425657272339, "learning_rate": 9.929951690821256e-07, "logits/chosen": -2.1050286293029785, "logits/rejected": -2.0702872276306152, "logps/chosen": -59.17656707763672, "logps/rejected": -73.3216552734375, "loss": 0.9957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03756285831332207, "rewards/margins": 0.017120588570833206, "rewards/rejected": -0.05468344688415527, "step": 30 }, { "epoch": 0.0748792270531401, "grad_norm": 3.5576157569885254, "learning_rate": 9.927536231884058e-07, "logits/chosen": -2.233815908432007, "logits/rejected": -2.114673137664795, "logps/chosen": -62.32474136352539, "logps/rejected": -84.44629669189453, "loss": 0.9844, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04068344086408615, "rewards/margins": 0.06243426352739334, "rewards/rejected": -0.10311770439147949, "step": 31 }, { "epoch": 0.07729468599033816, "grad_norm": 3.221014976501465, "learning_rate": 9.925120772946858e-07, "logits/chosen": -2.175903797149658, "logits/rejected": -2.080284595489502, "logps/chosen": -56.33634948730469, "logps/rejected": -73.84745788574219, "loss": 0.9878, "rewards/accuracies": 0.625, "rewards/chosen": -0.05116400867700577, "rewards/margins": 0.04905060678720474, "rewards/rejected": -0.10021461546421051, "step": 32 }, { "epoch": 0.07971014492753623, "grad_norm": 3.2825539112091064, "learning_rate": 9.92270531400966e-07, "logits/chosen": -2.1555540561676025, "logits/rejected": -2.063032865524292, "logps/chosen": -54.67486572265625, "logps/rejected": -75.35707092285156, "loss": 0.9865, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05663792043924332, "rewards/margins": 0.05430224910378456, "rewards/rejected": -0.11094017326831818, "step": 33 }, { "epoch": 0.0821256038647343, "grad_norm": 2.951279878616333, "learning_rate": 9.920289855072463e-07, "logits/chosen": -2.1700334548950195, "logits/rejected": -2.0385971069335938, "logps/chosen": -58.87738037109375, "logps/rejected": -69.98744201660156, "loss": 0.9906, "rewards/accuracies": 0.625, "rewards/chosen": -0.05489972233772278, "rewards/margins": 0.03792424499988556, "rewards/rejected": -0.09282395988702774, "step": 34 }, { "epoch": 0.08454106280193237, "grad_norm": 3.168715238571167, "learning_rate": 9.917874396135265e-07, "logits/chosen": -2.2106122970581055, "logits/rejected": -2.144092321395874, "logps/chosen": -55.157230377197266, "logps/rejected": -73.0668716430664, "loss": 0.9849, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04511646181344986, "rewards/margins": 0.060931552201509476, "rewards/rejected": -0.10604801028966904, "step": 35 }, { "epoch": 0.08695652173913043, "grad_norm": 3.4809346199035645, "learning_rate": 9.915458937198068e-07, "logits/chosen": -2.139030933380127, "logits/rejected": -2.104886770248413, "logps/chosen": -54.396095275878906, "logps/rejected": -73.87648010253906, "loss": 0.9752, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04402846470475197, "rewards/margins": 0.09969419240951538, "rewards/rejected": -0.14372265338897705, "step": 36 }, { "epoch": 0.0893719806763285, "grad_norm": 3.7356555461883545, "learning_rate": 9.91304347826087e-07, "logits/chosen": -2.194314956665039, "logits/rejected": -2.0617728233337402, "logps/chosen": -55.47432327270508, "logps/rejected": -79.8756332397461, "loss": 0.979, "rewards/accuracies": 0.71875, "rewards/chosen": -0.044937457889318466, "rewards/margins": 0.08458155393600464, "rewards/rejected": -0.1295190006494522, "step": 37 }, { "epoch": 0.09178743961352658, "grad_norm": 3.0910565853118896, "learning_rate": 9.91062801932367e-07, "logits/chosen": -2.1801981925964355, "logits/rejected": -2.0934720039367676, "logps/chosen": -53.593238830566406, "logps/rejected": -75.2836685180664, "loss": 0.9742, "rewards/accuracies": 0.84375, "rewards/chosen": -0.04649990051984787, "rewards/margins": 0.10354039818048477, "rewards/rejected": -0.15004031360149384, "step": 38 }, { "epoch": 0.09420289855072464, "grad_norm": 3.1982295513153076, "learning_rate": 9.908212560386473e-07, "logits/chosen": -2.153974771499634, "logits/rejected": -2.0641424655914307, "logps/chosen": -54.921730041503906, "logps/rejected": -72.93656158447266, "loss": 0.9778, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07385598868131638, "rewards/margins": 0.089473195374012, "rewards/rejected": -0.16332918405532837, "step": 39 }, { "epoch": 0.0966183574879227, "grad_norm": 3.1685304641723633, "learning_rate": 9.905797101449275e-07, "logits/chosen": -2.1810054779052734, "logits/rejected": -2.0948283672332764, "logps/chosen": -54.54977798461914, "logps/rejected": -78.91806030273438, "loss": 0.9684, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07338592410087585, "rewards/margins": 0.127638041973114, "rewards/rejected": -0.20102396607398987, "step": 40 }, { "epoch": 0.09903381642512077, "grad_norm": 3.5197527408599854, "learning_rate": 9.903381642512075e-07, "logits/chosen": -2.216768980026245, "logits/rejected": -2.0431559085845947, "logps/chosen": -55.11878967285156, "logps/rejected": -78.70977020263672, "loss": 0.9638, "rewards/accuracies": 0.875, "rewards/chosen": -0.04387350007891655, "rewards/margins": 0.1461183875799179, "rewards/rejected": -0.18999189138412476, "step": 41 }, { "epoch": 0.10144927536231885, "grad_norm": 3.3186628818511963, "learning_rate": 9.90096618357488e-07, "logits/chosen": -2.192880153656006, "logits/rejected": -2.116703748703003, "logps/chosen": -54.28813934326172, "logps/rejected": -73.70767974853516, "loss": 0.9782, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06513218581676483, "rewards/margins": 0.08786149322986603, "rewards/rejected": -0.15299369394779205, "step": 42 }, { "epoch": 0.10386473429951691, "grad_norm": 3.906416654586792, "learning_rate": 9.89855072463768e-07, "logits/chosen": -2.2246222496032715, "logits/rejected": -2.113999366760254, "logps/chosen": -57.24310302734375, "logps/rejected": -88.76587677001953, "loss": 0.9601, "rewards/accuracies": 0.875, "rewards/chosen": -0.11103013157844543, "rewards/margins": 0.1626303493976593, "rewards/rejected": -0.27366048097610474, "step": 43 }, { "epoch": 0.10628019323671498, "grad_norm": 3.7831757068634033, "learning_rate": 9.896135265700483e-07, "logits/chosen": -2.236570358276367, "logits/rejected": -2.090880870819092, "logps/chosen": -57.37206268310547, "logps/rejected": -80.96002197265625, "loss": 0.9637, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08610305190086365, "rewards/margins": 0.14671948552131653, "rewards/rejected": -0.23282252252101898, "step": 44 }, { "epoch": 0.10869565217391304, "grad_norm": 4.20023775100708, "learning_rate": 9.893719806763285e-07, "logits/chosen": -2.1486220359802246, "logits/rejected": -2.0989551544189453, "logps/chosen": -62.69160842895508, "logps/rejected": -84.50869750976562, "loss": 0.968, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12458223849534988, "rewards/margins": 0.129995658993721, "rewards/rejected": -0.2545778751373291, "step": 45 }, { "epoch": 0.1111111111111111, "grad_norm": 3.8163869380950928, "learning_rate": 9.891304347826085e-07, "logits/chosen": -2.175304889678955, "logits/rejected": -2.0427122116088867, "logps/chosen": -60.962459564208984, "logps/rejected": -83.22074890136719, "loss": 0.9625, "rewards/accuracies": 0.875, "rewards/chosen": -0.1153474822640419, "rewards/margins": 0.15176932513713837, "rewards/rejected": -0.26711681485176086, "step": 46 }, { "epoch": 0.11352657004830918, "grad_norm": 4.022053241729736, "learning_rate": 9.88888888888889e-07, "logits/chosen": -2.217334508895874, "logits/rejected": -2.150660514831543, "logps/chosen": -60.12694549560547, "logps/rejected": -86.43513488769531, "loss": 0.9612, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14310479164123535, "rewards/margins": 0.15862800180912018, "rewards/rejected": -0.30173277854919434, "step": 47 }, { "epoch": 0.11594202898550725, "grad_norm": 5.428286552429199, "learning_rate": 9.88647342995169e-07, "logits/chosen": -2.1429531574249268, "logits/rejected": -2.0805399417877197, "logps/chosen": -55.12883758544922, "logps/rejected": -85.59800720214844, "loss": 0.9529, "rewards/accuracies": 0.875, "rewards/chosen": -0.12002723664045334, "rewards/margins": 0.1917877346277237, "rewards/rejected": -0.31181496381759644, "step": 48 }, { "epoch": 0.11835748792270531, "grad_norm": 3.350074052810669, "learning_rate": 9.884057971014492e-07, "logits/chosen": -2.220508575439453, "logits/rejected": -2.1514220237731934, "logps/chosen": -62.138694763183594, "logps/rejected": -77.3960189819336, "loss": 0.9649, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12617261707782745, "rewards/margins": 0.14236991107463837, "rewards/rejected": -0.2685425281524658, "step": 49 }, { "epoch": 0.12077294685990338, "grad_norm": 3.474019765853882, "learning_rate": 9.881642512077295e-07, "logits/chosen": -2.219294786453247, "logits/rejected": -2.054152727127075, "logps/chosen": -52.381744384765625, "logps/rejected": -74.04975891113281, "loss": 0.958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07657895982265472, "rewards/margins": 0.1702602207660675, "rewards/rejected": -0.24683916568756104, "step": 50 }, { "epoch": 0.12318840579710146, "grad_norm": 4.844295024871826, "learning_rate": 9.879227053140095e-07, "logits/chosen": -2.1871438026428223, "logits/rejected": -2.1125855445861816, "logps/chosen": -52.766456604003906, "logps/rejected": -77.09812927246094, "loss": 0.9589, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1241784617304802, "rewards/margins": 0.16711951792240143, "rewards/rejected": -0.291297972202301, "step": 51 }, { "epoch": 0.12560386473429952, "grad_norm": 3.238046169281006, "learning_rate": 9.8768115942029e-07, "logits/chosen": -2.1389667987823486, "logits/rejected": -2.090848922729492, "logps/chosen": -57.09165954589844, "logps/rejected": -75.68721008300781, "loss": 0.9545, "rewards/accuracies": 0.96875, "rewards/chosen": -0.12278801202774048, "rewards/margins": 0.1858767569065094, "rewards/rejected": -0.30866479873657227, "step": 52 }, { "epoch": 0.1280193236714976, "grad_norm": 3.710906744003296, "learning_rate": 9.8743961352657e-07, "logits/chosen": -2.215223789215088, "logits/rejected": -2.148970127105713, "logps/chosen": -58.24617385864258, "logps/rejected": -76.91313171386719, "loss": 0.9435, "rewards/accuracies": 0.875, "rewards/chosen": -0.09419205784797668, "rewards/margins": 0.23171325027942657, "rewards/rejected": -0.32590532302856445, "step": 53 }, { "epoch": 0.13043478260869565, "grad_norm": 3.3852124214172363, "learning_rate": 9.871980676328502e-07, "logits/chosen": -2.2221927642822266, "logits/rejected": -2.1636431217193604, "logps/chosen": -59.3031005859375, "logps/rejected": -82.16263580322266, "loss": 0.946, "rewards/accuracies": 0.90625, "rewards/chosen": -0.16752149164676666, "rewards/margins": 0.22458712756633759, "rewards/rejected": -0.39210861921310425, "step": 54 }, { "epoch": 0.13285024154589373, "grad_norm": 3.753603219985962, "learning_rate": 9.869565217391304e-07, "logits/chosen": -2.2977852821350098, "logits/rejected": -2.162917137145996, "logps/chosen": -54.540767669677734, "logps/rejected": -79.27214050292969, "loss": 0.9368, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0700063407421112, "rewards/margins": 0.26008373498916626, "rewards/rejected": -0.33009007573127747, "step": 55 }, { "epoch": 0.13526570048309178, "grad_norm": 3.66054105758667, "learning_rate": 9.867149758454107e-07, "logits/chosen": -2.254305124282837, "logits/rejected": -2.1488749980926514, "logps/chosen": -55.60317611694336, "logps/rejected": -80.17803955078125, "loss": 0.9365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10736099630594254, "rewards/margins": 0.2600057125091553, "rewards/rejected": -0.3673667311668396, "step": 56 }, { "epoch": 0.13768115942028986, "grad_norm": 3.6773712635040283, "learning_rate": 9.864734299516907e-07, "logits/chosen": -2.1754281520843506, "logits/rejected": -2.0978052616119385, "logps/chosen": -54.434627532958984, "logps/rejected": -72.70606231689453, "loss": 0.9455, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1077297031879425, "rewards/margins": 0.22271934151649475, "rewards/rejected": -0.33044904470443726, "step": 57 }, { "epoch": 0.14009661835748793, "grad_norm": 3.0269064903259277, "learning_rate": 9.86231884057971e-07, "logits/chosen": -2.118281364440918, "logits/rejected": -2.0807297229766846, "logps/chosen": -55.50238800048828, "logps/rejected": -73.99935913085938, "loss": 0.9409, "rewards/accuracies": 0.90625, "rewards/chosen": -0.14338436722755432, "rewards/margins": 0.24359984695911407, "rewards/rejected": -0.3869841694831848, "step": 58 }, { "epoch": 0.14251207729468598, "grad_norm": 3.4644172191619873, "learning_rate": 9.859903381642512e-07, "logits/chosen": -2.230489730834961, "logits/rejected": -2.106873035430908, "logps/chosen": -59.86170959472656, "logps/rejected": -83.14311981201172, "loss": 0.9452, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19543373584747314, "rewards/margins": 0.2286863476037979, "rewards/rejected": -0.42412006855010986, "step": 59 }, { "epoch": 0.14492753623188406, "grad_norm": 2.8078765869140625, "learning_rate": 9.857487922705312e-07, "logits/chosen": -2.197187662124634, "logits/rejected": -2.1521568298339844, "logps/chosen": -61.111515045166016, "logps/rejected": -75.61607360839844, "loss": 0.9659, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25362613797187805, "rewards/margins": 0.14123809337615967, "rewards/rejected": -0.39486420154571533, "step": 60 }, { "epoch": 0.1473429951690821, "grad_norm": 3.4679226875305176, "learning_rate": 9.855072463768117e-07, "logits/chosen": -2.2288501262664795, "logits/rejected": -2.1207547187805176, "logps/chosen": -52.87244415283203, "logps/rejected": -75.38878631591797, "loss": 0.9262, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10013985633850098, "rewards/margins": 0.3035953938961029, "rewards/rejected": -0.4037352502346039, "step": 61 }, { "epoch": 0.1497584541062802, "grad_norm": 2.946500539779663, "learning_rate": 9.852657004830917e-07, "logits/chosen": -2.1599059104919434, "logits/rejected": -2.0876963138580322, "logps/chosen": -58.7757568359375, "logps/rejected": -77.44908142089844, "loss": 0.9479, "rewards/accuracies": 0.84375, "rewards/chosen": -0.23947526514530182, "rewards/margins": 0.21813645958900452, "rewards/rejected": -0.45761173963546753, "step": 62 }, { "epoch": 0.15217391304347827, "grad_norm": 3.4866302013397217, "learning_rate": 9.85024154589372e-07, "logits/chosen": -2.230949640274048, "logits/rejected": -2.1312594413757324, "logps/chosen": -56.2661247253418, "logps/rejected": -78.19754028320312, "loss": 0.9226, "rewards/accuracies": 0.875, "rewards/chosen": -0.13062559068202972, "rewards/margins": 0.32240229845046997, "rewards/rejected": -0.4530278444290161, "step": 63 }, { "epoch": 0.15458937198067632, "grad_norm": 3.6147983074188232, "learning_rate": 9.847826086956522e-07, "logits/chosen": -2.1908016204833984, "logits/rejected": -2.1166574954986572, "logps/chosen": -57.25026321411133, "logps/rejected": -83.94319915771484, "loss": 0.9141, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20002618432044983, "rewards/margins": 0.3601999282836914, "rewards/rejected": -0.5602260828018188, "step": 64 }, { "epoch": 0.1570048309178744, "grad_norm": 3.434878349304199, "learning_rate": 9.845410628019322e-07, "logits/chosen": -2.227965831756592, "logits/rejected": -2.133065700531006, "logps/chosen": -60.02827835083008, "logps/rejected": -77.97032928466797, "loss": 0.9432, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2729240655899048, "rewards/margins": 0.2393362820148468, "rewards/rejected": -0.5122603178024292, "step": 65 }, { "epoch": 0.15942028985507245, "grad_norm": 3.360107660293579, "learning_rate": 9.842995169082126e-07, "logits/chosen": -2.1993441581726074, "logits/rejected": -2.124965190887451, "logps/chosen": -53.431427001953125, "logps/rejected": -76.56427764892578, "loss": 0.9181, "rewards/accuracies": 0.90625, "rewards/chosen": -0.15001334249973297, "rewards/margins": 0.34602633118629456, "rewards/rejected": -0.4960396885871887, "step": 66 }, { "epoch": 0.16183574879227053, "grad_norm": 3.5610783100128174, "learning_rate": 9.840579710144927e-07, "logits/chosen": -2.2281196117401123, "logits/rejected": -2.122161865234375, "logps/chosen": -54.35234832763672, "logps/rejected": -81.2714614868164, "loss": 0.8994, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14506840705871582, "rewards/margins": 0.42242270708084106, "rewards/rejected": -0.5674911141395569, "step": 67 }, { "epoch": 0.1642512077294686, "grad_norm": 3.6468021869659424, "learning_rate": 9.83816425120773e-07, "logits/chosen": -2.250663995742798, "logits/rejected": -2.173973560333252, "logps/chosen": -58.00518798828125, "logps/rejected": -84.83555603027344, "loss": 0.9099, "rewards/accuracies": 0.875, "rewards/chosen": -0.17466765642166138, "rewards/margins": 0.3819878399372101, "rewards/rejected": -0.5566555261611938, "step": 68 }, { "epoch": 0.16666666666666666, "grad_norm": 3.2184135913848877, "learning_rate": 9.835748792270531e-07, "logits/chosen": -2.1811649799346924, "logits/rejected": -2.0907533168792725, "logps/chosen": -60.35890579223633, "logps/rejected": -83.4397201538086, "loss": 0.9162, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2830909490585327, "rewards/margins": 0.3560815453529358, "rewards/rejected": -0.6391724944114685, "step": 69 }, { "epoch": 0.16908212560386474, "grad_norm": 4.003438472747803, "learning_rate": 9.833333333333332e-07, "logits/chosen": -2.1922760009765625, "logits/rejected": -2.1123034954071045, "logps/chosen": -56.275489807128906, "logps/rejected": -79.26179504394531, "loss": 0.9223, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19239559769630432, "rewards/margins": 0.33080923557281494, "rewards/rejected": -0.5232048034667969, "step": 70 }, { "epoch": 0.17149758454106281, "grad_norm": 3.3185019493103027, "learning_rate": 9.830917874396136e-07, "logits/chosen": -2.2129147052764893, "logits/rejected": -2.171663284301758, "logps/chosen": -58.045555114746094, "logps/rejected": -77.75115203857422, "loss": 0.9219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22932061553001404, "rewards/margins": 0.3301212191581726, "rewards/rejected": -0.5594418048858643, "step": 71 }, { "epoch": 0.17391304347826086, "grad_norm": 3.7214760780334473, "learning_rate": 9.828502415458936e-07, "logits/chosen": -2.2829689979553223, "logits/rejected": -2.1774559020996094, "logps/chosen": -57.29518508911133, "logps/rejected": -82.46050262451172, "loss": 0.8839, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20179873704910278, "rewards/margins": 0.5029343366622925, "rewards/rejected": -0.7047330141067505, "step": 72 }, { "epoch": 0.17632850241545894, "grad_norm": 5.221538543701172, "learning_rate": 9.826086956521739e-07, "logits/chosen": -2.20510196685791, "logits/rejected": -2.1278748512268066, "logps/chosen": -54.117820739746094, "logps/rejected": -85.2557373046875, "loss": 0.879, "rewards/accuracies": 1.0, "rewards/chosen": -0.1442597657442093, "rewards/margins": 0.5213267803192139, "rewards/rejected": -0.665586531162262, "step": 73 }, { "epoch": 0.178743961352657, "grad_norm": 3.2785491943359375, "learning_rate": 9.823671497584541e-07, "logits/chosen": -2.267665147781372, "logits/rejected": -2.21690034866333, "logps/chosen": -61.9249267578125, "logps/rejected": -85.28959655761719, "loss": 0.8991, "rewards/accuracies": 0.90625, "rewards/chosen": -0.30863362550735474, "rewards/margins": 0.440001904964447, "rewards/rejected": -0.7486355304718018, "step": 74 }, { "epoch": 0.18115942028985507, "grad_norm": 3.591214179992676, "learning_rate": 9.821256038647344e-07, "logits/chosen": -2.176032066345215, "logits/rejected": -2.1920199394226074, "logps/chosen": -58.97840881347656, "logps/rejected": -84.46222686767578, "loss": 0.8815, "rewards/accuracies": 0.96875, "rewards/chosen": -0.17686772346496582, "rewards/margins": 0.5097454786300659, "rewards/rejected": -0.6866132020950317, "step": 75 }, { "epoch": 0.18357487922705315, "grad_norm": 3.6753652095794678, "learning_rate": 9.818840579710144e-07, "logits/chosen": -2.2473042011260986, "logits/rejected": -2.196948766708374, "logps/chosen": -59.67621994018555, "logps/rejected": -80.20405578613281, "loss": 0.9107, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3872445821762085, "rewards/margins": 0.39357298612594604, "rewards/rejected": -0.7808175086975098, "step": 76 }, { "epoch": 0.1859903381642512, "grad_norm": 3.13348650932312, "learning_rate": 9.816425120772946e-07, "logits/chosen": -2.193026542663574, "logits/rejected": -2.100886344909668, "logps/chosen": -57.206451416015625, "logps/rejected": -78.05674743652344, "loss": 0.9091, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3260487914085388, "rewards/margins": 0.40092504024505615, "rewards/rejected": -0.7269737720489502, "step": 77 }, { "epoch": 0.18840579710144928, "grad_norm": 3.750610828399658, "learning_rate": 9.814009661835749e-07, "logits/chosen": -2.3667690753936768, "logits/rejected": -2.2209818363189697, "logps/chosen": -61.81901550292969, "logps/rejected": -88.00718688964844, "loss": 0.8574, "rewards/accuracies": 0.875, "rewards/chosen": -0.2520306706428528, "rewards/margins": 0.6347010731697083, "rewards/rejected": -0.886731743812561, "step": 78 }, { "epoch": 0.19082125603864733, "grad_norm": 3.3406925201416016, "learning_rate": 9.81159420289855e-07, "logits/chosen": -2.2818846702575684, "logits/rejected": -2.1649169921875, "logps/chosen": -60.75386047363281, "logps/rejected": -86.16616821289062, "loss": 0.8974, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33393388986587524, "rewards/margins": 0.45529884099960327, "rewards/rejected": -0.7892327904701233, "step": 79 }, { "epoch": 0.1932367149758454, "grad_norm": 3.390854835510254, "learning_rate": 9.809178743961353e-07, "logits/chosen": -2.23146915435791, "logits/rejected": -2.162461280822754, "logps/chosen": -63.33086013793945, "logps/rejected": -89.98179626464844, "loss": 0.88, "rewards/accuracies": 0.90625, "rewards/chosen": -0.367904394865036, "rewards/margins": 0.5543071031570435, "rewards/rejected": -0.9222114682197571, "step": 80 }, { "epoch": 0.1956521739130435, "grad_norm": 3.150108575820923, "learning_rate": 9.806763285024154e-07, "logits/chosen": -2.2976901531219482, "logits/rejected": -2.2477355003356934, "logps/chosen": -62.57392120361328, "logps/rejected": -82.4486083984375, "loss": 0.8921, "rewards/accuracies": 0.9375, "rewards/chosen": -0.27390700578689575, "rewards/margins": 0.4843405783176422, "rewards/rejected": -0.7582475543022156, "step": 81 }, { "epoch": 0.19806763285024154, "grad_norm": 3.9153833389282227, "learning_rate": 9.804347826086956e-07, "logits/chosen": -2.307478904724121, "logits/rejected": -2.2203500270843506, "logps/chosen": -59.07624816894531, "logps/rejected": -83.35639953613281, "loss": 0.8624, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2888317108154297, "rewards/margins": 0.6169155240058899, "rewards/rejected": -0.9057472944259644, "step": 82 }, { "epoch": 0.20048309178743962, "grad_norm": 4.221527576446533, "learning_rate": 9.801932367149758e-07, "logits/chosen": -2.2277207374572754, "logits/rejected": -2.1178832054138184, "logps/chosen": -57.07787322998047, "logps/rejected": -88.88327026367188, "loss": 0.853, "rewards/accuracies": 0.84375, "rewards/chosen": -0.20382218062877655, "rewards/margins": 0.6616360545158386, "rewards/rejected": -0.8654581904411316, "step": 83 }, { "epoch": 0.20048309178743962, "eval_logits/chosen": -2.2890286445617676, "eval_logits/rejected": -2.211808681488037, "eval_logps/chosen": -57.145111083984375, "eval_logps/rejected": -85.5176773071289, "eval_loss": 0.8516154289245605, "eval_rewards/accuracies": 0.9540441036224365, "eval_rewards/chosen": -0.2484191209077835, "eval_rewards/margins": 0.6699486374855042, "eval_rewards/rejected": -0.9183678030967712, "eval_runtime": 1001.9708, "eval_samples_per_second": 0.543, "eval_steps_per_second": 0.271, "step": 83 }, { "epoch": 0.2028985507246377, "grad_norm": 3.3027172088623047, "learning_rate": 9.799516908212559e-07, "logits/chosen": -2.278017520904541, "logits/rejected": -2.167437791824341, "logps/chosen": -57.419273376464844, "logps/rejected": -78.08587646484375, "loss": 0.867, "rewards/accuracies": 0.96875, "rewards/chosen": -0.14905601739883423, "rewards/margins": 0.5810959339141846, "rewards/rejected": -0.7301519513130188, "step": 84 }, { "epoch": 0.20531400966183574, "grad_norm": 3.072516679763794, "learning_rate": 9.797101449275363e-07, "logits/chosen": -2.294316291809082, "logits/rejected": -2.189255475997925, "logps/chosen": -61.19104766845703, "logps/rejected": -89.32990264892578, "loss": 0.8674, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35770153999328613, "rewards/margins": 0.6355671286582947, "rewards/rejected": -0.993268609046936, "step": 85 }, { "epoch": 0.20772946859903382, "grad_norm": 3.460599422454834, "learning_rate": 9.794685990338163e-07, "logits/chosen": -2.2485783100128174, "logits/rejected": -2.159168243408203, "logps/chosen": -59.789207458496094, "logps/rejected": -80.03404235839844, "loss": 0.8657, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2659055292606354, "rewards/margins": 0.5992401838302612, "rewards/rejected": -0.8651458024978638, "step": 86 }, { "epoch": 0.21014492753623187, "grad_norm": 3.3077950477600098, "learning_rate": 9.792270531400966e-07, "logits/chosen": -2.2848997116088867, "logits/rejected": -2.2509684562683105, "logps/chosen": -58.94207000732422, "logps/rejected": -78.87932586669922, "loss": 0.8595, "rewards/accuracies": 1.0, "rewards/chosen": -0.26551029086112976, "rewards/margins": 0.6183245778083801, "rewards/rejected": -0.8838348388671875, "step": 87 }, { "epoch": 0.21256038647342995, "grad_norm": 3.4476258754730225, "learning_rate": 9.789855072463768e-07, "logits/chosen": -2.3066136837005615, "logits/rejected": -2.279409646987915, "logps/chosen": -62.142295837402344, "logps/rejected": -86.51927947998047, "loss": 0.8518, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3645138144493103, "rewards/margins": 0.6972485780715942, "rewards/rejected": -1.0617624521255493, "step": 88 }, { "epoch": 0.21497584541062803, "grad_norm": 3.476977586746216, "learning_rate": 9.787439613526568e-07, "logits/chosen": -2.3075695037841797, "logits/rejected": -2.2184438705444336, "logps/chosen": -61.79441452026367, "logps/rejected": -86.50084686279297, "loss": 0.8505, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2901073694229126, "rewards/margins": 0.7115721106529236, "rewards/rejected": -1.001679539680481, "step": 89 }, { "epoch": 0.21739130434782608, "grad_norm": 3.6310007572174072, "learning_rate": 9.785024154589373e-07, "logits/chosen": -2.295557737350464, "logits/rejected": -2.269164800643921, "logps/chosen": -55.787113189697266, "logps/rejected": -82.89498138427734, "loss": 0.8359, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18737250566482544, "rewards/margins": 0.7734319567680359, "rewards/rejected": -0.9608045220375061, "step": 90 }, { "epoch": 0.21980676328502416, "grad_norm": 3.440699577331543, "learning_rate": 9.782608695652173e-07, "logits/chosen": -2.2757749557495117, "logits/rejected": -2.184046745300293, "logps/chosen": -57.324100494384766, "logps/rejected": -90.95164489746094, "loss": 0.8266, "rewards/accuracies": 0.9375, "rewards/chosen": -0.298515647649765, "rewards/margins": 0.8212540149688721, "rewards/rejected": -1.11976957321167, "step": 91 }, { "epoch": 0.2222222222222222, "grad_norm": 3.4762017726898193, "learning_rate": 9.780193236714975e-07, "logits/chosen": -2.3000388145446777, "logits/rejected": -2.213244915008545, "logps/chosen": -56.57130432128906, "logps/rejected": -78.92903137207031, "loss": 0.8551, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2674526572227478, "rewards/margins": 0.6613882184028625, "rewards/rejected": -0.9288408756256104, "step": 92 }, { "epoch": 0.2246376811594203, "grad_norm": 3.1474366188049316, "learning_rate": 9.777777777777778e-07, "logits/chosen": -2.2862253189086914, "logits/rejected": -2.194581985473633, "logps/chosen": -61.982269287109375, "logps/rejected": -80.8535385131836, "loss": 0.8705, "rewards/accuracies": 0.875, "rewards/chosen": -0.331647127866745, "rewards/margins": 0.5918638706207275, "rewards/rejected": -0.9235109686851501, "step": 93 }, { "epoch": 0.22705314009661837, "grad_norm": 2.966994285583496, "learning_rate": 9.775362318840578e-07, "logits/chosen": -2.252514362335205, "logits/rejected": -2.165985345840454, "logps/chosen": -56.22334671020508, "logps/rejected": -84.79402160644531, "loss": 0.8394, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2741245627403259, "rewards/margins": 0.7735782265663147, "rewards/rejected": -1.0477027893066406, "step": 94 }, { "epoch": 0.22946859903381642, "grad_norm": 3.4068620204925537, "learning_rate": 9.772946859903383e-07, "logits/chosen": -2.2491507530212402, "logits/rejected": -2.205399990081787, "logps/chosen": -58.93451690673828, "logps/rejected": -78.78646850585938, "loss": 0.8274, "rewards/accuracies": 1.0, "rewards/chosen": -0.08866233378648758, "rewards/margins": 0.7997257113456726, "rewards/rejected": -0.8883880972862244, "step": 95 }, { "epoch": 0.2318840579710145, "grad_norm": 3.423389434814453, "learning_rate": 9.770531400966183e-07, "logits/chosen": -2.251279830932617, "logits/rejected": -2.193419933319092, "logps/chosen": -61.81010437011719, "logps/rejected": -88.6775131225586, "loss": 0.8431, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3560965061187744, "rewards/margins": 0.7174968123435974, "rewards/rejected": -1.073593258857727, "step": 96 }, { "epoch": 0.23429951690821257, "grad_norm": 3.4756994247436523, "learning_rate": 9.768115942028985e-07, "logits/chosen": -2.3408660888671875, "logits/rejected": -2.2359986305236816, "logps/chosen": -57.940433502197266, "logps/rejected": -84.94161987304688, "loss": 0.798, "rewards/accuracies": 0.90625, "rewards/chosen": -0.14098940789699554, "rewards/margins": 0.9589684009552002, "rewards/rejected": -1.099957823753357, "step": 97 }, { "epoch": 0.23671497584541062, "grad_norm": 3.4936118125915527, "learning_rate": 9.765700483091788e-07, "logits/chosen": -2.297363519668579, "logits/rejected": -2.191948890686035, "logps/chosen": -56.46318054199219, "logps/rejected": -82.9169921875, "loss": 0.8137, "rewards/accuracies": 0.9375, "rewards/chosen": -0.170719176530838, "rewards/margins": 0.8750810027122498, "rewards/rejected": -1.0458002090454102, "step": 98 }, { "epoch": 0.2391304347826087, "grad_norm": 3.714163064956665, "learning_rate": 9.76328502415459e-07, "logits/chosen": -2.325563669204712, "logits/rejected": -2.2660670280456543, "logps/chosen": -57.786956787109375, "logps/rejected": -93.63697814941406, "loss": 0.7764, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1426037698984146, "rewards/margins": 1.088018536567688, "rewards/rejected": -1.2306222915649414, "step": 99 }, { "epoch": 0.24154589371980675, "grad_norm": 3.996478796005249, "learning_rate": 9.76086956521739e-07, "logits/chosen": -2.2702765464782715, "logits/rejected": -2.131767988204956, "logps/chosen": -62.33879852294922, "logps/rejected": -92.2889404296875, "loss": 0.826, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4037386476993561, "rewards/margins": 0.8528699278831482, "rewards/rejected": -1.256608486175537, "step": 100 }, { "epoch": 0.24396135265700483, "grad_norm": 3.699632406234741, "learning_rate": 9.758454106280193e-07, "logits/chosen": -2.2708077430725098, "logits/rejected": -2.170806646347046, "logps/chosen": -54.752052307128906, "logps/rejected": -94.33966064453125, "loss": 0.7681, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19517828524112701, "rewards/margins": 1.1165281534194946, "rewards/rejected": -1.3117064237594604, "step": 101 }, { "epoch": 0.2463768115942029, "grad_norm": 3.7638258934020996, "learning_rate": 9.756038647342995e-07, "logits/chosen": -2.3018972873687744, "logits/rejected": -2.196406841278076, "logps/chosen": -52.99442672729492, "logps/rejected": -83.90137481689453, "loss": 0.7863, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0761173740029335, "rewards/margins": 1.0151633024215698, "rewards/rejected": -0.9390459060668945, "step": 102 }, { "epoch": 0.24879227053140096, "grad_norm": 2.8519299030303955, "learning_rate": 9.753623188405795e-07, "logits/chosen": -2.289015054702759, "logits/rejected": -2.184220790863037, "logps/chosen": -55.900577545166016, "logps/rejected": -81.32809448242188, "loss": 0.8553, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1805034875869751, "rewards/margins": 0.708261251449585, "rewards/rejected": -0.8887647986412048, "step": 103 }, { "epoch": 0.25120772946859904, "grad_norm": 3.1846654415130615, "learning_rate": 9.7512077294686e-07, "logits/chosen": -2.356912136077881, "logits/rejected": -2.2871451377868652, "logps/chosen": -59.181419372558594, "logps/rejected": -90.04704284667969, "loss": 0.7862, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10994533449411392, "rewards/margins": 1.0512235164642334, "rewards/rejected": -1.1611686944961548, "step": 104 }, { "epoch": 0.2536231884057971, "grad_norm": 3.4601407051086426, "learning_rate": 9.7487922705314e-07, "logits/chosen": -2.2330241203308105, "logits/rejected": -2.246720790863037, "logps/chosen": -56.43918228149414, "logps/rejected": -87.7164535522461, "loss": 0.773, "rewards/accuracies": 0.90625, "rewards/chosen": -0.098158098757267, "rewards/margins": 1.090391755104065, "rewards/rejected": -1.1885496377944946, "step": 105 }, { "epoch": 0.2560386473429952, "grad_norm": 3.2679049968719482, "learning_rate": 9.746376811594202e-07, "logits/chosen": -2.3369510173797607, "logits/rejected": -2.285665512084961, "logps/chosen": -54.4149055480957, "logps/rejected": -81.96646881103516, "loss": 0.802, "rewards/accuracies": 0.875, "rewards/chosen": -0.07663148641586304, "rewards/margins": 0.9439887404441833, "rewards/rejected": -1.0206202268600464, "step": 106 }, { "epoch": 0.2584541062801932, "grad_norm": 3.3541183471679688, "learning_rate": 9.743961352657005e-07, "logits/chosen": -2.306627035140991, "logits/rejected": -2.192627191543579, "logps/chosen": -59.173309326171875, "logps/rejected": -93.58621978759766, "loss": 0.7784, "rewards/accuracies": 0.96875, "rewards/chosen": -0.22295817732810974, "rewards/margins": 1.0935059785842896, "rewards/rejected": -1.3164640665054321, "step": 107 }, { "epoch": 0.2608695652173913, "grad_norm": 3.4040794372558594, "learning_rate": 9.741545893719805e-07, "logits/chosen": -2.269702911376953, "logits/rejected": -2.1799938678741455, "logps/chosen": -58.69389724731445, "logps/rejected": -86.68751525878906, "loss": 0.7992, "rewards/accuracies": 1.0, "rewards/chosen": -0.1514156460762024, "rewards/margins": 0.9760117530822754, "rewards/rejected": -1.1274274587631226, "step": 108 }, { "epoch": 0.2632850241545894, "grad_norm": 3.628713607788086, "learning_rate": 9.73913043478261e-07, "logits/chosen": -2.2502474784851074, "logits/rejected": -2.1374011039733887, "logps/chosen": -52.081199645996094, "logps/rejected": -80.32386779785156, "loss": 0.7506, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10555536299943924, "rewards/margins": 1.135793685913086, "rewards/rejected": -1.0302382707595825, "step": 109 }, { "epoch": 0.26570048309178745, "grad_norm": 3.5831806659698486, "learning_rate": 9.73671497584541e-07, "logits/chosen": -2.2348999977111816, "logits/rejected": -2.1446027755737305, "logps/chosen": -55.418846130371094, "logps/rejected": -85.92244720458984, "loss": 0.7747, "rewards/accuracies": 1.0, "rewards/chosen": -0.1170755922794342, "rewards/margins": 1.1006860733032227, "rewards/rejected": -1.217761516571045, "step": 110 }, { "epoch": 0.26811594202898553, "grad_norm": 3.3074471950531006, "learning_rate": 9.734299516908212e-07, "logits/chosen": -2.294966697692871, "logits/rejected": -2.1820757389068604, "logps/chosen": -56.69746780395508, "logps/rejected": -88.84689331054688, "loss": 0.7909, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10774332284927368, "rewards/margins": 1.0274325609207153, "rewards/rejected": -1.1351758241653442, "step": 111 }, { "epoch": 0.27053140096618356, "grad_norm": 3.1767892837524414, "learning_rate": 9.731884057971014e-07, "logits/chosen": -2.260707378387451, "logits/rejected": -2.221397876739502, "logps/chosen": -56.143341064453125, "logps/rejected": -84.01417541503906, "loss": 0.7969, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09787449240684509, "rewards/margins": 1.0293837785720825, "rewards/rejected": -1.12725830078125, "step": 112 }, { "epoch": 0.27294685990338163, "grad_norm": 3.533839702606201, "learning_rate": 9.729468599033815e-07, "logits/chosen": -2.3237297534942627, "logits/rejected": -2.204219341278076, "logps/chosen": -50.071720123291016, "logps/rejected": -87.76797485351562, "loss": 0.7279, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21165218949317932, "rewards/margins": 1.3335213661193848, "rewards/rejected": -1.1218693256378174, "step": 113 }, { "epoch": 0.2753623188405797, "grad_norm": 3.0936439037323, "learning_rate": 9.72705314009662e-07, "logits/chosen": -2.3177530765533447, "logits/rejected": -2.228212356567383, "logps/chosen": -63.74668884277344, "logps/rejected": -88.31976318359375, "loss": 0.8111, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25555557012557983, "rewards/margins": 0.9480069875717163, "rewards/rejected": -1.2035624980926514, "step": 114 }, { "epoch": 0.2777777777777778, "grad_norm": 3.234168529510498, "learning_rate": 9.72463768115942e-07, "logits/chosen": -2.219343662261963, "logits/rejected": -2.2236478328704834, "logps/chosen": -57.35887145996094, "logps/rejected": -92.42676544189453, "loss": 0.7419, "rewards/accuracies": 0.875, "rewards/chosen": -0.07330977916717529, "rewards/margins": 1.294518232345581, "rewards/rejected": -1.3678278923034668, "step": 115 }, { "epoch": 0.28019323671497587, "grad_norm": 2.7514395713806152, "learning_rate": 9.722222222222222e-07, "logits/chosen": -2.3954787254333496, "logits/rejected": -2.3111231327056885, "logps/chosen": -61.954715728759766, "logps/rejected": -83.8497543334961, "loss": 0.841, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2384076714515686, "rewards/margins": 0.8048770427703857, "rewards/rejected": -1.0432846546173096, "step": 116 }, { "epoch": 0.2826086956521739, "grad_norm": 3.491326332092285, "learning_rate": 9.719806763285024e-07, "logits/chosen": -2.353743076324463, "logits/rejected": -2.238327980041504, "logps/chosen": -51.694793701171875, "logps/rejected": -81.79730224609375, "loss": 0.736, "rewards/accuracies": 0.90625, "rewards/chosen": 0.17519983649253845, "rewards/margins": 1.2266534566879272, "rewards/rejected": -1.0514535903930664, "step": 117 }, { "epoch": 0.28502415458937197, "grad_norm": 3.063352584838867, "learning_rate": 9.717391304347827e-07, "logits/chosen": -2.2622148990631104, "logits/rejected": -2.212447166442871, "logps/chosen": -51.28094482421875, "logps/rejected": -82.94438934326172, "loss": 0.7583, "rewards/accuracies": 1.0, "rewards/chosen": 0.08011689782142639, "rewards/margins": 1.2188220024108887, "rewards/rejected": -1.1387050151824951, "step": 118 }, { "epoch": 0.28743961352657005, "grad_norm": 3.4492530822753906, "learning_rate": 9.714975845410627e-07, "logits/chosen": -2.272157669067383, "logits/rejected": -2.232870101928711, "logps/chosen": -52.8154411315918, "logps/rejected": -77.7109603881836, "loss": 0.7564, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2388775795698166, "rewards/margins": 1.192641019821167, "rewards/rejected": -0.9537634253501892, "step": 119 }, { "epoch": 0.2898550724637681, "grad_norm": 3.172791004180908, "learning_rate": 9.71256038647343e-07, "logits/chosen": -2.2152040004730225, "logits/rejected": -2.197338819503784, "logps/chosen": -54.32249069213867, "logps/rejected": -85.97286224365234, "loss": 0.753, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1796606481075287, "rewards/margins": 1.2466286420822144, "rewards/rejected": -1.0669679641723633, "step": 120 }, { "epoch": 0.2922705314009662, "grad_norm": 3.4210493564605713, "learning_rate": 9.710144927536232e-07, "logits/chosen": -2.33479642868042, "logits/rejected": -2.2687888145446777, "logps/chosen": -55.78400802612305, "logps/rejected": -93.82915496826172, "loss": 0.7204, "rewards/accuracies": 0.9375, "rewards/chosen": 0.026723608374595642, "rewards/margins": 1.43898606300354, "rewards/rejected": -1.4122624397277832, "step": 121 }, { "epoch": 0.2946859903381642, "grad_norm": 3.1729371547698975, "learning_rate": 9.707729468599034e-07, "logits/chosen": -2.2406094074249268, "logits/rejected": -2.1845293045043945, "logps/chosen": -54.25534439086914, "logps/rejected": -81.0296859741211, "loss": 0.7572, "rewards/accuracies": 0.96875, "rewards/chosen": 0.202220156788826, "rewards/margins": 1.190375566482544, "rewards/rejected": -0.9881554245948792, "step": 122 }, { "epoch": 0.2971014492753623, "grad_norm": 3.40986704826355, "learning_rate": 9.705314009661836e-07, "logits/chosen": -2.3311476707458496, "logits/rejected": -2.2265355587005615, "logps/chosen": -55.359886169433594, "logps/rejected": -85.95184326171875, "loss": 0.7463, "rewards/accuracies": 0.96875, "rewards/chosen": 0.20277152955532074, "rewards/margins": 1.2997337579727173, "rewards/rejected": -1.0969622135162354, "step": 123 }, { "epoch": 0.2995169082125604, "grad_norm": 3.006821870803833, "learning_rate": 9.702898550724637e-07, "logits/chosen": -2.3420357704162598, "logits/rejected": -2.2983174324035645, "logps/chosen": -55.89625549316406, "logps/rejected": -75.03305053710938, "loss": 0.801, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16753354668617249, "rewards/margins": 0.9761591553688049, "rewards/rejected": -0.8086256980895996, "step": 124 }, { "epoch": 0.30193236714975846, "grad_norm": 3.532233715057373, "learning_rate": 9.70048309178744e-07, "logits/chosen": -2.329763889312744, "logits/rejected": -2.2076759338378906, "logps/chosen": -47.44447326660156, "logps/rejected": -87.25931549072266, "loss": 0.6559, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6570374369621277, "rewards/margins": 1.7069330215454102, "rewards/rejected": -1.0498957633972168, "step": 125 }, { "epoch": 0.30434782608695654, "grad_norm": 2.9186999797821045, "learning_rate": 9.698067632850241e-07, "logits/chosen": -2.3536696434020996, "logits/rejected": -2.30159330368042, "logps/chosen": -56.37314987182617, "logps/rejected": -93.95807647705078, "loss": 0.7289, "rewards/accuracies": 0.90625, "rewards/chosen": 0.006314806640148163, "rewards/margins": 1.4323101043701172, "rewards/rejected": -1.4259953498840332, "step": 126 }, { "epoch": 0.30676328502415456, "grad_norm": 3.1164448261260986, "learning_rate": 9.695652173913042e-07, "logits/chosen": -2.389404296875, "logits/rejected": -2.31735897064209, "logps/chosen": -58.619503021240234, "logps/rejected": -84.8133544921875, "loss": 0.7718, "rewards/accuracies": 0.96875, "rewards/chosen": -0.005681496113538742, "rewards/margins": 1.1739697456359863, "rewards/rejected": -1.1796513795852661, "step": 127 }, { "epoch": 0.30917874396135264, "grad_norm": 3.196716070175171, "learning_rate": 9.693236714975846e-07, "logits/chosen": -2.310983657836914, "logits/rejected": -2.260636568069458, "logps/chosen": -59.045310974121094, "logps/rejected": -88.60388946533203, "loss": 0.7532, "rewards/accuracies": 0.9375, "rewards/chosen": -0.021131912246346474, "rewards/margins": 1.357606053352356, "rewards/rejected": -1.3787379264831543, "step": 128 }, { "epoch": 0.3115942028985507, "grad_norm": 4.070439338684082, "learning_rate": 9.690821256038646e-07, "logits/chosen": -2.3051958084106445, "logits/rejected": -2.2528228759765625, "logps/chosen": -54.20303726196289, "logps/rejected": -88.32410430908203, "loss": 0.7105, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11685170233249664, "rewards/margins": 1.4614847898483276, "rewards/rejected": -1.3446331024169922, "step": 129 }, { "epoch": 0.3140096618357488, "grad_norm": 3.8753561973571777, "learning_rate": 9.688405797101449e-07, "logits/chosen": -2.2933168411254883, "logits/rejected": -2.239321708679199, "logps/chosen": -54.718257904052734, "logps/rejected": -96.52857971191406, "loss": 0.6621, "rewards/accuracies": 0.96875, "rewards/chosen": 0.16238529980182648, "rewards/margins": 1.8562235832214355, "rewards/rejected": -1.693838357925415, "step": 130 }, { "epoch": 0.3164251207729469, "grad_norm": 3.206082820892334, "learning_rate": 9.685990338164251e-07, "logits/chosen": -2.256544351577759, "logits/rejected": -2.1808626651763916, "logps/chosen": -58.81806945800781, "logps/rejected": -91.34329986572266, "loss": 0.744, "rewards/accuracies": 0.90625, "rewards/chosen": -0.008322745561599731, "rewards/margins": 1.3985563516616821, "rewards/rejected": -1.4068790674209595, "step": 131 }, { "epoch": 0.3188405797101449, "grad_norm": 3.6543073654174805, "learning_rate": 9.683574879227051e-07, "logits/chosen": -2.4025378227233887, "logits/rejected": -2.336719512939453, "logps/chosen": -63.00627136230469, "logps/rejected": -86.74649810791016, "loss": 0.7741, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0181133896112442, "rewards/margins": 1.1297450065612793, "rewards/rejected": -1.1116317510604858, "step": 132 }, { "epoch": 0.321256038647343, "grad_norm": 3.511888265609741, "learning_rate": 9.681159420289856e-07, "logits/chosen": -2.2443206310272217, "logits/rejected": -2.2675938606262207, "logps/chosen": -53.57398223876953, "logps/rejected": -95.22499084472656, "loss": 0.6401, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3462355136871338, "rewards/margins": 1.931172490119934, "rewards/rejected": -1.5849370956420898, "step": 133 }, { "epoch": 0.32367149758454106, "grad_norm": 2.962742805480957, "learning_rate": 9.678743961352656e-07, "logits/chosen": -2.2566161155700684, "logits/rejected": -2.2698557376861572, "logps/chosen": -56.17855453491211, "logps/rejected": -90.3131103515625, "loss": 0.7548, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09341758489608765, "rewards/margins": 1.4284594058990479, "rewards/rejected": -1.3350419998168945, "step": 134 }, { "epoch": 0.32608695652173914, "grad_norm": 3.784647226333618, "learning_rate": 9.676328502415458e-07, "logits/chosen": -2.332900047302246, "logits/rejected": -2.2668776512145996, "logps/chosen": -57.32245635986328, "logps/rejected": -95.48822784423828, "loss": 0.696, "rewards/accuracies": 1.0, "rewards/chosen": 0.1228521391749382, "rewards/margins": 1.6484510898590088, "rewards/rejected": -1.5255987644195557, "step": 135 }, { "epoch": 0.3285024154589372, "grad_norm": 3.1569628715515137, "learning_rate": 9.67391304347826e-07, "logits/chosen": -2.3134477138519287, "logits/rejected": -2.2598745822906494, "logps/chosen": -53.09284210205078, "logps/rejected": -88.50869750976562, "loss": 0.7261, "rewards/accuracies": 0.875, "rewards/chosen": 0.22174233198165894, "rewards/margins": 1.4974372386932373, "rewards/rejected": -1.2756948471069336, "step": 136 }, { "epoch": 0.3309178743961353, "grad_norm": 3.261077880859375, "learning_rate": 9.671497584541063e-07, "logits/chosen": -2.29009747505188, "logits/rejected": -2.1925477981567383, "logps/chosen": -53.306278228759766, "logps/rejected": -90.60259246826172, "loss": 0.662, "rewards/accuracies": 0.96875, "rewards/chosen": 0.24996785819530487, "rewards/margins": 1.8010151386260986, "rewards/rejected": -1.5510473251342773, "step": 137 }, { "epoch": 0.3333333333333333, "grad_norm": 3.449899673461914, "learning_rate": 9.669082125603866e-07, "logits/chosen": -2.2165279388427734, "logits/rejected": -2.265605926513672, "logps/chosen": -55.91518020629883, "logps/rejected": -81.78260803222656, "loss": 0.7523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35373979806900024, "rewards/margins": 1.3326088190078735, "rewards/rejected": -0.9788689017295837, "step": 138 }, { "epoch": 0.3357487922705314, "grad_norm": 3.901172161102295, "learning_rate": 9.666666666666666e-07, "logits/chosen": -2.3463189601898193, "logits/rejected": -2.275749683380127, "logps/chosen": -49.27450180053711, "logps/rejected": -82.1705322265625, "loss": 0.6562, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5633030533790588, "rewards/margins": 1.8319091796875, "rewards/rejected": -1.2686063051223755, "step": 139 }, { "epoch": 0.33816425120772947, "grad_norm": 3.3979005813598633, "learning_rate": 9.664251207729468e-07, "logits/chosen": -2.314643621444702, "logits/rejected": -2.2632741928100586, "logps/chosen": -58.13816833496094, "logps/rejected": -88.18045806884766, "loss": 0.7101, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3186730742454529, "rewards/margins": 1.5578926801681519, "rewards/rejected": -1.2392195463180542, "step": 140 }, { "epoch": 0.34057971014492755, "grad_norm": 3.1647984981536865, "learning_rate": 9.66183574879227e-07, "logits/chosen": -2.2819771766662598, "logits/rejected": -2.2766690254211426, "logps/chosen": -52.70451354980469, "logps/rejected": -78.40229034423828, "loss": 0.6605, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5950640439987183, "rewards/margins": 1.7771875858306885, "rewards/rejected": -1.1821235418319702, "step": 141 }, { "epoch": 0.34299516908212563, "grad_norm": 3.6435201168060303, "learning_rate": 9.659420289855073e-07, "logits/chosen": -2.3259737491607666, "logits/rejected": -2.277271032333374, "logps/chosen": -51.86378479003906, "logps/rejected": -93.16667175292969, "loss": 0.6414, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3930056095123291, "rewards/margins": 1.9764835834503174, "rewards/rejected": -1.5834782123565674, "step": 142 }, { "epoch": 0.34541062801932365, "grad_norm": 3.6231186389923096, "learning_rate": 9.657004830917873e-07, "logits/chosen": -2.3680484294891357, "logits/rejected": -2.2181646823883057, "logps/chosen": -49.70751190185547, "logps/rejected": -94.32087707519531, "loss": 0.6515, "rewards/accuracies": 0.96875, "rewards/chosen": 0.597720742225647, "rewards/margins": 1.896154761314392, "rewards/rejected": -1.2984338998794556, "step": 143 }, { "epoch": 0.34782608695652173, "grad_norm": 3.2691354751586914, "learning_rate": 9.654589371980676e-07, "logits/chosen": -2.2804653644561768, "logits/rejected": -2.2504265308380127, "logps/chosen": -51.66891860961914, "logps/rejected": -87.27538299560547, "loss": 0.711, "rewards/accuracies": 0.84375, "rewards/chosen": 0.33132022619247437, "rewards/margins": 1.5797946453094482, "rewards/rejected": -1.2484745979309082, "step": 144 }, { "epoch": 0.3502415458937198, "grad_norm": 3.755666494369507, "learning_rate": 9.652173913043478e-07, "logits/chosen": -2.267045259475708, "logits/rejected": -2.2342777252197266, "logps/chosen": -52.3257942199707, "logps/rejected": -98.56880187988281, "loss": 0.6092, "rewards/accuracies": 1.0, "rewards/chosen": 0.5018959045410156, "rewards/margins": 2.3103103637695312, "rewards/rejected": -1.8084144592285156, "step": 145 }, { "epoch": 0.3526570048309179, "grad_norm": 3.2407286167144775, "learning_rate": 9.649758454106278e-07, "logits/chosen": -2.2480618953704834, "logits/rejected": -2.2528529167175293, "logps/chosen": -47.871219635009766, "logps/rejected": -90.9559326171875, "loss": 0.5783, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7561361193656921, "rewards/margins": 2.2651216983795166, "rewards/rejected": -1.5089857578277588, "step": 146 }, { "epoch": 0.35507246376811596, "grad_norm": 3.3278989791870117, "learning_rate": 9.647342995169083e-07, "logits/chosen": -2.2913618087768555, "logits/rejected": -2.201540470123291, "logps/chosen": -55.14674758911133, "logps/rejected": -82.68372344970703, "loss": 0.7498, "rewards/accuracies": 0.96875, "rewards/chosen": 0.15236274898052216, "rewards/margins": 1.3813133239746094, "rewards/rejected": -1.2289506196975708, "step": 147 }, { "epoch": 0.357487922705314, "grad_norm": 2.9457671642303467, "learning_rate": 9.644927536231883e-07, "logits/chosen": -2.31589412689209, "logits/rejected": -2.2678771018981934, "logps/chosen": -53.61341857910156, "logps/rejected": -91.53919982910156, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.16580304503440857, "rewards/margins": 1.839015007019043, "rewards/rejected": -1.6732120513916016, "step": 148 }, { "epoch": 0.35990338164251207, "grad_norm": 3.5133392810821533, "learning_rate": 9.642512077294685e-07, "logits/chosen": -2.32729434967041, "logits/rejected": -2.268157720565796, "logps/chosen": -49.988529205322266, "logps/rejected": -83.53311157226562, "loss": 0.6708, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4957839846611023, "rewards/margins": 1.715882658958435, "rewards/rejected": -1.220098853111267, "step": 149 }, { "epoch": 0.36231884057971014, "grad_norm": 3.135622262954712, "learning_rate": 9.640096618357488e-07, "logits/chosen": -2.3562495708465576, "logits/rejected": -2.3307247161865234, "logps/chosen": -59.22457504272461, "logps/rejected": -91.47920227050781, "loss": 0.759, "rewards/accuracies": 0.875, "rewards/chosen": -0.21006520092487335, "rewards/margins": 1.4346808195114136, "rewards/rejected": -1.644745945930481, "step": 150 }, { "epoch": 0.3647342995169082, "grad_norm": 3.3074865341186523, "learning_rate": 9.637681159420288e-07, "logits/chosen": -2.3411054611206055, "logits/rejected": -2.238778591156006, "logps/chosen": -52.129493713378906, "logps/rejected": -91.3369140625, "loss": 0.6663, "rewards/accuracies": 0.90625, "rewards/chosen": 0.41917553544044495, "rewards/margins": 1.920169472694397, "rewards/rejected": -1.5009942054748535, "step": 151 }, { "epoch": 0.3671497584541063, "grad_norm": 2.711138963699341, "learning_rate": 9.635265700483093e-07, "logits/chosen": -2.3470280170440674, "logits/rejected": -2.255767822265625, "logps/chosen": -49.109535217285156, "logps/rejected": -83.4545669555664, "loss": 0.7047, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4896898865699768, "rewards/margins": 1.6826540231704712, "rewards/rejected": -1.1929641962051392, "step": 152 }, { "epoch": 0.3695652173913043, "grad_norm": 4.181684970855713, "learning_rate": 9.632850241545893e-07, "logits/chosen": -2.362712860107422, "logits/rejected": -2.2668404579162598, "logps/chosen": -51.574241638183594, "logps/rejected": -93.53417205810547, "loss": 0.6176, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5866237282752991, "rewards/margins": 2.241987943649292, "rewards/rejected": -1.6553640365600586, "step": 153 }, { "epoch": 0.3719806763285024, "grad_norm": 2.8684494495391846, "learning_rate": 9.630434782608695e-07, "logits/chosen": -2.2987892627716064, "logits/rejected": -2.2520816326141357, "logps/chosen": -51.78289794921875, "logps/rejected": -83.6980209350586, "loss": 0.7267, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3597334027290344, "rewards/margins": 1.6211501359939575, "rewards/rejected": -1.2614166736602783, "step": 154 }, { "epoch": 0.3743961352657005, "grad_norm": 3.2826900482177734, "learning_rate": 9.628019323671498e-07, "logits/chosen": -2.3255879878997803, "logits/rejected": -2.2764053344726562, "logps/chosen": -52.59089660644531, "logps/rejected": -89.0500259399414, "loss": 0.7105, "rewards/accuracies": 0.875, "rewards/chosen": 0.3230840265750885, "rewards/margins": 1.7195078134536743, "rewards/rejected": -1.3964238166809082, "step": 155 }, { "epoch": 0.37681159420289856, "grad_norm": 3.187901735305786, "learning_rate": 9.625603864734298e-07, "logits/chosen": -2.2965731620788574, "logits/rejected": -2.282358169555664, "logps/chosen": -52.980796813964844, "logps/rejected": -96.90644836425781, "loss": 0.6011, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5754542350769043, "rewards/margins": 2.357457160949707, "rewards/rejected": -1.7820028066635132, "step": 156 }, { "epoch": 0.37922705314009664, "grad_norm": 9.667221069335938, "learning_rate": 9.623188405797102e-07, "logits/chosen": -2.451840400695801, "logits/rejected": -2.3537161350250244, "logps/chosen": -47.928955078125, "logps/rejected": -87.97514343261719, "loss": 0.6729, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4809728264808655, "rewards/margins": 1.8919456005096436, "rewards/rejected": -1.4109727144241333, "step": 157 }, { "epoch": 0.38164251207729466, "grad_norm": 3.4335782527923584, "learning_rate": 9.620772946859903e-07, "logits/chosen": -2.3431177139282227, "logits/rejected": -2.2933247089385986, "logps/chosen": -48.386131286621094, "logps/rejected": -84.36343383789062, "loss": 0.6724, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5700366497039795, "rewards/margins": 1.9171305894851685, "rewards/rejected": -1.347093939781189, "step": 158 }, { "epoch": 0.38405797101449274, "grad_norm": 3.486736297607422, "learning_rate": 9.618357487922705e-07, "logits/chosen": -2.326632261276245, "logits/rejected": -2.3067824840545654, "logps/chosen": -55.532325744628906, "logps/rejected": -103.26422119140625, "loss": 0.6139, "rewards/accuracies": 1.0, "rewards/chosen": 0.12849339842796326, "rewards/margins": 2.334165334701538, "rewards/rejected": -2.205672025680542, "step": 159 }, { "epoch": 0.3864734299516908, "grad_norm": 3.3876030445098877, "learning_rate": 9.615942028985507e-07, "logits/chosen": -2.385134220123291, "logits/rejected": -2.2944798469543457, "logps/chosen": -50.85650634765625, "logps/rejected": -81.42852783203125, "loss": 0.6743, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5874818563461304, "rewards/margins": 1.874614953994751, "rewards/rejected": -1.287132978439331, "step": 160 }, { "epoch": 0.3888888888888889, "grad_norm": 3.847149610519409, "learning_rate": 9.61352657004831e-07, "logits/chosen": -2.316850423812866, "logits/rejected": -2.300536632537842, "logps/chosen": -53.51518249511719, "logps/rejected": -94.55809020996094, "loss": 0.6444, "rewards/accuracies": 0.9375, "rewards/chosen": 0.42991670966148376, "rewards/margins": 2.0658676624298096, "rewards/rejected": -1.6359509229660034, "step": 161 }, { "epoch": 0.391304347826087, "grad_norm": 2.6214003562927246, "learning_rate": 9.61111111111111e-07, "logits/chosen": -2.337808132171631, "logits/rejected": -2.308703660964966, "logps/chosen": -51.37057876586914, "logps/rejected": -94.29025268554688, "loss": 0.6466, "rewards/accuracies": 0.90625, "rewards/chosen": 0.47796130180358887, "rewards/margins": 2.182910680770874, "rewards/rejected": -1.7049493789672852, "step": 162 }, { "epoch": 0.39371980676328505, "grad_norm": 3.3327393531799316, "learning_rate": 9.608695652173912e-07, "logits/chosen": -2.3864784240722656, "logits/rejected": -2.303809881210327, "logps/chosen": -45.69596481323242, "logps/rejected": -75.72958374023438, "loss": 0.6991, "rewards/accuracies": 0.875, "rewards/chosen": 0.7854595184326172, "rewards/margins": 1.6514010429382324, "rewards/rejected": -0.8659414052963257, "step": 163 }, { "epoch": 0.3961352657004831, "grad_norm": 3.3268883228302, "learning_rate": 9.606280193236715e-07, "logits/chosen": -2.3743181228637695, "logits/rejected": -2.2795157432556152, "logps/chosen": -49.05430221557617, "logps/rejected": -94.22119140625, "loss": 0.6031, "rewards/accuracies": 1.0, "rewards/chosen": 0.7086178660392761, "rewards/margins": 2.4490113258361816, "rewards/rejected": -1.7403936386108398, "step": 164 }, { "epoch": 0.39855072463768115, "grad_norm": 3.4072673320770264, "learning_rate": 9.603864734299517e-07, "logits/chosen": -2.3257479667663574, "logits/rejected": -2.286900281906128, "logps/chosen": -48.528831481933594, "logps/rejected": -80.9318618774414, "loss": 0.6936, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7260949015617371, "rewards/margins": 1.8868012428283691, "rewards/rejected": -1.1607065200805664, "step": 165 }, { "epoch": 0.40096618357487923, "grad_norm": 3.513258218765259, "learning_rate": 9.60144927536232e-07, "logits/chosen": -2.390597343444824, "logits/rejected": -2.308412551879883, "logps/chosen": -51.55575942993164, "logps/rejected": -87.98255157470703, "loss": 0.7, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6457643508911133, "rewards/margins": 1.9498854875564575, "rewards/rejected": -1.3041211366653442, "step": 166 }, { "epoch": 0.40096618357487923, "eval_logits/chosen": -2.365568161010742, "eval_logits/rejected": -2.3339920043945312, "eval_logps/chosen": -48.18099594116211, "eval_logps/rejected": -94.10926055908203, "eval_loss": 0.5898884534835815, "eval_rewards/accuracies": 0.9558823704719543, "eval_rewards/chosen": 0.6479921340942383, "eval_rewards/margins": 2.425518035888672, "eval_rewards/rejected": -1.777525544166565, "eval_runtime": 998.7613, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.272, "step": 166 }, { "epoch": 0.4033816425120773, "grad_norm": 3.100403070449829, "learning_rate": 9.59903381642512e-07, "logits/chosen": -2.377315044403076, "logits/rejected": -2.299330472946167, "logps/chosen": -49.687156677246094, "logps/rejected": -97.35511779785156, "loss": 0.5706, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5971795320510864, "rewards/margins": 2.718764543533325, "rewards/rejected": -2.121584892272949, "step": 167 }, { "epoch": 0.4057971014492754, "grad_norm": 3.1486411094665527, "learning_rate": 9.596618357487922e-07, "logits/chosen": -2.3246402740478516, "logits/rejected": -2.2736074924468994, "logps/chosen": -52.392478942871094, "logps/rejected": -90.28579711914062, "loss": 0.6631, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2667124569416046, "rewards/margins": 2.026221752166748, "rewards/rejected": -1.7595094442367554, "step": 168 }, { "epoch": 0.4082125603864734, "grad_norm": 2.892465591430664, "learning_rate": 9.594202898550724e-07, "logits/chosen": -2.2996675968170166, "logits/rejected": -2.2748169898986816, "logps/chosen": -55.181766510009766, "logps/rejected": -104.15827941894531, "loss": 0.6445, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0925978273153305, "rewards/margins": 2.375025749206543, "rewards/rejected": -2.2824277877807617, "step": 169 }, { "epoch": 0.4106280193236715, "grad_norm": 4.274582386016846, "learning_rate": 9.591787439613525e-07, "logits/chosen": -2.370325803756714, "logits/rejected": -2.357706308364868, "logps/chosen": -48.63653564453125, "logps/rejected": -81.19602966308594, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.9011828303337097, "rewards/margins": 2.0563793182373047, "rewards/rejected": -1.1551964282989502, "step": 170 }, { "epoch": 0.41304347826086957, "grad_norm": 3.0377302169799805, "learning_rate": 9.58937198067633e-07, "logits/chosen": -2.3485796451568604, "logits/rejected": -2.267003297805786, "logps/chosen": -56.2918701171875, "logps/rejected": -89.30976104736328, "loss": 0.6969, "rewards/accuracies": 0.875, "rewards/chosen": 0.22118790447711945, "rewards/margins": 1.8061050176620483, "rewards/rejected": -1.5849171876907349, "step": 171 }, { "epoch": 0.41545893719806765, "grad_norm": 2.8464438915252686, "learning_rate": 9.58695652173913e-07, "logits/chosen": -2.3260302543640137, "logits/rejected": -2.285141944885254, "logps/chosen": -58.00645446777344, "logps/rejected": -100.36265563964844, "loss": 0.673, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0106443390250206, "rewards/margins": 2.094593048095703, "rewards/rejected": -2.083948850631714, "step": 172 }, { "epoch": 0.4178743961352657, "grad_norm": 3.3024420738220215, "learning_rate": 9.584541062801932e-07, "logits/chosen": -2.378603219985962, "logits/rejected": -2.3611135482788086, "logps/chosen": -50.19511032104492, "logps/rejected": -90.42086029052734, "loss": 0.6688, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5325266122817993, "rewards/margins": 2.1720998287200928, "rewards/rejected": -1.639573097229004, "step": 173 }, { "epoch": 0.42028985507246375, "grad_norm": 3.731874942779541, "learning_rate": 9.582125603864734e-07, "logits/chosen": -2.3822479248046875, "logits/rejected": -2.2955093383789062, "logps/chosen": -50.65265655517578, "logps/rejected": -86.72708892822266, "loss": 0.6499, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8081495761871338, "rewards/margins": 2.165111780166626, "rewards/rejected": -1.356961965560913, "step": 174 }, { "epoch": 0.4227053140096618, "grad_norm": 3.2551097869873047, "learning_rate": 9.579710144927534e-07, "logits/chosen": -2.3671493530273438, "logits/rejected": -2.310648202896118, "logps/chosen": -62.49588394165039, "logps/rejected": -92.23341369628906, "loss": 0.7182, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10324740409851074, "rewards/margins": 1.6765482425689697, "rewards/rejected": -1.77979576587677, "step": 175 }, { "epoch": 0.4251207729468599, "grad_norm": 4.035101890563965, "learning_rate": 9.577294685990339e-07, "logits/chosen": -2.408386707305908, "logits/rejected": -2.3505172729492188, "logps/chosen": -43.9778938293457, "logps/rejected": -91.50969696044922, "loss": 0.5361, "rewards/accuracies": 1.0, "rewards/chosen": 1.1897426843643188, "rewards/margins": 2.840880870819092, "rewards/rejected": -1.6511383056640625, "step": 176 }, { "epoch": 0.427536231884058, "grad_norm": 3.342092514038086, "learning_rate": 9.57487922705314e-07, "logits/chosen": -2.3097245693206787, "logits/rejected": -2.2979602813720703, "logps/chosen": -54.68159484863281, "logps/rejected": -98.10708618164062, "loss": 0.6925, "rewards/accuracies": 0.90625, "rewards/chosen": 0.21088410913944244, "rewards/margins": 2.1511194705963135, "rewards/rejected": -1.9402356147766113, "step": 177 }, { "epoch": 0.42995169082125606, "grad_norm": 2.8033907413482666, "learning_rate": 9.572463768115942e-07, "logits/chosen": -2.2684226036071777, "logits/rejected": -2.324843406677246, "logps/chosen": -49.17889404296875, "logps/rejected": -92.9868392944336, "loss": 0.5941, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6019500494003296, "rewards/margins": 2.5616507530212402, "rewards/rejected": -1.9597008228302002, "step": 178 }, { "epoch": 0.4323671497584541, "grad_norm": 3.3302555084228516, "learning_rate": 9.570048309178744e-07, "logits/chosen": -2.3465144634246826, "logits/rejected": -2.3324239253997803, "logps/chosen": -47.27120590209961, "logps/rejected": -92.93463134765625, "loss": 0.5783, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6868340969085693, "rewards/margins": 2.4132633209228516, "rewards/rejected": -1.7264293432235718, "step": 179 }, { "epoch": 0.43478260869565216, "grad_norm": 3.653714179992676, "learning_rate": 9.567632850241546e-07, "logits/chosen": -2.3650834560394287, "logits/rejected": -2.3196496963500977, "logps/chosen": -50.700679779052734, "logps/rejected": -90.77183532714844, "loss": 0.674, "rewards/accuracies": 0.875, "rewards/chosen": 0.4053439199924469, "rewards/margins": 2.174300193786621, "rewards/rejected": -1.768956184387207, "step": 180 }, { "epoch": 0.43719806763285024, "grad_norm": 3.893933057785034, "learning_rate": 9.565217391304349e-07, "logits/chosen": -2.4368464946746826, "logits/rejected": -2.3848042488098145, "logps/chosen": -57.071807861328125, "logps/rejected": -101.58794403076172, "loss": 0.6095, "rewards/accuracies": 0.96875, "rewards/chosen": 0.501520574092865, "rewards/margins": 2.7365615367889404, "rewards/rejected": -2.2350409030914307, "step": 181 }, { "epoch": 0.4396135265700483, "grad_norm": 3.049323558807373, "learning_rate": 9.562801932367149e-07, "logits/chosen": -2.3781540393829346, "logits/rejected": -2.313354730606079, "logps/chosen": -42.3465461730957, "logps/rejected": -101.71917724609375, "loss": 0.5066, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8807706236839294, "rewards/margins": 3.052083969116211, "rewards/rejected": -2.1713132858276367, "step": 182 }, { "epoch": 0.4420289855072464, "grad_norm": 3.186056613922119, "learning_rate": 9.560386473429951e-07, "logits/chosen": -2.4025325775146484, "logits/rejected": -2.323917865753174, "logps/chosen": -55.544551849365234, "logps/rejected": -114.05810546875, "loss": 0.5533, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2647658586502075, "rewards/margins": 2.9543206691741943, "rewards/rejected": -2.6895546913146973, "step": 183 }, { "epoch": 0.4444444444444444, "grad_norm": 3.4681525230407715, "learning_rate": 9.557971014492754e-07, "logits/chosen": -2.3224830627441406, "logits/rejected": -2.2342188358306885, "logps/chosen": -47.31353759765625, "logps/rejected": -96.4786148071289, "loss": 0.56, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7959888577461243, "rewards/margins": 2.8812108039855957, "rewards/rejected": -2.085221767425537, "step": 184 }, { "epoch": 0.4468599033816425, "grad_norm": 3.456681251525879, "learning_rate": 9.555555555555556e-07, "logits/chosen": -2.442183017730713, "logits/rejected": -2.4150490760803223, "logps/chosen": -49.78712844848633, "logps/rejected": -93.9900894165039, "loss": 0.6104, "rewards/accuracies": 0.875, "rewards/chosen": 0.8493247032165527, "rewards/margins": 2.514362096786499, "rewards/rejected": -1.6650372743606567, "step": 185 }, { "epoch": 0.4492753623188406, "grad_norm": 2.9479053020477295, "learning_rate": 9.553140096618356e-07, "logits/chosen": -2.4127583503723145, "logits/rejected": -2.3597631454467773, "logps/chosen": -46.06175994873047, "logps/rejected": -89.12422943115234, "loss": 0.5522, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0032442808151245, "rewards/margins": 2.8567416667938232, "rewards/rejected": -1.8534973859786987, "step": 186 }, { "epoch": 0.45169082125603865, "grad_norm": 4.504210948944092, "learning_rate": 9.550724637681159e-07, "logits/chosen": -2.3329973220825195, "logits/rejected": -2.26174259185791, "logps/chosen": -35.99477767944336, "logps/rejected": -96.57726287841797, "loss": 0.4492, "rewards/accuracies": 1.0, "rewards/chosen": 1.6901028156280518, "rewards/margins": 3.4221813678741455, "rewards/rejected": -1.7320786714553833, "step": 187 }, { "epoch": 0.45410628019323673, "grad_norm": 2.888645887374878, "learning_rate": 9.548309178743961e-07, "logits/chosen": -2.421168804168701, "logits/rejected": -2.402078628540039, "logps/chosen": -53.979339599609375, "logps/rejected": -97.84974670410156, "loss": 0.6277, "rewards/accuracies": 1.0, "rewards/chosen": 0.3610018193721771, "rewards/margins": 2.5617127418518066, "rewards/rejected": -2.2007107734680176, "step": 188 }, { "epoch": 0.45652173913043476, "grad_norm": 3.5676674842834473, "learning_rate": 9.545893719806763e-07, "logits/chosen": -2.2892725467681885, "logits/rejected": -2.249220848083496, "logps/chosen": -54.05513381958008, "logps/rejected": -95.3948745727539, "loss": 0.5999, "rewards/accuracies": 0.90625, "rewards/chosen": 0.46481436491012573, "rewards/margins": 2.492050886154175, "rewards/rejected": -2.027236223220825, "step": 189 }, { "epoch": 0.45893719806763283, "grad_norm": 2.911334753036499, "learning_rate": 9.543478260869566e-07, "logits/chosen": -2.3882715702056885, "logits/rejected": -2.3408617973327637, "logps/chosen": -51.913021087646484, "logps/rejected": -95.26074981689453, "loss": 0.6213, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6998246908187866, "rewards/margins": 2.7332944869995117, "rewards/rejected": -2.0334699153900146, "step": 190 }, { "epoch": 0.4613526570048309, "grad_norm": 3.098024368286133, "learning_rate": 9.541062801932366e-07, "logits/chosen": -2.3080661296844482, "logits/rejected": -2.3045425415039062, "logps/chosen": -47.47381591796875, "logps/rejected": -89.44043731689453, "loss": 0.6058, "rewards/accuracies": 1.0, "rewards/chosen": 0.9837526082992554, "rewards/margins": 2.506476402282715, "rewards/rejected": -1.522723913192749, "step": 191 }, { "epoch": 0.463768115942029, "grad_norm": 4.20827579498291, "learning_rate": 9.538647342995168e-07, "logits/chosen": -2.4094550609588623, "logits/rejected": -2.373499870300293, "logps/chosen": -38.25192642211914, "logps/rejected": -91.17253112792969, "loss": 0.4753, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5052112340927124, "rewards/margins": 3.3027334213256836, "rewards/rejected": -1.7975223064422607, "step": 192 }, { "epoch": 0.46618357487922707, "grad_norm": 3.315048933029175, "learning_rate": 9.536231884057971e-07, "logits/chosen": -2.326158285140991, "logits/rejected": -2.377600908279419, "logps/chosen": -47.90401077270508, "logps/rejected": -95.71041107177734, "loss": 0.5984, "rewards/accuracies": 0.875, "rewards/chosen": 0.5246797800064087, "rewards/margins": 2.7931933403015137, "rewards/rejected": -2.2685136795043945, "step": 193 }, { "epoch": 0.46859903381642515, "grad_norm": 3.3850924968719482, "learning_rate": 9.533816425120772e-07, "logits/chosen": -2.4204535484313965, "logits/rejected": -2.3674874305725098, "logps/chosen": -43.82933807373047, "logps/rejected": -97.6148452758789, "loss": 0.5626, "rewards/accuracies": 1.0, "rewards/chosen": 0.9025270342826843, "rewards/margins": 3.184600830078125, "rewards/rejected": -2.282073974609375, "step": 194 }, { "epoch": 0.47101449275362317, "grad_norm": 3.596919536590576, "learning_rate": 9.531400966183575e-07, "logits/chosen": -2.389918327331543, "logits/rejected": -2.3200087547302246, "logps/chosen": -46.76996612548828, "logps/rejected": -93.4959945678711, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 1.1629520654678345, "rewards/margins": 3.0165724754333496, "rewards/rejected": -1.8536206483840942, "step": 195 }, { "epoch": 0.47342995169082125, "grad_norm": 3.3810629844665527, "learning_rate": 9.528985507246377e-07, "logits/chosen": -2.3733859062194824, "logits/rejected": -2.3469910621643066, "logps/chosen": -41.66775131225586, "logps/rejected": -95.92838287353516, "loss": 0.5475, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9303016662597656, "rewards/margins": 3.1371824741363525, "rewards/rejected": -2.2068803310394287, "step": 196 }, { "epoch": 0.4758454106280193, "grad_norm": 3.0492987632751465, "learning_rate": 9.526570048309178e-07, "logits/chosen": -2.4624180793762207, "logits/rejected": -2.4078421592712402, "logps/chosen": -55.92863464355469, "logps/rejected": -102.75527954101562, "loss": 0.6459, "rewards/accuracies": 0.875, "rewards/chosen": 0.3739568889141083, "rewards/margins": 2.6719298362731934, "rewards/rejected": -2.297973155975342, "step": 197 }, { "epoch": 0.4782608695652174, "grad_norm": 3.5942418575286865, "learning_rate": 9.52415458937198e-07, "logits/chosen": -2.431419849395752, "logits/rejected": -2.3602120876312256, "logps/chosen": -46.275856018066406, "logps/rejected": -95.09930419921875, "loss": 0.5697, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6769775152206421, "rewards/margins": 2.9234509468078613, "rewards/rejected": -2.246473550796509, "step": 198 }, { "epoch": 0.4806763285024155, "grad_norm": 3.1637861728668213, "learning_rate": 9.521739130434783e-07, "logits/chosen": -2.4953792095184326, "logits/rejected": -2.435885429382324, "logps/chosen": -50.1020622253418, "logps/rejected": -100.61132049560547, "loss": 0.6286, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5188183784484863, "rewards/margins": 2.902893543243408, "rewards/rejected": -2.384075164794922, "step": 199 }, { "epoch": 0.4830917874396135, "grad_norm": 3.3398678302764893, "learning_rate": 9.519323671497584e-07, "logits/chosen": -2.4248878955841064, "logits/rejected": -2.336984872817993, "logps/chosen": -51.37852096557617, "logps/rejected": -110.4693374633789, "loss": 0.574, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4616192877292633, "rewards/margins": 3.363734006881714, "rewards/rejected": -2.9021148681640625, "step": 200 }, { "epoch": 0.4855072463768116, "grad_norm": 4.415822982788086, "learning_rate": 9.516908212560386e-07, "logits/chosen": -2.422029972076416, "logits/rejected": -2.3866426944732666, "logps/chosen": -39.576141357421875, "logps/rejected": -87.60611724853516, "loss": 0.553, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4254077672958374, "rewards/margins": 3.0231995582580566, "rewards/rejected": -1.5977917909622192, "step": 201 }, { "epoch": 0.48792270531400966, "grad_norm": 3.3830578327178955, "learning_rate": 9.514492753623188e-07, "logits/chosen": -2.427609920501709, "logits/rejected": -2.441854476928711, "logps/chosen": -57.047889709472656, "logps/rejected": -110.34103393554688, "loss": 0.6045, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29646381735801697, "rewards/margins": 3.0993449687957764, "rewards/rejected": -2.8028812408447266, "step": 202 }, { "epoch": 0.49033816425120774, "grad_norm": 3.9149081707000732, "learning_rate": 9.512077294685989e-07, "logits/chosen": -2.400846242904663, "logits/rejected": -2.442136764526367, "logps/chosen": -50.93895721435547, "logps/rejected": -104.69891357421875, "loss": 0.5023, "rewards/accuracies": 1.0, "rewards/chosen": 0.8935902118682861, "rewards/margins": 3.6935276985168457, "rewards/rejected": -2.7999379634857178, "step": 203 }, { "epoch": 0.4927536231884058, "grad_norm": 3.7553887367248535, "learning_rate": 9.509661835748793e-07, "logits/chosen": -2.359243631362915, "logits/rejected": -2.411240577697754, "logps/chosen": -44.97690200805664, "logps/rejected": -87.23602294921875, "loss": 0.591, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0406075716018677, "rewards/margins": 2.7207460403442383, "rewards/rejected": -1.6801385879516602, "step": 204 }, { "epoch": 0.49516908212560384, "grad_norm": 3.5524635314941406, "learning_rate": 9.507246376811594e-07, "logits/chosen": -2.444673538208008, "logits/rejected": -2.39163875579834, "logps/chosen": -55.13975524902344, "logps/rejected": -108.74736022949219, "loss": 0.5607, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3003854751586914, "rewards/margins": 3.2231452465057373, "rewards/rejected": -2.922759771347046, "step": 205 }, { "epoch": 0.4975845410628019, "grad_norm": 3.896021842956543, "learning_rate": 9.504830917874395e-07, "logits/chosen": -2.456538200378418, "logits/rejected": -2.40910005569458, "logps/chosen": -40.733856201171875, "logps/rejected": -88.70094299316406, "loss": 0.5423, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1379235982894897, "rewards/margins": 3.0559377670288086, "rewards/rejected": -1.918014407157898, "step": 206 }, { "epoch": 0.5, "grad_norm": 3.889582872390747, "learning_rate": 9.502415458937198e-07, "logits/chosen": -2.4392447471618652, "logits/rejected": -2.4044132232666016, "logps/chosen": -38.73738479614258, "logps/rejected": -106.35417175292969, "loss": 0.4841, "rewards/accuracies": 0.9375, "rewards/chosen": 1.241590976715088, "rewards/margins": 4.030216217041016, "rewards/rejected": -2.7886252403259277, "step": 207 }, { "epoch": 0.5024154589371981, "grad_norm": 3.242716073989868, "learning_rate": 9.499999999999999e-07, "logits/chosen": -2.3536105155944824, "logits/rejected": -2.3697052001953125, "logps/chosen": -38.926353454589844, "logps/rejected": -88.21583557128906, "loss": 0.4863, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5739591121673584, "rewards/margins": 3.5646791458129883, "rewards/rejected": -1.9907199144363403, "step": 208 }, { "epoch": 0.5048309178743962, "grad_norm": 3.2055392265319824, "learning_rate": 9.497584541062801e-07, "logits/chosen": -2.487933874130249, "logits/rejected": -2.441579818725586, "logps/chosen": -46.268680572509766, "logps/rejected": -105.05789184570312, "loss": 0.5353, "rewards/accuracies": 1.0, "rewards/chosen": 1.0797475576400757, "rewards/margins": 3.6875109672546387, "rewards/rejected": -2.6077632904052734, "step": 209 }, { "epoch": 0.5072463768115942, "grad_norm": 3.1763722896575928, "learning_rate": 9.495169082125604e-07, "logits/chosen": -2.438654899597168, "logits/rejected": -2.416214942932129, "logps/chosen": -48.722312927246094, "logps/rejected": -101.77375030517578, "loss": 0.5553, "rewards/accuracies": 0.875, "rewards/chosen": 0.6768189668655396, "rewards/margins": 3.4670841693878174, "rewards/rejected": -2.7902653217315674, "step": 210 }, { "epoch": 0.5096618357487923, "grad_norm": 3.156026601791382, "learning_rate": 9.492753623188405e-07, "logits/chosen": -2.394307851791382, "logits/rejected": -2.39660382270813, "logps/chosen": -47.31946563720703, "logps/rejected": -95.24667358398438, "loss": 0.5688, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9515083432197571, "rewards/margins": 3.1701481342315674, "rewards/rejected": -2.218639850616455, "step": 211 }, { "epoch": 0.5120772946859904, "grad_norm": 3.440938949584961, "learning_rate": 9.490338164251207e-07, "logits/chosen": -2.3958911895751953, "logits/rejected": -2.3748457431793213, "logps/chosen": -51.14911651611328, "logps/rejected": -105.98675537109375, "loss": 0.5624, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6446655988693237, "rewards/margins": 3.2447850704193115, "rewards/rejected": -2.60011887550354, "step": 212 }, { "epoch": 0.5144927536231884, "grad_norm": 2.764230489730835, "learning_rate": 9.487922705314009e-07, "logits/chosen": -2.350107192993164, "logits/rejected": -2.3956823348999023, "logps/chosen": -51.581748962402344, "logps/rejected": -101.46576690673828, "loss": 0.5481, "rewards/accuracies": 0.875, "rewards/chosen": 0.734555721282959, "rewards/margins": 3.2804930210113525, "rewards/rejected": -2.5459377765655518, "step": 213 }, { "epoch": 0.5169082125603864, "grad_norm": 3.799715042114258, "learning_rate": 9.485507246376811e-07, "logits/chosen": -2.4780561923980713, "logits/rejected": -2.3993735313415527, "logps/chosen": -57.149749755859375, "logps/rejected": -109.66681671142578, "loss": 0.6515, "rewards/accuracies": 0.90625, "rewards/chosen": 0.004517808556556702, "rewards/margins": 2.889435052871704, "rewards/rejected": -2.8849172592163086, "step": 214 }, { "epoch": 0.5193236714975845, "grad_norm": 3.7341091632843018, "learning_rate": 9.483091787439614e-07, "logits/chosen": -2.346224784851074, "logits/rejected": -2.387739658355713, "logps/chosen": -48.18901062011719, "logps/rejected": -106.41963958740234, "loss": 0.5693, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6916549801826477, "rewards/margins": 3.5841057300567627, "rewards/rejected": -2.8924508094787598, "step": 215 }, { "epoch": 0.5217391304347826, "grad_norm": 2.958317279815674, "learning_rate": 9.480676328502415e-07, "logits/chosen": -2.3934478759765625, "logits/rejected": -2.396972179412842, "logps/chosen": -46.43822479248047, "logps/rejected": -102.78262329101562, "loss": 0.5709, "rewards/accuracies": 0.875, "rewards/chosen": 0.8175812363624573, "rewards/margins": 3.549955129623413, "rewards/rejected": -2.7323741912841797, "step": 216 }, { "epoch": 0.5241545893719807, "grad_norm": 4.131035804748535, "learning_rate": 9.478260869565216e-07, "logits/chosen": -2.4447474479675293, "logits/rejected": -2.409370183944702, "logps/chosen": -46.62114715576172, "logps/rejected": -97.40592956542969, "loss": 0.5189, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0597814321517944, "rewards/margins": 3.5377302169799805, "rewards/rejected": -2.4779486656188965, "step": 217 }, { "epoch": 0.5265700483091788, "grad_norm": 3.4151856899261475, "learning_rate": 9.475845410628019e-07, "logits/chosen": -2.473346471786499, "logits/rejected": -2.4444844722747803, "logps/chosen": -47.272884368896484, "logps/rejected": -97.06806945800781, "loss": 0.5553, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9617114067077637, "rewards/margins": 3.1642003059387207, "rewards/rejected": -2.202488899230957, "step": 218 }, { "epoch": 0.5289855072463768, "grad_norm": 3.867438554763794, "learning_rate": 9.473429951690821e-07, "logits/chosen": -2.4074182510375977, "logits/rejected": -2.415796995162964, "logps/chosen": -43.021583557128906, "logps/rejected": -104.1479263305664, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0376778841018677, "rewards/margins": 3.7066640853881836, "rewards/rejected": -2.6689860820770264, "step": 219 }, { "epoch": 0.5314009661835749, "grad_norm": 3.786442995071411, "learning_rate": 9.471014492753623e-07, "logits/chosen": -2.4401068687438965, "logits/rejected": -2.400364398956299, "logps/chosen": -51.390628814697266, "logps/rejected": -100.34043884277344, "loss": 0.6835, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2699607014656067, "rewards/margins": 2.788693904876709, "rewards/rejected": -2.518732786178589, "step": 220 }, { "epoch": 0.533816425120773, "grad_norm": 3.941727638244629, "learning_rate": 9.468599033816425e-07, "logits/chosen": -2.402312755584717, "logits/rejected": -2.4225306510925293, "logps/chosen": -42.824317932128906, "logps/rejected": -87.69535827636719, "loss": 0.5558, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1555919647216797, "rewards/margins": 3.063659191131592, "rewards/rejected": -1.9080675840377808, "step": 221 }, { "epoch": 0.5362318840579711, "grad_norm": 3.8628456592559814, "learning_rate": 9.466183574879226e-07, "logits/chosen": -2.4299259185791016, "logits/rejected": -2.373725175857544, "logps/chosen": -44.46099090576172, "logps/rejected": -100.47362518310547, "loss": 0.5541, "rewards/accuracies": 1.0, "rewards/chosen": 1.085592269897461, "rewards/margins": 3.526616334915161, "rewards/rejected": -2.4410243034362793, "step": 222 }, { "epoch": 0.538647342995169, "grad_norm": 3.247727632522583, "learning_rate": 9.463768115942029e-07, "logits/chosen": -2.3952796459198, "logits/rejected": -2.4046082496643066, "logps/chosen": -41.92512130737305, "logps/rejected": -89.20158386230469, "loss": 0.6164, "rewards/accuracies": 0.9375, "rewards/chosen": 1.235558271408081, "rewards/margins": 3.1582159996032715, "rewards/rejected": -1.92265784740448, "step": 223 }, { "epoch": 0.5410628019323671, "grad_norm": 3.187405586242676, "learning_rate": 9.461352657004831e-07, "logits/chosen": -2.5541656017303467, "logits/rejected": -2.502638339996338, "logps/chosen": -53.93061828613281, "logps/rejected": -94.06437683105469, "loss": 0.6028, "rewards/accuracies": 0.875, "rewards/chosen": 0.6719961762428284, "rewards/margins": 2.8900489807128906, "rewards/rejected": -2.218052864074707, "step": 224 }, { "epoch": 0.5434782608695652, "grad_norm": 4.409195899963379, "learning_rate": 9.458937198067632e-07, "logits/chosen": -2.423619508743286, "logits/rejected": -2.444389820098877, "logps/chosen": -40.1158332824707, "logps/rejected": -86.33052062988281, "loss": 0.5596, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6072686910629272, "rewards/margins": 3.2898802757263184, "rewards/rejected": -1.6826115846633911, "step": 225 }, { "epoch": 0.5458937198067633, "grad_norm": 2.683749198913574, "learning_rate": 9.456521739130434e-07, "logits/chosen": -2.3824515342712402, "logits/rejected": -2.415870189666748, "logps/chosen": -53.71796417236328, "logps/rejected": -113.54849243164062, "loss": 0.5103, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6734669804573059, "rewards/margins": 3.9396543502807617, "rewards/rejected": -3.2661871910095215, "step": 226 }, { "epoch": 0.5483091787439613, "grad_norm": 3.310263156890869, "learning_rate": 9.454106280193236e-07, "logits/chosen": -2.4408562183380127, "logits/rejected": -2.4611079692840576, "logps/chosen": -60.549949645996094, "logps/rejected": -113.5975112915039, "loss": 0.6055, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10353434085845947, "rewards/margins": 3.275341033935547, "rewards/rejected": -3.171806812286377, "step": 227 }, { "epoch": 0.5507246376811594, "grad_norm": 3.3727502822875977, "learning_rate": 9.451690821256039e-07, "logits/chosen": -2.45872163772583, "logits/rejected": -2.4018402099609375, "logps/chosen": -49.863529205322266, "logps/rejected": -92.95579528808594, "loss": 0.716, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5266175866127014, "rewards/margins": 2.530172109603882, "rewards/rejected": -2.003554344177246, "step": 228 }, { "epoch": 0.5531400966183575, "grad_norm": 3.3445546627044678, "learning_rate": 9.44927536231884e-07, "logits/chosen": -2.3928935527801514, "logits/rejected": -2.3595731258392334, "logps/chosen": -36.09088134765625, "logps/rejected": -95.80799102783203, "loss": 0.4518, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6852171421051025, "rewards/margins": 3.9573545455932617, "rewards/rejected": -2.272137403488159, "step": 229 }, { "epoch": 0.5555555555555556, "grad_norm": 3.730593204498291, "learning_rate": 9.446859903381642e-07, "logits/chosen": -2.394380807876587, "logits/rejected": -2.384265422821045, "logps/chosen": -39.81417465209961, "logps/rejected": -109.64883422851562, "loss": 0.4878, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2749314308166504, "rewards/margins": 4.160480499267578, "rewards/rejected": -2.8855490684509277, "step": 230 }, { "epoch": 0.5579710144927537, "grad_norm": 3.5951223373413086, "learning_rate": 9.444444444444444e-07, "logits/chosen": -2.406160593032837, "logits/rejected": -2.3955390453338623, "logps/chosen": -41.17851638793945, "logps/rejected": -95.32086181640625, "loss": 0.557, "rewards/accuracies": 0.875, "rewards/chosen": 1.065685510635376, "rewards/margins": 3.4217467308044434, "rewards/rejected": -2.3560614585876465, "step": 231 }, { "epoch": 0.5603864734299517, "grad_norm": 3.8430426120758057, "learning_rate": 9.442028985507245e-07, "logits/chosen": -2.381063222885132, "logits/rejected": -2.4095044136047363, "logps/chosen": -46.054969787597656, "logps/rejected": -104.06524658203125, "loss": 0.4997, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9116055369377136, "rewards/margins": 3.700819253921509, "rewards/rejected": -2.7892138957977295, "step": 232 }, { "epoch": 0.5628019323671497, "grad_norm": 2.9785237312316895, "learning_rate": 9.439613526570048e-07, "logits/chosen": -2.444920539855957, "logits/rejected": -2.3811769485473633, "logps/chosen": -48.54033279418945, "logps/rejected": -85.47212982177734, "loss": 0.6978, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0608608722686768, "rewards/margins": 2.4241366386413574, "rewards/rejected": -1.3632758855819702, "step": 233 }, { "epoch": 0.5652173913043478, "grad_norm": 3.964996576309204, "learning_rate": 9.43719806763285e-07, "logits/chosen": -2.4381163120269775, "logits/rejected": -2.4058594703674316, "logps/chosen": -52.223548889160156, "logps/rejected": -95.97307586669922, "loss": 0.6445, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7094055414199829, "rewards/margins": 2.7999107837677, "rewards/rejected": -2.090505361557007, "step": 234 }, { "epoch": 0.5676328502415459, "grad_norm": 3.2477035522460938, "learning_rate": 9.434782608695652e-07, "logits/chosen": -2.4647140502929688, "logits/rejected": -2.5267128944396973, "logps/chosen": -43.36991500854492, "logps/rejected": -96.81182861328125, "loss": 0.5104, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5163589715957642, "rewards/margins": 3.81447172164917, "rewards/rejected": -2.2981128692626953, "step": 235 }, { "epoch": 0.5700483091787439, "grad_norm": 3.011848211288452, "learning_rate": 9.432367149758454e-07, "logits/chosen": -2.53831148147583, "logits/rejected": -2.429218053817749, "logps/chosen": -38.14115524291992, "logps/rejected": -102.41593933105469, "loss": 0.4623, "rewards/accuracies": 0.875, "rewards/chosen": 1.4245160818099976, "rewards/margins": 4.311422348022461, "rewards/rejected": -2.8869059085845947, "step": 236 }, { "epoch": 0.572463768115942, "grad_norm": 3.794722318649292, "learning_rate": 9.429951690821255e-07, "logits/chosen": -2.369840383529663, "logits/rejected": -2.391162395477295, "logps/chosen": -39.347007751464844, "logps/rejected": -88.1730728149414, "loss": 0.6, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7000272274017334, "rewards/margins": 3.318488121032715, "rewards/rejected": -1.618461012840271, "step": 237 }, { "epoch": 0.5748792270531401, "grad_norm": 5.997779846191406, "learning_rate": 9.427536231884058e-07, "logits/chosen": -2.4193248748779297, "logits/rejected": -2.384380340576172, "logps/chosen": -35.26355743408203, "logps/rejected": -96.23162841796875, "loss": 0.4352, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7399412393569946, "rewards/margins": 4.066744327545166, "rewards/rejected": -2.326802968978882, "step": 238 }, { "epoch": 0.5772946859903382, "grad_norm": 3.317352294921875, "learning_rate": 9.42512077294686e-07, "logits/chosen": -2.384012460708618, "logits/rejected": -2.4061436653137207, "logps/chosen": -41.02629089355469, "logps/rejected": -91.09063720703125, "loss": 0.5634, "rewards/accuracies": 0.875, "rewards/chosen": 1.2878574132919312, "rewards/margins": 3.3559470176696777, "rewards/rejected": -2.068089485168457, "step": 239 }, { "epoch": 0.5797101449275363, "grad_norm": 4.495738983154297, "learning_rate": 9.422705314009661e-07, "logits/chosen": -2.441499948501587, "logits/rejected": -2.437145233154297, "logps/chosen": -51.59661865234375, "logps/rejected": -108.27403259277344, "loss": 0.5803, "rewards/accuracies": 0.9375, "rewards/chosen": 0.639395534992218, "rewards/margins": 3.709301710128784, "rewards/rejected": -3.069906234741211, "step": 240 }, { "epoch": 0.5821256038647343, "grad_norm": 4.088237285614014, "learning_rate": 9.420289855072463e-07, "logits/chosen": -2.3894917964935303, "logits/rejected": -2.4259657859802246, "logps/chosen": -50.434837341308594, "logps/rejected": -99.36415100097656, "loss": 0.6189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5635926723480225, "rewards/margins": 3.175659656524658, "rewards/rejected": -2.6120667457580566, "step": 241 }, { "epoch": 0.5845410628019324, "grad_norm": 3.1718413829803467, "learning_rate": 9.417874396135266e-07, "logits/chosen": -2.46435809135437, "logits/rejected": -2.4769890308380127, "logps/chosen": -54.13407897949219, "logps/rejected": -109.25830841064453, "loss": 0.6027, "rewards/accuracies": 0.90625, "rewards/chosen": 0.48038387298583984, "rewards/margins": 3.65164852142334, "rewards/rejected": -3.171265125274658, "step": 242 }, { "epoch": 0.5869565217391305, "grad_norm": 4.950348854064941, "learning_rate": 9.415458937198067e-07, "logits/chosen": -2.5045547485351562, "logits/rejected": -2.4651477336883545, "logps/chosen": -52.496620178222656, "logps/rejected": -113.6917495727539, "loss": 0.5574, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3850274682044983, "rewards/margins": 3.577120780944824, "rewards/rejected": -3.1920931339263916, "step": 243 }, { "epoch": 0.5893719806763285, "grad_norm": 5.91154146194458, "learning_rate": 9.41304347826087e-07, "logits/chosen": -2.441277027130127, "logits/rejected": -2.409719467163086, "logps/chosen": -41.49176788330078, "logps/rejected": -94.32681274414062, "loss": 0.5527, "rewards/accuracies": 1.0, "rewards/chosen": 1.3833489418029785, "rewards/margins": 3.664475440979004, "rewards/rejected": -2.2811262607574463, "step": 244 }, { "epoch": 0.5917874396135265, "grad_norm": 7.646483421325684, "learning_rate": 9.410628019323671e-07, "logits/chosen": -2.4512205123901367, "logits/rejected": -2.4398441314697266, "logps/chosen": -52.204444885253906, "logps/rejected": -104.05003356933594, "loss": 0.5652, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6225485801696777, "rewards/margins": 3.4874958992004395, "rewards/rejected": -2.8649470806121826, "step": 245 }, { "epoch": 0.5942028985507246, "grad_norm": 3.745530128479004, "learning_rate": 9.408212560386472e-07, "logits/chosen": -2.4477884769439697, "logits/rejected": -2.452491283416748, "logps/chosen": -50.10590362548828, "logps/rejected": -107.38623809814453, "loss": 0.5425, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8860993981361389, "rewards/margins": 3.7136192321777344, "rewards/rejected": -2.827519655227661, "step": 246 }, { "epoch": 0.5966183574879227, "grad_norm": 3.2826061248779297, "learning_rate": 9.405797101449276e-07, "logits/chosen": -2.5508618354797363, "logits/rejected": -2.4886107444763184, "logps/chosen": -37.5369758605957, "logps/rejected": -110.14015197753906, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 1.8560903072357178, "rewards/margins": 4.861093521118164, "rewards/rejected": -3.0050032138824463, "step": 247 }, { "epoch": 0.5990338164251208, "grad_norm": 3.4868226051330566, "learning_rate": 9.403381642512077e-07, "logits/chosen": -2.4821996688842773, "logits/rejected": -2.4565956592559814, "logps/chosen": -47.37244415283203, "logps/rejected": -95.37382507324219, "loss": 0.6414, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8975226879119873, "rewards/margins": 3.0713915824890137, "rewards/rejected": -2.1738688945770264, "step": 248 }, { "epoch": 0.6014492753623188, "grad_norm": 3.2455267906188965, "learning_rate": 9.400966183574878e-07, "logits/chosen": -2.409959554672241, "logits/rejected": -2.39658784866333, "logps/chosen": -47.110191345214844, "logps/rejected": -95.70393371582031, "loss": 0.5664, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8924843668937683, "rewards/margins": 3.238168478012085, "rewards/rejected": -2.3456838130950928, "step": 249 }, { "epoch": 0.6014492753623188, "eval_logits/chosen": -2.4716429710388184, "eval_logits/rejected": -2.4732799530029297, "eval_logps/chosen": -42.89160919189453, "eval_logps/rejected": -104.71270751953125, "eval_loss": 0.48780861496925354, "eval_rewards/accuracies": 0.9595588445663452, "eval_rewards/chosen": 1.1769310235977173, "eval_rewards/margins": 4.0148024559021, "eval_rewards/rejected": -2.8378708362579346, "eval_runtime": 996.0923, "eval_samples_per_second": 0.546, "eval_steps_per_second": 0.273, "step": 249 }, { "epoch": 0.6038647342995169, "grad_norm": 4.14283561706543, "learning_rate": 9.398550724637681e-07, "logits/chosen": -2.360948085784912, "logits/rejected": -2.397918462753296, "logps/chosen": -36.66550064086914, "logps/rejected": -96.7926254272461, "loss": 0.4892, "rewards/accuracies": 1.0, "rewards/chosen": 1.710310459136963, "rewards/margins": 4.151803016662598, "rewards/rejected": -2.4414925575256348, "step": 250 }, { "epoch": 0.606280193236715, "grad_norm": 3.1621694564819336, "learning_rate": 9.396135265700482e-07, "logits/chosen": -2.4788615703582764, "logits/rejected": -2.4912185668945312, "logps/chosen": -43.36605453491211, "logps/rejected": -107.74087524414062, "loss": 0.5196, "rewards/accuracies": 0.90625, "rewards/chosen": 1.422530174255371, "rewards/margins": 4.105506896972656, "rewards/rejected": -2.6829771995544434, "step": 251 }, { "epoch": 0.6086956521739131, "grad_norm": 3.6260292530059814, "learning_rate": 9.393719806763286e-07, "logits/chosen": -2.377248764038086, "logits/rejected": -2.3911399841308594, "logps/chosen": -35.16883850097656, "logps/rejected": -105.0933837890625, "loss": 0.3995, "rewards/accuracies": 0.96875, "rewards/chosen": 1.963022232055664, "rewards/margins": 4.780416488647461, "rewards/rejected": -2.8173940181732178, "step": 252 }, { "epoch": 0.6111111111111112, "grad_norm": 4.684390068054199, "learning_rate": 9.391304347826087e-07, "logits/chosen": -2.4666593074798584, "logits/rejected": -2.3853261470794678, "logps/chosen": -61.31965255737305, "logps/rejected": -115.84500885009766, "loss": 0.6535, "rewards/accuracies": 0.84375, "rewards/chosen": -0.11704443395137787, "rewards/margins": 3.311511516571045, "rewards/rejected": -3.428555965423584, "step": 253 }, { "epoch": 0.6135265700483091, "grad_norm": 3.0757408142089844, "learning_rate": 9.388888888888888e-07, "logits/chosen": -2.4808802604675293, "logits/rejected": -2.462473154067993, "logps/chosen": -50.92802047729492, "logps/rejected": -105.9898681640625, "loss": 0.5454, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8108518719673157, "rewards/margins": 3.7495203018188477, "rewards/rejected": -2.938668727874756, "step": 254 }, { "epoch": 0.6159420289855072, "grad_norm": 2.963932752609253, "learning_rate": 9.386473429951691e-07, "logits/chosen": -2.420710563659668, "logits/rejected": -2.40617299079895, "logps/chosen": -45.81290054321289, "logps/rejected": -104.33000183105469, "loss": 0.5378, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9166219830513, "rewards/margins": 3.916682243347168, "rewards/rejected": -3.0000603199005127, "step": 255 }, { "epoch": 0.6183574879227053, "grad_norm": 4.001266002655029, "learning_rate": 9.384057971014492e-07, "logits/chosen": -2.457606792449951, "logits/rejected": -2.446268320083618, "logps/chosen": -42.17203140258789, "logps/rejected": -107.0770263671875, "loss": 0.5305, "rewards/accuracies": 0.9375, "rewards/chosen": 0.964564323425293, "rewards/margins": 3.8652448654174805, "rewards/rejected": -2.9006805419921875, "step": 256 }, { "epoch": 0.6207729468599034, "grad_norm": 4.3497314453125, "learning_rate": 9.381642512077294e-07, "logits/chosen": -2.4794747829437256, "logits/rejected": -2.390202522277832, "logps/chosen": -35.30174255371094, "logps/rejected": -98.04337310791016, "loss": 0.5123, "rewards/accuracies": 0.96875, "rewards/chosen": 1.799075961112976, "rewards/margins": 4.166637420654297, "rewards/rejected": -2.3675615787506104, "step": 257 }, { "epoch": 0.6231884057971014, "grad_norm": 3.047870397567749, "learning_rate": 9.379227053140097e-07, "logits/chosen": -2.4626684188842773, "logits/rejected": -2.469101905822754, "logps/chosen": -51.186248779296875, "logps/rejected": -111.39012145996094, "loss": 0.5397, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8780765533447266, "rewards/margins": 4.339692115783691, "rewards/rejected": -3.461615562438965, "step": 258 }, { "epoch": 0.6256038647342995, "grad_norm": 3.522627115249634, "learning_rate": 9.376811594202898e-07, "logits/chosen": -2.474407434463501, "logits/rejected": -2.428779125213623, "logps/chosen": -46.51958465576172, "logps/rejected": -101.05657958984375, "loss": 0.5373, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1416734457015991, "rewards/margins": 3.580061435699463, "rewards/rejected": -2.438387632369995, "step": 259 }, { "epoch": 0.6280193236714976, "grad_norm": 3.228879928588867, "learning_rate": 9.374396135265699e-07, "logits/chosen": -2.471735715866089, "logits/rejected": -2.483614921569824, "logps/chosen": -60.286983489990234, "logps/rejected": -111.64439392089844, "loss": 0.6005, "rewards/accuracies": 0.875, "rewards/chosen": 0.3195608854293823, "rewards/margins": 3.7289159297943115, "rewards/rejected": -3.4093551635742188, "step": 260 }, { "epoch": 0.6304347826086957, "grad_norm": 3.8116354942321777, "learning_rate": 9.371980676328503e-07, "logits/chosen": -2.4746780395507812, "logits/rejected": -2.4583680629730225, "logps/chosen": -39.57741928100586, "logps/rejected": -93.15548706054688, "loss": 0.5629, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4204787015914917, "rewards/margins": 3.6340742111206055, "rewards/rejected": -2.2135958671569824, "step": 261 }, { "epoch": 0.6328502415458938, "grad_norm": 4.396117210388184, "learning_rate": 9.369565217391304e-07, "logits/chosen": -2.495042324066162, "logits/rejected": -2.4509313106536865, "logps/chosen": -41.425323486328125, "logps/rejected": -104.09886169433594, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 1.1843721866607666, "rewards/margins": 4.094210624694824, "rewards/rejected": -2.9098386764526367, "step": 262 }, { "epoch": 0.6352657004830918, "grad_norm": 4.409356594085693, "learning_rate": 9.367149758454106e-07, "logits/chosen": -2.3715555667877197, "logits/rejected": -2.3837246894836426, "logps/chosen": -45.99847412109375, "logps/rejected": -99.0431137084961, "loss": 0.5874, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0364314317703247, "rewards/margins": 3.637906074523926, "rewards/rejected": -2.6014750003814697, "step": 263 }, { "epoch": 0.6376811594202898, "grad_norm": 3.889432907104492, "learning_rate": 9.364734299516908e-07, "logits/chosen": -2.4608609676361084, "logits/rejected": -2.4622559547424316, "logps/chosen": -51.41381072998047, "logps/rejected": -102.35616302490234, "loss": 0.6067, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7577417492866516, "rewards/margins": 3.5548319816589355, "rewards/rejected": -2.7970900535583496, "step": 264 }, { "epoch": 0.6400966183574879, "grad_norm": 3.8473544120788574, "learning_rate": 9.362318840579709e-07, "logits/chosen": -2.4411072731018066, "logits/rejected": -2.3955070972442627, "logps/chosen": -39.39863204956055, "logps/rejected": -88.08616638183594, "loss": 0.5856, "rewards/accuracies": 0.90625, "rewards/chosen": 1.371360182762146, "rewards/margins": 3.4231200218200684, "rewards/rejected": -2.051759958267212, "step": 265 }, { "epoch": 0.642512077294686, "grad_norm": 3.784208059310913, "learning_rate": 9.359903381642512e-07, "logits/chosen": -2.454892158508301, "logits/rejected": -2.459202766418457, "logps/chosen": -42.713958740234375, "logps/rejected": -105.99848175048828, "loss": 0.5537, "rewards/accuracies": 0.90625, "rewards/chosen": 1.128781795501709, "rewards/margins": 4.154096603393555, "rewards/rejected": -3.0253148078918457, "step": 266 }, { "epoch": 0.644927536231884, "grad_norm": 3.0883102416992188, "learning_rate": 9.357487922705314e-07, "logits/chosen": -2.4261229038238525, "logits/rejected": -2.460330009460449, "logps/chosen": -44.9234504699707, "logps/rejected": -97.42649841308594, "loss": 0.5345, "rewards/accuracies": 0.875, "rewards/chosen": 1.056779146194458, "rewards/margins": 3.70289945602417, "rewards/rejected": -2.646120071411133, "step": 267 }, { "epoch": 0.6473429951690821, "grad_norm": 6.923266887664795, "learning_rate": 9.355072463768115e-07, "logits/chosen": -2.4814324378967285, "logits/rejected": -2.471012592315674, "logps/chosen": -36.065948486328125, "logps/rejected": -86.52034759521484, "loss": 0.5621, "rewards/accuracies": 0.90625, "rewards/chosen": 1.87603759765625, "rewards/margins": 3.6324853897094727, "rewards/rejected": -1.7564477920532227, "step": 268 }, { "epoch": 0.6497584541062802, "grad_norm": 3.40692400932312, "learning_rate": 9.352657004830917e-07, "logits/chosen": -2.4832828044891357, "logits/rejected": -2.4490790367126465, "logps/chosen": -49.95439910888672, "logps/rejected": -108.77017974853516, "loss": 0.6097, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7744916677474976, "rewards/margins": 3.776317596435547, "rewards/rejected": -3.001826047897339, "step": 269 }, { "epoch": 0.6521739130434783, "grad_norm": 3.5461783409118652, "learning_rate": 9.350241545893719e-07, "logits/chosen": -2.3894731998443604, "logits/rejected": -2.4238011837005615, "logps/chosen": -39.08974075317383, "logps/rejected": -96.6617660522461, "loss": 0.5503, "rewards/accuracies": 0.84375, "rewards/chosen": 1.556689739227295, "rewards/margins": 3.950803279876709, "rewards/rejected": -2.394113540649414, "step": 270 }, { "epoch": 0.6545893719806763, "grad_norm": 4.17071008682251, "learning_rate": 9.347826086956522e-07, "logits/chosen": -2.460489511489868, "logits/rejected": -2.4112515449523926, "logps/chosen": -41.53469467163086, "logps/rejected": -94.22862243652344, "loss": 0.6073, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3276817798614502, "rewards/margins": 3.509056568145752, "rewards/rejected": -2.1813747882843018, "step": 271 }, { "epoch": 0.6570048309178744, "grad_norm": 4.861179351806641, "learning_rate": 9.345410628019324e-07, "logits/chosen": -2.4638004302978516, "logits/rejected": -2.4834892749786377, "logps/chosen": -52.02912902832031, "logps/rejected": -112.48854064941406, "loss": 0.5593, "rewards/accuracies": 1.0, "rewards/chosen": 0.876820981502533, "rewards/margins": 4.187595844268799, "rewards/rejected": -3.3107750415802, "step": 272 }, { "epoch": 0.6594202898550725, "grad_norm": 4.374242305755615, "learning_rate": 9.342995169082125e-07, "logits/chosen": -2.505175828933716, "logits/rejected": -2.5256664752960205, "logps/chosen": -43.506866455078125, "logps/rejected": -94.98977661132812, "loss": 0.5695, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2981172800064087, "rewards/margins": 3.5503687858581543, "rewards/rejected": -2.252251148223877, "step": 273 }, { "epoch": 0.6618357487922706, "grad_norm": 3.8685474395751953, "learning_rate": 9.340579710144927e-07, "logits/chosen": -2.4631712436676025, "logits/rejected": -2.417880058288574, "logps/chosen": -40.08359146118164, "logps/rejected": -101.81269073486328, "loss": 0.4988, "rewards/accuracies": 1.0, "rewards/chosen": 1.3852556943893433, "rewards/margins": 4.0965576171875, "rewards/rejected": -2.7113020420074463, "step": 274 }, { "epoch": 0.6642512077294686, "grad_norm": 3.049639940261841, "learning_rate": 9.338164251207729e-07, "logits/chosen": -2.5660221576690674, "logits/rejected": -2.5611515045166016, "logps/chosen": -47.125396728515625, "logps/rejected": -108.1530532836914, "loss": 0.5283, "rewards/accuracies": 0.9375, "rewards/chosen": 1.226516604423523, "rewards/margins": 4.578337669372559, "rewards/rejected": -3.3518214225769043, "step": 275 }, { "epoch": 0.6666666666666666, "grad_norm": 5.573445796966553, "learning_rate": 9.335748792270531e-07, "logits/chosen": -2.506178855895996, "logits/rejected": -2.4921820163726807, "logps/chosen": -34.040470123291016, "logps/rejected": -97.97453308105469, "loss": 0.4953, "rewards/accuracies": 1.0, "rewards/chosen": 1.8417214155197144, "rewards/margins": 4.145375728607178, "rewards/rejected": -2.303654193878174, "step": 276 }, { "epoch": 0.6690821256038647, "grad_norm": 3.5105135440826416, "learning_rate": 9.333333333333333e-07, "logits/chosen": -2.5574324131011963, "logits/rejected": -2.4971914291381836, "logps/chosen": -37.27132034301758, "logps/rejected": -87.67200469970703, "loss": 0.586, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7128596305847168, "rewards/margins": 3.595778465270996, "rewards/rejected": -1.8829188346862793, "step": 277 }, { "epoch": 0.6714975845410628, "grad_norm": 3.7591354846954346, "learning_rate": 9.330917874396135e-07, "logits/chosen": -2.4639320373535156, "logits/rejected": -2.4484546184539795, "logps/chosen": -35.75276184082031, "logps/rejected": -110.97541809082031, "loss": 0.4081, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7291368246078491, "rewards/margins": 5.006564140319824, "rewards/rejected": -3.2774271965026855, "step": 278 }, { "epoch": 0.6739130434782609, "grad_norm": 3.730602741241455, "learning_rate": 9.328502415458937e-07, "logits/chosen": -2.558556318283081, "logits/rejected": -2.5353739261627197, "logps/chosen": -49.384395599365234, "logps/rejected": -96.8267593383789, "loss": 0.6496, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7367831468582153, "rewards/margins": 3.1404032707214355, "rewards/rejected": -2.4036197662353516, "step": 279 }, { "epoch": 0.6763285024154589, "grad_norm": 3.685056447982788, "learning_rate": 9.326086956521738e-07, "logits/chosen": -2.3933863639831543, "logits/rejected": -2.470055341720581, "logps/chosen": -48.40487289428711, "logps/rejected": -105.58142852783203, "loss": 0.573, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5415738821029663, "rewards/margins": 3.8410732746124268, "rewards/rejected": -3.29949951171875, "step": 280 }, { "epoch": 0.678743961352657, "grad_norm": 4.127350807189941, "learning_rate": 9.323671497584541e-07, "logits/chosen": -2.501847982406616, "logits/rejected": -2.417461633682251, "logps/chosen": -42.85173034667969, "logps/rejected": -111.04852294921875, "loss": 0.5025, "rewards/accuracies": 0.9375, "rewards/chosen": 1.046492099761963, "rewards/margins": 4.25037956237793, "rewards/rejected": -3.203887701034546, "step": 281 }, { "epoch": 0.6811594202898551, "grad_norm": 3.781649112701416, "learning_rate": 9.321256038647343e-07, "logits/chosen": -2.4697999954223633, "logits/rejected": -2.4422378540039062, "logps/chosen": -52.544273376464844, "logps/rejected": -113.23193359375, "loss": 0.5992, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6133925318717957, "rewards/margins": 4.148641109466553, "rewards/rejected": -3.5352485179901123, "step": 282 }, { "epoch": 0.6835748792270532, "grad_norm": 4.524843692779541, "learning_rate": 9.318840579710144e-07, "logits/chosen": -2.5450165271759033, "logits/rejected": -2.499415159225464, "logps/chosen": -47.776485443115234, "logps/rejected": -113.05329132080078, "loss": 0.4703, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9108985662460327, "rewards/margins": 4.27054500579834, "rewards/rejected": -3.3596463203430176, "step": 283 }, { "epoch": 0.6859903381642513, "grad_norm": 3.5436666011810303, "learning_rate": 9.316425120772946e-07, "logits/chosen": -2.433868885040283, "logits/rejected": -2.4116101264953613, "logps/chosen": -45.73544692993164, "logps/rejected": -101.9124984741211, "loss": 0.5852, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7871979475021362, "rewards/margins": 3.6957921981811523, "rewards/rejected": -2.9085936546325684, "step": 284 }, { "epoch": 0.6884057971014492, "grad_norm": 3.8961846828460693, "learning_rate": 9.314009661835749e-07, "logits/chosen": -2.4389774799346924, "logits/rejected": -2.4834609031677246, "logps/chosen": -52.68549346923828, "logps/rejected": -104.46746826171875, "loss": 0.576, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7177494764328003, "rewards/margins": 3.910492181777954, "rewards/rejected": -3.1927428245544434, "step": 285 }, { "epoch": 0.6908212560386473, "grad_norm": 4.34315299987793, "learning_rate": 9.31159420289855e-07, "logits/chosen": -2.4344184398651123, "logits/rejected": -2.47406005859375, "logps/chosen": -48.8686408996582, "logps/rejected": -110.35588073730469, "loss": 0.5317, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8948407173156738, "rewards/margins": 4.355882167816162, "rewards/rejected": -3.4610414505004883, "step": 286 }, { "epoch": 0.6932367149758454, "grad_norm": 5.2951507568359375, "learning_rate": 9.309178743961353e-07, "logits/chosen": -2.5292327404022217, "logits/rejected": -2.4818336963653564, "logps/chosen": -28.315349578857422, "logps/rejected": -95.68604278564453, "loss": 0.4079, "rewards/accuracies": 0.875, "rewards/chosen": 2.175311326980591, "rewards/margins": 4.786733150482178, "rewards/rejected": -2.611422061920166, "step": 287 }, { "epoch": 0.6956521739130435, "grad_norm": 3.5735387802124023, "learning_rate": 9.306763285024154e-07, "logits/chosen": -2.5724384784698486, "logits/rejected": -2.5214757919311523, "logps/chosen": -46.41236877441406, "logps/rejected": -97.61429595947266, "loss": 0.627, "rewards/accuracies": 0.875, "rewards/chosen": 0.998763918876648, "rewards/margins": 3.6582725048065186, "rewards/rejected": -2.65950870513916, "step": 288 }, { "epoch": 0.6980676328502415, "grad_norm": 3.4348676204681396, "learning_rate": 9.304347826086955e-07, "logits/chosen": -2.477363109588623, "logits/rejected": -2.4562220573425293, "logps/chosen": -43.65902328491211, "logps/rejected": -112.35236358642578, "loss": 0.5409, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7880632877349854, "rewards/margins": 4.354307651519775, "rewards/rejected": -3.566244125366211, "step": 289 }, { "epoch": 0.7004830917874396, "grad_norm": 3.7520902156829834, "learning_rate": 9.301932367149759e-07, "logits/chosen": -2.531388282775879, "logits/rejected": -2.5469586849212646, "logps/chosen": -35.6341552734375, "logps/rejected": -93.1145248413086, "loss": 0.5316, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8114008903503418, "rewards/margins": 3.821187973022461, "rewards/rejected": -2.00978684425354, "step": 290 }, { "epoch": 0.7028985507246377, "grad_norm": 4.0306549072265625, "learning_rate": 9.29951690821256e-07, "logits/chosen": -2.452010154724121, "logits/rejected": -2.4940145015716553, "logps/chosen": -48.16948699951172, "logps/rejected": -105.96197509765625, "loss": 0.5344, "rewards/accuracies": 0.90625, "rewards/chosen": 1.143178105354309, "rewards/margins": 4.2155327796936035, "rewards/rejected": -3.072354555130005, "step": 291 }, { "epoch": 0.7053140096618358, "grad_norm": 3.6885743141174316, "learning_rate": 9.297101449275361e-07, "logits/chosen": -2.590538740158081, "logits/rejected": -2.4985806941986084, "logps/chosen": -37.333518981933594, "logps/rejected": -107.58997344970703, "loss": 0.4859, "rewards/accuracies": 0.875, "rewards/chosen": 1.2867155075073242, "rewards/margins": 4.705907821655273, "rewards/rejected": -3.41919207572937, "step": 292 }, { "epoch": 0.7077294685990339, "grad_norm": 3.847398042678833, "learning_rate": 9.294685990338164e-07, "logits/chosen": -2.539038896560669, "logits/rejected": -2.5722336769104004, "logps/chosen": -40.100311279296875, "logps/rejected": -114.50240325927734, "loss": 0.4197, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6050822734832764, "rewards/margins": 5.148462295532227, "rewards/rejected": -3.54338002204895, "step": 293 }, { "epoch": 0.7101449275362319, "grad_norm": 3.6353635787963867, "learning_rate": 9.292270531400965e-07, "logits/chosen": -2.5322189331054688, "logits/rejected": -2.51462459564209, "logps/chosen": -42.51466369628906, "logps/rejected": -103.5584487915039, "loss": 0.4851, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4490764141082764, "rewards/margins": 4.463650703430176, "rewards/rejected": -3.0145740509033203, "step": 294 }, { "epoch": 0.7125603864734299, "grad_norm": 3.9232349395751953, "learning_rate": 9.289855072463769e-07, "logits/chosen": -2.5030012130737305, "logits/rejected": -2.5241472721099854, "logps/chosen": -40.01308059692383, "logps/rejected": -99.81531524658203, "loss": 0.5001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3216077089309692, "rewards/margins": 4.1550445556640625, "rewards/rejected": -2.833437204360962, "step": 295 }, { "epoch": 0.714975845410628, "grad_norm": 5.955278396606445, "learning_rate": 9.28743961352657e-07, "logits/chosen": -2.5334649085998535, "logits/rejected": -2.5370395183563232, "logps/chosen": -51.74592590332031, "logps/rejected": -132.45230102539062, "loss": 0.4721, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6958240866661072, "rewards/margins": 5.625500202178955, "rewards/rejected": -4.929676055908203, "step": 296 }, { "epoch": 0.717391304347826, "grad_norm": 4.075846195220947, "learning_rate": 9.285024154589371e-07, "logits/chosen": -2.3859786987304688, "logits/rejected": -2.394787549972534, "logps/chosen": -42.330360412597656, "logps/rejected": -115.19036102294922, "loss": 0.4546, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1815701723098755, "rewards/margins": 4.881466388702393, "rewards/rejected": -3.6998960971832275, "step": 297 }, { "epoch": 0.7198067632850241, "grad_norm": 3.873109817504883, "learning_rate": 9.282608695652174e-07, "logits/chosen": -2.6174886226654053, "logits/rejected": -2.5393567085266113, "logps/chosen": -38.9787712097168, "logps/rejected": -110.37605285644531, "loss": 0.4516, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5842173099517822, "rewards/margins": 5.032350063323975, "rewards/rejected": -3.4481325149536133, "step": 298 }, { "epoch": 0.7222222222222222, "grad_norm": 2.957063913345337, "learning_rate": 9.280193236714975e-07, "logits/chosen": -2.4539012908935547, "logits/rejected": -2.408306121826172, "logps/chosen": -49.481529235839844, "logps/rejected": -98.05957794189453, "loss": 0.6276, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6427246332168579, "rewards/margins": 3.4373250007629395, "rewards/rejected": -2.794600248336792, "step": 299 }, { "epoch": 0.7246376811594203, "grad_norm": 4.045949935913086, "learning_rate": 9.277777777777777e-07, "logits/chosen": -2.463663101196289, "logits/rejected": -2.423729181289673, "logps/chosen": -45.49736785888672, "logps/rejected": -102.10606384277344, "loss": 0.5234, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0643390417099, "rewards/margins": 4.145237445831299, "rewards/rejected": -3.0808982849121094, "step": 300 }, { "epoch": 0.7270531400966184, "grad_norm": 3.9130823612213135, "learning_rate": 9.27536231884058e-07, "logits/chosen": -2.516166925430298, "logits/rejected": -2.4690308570861816, "logps/chosen": -45.76476287841797, "logps/rejected": -106.1125259399414, "loss": 0.5335, "rewards/accuracies": 1.0, "rewards/chosen": 1.1567177772521973, "rewards/margins": 4.561819553375244, "rewards/rejected": -3.405102014541626, "step": 301 }, { "epoch": 0.7294685990338164, "grad_norm": 3.6625568866729736, "learning_rate": 9.272946859903381e-07, "logits/chosen": -2.5427350997924805, "logits/rejected": -2.5435023307800293, "logps/chosen": -35.4171028137207, "logps/rejected": -91.94075775146484, "loss": 0.538, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9605871438980103, "rewards/margins": 4.3710103034973145, "rewards/rejected": -2.4104232788085938, "step": 302 }, { "epoch": 0.7318840579710145, "grad_norm": 3.741793632507324, "learning_rate": 9.270531400966182e-07, "logits/chosen": -2.477916717529297, "logits/rejected": -2.5330867767333984, "logps/chosen": -51.77217483520508, "logps/rejected": -107.24810791015625, "loss": 0.6272, "rewards/accuracies": 0.90625, "rewards/chosen": 0.561506450176239, "rewards/margins": 3.616818904876709, "rewards/rejected": -3.0553126335144043, "step": 303 }, { "epoch": 0.7342995169082126, "grad_norm": 4.195017337799072, "learning_rate": 9.268115942028986e-07, "logits/chosen": -2.4953858852386475, "logits/rejected": -2.5151872634887695, "logps/chosen": -49.257843017578125, "logps/rejected": -133.3972930908203, "loss": 0.4343, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7202117443084717, "rewards/margins": 5.606847763061523, "rewards/rejected": -4.886634826660156, "step": 304 }, { "epoch": 0.7367149758454107, "grad_norm": 4.846376895904541, "learning_rate": 9.265700483091787e-07, "logits/chosen": -2.4883742332458496, "logits/rejected": -2.4893953800201416, "logps/chosen": -44.051231384277344, "logps/rejected": -102.44544982910156, "loss": 0.5687, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0885162353515625, "rewards/margins": 3.9487478733062744, "rewards/rejected": -2.860231399536133, "step": 305 }, { "epoch": 0.7391304347826086, "grad_norm": 6.109338283538818, "learning_rate": 9.263285024154589e-07, "logits/chosen": -2.4926252365112305, "logits/rejected": -2.472515821456909, "logps/chosen": -44.72395324707031, "logps/rejected": -106.52933502197266, "loss": 0.5591, "rewards/accuracies": 1.0, "rewards/chosen": 0.9680125713348389, "rewards/margins": 4.280492305755615, "rewards/rejected": -3.3124799728393555, "step": 306 }, { "epoch": 0.7415458937198067, "grad_norm": 3.1914288997650146, "learning_rate": 9.260869565217391e-07, "logits/chosen": -2.4412739276885986, "logits/rejected": -2.450779914855957, "logps/chosen": -46.37862014770508, "logps/rejected": -111.28561401367188, "loss": 0.5793, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9476751089096069, "rewards/margins": 4.325104713439941, "rewards/rejected": -3.377429485321045, "step": 307 }, { "epoch": 0.7439613526570048, "grad_norm": 3.6168246269226074, "learning_rate": 9.258454106280192e-07, "logits/chosen": -2.5140669345855713, "logits/rejected": -2.5396032333374023, "logps/chosen": -39.95458984375, "logps/rejected": -97.33784484863281, "loss": 0.5377, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6031887531280518, "rewards/margins": 4.179357051849365, "rewards/rejected": -2.5761680603027344, "step": 308 }, { "epoch": 0.7463768115942029, "grad_norm": 4.433737754821777, "learning_rate": 9.256038647342995e-07, "logits/chosen": -2.596993923187256, "logits/rejected": -2.5456275939941406, "logps/chosen": -44.23625946044922, "logps/rejected": -88.64879608154297, "loss": 0.6329, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2908737659454346, "rewards/margins": 3.260775327682495, "rewards/rejected": -1.96990168094635, "step": 309 }, { "epoch": 0.748792270531401, "grad_norm": 4.8335185050964355, "learning_rate": 9.253623188405797e-07, "logits/chosen": -2.5084586143493652, "logits/rejected": -2.4540164470672607, "logps/chosen": -42.29496383666992, "logps/rejected": -114.84744262695312, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": 1.26219642162323, "rewards/margins": 5.019113540649414, "rewards/rejected": -3.7569165229797363, "step": 310 }, { "epoch": 0.751207729468599, "grad_norm": 5.073655605316162, "learning_rate": 9.251207729468598e-07, "logits/chosen": -2.5047905445098877, "logits/rejected": -2.474173069000244, "logps/chosen": -44.74121856689453, "logps/rejected": -113.0907211303711, "loss": 0.4994, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1850156784057617, "rewards/margins": 4.532312393188477, "rewards/rejected": -3.3472962379455566, "step": 311 }, { "epoch": 0.7536231884057971, "grad_norm": 3.5847086906433105, "learning_rate": 9.2487922705314e-07, "logits/chosen": -2.5204081535339355, "logits/rejected": -2.4745285511016846, "logps/chosen": -41.21735763549805, "logps/rejected": -106.37520599365234, "loss": 0.4687, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1243090629577637, "rewards/margins": 4.340332984924316, "rewards/rejected": -3.216024398803711, "step": 312 }, { "epoch": 0.7560386473429952, "grad_norm": 3.958892583847046, "learning_rate": 9.246376811594202e-07, "logits/chosen": -2.517052412033081, "logits/rejected": -2.523019790649414, "logps/chosen": -40.1868782043457, "logps/rejected": -103.82211303710938, "loss": 0.5381, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6541495323181152, "rewards/margins": 4.428343772888184, "rewards/rejected": -2.7741940021514893, "step": 313 }, { "epoch": 0.7584541062801933, "grad_norm": 4.78141450881958, "learning_rate": 9.243961352657005e-07, "logits/chosen": -2.576146125793457, "logits/rejected": -2.4739267826080322, "logps/chosen": -47.425819396972656, "logps/rejected": -125.41177368164062, "loss": 0.4434, "rewards/accuracies": 0.96875, "rewards/chosen": 1.148368000984192, "rewards/margins": 5.558729648590088, "rewards/rejected": -4.4103617668151855, "step": 314 }, { "epoch": 0.7608695652173914, "grad_norm": 3.8700404167175293, "learning_rate": 9.241545893719807e-07, "logits/chosen": -2.5485734939575195, "logits/rejected": -2.5261645317077637, "logps/chosen": -34.538814544677734, "logps/rejected": -94.87109375, "loss": 0.4624, "rewards/accuracies": 0.84375, "rewards/chosen": 2.016209602355957, "rewards/margins": 4.273966312408447, "rewards/rejected": -2.2577571868896484, "step": 315 }, { "epoch": 0.7632850241545893, "grad_norm": 3.8397397994995117, "learning_rate": 9.239130434782608e-07, "logits/chosen": -2.4999594688415527, "logits/rejected": -2.424330711364746, "logps/chosen": -44.97486877441406, "logps/rejected": -110.3058090209961, "loss": 0.5798, "rewards/accuracies": 1.0, "rewards/chosen": 1.1152441501617432, "rewards/margins": 4.205599308013916, "rewards/rejected": -3.090355157852173, "step": 316 }, { "epoch": 0.7657004830917874, "grad_norm": 4.281096935272217, "learning_rate": 9.23671497584541e-07, "logits/chosen": -2.4597980976104736, "logits/rejected": -2.4574413299560547, "logps/chosen": -54.10710525512695, "logps/rejected": -102.29536437988281, "loss": 0.578, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7455824613571167, "rewards/margins": 3.5681378841400146, "rewards/rejected": -2.8225555419921875, "step": 317 }, { "epoch": 0.7681159420289855, "grad_norm": 4.256350517272949, "learning_rate": 9.234299516908212e-07, "logits/chosen": -2.5369298458099365, "logits/rejected": -2.5661981105804443, "logps/chosen": -54.51380920410156, "logps/rejected": -109.2183837890625, "loss": 0.5707, "rewards/accuracies": 0.875, "rewards/chosen": 0.4728756248950958, "rewards/margins": 3.8985414505004883, "rewards/rejected": -3.4256656169891357, "step": 318 }, { "epoch": 0.7705314009661836, "grad_norm": 3.2651610374450684, "learning_rate": 9.231884057971014e-07, "logits/chosen": -2.516305923461914, "logits/rejected": -2.441011667251587, "logps/chosen": -29.583091735839844, "logps/rejected": -98.57688903808594, "loss": 0.4833, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9021461009979248, "rewards/margins": 4.292982578277588, "rewards/rejected": -2.390836715698242, "step": 319 }, { "epoch": 0.7729468599033816, "grad_norm": 3.8821985721588135, "learning_rate": 9.229468599033816e-07, "logits/chosen": -2.5351498126983643, "logits/rejected": -2.5871331691741943, "logps/chosen": -41.80889892578125, "logps/rejected": -107.30074310302734, "loss": 0.5346, "rewards/accuracies": 0.9375, "rewards/chosen": 1.464898943901062, "rewards/margins": 4.521340370178223, "rewards/rejected": -3.056441068649292, "step": 320 }, { "epoch": 0.7753623188405797, "grad_norm": 3.5362157821655273, "learning_rate": 9.227053140096618e-07, "logits/chosen": -2.5402536392211914, "logits/rejected": -2.5105488300323486, "logps/chosen": -30.04783821105957, "logps/rejected": -91.71286010742188, "loss": 0.4914, "rewards/accuracies": 0.96875, "rewards/chosen": 2.165177822113037, "rewards/margins": 4.4568939208984375, "rewards/rejected": -2.2917160987854004, "step": 321 }, { "epoch": 0.7777777777777778, "grad_norm": 3.6041295528411865, "learning_rate": 9.22463768115942e-07, "logits/chosen": -2.4966273307800293, "logits/rejected": -2.452550172805786, "logps/chosen": -46.645751953125, "logps/rejected": -89.15235900878906, "loss": 0.6295, "rewards/accuracies": 0.875, "rewards/chosen": 1.288475513458252, "rewards/margins": 3.2032501697540283, "rewards/rejected": -1.9147746562957764, "step": 322 }, { "epoch": 0.7801932367149759, "grad_norm": 4.694094657897949, "learning_rate": 9.222222222222222e-07, "logits/chosen": -2.490176200866699, "logits/rejected": -2.4429402351379395, "logps/chosen": -41.84352111816406, "logps/rejected": -84.85509490966797, "loss": 0.6596, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4443063735961914, "rewards/margins": 2.9031546115875244, "rewards/rejected": -1.458848476409912, "step": 323 }, { "epoch": 0.782608695652174, "grad_norm": 4.367844104766846, "learning_rate": 9.219806763285024e-07, "logits/chosen": -2.5060629844665527, "logits/rejected": -2.518244743347168, "logps/chosen": -43.65361785888672, "logps/rejected": -93.01005554199219, "loss": 0.5197, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3252304792404175, "rewards/margins": 3.673799991607666, "rewards/rejected": -2.348569393157959, "step": 324 }, { "epoch": 0.785024154589372, "grad_norm": 3.4698336124420166, "learning_rate": 9.217391304347826e-07, "logits/chosen": -2.5761475563049316, "logits/rejected": -2.548013210296631, "logps/chosen": -42.918724060058594, "logps/rejected": -97.62446594238281, "loss": 0.5989, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3841781616210938, "rewards/margins": 3.8162472248077393, "rewards/rejected": -2.4320693016052246, "step": 325 }, { "epoch": 0.7874396135265701, "grad_norm": 4.008787155151367, "learning_rate": 9.214975845410627e-07, "logits/chosen": -2.558777093887329, "logits/rejected": -2.5714502334594727, "logps/chosen": -42.44882583618164, "logps/rejected": -110.05502319335938, "loss": 0.4566, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5917760133743286, "rewards/margins": 4.972674369812012, "rewards/rejected": -3.3808982372283936, "step": 326 }, { "epoch": 0.7898550724637681, "grad_norm": 3.1457533836364746, "learning_rate": 9.212560386473429e-07, "logits/chosen": -2.5722081661224365, "logits/rejected": -2.4842708110809326, "logps/chosen": -43.704261779785156, "logps/rejected": -109.00733947753906, "loss": 0.5056, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5423885583877563, "rewards/margins": 4.799942970275879, "rewards/rejected": -3.257554769515991, "step": 327 }, { "epoch": 0.7922705314009661, "grad_norm": 3.097177743911743, "learning_rate": 9.210144927536232e-07, "logits/chosen": -2.514894962310791, "logits/rejected": -2.495974540710449, "logps/chosen": -49.500144958496094, "logps/rejected": -105.99454498291016, "loss": 0.5816, "rewards/accuracies": 0.875, "rewards/chosen": 1.1343824863433838, "rewards/margins": 4.2045369148254395, "rewards/rejected": -3.0701546669006348, "step": 328 }, { "epoch": 0.7946859903381642, "grad_norm": 5.299651622772217, "learning_rate": 9.207729468599033e-07, "logits/chosen": -2.527681589126587, "logits/rejected": -2.5545315742492676, "logps/chosen": -43.57232666015625, "logps/rejected": -102.68521881103516, "loss": 0.5407, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2021665573120117, "rewards/margins": 4.058761119842529, "rewards/rejected": -2.8565943241119385, "step": 329 }, { "epoch": 0.7971014492753623, "grad_norm": 5.175952434539795, "learning_rate": 9.205314009661836e-07, "logits/chosen": -2.572580099105835, "logits/rejected": -2.572793960571289, "logps/chosen": -41.41783142089844, "logps/rejected": -95.42486572265625, "loss": 0.5066, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5794336795806885, "rewards/margins": 4.167743682861328, "rewards/rejected": -2.5883097648620605, "step": 330 }, { "epoch": 0.7995169082125604, "grad_norm": 4.204919338226318, "learning_rate": 9.202898550724637e-07, "logits/chosen": -2.503392457962036, "logits/rejected": -2.4375455379486084, "logps/chosen": -43.8142204284668, "logps/rejected": -100.38452911376953, "loss": 0.6166, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9648387432098389, "rewards/margins": 3.616603374481201, "rewards/rejected": -2.6517646312713623, "step": 331 }, { "epoch": 0.8019323671497585, "grad_norm": 3.0964720249176025, "learning_rate": 9.200483091787438e-07, "logits/chosen": -2.5454702377319336, "logits/rejected": -2.495424747467041, "logps/chosen": -50.37091064453125, "logps/rejected": -107.96204376220703, "loss": 0.5889, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7240418195724487, "rewards/margins": 3.9129133224487305, "rewards/rejected": -3.188871383666992, "step": 332 }, { "epoch": 0.8019323671497585, "eval_logits/chosen": -2.560584545135498, "eval_logits/rejected": -2.5712900161743164, "eval_logps/chosen": -38.81827163696289, "eval_logps/rejected": -108.81658935546875, "eval_loss": 0.45841243863105774, "eval_rewards/accuracies": 0.9522058963775635, "eval_rewards/chosen": 1.5842646360397339, "eval_rewards/margins": 4.832522869110107, "eval_rewards/rejected": -3.248257875442505, "eval_runtime": 995.6654, "eval_samples_per_second": 0.546, "eval_steps_per_second": 0.273, "step": 332 }, { "epoch": 0.8043478260869565, "grad_norm": 6.496974468231201, "learning_rate": 9.198067632850242e-07, "logits/chosen": -2.5881762504577637, "logits/rejected": -2.599106550216675, "logps/chosen": -30.455768585205078, "logps/rejected": -93.40205383300781, "loss": 0.4731, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1209516525268555, "rewards/margins": 4.303684711456299, "rewards/rejected": -2.1827335357666016, "step": 333 }, { "epoch": 0.8067632850241546, "grad_norm": 3.664546251296997, "learning_rate": 9.195652173913043e-07, "logits/chosen": -2.6244120597839355, "logits/rejected": -2.5979573726654053, "logps/chosen": -44.21623229980469, "logps/rejected": -103.29528045654297, "loss": 0.5231, "rewards/accuracies": 0.875, "rewards/chosen": 1.2441719770431519, "rewards/margins": 4.332863807678223, "rewards/rejected": -3.0886921882629395, "step": 334 }, { "epoch": 0.8091787439613527, "grad_norm": 4.350682735443115, "learning_rate": 9.193236714975845e-07, "logits/chosen": -2.562102794647217, "logits/rejected": -2.492227792739868, "logps/chosen": -34.75768280029297, "logps/rejected": -109.14124298095703, "loss": 0.4737, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0459117889404297, "rewards/margins": 5.231290817260742, "rewards/rejected": -3.1853790283203125, "step": 335 }, { "epoch": 0.8115942028985508, "grad_norm": 3.750227212905884, "learning_rate": 9.190821256038647e-07, "logits/chosen": -2.4461758136749268, "logits/rejected": -2.4771809577941895, "logps/chosen": -47.29502487182617, "logps/rejected": -108.44086456298828, "loss": 0.5962, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8882317543029785, "rewards/margins": 4.251971244812012, "rewards/rejected": -3.363739490509033, "step": 336 }, { "epoch": 0.8140096618357487, "grad_norm": 4.211231708526611, "learning_rate": 9.188405797101448e-07, "logits/chosen": -2.609168291091919, "logits/rejected": -2.591647148132324, "logps/chosen": -43.94401931762695, "logps/rejected": -106.33031463623047, "loss": 0.504, "rewards/accuracies": 1.0, "rewards/chosen": 1.112139105796814, "rewards/margins": 4.391679763793945, "rewards/rejected": -3.279540777206421, "step": 337 }, { "epoch": 0.8164251207729468, "grad_norm": 4.1697797775268555, "learning_rate": 9.185990338164252e-07, "logits/chosen": -2.478952407836914, "logits/rejected": -2.4255824089050293, "logps/chosen": -39.13929748535156, "logps/rejected": -113.66283416748047, "loss": 0.4625, "rewards/accuracies": 0.96875, "rewards/chosen": 1.490719199180603, "rewards/margins": 5.184122562408447, "rewards/rejected": -3.693403482437134, "step": 338 }, { "epoch": 0.8188405797101449, "grad_norm": 2.960111141204834, "learning_rate": 9.183574879227053e-07, "logits/chosen": -2.5744845867156982, "logits/rejected": -2.585181951522827, "logps/chosen": -40.114501953125, "logps/rejected": -117.70671081542969, "loss": 0.444, "rewards/accuracies": 0.96875, "rewards/chosen": 1.810817003250122, "rewards/margins": 5.565342426300049, "rewards/rejected": -3.754525661468506, "step": 339 }, { "epoch": 0.821256038647343, "grad_norm": 3.609467029571533, "learning_rate": 9.181159420289854e-07, "logits/chosen": -2.57049560546875, "logits/rejected": -2.54813289642334, "logps/chosen": -33.401023864746094, "logps/rejected": -91.37448120117188, "loss": 0.5555, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7155591249465942, "rewards/margins": 4.162319660186768, "rewards/rejected": -2.446760654449463, "step": 340 }, { "epoch": 0.8236714975845411, "grad_norm": 3.6210765838623047, "learning_rate": 9.178743961352657e-07, "logits/chosen": -2.5616886615753174, "logits/rejected": -2.567004442214966, "logps/chosen": -51.22029495239258, "logps/rejected": -129.6854705810547, "loss": 0.5261, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6140583753585815, "rewards/margins": 5.457680702209473, "rewards/rejected": -4.843622207641602, "step": 341 }, { "epoch": 0.8260869565217391, "grad_norm": 5.619706630706787, "learning_rate": 9.176328502415458e-07, "logits/chosen": -2.5745484828948975, "logits/rejected": -2.563263416290283, "logps/chosen": -67.2214126586914, "logps/rejected": -127.83865356445312, "loss": 0.6207, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5487710237503052, "rewards/margins": 4.066376686096191, "rewards/rejected": -4.615147590637207, "step": 342 }, { "epoch": 0.8285024154589372, "grad_norm": 3.9835071563720703, "learning_rate": 9.17391304347826e-07, "logits/chosen": -2.580349922180176, "logits/rejected": -2.5062170028686523, "logps/chosen": -30.329322814941406, "logps/rejected": -97.40161895751953, "loss": 0.5079, "rewards/accuracies": 0.90625, "rewards/chosen": 2.2172646522521973, "rewards/margins": 4.766402721405029, "rewards/rejected": -2.549138069152832, "step": 343 }, { "epoch": 0.8309178743961353, "grad_norm": 3.6355953216552734, "learning_rate": 9.171497584541063e-07, "logits/chosen": -2.5570521354675293, "logits/rejected": -2.6467552185058594, "logps/chosen": -44.91547393798828, "logps/rejected": -110.40785217285156, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 1.473638653755188, "rewards/margins": 5.06088924407959, "rewards/rejected": -3.587249755859375, "step": 344 }, { "epoch": 0.8333333333333334, "grad_norm": 3.8119349479675293, "learning_rate": 9.169082125603864e-07, "logits/chosen": -2.509772300720215, "logits/rejected": -2.4704818725585938, "logps/chosen": -37.876590728759766, "logps/rejected": -117.02124786376953, "loss": 0.4703, "rewards/accuracies": 0.875, "rewards/chosen": 1.7839607000350952, "rewards/margins": 5.416266441345215, "rewards/rejected": -3.6323060989379883, "step": 345 }, { "epoch": 0.8357487922705314, "grad_norm": 2.9115209579467773, "learning_rate": 9.166666666666665e-07, "logits/chosen": -2.6299362182617188, "logits/rejected": -2.5975759029388428, "logps/chosen": -47.81496810913086, "logps/rejected": -122.13037109375, "loss": 0.483, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9050412178039551, "rewards/margins": 5.158113956451416, "rewards/rejected": -4.253073215484619, "step": 346 }, { "epoch": 0.8381642512077294, "grad_norm": 4.4543538093566895, "learning_rate": 9.164251207729469e-07, "logits/chosen": -2.548072576522827, "logits/rejected": -2.5925683975219727, "logps/chosen": -51.658199310302734, "logps/rejected": -109.55927276611328, "loss": 0.6484, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5775338411331177, "rewards/margins": 4.064566612243652, "rewards/rejected": -3.487032890319824, "step": 347 }, { "epoch": 0.8405797101449275, "grad_norm": 4.233089447021484, "learning_rate": 9.16183574879227e-07, "logits/chosen": -2.464413642883301, "logits/rejected": -2.4944729804992676, "logps/chosen": -35.589637756347656, "logps/rejected": -110.54972839355469, "loss": 0.4437, "rewards/accuracies": 0.9375, "rewards/chosen": 1.840099811553955, "rewards/margins": 5.459802150726318, "rewards/rejected": -3.619702100753784, "step": 348 }, { "epoch": 0.8429951690821256, "grad_norm": 3.853801965713501, "learning_rate": 9.159420289855072e-07, "logits/chosen": -2.564760446548462, "logits/rejected": -2.6015748977661133, "logps/chosen": -39.863136291503906, "logps/rejected": -104.57846069335938, "loss": 0.5269, "rewards/accuracies": 1.0, "rewards/chosen": 1.5336872339248657, "rewards/margins": 4.602839469909668, "rewards/rejected": -3.069152593612671, "step": 349 }, { "epoch": 0.8454106280193237, "grad_norm": 3.82727313041687, "learning_rate": 9.157004830917874e-07, "logits/chosen": -2.5590548515319824, "logits/rejected": -2.5663084983825684, "logps/chosen": -47.42093276977539, "logps/rejected": -106.6297607421875, "loss": 0.6349, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8851682543754578, "rewards/margins": 4.232489585876465, "rewards/rejected": -3.3473212718963623, "step": 350 }, { "epoch": 0.8478260869565217, "grad_norm": 3.248603582382202, "learning_rate": 9.154589371980675e-07, "logits/chosen": -2.538403272628784, "logits/rejected": -2.542739152908325, "logps/chosen": -31.77102279663086, "logps/rejected": -110.1134033203125, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 1.9913231134414673, "rewards/margins": 5.655474662780762, "rewards/rejected": -3.664151668548584, "step": 351 }, { "epoch": 0.8502415458937198, "grad_norm": 4.228363990783691, "learning_rate": 9.152173913043479e-07, "logits/chosen": -2.574882984161377, "logits/rejected": -2.531940460205078, "logps/chosen": -42.054718017578125, "logps/rejected": -111.2087173461914, "loss": 0.5147, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1953060626983643, "rewards/margins": 4.978707790374756, "rewards/rejected": -3.7834014892578125, "step": 352 }, { "epoch": 0.8526570048309179, "grad_norm": 5.092948913574219, "learning_rate": 9.14975845410628e-07, "logits/chosen": -2.6037731170654297, "logits/rejected": -2.5882599353790283, "logps/chosen": -57.34989547729492, "logps/rejected": -143.28428649902344, "loss": 0.5656, "rewards/accuracies": 0.90625, "rewards/chosen": -0.011776834726333618, "rewards/margins": 5.530036926269531, "rewards/rejected": -5.541813850402832, "step": 353 }, { "epoch": 0.855072463768116, "grad_norm": 3.155285120010376, "learning_rate": 9.147342995169081e-07, "logits/chosen": -2.5482304096221924, "logits/rejected": -2.559187412261963, "logps/chosen": -36.52967071533203, "logps/rejected": -121.75987243652344, "loss": 0.3788, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4779634475708008, "rewards/margins": 5.742661476135254, "rewards/rejected": -4.264698505401611, "step": 354 }, { "epoch": 0.857487922705314, "grad_norm": 3.6507668495178223, "learning_rate": 9.144927536231884e-07, "logits/chosen": -2.5355703830718994, "logits/rejected": -2.5358622074127197, "logps/chosen": -46.350257873535156, "logps/rejected": -102.2492446899414, "loss": 0.6318, "rewards/accuracies": 0.90625, "rewards/chosen": 0.925865113735199, "rewards/margins": 4.114509105682373, "rewards/rejected": -3.1886439323425293, "step": 355 }, { "epoch": 0.8599033816425121, "grad_norm": 3.8449721336364746, "learning_rate": 9.142512077294685e-07, "logits/chosen": -2.6137146949768066, "logits/rejected": -2.5366291999816895, "logps/chosen": -41.177371978759766, "logps/rejected": -123.2091293334961, "loss": 0.4649, "rewards/accuracies": 1.0, "rewards/chosen": 1.0338923931121826, "rewards/margins": 5.545881271362305, "rewards/rejected": -4.511989116668701, "step": 356 }, { "epoch": 0.8623188405797102, "grad_norm": 3.3409812450408936, "learning_rate": 9.140096618357488e-07, "logits/chosen": -2.5653862953186035, "logits/rejected": -2.558648109436035, "logps/chosen": -49.181949615478516, "logps/rejected": -107.46898651123047, "loss": 0.5467, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9686408042907715, "rewards/margins": 4.526411056518555, "rewards/rejected": -3.5577704906463623, "step": 357 }, { "epoch": 0.8647342995169082, "grad_norm": 4.260003566741943, "learning_rate": 9.13768115942029e-07, "logits/chosen": -2.509547710418701, "logits/rejected": -2.514936923980713, "logps/chosen": -47.06504440307617, "logps/rejected": -89.02937316894531, "loss": 0.6765, "rewards/accuracies": 0.875, "rewards/chosen": 1.2501319646835327, "rewards/margins": 3.208348274230957, "rewards/rejected": -1.9582164287567139, "step": 358 }, { "epoch": 0.8671497584541062, "grad_norm": 4.862539291381836, "learning_rate": 9.135265700483091e-07, "logits/chosen": -2.4983344078063965, "logits/rejected": -2.451474666595459, "logps/chosen": -40.271724700927734, "logps/rejected": -111.24837493896484, "loss": 0.4764, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4857211112976074, "rewards/margins": 5.084467887878418, "rewards/rejected": -3.5987467765808105, "step": 359 }, { "epoch": 0.8695652173913043, "grad_norm": 4.413487434387207, "learning_rate": 9.132850241545893e-07, "logits/chosen": -2.557560682296753, "logits/rejected": -2.533135414123535, "logps/chosen": -35.12999725341797, "logps/rejected": -107.58012390136719, "loss": 0.4307, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8041138648986816, "rewards/margins": 5.0028581619262695, "rewards/rejected": -3.198744773864746, "step": 360 }, { "epoch": 0.8719806763285024, "grad_norm": 4.510954856872559, "learning_rate": 9.130434782608695e-07, "logits/chosen": -2.5617032051086426, "logits/rejected": -2.481861114501953, "logps/chosen": -38.11897277832031, "logps/rejected": -95.6335678100586, "loss": 0.5688, "rewards/accuracies": 1.0, "rewards/chosen": 1.6571972370147705, "rewards/margins": 4.113584041595459, "rewards/rejected": -2.4563865661621094, "step": 361 }, { "epoch": 0.8743961352657005, "grad_norm": 6.332159996032715, "learning_rate": 9.128019323671497e-07, "logits/chosen": -2.4729721546173096, "logits/rejected": -2.5260226726531982, "logps/chosen": -41.110023498535156, "logps/rejected": -97.20870971679688, "loss": 0.5904, "rewards/accuracies": 1.0, "rewards/chosen": 1.6982800960540771, "rewards/margins": 4.136590957641602, "rewards/rejected": -2.4383111000061035, "step": 362 }, { "epoch": 0.8768115942028986, "grad_norm": 3.4088709354400635, "learning_rate": 9.125603864734299e-07, "logits/chosen": -2.5540075302124023, "logits/rejected": -2.5776009559631348, "logps/chosen": -45.79363250732422, "logps/rejected": -91.88069915771484, "loss": 0.658, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9098385572433472, "rewards/margins": 3.1951680183410645, "rewards/rejected": -2.285329580307007, "step": 363 }, { "epoch": 0.8792270531400966, "grad_norm": 4.044880390167236, "learning_rate": 9.123188405797101e-07, "logits/chosen": -2.537700653076172, "logits/rejected": -2.5431265830993652, "logps/chosen": -39.80340576171875, "logps/rejected": -95.67684173583984, "loss": 0.4951, "rewards/accuracies": 0.96875, "rewards/chosen": 1.595968246459961, "rewards/margins": 4.10163688659668, "rewards/rejected": -2.505668878555298, "step": 364 }, { "epoch": 0.8816425120772947, "grad_norm": 4.435220718383789, "learning_rate": 9.120772946859903e-07, "logits/chosen": -2.4849061965942383, "logits/rejected": -2.501636505126953, "logps/chosen": -38.31965637207031, "logps/rejected": -85.95870208740234, "loss": 0.6137, "rewards/accuracies": 0.84375, "rewards/chosen": 1.536129355430603, "rewards/margins": 3.423894166946411, "rewards/rejected": -1.887764573097229, "step": 365 }, { "epoch": 0.8840579710144928, "grad_norm": 4.484607696533203, "learning_rate": 9.118357487922705e-07, "logits/chosen": -2.5649871826171875, "logits/rejected": -2.662753105163574, "logps/chosen": -41.79051971435547, "logps/rejected": -108.96809387207031, "loss": 0.5355, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4702800512313843, "rewards/margins": 4.604220390319824, "rewards/rejected": -3.1339404582977295, "step": 366 }, { "epoch": 0.8864734299516909, "grad_norm": 4.202518463134766, "learning_rate": 9.115942028985507e-07, "logits/chosen": -2.640455961227417, "logits/rejected": -2.58121919631958, "logps/chosen": -41.93785095214844, "logps/rejected": -90.97901916503906, "loss": 0.5941, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6247334480285645, "rewards/margins": 3.7960097789764404, "rewards/rejected": -2.171276330947876, "step": 367 }, { "epoch": 0.8888888888888888, "grad_norm": 4.575622081756592, "learning_rate": 9.113526570048309e-07, "logits/chosen": -2.530503511428833, "logits/rejected": -2.543321371078491, "logps/chosen": -33.22358703613281, "logps/rejected": -109.02338409423828, "loss": 0.4072, "rewards/accuracies": 1.0, "rewards/chosen": 2.026397943496704, "rewards/margins": 5.555720806121826, "rewards/rejected": -3.529322624206543, "step": 368 }, { "epoch": 0.8913043478260869, "grad_norm": 5.535022735595703, "learning_rate": 9.11111111111111e-07, "logits/chosen": -2.59295654296875, "logits/rejected": -2.546542167663574, "logps/chosen": -32.63548278808594, "logps/rejected": -97.45429229736328, "loss": 0.5169, "rewards/accuracies": 1.0, "rewards/chosen": 2.226288080215454, "rewards/margins": 4.66389274597168, "rewards/rejected": -2.4376046657562256, "step": 369 }, { "epoch": 0.893719806763285, "grad_norm": 3.806117296218872, "learning_rate": 9.108695652173912e-07, "logits/chosen": -2.543295383453369, "logits/rejected": -2.5516092777252197, "logps/chosen": -36.140621185302734, "logps/rejected": -114.68310546875, "loss": 0.5309, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7922418117523193, "rewards/margins": 5.381476402282715, "rewards/rejected": -3.5892343521118164, "step": 370 }, { "epoch": 0.8961352657004831, "grad_norm": 3.949913740158081, "learning_rate": 9.106280193236715e-07, "logits/chosen": -2.482182502746582, "logits/rejected": -2.5499792098999023, "logps/chosen": -43.45452880859375, "logps/rejected": -101.1965560913086, "loss": 0.5486, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2488086223602295, "rewards/margins": 4.1617608070373535, "rewards/rejected": -2.912951946258545, "step": 371 }, { "epoch": 0.8985507246376812, "grad_norm": 4.3584675788879395, "learning_rate": 9.103864734299517e-07, "logits/chosen": -2.561424732208252, "logits/rejected": -2.575453996658325, "logps/chosen": -44.928863525390625, "logps/rejected": -104.49229431152344, "loss": 0.5706, "rewards/accuracies": 0.875, "rewards/chosen": 1.4419984817504883, "rewards/margins": 4.377858638763428, "rewards/rejected": -2.9358603954315186, "step": 372 }, { "epoch": 0.9009661835748792, "grad_norm": 5.30784797668457, "learning_rate": 9.101449275362319e-07, "logits/chosen": -2.595905065536499, "logits/rejected": -2.5214715003967285, "logps/chosen": -40.6687126159668, "logps/rejected": -117.97053527832031, "loss": 0.4795, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6772773265838623, "rewards/margins": 5.365025997161865, "rewards/rejected": -3.687748670578003, "step": 373 }, { "epoch": 0.9033816425120773, "grad_norm": 3.406097412109375, "learning_rate": 9.09903381642512e-07, "logits/chosen": -2.5197653770446777, "logits/rejected": -2.6216180324554443, "logps/chosen": -29.022106170654297, "logps/rejected": -96.54446411132812, "loss": 0.4767, "rewards/accuracies": 0.84375, "rewards/chosen": 2.236659288406372, "rewards/margins": 4.792357444763184, "rewards/rejected": -2.5556976795196533, "step": 374 }, { "epoch": 0.9057971014492754, "grad_norm": 5.073281288146973, "learning_rate": 9.096618357487922e-07, "logits/chosen": -2.556464195251465, "logits/rejected": -2.6162328720092773, "logps/chosen": -49.35818862915039, "logps/rejected": -115.96080017089844, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 0.6850069761276245, "rewards/margins": 4.6238813400268555, "rewards/rejected": -3.9388747215270996, "step": 375 }, { "epoch": 0.9082125603864735, "grad_norm": 5.263579368591309, "learning_rate": 9.094202898550725e-07, "logits/chosen": -2.5762362480163574, "logits/rejected": -2.550075054168701, "logps/chosen": -35.595035552978516, "logps/rejected": -107.63542938232422, "loss": 0.4481, "rewards/accuracies": 0.90625, "rewards/chosen": 1.700130581855774, "rewards/margins": 5.145581245422363, "rewards/rejected": -3.4454503059387207, "step": 376 }, { "epoch": 0.9106280193236715, "grad_norm": 3.307891607284546, "learning_rate": 9.091787439613526e-07, "logits/chosen": -2.608219861984253, "logits/rejected": -2.6503844261169434, "logps/chosen": -52.4626350402832, "logps/rejected": -124.2633056640625, "loss": 0.5186, "rewards/accuracies": 0.875, "rewards/chosen": 0.914147675037384, "rewards/margins": 5.3721137046813965, "rewards/rejected": -4.457965850830078, "step": 377 }, { "epoch": 0.9130434782608695, "grad_norm": 3.9141809940338135, "learning_rate": 9.089371980676328e-07, "logits/chosen": -2.5611050128936768, "logits/rejected": -2.6438496112823486, "logps/chosen": -32.9936637878418, "logps/rejected": -111.81655883789062, "loss": 0.4275, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2378203868865967, "rewards/margins": 5.688631057739258, "rewards/rejected": -3.450810432434082, "step": 378 }, { "epoch": 0.9154589371980676, "grad_norm": 4.670541763305664, "learning_rate": 9.08695652173913e-07, "logits/chosen": -2.4840047359466553, "logits/rejected": -2.475013494491577, "logps/chosen": -34.327484130859375, "logps/rejected": -103.15818786621094, "loss": 0.4532, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0312342643737793, "rewards/margins": 4.88787317276001, "rewards/rejected": -2.8566384315490723, "step": 379 }, { "epoch": 0.9178743961352657, "grad_norm": 4.381028652191162, "learning_rate": 9.084541062801931e-07, "logits/chosen": -2.644726514816284, "logits/rejected": -2.575990676879883, "logps/chosen": -42.54375076293945, "logps/rejected": -110.23556518554688, "loss": 0.6167, "rewards/accuracies": 0.875, "rewards/chosen": 1.1485612392425537, "rewards/margins": 4.348472595214844, "rewards/rejected": -3.199911117553711, "step": 380 }, { "epoch": 0.9202898550724637, "grad_norm": 4.199840545654297, "learning_rate": 9.082125603864735e-07, "logits/chosen": -2.5101733207702637, "logits/rejected": -2.5294034481048584, "logps/chosen": -43.89669418334961, "logps/rejected": -118.1423568725586, "loss": 0.5166, "rewards/accuracies": 0.96875, "rewards/chosen": 1.174512267112732, "rewards/margins": 5.174269676208496, "rewards/rejected": -3.9997572898864746, "step": 381 }, { "epoch": 0.9227053140096618, "grad_norm": 3.5005788803100586, "learning_rate": 9.079710144927536e-07, "logits/chosen": -2.569408416748047, "logits/rejected": -2.5063400268554688, "logps/chosen": -36.4880485534668, "logps/rejected": -110.86457824707031, "loss": 0.4951, "rewards/accuracies": 0.96875, "rewards/chosen": 1.754501461982727, "rewards/margins": 5.055379867553711, "rewards/rejected": -3.3008780479431152, "step": 382 }, { "epoch": 0.9251207729468599, "grad_norm": 3.9688143730163574, "learning_rate": 9.077294685990337e-07, "logits/chosen": -2.5876312255859375, "logits/rejected": -2.5699832439422607, "logps/chosen": -50.705078125, "logps/rejected": -117.78565979003906, "loss": 0.5184, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5728832483291626, "rewards/margins": 4.6768717765808105, "rewards/rejected": -4.1039886474609375, "step": 383 }, { "epoch": 0.927536231884058, "grad_norm": 4.119326114654541, "learning_rate": 9.07487922705314e-07, "logits/chosen": -2.635777235031128, "logits/rejected": -2.5646181106567383, "logps/chosen": -42.16179275512695, "logps/rejected": -114.55857849121094, "loss": 0.4635, "rewards/accuracies": 0.90625, "rewards/chosen": 1.518638253211975, "rewards/margins": 5.272012233734131, "rewards/rejected": -3.7533740997314453, "step": 384 }, { "epoch": 0.9299516908212561, "grad_norm": 3.9469869136810303, "learning_rate": 9.072463768115942e-07, "logits/chosen": -2.6075456142425537, "logits/rejected": -2.5873093605041504, "logps/chosen": -40.16263198852539, "logps/rejected": -107.43567657470703, "loss": 0.4792, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5452861785888672, "rewards/margins": 4.959200382232666, "rewards/rejected": -3.4139137268066406, "step": 385 }, { "epoch": 0.9323671497584541, "grad_norm": 3.6966140270233154, "learning_rate": 9.070048309178743e-07, "logits/chosen": -2.604989528656006, "logits/rejected": -2.503765106201172, "logps/chosen": -35.48878479003906, "logps/rejected": -116.08406066894531, "loss": 0.4208, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8732293844223022, "rewards/margins": 5.936379432678223, "rewards/rejected": -4.063149929046631, "step": 386 }, { "epoch": 0.9347826086956522, "grad_norm": 3.478656053543091, "learning_rate": 9.067632850241546e-07, "logits/chosen": -2.5789568424224854, "logits/rejected": -2.667393207550049, "logps/chosen": -48.43965148925781, "logps/rejected": -109.35986328125, "loss": 0.5813, "rewards/accuracies": 0.90625, "rewards/chosen": 1.150270700454712, "rewards/margins": 4.645354270935059, "rewards/rejected": -3.4950833320617676, "step": 387 }, { "epoch": 0.9371980676328503, "grad_norm": 4.4677252769470215, "learning_rate": 9.065217391304347e-07, "logits/chosen": -2.4872982501983643, "logits/rejected": -2.5673670768737793, "logps/chosen": -46.027259826660156, "logps/rejected": -106.51093292236328, "loss": 0.5713, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8019748330116272, "rewards/margins": 4.433483123779297, "rewards/rejected": -3.6315078735351562, "step": 388 }, { "epoch": 0.9396135265700483, "grad_norm": 3.7577972412109375, "learning_rate": 9.062801932367148e-07, "logits/chosen": -2.4734537601470947, "logits/rejected": -2.475635528564453, "logps/chosen": -41.755916595458984, "logps/rejected": -113.14115142822266, "loss": 0.4531, "rewards/accuracies": 0.96875, "rewards/chosen": 1.238356113433838, "rewards/margins": 5.115884780883789, "rewards/rejected": -3.8775291442871094, "step": 389 }, { "epoch": 0.9420289855072463, "grad_norm": 4.937104225158691, "learning_rate": 9.060386473429952e-07, "logits/chosen": -2.5652263164520264, "logits/rejected": -2.5798065662384033, "logps/chosen": -40.11904525756836, "logps/rejected": -125.50858306884766, "loss": 0.4117, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7168697118759155, "rewards/margins": 6.386045932769775, "rewards/rejected": -4.66917610168457, "step": 390 }, { "epoch": 0.9444444444444444, "grad_norm": 4.1473517417907715, "learning_rate": 9.057971014492753e-07, "logits/chosen": -2.5726332664489746, "logits/rejected": -2.586306095123291, "logps/chosen": -56.93766784667969, "logps/rejected": -130.63262939453125, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": -0.15525510907173157, "rewards/margins": 4.808157920837402, "rewards/rejected": -4.963413238525391, "step": 391 }, { "epoch": 0.9468599033816425, "grad_norm": 3.9102461338043213, "learning_rate": 9.055555555555556e-07, "logits/chosen": -2.5750298500061035, "logits/rejected": -2.5827667713165283, "logps/chosen": -38.13002395629883, "logps/rejected": -123.48248291015625, "loss": 0.4646, "rewards/accuracies": 1.0, "rewards/chosen": 1.3876934051513672, "rewards/margins": 5.779297351837158, "rewards/rejected": -4.391603469848633, "step": 392 }, { "epoch": 0.9492753623188406, "grad_norm": 3.560605764389038, "learning_rate": 9.053140096618357e-07, "logits/chosen": -2.580095052719116, "logits/rejected": -2.5597782135009766, "logps/chosen": -49.22437286376953, "logps/rejected": -125.22557067871094, "loss": 0.4875, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9727988839149475, "rewards/margins": 5.5347208976745605, "rewards/rejected": -4.561922550201416, "step": 393 }, { "epoch": 0.9516908212560387, "grad_norm": 4.043674945831299, "learning_rate": 9.050724637681158e-07, "logits/chosen": -2.594910144805908, "logits/rejected": -2.594841957092285, "logps/chosen": -39.363624572753906, "logps/rejected": -100.55326080322266, "loss": 0.5675, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5355405807495117, "rewards/margins": 4.476558208465576, "rewards/rejected": -2.9410178661346436, "step": 394 }, { "epoch": 0.9541062801932367, "grad_norm": 3.643155336380005, "learning_rate": 9.048309178743962e-07, "logits/chosen": -2.586925506591797, "logits/rejected": -2.5040249824523926, "logps/chosen": -47.45025634765625, "logps/rejected": -98.60328674316406, "loss": 0.6294, "rewards/accuracies": 0.875, "rewards/chosen": 0.9115584492683411, "rewards/margins": 3.5847456455230713, "rewards/rejected": -2.673187255859375, "step": 395 }, { "epoch": 0.9565217391304348, "grad_norm": 2.4681665897369385, "learning_rate": 9.045893719806763e-07, "logits/chosen": -2.5646018981933594, "logits/rejected": -2.6251909732818604, "logps/chosen": -40.86127853393555, "logps/rejected": -116.07344055175781, "loss": 0.5295, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4219810962677002, "rewards/margins": 5.33503532409668, "rewards/rejected": -3.9130542278289795, "step": 396 }, { "epoch": 0.9589371980676329, "grad_norm": 4.145532131195068, "learning_rate": 9.043478260869564e-07, "logits/chosen": -2.623319625854492, "logits/rejected": -2.6125283241271973, "logps/chosen": -52.05299377441406, "logps/rejected": -126.041259765625, "loss": 0.524, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5299727916717529, "rewards/margins": 5.251662254333496, "rewards/rejected": -4.721689224243164, "step": 397 }, { "epoch": 0.961352657004831, "grad_norm": 6.432906627655029, "learning_rate": 9.041062801932367e-07, "logits/chosen": -2.5394299030303955, "logits/rejected": -2.5058071613311768, "logps/chosen": -31.607837677001953, "logps/rejected": -95.68463134765625, "loss": 0.4963, "rewards/accuracies": 0.9375, "rewards/chosen": 2.049859046936035, "rewards/margins": 4.5918869972229, "rewards/rejected": -2.5420279502868652, "step": 398 }, { "epoch": 0.9637681159420289, "grad_norm": 5.260996341705322, "learning_rate": 9.038647342995168e-07, "logits/chosen": -2.6250016689300537, "logits/rejected": -2.596329927444458, "logps/chosen": -56.72629165649414, "logps/rejected": -118.69141387939453, "loss": 0.5727, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18851369619369507, "rewards/margins": 4.512336730957031, "rewards/rejected": -4.323822975158691, "step": 399 }, { "epoch": 0.966183574879227, "grad_norm": 3.300029754638672, "learning_rate": 9.036231884057971e-07, "logits/chosen": -2.6014511585235596, "logits/rejected": -2.5921669006347656, "logps/chosen": -33.27659606933594, "logps/rejected": -131.61688232421875, "loss": 0.2782, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2713136672973633, "rewards/margins": 7.307138919830322, "rewards/rejected": -5.035825729370117, "step": 400 }, { "epoch": 0.9685990338164251, "grad_norm": 4.022793292999268, "learning_rate": 9.033816425120773e-07, "logits/chosen": -2.5739669799804688, "logits/rejected": -2.600924491882324, "logps/chosen": -37.76321029663086, "logps/rejected": -124.6334228515625, "loss": 0.4016, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3887479305267334, "rewards/margins": 6.19658899307251, "rewards/rejected": -4.807840824127197, "step": 401 }, { "epoch": 0.9710144927536232, "grad_norm": 2.6676015853881836, "learning_rate": 9.031400966183574e-07, "logits/chosen": -2.6267013549804688, "logits/rejected": -2.5756804943084717, "logps/chosen": -52.89004898071289, "logps/rejected": -116.06746673583984, "loss": 0.6147, "rewards/accuracies": 0.875, "rewards/chosen": 0.2979165017604828, "rewards/margins": 4.435319900512695, "rewards/rejected": -4.13740348815918, "step": 402 }, { "epoch": 0.9734299516908212, "grad_norm": 4.778841495513916, "learning_rate": 9.028985507246376e-07, "logits/chosen": -2.4666380882263184, "logits/rejected": -2.496476650238037, "logps/chosen": -43.01049041748047, "logps/rejected": -99.77334594726562, "loss": 0.5299, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4005628824234009, "rewards/margins": 4.226039886474609, "rewards/rejected": -2.825477123260498, "step": 403 }, { "epoch": 0.9758454106280193, "grad_norm": 3.9966650009155273, "learning_rate": 9.026570048309178e-07, "logits/chosen": -2.605916976928711, "logits/rejected": -2.6443097591400146, "logps/chosen": -45.96596145629883, "logps/rejected": -117.61219024658203, "loss": 0.5154, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7266570329666138, "rewards/margins": 4.862361907958984, "rewards/rejected": -4.135704517364502, "step": 404 }, { "epoch": 0.9782608695652174, "grad_norm": 4.267725944519043, "learning_rate": 9.02415458937198e-07, "logits/chosen": -2.589993715286255, "logits/rejected": -2.532686233520508, "logps/chosen": -48.234867095947266, "logps/rejected": -103.91498565673828, "loss": 0.5513, "rewards/accuracies": 0.90625, "rewards/chosen": 0.875090479850769, "rewards/margins": 4.4006266593933105, "rewards/rejected": -3.525536060333252, "step": 405 }, { "epoch": 0.9806763285024155, "grad_norm": 3.8346047401428223, "learning_rate": 9.021739130434782e-07, "logits/chosen": -2.6126198768615723, "logits/rejected": -2.6067657470703125, "logps/chosen": -46.6517333984375, "logps/rejected": -97.02922058105469, "loss": 0.6313, "rewards/accuracies": 0.875, "rewards/chosen": 0.919677197933197, "rewards/margins": 4.085611343383789, "rewards/rejected": -3.1659340858459473, "step": 406 }, { "epoch": 0.9830917874396136, "grad_norm": 4.352594375610352, "learning_rate": 9.019323671497584e-07, "logits/chosen": -2.504377603530884, "logits/rejected": -2.5592892169952393, "logps/chosen": -43.395286560058594, "logps/rejected": -102.01634979248047, "loss": 0.5678, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1199901103973389, "rewards/margins": 4.00716495513916, "rewards/rejected": -2.887174606323242, "step": 407 }, { "epoch": 0.9855072463768116, "grad_norm": 4.231996536254883, "learning_rate": 9.016908212560386e-07, "logits/chosen": -2.5338449478149414, "logits/rejected": -2.52669620513916, "logps/chosen": -39.4952278137207, "logps/rejected": -95.57685852050781, "loss": 0.5814, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4964375495910645, "rewards/margins": 4.045610427856445, "rewards/rejected": -2.549172878265381, "step": 408 }, { "epoch": 0.9879227053140096, "grad_norm": 3.6780383586883545, "learning_rate": 9.014492753623189e-07, "logits/chosen": -2.530255079269409, "logits/rejected": -2.5300040245056152, "logps/chosen": -40.251861572265625, "logps/rejected": -119.73877716064453, "loss": 0.4767, "rewards/accuracies": 0.9375, "rewards/chosen": 1.340463399887085, "rewards/margins": 5.523618221282959, "rewards/rejected": -4.183155059814453, "step": 409 }, { "epoch": 0.9903381642512077, "grad_norm": 5.112320899963379, "learning_rate": 9.01207729468599e-07, "logits/chosen": -2.639279365539551, "logits/rejected": -2.5794835090637207, "logps/chosen": -35.72761917114258, "logps/rejected": -106.4506607055664, "loss": 0.4656, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5750339031219482, "rewards/margins": 5.295124053955078, "rewards/rejected": -3.720089912414551, "step": 410 }, { "epoch": 0.9927536231884058, "grad_norm": 5.2015156745910645, "learning_rate": 9.009661835748792e-07, "logits/chosen": -2.556722402572632, "logits/rejected": -2.58148455619812, "logps/chosen": -49.60634231567383, "logps/rejected": -129.91171264648438, "loss": 0.5354, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9266951084136963, "rewards/margins": 5.482293605804443, "rewards/rejected": -4.555598735809326, "step": 411 }, { "epoch": 0.9951690821256038, "grad_norm": 4.5497822761535645, "learning_rate": 9.007246376811594e-07, "logits/chosen": -2.5283968448638916, "logits/rejected": -2.5752365589141846, "logps/chosen": -41.9790153503418, "logps/rejected": -107.06869506835938, "loss": 0.4599, "rewards/accuracies": 0.90625, "rewards/chosen": 1.708872675895691, "rewards/margins": 4.966299533843994, "rewards/rejected": -3.2574267387390137, "step": 412 }, { "epoch": 0.9975845410628019, "grad_norm": 3.875924587249756, "learning_rate": 9.004830917874395e-07, "logits/chosen": -2.564849376678467, "logits/rejected": -2.5186262130737305, "logps/chosen": -42.85179901123047, "logps/rejected": -104.37037658691406, "loss": 0.4994, "rewards/accuracies": 0.875, "rewards/chosen": 1.2958309650421143, "rewards/margins": 4.628893852233887, "rewards/rejected": -3.3330628871917725, "step": 413 }, { "epoch": 1.0, "grad_norm": 4.661702632904053, "learning_rate": 9.002415458937198e-07, "logits/chosen": -2.6168019771575928, "logits/rejected": -2.574650764465332, "logps/chosen": -36.387413024902344, "logps/rejected": -92.80931091308594, "loss": 0.5378, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7471140623092651, "rewards/margins": 4.431889533996582, "rewards/rejected": -2.6847753524780273, "step": 414 }, { "epoch": 1.002415458937198, "grad_norm": 4.008213043212891, "learning_rate": 9e-07, "logits/chosen": -2.64083194732666, "logits/rejected": -2.6146888732910156, "logps/chosen": -42.82259750366211, "logps/rejected": -112.00997924804688, "loss": 0.4901, "rewards/accuracies": 0.96875, "rewards/chosen": 1.397817611694336, "rewards/margins": 4.875008583068848, "rewards/rejected": -3.4771909713745117, "step": 415 }, { "epoch": 1.002415458937198, "eval_logits/chosen": -2.621546506881714, "eval_logits/rejected": -2.629748821258545, "eval_logps/chosen": -39.844303131103516, "eval_logps/rejected": -116.99747467041016, "eval_loss": 0.44466692209243774, "eval_rewards/accuracies": 0.9522058963775635, "eval_rewards/chosen": 1.4816614389419556, "eval_rewards/margins": 5.548007965087891, "eval_rewards/rejected": -4.066347599029541, "eval_runtime": 1002.3699, "eval_samples_per_second": 0.543, "eval_steps_per_second": 0.271, "step": 415 }, { "epoch": 1.0048309178743962, "grad_norm": 3.1105797290802, "learning_rate": 8.997584541062802e-07, "logits/chosen": -2.605891227722168, "logits/rejected": -2.6059765815734863, "logps/chosen": -41.92118453979492, "logps/rejected": -130.33238220214844, "loss": 0.4049, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4648489952087402, "rewards/margins": 6.475766658782959, "rewards/rejected": -5.0109171867370605, "step": 416 }, { "epoch": 1.0072463768115942, "grad_norm": 3.393101215362549, "learning_rate": 8.995169082125603e-07, "logits/chosen": -2.6011667251586914, "logits/rejected": -2.561570644378662, "logps/chosen": -45.646270751953125, "logps/rejected": -109.6526870727539, "loss": 0.5407, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1028251647949219, "rewards/margins": 4.826967239379883, "rewards/rejected": -3.724141836166382, "step": 417 }, { "epoch": 1.0096618357487923, "grad_norm": 3.4533727169036865, "learning_rate": 8.992753623188405e-07, "logits/chosen": -2.6230344772338867, "logits/rejected": -2.6056902408599854, "logps/chosen": -59.169952392578125, "logps/rejected": -119.6939697265625, "loss": 0.5573, "rewards/accuracies": 0.90625, "rewards/chosen": 0.13236939907073975, "rewards/margins": 4.542210102081299, "rewards/rejected": -4.409841060638428, "step": 418 }, { "epoch": 1.0120772946859904, "grad_norm": 4.300008296966553, "learning_rate": 8.990338164251208e-07, "logits/chosen": -2.5820629596710205, "logits/rejected": -2.58339262008667, "logps/chosen": -38.93910217285156, "logps/rejected": -111.1641845703125, "loss": 0.4891, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4932830333709717, "rewards/margins": 5.235544204711914, "rewards/rejected": -3.7422611713409424, "step": 419 }, { "epoch": 1.0144927536231885, "grad_norm": 3.2706105709075928, "learning_rate": 8.987922705314009e-07, "logits/chosen": -2.527921438217163, "logits/rejected": -2.5123608112335205, "logps/chosen": -41.24884033203125, "logps/rejected": -107.8558578491211, "loss": 0.5283, "rewards/accuracies": 0.9375, "rewards/chosen": 1.535855770111084, "rewards/margins": 4.811504364013672, "rewards/rejected": -3.275649070739746, "step": 420 }, { "epoch": 1.0169082125603865, "grad_norm": 4.626036643981934, "learning_rate": 8.985507246376811e-07, "logits/chosen": -2.7379567623138428, "logits/rejected": -2.7250471115112305, "logps/chosen": -48.15859603881836, "logps/rejected": -110.42562866210938, "loss": 0.6005, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6665149927139282, "rewards/margins": 4.40208101272583, "rewards/rejected": -3.7355661392211914, "step": 421 }, { "epoch": 1.0193236714975846, "grad_norm": 3.650603771209717, "learning_rate": 8.983091787439613e-07, "logits/chosen": -2.607762098312378, "logits/rejected": -2.5070741176605225, "logps/chosen": -49.26483154296875, "logps/rejected": -114.49710083007812, "loss": 0.5597, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5383005142211914, "rewards/margins": 4.580692291259766, "rewards/rejected": -4.042391777038574, "step": 422 }, { "epoch": 1.0217391304347827, "grad_norm": 3.2837331295013428, "learning_rate": 8.980676328502414e-07, "logits/chosen": -2.6302976608276367, "logits/rejected": -2.617279052734375, "logps/chosen": -37.43922424316406, "logps/rejected": -105.23869323730469, "loss": 0.4851, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6859190464019775, "rewards/margins": 5.275753498077393, "rewards/rejected": -3.589834213256836, "step": 423 }, { "epoch": 1.0241545893719808, "grad_norm": 3.410054922103882, "learning_rate": 8.978260869565218e-07, "logits/chosen": -2.5940613746643066, "logits/rejected": -2.625770092010498, "logps/chosen": -40.14744567871094, "logps/rejected": -116.39813232421875, "loss": 0.4836, "rewards/accuracies": 0.9375, "rewards/chosen": 1.549361228942871, "rewards/margins": 5.675092697143555, "rewards/rejected": -4.125731468200684, "step": 424 }, { "epoch": 1.0265700483091786, "grad_norm": 4.526313304901123, "learning_rate": 8.975845410628019e-07, "logits/chosen": -2.596595525741577, "logits/rejected": -2.587202548980713, "logps/chosen": -41.0631103515625, "logps/rejected": -108.38114929199219, "loss": 0.4886, "rewards/accuracies": 0.9375, "rewards/chosen": 1.318481683731079, "rewards/margins": 4.6165924072265625, "rewards/rejected": -3.2981107234954834, "step": 425 }, { "epoch": 1.0289855072463767, "grad_norm": 3.953585386276245, "learning_rate": 8.97342995169082e-07, "logits/chosen": -2.6092567443847656, "logits/rejected": -2.5924837589263916, "logps/chosen": -28.693607330322266, "logps/rejected": -121.63151550292969, "loss": 0.3072, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9179410934448242, "rewards/margins": 6.700744152069092, "rewards/rejected": -4.782803535461426, "step": 426 }, { "epoch": 1.0314009661835748, "grad_norm": 4.098236083984375, "learning_rate": 8.971014492753623e-07, "logits/chosen": -2.620417356491089, "logits/rejected": -2.6209146976470947, "logps/chosen": -37.23324966430664, "logps/rejected": -106.52333068847656, "loss": 0.4765, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6917386054992676, "rewards/margins": 5.3139328956604, "rewards/rejected": -3.6221940517425537, "step": 427 }, { "epoch": 1.0338164251207729, "grad_norm": 4.718957901000977, "learning_rate": 8.968599033816425e-07, "logits/chosen": -2.637164354324341, "logits/rejected": -2.6115329265594482, "logps/chosen": -43.71669006347656, "logps/rejected": -121.5641860961914, "loss": 0.5075, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2822468280792236, "rewards/margins": 5.699988842010498, "rewards/rejected": -4.417741775512695, "step": 428 }, { "epoch": 1.036231884057971, "grad_norm": 3.518916606903076, "learning_rate": 8.966183574879226e-07, "logits/chosen": -2.6828060150146484, "logits/rejected": -2.7054603099823, "logps/chosen": -52.724910736083984, "logps/rejected": -118.19335174560547, "loss": 0.5468, "rewards/accuracies": 1.0, "rewards/chosen": 0.710909903049469, "rewards/margins": 4.774940490722656, "rewards/rejected": -4.064030170440674, "step": 429 }, { "epoch": 1.038647342995169, "grad_norm": 3.992846727371216, "learning_rate": 8.963768115942029e-07, "logits/chosen": -2.5932157039642334, "logits/rejected": -2.6021127700805664, "logps/chosen": -58.07545471191406, "logps/rejected": -103.75344848632812, "loss": 0.6576, "rewards/accuracies": 0.96875, "rewards/chosen": 0.17756041884422302, "rewards/margins": 3.422055721282959, "rewards/rejected": -3.244494915008545, "step": 430 }, { "epoch": 1.041062801932367, "grad_norm": 3.8365118503570557, "learning_rate": 8.96135265700483e-07, "logits/chosen": -2.5282747745513916, "logits/rejected": -2.527406692504883, "logps/chosen": -40.04437255859375, "logps/rejected": -95.08016204833984, "loss": 0.6159, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4930206537246704, "rewards/margins": 4.107263088226318, "rewards/rejected": -2.6142425537109375, "step": 431 }, { "epoch": 1.0434782608695652, "grad_norm": 3.957303285598755, "learning_rate": 8.958937198067633e-07, "logits/chosen": -2.6070945262908936, "logits/rejected": -2.640035390853882, "logps/chosen": -35.46105194091797, "logps/rejected": -124.81752014160156, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 1.594611644744873, "rewards/margins": 6.551550388336182, "rewards/rejected": -4.956938743591309, "step": 432 }, { "epoch": 1.0458937198067633, "grad_norm": 4.231368064880371, "learning_rate": 8.956521739130435e-07, "logits/chosen": -2.6493897438049316, "logits/rejected": -2.5814709663391113, "logps/chosen": -30.49374008178711, "logps/rejected": -114.44390106201172, "loss": 0.3952, "rewards/accuracies": 0.9375, "rewards/chosen": 2.174466133117676, "rewards/margins": 6.406378269195557, "rewards/rejected": -4.231912136077881, "step": 433 }, { "epoch": 1.0483091787439613, "grad_norm": 3.9634103775024414, "learning_rate": 8.954106280193236e-07, "logits/chosen": -2.528419017791748, "logits/rejected": -2.560661554336548, "logps/chosen": -50.04874801635742, "logps/rejected": -112.06989288330078, "loss": 0.5881, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6344185471534729, "rewards/margins": 4.548036098480225, "rewards/rejected": -3.9136178493499756, "step": 434 }, { "epoch": 1.0507246376811594, "grad_norm": 4.363529205322266, "learning_rate": 8.951690821256039e-07, "logits/chosen": -2.599360942840576, "logits/rejected": -2.572638988494873, "logps/chosen": -51.673240661621094, "logps/rejected": -123.86787414550781, "loss": 0.5433, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5150082111358643, "rewards/margins": 5.167509078979492, "rewards/rejected": -4.652501106262207, "step": 435 }, { "epoch": 1.0531400966183575, "grad_norm": 5.2640461921691895, "learning_rate": 8.94927536231884e-07, "logits/chosen": -2.6198720932006836, "logits/rejected": -2.532317876815796, "logps/chosen": -39.98441696166992, "logps/rejected": -125.29607391357422, "loss": 0.4616, "rewards/accuracies": 0.96875, "rewards/chosen": 1.471652865409851, "rewards/margins": 6.223515510559082, "rewards/rejected": -4.751863479614258, "step": 436 }, { "epoch": 1.0555555555555556, "grad_norm": 3.845383644104004, "learning_rate": 8.946859903381641e-07, "logits/chosen": -2.6558308601379395, "logits/rejected": -2.6907076835632324, "logps/chosen": -58.325130462646484, "logps/rejected": -128.4691925048828, "loss": 0.5497, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3821641504764557, "rewards/margins": 5.146702766418457, "rewards/rejected": -4.764538288116455, "step": 437 }, { "epoch": 1.0579710144927537, "grad_norm": 5.015959739685059, "learning_rate": 8.944444444444445e-07, "logits/chosen": -2.623521089553833, "logits/rejected": -2.5650579929351807, "logps/chosen": -46.84141540527344, "logps/rejected": -101.47367858886719, "loss": 0.666, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2063573598861694, "rewards/margins": 4.339887619018555, "rewards/rejected": -3.1335301399230957, "step": 438 }, { "epoch": 1.0603864734299517, "grad_norm": 3.861541271209717, "learning_rate": 8.942028985507246e-07, "logits/chosen": -2.5534307956695557, "logits/rejected": -2.619211435317993, "logps/chosen": -41.18574523925781, "logps/rejected": -117.04010772705078, "loss": 0.4525, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3582358360290527, "rewards/margins": 5.511935234069824, "rewards/rejected": -4.153698921203613, "step": 439 }, { "epoch": 1.0628019323671498, "grad_norm": 4.287777423858643, "learning_rate": 8.939613526570048e-07, "logits/chosen": -2.564906597137451, "logits/rejected": -2.5664446353912354, "logps/chosen": -44.55738830566406, "logps/rejected": -113.7560043334961, "loss": 0.4783, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2252479791641235, "rewards/margins": 5.092106819152832, "rewards/rejected": -3.866858959197998, "step": 440 }, { "epoch": 1.065217391304348, "grad_norm": 5.927083492279053, "learning_rate": 8.93719806763285e-07, "logits/chosen": -2.5639967918395996, "logits/rejected": -2.5775601863861084, "logps/chosen": -27.698307037353516, "logps/rejected": -94.7236099243164, "loss": 0.4896, "rewards/accuracies": 0.90625, "rewards/chosen": 2.360830783843994, "rewards/margins": 4.687188625335693, "rewards/rejected": -2.326357841491699, "step": 441 }, { "epoch": 1.067632850241546, "grad_norm": 4.864030838012695, "learning_rate": 8.934782608695651e-07, "logits/chosen": -2.598689556121826, "logits/rejected": -2.6164679527282715, "logps/chosen": -65.08509826660156, "logps/rejected": -124.68943786621094, "loss": 0.5777, "rewards/accuracies": 0.78125, "rewards/chosen": -0.147934228181839, "rewards/margins": 4.3760786056518555, "rewards/rejected": -4.524013519287109, "step": 442 }, { "epoch": 1.070048309178744, "grad_norm": 3.651963233947754, "learning_rate": 8.932367149758454e-07, "logits/chosen": -2.6168699264526367, "logits/rejected": -2.6397173404693604, "logps/chosen": -42.22628402709961, "logps/rejected": -116.4482192993164, "loss": 0.4731, "rewards/accuracies": 1.0, "rewards/chosen": 1.2857609987258911, "rewards/margins": 5.393580436706543, "rewards/rejected": -4.107819557189941, "step": 443 }, { "epoch": 1.0724637681159421, "grad_norm": 4.363641738891602, "learning_rate": 8.929951690821256e-07, "logits/chosen": -2.559016466140747, "logits/rejected": -2.529608726501465, "logps/chosen": -35.48774719238281, "logps/rejected": -97.11813354492188, "loss": 0.4705, "rewards/accuracies": 0.84375, "rewards/chosen": 1.9132006168365479, "rewards/margins": 4.632171154022217, "rewards/rejected": -2.718970537185669, "step": 444 }, { "epoch": 1.07487922705314, "grad_norm": 3.698287010192871, "learning_rate": 8.927536231884057e-07, "logits/chosen": -2.6405773162841797, "logits/rejected": -2.6093344688415527, "logps/chosen": -51.20888900756836, "logps/rejected": -120.60979461669922, "loss": 0.5984, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5810604691505432, "rewards/margins": 4.586058139801025, "rewards/rejected": -4.004997730255127, "step": 445 }, { "epoch": 1.077294685990338, "grad_norm": 4.003686904907227, "learning_rate": 8.925120772946859e-07, "logits/chosen": -2.5925021171569824, "logits/rejected": -2.6366899013519287, "logps/chosen": -38.46699142456055, "logps/rejected": -121.04063415527344, "loss": 0.4941, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7231742143630981, "rewards/margins": 6.026377201080322, "rewards/rejected": -4.303203582763672, "step": 446 }, { "epoch": 1.0797101449275361, "grad_norm": 4.047671794891357, "learning_rate": 8.922705314009662e-07, "logits/chosen": -2.5097999572753906, "logits/rejected": -2.556992530822754, "logps/chosen": -43.48598098754883, "logps/rejected": -127.9912338256836, "loss": 0.4027, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4223432540893555, "rewards/margins": 6.4824748039245605, "rewards/rejected": -5.060131549835205, "step": 447 }, { "epoch": 1.0821256038647342, "grad_norm": 4.62867546081543, "learning_rate": 8.920289855072464e-07, "logits/chosen": -2.5404891967773438, "logits/rejected": -2.526895761489868, "logps/chosen": -41.04097366333008, "logps/rejected": -118.57115936279297, "loss": 0.4438, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3611537218093872, "rewards/margins": 5.888693809509277, "rewards/rejected": -4.52754020690918, "step": 448 }, { "epoch": 1.0845410628019323, "grad_norm": 5.070927619934082, "learning_rate": 8.917874396135266e-07, "logits/chosen": -2.6608357429504395, "logits/rejected": -2.6115729808807373, "logps/chosen": -47.497344970703125, "logps/rejected": -118.48504638671875, "loss": 0.5848, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7477467060089111, "rewards/margins": 4.957511901855469, "rewards/rejected": -4.2097649574279785, "step": 449 }, { "epoch": 1.0869565217391304, "grad_norm": 4.6315083503723145, "learning_rate": 8.915458937198067e-07, "logits/chosen": -2.567425012588501, "logits/rejected": -2.568225145339966, "logps/chosen": -40.97093963623047, "logps/rejected": -111.2282485961914, "loss": 0.5123, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3016517162322998, "rewards/margins": 4.971374034881592, "rewards/rejected": -3.669722080230713, "step": 450 }, { "epoch": 1.0893719806763285, "grad_norm": 3.3192410469055176, "learning_rate": 8.913043478260869e-07, "logits/chosen": -2.67216157913208, "logits/rejected": -2.584441661834717, "logps/chosen": -46.75366973876953, "logps/rejected": -141.75843811035156, "loss": 0.4612, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7653296589851379, "rewards/margins": 6.2579474449157715, "rewards/rejected": -5.492618560791016, "step": 451 }, { "epoch": 1.0917874396135265, "grad_norm": 3.8954057693481445, "learning_rate": 8.910628019323672e-07, "logits/chosen": -2.58689546585083, "logits/rejected": -2.5526888370513916, "logps/chosen": -37.32801818847656, "logps/rejected": -105.34778594970703, "loss": 0.526, "rewards/accuracies": 1.0, "rewards/chosen": 1.7092726230621338, "rewards/margins": 4.8013458251953125, "rewards/rejected": -3.0920732021331787, "step": 452 }, { "epoch": 1.0942028985507246, "grad_norm": 3.9246585369110107, "learning_rate": 8.908212560386473e-07, "logits/chosen": -2.6513469219207764, "logits/rejected": -2.6312334537506104, "logps/chosen": -47.3332405090332, "logps/rejected": -121.871337890625, "loss": 0.4439, "rewards/accuracies": 0.96875, "rewards/chosen": 0.985320508480072, "rewards/margins": 5.751285552978516, "rewards/rejected": -4.765964984893799, "step": 453 }, { "epoch": 1.0966183574879227, "grad_norm": 3.617729425430298, "learning_rate": 8.905797101449275e-07, "logits/chosen": -2.599759817123413, "logits/rejected": -2.615044116973877, "logps/chosen": -36.75774383544922, "logps/rejected": -120.17108154296875, "loss": 0.4526, "rewards/accuracies": 0.96875, "rewards/chosen": 1.849367618560791, "rewards/margins": 6.1471686363220215, "rewards/rejected": -4.2978010177612305, "step": 454 }, { "epoch": 1.0990338164251208, "grad_norm": 4.528633117675781, "learning_rate": 8.903381642512077e-07, "logits/chosen": -2.5485222339630127, "logits/rejected": -2.5317156314849854, "logps/chosen": -46.754249572753906, "logps/rejected": -113.9783706665039, "loss": 0.4921, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2736999988555908, "rewards/margins": 4.9658074378967285, "rewards/rejected": -3.692107915878296, "step": 455 }, { "epoch": 1.1014492753623188, "grad_norm": 4.918299198150635, "learning_rate": 8.900966183574878e-07, "logits/chosen": -2.5609636306762695, "logits/rejected": -2.581353187561035, "logps/chosen": -33.19755554199219, "logps/rejected": -100.8149185180664, "loss": 0.4986, "rewards/accuracies": 1.0, "rewards/chosen": 2.055375337600708, "rewards/margins": 4.93740701675415, "rewards/rejected": -2.8820314407348633, "step": 456 }, { "epoch": 1.103864734299517, "grad_norm": 5.912226676940918, "learning_rate": 8.898550724637681e-07, "logits/chosen": -2.6320743560791016, "logits/rejected": -2.639190673828125, "logps/chosen": -35.38416290283203, "logps/rejected": -101.82452392578125, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 2.1118929386138916, "rewards/margins": 5.158051490783691, "rewards/rejected": -3.046158790588379, "step": 457 }, { "epoch": 1.106280193236715, "grad_norm": 4.353630065917969, "learning_rate": 8.896135265700483e-07, "logits/chosen": -2.603883743286133, "logits/rejected": -2.5891594886779785, "logps/chosen": -47.13800048828125, "logps/rejected": -114.80472564697266, "loss": 0.4733, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1270616054534912, "rewards/margins": 4.98988676071167, "rewards/rejected": -3.8628249168395996, "step": 458 }, { "epoch": 1.108695652173913, "grad_norm": 6.951180934906006, "learning_rate": 8.893719806763285e-07, "logits/chosen": -2.63873028755188, "logits/rejected": -2.5769426822662354, "logps/chosen": -39.656715393066406, "logps/rejected": -107.7349853515625, "loss": 0.4738, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7218363285064697, "rewards/margins": 5.057541847229004, "rewards/rejected": -3.335705280303955, "step": 459 }, { "epoch": 1.1111111111111112, "grad_norm": 3.8291704654693604, "learning_rate": 8.891304347826086e-07, "logits/chosen": -2.566847801208496, "logits/rejected": -2.5362868309020996, "logps/chosen": -38.8458251953125, "logps/rejected": -119.05785369873047, "loss": 0.4542, "rewards/accuracies": 0.9375, "rewards/chosen": 1.311506748199463, "rewards/margins": 5.621739387512207, "rewards/rejected": -4.310233116149902, "step": 460 }, { "epoch": 1.1135265700483092, "grad_norm": 4.699629306793213, "learning_rate": 8.888888888888888e-07, "logits/chosen": -2.5719196796417236, "logits/rejected": -2.563946485519409, "logps/chosen": -46.87767791748047, "logps/rejected": -123.99354553222656, "loss": 0.4881, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0955252647399902, "rewards/margins": 5.5251240730285645, "rewards/rejected": -4.429598808288574, "step": 461 }, { "epoch": 1.1159420289855073, "grad_norm": 4.661625385284424, "learning_rate": 8.886473429951691e-07, "logits/chosen": -2.558675765991211, "logits/rejected": -2.556960344314575, "logps/chosen": -37.552879333496094, "logps/rejected": -101.47866821289062, "loss": 0.4716, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6755064725875854, "rewards/margins": 4.818243026733398, "rewards/rejected": -3.1427364349365234, "step": 462 }, { "epoch": 1.1183574879227054, "grad_norm": 3.6585965156555176, "learning_rate": 8.884057971014492e-07, "logits/chosen": -2.595059394836426, "logits/rejected": -2.568617582321167, "logps/chosen": -27.07090950012207, "logps/rejected": -110.729736328125, "loss": 0.3925, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7220630645751953, "rewards/margins": 6.06181526184082, "rewards/rejected": -3.339752674102783, "step": 463 }, { "epoch": 1.1207729468599035, "grad_norm": 2.792640447616577, "learning_rate": 8.881642512077294e-07, "logits/chosen": -2.5390186309814453, "logits/rejected": -2.6093270778656006, "logps/chosen": -37.3294792175293, "logps/rejected": -116.31377410888672, "loss": 0.4825, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6544744968414307, "rewards/margins": 5.401432514190674, "rewards/rejected": -3.746958017349243, "step": 464 }, { "epoch": 1.1231884057971016, "grad_norm": 3.518075942993164, "learning_rate": 8.879227053140096e-07, "logits/chosen": -2.5373878479003906, "logits/rejected": -2.54280948638916, "logps/chosen": -33.468265533447266, "logps/rejected": -112.85226440429688, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 2.050853967666626, "rewards/margins": 5.8605732917785645, "rewards/rejected": -3.8097195625305176, "step": 465 }, { "epoch": 1.1256038647342996, "grad_norm": 4.058253765106201, "learning_rate": 8.876811594202897e-07, "logits/chosen": -2.592343330383301, "logits/rejected": -2.62387752532959, "logps/chosen": -39.32971954345703, "logps/rejected": -109.27976989746094, "loss": 0.5054, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5980948209762573, "rewards/margins": 5.156721115112305, "rewards/rejected": -3.5586254596710205, "step": 466 }, { "epoch": 1.1280193236714977, "grad_norm": 4.616971969604492, "learning_rate": 8.874396135265701e-07, "logits/chosen": -2.560105323791504, "logits/rejected": -2.5134880542755127, "logps/chosen": -44.02205276489258, "logps/rejected": -106.42215728759766, "loss": 0.5091, "rewards/accuracies": 0.90625, "rewards/chosen": 1.062299370765686, "rewards/margins": 4.593163013458252, "rewards/rejected": -3.5308637619018555, "step": 467 }, { "epoch": 1.1304347826086956, "grad_norm": 3.6181716918945312, "learning_rate": 8.871980676328502e-07, "logits/chosen": -2.5406606197357178, "logits/rejected": -2.581204891204834, "logps/chosen": -38.6399040222168, "logps/rejected": -111.93077850341797, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 1.532452940940857, "rewards/margins": 5.2753753662109375, "rewards/rejected": -3.742922306060791, "step": 468 }, { "epoch": 1.1328502415458936, "grad_norm": 3.999933958053589, "learning_rate": 8.869565217391303e-07, "logits/chosen": -2.645296573638916, "logits/rejected": -2.663938522338867, "logps/chosen": -35.73670196533203, "logps/rejected": -107.51795959472656, "loss": 0.475, "rewards/accuracies": 0.96875, "rewards/chosen": 1.719652533531189, "rewards/margins": 5.294371604919434, "rewards/rejected": -3.574718952178955, "step": 469 }, { "epoch": 1.1352657004830917, "grad_norm": 5.456958293914795, "learning_rate": 8.867149758454106e-07, "logits/chosen": -2.550243854522705, "logits/rejected": -2.518960952758789, "logps/chosen": -44.320838928222656, "logps/rejected": -110.39126586914062, "loss": 0.5295, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2381927967071533, "rewards/margins": 4.83841609954834, "rewards/rejected": -3.6002230644226074, "step": 470 }, { "epoch": 1.1376811594202898, "grad_norm": 5.1386518478393555, "learning_rate": 8.864734299516908e-07, "logits/chosen": -2.693704128265381, "logits/rejected": -2.6926581859588623, "logps/chosen": -37.99272918701172, "logps/rejected": -108.94185638427734, "loss": 0.5289, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6519465446472168, "rewards/margins": 5.049349784851074, "rewards/rejected": -3.3974032402038574, "step": 471 }, { "epoch": 1.1400966183574879, "grad_norm": 5.201127052307129, "learning_rate": 8.86231884057971e-07, "logits/chosen": -2.555697441101074, "logits/rejected": -2.611842155456543, "logps/chosen": -33.9857063293457, "logps/rejected": -105.97215270996094, "loss": 0.452, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1384057998657227, "rewards/margins": 5.522233963012695, "rewards/rejected": -3.383828639984131, "step": 472 }, { "epoch": 1.142512077294686, "grad_norm": 3.5278124809265137, "learning_rate": 8.859903381642512e-07, "logits/chosen": -2.557520866394043, "logits/rejected": -2.602609157562256, "logps/chosen": -40.32299041748047, "logps/rejected": -100.34219360351562, "loss": 0.5842, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5130400657653809, "rewards/margins": 4.31466817855835, "rewards/rejected": -2.801628351211548, "step": 473 }, { "epoch": 1.144927536231884, "grad_norm": 4.1391706466674805, "learning_rate": 8.857487922705313e-07, "logits/chosen": -2.675992012023926, "logits/rejected": -2.580096960067749, "logps/chosen": -40.19935607910156, "logps/rejected": -120.40171813964844, "loss": 0.4885, "rewards/accuracies": 0.9375, "rewards/chosen": 1.487318992614746, "rewards/margins": 5.750810623168945, "rewards/rejected": -4.263491153717041, "step": 474 }, { "epoch": 1.1473429951690821, "grad_norm": 3.239680051803589, "learning_rate": 8.855072463768116e-07, "logits/chosen": -2.5956575870513916, "logits/rejected": -2.6009061336517334, "logps/chosen": -40.814353942871094, "logps/rejected": -95.38732147216797, "loss": 0.6698, "rewards/accuracies": 0.875, "rewards/chosen": 1.392287015914917, "rewards/margins": 4.090013027191162, "rewards/rejected": -2.697725772857666, "step": 475 }, { "epoch": 1.1497584541062802, "grad_norm": 5.721564769744873, "learning_rate": 8.852657004830918e-07, "logits/chosen": -2.5720534324645996, "logits/rejected": -2.5948493480682373, "logps/chosen": -41.52191162109375, "logps/rejected": -124.43560791015625, "loss": 0.5086, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7225943803787231, "rewards/margins": 5.833974361419678, "rewards/rejected": -4.111380100250244, "step": 476 }, { "epoch": 1.1521739130434783, "grad_norm": 4.8612847328186035, "learning_rate": 8.850241545893719e-07, "logits/chosen": -2.5972254276275635, "logits/rejected": -2.5714244842529297, "logps/chosen": -43.70191955566406, "logps/rejected": -123.94358825683594, "loss": 0.5016, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2752176523208618, "rewards/margins": 5.89178466796875, "rewards/rejected": -4.6165666580200195, "step": 477 }, { "epoch": 1.1545893719806763, "grad_norm": 4.336092472076416, "learning_rate": 8.847826086956522e-07, "logits/chosen": -2.7103521823883057, "logits/rejected": -2.6095168590545654, "logps/chosen": -51.897613525390625, "logps/rejected": -132.8423614501953, "loss": 0.54, "rewards/accuracies": 0.96875, "rewards/chosen": 0.28299322724342346, "rewards/margins": 5.522188663482666, "rewards/rejected": -5.239195823669434, "step": 478 }, { "epoch": 1.1570048309178744, "grad_norm": 3.9885659217834473, "learning_rate": 8.845410628019323e-07, "logits/chosen": -2.5761234760284424, "logits/rejected": -2.5469038486480713, "logps/chosen": -36.91161346435547, "logps/rejected": -102.27699279785156, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 1.7260080575942993, "rewards/margins": 4.93068790435791, "rewards/rejected": -3.2046799659729004, "step": 479 }, { "epoch": 1.1594202898550725, "grad_norm": 4.679018497467041, "learning_rate": 8.842995169082124e-07, "logits/chosen": -2.6167306900024414, "logits/rejected": -2.6375997066497803, "logps/chosen": -42.7772331237793, "logps/rejected": -111.87171936035156, "loss": 0.5817, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3370938301086426, "rewards/margins": 5.109349727630615, "rewards/rejected": -3.7722554206848145, "step": 480 }, { "epoch": 1.1618357487922706, "grad_norm": 2.520059585571289, "learning_rate": 8.840579710144928e-07, "logits/chosen": -2.4975733757019043, "logits/rejected": -2.526637554168701, "logps/chosen": -48.59960174560547, "logps/rejected": -127.90591430664062, "loss": 0.5251, "rewards/accuracies": 0.875, "rewards/chosen": 0.7671695947647095, "rewards/margins": 5.344730377197266, "rewards/rejected": -4.5775604248046875, "step": 481 }, { "epoch": 1.1642512077294687, "grad_norm": 5.807524681091309, "learning_rate": 8.838164251207729e-07, "logits/chosen": -2.628325939178467, "logits/rejected": -2.561757802963257, "logps/chosen": -40.319156646728516, "logps/rejected": -86.61615753173828, "loss": 0.6347, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6119352579116821, "rewards/margins": 3.6157643795013428, "rewards/rejected": -2.003829002380371, "step": 482 }, { "epoch": 1.1666666666666667, "grad_norm": 5.551951885223389, "learning_rate": 8.835748792270531e-07, "logits/chosen": -2.546905517578125, "logits/rejected": -2.648021697998047, "logps/chosen": -37.38425827026367, "logps/rejected": -123.63310241699219, "loss": 0.4234, "rewards/accuracies": 1.0, "rewards/chosen": 1.7930485010147095, "rewards/margins": 5.960102081298828, "rewards/rejected": -4.16705322265625, "step": 483 }, { "epoch": 1.1690821256038648, "grad_norm": 4.756727695465088, "learning_rate": 8.833333333333333e-07, "logits/chosen": -2.5423741340637207, "logits/rejected": -2.4938175678253174, "logps/chosen": -40.28768539428711, "logps/rejected": -125.98513793945312, "loss": 0.3375, "rewards/accuracies": 0.9375, "rewards/chosen": 1.637324333190918, "rewards/margins": 6.4369425773620605, "rewards/rejected": -4.799618721008301, "step": 484 }, { "epoch": 1.171497584541063, "grad_norm": 4.1681952476501465, "learning_rate": 8.830917874396134e-07, "logits/chosen": -2.6116371154785156, "logits/rejected": -2.5911974906921387, "logps/chosen": -47.98054885864258, "logps/rejected": -116.11734771728516, "loss": 0.5873, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8497700691223145, "rewards/margins": 4.793516159057617, "rewards/rejected": -3.943746328353882, "step": 485 }, { "epoch": 1.1739130434782608, "grad_norm": 4.576903820037842, "learning_rate": 8.828502415458938e-07, "logits/chosen": -2.54168438911438, "logits/rejected": -2.56864595413208, "logps/chosen": -44.608062744140625, "logps/rejected": -107.95592498779297, "loss": 0.5239, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1024928092956543, "rewards/margins": 4.612649440765381, "rewards/rejected": -3.5101566314697266, "step": 486 }, { "epoch": 1.1763285024154588, "grad_norm": 3.5666561126708984, "learning_rate": 8.826086956521739e-07, "logits/chosen": -2.598430633544922, "logits/rejected": -2.6350934505462646, "logps/chosen": -41.517311096191406, "logps/rejected": -116.35870361328125, "loss": 0.4943, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4595717191696167, "rewards/margins": 5.551303863525391, "rewards/rejected": -4.091732025146484, "step": 487 }, { "epoch": 1.178743961352657, "grad_norm": 4.290339469909668, "learning_rate": 8.82367149758454e-07, "logits/chosen": -2.6394858360290527, "logits/rejected": -2.560936450958252, "logps/chosen": -39.50251770019531, "logps/rejected": -110.6702651977539, "loss": 0.491, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4918005466461182, "rewards/margins": 5.363026142120361, "rewards/rejected": -3.871225595474243, "step": 488 }, { "epoch": 1.181159420289855, "grad_norm": 6.21342134475708, "learning_rate": 8.821256038647343e-07, "logits/chosen": -2.567479133605957, "logits/rejected": -2.5553698539733887, "logps/chosen": -41.920570373535156, "logps/rejected": -107.34149169921875, "loss": 0.5798, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5349477529525757, "rewards/margins": 5.006089210510254, "rewards/rejected": -3.4711415767669678, "step": 489 }, { "epoch": 1.183574879227053, "grad_norm": 5.291383266448975, "learning_rate": 8.818840579710145e-07, "logits/chosen": -2.5815303325653076, "logits/rejected": -2.5687167644500732, "logps/chosen": -48.7113151550293, "logps/rejected": -120.5376205444336, "loss": 0.5137, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7836085557937622, "rewards/margins": 5.2453765869140625, "rewards/rejected": -4.461767673492432, "step": 490 }, { "epoch": 1.1859903381642511, "grad_norm": 3.7997264862060547, "learning_rate": 8.816425120772947e-07, "logits/chosen": -2.6682956218719482, "logits/rejected": -2.6259522438049316, "logps/chosen": -44.53569793701172, "logps/rejected": -102.32382202148438, "loss": 0.6557, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0382131338119507, "rewards/margins": 4.091159820556641, "rewards/rejected": -3.0529470443725586, "step": 491 }, { "epoch": 1.1884057971014492, "grad_norm": 3.962477445602417, "learning_rate": 8.814009661835749e-07, "logits/chosen": -2.661832809448242, "logits/rejected": -2.617124319076538, "logps/chosen": -33.196372985839844, "logps/rejected": -113.92127227783203, "loss": 0.4096, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9635741710662842, "rewards/margins": 5.8490729331970215, "rewards/rejected": -3.885498523712158, "step": 492 }, { "epoch": 1.1908212560386473, "grad_norm": 4.188199520111084, "learning_rate": 8.81159420289855e-07, "logits/chosen": -2.589700222015381, "logits/rejected": -2.594041109085083, "logps/chosen": -44.69469451904297, "logps/rejected": -105.1494369506836, "loss": 0.5306, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2230077981948853, "rewards/margins": 4.755283355712891, "rewards/rejected": -3.532275438308716, "step": 493 }, { "epoch": 1.1932367149758454, "grad_norm": 3.917273759841919, "learning_rate": 8.809178743961352e-07, "logits/chosen": -2.543727159500122, "logits/rejected": -2.5433709621429443, "logps/chosen": -45.521240234375, "logps/rejected": -124.64077758789062, "loss": 0.4805, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2542438507080078, "rewards/margins": 5.692828178405762, "rewards/rejected": -4.4385833740234375, "step": 494 }, { "epoch": 1.1956521739130435, "grad_norm": 6.018936634063721, "learning_rate": 8.806763285024155e-07, "logits/chosen": -2.6401548385620117, "logits/rejected": -2.601133108139038, "logps/chosen": -42.497886657714844, "logps/rejected": -135.7871551513672, "loss": 0.4938, "rewards/accuracies": 1.0, "rewards/chosen": 1.186888337135315, "rewards/margins": 6.571922302246094, "rewards/rejected": -5.385034084320068, "step": 495 }, { "epoch": 1.1980676328502415, "grad_norm": 5.027449131011963, "learning_rate": 8.804347826086956e-07, "logits/chosen": -2.5978610515594482, "logits/rejected": -2.602315902709961, "logps/chosen": -30.7115535736084, "logps/rejected": -97.7164306640625, "loss": 0.554, "rewards/accuracies": 0.9375, "rewards/chosen": 2.3142714500427246, "rewards/margins": 5.07051944732666, "rewards/rejected": -2.756248712539673, "step": 496 }, { "epoch": 1.2004830917874396, "grad_norm": 4.549402236938477, "learning_rate": 8.801932367149758e-07, "logits/chosen": -2.587824583053589, "logits/rejected": -2.5786726474761963, "logps/chosen": -43.449180603027344, "logps/rejected": -120.54010009765625, "loss": 0.4965, "rewards/accuracies": 0.875, "rewards/chosen": 1.02234947681427, "rewards/margins": 5.74852180480957, "rewards/rejected": -4.72617244720459, "step": 497 }, { "epoch": 1.2028985507246377, "grad_norm": 6.5941901206970215, "learning_rate": 8.79951690821256e-07, "logits/chosen": -2.6201369762420654, "logits/rejected": -2.619906425476074, "logps/chosen": -48.37778854370117, "logps/rejected": -142.09486389160156, "loss": 0.5083, "rewards/accuracies": 0.90625, "rewards/chosen": 1.000125527381897, "rewards/margins": 6.786865234375, "rewards/rejected": -5.786739349365234, "step": 498 }, { "epoch": 1.2028985507246377, "eval_logits/chosen": -2.6225168704986572, "eval_logits/rejected": -2.6275761127471924, "eval_logps/chosen": -40.78343963623047, "eval_logps/rejected": -122.1749038696289, "eval_loss": 0.4391176998615265, "eval_rewards/accuracies": 0.9540441036224365, "eval_rewards/chosen": 1.3877477645874023, "eval_rewards/margins": 5.971837043762207, "eval_rewards/rejected": -4.584089279174805, "eval_runtime": 997.9307, "eval_samples_per_second": 0.545, "eval_steps_per_second": 0.273, "step": 498 }, { "epoch": 1.2053140096618358, "grad_norm": 4.367673397064209, "learning_rate": 8.797101449275361e-07, "logits/chosen": -2.4702634811401367, "logits/rejected": -2.481962203979492, "logps/chosen": -38.347412109375, "logps/rejected": -115.88072204589844, "loss": 0.4404, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6336613893508911, "rewards/margins": 5.98627233505249, "rewards/rejected": -4.3526105880737305, "step": 499 }, { "epoch": 1.2077294685990339, "grad_norm": 4.816328525543213, "learning_rate": 8.794685990338164e-07, "logits/chosen": -2.613982677459717, "logits/rejected": -2.5797643661499023, "logps/chosen": -44.4234619140625, "logps/rejected": -112.66798400878906, "loss": 0.4431, "rewards/accuracies": 1.0, "rewards/chosen": 1.3549902439117432, "rewards/margins": 5.411596775054932, "rewards/rejected": -4.056605815887451, "step": 500 }, { "epoch": 1.210144927536232, "grad_norm": 3.7755181789398193, "learning_rate": 8.792270531400966e-07, "logits/chosen": -2.63773775100708, "logits/rejected": -2.5949158668518066, "logps/chosen": -47.95201110839844, "logps/rejected": -135.83172607421875, "loss": 0.4492, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8609695434570312, "rewards/margins": 6.755655765533447, "rewards/rejected": -5.894685745239258, "step": 501 }, { "epoch": 1.21256038647343, "grad_norm": 4.424658298492432, "learning_rate": 8.789855072463768e-07, "logits/chosen": -2.648190975189209, "logits/rejected": -2.6051154136657715, "logps/chosen": -38.44743728637695, "logps/rejected": -127.455810546875, "loss": 0.3566, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5478490591049194, "rewards/margins": 6.4926886558532715, "rewards/rejected": -4.9448394775390625, "step": 502 }, { "epoch": 1.214975845410628, "grad_norm": 3.074314832687378, "learning_rate": 8.787439613526569e-07, "logits/chosen": -2.641392469406128, "logits/rejected": -2.6468138694763184, "logps/chosen": -49.5994873046875, "logps/rejected": -115.06494140625, "loss": 0.5846, "rewards/accuracies": 0.90625, "rewards/chosen": 0.726927638053894, "rewards/margins": 4.680724620819092, "rewards/rejected": -3.953796863555908, "step": 503 }, { "epoch": 1.2173913043478262, "grad_norm": 3.8795084953308105, "learning_rate": 8.785024154589371e-07, "logits/chosen": -2.623997211456299, "logits/rejected": -2.632861614227295, "logps/chosen": -39.202606201171875, "logps/rejected": -124.5756607055664, "loss": 0.46, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3870627880096436, "rewards/margins": 6.374445915222168, "rewards/rejected": -4.987382411956787, "step": 504 }, { "epoch": 1.2198067632850242, "grad_norm": 4.549842357635498, "learning_rate": 8.782608695652174e-07, "logits/chosen": -2.6146812438964844, "logits/rejected": -2.5857295989990234, "logps/chosen": -43.01447677612305, "logps/rejected": -96.06529235839844, "loss": 0.5581, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3548262119293213, "rewards/margins": 4.253104209899902, "rewards/rejected": -2.898277521133423, "step": 505 }, { "epoch": 1.2222222222222223, "grad_norm": 3.0059587955474854, "learning_rate": 8.780193236714975e-07, "logits/chosen": -2.575605869293213, "logits/rejected": -2.591783046722412, "logps/chosen": -47.13542556762695, "logps/rejected": -113.78711700439453, "loss": 0.5491, "rewards/accuracies": 0.875, "rewards/chosen": 1.0205599069595337, "rewards/margins": 4.84520959854126, "rewards/rejected": -3.8246495723724365, "step": 506 }, { "epoch": 1.2246376811594204, "grad_norm": 3.5703606605529785, "learning_rate": 8.777777777777777e-07, "logits/chosen": -2.608881711959839, "logits/rejected": -2.6169824600219727, "logps/chosen": -43.43038558959961, "logps/rejected": -123.7955093383789, "loss": 0.4777, "rewards/accuracies": 1.0, "rewards/chosen": 1.0655046701431274, "rewards/margins": 6.057831764221191, "rewards/rejected": -4.9923272132873535, "step": 507 }, { "epoch": 1.2270531400966185, "grad_norm": 3.436222553253174, "learning_rate": 8.775362318840579e-07, "logits/chosen": -2.58010196685791, "logits/rejected": -2.5796074867248535, "logps/chosen": -40.77398681640625, "logps/rejected": -116.09842681884766, "loss": 0.5221, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3955281972885132, "rewards/margins": 5.3214945793151855, "rewards/rejected": -3.925966262817383, "step": 508 }, { "epoch": 1.2294685990338163, "grad_norm": 3.683936357498169, "learning_rate": 8.772946859903382e-07, "logits/chosen": -2.5100953578948975, "logits/rejected": -2.5762572288513184, "logps/chosen": -36.284332275390625, "logps/rejected": -108.69664764404297, "loss": 0.5099, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8084419965744019, "rewards/margins": 5.34394645690918, "rewards/rejected": -3.5355050563812256, "step": 509 }, { "epoch": 1.2318840579710144, "grad_norm": 3.601877212524414, "learning_rate": 8.770531400966184e-07, "logits/chosen": -2.5846409797668457, "logits/rejected": -2.559783458709717, "logps/chosen": -43.971771240234375, "logps/rejected": -114.01638793945312, "loss": 0.5392, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1299428939819336, "rewards/margins": 5.05778694152832, "rewards/rejected": -3.927844524383545, "step": 510 }, { "epoch": 1.2342995169082125, "grad_norm": 3.6790828704833984, "learning_rate": 8.768115942028985e-07, "logits/chosen": -2.4930241107940674, "logits/rejected": -2.5492942333221436, "logps/chosen": -49.33845520019531, "logps/rejected": -119.4847412109375, "loss": 0.5472, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8769235014915466, "rewards/margins": 5.03084135055542, "rewards/rejected": -4.15391731262207, "step": 511 }, { "epoch": 1.2367149758454106, "grad_norm": 3.6361453533172607, "learning_rate": 8.765700483091787e-07, "logits/chosen": -2.612440824508667, "logits/rejected": -2.645217180252075, "logps/chosen": -40.50059127807617, "logps/rejected": -126.04774475097656, "loss": 0.4098, "rewards/accuracies": 0.96875, "rewards/chosen": 1.378836750984192, "rewards/margins": 6.212991714477539, "rewards/rejected": -4.834155082702637, "step": 512 }, { "epoch": 1.2391304347826086, "grad_norm": 4.5494065284729, "learning_rate": 8.763285024154589e-07, "logits/chosen": -2.5774128437042236, "logits/rejected": -2.5480189323425293, "logps/chosen": -50.6318473815918, "logps/rejected": -126.43612670898438, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 0.8604894280433655, "rewards/margins": 5.835018634796143, "rewards/rejected": -4.974529266357422, "step": 513 }, { "epoch": 1.2415458937198067, "grad_norm": 4.306845664978027, "learning_rate": 8.760869565217391e-07, "logits/chosen": -2.6071949005126953, "logits/rejected": -2.585136651992798, "logps/chosen": -41.315147399902344, "logps/rejected": -112.09539794921875, "loss": 0.4098, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6930816173553467, "rewards/margins": 5.485130310058594, "rewards/rejected": -3.792048692703247, "step": 514 }, { "epoch": 1.2439613526570048, "grad_norm": 4.279751777648926, "learning_rate": 8.758454106280193e-07, "logits/chosen": -2.580202341079712, "logits/rejected": -2.5881621837615967, "logps/chosen": -50.73526382446289, "logps/rejected": -124.565673828125, "loss": 0.5606, "rewards/accuracies": 0.90625, "rewards/chosen": 0.48120373487472534, "rewards/margins": 5.378701210021973, "rewards/rejected": -4.897497177124023, "step": 515 }, { "epoch": 1.2463768115942029, "grad_norm": 5.493028163909912, "learning_rate": 8.756038647342995e-07, "logits/chosen": -2.64756178855896, "logits/rejected": -2.701164722442627, "logps/chosen": -41.019287109375, "logps/rejected": -117.61551666259766, "loss": 0.4704, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2647837400436401, "rewards/margins": 5.609374046325684, "rewards/rejected": -4.344590187072754, "step": 516 }, { "epoch": 1.248792270531401, "grad_norm": 3.0010483264923096, "learning_rate": 8.753623188405796e-07, "logits/chosen": -2.5331854820251465, "logits/rejected": -2.53788423538208, "logps/chosen": -45.70649719238281, "logps/rejected": -123.85284423828125, "loss": 0.5159, "rewards/accuracies": 0.875, "rewards/chosen": 1.0302846431732178, "rewards/margins": 5.727871894836426, "rewards/rejected": -4.697587013244629, "step": 517 }, { "epoch": 1.251207729468599, "grad_norm": 3.6678242683410645, "learning_rate": 8.751207729468599e-07, "logits/chosen": -2.5391907691955566, "logits/rejected": -2.561779737472534, "logps/chosen": -44.28172302246094, "logps/rejected": -113.71290588378906, "loss": 0.5859, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0291683673858643, "rewards/margins": 4.828496932983398, "rewards/rejected": -3.799328565597534, "step": 518 }, { "epoch": 1.2536231884057971, "grad_norm": 3.5130836963653564, "learning_rate": 8.748792270531401e-07, "logits/chosen": -2.5835378170013428, "logits/rejected": -2.5981292724609375, "logps/chosen": -37.25799560546875, "logps/rejected": -109.12039184570312, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": 1.9464008808135986, "rewards/margins": 5.37485408782959, "rewards/rejected": -3.428452730178833, "step": 519 }, { "epoch": 1.2560386473429952, "grad_norm": 4.300833225250244, "learning_rate": 8.746376811594202e-07, "logits/chosen": -2.6349403858184814, "logits/rejected": -2.6097352504730225, "logps/chosen": -47.272037506103516, "logps/rejected": -121.3675765991211, "loss": 0.5499, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6346207857131958, "rewards/margins": 5.016212463378906, "rewards/rejected": -4.381591796875, "step": 520 }, { "epoch": 1.2584541062801933, "grad_norm": 7.114952564239502, "learning_rate": 8.743961352657005e-07, "logits/chosen": -2.589517116546631, "logits/rejected": -2.565357208251953, "logps/chosen": -32.91340255737305, "logps/rejected": -99.86181640625, "loss": 0.4889, "rewards/accuracies": 0.90625, "rewards/chosen": 2.1678786277770996, "rewards/margins": 5.130422115325928, "rewards/rejected": -2.962543487548828, "step": 521 }, { "epoch": 1.2608695652173914, "grad_norm": 5.1555280685424805, "learning_rate": 8.741545893719806e-07, "logits/chosen": -2.590585947036743, "logits/rejected": -2.530109405517578, "logps/chosen": -28.900447845458984, "logps/rejected": -97.91444396972656, "loss": 0.4985, "rewards/accuracies": 0.90625, "rewards/chosen": 2.2688093185424805, "rewards/margins": 4.860871315002441, "rewards/rejected": -2.592062473297119, "step": 522 }, { "epoch": 1.2632850241545894, "grad_norm": 5.903955459594727, "learning_rate": 8.739130434782607e-07, "logits/chosen": -2.6523964405059814, "logits/rejected": -2.643751382827759, "logps/chosen": -41.21404266357422, "logps/rejected": -115.45970916748047, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": 1.0878793001174927, "rewards/margins": 5.2677764892578125, "rewards/rejected": -4.179897308349609, "step": 523 }, { "epoch": 1.2657004830917875, "grad_norm": 4.094972133636475, "learning_rate": 8.736714975845411e-07, "logits/chosen": -2.628006935119629, "logits/rejected": -2.6389102935791016, "logps/chosen": -36.067222595214844, "logps/rejected": -122.87637329101562, "loss": 0.4579, "rewards/accuracies": 1.0, "rewards/chosen": 1.7435822486877441, "rewards/margins": 6.398463249206543, "rewards/rejected": -4.654880523681641, "step": 524 }, { "epoch": 1.2681159420289856, "grad_norm": 4.245271682739258, "learning_rate": 8.734299516908212e-07, "logits/chosen": -2.646291732788086, "logits/rejected": -2.7032198905944824, "logps/chosen": -36.04389572143555, "logps/rejected": -98.7752685546875, "loss": 0.5329, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9274263381958008, "rewards/margins": 4.8153839111328125, "rewards/rejected": -2.8879573345184326, "step": 525 }, { "epoch": 1.2705314009661834, "grad_norm": 4.579368591308594, "learning_rate": 8.731884057971015e-07, "logits/chosen": -2.5867743492126465, "logits/rejected": -2.584933280944824, "logps/chosen": -40.66446304321289, "logps/rejected": -115.83330535888672, "loss": 0.4437, "rewards/accuracies": 1.0, "rewards/chosen": 1.2119849920272827, "rewards/margins": 5.588465213775635, "rewards/rejected": -4.376480579376221, "step": 526 }, { "epoch": 1.2729468599033815, "grad_norm": 4.0903520584106445, "learning_rate": 8.729468599033816e-07, "logits/chosen": -2.6032190322875977, "logits/rejected": -2.5973429679870605, "logps/chosen": -47.061500549316406, "logps/rejected": -116.77405548095703, "loss": 0.4993, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0095891952514648, "rewards/margins": 5.28641414642334, "rewards/rejected": -4.276824951171875, "step": 527 }, { "epoch": 1.2753623188405796, "grad_norm": 3.929943084716797, "learning_rate": 8.727053140096617e-07, "logits/chosen": -2.584667205810547, "logits/rejected": -2.6266279220581055, "logps/chosen": -37.78976821899414, "logps/rejected": -122.63629150390625, "loss": 0.4995, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5729589462280273, "rewards/margins": 6.013574123382568, "rewards/rejected": -4.440614700317383, "step": 528 }, { "epoch": 1.2777777777777777, "grad_norm": 3.963963508605957, "learning_rate": 8.724637681159421e-07, "logits/chosen": -2.60048508644104, "logits/rejected": -2.604565382003784, "logps/chosen": -41.84239959716797, "logps/rejected": -113.52980041503906, "loss": 0.5035, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2905207872390747, "rewards/margins": 5.498451232910156, "rewards/rejected": -4.207930088043213, "step": 529 }, { "epoch": 1.2801932367149758, "grad_norm": 3.8574464321136475, "learning_rate": 8.722222222222222e-07, "logits/chosen": -2.5261001586914062, "logits/rejected": -2.5275747776031494, "logps/chosen": -34.36771774291992, "logps/rejected": -106.24513244628906, "loss": 0.5116, "rewards/accuracies": 0.96875, "rewards/chosen": 1.802211046218872, "rewards/margins": 5.138618469238281, "rewards/rejected": -3.3364076614379883, "step": 530 }, { "epoch": 1.2826086956521738, "grad_norm": 5.583230972290039, "learning_rate": 8.719806763285023e-07, "logits/chosen": -2.5826733112335205, "logits/rejected": -2.573218822479248, "logps/chosen": -47.0417366027832, "logps/rejected": -114.32929992675781, "loss": 0.5724, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8309048414230347, "rewards/margins": 4.680610179901123, "rewards/rejected": -3.849705696105957, "step": 531 }, { "epoch": 1.285024154589372, "grad_norm": 5.074437618255615, "learning_rate": 8.717391304347826e-07, "logits/chosen": -2.575820207595825, "logits/rejected": -2.5320608615875244, "logps/chosen": -40.33082580566406, "logps/rejected": -117.71406555175781, "loss": 0.4725, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4199293851852417, "rewards/margins": 5.74783182144165, "rewards/rejected": -4.327902793884277, "step": 532 }, { "epoch": 1.28743961352657, "grad_norm": 3.343883514404297, "learning_rate": 8.714975845410628e-07, "logits/chosen": -2.6434147357940674, "logits/rejected": -2.628661632537842, "logps/chosen": -48.00621032714844, "logps/rejected": -109.32881164550781, "loss": 0.5478, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9166767001152039, "rewards/margins": 4.912251949310303, "rewards/rejected": -3.995575428009033, "step": 533 }, { "epoch": 1.289855072463768, "grad_norm": 3.5803322792053223, "learning_rate": 8.71256038647343e-07, "logits/chosen": -2.5506997108459473, "logits/rejected": -2.536973714828491, "logps/chosen": -41.55390167236328, "logps/rejected": -108.86581420898438, "loss": 0.4657, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5783028602600098, "rewards/margins": 5.179341793060303, "rewards/rejected": -3.6010384559631348, "step": 534 }, { "epoch": 1.2922705314009661, "grad_norm": 3.8942675590515137, "learning_rate": 8.710144927536232e-07, "logits/chosen": -2.572511911392212, "logits/rejected": -2.5585803985595703, "logps/chosen": -37.50930404663086, "logps/rejected": -132.12403869628906, "loss": 0.3766, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8557581901550293, "rewards/margins": 7.09745454788208, "rewards/rejected": -5.241696357727051, "step": 535 }, { "epoch": 1.2946859903381642, "grad_norm": 3.6763713359832764, "learning_rate": 8.707729468599033e-07, "logits/chosen": -2.5772156715393066, "logits/rejected": -2.5458974838256836, "logps/chosen": -45.846378326416016, "logps/rejected": -123.19483184814453, "loss": 0.4728, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1324716806411743, "rewards/margins": 5.4371337890625, "rewards/rejected": -4.304661750793457, "step": 536 }, { "epoch": 1.2971014492753623, "grad_norm": 4.071662425994873, "learning_rate": 8.705314009661835e-07, "logits/chosen": -2.618939161300659, "logits/rejected": -2.6004433631896973, "logps/chosen": -55.98966979980469, "logps/rejected": -120.10052490234375, "loss": 0.6178, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3046099841594696, "rewards/margins": 4.555054664611816, "rewards/rejected": -4.2504448890686035, "step": 537 }, { "epoch": 1.2995169082125604, "grad_norm": 3.788666009902954, "learning_rate": 8.702898550724638e-07, "logits/chosen": -2.700831413269043, "logits/rejected": -2.6250321865081787, "logps/chosen": -46.875099182128906, "logps/rejected": -93.09805297851562, "loss": 0.6211, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3219397068023682, "rewards/margins": 3.7817931175231934, "rewards/rejected": -2.459852933883667, "step": 538 }, { "epoch": 1.3019323671497585, "grad_norm": 4.994095802307129, "learning_rate": 8.700483091787439e-07, "logits/chosen": -2.514387607574463, "logits/rejected": -2.5725598335266113, "logps/chosen": -42.62664031982422, "logps/rejected": -120.79769897460938, "loss": 0.5, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1823296546936035, "rewards/margins": 5.783605575561523, "rewards/rejected": -4.601275444030762, "step": 539 }, { "epoch": 1.3043478260869565, "grad_norm": 3.005969524383545, "learning_rate": 8.698067632850241e-07, "logits/chosen": -2.579376459121704, "logits/rejected": -2.4902145862579346, "logps/chosen": -50.57032012939453, "logps/rejected": -117.80158996582031, "loss": 0.5967, "rewards/accuracies": 0.875, "rewards/chosen": 0.7858688235282898, "rewards/margins": 5.010700225830078, "rewards/rejected": -4.224831581115723, "step": 540 }, { "epoch": 1.3067632850241546, "grad_norm": 4.793842792510986, "learning_rate": 8.695652173913043e-07, "logits/chosen": -2.55857515335083, "logits/rejected": -2.6123886108398438, "logps/chosen": -37.367977142333984, "logps/rejected": -106.9542007446289, "loss": 0.4771, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5766115188598633, "rewards/margins": 5.162326812744141, "rewards/rejected": -3.5857150554656982, "step": 541 }, { "epoch": 1.3091787439613527, "grad_norm": 4.279107570648193, "learning_rate": 8.693236714975844e-07, "logits/chosen": -2.552586793899536, "logits/rejected": -2.5519394874572754, "logps/chosen": -34.693397521972656, "logps/rejected": -115.8341293334961, "loss": 0.4744, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9385361671447754, "rewards/margins": 5.974316596984863, "rewards/rejected": -4.03577995300293, "step": 542 }, { "epoch": 1.3115942028985508, "grad_norm": 3.517469882965088, "learning_rate": 8.690821256038647e-07, "logits/chosen": -2.586395740509033, "logits/rejected": -2.5853004455566406, "logps/chosen": -58.47728729248047, "logps/rejected": -147.62893676757812, "loss": 0.5378, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11097880452871323, "rewards/margins": 6.483273983001709, "rewards/rejected": -6.3722944259643555, "step": 543 }, { "epoch": 1.3140096618357489, "grad_norm": 6.1047539710998535, "learning_rate": 8.688405797101449e-07, "logits/chosen": -2.5849082469940186, "logits/rejected": -2.599142074584961, "logps/chosen": -59.58332443237305, "logps/rejected": -127.36058044433594, "loss": 0.5777, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20097164809703827, "rewards/margins": 4.917682647705078, "rewards/rejected": -5.118653774261475, "step": 544 }, { "epoch": 1.316425120772947, "grad_norm": 6.601560592651367, "learning_rate": 8.685990338164251e-07, "logits/chosen": -2.562117576599121, "logits/rejected": -2.543095350265503, "logps/chosen": -64.36446380615234, "logps/rejected": -128.84616088867188, "loss": 0.628, "rewards/accuracies": 0.875, "rewards/chosen": -0.3809451460838318, "rewards/margins": 4.789636135101318, "rewards/rejected": -5.170580863952637, "step": 545 }, { "epoch": 1.318840579710145, "grad_norm": 4.333390235900879, "learning_rate": 8.683574879227052e-07, "logits/chosen": -2.612405776977539, "logits/rejected": -2.586735486984253, "logps/chosen": -49.68403625488281, "logps/rejected": -117.2911376953125, "loss": 0.6031, "rewards/accuracies": 1.0, "rewards/chosen": 0.6752129197120667, "rewards/margins": 4.800351142883301, "rewards/rejected": -4.125138282775879, "step": 546 }, { "epoch": 1.321256038647343, "grad_norm": 4.533911228179932, "learning_rate": 8.681159420289854e-07, "logits/chosen": -2.6170926094055176, "logits/rejected": -2.5746679306030273, "logps/chosen": -34.94110107421875, "logps/rejected": -104.33856201171875, "loss": 0.449, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8464380502700806, "rewards/margins": 5.221747398376465, "rewards/rejected": -3.3753087520599365, "step": 547 }, { "epoch": 1.3236714975845412, "grad_norm": 2.9672908782958984, "learning_rate": 8.678743961352657e-07, "logits/chosen": -2.6269211769104004, "logits/rejected": -2.6035306453704834, "logps/chosen": -37.48003387451172, "logps/rejected": -116.01831817626953, "loss": 0.4941, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6293480396270752, "rewards/margins": 5.6168718338012695, "rewards/rejected": -3.9875242710113525, "step": 548 }, { "epoch": 1.3260869565217392, "grad_norm": 3.857247829437256, "learning_rate": 8.676328502415459e-07, "logits/chosen": -2.597536087036133, "logits/rejected": -2.6009302139282227, "logps/chosen": -44.6639289855957, "logps/rejected": -130.0935821533203, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 1.2360848188400269, "rewards/margins": 6.298506736755371, "rewards/rejected": -5.062421798706055, "step": 549 }, { "epoch": 1.3285024154589373, "grad_norm": 3.3064398765563965, "learning_rate": 8.67391304347826e-07, "logits/chosen": -2.655193567276001, "logits/rejected": -2.5799834728240967, "logps/chosen": -47.16593933105469, "logps/rejected": -115.07009887695312, "loss": 0.5595, "rewards/accuracies": 0.96875, "rewards/chosen": 1.170609712600708, "rewards/margins": 5.296371936798096, "rewards/rejected": -4.125762462615967, "step": 550 }, { "epoch": 1.3309178743961354, "grad_norm": 3.8480889797210693, "learning_rate": 8.671497584541062e-07, "logits/chosen": -2.5846564769744873, "logits/rejected": -2.5949478149414062, "logps/chosen": -30.574989318847656, "logps/rejected": -107.56541442871094, "loss": 0.3654, "rewards/accuracies": 1.0, "rewards/chosen": 2.0235774517059326, "rewards/margins": 6.036774635314941, "rewards/rejected": -4.013197422027588, "step": 551 }, { "epoch": 1.3333333333333333, "grad_norm": 3.0718460083007812, "learning_rate": 8.669082125603865e-07, "logits/chosen": -2.5993900299072266, "logits/rejected": -2.58600115776062, "logps/chosen": -50.773128509521484, "logps/rejected": -114.0188217163086, "loss": 0.5358, "rewards/accuracies": 0.875, "rewards/chosen": 0.7104572057723999, "rewards/margins": 4.784107208251953, "rewards/rejected": -4.073649883270264, "step": 552 }, { "epoch": 1.3357487922705313, "grad_norm": 4.25900411605835, "learning_rate": 8.666666666666667e-07, "logits/chosen": -2.5379602909088135, "logits/rejected": -2.574702739715576, "logps/chosen": -46.90471649169922, "logps/rejected": -114.39452362060547, "loss": 0.5216, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9336822032928467, "rewards/margins": 4.775083541870117, "rewards/rejected": -3.841400623321533, "step": 553 }, { "epoch": 1.3381642512077294, "grad_norm": 4.8328776359558105, "learning_rate": 8.664251207729468e-07, "logits/chosen": -2.5966053009033203, "logits/rejected": -2.5637576580047607, "logps/chosen": -44.19755554199219, "logps/rejected": -104.1707992553711, "loss": 0.6065, "rewards/accuracies": 0.9375, "rewards/chosen": 1.069654941558838, "rewards/margins": 4.603400230407715, "rewards/rejected": -3.533745527267456, "step": 554 }, { "epoch": 1.3405797101449275, "grad_norm": 4.182687282562256, "learning_rate": 8.66183574879227e-07, "logits/chosen": -2.609541893005371, "logits/rejected": -2.594705820083618, "logps/chosen": -41.33436584472656, "logps/rejected": -116.90367126464844, "loss": 0.4283, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7443307638168335, "rewards/margins": 5.931890487670898, "rewards/rejected": -4.187559127807617, "step": 555 }, { "epoch": 1.3429951690821256, "grad_norm": 4.3405327796936035, "learning_rate": 8.659420289855072e-07, "logits/chosen": -2.489593029022217, "logits/rejected": -2.56553316116333, "logps/chosen": -38.070892333984375, "logps/rejected": -117.236572265625, "loss": 0.4222, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5682603120803833, "rewards/margins": 5.6924519538879395, "rewards/rejected": -4.124191761016846, "step": 556 }, { "epoch": 1.3454106280193237, "grad_norm": 4.309421062469482, "learning_rate": 8.657004830917874e-07, "logits/chosen": -2.6706674098968506, "logits/rejected": -2.655468225479126, "logps/chosen": -39.78803253173828, "logps/rejected": -107.26142883300781, "loss": 0.5484, "rewards/accuracies": 0.90625, "rewards/chosen": 1.565994381904602, "rewards/margins": 4.865843772888184, "rewards/rejected": -3.299849033355713, "step": 557 }, { "epoch": 1.3478260869565217, "grad_norm": 3.7831733226776123, "learning_rate": 8.654589371980676e-07, "logits/chosen": -2.6094610691070557, "logits/rejected": -2.633852005004883, "logps/chosen": -45.308143615722656, "logps/rejected": -103.12156677246094, "loss": 0.6132, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3048095703125, "rewards/margins": 4.6758809089660645, "rewards/rejected": -3.3710713386535645, "step": 558 }, { "epoch": 1.3502415458937198, "grad_norm": 3.545933246612549, "learning_rate": 8.652173913043478e-07, "logits/chosen": -2.5782551765441895, "logits/rejected": -2.557966709136963, "logps/chosen": -34.888206481933594, "logps/rejected": -97.02593994140625, "loss": 0.5063, "rewards/accuracies": 0.875, "rewards/chosen": 1.8906137943267822, "rewards/margins": 4.818953037261963, "rewards/rejected": -2.9283392429351807, "step": 559 }, { "epoch": 1.3526570048309179, "grad_norm": 3.0413811206817627, "learning_rate": 8.649758454106279e-07, "logits/chosen": -2.593034505844116, "logits/rejected": -2.553887128829956, "logps/chosen": -27.59088134765625, "logps/rejected": -102.88499450683594, "loss": 0.5775, "rewards/accuracies": 0.9375, "rewards/chosen": 2.003147840499878, "rewards/margins": 5.308782577514648, "rewards/rejected": -3.3056342601776123, "step": 560 }, { "epoch": 1.355072463768116, "grad_norm": 4.207345008850098, "learning_rate": 8.647342995169082e-07, "logits/chosen": -2.58579683303833, "logits/rejected": -2.4600815773010254, "logps/chosen": -36.028865814208984, "logps/rejected": -106.75294494628906, "loss": 0.4916, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7617639303207397, "rewards/margins": 5.414809703826904, "rewards/rejected": -3.653045892715454, "step": 561 }, { "epoch": 1.357487922705314, "grad_norm": 6.7763800621032715, "learning_rate": 8.644927536231884e-07, "logits/chosen": -2.636630058288574, "logits/rejected": -2.6209330558776855, "logps/chosen": -44.88800811767578, "logps/rejected": -113.90579223632812, "loss": 0.5418, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9306947588920593, "rewards/margins": 4.822494029998779, "rewards/rejected": -3.8917996883392334, "step": 562 }, { "epoch": 1.3599033816425121, "grad_norm": 3.707807779312134, "learning_rate": 8.642512077294685e-07, "logits/chosen": -2.6216256618499756, "logits/rejected": -2.5005693435668945, "logps/chosen": -48.63080596923828, "logps/rejected": -114.59463500976562, "loss": 0.6572, "rewards/accuracies": 0.90625, "rewards/chosen": 0.684920608997345, "rewards/margins": 4.578422546386719, "rewards/rejected": -3.8935022354125977, "step": 563 }, { "epoch": 1.3623188405797102, "grad_norm": 3.7047572135925293, "learning_rate": 8.640096618357488e-07, "logits/chosen": -2.5170297622680664, "logits/rejected": -2.4486098289489746, "logps/chosen": -51.746978759765625, "logps/rejected": -118.36555480957031, "loss": 0.5028, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7669703960418701, "rewards/margins": 5.348423004150391, "rewards/rejected": -4.581452369689941, "step": 564 }, { "epoch": 1.3647342995169083, "grad_norm": 5.056445598602295, "learning_rate": 8.637681159420289e-07, "logits/chosen": -2.6099119186401367, "logits/rejected": -2.6112401485443115, "logps/chosen": -52.301734924316406, "logps/rejected": -119.91637420654297, "loss": 0.6029, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3383689522743225, "rewards/margins": 4.938800811767578, "rewards/rejected": -4.6004319190979, "step": 565 }, { "epoch": 1.3671497584541064, "grad_norm": 3.678584337234497, "learning_rate": 8.63526570048309e-07, "logits/chosen": -2.6694023609161377, "logits/rejected": -2.6744983196258545, "logps/chosen": -53.44922637939453, "logps/rejected": -121.66481018066406, "loss": 0.5356, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6239067316055298, "rewards/margins": 5.398603439331055, "rewards/rejected": -4.7746968269348145, "step": 566 }, { "epoch": 1.3695652173913042, "grad_norm": 4.505202293395996, "learning_rate": 8.632850241545894e-07, "logits/chosen": -2.5953402519226074, "logits/rejected": -2.5407190322875977, "logps/chosen": -33.95108413696289, "logps/rejected": -126.57755279541016, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 2.1833674907684326, "rewards/margins": 6.877795219421387, "rewards/rejected": -4.694427967071533, "step": 567 }, { "epoch": 1.3719806763285023, "grad_norm": 4.382648468017578, "learning_rate": 8.630434782608695e-07, "logits/chosen": -2.63108229637146, "logits/rejected": -2.6382596492767334, "logps/chosen": -38.353065490722656, "logps/rejected": -121.75837707519531, "loss": 0.4738, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8324408531188965, "rewards/margins": 6.24570369720459, "rewards/rejected": -4.413263320922852, "step": 568 }, { "epoch": 1.3743961352657004, "grad_norm": 3.854320764541626, "learning_rate": 8.628019323671498e-07, "logits/chosen": -2.636542320251465, "logits/rejected": -2.6081786155700684, "logps/chosen": -46.832035064697266, "logps/rejected": -123.15992736816406, "loss": 0.5397, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0743257999420166, "rewards/margins": 5.70175838470459, "rewards/rejected": -4.627432823181152, "step": 569 }, { "epoch": 1.3768115942028984, "grad_norm": 3.8875982761383057, "learning_rate": 8.625603864734299e-07, "logits/chosen": -2.578627824783325, "logits/rejected": -2.5827527046203613, "logps/chosen": -39.551692962646484, "logps/rejected": -106.89331817626953, "loss": 0.5594, "rewards/accuracies": 1.0, "rewards/chosen": 1.4811124801635742, "rewards/margins": 5.243432521820068, "rewards/rejected": -3.7623202800750732, "step": 570 }, { "epoch": 1.3792270531400965, "grad_norm": 5.041428089141846, "learning_rate": 8.623188405797101e-07, "logits/chosen": -2.642104387283325, "logits/rejected": -2.639904022216797, "logps/chosen": -47.58982849121094, "logps/rejected": -130.91139221191406, "loss": 0.4854, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7308892011642456, "rewards/margins": 6.134689807891846, "rewards/rejected": -5.403801441192627, "step": 571 }, { "epoch": 1.3816425120772946, "grad_norm": 2.7337608337402344, "learning_rate": 8.620772946859904e-07, "logits/chosen": -2.5686099529266357, "logits/rejected": -2.488389253616333, "logps/chosen": -49.68255615234375, "logps/rejected": -129.46987915039062, "loss": 0.5309, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8304027318954468, "rewards/margins": 5.689323425292969, "rewards/rejected": -4.858921051025391, "step": 572 }, { "epoch": 1.3840579710144927, "grad_norm": 5.552570343017578, "learning_rate": 8.618357487922705e-07, "logits/chosen": -2.600843906402588, "logits/rejected": -2.5870959758758545, "logps/chosen": -36.55866622924805, "logps/rejected": -112.57905578613281, "loss": 0.5297, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3969305753707886, "rewards/margins": 5.632270812988281, "rewards/rejected": -4.235339164733887, "step": 573 }, { "epoch": 1.3864734299516908, "grad_norm": 4.142045021057129, "learning_rate": 8.615942028985506e-07, "logits/chosen": -2.6629562377929688, "logits/rejected": -2.69461727142334, "logps/chosen": -59.404483795166016, "logps/rejected": -132.8556365966797, "loss": 0.5749, "rewards/accuracies": 1.0, "rewards/chosen": 0.13891386985778809, "rewards/margins": 5.518850326538086, "rewards/rejected": -5.379937171936035, "step": 574 }, { "epoch": 1.3888888888888888, "grad_norm": 3.29067063331604, "learning_rate": 8.613526570048309e-07, "logits/chosen": -2.634812593460083, "logits/rejected": -2.657705545425415, "logps/chosen": -50.49504089355469, "logps/rejected": -138.5894012451172, "loss": 0.4408, "rewards/accuracies": 1.0, "rewards/chosen": 0.6945456266403198, "rewards/margins": 6.868215560913086, "rewards/rejected": -6.173669815063477, "step": 575 }, { "epoch": 1.391304347826087, "grad_norm": 3.7344911098480225, "learning_rate": 8.611111111111111e-07, "logits/chosen": -2.6364903450012207, "logits/rejected": -2.598935842514038, "logps/chosen": -36.995540618896484, "logps/rejected": -102.5750732421875, "loss": 0.5122, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8549349308013916, "rewards/margins": 4.8873066902160645, "rewards/rejected": -3.0323715209960938, "step": 576 }, { "epoch": 1.393719806763285, "grad_norm": 4.6780195236206055, "learning_rate": 8.608695652173913e-07, "logits/chosen": -2.5836055278778076, "logits/rejected": -2.5594608783721924, "logps/chosen": -57.969627380371094, "logps/rejected": -143.47698974609375, "loss": 0.5024, "rewards/accuracies": 0.96875, "rewards/chosen": 0.15652191638946533, "rewards/margins": 6.373619079589844, "rewards/rejected": -6.217097759246826, "step": 577 }, { "epoch": 1.396135265700483, "grad_norm": 3.8087947368621826, "learning_rate": 8.606280193236715e-07, "logits/chosen": -2.5943853855133057, "logits/rejected": -2.545358180999756, "logps/chosen": -50.3066520690918, "logps/rejected": -115.26985931396484, "loss": 0.5535, "rewards/accuracies": 1.0, "rewards/chosen": 0.9503607749938965, "rewards/margins": 5.277218818664551, "rewards/rejected": -4.326857566833496, "step": 578 }, { "epoch": 1.3985507246376812, "grad_norm": 3.7140557765960693, "learning_rate": 8.603864734299516e-07, "logits/chosen": -2.607656955718994, "logits/rejected": -2.5605010986328125, "logps/chosen": -33.575401306152344, "logps/rejected": -96.98208618164062, "loss": 0.4929, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9850105047225952, "rewards/margins": 4.763213157653809, "rewards/rejected": -2.778202772140503, "step": 579 }, { "epoch": 1.4009661835748792, "grad_norm": 4.264507293701172, "learning_rate": 8.601449275362318e-07, "logits/chosen": -2.6406197547912598, "logits/rejected": -2.5989954471588135, "logps/chosen": -47.43098831176758, "logps/rejected": -106.26336669921875, "loss": 0.6334, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8020913600921631, "rewards/margins": 4.351165771484375, "rewards/rejected": -3.5490739345550537, "step": 580 }, { "epoch": 1.4033816425120773, "grad_norm": 3.267721176147461, "learning_rate": 8.599033816425121e-07, "logits/chosen": -2.6128499507904053, "logits/rejected": -2.5369975566864014, "logps/chosen": -35.8553581237793, "logps/rejected": -121.19316101074219, "loss": 0.4116, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6950135231018066, "rewards/margins": 6.361786365509033, "rewards/rejected": -4.666772842407227, "step": 581 }, { "epoch": 1.4033816425120773, "eval_logits/chosen": -2.638453722000122, "eval_logits/rejected": -2.6480770111083984, "eval_logps/chosen": -40.79484939575195, "eval_logps/rejected": -125.57169342041016, "eval_loss": 0.4337269365787506, "eval_rewards/accuracies": 0.9540441036224365, "eval_rewards/chosen": 1.3866066932678223, "eval_rewards/margins": 6.310375690460205, "eval_rewards/rejected": -4.923768997192383, "eval_runtime": 1000.997, "eval_samples_per_second": 0.543, "eval_steps_per_second": 0.272, "step": 581 }, { "epoch": 1.4057971014492754, "grad_norm": 3.864335060119629, "learning_rate": 8.596618357487922e-07, "logits/chosen": -2.6225717067718506, "logits/rejected": -2.637845516204834, "logps/chosen": -43.97489547729492, "logps/rejected": -128.18112182617188, "loss": 0.4915, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9699992537498474, "rewards/margins": 6.0109100341796875, "rewards/rejected": -5.040910720825195, "step": 582 }, { "epoch": 1.4082125603864735, "grad_norm": 3.139582633972168, "learning_rate": 8.594202898550724e-07, "logits/chosen": -2.583704710006714, "logits/rejected": -2.590817928314209, "logps/chosen": -41.82133102416992, "logps/rejected": -122.95911407470703, "loss": 0.5338, "rewards/accuracies": 0.9375, "rewards/chosen": 1.360069990158081, "rewards/margins": 6.000023365020752, "rewards/rejected": -4.63995361328125, "step": 583 }, { "epoch": 1.4106280193236715, "grad_norm": 3.8150179386138916, "learning_rate": 8.591787439613526e-07, "logits/chosen": -2.545804500579834, "logits/rejected": -2.552536964416504, "logps/chosen": -48.681678771972656, "logps/rejected": -119.10025024414062, "loss": 0.556, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7572271823883057, "rewards/margins": 5.328928470611572, "rewards/rejected": -4.571701526641846, "step": 584 }, { "epoch": 1.4130434782608696, "grad_norm": 3.891421318054199, "learning_rate": 8.589371980676327e-07, "logits/chosen": -2.58689546585083, "logits/rejected": -2.641542911529541, "logps/chosen": -40.91241455078125, "logps/rejected": -125.552978515625, "loss": 0.4101, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5679221153259277, "rewards/margins": 6.22145938873291, "rewards/rejected": -4.653537273406982, "step": 585 }, { "epoch": 1.4154589371980677, "grad_norm": 3.4844353199005127, "learning_rate": 8.586956521739131e-07, "logits/chosen": -2.6356709003448486, "logits/rejected": -2.6289546489715576, "logps/chosen": -47.166236877441406, "logps/rejected": -113.757080078125, "loss": 0.5858, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9544004797935486, "rewards/margins": 5.064639568328857, "rewards/rejected": -4.110239028930664, "step": 586 }, { "epoch": 1.4178743961352658, "grad_norm": 4.043455123901367, "learning_rate": 8.584541062801932e-07, "logits/chosen": -2.6295006275177, "logits/rejected": -2.6011109352111816, "logps/chosen": -51.81975555419922, "logps/rejected": -115.92530822753906, "loss": 0.5977, "rewards/accuracies": 1.0, "rewards/chosen": 0.21205668151378632, "rewards/margins": 4.6397294998168945, "rewards/rejected": -4.427672863006592, "step": 587 }, { "epoch": 1.4202898550724639, "grad_norm": 3.7438857555389404, "learning_rate": 8.582125603864734e-07, "logits/chosen": -2.537080764770508, "logits/rejected": -2.554927110671997, "logps/chosen": -41.324134826660156, "logps/rejected": -123.67254638671875, "loss": 0.4512, "rewards/accuracies": 0.96875, "rewards/chosen": 1.407272219657898, "rewards/margins": 5.890748977661133, "rewards/rejected": -4.483476161956787, "step": 588 }, { "epoch": 1.422705314009662, "grad_norm": 3.627882480621338, "learning_rate": 8.579710144927536e-07, "logits/chosen": -2.698406219482422, "logits/rejected": -2.661328077316284, "logps/chosen": -36.58280563354492, "logps/rejected": -115.21754455566406, "loss": 0.4644, "rewards/accuracies": 0.9375, "rewards/chosen": 2.033268451690674, "rewards/margins": 6.054413795471191, "rewards/rejected": -4.021145343780518, "step": 589 }, { "epoch": 1.42512077294686, "grad_norm": 5.071658134460449, "learning_rate": 8.577294685990337e-07, "logits/chosen": -2.569148063659668, "logits/rejected": -2.640285015106201, "logps/chosen": -40.47565460205078, "logps/rejected": -114.40635681152344, "loss": 0.5018, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6640931367874146, "rewards/margins": 5.504258155822754, "rewards/rejected": -3.8401646614074707, "step": 590 }, { "epoch": 1.427536231884058, "grad_norm": 3.8516018390655518, "learning_rate": 8.57487922705314e-07, "logits/chosen": -2.5488622188568115, "logits/rejected": -2.5955557823181152, "logps/chosen": -48.44557189941406, "logps/rejected": -111.94387817382812, "loss": 0.5635, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7365989089012146, "rewards/margins": 5.080421447753906, "rewards/rejected": -4.343823432922363, "step": 591 }, { "epoch": 1.4299516908212562, "grad_norm": 3.1002068519592285, "learning_rate": 8.572463768115942e-07, "logits/chosen": -2.5892467498779297, "logits/rejected": -2.6092708110809326, "logps/chosen": -51.54841232299805, "logps/rejected": -137.8788299560547, "loss": 0.4603, "rewards/accuracies": 1.0, "rewards/chosen": 0.6992886662483215, "rewards/margins": 6.440556049346924, "rewards/rejected": -5.741267204284668, "step": 592 }, { "epoch": 1.432367149758454, "grad_norm": 4.329202651977539, "learning_rate": 8.570048309178743e-07, "logits/chosen": -2.6728107929229736, "logits/rejected": -2.5788466930389404, "logps/chosen": -44.29630661010742, "logps/rejected": -111.39143371582031, "loss": 0.5221, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0439730882644653, "rewards/margins": 5.066498756408691, "rewards/rejected": -4.022525787353516, "step": 593 }, { "epoch": 1.434782608695652, "grad_norm": 4.328762531280518, "learning_rate": 8.567632850241545e-07, "logits/chosen": -2.6138625144958496, "logits/rejected": -2.6235618591308594, "logps/chosen": -40.98774719238281, "logps/rejected": -111.70673370361328, "loss": 0.4482, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7129746675491333, "rewards/margins": 5.773551940917969, "rewards/rejected": -4.060576915740967, "step": 594 }, { "epoch": 1.4371980676328502, "grad_norm": 3.041201114654541, "learning_rate": 8.565217391304348e-07, "logits/chosen": -2.647418260574341, "logits/rejected": -2.5586068630218506, "logps/chosen": -51.257930755615234, "logps/rejected": -123.8143310546875, "loss": 0.5832, "rewards/accuracies": 1.0, "rewards/chosen": 0.34927526116371155, "rewards/margins": 5.271534442901611, "rewards/rejected": -4.9222588539123535, "step": 595 }, { "epoch": 1.4396135265700483, "grad_norm": 3.627396821975708, "learning_rate": 8.56280193236715e-07, "logits/chosen": -2.622251272201538, "logits/rejected": -2.578350782394409, "logps/chosen": -52.12342834472656, "logps/rejected": -132.21835327148438, "loss": 0.5642, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25238656997680664, "rewards/margins": 5.810054779052734, "rewards/rejected": -5.557668685913086, "step": 596 }, { "epoch": 1.4420289855072463, "grad_norm": 4.7508649826049805, "learning_rate": 8.560386473429951e-07, "logits/chosen": -2.5702457427978516, "logits/rejected": -2.562297821044922, "logps/chosen": -40.90800476074219, "logps/rejected": -117.75696563720703, "loss": 0.4172, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6099331378936768, "rewards/margins": 6.100215435028076, "rewards/rejected": -4.4902825355529785, "step": 597 }, { "epoch": 1.4444444444444444, "grad_norm": 3.491187810897827, "learning_rate": 8.557971014492753e-07, "logits/chosen": -2.5877370834350586, "logits/rejected": -2.564937114715576, "logps/chosen": -36.293060302734375, "logps/rejected": -119.98421478271484, "loss": 0.4814, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9989135265350342, "rewards/margins": 6.292038917541504, "rewards/rejected": -4.293125152587891, "step": 598 }, { "epoch": 1.4468599033816425, "grad_norm": 4.8071489334106445, "learning_rate": 8.555555555555555e-07, "logits/chosen": -2.658907890319824, "logits/rejected": -2.593508005142212, "logps/chosen": -43.486900329589844, "logps/rejected": -122.25668334960938, "loss": 0.4887, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1543458700180054, "rewards/margins": 5.624255657196045, "rewards/rejected": -4.469909191131592, "step": 599 }, { "epoch": 1.4492753623188406, "grad_norm": 3.234780788421631, "learning_rate": 8.553140096618357e-07, "logits/chosen": -2.699831962585449, "logits/rejected": -2.618434429168701, "logps/chosen": -48.697383880615234, "logps/rejected": -120.73259735107422, "loss": 0.5578, "rewards/accuracies": 0.875, "rewards/chosen": 0.8243659138679504, "rewards/margins": 5.28347635269165, "rewards/rejected": -4.459110260009766, "step": 600 }, { "epoch": 1.4516908212560387, "grad_norm": 3.2191619873046875, "learning_rate": 8.550724637681159e-07, "logits/chosen": -2.6017565727233887, "logits/rejected": -2.5758566856384277, "logps/chosen": -58.61246871948242, "logps/rejected": -133.73898315429688, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": 0.11335326731204987, "rewards/margins": 5.791937828063965, "rewards/rejected": -5.678584575653076, "step": 601 }, { "epoch": 1.4541062801932367, "grad_norm": 3.2270023822784424, "learning_rate": 8.548309178743961e-07, "logits/chosen": -2.57936429977417, "logits/rejected": -2.628835916519165, "logps/chosen": -48.93642807006836, "logps/rejected": -117.50521850585938, "loss": 0.5408, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2107000350952148, "rewards/margins": 5.525285720825195, "rewards/rejected": -4.3145856857299805, "step": 602 }, { "epoch": 1.4565217391304348, "grad_norm": 3.0343542098999023, "learning_rate": 8.545893719806762e-07, "logits/chosen": -2.5454916954040527, "logits/rejected": -2.5884504318237305, "logps/chosen": -36.080814361572266, "logps/rejected": -120.78044128417969, "loss": 0.4471, "rewards/accuracies": 0.96875, "rewards/chosen": 2.010427951812744, "rewards/margins": 6.4535017013549805, "rewards/rejected": -4.443073272705078, "step": 603 }, { "epoch": 1.458937198067633, "grad_norm": 4.1161088943481445, "learning_rate": 8.543478260869565e-07, "logits/chosen": -2.5434367656707764, "logits/rejected": -2.5892961025238037, "logps/chosen": -36.990989685058594, "logps/rejected": -111.61705780029297, "loss": 0.4991, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6276075839996338, "rewards/margins": 5.6406683921813965, "rewards/rejected": -4.013061046600342, "step": 604 }, { "epoch": 1.461352657004831, "grad_norm": 3.4950144290924072, "learning_rate": 8.541062801932367e-07, "logits/chosen": -2.6208624839782715, "logits/rejected": -2.597256898880005, "logps/chosen": -35.002906799316406, "logps/rejected": -129.97552490234375, "loss": 0.369, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6144145727157593, "rewards/margins": 6.595409870147705, "rewards/rejected": -4.980995178222656, "step": 605 }, { "epoch": 1.463768115942029, "grad_norm": 3.467128038406372, "learning_rate": 8.538647342995168e-07, "logits/chosen": -2.604996681213379, "logits/rejected": -2.6396079063415527, "logps/chosen": -43.42456817626953, "logps/rejected": -126.73687744140625, "loss": 0.4321, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7321090698242188, "rewards/margins": 6.276237487792969, "rewards/rejected": -4.544127464294434, "step": 606 }, { "epoch": 1.4661835748792271, "grad_norm": 3.840819835662842, "learning_rate": 8.536231884057971e-07, "logits/chosen": -2.6372509002685547, "logits/rejected": -2.6056625843048096, "logps/chosen": -45.280391693115234, "logps/rejected": -121.13685607910156, "loss": 0.5238, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0810704231262207, "rewards/margins": 5.575228214263916, "rewards/rejected": -4.4941582679748535, "step": 607 }, { "epoch": 1.4685990338164252, "grad_norm": 3.6587650775909424, "learning_rate": 8.533816425120772e-07, "logits/chosen": -2.5538032054901123, "logits/rejected": -2.5056865215301514, "logps/chosen": -40.313175201416016, "logps/rejected": -110.57817077636719, "loss": 0.5357, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5104944705963135, "rewards/margins": 5.308216094970703, "rewards/rejected": -3.7977218627929688, "step": 608 }, { "epoch": 1.471014492753623, "grad_norm": 4.29678201675415, "learning_rate": 8.531400966183574e-07, "logits/chosen": -2.633415937423706, "logits/rejected": -2.642228603363037, "logps/chosen": -45.60035705566406, "logps/rejected": -116.40776824951172, "loss": 0.4701, "rewards/accuracies": 0.9375, "rewards/chosen": 1.322991132736206, "rewards/margins": 5.658492565155029, "rewards/rejected": -4.335501194000244, "step": 609 }, { "epoch": 1.4734299516908211, "grad_norm": 4.439515113830566, "learning_rate": 8.528985507246377e-07, "logits/chosen": -2.6509132385253906, "logits/rejected": -2.6184353828430176, "logps/chosen": -49.953460693359375, "logps/rejected": -132.5948028564453, "loss": 0.5384, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6747921109199524, "rewards/margins": 5.951862335205078, "rewards/rejected": -5.27707052230835, "step": 610 }, { "epoch": 1.4758454106280192, "grad_norm": 2.520308017730713, "learning_rate": 8.526570048309178e-07, "logits/chosen": -2.6431543827056885, "logits/rejected": -2.540743827819824, "logps/chosen": -44.07157897949219, "logps/rejected": -135.5053253173828, "loss": 0.3896, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3615102767944336, "rewards/margins": 6.730288982391357, "rewards/rejected": -5.368778705596924, "step": 611 }, { "epoch": 1.4782608695652173, "grad_norm": 4.362029075622559, "learning_rate": 8.524154589371981e-07, "logits/chosen": -2.53299880027771, "logits/rejected": -2.575387954711914, "logps/chosen": -52.70979690551758, "logps/rejected": -134.573974609375, "loss": 0.4983, "rewards/accuracies": 0.90625, "rewards/chosen": 0.27100032567977905, "rewards/margins": 5.541836738586426, "rewards/rejected": -5.27083683013916, "step": 612 }, { "epoch": 1.4806763285024154, "grad_norm": 4.939234256744385, "learning_rate": 8.521739130434782e-07, "logits/chosen": -2.582836627960205, "logits/rejected": -2.605060577392578, "logps/chosen": -38.97877883911133, "logps/rejected": -102.04784393310547, "loss": 0.4707, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6562341451644897, "rewards/margins": 5.108818531036377, "rewards/rejected": -3.4525845050811768, "step": 613 }, { "epoch": 1.4830917874396135, "grad_norm": 4.174498081207275, "learning_rate": 8.519323671497584e-07, "logits/chosen": -2.571233034133911, "logits/rejected": -2.5681989192962646, "logps/chosen": -39.589046478271484, "logps/rejected": -124.58238983154297, "loss": 0.4797, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3990687131881714, "rewards/margins": 6.3893656730651855, "rewards/rejected": -4.990297317504883, "step": 614 }, { "epoch": 1.4855072463768115, "grad_norm": 2.935373306274414, "learning_rate": 8.516908212560387e-07, "logits/chosen": -2.561680316925049, "logits/rejected": -2.5342752933502197, "logps/chosen": -42.14595413208008, "logps/rejected": -108.6977310180664, "loss": 0.5308, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3119006156921387, "rewards/margins": 5.113885879516602, "rewards/rejected": -3.801985740661621, "step": 615 }, { "epoch": 1.4879227053140096, "grad_norm": 4.114550590515137, "learning_rate": 8.514492753623188e-07, "logits/chosen": -2.6317028999328613, "logits/rejected": -2.609656572341919, "logps/chosen": -57.57586669921875, "logps/rejected": -136.0525360107422, "loss": 0.5986, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06651508808135986, "rewards/margins": 5.619213104248047, "rewards/rejected": -5.685727596282959, "step": 616 }, { "epoch": 1.4903381642512077, "grad_norm": 3.872185230255127, "learning_rate": 8.512077294685989e-07, "logits/chosen": -2.5837440490722656, "logits/rejected": -2.5702409744262695, "logps/chosen": -43.807777404785156, "logps/rejected": -119.82904052734375, "loss": 0.4554, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0656578540802002, "rewards/margins": 5.675937175750732, "rewards/rejected": -4.610279560089111, "step": 617 }, { "epoch": 1.4927536231884058, "grad_norm": 4.556628704071045, "learning_rate": 8.509661835748792e-07, "logits/chosen": -2.577723503112793, "logits/rejected": -2.601050615310669, "logps/chosen": -46.79001998901367, "logps/rejected": -114.86477661132812, "loss": 0.5508, "rewards/accuracies": 0.9375, "rewards/chosen": 1.289038062095642, "rewards/margins": 5.2539825439453125, "rewards/rejected": -3.964944362640381, "step": 618 }, { "epoch": 1.4951690821256038, "grad_norm": 4.016005516052246, "learning_rate": 8.507246376811594e-07, "logits/chosen": -2.623955249786377, "logits/rejected": -2.6315958499908447, "logps/chosen": -42.7066764831543, "logps/rejected": -115.05704498291016, "loss": 0.4705, "rewards/accuracies": 0.9375, "rewards/chosen": 1.574674129486084, "rewards/margins": 5.548727989196777, "rewards/rejected": -3.9740536212921143, "step": 619 }, { "epoch": 1.497584541062802, "grad_norm": 3.857245683670044, "learning_rate": 8.504830917874396e-07, "logits/chosen": -2.5174689292907715, "logits/rejected": -2.531477689743042, "logps/chosen": -35.95781326293945, "logps/rejected": -119.94706726074219, "loss": 0.3874, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6328320503234863, "rewards/margins": 6.3085174560546875, "rewards/rejected": -4.675684452056885, "step": 620 }, { "epoch": 1.5, "grad_norm": 3.9104063510894775, "learning_rate": 8.502415458937198e-07, "logits/chosen": -2.6698384284973145, "logits/rejected": -2.6380653381347656, "logps/chosen": -46.87696075439453, "logps/rejected": -120.45149993896484, "loss": 0.5203, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0418614149093628, "rewards/margins": 5.088179588317871, "rewards/rejected": -4.046318531036377, "step": 621 }, { "epoch": 1.502415458937198, "grad_norm": 3.3232853412628174, "learning_rate": 8.499999999999999e-07, "logits/chosen": -2.6418869495391846, "logits/rejected": -2.595200777053833, "logps/chosen": -28.305431365966797, "logps/rejected": -118.07929992675781, "loss": 0.4167, "rewards/accuracies": 0.9375, "rewards/chosen": 2.4013264179229736, "rewards/margins": 6.584329605102539, "rewards/rejected": -4.1830034255981445, "step": 622 }, { "epoch": 1.5048309178743962, "grad_norm": 3.0997657775878906, "learning_rate": 8.497584541062801e-07, "logits/chosen": -2.544351100921631, "logits/rejected": -2.5703749656677246, "logps/chosen": -44.995086669921875, "logps/rejected": -127.17832946777344, "loss": 0.4719, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9507427215576172, "rewards/margins": 5.893670082092285, "rewards/rejected": -4.94292688369751, "step": 623 }, { "epoch": 1.5072463768115942, "grad_norm": 2.6455860137939453, "learning_rate": 8.495169082125604e-07, "logits/chosen": -2.6033763885498047, "logits/rejected": -2.549755334854126, "logps/chosen": -47.47113037109375, "logps/rejected": -114.0958251953125, "loss": 0.5616, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9536014795303345, "rewards/margins": 4.973103046417236, "rewards/rejected": -4.019501686096191, "step": 624 }, { "epoch": 1.5096618357487923, "grad_norm": 3.7555558681488037, "learning_rate": 8.492753623188405e-07, "logits/chosen": -2.5715389251708984, "logits/rejected": -2.564971923828125, "logps/chosen": -52.84260940551758, "logps/rejected": -135.5547332763672, "loss": 0.4631, "rewards/accuracies": 0.90625, "rewards/chosen": 0.584739089012146, "rewards/margins": 6.131957530975342, "rewards/rejected": -5.547217845916748, "step": 625 }, { "epoch": 1.5120772946859904, "grad_norm": 4.76609992980957, "learning_rate": 8.490338164251208e-07, "logits/chosen": -2.611638307571411, "logits/rejected": -2.542053461074829, "logps/chosen": -38.068756103515625, "logps/rejected": -101.26969909667969, "loss": 0.4843, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9996509552001953, "rewards/margins": 4.869932174682617, "rewards/rejected": -2.870281219482422, "step": 626 }, { "epoch": 1.5144927536231885, "grad_norm": 4.587608814239502, "learning_rate": 8.487922705314009e-07, "logits/chosen": -2.6548328399658203, "logits/rejected": -2.5595526695251465, "logps/chosen": -37.109378814697266, "logps/rejected": -118.80729675292969, "loss": 0.4655, "rewards/accuracies": 0.875, "rewards/chosen": 1.4104087352752686, "rewards/margins": 5.779996395111084, "rewards/rejected": -4.3695878982543945, "step": 627 }, { "epoch": 1.5169082125603865, "grad_norm": 5.218349456787109, "learning_rate": 8.485507246376811e-07, "logits/chosen": -2.529320478439331, "logits/rejected": -2.4933722019195557, "logps/chosen": -34.569034576416016, "logps/rejected": -129.6604766845703, "loss": 0.438, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9730628728866577, "rewards/margins": 6.9716010093688965, "rewards/rejected": -4.998538494110107, "step": 628 }, { "epoch": 1.5193236714975846, "grad_norm": 4.184903621673584, "learning_rate": 8.483091787439614e-07, "logits/chosen": -2.6106464862823486, "logits/rejected": -2.623108386993408, "logps/chosen": -32.1125373840332, "logps/rejected": -95.70994567871094, "loss": 0.5234, "rewards/accuracies": 0.96875, "rewards/chosen": 2.116955041885376, "rewards/margins": 4.8610076904296875, "rewards/rejected": -2.7440521717071533, "step": 629 }, { "epoch": 1.5217391304347827, "grad_norm": 5.591579437255859, "learning_rate": 8.480676328502415e-07, "logits/chosen": -2.5445940494537354, "logits/rejected": -2.611309051513672, "logps/chosen": -39.686912536621094, "logps/rejected": -111.64945983886719, "loss": 0.4619, "rewards/accuracies": 0.96875, "rewards/chosen": 1.908467173576355, "rewards/margins": 5.8793487548828125, "rewards/rejected": -3.9708809852600098, "step": 630 }, { "epoch": 1.5241545893719808, "grad_norm": 4.691028118133545, "learning_rate": 8.478260869565217e-07, "logits/chosen": -2.590862512588501, "logits/rejected": -2.650088310241699, "logps/chosen": -49.415164947509766, "logps/rejected": -110.45014190673828, "loss": 0.607, "rewards/accuracies": 0.875, "rewards/chosen": 1.0089795589447021, "rewards/margins": 4.60223388671875, "rewards/rejected": -3.593255043029785, "step": 631 }, { "epoch": 1.5265700483091789, "grad_norm": 4.027966499328613, "learning_rate": 8.475845410628019e-07, "logits/chosen": -2.647732973098755, "logits/rejected": -2.6750235557556152, "logps/chosen": -45.60964584350586, "logps/rejected": -117.05667877197266, "loss": 0.5209, "rewards/accuracies": 0.90625, "rewards/chosen": 1.169980764389038, "rewards/margins": 5.390131950378418, "rewards/rejected": -4.220150947570801, "step": 632 }, { "epoch": 1.528985507246377, "grad_norm": 3.0520029067993164, "learning_rate": 8.473429951690821e-07, "logits/chosen": -2.6349411010742188, "logits/rejected": -2.6084671020507812, "logps/chosen": -39.43418884277344, "logps/rejected": -112.75638580322266, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 1.325796127319336, "rewards/margins": 5.476062297821045, "rewards/rejected": -4.150266170501709, "step": 633 }, { "epoch": 1.531400966183575, "grad_norm": 3.766964912414551, "learning_rate": 8.471014492753623e-07, "logits/chosen": -2.506166934967041, "logits/rejected": -2.541616439819336, "logps/chosen": -30.39445686340332, "logps/rejected": -134.18650817871094, "loss": 0.3882, "rewards/accuracies": 0.96875, "rewards/chosen": 2.141453266143799, "rewards/margins": 7.692721843719482, "rewards/rejected": -5.551268577575684, "step": 634 }, { "epoch": 1.533816425120773, "grad_norm": 4.542481422424316, "learning_rate": 8.468599033816425e-07, "logits/chosen": -2.55721378326416, "logits/rejected": -2.490816593170166, "logps/chosen": -28.27682113647461, "logps/rejected": -116.79496765136719, "loss": 0.3314, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5380635261535645, "rewards/margins": 6.840311050415039, "rewards/rejected": -4.302247524261475, "step": 635 }, { "epoch": 1.5362318840579712, "grad_norm": 4.1551337242126465, "learning_rate": 8.466183574879227e-07, "logits/chosen": -2.631221294403076, "logits/rejected": -2.6344857215881348, "logps/chosen": -43.76191711425781, "logps/rejected": -123.1961898803711, "loss": 0.448, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2290644645690918, "rewards/margins": 6.12510871887207, "rewards/rejected": -4.89604377746582, "step": 636 }, { "epoch": 1.538647342995169, "grad_norm": 4.045119762420654, "learning_rate": 8.463768115942028e-07, "logits/chosen": -2.6170666217803955, "logits/rejected": -2.582827568054199, "logps/chosen": -33.569190979003906, "logps/rejected": -94.93843841552734, "loss": 0.5046, "rewards/accuracies": 0.9375, "rewards/chosen": 2.134382486343384, "rewards/margins": 4.765603542327881, "rewards/rejected": -2.6312215328216553, "step": 637 }, { "epoch": 1.541062801932367, "grad_norm": 2.9123635292053223, "learning_rate": 8.461352657004831e-07, "logits/chosen": -2.6691465377807617, "logits/rejected": -2.633005380630493, "logps/chosen": -49.769065856933594, "logps/rejected": -141.697021484375, "loss": 0.5012, "rewards/accuracies": 0.96875, "rewards/chosen": 0.671323299407959, "rewards/margins": 6.769821643829346, "rewards/rejected": -6.098498821258545, "step": 638 }, { "epoch": 1.5434782608695652, "grad_norm": 3.3898885250091553, "learning_rate": 8.458937198067633e-07, "logits/chosen": -2.539238691329956, "logits/rejected": -2.5435242652893066, "logps/chosen": -59.99280548095703, "logps/rejected": -147.806884765625, "loss": 0.4844, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1426049768924713, "rewards/margins": 6.482157230377197, "rewards/rejected": -6.624762535095215, "step": 639 }, { "epoch": 1.5458937198067633, "grad_norm": 4.502304553985596, "learning_rate": 8.456521739130434e-07, "logits/chosen": -2.584613084793091, "logits/rejected": -2.566758632659912, "logps/chosen": -42.886932373046875, "logps/rejected": -106.73583221435547, "loss": 0.5425, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2443245649337769, "rewards/margins": 4.952151298522949, "rewards/rejected": -3.707826614379883, "step": 640 }, { "epoch": 1.5483091787439613, "grad_norm": 3.778538465499878, "learning_rate": 8.454106280193236e-07, "logits/chosen": -2.5495126247406006, "logits/rejected": -2.533966302871704, "logps/chosen": -41.825557708740234, "logps/rejected": -107.11492919921875, "loss": 0.5579, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2067991495132446, "rewards/margins": 4.848540782928467, "rewards/rejected": -3.6417415142059326, "step": 641 }, { "epoch": 1.5507246376811594, "grad_norm": 4.734832286834717, "learning_rate": 8.451690821256038e-07, "logits/chosen": -2.569606304168701, "logits/rejected": -2.633288860321045, "logps/chosen": -40.89497756958008, "logps/rejected": -117.94075012207031, "loss": 0.4794, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6081874370574951, "rewards/margins": 5.9080610275268555, "rewards/rejected": -4.299873352050781, "step": 642 }, { "epoch": 1.5531400966183575, "grad_norm": 4.132777690887451, "learning_rate": 8.44927536231884e-07, "logits/chosen": -2.4957470893859863, "logits/rejected": -2.5215322971343994, "logps/chosen": -44.69668197631836, "logps/rejected": -115.07266235351562, "loss": 0.5195, "rewards/accuracies": 0.875, "rewards/chosen": 1.0822653770446777, "rewards/margins": 5.0651702880859375, "rewards/rejected": -3.982905149459839, "step": 643 }, { "epoch": 1.5555555555555556, "grad_norm": 4.200713157653809, "learning_rate": 8.446859903381643e-07, "logits/chosen": -2.5692644119262695, "logits/rejected": -2.5252597332000732, "logps/chosen": -37.99854278564453, "logps/rejected": -100.64336395263672, "loss": 0.5197, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6241371631622314, "rewards/margins": 4.62413215637207, "rewards/rejected": -2.999995231628418, "step": 644 }, { "epoch": 1.5579710144927537, "grad_norm": 3.4473485946655273, "learning_rate": 8.444444444444444e-07, "logits/chosen": -2.6337313652038574, "logits/rejected": -2.553752899169922, "logps/chosen": -55.69038391113281, "logps/rejected": -111.11212158203125, "loss": 0.6433, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4524948000907898, "rewards/margins": 4.169265270233154, "rewards/rejected": -3.7167704105377197, "step": 645 }, { "epoch": 1.5603864734299517, "grad_norm": 4.716175556182861, "learning_rate": 8.442028985507245e-07, "logits/chosen": -2.629760980606079, "logits/rejected": -2.5667755603790283, "logps/chosen": -38.16816711425781, "logps/rejected": -104.1412582397461, "loss": 0.5265, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8114100694656372, "rewards/margins": 5.3134918212890625, "rewards/rejected": -3.502081871032715, "step": 646 }, { "epoch": 1.5628019323671496, "grad_norm": 4.2634758949279785, "learning_rate": 8.439613526570048e-07, "logits/chosen": -2.673330783843994, "logits/rejected": -2.5891170501708984, "logps/chosen": -45.05158996582031, "logps/rejected": -122.84365844726562, "loss": 0.4923, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9508605599403381, "rewards/margins": 5.924880027770996, "rewards/rejected": -4.9740190505981445, "step": 647 }, { "epoch": 1.5652173913043477, "grad_norm": 2.922557830810547, "learning_rate": 8.43719806763285e-07, "logits/chosen": -2.5564448833465576, "logits/rejected": -2.5416815280914307, "logps/chosen": -37.51532745361328, "logps/rejected": -121.79499053955078, "loss": 0.4556, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9440498352050781, "rewards/margins": 6.5179877281188965, "rewards/rejected": -4.573937892913818, "step": 648 }, { "epoch": 1.5676328502415457, "grad_norm": 3.4616599082946777, "learning_rate": 8.434782608695652e-07, "logits/chosen": -2.6112308502197266, "logits/rejected": -2.586914539337158, "logps/chosen": -36.87850570678711, "logps/rejected": -121.91585540771484, "loss": 0.4878, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6912347078323364, "rewards/margins": 6.314401149749756, "rewards/rejected": -4.623166561126709, "step": 649 }, { "epoch": 1.5700483091787438, "grad_norm": 4.536777496337891, "learning_rate": 8.432367149758454e-07, "logits/chosen": -2.6601412296295166, "logits/rejected": -2.6457645893096924, "logps/chosen": -44.51705551147461, "logps/rejected": -111.33515930175781, "loss": 0.4667, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4250808954238892, "rewards/margins": 5.4723405838012695, "rewards/rejected": -4.04725980758667, "step": 650 }, { "epoch": 1.572463768115942, "grad_norm": 4.203700542449951, "learning_rate": 8.429951690821255e-07, "logits/chosen": -2.7498977184295654, "logits/rejected": -2.6555564403533936, "logps/chosen": -40.332374572753906, "logps/rejected": -125.09993743896484, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": 1.5706101655960083, "rewards/margins": 6.310894012451172, "rewards/rejected": -4.740283489227295, "step": 651 }, { "epoch": 1.57487922705314, "grad_norm": 4.212907314300537, "learning_rate": 8.427536231884057e-07, "logits/chosen": -2.5481109619140625, "logits/rejected": -2.5348942279815674, "logps/chosen": -47.76445770263672, "logps/rejected": -122.25254821777344, "loss": 0.5391, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7802766561508179, "rewards/margins": 5.787614822387695, "rewards/rejected": -5.007338523864746, "step": 652 }, { "epoch": 1.577294685990338, "grad_norm": 3.0242953300476074, "learning_rate": 8.42512077294686e-07, "logits/chosen": -2.5940868854522705, "logits/rejected": -2.545438051223755, "logps/chosen": -50.592979431152344, "logps/rejected": -127.9167251586914, "loss": 0.527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4520726203918457, "rewards/margins": 5.81563663482666, "rewards/rejected": -5.363563537597656, "step": 653 }, { "epoch": 1.5797101449275361, "grad_norm": 6.59989070892334, "learning_rate": 8.422705314009661e-07, "logits/chosen": -2.716860294342041, "logits/rejected": -2.6880486011505127, "logps/chosen": -45.369293212890625, "logps/rejected": -107.7772445678711, "loss": 0.5858, "rewards/accuracies": 0.90625, "rewards/chosen": 1.301823377609253, "rewards/margins": 5.263678073883057, "rewards/rejected": -3.961854934692383, "step": 654 }, { "epoch": 1.5821256038647342, "grad_norm": 4.627190113067627, "learning_rate": 8.420289855072464e-07, "logits/chosen": -2.615213394165039, "logits/rejected": -2.5500588417053223, "logps/chosen": -39.71969223022461, "logps/rejected": -112.84545135498047, "loss": 0.5615, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4610612392425537, "rewards/margins": 5.521878242492676, "rewards/rejected": -4.060816764831543, "step": 655 }, { "epoch": 1.5845410628019323, "grad_norm": 4.491252899169922, "learning_rate": 8.417874396135265e-07, "logits/chosen": -2.606715202331543, "logits/rejected": -2.625729560852051, "logps/chosen": -44.373905181884766, "logps/rejected": -114.03400421142578, "loss": 0.585, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2700684070587158, "rewards/margins": 5.317601680755615, "rewards/rejected": -4.04753303527832, "step": 656 }, { "epoch": 1.5869565217391304, "grad_norm": 3.4214565753936768, "learning_rate": 8.415458937198067e-07, "logits/chosen": -2.554043769836426, "logits/rejected": -2.5788683891296387, "logps/chosen": -53.254817962646484, "logps/rejected": -119.69293212890625, "loss": 0.5137, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6057918667793274, "rewards/margins": 5.118993759155273, "rewards/rejected": -4.513201713562012, "step": 657 }, { "epoch": 1.5893719806763285, "grad_norm": 3.0600802898406982, "learning_rate": 8.41304347826087e-07, "logits/chosen": -2.7010011672973633, "logits/rejected": -2.6086392402648926, "logps/chosen": -39.884849548339844, "logps/rejected": -116.74180603027344, "loss": 0.4963, "rewards/accuracies": 0.9375, "rewards/chosen": 1.614537000656128, "rewards/margins": 5.858654022216797, "rewards/rejected": -4.244117259979248, "step": 658 }, { "epoch": 1.5917874396135265, "grad_norm": 3.7925007343292236, "learning_rate": 8.410628019323671e-07, "logits/chosen": -2.6556427478790283, "logits/rejected": -2.6323180198669434, "logps/chosen": -48.00304412841797, "logps/rejected": -116.3006591796875, "loss": 0.5676, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8484882712364197, "rewards/margins": 5.20098352432251, "rewards/rejected": -4.352495193481445, "step": 659 }, { "epoch": 1.5942028985507246, "grad_norm": 4.0407586097717285, "learning_rate": 8.408212560386472e-07, "logits/chosen": -2.6457574367523193, "logits/rejected": -2.6514575481414795, "logps/chosen": -51.42637252807617, "logps/rejected": -117.99774169921875, "loss": 0.5438, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4876704216003418, "rewards/margins": 5.113872051239014, "rewards/rejected": -4.626201629638672, "step": 660 }, { "epoch": 1.5966183574879227, "grad_norm": 5.472143173217773, "learning_rate": 8.405797101449275e-07, "logits/chosen": -2.6429860591888428, "logits/rejected": -2.5850744247436523, "logps/chosen": -53.333831787109375, "logps/rejected": -126.95305633544922, "loss": 0.6037, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09961014986038208, "rewards/margins": 5.534361362457275, "rewards/rejected": -5.434751987457275, "step": 661 }, { "epoch": 1.5990338164251208, "grad_norm": 4.225754261016846, "learning_rate": 8.403381642512077e-07, "logits/chosen": -2.690913438796997, "logits/rejected": -2.6278271675109863, "logps/chosen": -51.16025161743164, "logps/rejected": -113.86221313476562, "loss": 0.6013, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7133285999298096, "rewards/margins": 4.8088507652282715, "rewards/rejected": -4.095522403717041, "step": 662 }, { "epoch": 1.6014492753623188, "grad_norm": 3.4712181091308594, "learning_rate": 8.40096618357488e-07, "logits/chosen": -2.6534383296966553, "logits/rejected": -2.6539230346679688, "logps/chosen": -50.77751159667969, "logps/rejected": -122.66464233398438, "loss": 0.4952, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6259543895721436, "rewards/margins": 5.920276165008545, "rewards/rejected": -5.294321537017822, "step": 663 }, { "epoch": 1.603864734299517, "grad_norm": 9.10732364654541, "learning_rate": 8.398550724637681e-07, "logits/chosen": -2.669902801513672, "logits/rejected": -2.6660537719726562, "logps/chosen": -57.37066650390625, "logps/rejected": -136.82798767089844, "loss": 0.5332, "rewards/accuracies": 0.9375, "rewards/chosen": 0.030838310718536377, "rewards/margins": 5.878807067871094, "rewards/rejected": -5.847969055175781, "step": 664 }, { "epoch": 1.603864734299517, "eval_logits/chosen": -2.640667676925659, "eval_logits/rejected": -2.6335926055908203, "eval_logps/chosen": -42.330562591552734, "eval_logps/rejected": -131.32115173339844, "eval_loss": 0.43364009261131287, "eval_rewards/accuracies": 0.9613970518112183, "eval_rewards/chosen": 1.233035683631897, "eval_rewards/margins": 6.731750011444092, "eval_rewards/rejected": -5.498713970184326, "eval_runtime": 999.2126, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.272, "step": 664 }, { "epoch": 1.606280193236715, "grad_norm": 4.018294811248779, "learning_rate": 8.396135265700482e-07, "logits/chosen": -2.568243980407715, "logits/rejected": -2.6337342262268066, "logps/chosen": -43.47088623046875, "logps/rejected": -113.19992065429688, "loss": 0.5045, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4012603759765625, "rewards/margins": 5.7963762283325195, "rewards/rejected": -4.395115852355957, "step": 665 }, { "epoch": 1.608695652173913, "grad_norm": 4.314920425415039, "learning_rate": 8.393719806763285e-07, "logits/chosen": -2.6372835636138916, "logits/rejected": -2.6157054901123047, "logps/chosen": -40.002498626708984, "logps/rejected": -134.19851684570312, "loss": 0.3775, "rewards/accuracies": 1.0, "rewards/chosen": 1.6402814388275146, "rewards/margins": 7.5320563316345215, "rewards/rejected": -5.891775131225586, "step": 666 }, { "epoch": 1.6111111111111112, "grad_norm": 5.934546947479248, "learning_rate": 8.391304347826087e-07, "logits/chosen": -2.6263723373413086, "logits/rejected": -2.5804409980773926, "logps/chosen": -53.408660888671875, "logps/rejected": -147.38067626953125, "loss": 0.5232, "rewards/accuracies": 1.0, "rewards/chosen": 0.08210109174251556, "rewards/margins": 6.66071891784668, "rewards/rejected": -6.578618049621582, "step": 667 }, { "epoch": 1.6135265700483092, "grad_norm": 3.1804826259613037, "learning_rate": 8.388888888888888e-07, "logits/chosen": -2.6117591857910156, "logits/rejected": -2.6033267974853516, "logps/chosen": -27.787410736083984, "logps/rejected": -108.93672180175781, "loss": 0.4137, "rewards/accuracies": 0.96875, "rewards/chosen": 2.416653871536255, "rewards/margins": 6.210757255554199, "rewards/rejected": -3.7941040992736816, "step": 668 }, { "epoch": 1.6159420289855073, "grad_norm": 2.5767099857330322, "learning_rate": 8.386473429951691e-07, "logits/chosen": -2.589995861053467, "logits/rejected": -2.626605987548828, "logps/chosen": -36.906654357910156, "logps/rejected": -125.21451568603516, "loss": 0.4236, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8725744485855103, "rewards/margins": 6.814212799072266, "rewards/rejected": -4.941637992858887, "step": 669 }, { "epoch": 1.6183574879227054, "grad_norm": 3.8746209144592285, "learning_rate": 8.384057971014492e-07, "logits/chosen": -2.592332363128662, "logits/rejected": -2.5787758827209473, "logps/chosen": -44.97412872314453, "logps/rejected": -106.86740112304688, "loss": 0.5737, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7649816870689392, "rewards/margins": 4.646505355834961, "rewards/rejected": -3.881523370742798, "step": 670 }, { "epoch": 1.6207729468599035, "grad_norm": 4.700399398803711, "learning_rate": 8.381642512077294e-07, "logits/chosen": -2.6389591693878174, "logits/rejected": -2.6026225090026855, "logps/chosen": -38.77603530883789, "logps/rejected": -126.21134948730469, "loss": 0.4386, "rewards/accuracies": 0.96875, "rewards/chosen": 1.447622299194336, "rewards/margins": 6.462416648864746, "rewards/rejected": -5.014794826507568, "step": 671 }, { "epoch": 1.6231884057971016, "grad_norm": 3.9811136722564697, "learning_rate": 8.379227053140097e-07, "logits/chosen": -2.7173147201538086, "logits/rejected": -2.6678578853607178, "logps/chosen": -42.36113357543945, "logps/rejected": -143.20504760742188, "loss": 0.3631, "rewards/accuracies": 1.0, "rewards/chosen": 1.9592411518096924, "rewards/margins": 8.092348098754883, "rewards/rejected": -6.1331071853637695, "step": 672 }, { "epoch": 1.6256038647342996, "grad_norm": 4.169699668884277, "learning_rate": 8.376811594202898e-07, "logits/chosen": -2.591356039047241, "logits/rejected": -2.6585545539855957, "logps/chosen": -38.99812698364258, "logps/rejected": -124.17424774169922, "loss": 0.4206, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8382205963134766, "rewards/margins": 6.701651096343994, "rewards/rejected": -4.863430023193359, "step": 673 }, { "epoch": 1.6280193236714977, "grad_norm": 4.5578131675720215, "learning_rate": 8.3743961352657e-07, "logits/chosen": -2.6830224990844727, "logits/rejected": -2.6215386390686035, "logps/chosen": -57.68989944458008, "logps/rejected": -128.95054626464844, "loss": 0.6105, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1684025228023529, "rewards/margins": 5.108460426330566, "rewards/rejected": -5.276863098144531, "step": 674 }, { "epoch": 1.6304347826086958, "grad_norm": 6.216601848602295, "learning_rate": 8.371980676328502e-07, "logits/chosen": -2.6116418838500977, "logits/rejected": -2.53996205329895, "logps/chosen": -35.47508239746094, "logps/rejected": -119.317138671875, "loss": 0.5948, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9382445812225342, "rewards/margins": 6.263326644897461, "rewards/rejected": -4.325082302093506, "step": 675 }, { "epoch": 1.6328502415458939, "grad_norm": 4.194127082824707, "learning_rate": 8.369565217391304e-07, "logits/chosen": -2.6494698524475098, "logits/rejected": -2.6019911766052246, "logps/chosen": -50.02307891845703, "logps/rejected": -112.41683959960938, "loss": 0.5045, "rewards/accuracies": 0.875, "rewards/chosen": 0.9129865765571594, "rewards/margins": 5.092328071594238, "rewards/rejected": -4.1793413162231445, "step": 676 }, { "epoch": 1.635265700483092, "grad_norm": 5.205430507659912, "learning_rate": 8.367149758454106e-07, "logits/chosen": -2.5719969272613525, "logits/rejected": -2.5323903560638428, "logps/chosen": -51.83967971801758, "logps/rejected": -139.45802307128906, "loss": 0.5279, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7640519142150879, "rewards/margins": 6.361582279205322, "rewards/rejected": -5.597530364990234, "step": 677 }, { "epoch": 1.6376811594202898, "grad_norm": 2.936479091644287, "learning_rate": 8.364734299516908e-07, "logits/chosen": -2.510464906692505, "logits/rejected": -2.533627986907959, "logps/chosen": -23.59852409362793, "logps/rejected": -112.0220947265625, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": 2.7775988578796387, "rewards/margins": 6.7856550216674805, "rewards/rejected": -4.008056163787842, "step": 678 }, { "epoch": 1.6400966183574879, "grad_norm": 3.74651837348938, "learning_rate": 8.36231884057971e-07, "logits/chosen": -2.640202522277832, "logits/rejected": -2.5782041549682617, "logps/chosen": -43.743412017822266, "logps/rejected": -110.66520690917969, "loss": 0.6358, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2357864379882812, "rewards/margins": 5.148478984832764, "rewards/rejected": -3.9126930236816406, "step": 679 }, { "epoch": 1.642512077294686, "grad_norm": 4.015524864196777, "learning_rate": 8.359903381642511e-07, "logits/chosen": -2.6018142700195312, "logits/rejected": -2.564903736114502, "logps/chosen": -33.81147766113281, "logps/rejected": -117.76409912109375, "loss": 0.3544, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1334095001220703, "rewards/margins": 6.832460403442383, "rewards/rejected": -4.6990509033203125, "step": 680 }, { "epoch": 1.644927536231884, "grad_norm": 4.843131065368652, "learning_rate": 8.357487922705314e-07, "logits/chosen": -2.7040538787841797, "logits/rejected": -2.68178391456604, "logps/chosen": -49.55031204223633, "logps/rejected": -123.42503356933594, "loss": 0.5006, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9622964859008789, "rewards/margins": 5.680532455444336, "rewards/rejected": -4.718235969543457, "step": 681 }, { "epoch": 1.6473429951690821, "grad_norm": 4.08967399597168, "learning_rate": 8.355072463768116e-07, "logits/chosen": -2.6560511589050293, "logits/rejected": -2.5673699378967285, "logps/chosen": -45.50414276123047, "logps/rejected": -120.25020599365234, "loss": 0.5506, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8895099759101868, "rewards/margins": 5.559074401855469, "rewards/rejected": -4.669564723968506, "step": 682 }, { "epoch": 1.6497584541062802, "grad_norm": 5.7892279624938965, "learning_rate": 8.352657004830917e-07, "logits/chosen": -2.6306958198547363, "logits/rejected": -2.551665782928467, "logps/chosen": -42.50507354736328, "logps/rejected": -128.0979766845703, "loss": 0.4749, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7362512350082397, "rewards/margins": 6.712541103363037, "rewards/rejected": -4.976289749145508, "step": 683 }, { "epoch": 1.6521739130434783, "grad_norm": 4.519760608673096, "learning_rate": 8.350241545893719e-07, "logits/chosen": -2.58857798576355, "logits/rejected": -2.551815986633301, "logps/chosen": -44.25920486450195, "logps/rejected": -101.20939636230469, "loss": 0.6498, "rewards/accuracies": 0.96875, "rewards/chosen": 1.266303300857544, "rewards/margins": 4.600747108459473, "rewards/rejected": -3.334444046020508, "step": 684 }, { "epoch": 1.6545893719806763, "grad_norm": 3.9189188480377197, "learning_rate": 8.347826086956521e-07, "logits/chosen": -2.5580973625183105, "logits/rejected": -2.487110137939453, "logps/chosen": -44.637577056884766, "logps/rejected": -134.0721893310547, "loss": 0.4676, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1410737037658691, "rewards/margins": 6.59312105178833, "rewards/rejected": -5.452047824859619, "step": 685 }, { "epoch": 1.6570048309178744, "grad_norm": 5.734509468078613, "learning_rate": 8.345410628019324e-07, "logits/chosen": -2.6054184436798096, "logits/rejected": -2.5828309059143066, "logps/chosen": -43.867618560791016, "logps/rejected": -128.30923461914062, "loss": 0.5069, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4420275688171387, "rewards/margins": 6.183282375335693, "rewards/rejected": -4.741254806518555, "step": 686 }, { "epoch": 1.6594202898550725, "grad_norm": 3.6031460762023926, "learning_rate": 8.342995169082126e-07, "logits/chosen": -2.682182550430298, "logits/rejected": -2.655451774597168, "logps/chosen": -37.113582611083984, "logps/rejected": -134.7492218017578, "loss": 0.3918, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8988933563232422, "rewards/margins": 7.352386951446533, "rewards/rejected": -5.453493118286133, "step": 687 }, { "epoch": 1.6618357487922706, "grad_norm": 4.74965238571167, "learning_rate": 8.340579710144927e-07, "logits/chosen": -2.5965564250946045, "logits/rejected": -2.693492889404297, "logps/chosen": -56.87493133544922, "logps/rejected": -130.0946044921875, "loss": 0.4944, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25776422023773193, "rewards/margins": 5.554319381713867, "rewards/rejected": -5.296555519104004, "step": 688 }, { "epoch": 1.6642512077294684, "grad_norm": 3.9727447032928467, "learning_rate": 8.338164251207729e-07, "logits/chosen": -2.6421327590942383, "logits/rejected": -2.6118111610412598, "logps/chosen": -49.339439392089844, "logps/rejected": -112.81758880615234, "loss": 0.5104, "rewards/accuracies": 0.9375, "rewards/chosen": 1.247067928314209, "rewards/margins": 4.88437557220459, "rewards/rejected": -3.637307643890381, "step": 689 }, { "epoch": 1.6666666666666665, "grad_norm": 4.611842632293701, "learning_rate": 8.335748792270531e-07, "logits/chosen": -2.530153512954712, "logits/rejected": -2.5580732822418213, "logps/chosen": -34.339839935302734, "logps/rejected": -109.2869873046875, "loss": 0.3996, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9869335889816284, "rewards/margins": 5.908181667327881, "rewards/rejected": -3.921247720718384, "step": 690 }, { "epoch": 1.6690821256038646, "grad_norm": 3.9192841053009033, "learning_rate": 8.333333333333333e-07, "logits/chosen": -2.5916898250579834, "logits/rejected": -2.539670944213867, "logps/chosen": -34.69058609008789, "logps/rejected": -118.1583251953125, "loss": 0.4865, "rewards/accuracies": 0.875, "rewards/chosen": 1.8002707958221436, "rewards/margins": 5.929560661315918, "rewards/rejected": -4.1292901039123535, "step": 691 }, { "epoch": 1.6714975845410627, "grad_norm": 4.859731674194336, "learning_rate": 8.330917874396135e-07, "logits/chosen": -2.6350831985473633, "logits/rejected": -2.5612196922302246, "logps/chosen": -44.70105743408203, "logps/rejected": -105.22042083740234, "loss": 0.5688, "rewards/accuracies": 0.9375, "rewards/chosen": 1.243877649307251, "rewards/margins": 4.739588737487793, "rewards/rejected": -3.495711326599121, "step": 692 }, { "epoch": 1.6739130434782608, "grad_norm": 6.984130382537842, "learning_rate": 8.328502415458937e-07, "logits/chosen": -2.566648483276367, "logits/rejected": -2.554666042327881, "logps/chosen": -34.45218276977539, "logps/rejected": -108.9013900756836, "loss": 0.4785, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7165112495422363, "rewards/margins": 5.5175347328186035, "rewards/rejected": -3.801023006439209, "step": 693 }, { "epoch": 1.6763285024154588, "grad_norm": 5.328852653503418, "learning_rate": 8.326086956521738e-07, "logits/chosen": -2.5774080753326416, "logits/rejected": -2.5759694576263428, "logps/chosen": -28.310564041137695, "logps/rejected": -93.42249298095703, "loss": 0.4229, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6191189289093018, "rewards/margins": 5.4077982902526855, "rewards/rejected": -2.788679361343384, "step": 694 }, { "epoch": 1.678743961352657, "grad_norm": 5.1253814697265625, "learning_rate": 8.323671497584542e-07, "logits/chosen": -2.6227571964263916, "logits/rejected": -2.575604200363159, "logps/chosen": -59.15333557128906, "logps/rejected": -114.92227935791016, "loss": 0.6569, "rewards/accuracies": 0.875, "rewards/chosen": 0.060044899582862854, "rewards/margins": 4.205252170562744, "rewards/rejected": -4.145207405090332, "step": 695 }, { "epoch": 1.681159420289855, "grad_norm": 4.4994425773620605, "learning_rate": 8.321256038647343e-07, "logits/chosen": -2.5850632190704346, "logits/rejected": -2.6007320880889893, "logps/chosen": -42.49415588378906, "logps/rejected": -105.35298919677734, "loss": 0.5576, "rewards/accuracies": 0.875, "rewards/chosen": 1.3659298419952393, "rewards/margins": 4.830303192138672, "rewards/rejected": -3.4643735885620117, "step": 696 }, { "epoch": 1.683574879227053, "grad_norm": 6.270987510681152, "learning_rate": 8.318840579710144e-07, "logits/chosen": -2.582374334335327, "logits/rejected": -2.527193069458008, "logps/chosen": -47.47509765625, "logps/rejected": -129.29757690429688, "loss": 0.532, "rewards/accuracies": 0.875, "rewards/chosen": 0.7118663787841797, "rewards/margins": 5.7445831298828125, "rewards/rejected": -5.032716751098633, "step": 697 }, { "epoch": 1.6859903381642511, "grad_norm": 3.61926007270813, "learning_rate": 8.316425120772947e-07, "logits/chosen": -2.588880777359009, "logits/rejected": -2.6207995414733887, "logps/chosen": -40.75214385986328, "logps/rejected": -117.21098327636719, "loss": 0.5492, "rewards/accuracies": 0.875, "rewards/chosen": 1.3699971437454224, "rewards/margins": 5.629952907562256, "rewards/rejected": -4.259955406188965, "step": 698 }, { "epoch": 1.6884057971014492, "grad_norm": 5.470647811889648, "learning_rate": 8.314009661835748e-07, "logits/chosen": -2.558736801147461, "logits/rejected": -2.5256195068359375, "logps/chosen": -43.57239532470703, "logps/rejected": -112.29236602783203, "loss": 0.5076, "rewards/accuracies": 0.875, "rewards/chosen": 1.3800301551818848, "rewards/margins": 5.135665416717529, "rewards/rejected": -3.7556357383728027, "step": 699 }, { "epoch": 1.6908212560386473, "grad_norm": 2.9075982570648193, "learning_rate": 8.31159420289855e-07, "logits/chosen": -2.6528306007385254, "logits/rejected": -2.5359888076782227, "logps/chosen": -36.691471099853516, "logps/rejected": -115.70509338378906, "loss": 0.5407, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5793873071670532, "rewards/margins": 5.713630676269531, "rewards/rejected": -4.134243965148926, "step": 700 }, { "epoch": 1.6932367149758454, "grad_norm": 5.4854302406311035, "learning_rate": 8.309178743961353e-07, "logits/chosen": -2.65466046333313, "logits/rejected": -2.6647040843963623, "logps/chosen": -40.72132110595703, "logps/rejected": -116.84257507324219, "loss": 0.5194, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7418142557144165, "rewards/margins": 5.7852702140808105, "rewards/rejected": -4.043456077575684, "step": 701 }, { "epoch": 1.6956521739130435, "grad_norm": 2.879545211791992, "learning_rate": 8.306763285024154e-07, "logits/chosen": -2.5807721614837646, "logits/rejected": -2.5439751148223877, "logps/chosen": -49.62998580932617, "logps/rejected": -131.18692016601562, "loss": 0.4962, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0946253538131714, "rewards/margins": 6.1236066818237305, "rewards/rejected": -5.028981685638428, "step": 702 }, { "epoch": 1.6980676328502415, "grad_norm": 3.3539865016937256, "learning_rate": 8.304347826086955e-07, "logits/chosen": -2.66974139213562, "logits/rejected": -2.594677448272705, "logps/chosen": -40.944427490234375, "logps/rejected": -121.70238494873047, "loss": 0.3899, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4151663780212402, "rewards/margins": 5.7166337966918945, "rewards/rejected": -4.301467418670654, "step": 703 }, { "epoch": 1.7004830917874396, "grad_norm": 6.030129909515381, "learning_rate": 8.301932367149758e-07, "logits/chosen": -2.5757622718811035, "logits/rejected": -2.5729143619537354, "logps/chosen": -38.42225646972656, "logps/rejected": -113.12271881103516, "loss": 0.4489, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8774081468582153, "rewards/margins": 5.754689693450928, "rewards/rejected": -3.8772811889648438, "step": 704 }, { "epoch": 1.7028985507246377, "grad_norm": 4.322689533233643, "learning_rate": 8.29951690821256e-07, "logits/chosen": -2.586172580718994, "logits/rejected": -2.5405333042144775, "logps/chosen": -46.78536605834961, "logps/rejected": -110.49565124511719, "loss": 0.608, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9354375600814819, "rewards/margins": 4.690542697906494, "rewards/rejected": -3.7551050186157227, "step": 705 }, { "epoch": 1.7053140096618358, "grad_norm": 6.315699577331543, "learning_rate": 8.297101449275363e-07, "logits/chosen": -2.456275701522827, "logits/rejected": -2.4550728797912598, "logps/chosen": -38.871253967285156, "logps/rejected": -116.57318878173828, "loss": 0.4645, "rewards/accuracies": 0.9375, "rewards/chosen": 1.536694049835205, "rewards/margins": 5.792900562286377, "rewards/rejected": -4.256206035614014, "step": 706 }, { "epoch": 1.7077294685990339, "grad_norm": 4.693209648132324, "learning_rate": 8.294685990338164e-07, "logits/chosen": -2.481675624847412, "logits/rejected": -2.4697697162628174, "logps/chosen": -44.12788391113281, "logps/rejected": -124.55241394042969, "loss": 0.469, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3703503608703613, "rewards/margins": 6.001958847045898, "rewards/rejected": -4.631608486175537, "step": 707 }, { "epoch": 1.710144927536232, "grad_norm": 3.701059579849243, "learning_rate": 8.292270531400965e-07, "logits/chosen": -2.5465385913848877, "logits/rejected": -2.5209388732910156, "logps/chosen": -45.402137756347656, "logps/rejected": -128.00079345703125, "loss": 0.513, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1591672897338867, "rewards/margins": 5.916379928588867, "rewards/rejected": -4.757212162017822, "step": 708 }, { "epoch": 1.71256038647343, "grad_norm": 4.151609420776367, "learning_rate": 8.289855072463768e-07, "logits/chosen": -2.6339168548583984, "logits/rejected": -2.5725622177124023, "logps/chosen": -40.39990234375, "logps/rejected": -102.86223602294922, "loss": 0.6654, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4270565509796143, "rewards/margins": 4.819352626800537, "rewards/rejected": -3.39229679107666, "step": 709 }, { "epoch": 1.714975845410628, "grad_norm": 4.867971420288086, "learning_rate": 8.28743961352657e-07, "logits/chosen": -2.5419414043426514, "logits/rejected": -2.530094861984253, "logps/chosen": -42.21279525756836, "logps/rejected": -129.80160522460938, "loss": 0.4835, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2870540618896484, "rewards/margins": 6.30351448059082, "rewards/rejected": -5.016460418701172, "step": 710 }, { "epoch": 1.7173913043478262, "grad_norm": 3.9888463020324707, "learning_rate": 8.285024154589371e-07, "logits/chosen": -2.563748359680176, "logits/rejected": -2.6093955039978027, "logps/chosen": -38.43497085571289, "logps/rejected": -123.67083740234375, "loss": 0.3889, "rewards/accuracies": 1.0, "rewards/chosen": 1.5998855829238892, "rewards/margins": 6.477542877197266, "rewards/rejected": -4.877657413482666, "step": 711 }, { "epoch": 1.7198067632850242, "grad_norm": 5.128357887268066, "learning_rate": 8.282608695652174e-07, "logits/chosen": -2.534259796142578, "logits/rejected": -2.4828672409057617, "logps/chosen": -39.61003494262695, "logps/rejected": -102.78076934814453, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 1.8035051822662354, "rewards/margins": 4.835464954376221, "rewards/rejected": -3.0319604873657227, "step": 712 }, { "epoch": 1.7222222222222223, "grad_norm": 5.145055294036865, "learning_rate": 8.280193236714975e-07, "logits/chosen": -2.509603977203369, "logits/rejected": -2.5250539779663086, "logps/chosen": -41.598426818847656, "logps/rejected": -126.12289428710938, "loss": 0.5093, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5768232345581055, "rewards/margins": 6.293777942657471, "rewards/rejected": -4.716955184936523, "step": 713 }, { "epoch": 1.7246376811594204, "grad_norm": 4.323469638824463, "learning_rate": 8.277777777777777e-07, "logits/chosen": -2.612966537475586, "logits/rejected": -2.5411715507507324, "logps/chosen": -32.8932991027832, "logps/rejected": -111.87124633789062, "loss": 0.4185, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9191452264785767, "rewards/margins": 5.911159038543701, "rewards/rejected": -3.992013692855835, "step": 714 }, { "epoch": 1.7270531400966185, "grad_norm": 3.2365527153015137, "learning_rate": 8.27536231884058e-07, "logits/chosen": -2.580794334411621, "logits/rejected": -2.607008457183838, "logps/chosen": -53.230709075927734, "logps/rejected": -119.60699462890625, "loss": 0.5672, "rewards/accuracies": 0.90625, "rewards/chosen": 0.544113278388977, "rewards/margins": 4.809896469116211, "rewards/rejected": -4.265783309936523, "step": 715 }, { "epoch": 1.7294685990338166, "grad_norm": 4.716981887817383, "learning_rate": 8.272946859903381e-07, "logits/chosen": -2.535529613494873, "logits/rejected": -2.51076340675354, "logps/chosen": -61.27044677734375, "logps/rejected": -107.56881713867188, "loss": 0.6934, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04375100135803223, "rewards/margins": 3.4709925651550293, "rewards/rejected": -3.5147433280944824, "step": 716 }, { "epoch": 1.7318840579710146, "grad_norm": 3.5784897804260254, "learning_rate": 8.270531400966183e-07, "logits/chosen": -2.5856382846832275, "logits/rejected": -2.5755228996276855, "logps/chosen": -49.66697692871094, "logps/rejected": -116.67991638183594, "loss": 0.5431, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5862114429473877, "rewards/margins": 5.089511871337891, "rewards/rejected": -4.503299713134766, "step": 717 }, { "epoch": 1.7342995169082127, "grad_norm": 3.190159320831299, "learning_rate": 8.268115942028985e-07, "logits/chosen": -2.5583527088165283, "logits/rejected": -2.541348457336426, "logps/chosen": -44.75562286376953, "logps/rejected": -137.61517333984375, "loss": 0.3616, "rewards/accuracies": 0.96875, "rewards/chosen": 1.064576268196106, "rewards/margins": 7.118574142456055, "rewards/rejected": -6.05399751663208, "step": 718 }, { "epoch": 1.7367149758454108, "grad_norm": 4.632556438446045, "learning_rate": 8.265700483091787e-07, "logits/chosen": -2.595391035079956, "logits/rejected": -2.5809922218322754, "logps/chosen": -31.673879623413086, "logps/rejected": -116.27613067626953, "loss": 0.3367, "rewards/accuracies": 0.96875, "rewards/chosen": 2.208860158920288, "rewards/margins": 6.383100509643555, "rewards/rejected": -4.1742401123046875, "step": 719 }, { "epoch": 1.7391304347826086, "grad_norm": 4.097049236297607, "learning_rate": 8.26328502415459e-07, "logits/chosen": -2.5489044189453125, "logits/rejected": -2.5219950675964355, "logps/chosen": -35.57185745239258, "logps/rejected": -112.2398452758789, "loss": 0.5102, "rewards/accuracies": 0.96875, "rewards/chosen": 1.758033037185669, "rewards/margins": 5.876170635223389, "rewards/rejected": -4.118137836456299, "step": 720 }, { "epoch": 1.7415458937198067, "grad_norm": 3.8574235439300537, "learning_rate": 8.260869565217391e-07, "logits/chosen": -2.63019061088562, "logits/rejected": -2.5349233150482178, "logps/chosen": -49.770118713378906, "logps/rejected": -131.81561279296875, "loss": 0.507, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9393612146377563, "rewards/margins": 6.4334917068481445, "rewards/rejected": -5.494130611419678, "step": 721 }, { "epoch": 1.7439613526570048, "grad_norm": 5.643762111663818, "learning_rate": 8.258454106280193e-07, "logits/chosen": -2.5615484714508057, "logits/rejected": -2.5688040256500244, "logps/chosen": -56.02629470825195, "logps/rejected": -132.80691528320312, "loss": 0.5431, "rewards/accuracies": 0.96875, "rewards/chosen": 0.42821604013442993, "rewards/margins": 5.779528617858887, "rewards/rejected": -5.351312160491943, "step": 722 }, { "epoch": 1.7463768115942029, "grad_norm": 3.8469481468200684, "learning_rate": 8.256038647342994e-07, "logits/chosen": -2.54687762260437, "logits/rejected": -2.5416860580444336, "logps/chosen": -35.43252944946289, "logps/rejected": -119.33303833007812, "loss": 0.4272, "rewards/accuracies": 0.9375, "rewards/chosen": 1.955466866493225, "rewards/margins": 6.224923133850098, "rewards/rejected": -4.269456386566162, "step": 723 }, { "epoch": 1.748792270531401, "grad_norm": 4.856188774108887, "learning_rate": 8.253623188405797e-07, "logits/chosen": -2.544487476348877, "logits/rejected": -2.600116014480591, "logps/chosen": -43.67333221435547, "logps/rejected": -129.86712646484375, "loss": 0.4495, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5380975008010864, "rewards/margins": 6.4947309494018555, "rewards/rejected": -4.956633567810059, "step": 724 }, { "epoch": 1.751207729468599, "grad_norm": 4.0698561668396, "learning_rate": 8.251207729468599e-07, "logits/chosen": -2.5692989826202393, "logits/rejected": -2.542374849319458, "logps/chosen": -32.810062408447266, "logps/rejected": -120.13160705566406, "loss": 0.4211, "rewards/accuracies": 1.0, "rewards/chosen": 2.1530628204345703, "rewards/margins": 6.676784515380859, "rewards/rejected": -4.523721218109131, "step": 725 }, { "epoch": 1.7536231884057971, "grad_norm": 4.745157241821289, "learning_rate": 8.248792270531401e-07, "logits/chosen": -2.612773895263672, "logits/rejected": -2.513465166091919, "logps/chosen": -50.2326774597168, "logps/rejected": -118.28324890136719, "loss": 0.592, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8672359585762024, "rewards/margins": 5.0636115074157715, "rewards/rejected": -4.196375846862793, "step": 726 }, { "epoch": 1.7560386473429952, "grad_norm": 4.193182468414307, "learning_rate": 8.246376811594202e-07, "logits/chosen": -2.5717334747314453, "logits/rejected": -2.4901695251464844, "logps/chosen": -35.00527572631836, "logps/rejected": -113.81985473632812, "loss": 0.5461, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7758668661117554, "rewards/margins": 5.932925701141357, "rewards/rejected": -4.1570587158203125, "step": 727 }, { "epoch": 1.7584541062801933, "grad_norm": 5.443159580230713, "learning_rate": 8.243961352657004e-07, "logits/chosen": -2.5862929821014404, "logits/rejected": -2.5619919300079346, "logps/chosen": -37.2120361328125, "logps/rejected": -129.5594482421875, "loss": 0.4011, "rewards/accuracies": 1.0, "rewards/chosen": 1.7199870347976685, "rewards/margins": 6.984579086303711, "rewards/rejected": -5.264591693878174, "step": 728 }, { "epoch": 1.7608695652173914, "grad_norm": 3.219655752182007, "learning_rate": 8.241545893719807e-07, "logits/chosen": -2.539716958999634, "logits/rejected": -2.5221121311187744, "logps/chosen": -46.84470748901367, "logps/rejected": -120.759521484375, "loss": 0.4624, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0560097694396973, "rewards/margins": 5.697183609008789, "rewards/rejected": -4.64117431640625, "step": 729 }, { "epoch": 1.7632850241545892, "grad_norm": 5.485056400299072, "learning_rate": 8.239130434782609e-07, "logits/chosen": -2.546609401702881, "logits/rejected": -2.561067581176758, "logps/chosen": -63.84901809692383, "logps/rejected": -141.34275817871094, "loss": 0.5212, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2230173498392105, "rewards/margins": 5.686964511871338, "rewards/rejected": -5.9099812507629395, "step": 730 }, { "epoch": 1.7657004830917873, "grad_norm": 3.9665935039520264, "learning_rate": 8.23671497584541e-07, "logits/chosen": -2.5869011878967285, "logits/rejected": -2.5848591327667236, "logps/chosen": -46.834739685058594, "logps/rejected": -122.87374877929688, "loss": 0.5389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7779910564422607, "rewards/margins": 5.129876136779785, "rewards/rejected": -4.3518853187561035, "step": 731 }, { "epoch": 1.7681159420289854, "grad_norm": 3.8456478118896484, "learning_rate": 8.234299516908212e-07, "logits/chosen": -2.535996913909912, "logits/rejected": -2.4817111492156982, "logps/chosen": -32.000396728515625, "logps/rejected": -118.93661499023438, "loss": 0.4435, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8330600261688232, "rewards/margins": 6.242949485778809, "rewards/rejected": -4.409888744354248, "step": 732 }, { "epoch": 1.7705314009661834, "grad_norm": 4.43218994140625, "learning_rate": 8.231884057971014e-07, "logits/chosen": -2.4960012435913086, "logits/rejected": -2.5182034969329834, "logps/chosen": -31.299055099487305, "logps/rejected": -122.56832885742188, "loss": 0.4842, "rewards/accuracies": 0.9375, "rewards/chosen": 2.392033576965332, "rewards/margins": 6.336233139038086, "rewards/rejected": -3.9441990852355957, "step": 733 }, { "epoch": 1.7729468599033815, "grad_norm": 5.6440300941467285, "learning_rate": 8.229468599033816e-07, "logits/chosen": -2.5733718872070312, "logits/rejected": -2.6034321784973145, "logps/chosen": -50.222267150878906, "logps/rejected": -118.35313415527344, "loss": 0.5952, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8258697986602783, "rewards/margins": 5.1110992431640625, "rewards/rejected": -4.285229682922363, "step": 734 }, { "epoch": 1.7753623188405796, "grad_norm": 4.754676342010498, "learning_rate": 8.227053140096618e-07, "logits/chosen": -2.555070400238037, "logits/rejected": -2.5478250980377197, "logps/chosen": -42.14895248413086, "logps/rejected": -115.41665649414062, "loss": 0.4141, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2835447788238525, "rewards/margins": 5.500647068023682, "rewards/rejected": -4.217103004455566, "step": 735 }, { "epoch": 1.7777777777777777, "grad_norm": 3.831775188446045, "learning_rate": 8.22463768115942e-07, "logits/chosen": -2.591989517211914, "logits/rejected": -2.5147008895874023, "logps/chosen": -48.65517044067383, "logps/rejected": -134.2303466796875, "loss": 0.4784, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7364754676818848, "rewards/margins": 6.011836051940918, "rewards/rejected": -5.275360584259033, "step": 736 }, { "epoch": 1.7801932367149758, "grad_norm": 5.786038875579834, "learning_rate": 8.222222222222221e-07, "logits/chosen": -2.5085854530334473, "logits/rejected": -2.478555679321289, "logps/chosen": -41.17864990234375, "logps/rejected": -121.15592956542969, "loss": 0.5117, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4784342050552368, "rewards/margins": 5.911670207977295, "rewards/rejected": -4.4332356452941895, "step": 737 }, { "epoch": 1.7826086956521738, "grad_norm": 4.330005645751953, "learning_rate": 8.219806763285025e-07, "logits/chosen": -2.5626072883605957, "logits/rejected": -2.4783811569213867, "logps/chosen": -37.13644790649414, "logps/rejected": -105.12569427490234, "loss": 0.5027, "rewards/accuracies": 0.96875, "rewards/chosen": 1.867459774017334, "rewards/margins": 5.141822814941406, "rewards/rejected": -3.2743630409240723, "step": 738 }, { "epoch": 1.785024154589372, "grad_norm": 2.6937408447265625, "learning_rate": 8.217391304347826e-07, "logits/chosen": -2.6344552040100098, "logits/rejected": -2.5614736080169678, "logps/chosen": -31.051170349121094, "logps/rejected": -107.25856018066406, "loss": 0.4429, "rewards/accuracies": 0.875, "rewards/chosen": 2.30002498626709, "rewards/margins": 5.89914083480835, "rewards/rejected": -3.5991156101226807, "step": 739 }, { "epoch": 1.78743961352657, "grad_norm": 2.641336441040039, "learning_rate": 8.214975845410627e-07, "logits/chosen": -2.630413770675659, "logits/rejected": -2.5239803791046143, "logps/chosen": -65.26165771484375, "logps/rejected": -147.69717407226562, "loss": 0.5382, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4418967664241791, "rewards/margins": 5.832775592803955, "rewards/rejected": -6.274672508239746, "step": 740 }, { "epoch": 1.789855072463768, "grad_norm": 4.974237442016602, "learning_rate": 8.21256038647343e-07, "logits/chosen": -2.6504523754119873, "logits/rejected": -2.5905356407165527, "logps/chosen": -47.347740173339844, "logps/rejected": -126.4474105834961, "loss": 0.5638, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3650896549224854, "rewards/margins": 6.016920566558838, "rewards/rejected": -4.651830673217773, "step": 741 }, { "epoch": 1.7922705314009661, "grad_norm": 4.696399211883545, "learning_rate": 8.210144927536231e-07, "logits/chosen": -2.547586441040039, "logits/rejected": -2.551908493041992, "logps/chosen": -65.14665985107422, "logps/rejected": -126.37115478515625, "loss": 0.6753, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6408499479293823, "rewards/margins": 4.234805107116699, "rewards/rejected": -4.875655651092529, "step": 742 }, { "epoch": 1.7946859903381642, "grad_norm": 3.6934316158294678, "learning_rate": 8.207729468599034e-07, "logits/chosen": -2.6033551692962646, "logits/rejected": -2.5551671981811523, "logps/chosen": -50.07727813720703, "logps/rejected": -121.0328369140625, "loss": 0.4894, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8608310222625732, "rewards/margins": 5.426725387573242, "rewards/rejected": -4.56589412689209, "step": 743 }, { "epoch": 1.7971014492753623, "grad_norm": 3.4633936882019043, "learning_rate": 8.205314009661836e-07, "logits/chosen": -2.6272125244140625, "logits/rejected": -2.6020846366882324, "logps/chosen": -39.01151657104492, "logps/rejected": -135.56382751464844, "loss": 0.3897, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7945120334625244, "rewards/margins": 7.468781471252441, "rewards/rejected": -5.674269199371338, "step": 744 }, { "epoch": 1.7995169082125604, "grad_norm": 3.733128309249878, "learning_rate": 8.202898550724637e-07, "logits/chosen": -2.5998289585113525, "logits/rejected": -2.533222198486328, "logps/chosen": -50.09972381591797, "logps/rejected": -130.3362274169922, "loss": 0.5345, "rewards/accuracies": 0.96875, "rewards/chosen": 0.592058539390564, "rewards/margins": 5.906798362731934, "rewards/rejected": -5.31473970413208, "step": 745 }, { "epoch": 1.8019323671497585, "grad_norm": 4.55730676651001, "learning_rate": 8.200483091787439e-07, "logits/chosen": -2.5570178031921387, "logits/rejected": -2.4997830390930176, "logps/chosen": -42.770164489746094, "logps/rejected": -125.0755615234375, "loss": 0.4896, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5903871059417725, "rewards/margins": 6.527333736419678, "rewards/rejected": -4.936946392059326, "step": 746 }, { "epoch": 1.8043478260869565, "grad_norm": 4.972842693328857, "learning_rate": 8.198067632850241e-07, "logits/chosen": -2.498180389404297, "logits/rejected": -2.536853551864624, "logps/chosen": -54.178043365478516, "logps/rejected": -133.96298217773438, "loss": 0.575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.49662649631500244, "rewards/margins": 5.722796440124512, "rewards/rejected": -5.226170539855957, "step": 747 }, { "epoch": 1.8043478260869565, "eval_logits/chosen": -2.618844985961914, "eval_logits/rejected": -2.6072070598602295, "eval_logps/chosen": -38.802528381347656, "eval_logps/rejected": -125.60008239746094, "eval_loss": 0.4260672330856323, "eval_rewards/accuracies": 0.9558823704719543, "eval_rewards/chosen": 1.5858384370803833, "eval_rewards/margins": 6.512447357177734, "eval_rewards/rejected": -4.926609039306641, "eval_runtime": 1000.6353, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.272, "step": 747 } ], "logging_steps": 1.0, "max_steps": 4140, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 83, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }