diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 4168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002399232245681382, + "grad_norm": 3.5056792836862627, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -1.4883875846862793, + "logits/rejected": -1.416823148727417, + "logps/chosen": -161.24717712402344, + "logps/rejected": -175.51541137695312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0023992322456813818, + "grad_norm": 3.8174848446929266, + "learning_rate": 1.199040767386091e-08, + "logits/chosen": -1.6635231971740723, + "logits/rejected": -1.6545089483261108, + "logps/chosen": -398.12603759765625, + "logps/rejected": -322.4006652832031, + "loss": 0.6932, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": 0.00020433742611203343, + "rewards/margins": 0.000947743421420455, + "rewards/rejected": -0.000743405893445015, + "step": 10 + }, + { + "epoch": 0.0047984644913627635, + "grad_norm": 3.863373696596945, + "learning_rate": 2.398081534772182e-08, + "logits/chosen": -1.6468368768692017, + "logits/rejected": -1.6784213781356812, + "logps/chosen": -268.8175354003906, + "logps/rejected": -237.06240844726562, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0003671760787256062, + "rewards/margins": 0.00017412376473657787, + "rewards/rejected": 0.00019305227033328265, + "step": 20 + }, + { + "epoch": 0.007197696737044146, + "grad_norm": 3.7081110480933135, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -1.5468581914901733, + "logits/rejected": -1.542797327041626, + "logps/chosen": -266.8534240722656, + "logps/rejected": -267.03790283203125, + "loss": 0.6934, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0006165923550724983, + "rewards/margins": 0.0003948546072933823, + "rewards/rejected": 0.00022173782053869218, + "step": 30 + }, + { + "epoch": 0.009596928982725527, + "grad_norm": 3.5686633612597114, + "learning_rate": 4.796163069544364e-08, + "logits/chosen": -1.6888965368270874, + "logits/rejected": -1.651166319847107, + "logps/chosen": -269.12921142578125, + "logps/rejected": -259.73663330078125, + "loss": 0.6925, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.001302229822613299, + "rewards/margins": 0.001623004674911499, + "rewards/rejected": -0.00032077505602501333, + "step": 40 + }, + { + "epoch": 0.01199616122840691, + "grad_norm": 3.8364678475966993, + "learning_rate": 5.995203836930455e-08, + "logits/chosen": -1.7128832340240479, + "logits/rejected": -1.6672801971435547, + "logps/chosen": -294.93475341796875, + "logps/rejected": -250.9867401123047, + "loss": 0.6932, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0001523580722277984, + "rewards/margins": -0.000665490108076483, + "rewards/rejected": 0.0008178481948561966, + "step": 50 + }, + { + "epoch": 0.014395393474088292, + "grad_norm": 3.908172449607729, + "learning_rate": 7.194244604316546e-08, + "logits/chosen": -1.5987389087677002, + "logits/rejected": -1.609201192855835, + "logps/chosen": -311.1792907714844, + "logps/rejected": -277.0119323730469, + "loss": 0.6935, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004900293424725533, + "rewards/margins": -1.3016722732572816e-05, + "rewards/rejected": 0.0005030458560213447, + "step": 60 + }, + { + "epoch": 0.016794625719769675, + "grad_norm": 3.4877417634951255, + "learning_rate": 8.393285371702638e-08, + "logits/chosen": -1.6401567459106445, + "logits/rejected": -1.6593749523162842, + "logps/chosen": -300.8813171386719, + "logps/rejected": -285.99005126953125, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007752370438538492, + "rewards/margins": 0.0012944363988935947, + "rewards/rejected": -0.0005191992968320847, + "step": 70 + }, + { + "epoch": 0.019193857965451054, + "grad_norm": 3.8477961171736634, + "learning_rate": 9.592326139088728e-08, + "logits/chosen": -1.5876344442367554, + "logits/rejected": -1.619933843612671, + "logps/chosen": -222.11495971679688, + "logps/rejected": -259.1878967285156, + "loss": 0.6932, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0002348280104342848, + "rewards/margins": -0.0016285456949844956, + "rewards/rejected": 0.00186337367631495, + "step": 80 + }, + { + "epoch": 0.021593090211132437, + "grad_norm": 3.658568799391329, + "learning_rate": 1.0791366906474819e-07, + "logits/chosen": -1.6128196716308594, + "logits/rejected": -1.612391471862793, + "logps/chosen": -364.6415710449219, + "logps/rejected": -313.7099914550781, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00037602329393848777, + "rewards/margins": 0.0007518329657614231, + "rewards/rejected": -0.00112785620149225, + "step": 90 + }, + { + "epoch": 0.02399232245681382, + "grad_norm": 3.8164854638361025, + "learning_rate": 1.199040767386091e-07, + "logits/chosen": -1.6457388401031494, + "logits/rejected": -1.6554689407348633, + "logps/chosen": -280.8997802734375, + "logps/rejected": -295.6603698730469, + "loss": 0.6926, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.001076525659300387, + "rewards/margins": -0.00022420981258619577, + "rewards/rejected": -0.0008523158612661064, + "step": 100 + }, + { + "epoch": 0.026391554702495202, + "grad_norm": 3.251687359051961, + "learning_rate": 1.3189448441247004e-07, + "logits/chosen": -1.6135714054107666, + "logits/rejected": -1.6114752292633057, + "logps/chosen": -247.7429962158203, + "logps/rejected": -244.31112670898438, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0007790708914399147, + "rewards/margins": 0.0011699094902724028, + "rewards/rejected": -0.0019489802652969956, + "step": 110 + }, + { + "epoch": 0.028790786948176585, + "grad_norm": 3.6767857848694434, + "learning_rate": 1.4388489208633092e-07, + "logits/chosen": -1.643938660621643, + "logits/rejected": -1.6287577152252197, + "logps/chosen": -324.9435729980469, + "logps/rejected": -294.64166259765625, + "loss": 0.693, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.0025485847145318985, + "rewards/margins": -0.0018857631366699934, + "rewards/rejected": -0.0006628216942772269, + "step": 120 + }, + { + "epoch": 0.031190019193857964, + "grad_norm": 3.378467310139603, + "learning_rate": 1.5587529976019183e-07, + "logits/chosen": -1.5718883275985718, + "logits/rejected": -1.5992323160171509, + "logps/chosen": -236.16085815429688, + "logps/rejected": -323.8802795410156, + "loss": 0.6924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0005537395481951535, + "rewards/margins": 0.0013779096771031618, + "rewards/rejected": -0.0019316490506753325, + "step": 130 + }, + { + "epoch": 0.03358925143953935, + "grad_norm": 3.5565622632521476, + "learning_rate": 1.6786570743405277e-07, + "logits/chosen": -1.6049950122833252, + "logits/rejected": -1.670636773109436, + "logps/chosen": -302.82330322265625, + "logps/rejected": -293.5691223144531, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013237579260021448, + "rewards/margins": 0.0033771514426916838, + "rewards/rejected": -0.004700910300016403, + "step": 140 + }, + { + "epoch": 0.03598848368522073, + "grad_norm": 3.7241887452710385, + "learning_rate": 1.7985611510791365e-07, + "logits/chosen": -1.6608047485351562, + "logits/rejected": -1.6172984838485718, + "logps/chosen": -247.96939086914062, + "logps/rejected": -241.17111206054688, + "loss": 0.6922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.001827596453949809, + "rewards/margins": 0.0031149538699537516, + "rewards/rejected": -0.004942550323903561, + "step": 150 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 3.7650474405725984, + "learning_rate": 1.9184652278177456e-07, + "logits/chosen": -1.623676061630249, + "logits/rejected": -1.58232843875885, + "logps/chosen": -320.1465759277344, + "logps/rejected": -246.5814971923828, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0039003852289170027, + "rewards/margins": 8.531531420885585e-06, + "rewards/rejected": -0.003908916376531124, + "step": 160 + }, + { + "epoch": 0.040786948176583494, + "grad_norm": 3.3742190114845125, + "learning_rate": 2.038369304556355e-07, + "logits/chosen": -1.6414167881011963, + "logits/rejected": -1.6517263650894165, + "logps/chosen": -365.78070068359375, + "logps/rejected": -355.07684326171875, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.003375888569280505, + "rewards/margins": 0.003419106360524893, + "rewards/rejected": -0.006794995162636042, + "step": 170 + }, + { + "epoch": 0.04318618042226487, + "grad_norm": 3.893923672357319, + "learning_rate": 2.1582733812949638e-07, + "logits/chosen": -1.6334331035614014, + "logits/rejected": -1.6182676553726196, + "logps/chosen": -249.95315551757812, + "logps/rejected": -243.65365600585938, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003182569518685341, + "rewards/margins": 0.005136819090694189, + "rewards/rejected": -0.008319388143718243, + "step": 180 + }, + { + "epoch": 0.04558541266794626, + "grad_norm": 4.328005991935876, + "learning_rate": 2.278177458033573e-07, + "logits/chosen": -1.59438157081604, + "logits/rejected": -1.598433256149292, + "logps/chosen": -328.0663757324219, + "logps/rejected": -265.6521911621094, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.005676161497831345, + "rewards/margins": 0.00224656006321311, + "rewards/rejected": -0.007922721095383167, + "step": 190 + }, + { + "epoch": 0.04798464491362764, + "grad_norm": 3.4669166404466267, + "learning_rate": 2.398081534772182e-07, + "logits/chosen": -1.715904951095581, + "logits/rejected": -1.6798690557479858, + "logps/chosen": -330.5942687988281, + "logps/rejected": -312.1226806640625, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007675034459680319, + "rewards/margins": 0.004218742251396179, + "rewards/rejected": -0.011893777176737785, + "step": 200 + }, + { + "epoch": 0.05038387715930902, + "grad_norm": 3.550601001859104, + "learning_rate": 2.517985611510791e-07, + "logits/chosen": -1.6995878219604492, + "logits/rejected": -1.7092031240463257, + "logps/chosen": -247.4270477294922, + "logps/rejected": -271.40960693359375, + "loss": 0.6902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006873033009469509, + "rewards/margins": 0.006390347145497799, + "rewards/rejected": -0.013263382017612457, + "step": 210 + }, + { + "epoch": 0.052783109404990404, + "grad_norm": 3.5506079722939763, + "learning_rate": 2.637889688249401e-07, + "logits/chosen": -1.6824872493743896, + "logits/rejected": -1.692633032798767, + "logps/chosen": -331.85870361328125, + "logps/rejected": -327.26947021484375, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012020569294691086, + "rewards/margins": 0.00552480760961771, + "rewards/rejected": -0.01754537597298622, + "step": 220 + }, + { + "epoch": 0.05518234165067178, + "grad_norm": 3.7749557114917605, + "learning_rate": 2.7577937649880093e-07, + "logits/chosen": -1.6799322366714478, + "logits/rejected": -1.7364540100097656, + "logps/chosen": -252.82473754882812, + "logps/rejected": -288.5075378417969, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011706655845046043, + "rewards/margins": 0.01080317609012127, + "rewards/rejected": -0.022509830072522163, + "step": 230 + }, + { + "epoch": 0.05758157389635317, + "grad_norm": 4.1857193206793015, + "learning_rate": 2.8776978417266184e-07, + "logits/chosen": -1.6128301620483398, + "logits/rejected": -1.5980497598648071, + "logps/chosen": -320.16375732421875, + "logps/rejected": -270.80133056640625, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010916762053966522, + "rewards/margins": 0.015229749493300915, + "rewards/rejected": -0.026146510615944862, + "step": 240 + }, + { + "epoch": 0.05998080614203455, + "grad_norm": 3.8497859603369804, + "learning_rate": 2.997601918465228e-07, + "logits/chosen": -1.6171119213104248, + "logits/rejected": -1.5841248035430908, + "logps/chosen": -254.93936157226562, + "logps/rejected": -243.93179321289062, + "loss": 0.6871, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01718009077012539, + "rewards/margins": 0.009010560810565948, + "rewards/rejected": -0.026190653443336487, + "step": 250 + }, + { + "epoch": 0.06238003838771593, + "grad_norm": 3.705368784982066, + "learning_rate": 3.1175059952038366e-07, + "logits/chosen": -1.6593215465545654, + "logits/rejected": -1.6591377258300781, + "logps/chosen": -282.84100341796875, + "logps/rejected": -276.47125244140625, + "loss": 0.6854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01674341782927513, + "rewards/margins": 0.015696872025728226, + "rewards/rejected": -0.03244028985500336, + "step": 260 + }, + { + "epoch": 0.0647792706333973, + "grad_norm": 3.601297170964158, + "learning_rate": 3.2374100719424457e-07, + "logits/chosen": -1.536478042602539, + "logits/rejected": -1.5124893188476562, + "logps/chosen": -303.76104736328125, + "logps/rejected": -241.261474609375, + "loss": 0.684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.017451269552111626, + "rewards/margins": 0.009415589272975922, + "rewards/rejected": -0.026866856962442398, + "step": 270 + }, + { + "epoch": 0.0671785028790787, + "grad_norm": 3.9036742193164975, + "learning_rate": 3.3573141486810554e-07, + "logits/chosen": -1.5547986030578613, + "logits/rejected": -1.5795847177505493, + "logps/chosen": -316.18121337890625, + "logps/rejected": -299.7501220703125, + "loss": 0.6794, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02595636248588562, + "rewards/margins": 0.026472270488739014, + "rewards/rejected": -0.052428632974624634, + "step": 280 + }, + { + "epoch": 0.06957773512476008, + "grad_norm": 3.3624197530231363, + "learning_rate": 3.477218225419664e-07, + "logits/chosen": -1.6303586959838867, + "logits/rejected": -1.6242969036102295, + "logps/chosen": -305.33953857421875, + "logps/rejected": -274.9311218261719, + "loss": 0.6799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027737725526094437, + "rewards/margins": 0.02691362239420414, + "rewards/rejected": -0.05465134233236313, + "step": 290 + }, + { + "epoch": 0.07197696737044146, + "grad_norm": 4.058047991319342, + "learning_rate": 3.597122302158273e-07, + "logits/chosen": -1.6332571506500244, + "logits/rejected": -1.6581714153289795, + "logps/chosen": -276.0576171875, + "logps/rejected": -290.2926940917969, + "loss": 0.6784, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04013335332274437, + "rewards/margins": 0.030186835676431656, + "rewards/rejected": -0.07032018899917603, + "step": 300 + }, + { + "epoch": 0.07437619961612284, + "grad_norm": 3.4101110193570454, + "learning_rate": 3.7170263788968827e-07, + "logits/chosen": -1.6190553903579712, + "logits/rejected": -1.6291663646697998, + "logps/chosen": -292.63751220703125, + "logps/rejected": -243.106201171875, + "loss": 0.6808, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.040502794086933136, + "rewards/margins": 0.03982759267091751, + "rewards/rejected": -0.08033039420843124, + "step": 310 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 3.351913067776505, + "learning_rate": 3.836930455635491e-07, + "logits/chosen": -1.6056486368179321, + "logits/rejected": -1.5875630378723145, + "logps/chosen": -295.599853515625, + "logps/rejected": -259.6369934082031, + "loss": 0.6774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0446627140045166, + "rewards/margins": 0.02809850312769413, + "rewards/rejected": -0.07276121526956558, + "step": 320 + }, + { + "epoch": 0.07917466410748561, + "grad_norm": 3.4017416105630573, + "learning_rate": 3.9568345323741003e-07, + "logits/chosen": -1.5247066020965576, + "logits/rejected": -1.5482518672943115, + "logps/chosen": -268.7400207519531, + "logps/rejected": -309.99786376953125, + "loss": 0.6729, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0597003698348999, + "rewards/margins": 0.05082261562347412, + "rewards/rejected": -0.11052300035953522, + "step": 330 + }, + { + "epoch": 0.08157389635316699, + "grad_norm": 3.3527499070225244, + "learning_rate": 4.07673860911271e-07, + "logits/chosen": -1.6196010112762451, + "logits/rejected": -1.595496416091919, + "logps/chosen": -257.72515869140625, + "logps/rejected": -282.81781005859375, + "loss": 0.6716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05155477672815323, + "rewards/margins": 0.06573888659477234, + "rewards/rejected": -0.11729365587234497, + "step": 340 + }, + { + "epoch": 0.08397312859884837, + "grad_norm": 3.8502979336068717, + "learning_rate": 4.1966426858513185e-07, + "logits/chosen": -1.6366294622421265, + "logits/rejected": -1.6474313735961914, + "logps/chosen": -299.79010009765625, + "logps/rejected": -299.626220703125, + "loss": 0.6741, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09185166656970978, + "rewards/margins": 0.03506358712911606, + "rewards/rejected": -0.12691523134708405, + "step": 350 + }, + { + "epoch": 0.08637236084452975, + "grad_norm": 4.087127125149584, + "learning_rate": 4.3165467625899276e-07, + "logits/chosen": -1.6223100423812866, + "logits/rejected": -1.5971088409423828, + "logps/chosen": -292.7254333496094, + "logps/rejected": -240.09213256835938, + "loss": 0.6693, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10348290205001831, + "rewards/margins": 0.053151585161685944, + "rewards/rejected": -0.15663447976112366, + "step": 360 + }, + { + "epoch": 0.08877159309021113, + "grad_norm": 3.962698259273254, + "learning_rate": 4.436450839328537e-07, + "logits/chosen": -1.5964925289154053, + "logits/rejected": -1.617010474205017, + "logps/chosen": -265.0798034667969, + "logps/rejected": -278.52117919921875, + "loss": 0.6651, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1335074007511139, + "rewards/margins": 0.0590825080871582, + "rewards/rejected": -0.1925898939371109, + "step": 370 + }, + { + "epoch": 0.09117082533589252, + "grad_norm": 3.5033230698831668, + "learning_rate": 4.556354916067146e-07, + "logits/chosen": -1.5588051080703735, + "logits/rejected": -1.5864968299865723, + "logps/chosen": -267.39471435546875, + "logps/rejected": -266.3304443359375, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09075454622507095, + "rewards/margins": 0.07683941721916199, + "rewards/rejected": -0.16759395599365234, + "step": 380 + }, + { + "epoch": 0.0935700575815739, + "grad_norm": 3.9053311567068993, + "learning_rate": 4.676258992805755e-07, + "logits/chosen": -1.5949430465698242, + "logits/rejected": -1.5674622058868408, + "logps/chosen": -301.2566833496094, + "logps/rejected": -272.92877197265625, + "loss": 0.6529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16490274667739868, + "rewards/margins": 0.055955369025468826, + "rewards/rejected": -0.2208581268787384, + "step": 390 + }, + { + "epoch": 0.09596928982725528, + "grad_norm": 3.9140466546998827, + "learning_rate": 4.796163069544364e-07, + "logits/chosen": -1.6508136987686157, + "logits/rejected": -1.6695753335952759, + "logps/chosen": -300.67315673828125, + "logps/rejected": -291.95501708984375, + "loss": 0.6473, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21914562582969666, + "rewards/margins": 0.147565096616745, + "rewards/rejected": -0.36671072244644165, + "step": 400 + }, + { + "epoch": 0.09836852207293666, + "grad_norm": 4.42678932238528, + "learning_rate": 4.916067146282974e-07, + "logits/chosen": -1.5839338302612305, + "logits/rejected": -1.5947954654693604, + "logps/chosen": -296.02362060546875, + "logps/rejected": -332.89208984375, + "loss": 0.6292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3386813700199127, + "rewards/margins": 0.144867405295372, + "rewards/rejected": -0.4835488200187683, + "step": 410 + }, + { + "epoch": 0.10076775431861804, + "grad_norm": 5.2789138321700495, + "learning_rate": 4.999992108529978e-07, + "logits/chosen": -1.5680617094039917, + "logits/rejected": -1.5661697387695312, + "logps/chosen": -414.2063903808594, + "logps/rejected": -393.57598876953125, + "loss": 0.6447, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5759106874465942, + "rewards/margins": 0.245382621884346, + "rewards/rejected": -0.8212932348251343, + "step": 420 + }, + { + "epoch": 0.10316698656429943, + "grad_norm": 4.983795366931148, + "learning_rate": 4.999851817115532e-07, + "logits/chosen": -1.634783387184143, + "logits/rejected": -1.6127817630767822, + "logps/chosen": -324.0008850097656, + "logps/rejected": -326.49810791015625, + "loss": 0.6439, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.47741881012916565, + "rewards/margins": 0.19091394543647766, + "rewards/rejected": -0.6683326959609985, + "step": 430 + }, + { + "epoch": 0.10556621880998081, + "grad_norm": 3.631330415762284, + "learning_rate": 4.999536171027889e-07, + "logits/chosen": -1.6043802499771118, + "logits/rejected": -1.597813367843628, + "logps/chosen": -332.45391845703125, + "logps/rejected": -329.4422912597656, + "loss": 0.6432, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5105425119400024, + "rewards/margins": 0.08223161846399307, + "rewards/rejected": -0.5927742123603821, + "step": 440 + }, + { + "epoch": 0.10796545105566219, + "grad_norm": 4.090138150318208, + "learning_rate": 4.999045192408369e-07, + "logits/chosen": -1.506775140762329, + "logits/rejected": -1.4702590703964233, + "logps/chosen": -296.05792236328125, + "logps/rejected": -289.3249816894531, + "loss": 0.6316, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3827100694179535, + "rewards/margins": 0.15496531128883362, + "rewards/rejected": -0.5376753807067871, + "step": 450 + }, + { + "epoch": 0.11036468330134357, + "grad_norm": 4.139361867296913, + "learning_rate": 4.998378915697171e-07, + "logits/chosen": -1.569830298423767, + "logits/rejected": -1.5698078870773315, + "logps/chosen": -325.01458740234375, + "logps/rejected": -337.7842712402344, + "loss": 0.6148, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2572017014026642, + "rewards/margins": 0.24531395733356476, + "rewards/rejected": -0.5025156140327454, + "step": 460 + }, + { + "epoch": 0.11276391554702495, + "grad_norm": 4.5407974858594455, + "learning_rate": 4.997537387630958e-07, + "logits/chosen": -1.53157639503479, + "logits/rejected": -1.498214840888977, + "logps/chosen": -273.22802734375, + "logps/rejected": -292.3331298828125, + "loss": 0.6057, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3938341736793518, + "rewards/margins": 0.17262205481529236, + "rewards/rejected": -0.5664561986923218, + "step": 470 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 4.783089704109638, + "learning_rate": 4.996520667239582e-07, + "logits/chosen": -1.5579659938812256, + "logits/rejected": -1.5760862827301025, + "logps/chosen": -306.2027282714844, + "logps/rejected": -380.0113525390625, + "loss": 0.6096, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.42028918862342834, + "rewards/margins": 0.2578314542770386, + "rewards/rejected": -0.6781206727027893, + "step": 480 + }, + { + "epoch": 0.11756238003838772, + "grad_norm": 4.8302096742139105, + "learning_rate": 4.995328825841939e-07, + "logits/chosen": -1.5267994403839111, + "logits/rejected": -1.5366883277893066, + "logps/chosen": -285.08990478515625, + "logps/rejected": -315.9127197265625, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34838151931762695, + "rewards/margins": 0.4001844525337219, + "rewards/rejected": -0.7485659122467041, + "step": 490 + }, + { + "epoch": 0.1199616122840691, + "grad_norm": 4.655267788061174, + "learning_rate": 4.993961947040967e-07, + "logits/chosen": -1.5378767251968384, + "logits/rejected": -1.5352329015731812, + "logps/chosen": -361.957275390625, + "logps/rejected": -334.80194091796875, + "loss": 0.609, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6517360806465149, + "rewards/margins": 0.10645874589681625, + "rewards/rejected": -0.7581947445869446, + "step": 500 + }, + { + "epoch": 0.12236084452975048, + "grad_norm": 4.03890292179432, + "learning_rate": 4.992420126717784e-07, + "logits/chosen": -1.5456221103668213, + "logits/rejected": -1.5595461130142212, + "logps/chosen": -313.9156188964844, + "logps/rejected": -357.70074462890625, + "loss": 0.607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3863525092601776, + "rewards/margins": 0.5194737911224365, + "rewards/rejected": -0.9058262705802917, + "step": 510 + }, + { + "epoch": 0.12476007677543186, + "grad_norm": 4.515523592913458, + "learning_rate": 4.990703473024958e-07, + "logits/chosen": -1.5077584981918335, + "logits/rejected": -1.508772611618042, + "logps/chosen": -363.2955322265625, + "logps/rejected": -380.5810852050781, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5841118097305298, + "rewards/margins": 0.2735467553138733, + "rewards/rejected": -0.8576586842536926, + "step": 520 + }, + { + "epoch": 0.12715930902111325, + "grad_norm": 4.68328497340501, + "learning_rate": 4.98881210637893e-07, + "logits/chosen": -1.4954583644866943, + "logits/rejected": -1.5232939720153809, + "logps/chosen": -274.912353515625, + "logps/rejected": -337.65228271484375, + "loss": 0.6211, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3999376893043518, + "rewards/margins": 0.3088452219963074, + "rewards/rejected": -0.7087828516960144, + "step": 530 + }, + { + "epoch": 0.1295585412667946, + "grad_norm": 5.719715088463468, + "learning_rate": 4.986746159451553e-07, + "logits/chosen": -1.5417931079864502, + "logits/rejected": -1.5638864040374756, + "logps/chosen": -317.2779846191406, + "logps/rejected": -345.56207275390625, + "loss": 0.6037, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46498027443885803, + "rewards/margins": 0.4305883049964905, + "rewards/rejected": -0.8955684900283813, + "step": 540 + }, + { + "epoch": 0.131957773512476, + "grad_norm": 3.924618462296512, + "learning_rate": 4.984505777160795e-07, + "logits/chosen": -1.4803025722503662, + "logits/rejected": -1.5034992694854736, + "logps/chosen": -383.60009765625, + "logps/rejected": -410.09637451171875, + "loss": 0.6196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.622631847858429, + "rewards/margins": 0.29748308658599854, + "rewards/rejected": -0.9201149940490723, + "step": 550 + }, + { + "epoch": 0.1343570057581574, + "grad_norm": 5.000411140737786, + "learning_rate": 4.982091116660574e-07, + "logits/chosen": -1.5640392303466797, + "logits/rejected": -1.5526823997497559, + "logps/chosen": -262.2401123046875, + "logps/rejected": -251.67489624023438, + "loss": 0.6329, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4863004684448242, + "rewards/margins": 0.13453409075737, + "rewards/rejected": -0.6208345890045166, + "step": 560 + }, + { + "epoch": 0.13675623800383876, + "grad_norm": 5.426495125083168, + "learning_rate": 4.979502347329732e-07, + "logits/chosen": -1.545018196105957, + "logits/rejected": -1.5341848134994507, + "logps/chosen": -359.99212646484375, + "logps/rejected": -430.8915100097656, + "loss": 0.61, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6372441053390503, + "rewards/margins": 0.4039608836174011, + "rewards/rejected": -1.0412050485610962, + "step": 570 + }, + { + "epoch": 0.13915547024952016, + "grad_norm": 4.977101807860706, + "learning_rate": 4.976739650760151e-07, + "logits/chosen": -1.6066020727157593, + "logits/rejected": -1.5763802528381348, + "logps/chosen": -331.3481750488281, + "logps/rejected": -358.0259704589844, + "loss": 0.5891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5530639886856079, + "rewards/margins": 0.3901999890804291, + "rewards/rejected": -0.9432638883590698, + "step": 580 + }, + { + "epoch": 0.14155470249520152, + "grad_norm": 6.414020548691111, + "learning_rate": 4.97380322074402e-07, + "logits/chosen": -1.5697975158691406, + "logits/rejected": -1.545709252357483, + "logps/chosen": -320.78094482421875, + "logps/rejected": -346.2607116699219, + "loss": 0.6312, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8397436141967773, + "rewards/margins": 0.30745047330856323, + "rewards/rejected": -1.1471941471099854, + "step": 590 + }, + { + "epoch": 0.14395393474088292, + "grad_norm": 5.342440443936391, + "learning_rate": 4.970693263260237e-07, + "logits/chosen": -1.4645593166351318, + "logits/rejected": -1.4802438020706177, + "logps/chosen": -373.068359375, + "logps/rejected": -366.2574157714844, + "loss": 0.6067, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.551115870475769, + "rewards/margins": 0.47067826986312866, + "rewards/rejected": -1.021794080734253, + "step": 600 + }, + { + "epoch": 0.1463531669865643, + "grad_norm": 5.144967895606553, + "learning_rate": 4.967409996459966e-07, + "logits/chosen": -1.6260372400283813, + "logits/rejected": -1.6305965185165405, + "logps/chosen": -339.40875244140625, + "logps/rejected": -351.1748046875, + "loss": 0.593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5743804574012756, + "rewards/margins": 0.37975504994392395, + "rewards/rejected": -0.954135537147522, + "step": 610 + }, + { + "epoch": 0.14875239923224567, + "grad_norm": 5.888969372392584, + "learning_rate": 4.963953650651326e-07, + "logits/chosen": -1.5199607610702515, + "logits/rejected": -1.4783246517181396, + "logps/chosen": -454.8645935058594, + "logps/rejected": -392.8035888671875, + "loss": 0.5956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8411262631416321, + "rewards/margins": 0.4142216742038727, + "rewards/rejected": -1.2553479671478271, + "step": 620 + }, + { + "epoch": 0.15115163147792707, + "grad_norm": 5.241290267394913, + "learning_rate": 4.960324468283248e-07, + "logits/chosen": -1.5408384799957275, + "logits/rejected": -1.5691133737564087, + "logps/chosen": -291.14508056640625, + "logps/rejected": -330.90277099609375, + "loss": 0.5691, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5521666407585144, + "rewards/margins": 0.4039185047149658, + "rewards/rejected": -0.956085205078125, + "step": 630 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 6.034628988888004, + "learning_rate": 4.956522703928451e-07, + "logits/chosen": -1.5112297534942627, + "logits/rejected": -1.5096272230148315, + "logps/chosen": -317.29400634765625, + "logps/rejected": -367.8897705078125, + "loss": 0.5808, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6753019690513611, + "rewards/margins": 0.42325735092163086, + "rewards/rejected": -1.0985593795776367, + "step": 640 + }, + { + "epoch": 0.15595009596928983, + "grad_norm": 6.06042461193669, + "learning_rate": 4.952548624265606e-07, + "logits/chosen": -1.462241530418396, + "logits/rejected": -1.4188714027404785, + "logps/chosen": -373.4043884277344, + "logps/rejected": -387.77923583984375, + "loss": 0.6217, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8461149334907532, + "rewards/margins": 0.27142664790153503, + "rewards/rejected": -1.1175415515899658, + "step": 650 + }, + { + "epoch": 0.15834932821497122, + "grad_norm": 4.638041821009946, + "learning_rate": 4.948402508060607e-07, + "logits/chosen": -1.532447338104248, + "logits/rejected": -1.5247899293899536, + "logps/chosen": -303.63494873046875, + "logps/rejected": -343.4326171875, + "loss": 0.6065, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5687133073806763, + "rewards/margins": 0.5383566617965698, + "rewards/rejected": -1.107069969177246, + "step": 660 + }, + { + "epoch": 0.16074856046065258, + "grad_norm": 4.7563938394985525, + "learning_rate": 4.944084646147038e-07, + "logits/chosen": -1.5980417728424072, + "logits/rejected": -1.5698829889297485, + "logps/chosen": -385.9518737792969, + "logps/rejected": -372.51214599609375, + "loss": 0.6372, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5248526930809021, + "rewards/margins": 0.12505970895290375, + "rewards/rejected": -0.649912416934967, + "step": 670 + }, + { + "epoch": 0.16314779270633398, + "grad_norm": 4.759002323226384, + "learning_rate": 4.939595341405754e-07, + "logits/chosen": -1.5244510173797607, + "logits/rejected": -1.5224545001983643, + "logps/chosen": -313.8047180175781, + "logps/rejected": -315.26263427734375, + "loss": 0.6141, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4212771952152252, + "rewards/margins": 0.25005480647087097, + "rewards/rejected": -0.671332061290741, + "step": 680 + }, + { + "epoch": 0.16554702495201534, + "grad_norm": 5.223029601556046, + "learning_rate": 4.93493490874365e-07, + "logits/chosen": -1.479612946510315, + "logits/rejected": -1.4728684425354004, + "logps/chosen": -311.44775390625, + "logps/rejected": -350.54779052734375, + "loss": 0.5745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.564383864402771, + "rewards/margins": 0.3139537572860718, + "rewards/rejected": -0.8783376812934875, + "step": 690 + }, + { + "epoch": 0.16794625719769674, + "grad_norm": 5.40070237050068, + "learning_rate": 4.93010367507156e-07, + "logits/chosen": -1.4617944955825806, + "logits/rejected": -1.4599894285202026, + "logps/chosen": -294.10797119140625, + "logps/rejected": -335.54705810546875, + "loss": 0.5637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6340052485466003, + "rewards/margins": 0.6616395711898804, + "rewards/rejected": -1.2956448793411255, + "step": 700 + }, + { + "epoch": 0.17034548944337813, + "grad_norm": 6.012195841421261, + "learning_rate": 4.925101979281332e-07, + "logits/chosen": -1.4645987749099731, + "logits/rejected": -1.453375220298767, + "logps/chosen": -395.05718994140625, + "logps/rejected": -391.1450500488281, + "loss": 0.6028, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7132852077484131, + "rewards/margins": 0.5267443060874939, + "rewards/rejected": -1.2400295734405518, + "step": 710 + }, + { + "epoch": 0.1727447216890595, + "grad_norm": 5.2059132091980596, + "learning_rate": 4.919930172222054e-07, + "logits/chosen": -1.5603594779968262, + "logits/rejected": -1.5616681575775146, + "logps/chosen": -337.87713623046875, + "logps/rejected": -388.90130615234375, + "loss": 0.5578, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7377550005912781, + "rewards/margins": 0.5399189591407776, + "rewards/rejected": -1.2776739597320557, + "step": 720 + }, + { + "epoch": 0.1751439539347409, + "grad_norm": 11.822921470671519, + "learning_rate": 4.914588616675445e-07, + "logits/chosen": -1.5943918228149414, + "logits/rejected": -1.606041669845581, + "logps/chosen": -346.92205810546875, + "logps/rejected": -378.21240234375, + "loss": 0.6279, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8934835195541382, + "rewards/margins": 0.5335273146629333, + "rewards/rejected": -1.4270107746124268, + "step": 730 + }, + { + "epoch": 0.17754318618042225, + "grad_norm": 5.211631403180053, + "learning_rate": 4.909077687330404e-07, + "logits/chosen": -1.5509735345840454, + "logits/rejected": -1.4975712299346924, + "logps/chosen": -349.6174011230469, + "logps/rejected": -349.61492919921875, + "loss": 0.5735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6521992087364197, + "rewards/margins": 0.30101969838142395, + "rewards/rejected": -0.9532188177108765, + "step": 740 + }, + { + "epoch": 0.17994241842610365, + "grad_norm": 5.432660278163866, + "learning_rate": 4.903397770756729e-07, + "logits/chosen": -1.5139765739440918, + "logits/rejected": -1.5181429386138916, + "logps/chosen": -341.566162109375, + "logps/rejected": -375.05828857421875, + "loss": 0.5879, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6539328098297119, + "rewards/margins": 0.38768962025642395, + "rewards/rejected": -1.0416224002838135, + "step": 750 + }, + { + "epoch": 0.18234165067178504, + "grad_norm": 5.539798409070412, + "learning_rate": 4.897549265378004e-07, + "logits/chosen": -1.5405504703521729, + "logits/rejected": -1.4953763484954834, + "logps/chosen": -427.6753845214844, + "logps/rejected": -441.68365478515625, + "loss": 0.5945, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8601313829421997, + "rewards/margins": 0.2185048609972, + "rewards/rejected": -1.0786362886428833, + "step": 760 + }, + { + "epoch": 0.1847408829174664, + "grad_norm": 6.131059558759629, + "learning_rate": 4.891532581443643e-07, + "logits/chosen": -1.4995934963226318, + "logits/rejected": -1.4943822622299194, + "logps/chosen": -385.32073974609375, + "logps/rejected": -454.878173828125, + "loss": 0.5706, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6404799222946167, + "rewards/margins": 0.7536662817001343, + "rewards/rejected": -1.394146203994751, + "step": 770 + }, + { + "epoch": 0.1871401151631478, + "grad_norm": 7.531135867694059, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -1.5458956956863403, + "logits/rejected": -1.5822408199310303, + "logps/chosen": -344.25244140625, + "logps/rejected": -396.47247314453125, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8405560255050659, + "rewards/margins": 0.45578351616859436, + "rewards/rejected": -1.296339511871338, + "step": 780 + }, + { + "epoch": 0.18953934740882916, + "grad_norm": 6.102690327063579, + "learning_rate": 4.878996377861367e-07, + "logits/chosen": -1.5642445087432861, + "logits/rejected": -1.5695910453796387, + "logps/chosen": -318.0394592285156, + "logps/rejected": -365.24224853515625, + "loss": 0.5556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8526245951652527, + "rewards/margins": 0.4370867609977722, + "rewards/rejected": -1.2897112369537354, + "step": 790 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 6.832280804361678, + "learning_rate": 4.872477737578327e-07, + "logits/chosen": -1.4795372486114502, + "logits/rejected": -1.4752862453460693, + "logps/chosen": -411.67425537109375, + "logps/rejected": -499.11505126953125, + "loss": 0.5499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0738346576690674, + "rewards/margins": 1.004138708114624, + "rewards/rejected": -2.0779738426208496, + "step": 800 + }, + { + "epoch": 0.19433781190019195, + "grad_norm": 8.284630220336107, + "learning_rate": 4.865792677407718e-07, + "logits/chosen": -1.593660593032837, + "logits/rejected": -1.5664924383163452, + "logps/chosen": -356.3196105957031, + "logps/rejected": -367.1153564453125, + "loss": 0.5915, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9886842966079712, + "rewards/margins": 0.3350405991077423, + "rewards/rejected": -1.3237249851226807, + "step": 810 + }, + { + "epoch": 0.1967370441458733, + "grad_norm": 5.96781876097138, + "learning_rate": 4.858941666279955e-07, + "logits/chosen": -1.6477540731430054, + "logits/rejected": -1.6294025182724, + "logps/chosen": -371.42059326171875, + "logps/rejected": -375.2044982910156, + "loss": 0.5992, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.831330418586731, + "rewards/margins": 0.20967264473438263, + "rewards/rejected": -1.0410031080245972, + "step": 820 + }, + { + "epoch": 0.1991362763915547, + "grad_norm": 5.266959588580237, + "learning_rate": 4.851925184766247e-07, + "logits/chosen": -1.5675857067108154, + "logits/rejected": -1.504521131515503, + "logps/chosen": -352.08807373046875, + "logps/rejected": -370.45294189453125, + "loss": 0.5815, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.783744215965271, + "rewards/margins": 0.4445928931236267, + "rewards/rejected": -1.2283371686935425, + "step": 830 + }, + { + "epoch": 0.20153550863723607, + "grad_norm": 7.5633333882780365, + "learning_rate": 4.844743725044897e-07, + "logits/chosen": -1.6219160556793213, + "logits/rejected": -1.5205755233764648, + "logps/chosen": -343.59161376953125, + "logps/rejected": -349.74249267578125, + "loss": 0.5914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7424141764640808, + "rewards/margins": 0.3212757408618927, + "rewards/rejected": -1.0636898279190063, + "step": 840 + }, + { + "epoch": 0.20393474088291746, + "grad_norm": 5.7426817744180045, + "learning_rate": 4.837397790866774e-07, + "logits/chosen": -1.511488914489746, + "logits/rejected": -1.5107842683792114, + "logps/chosen": -362.03057861328125, + "logps/rejected": -407.59161376953125, + "loss": 0.584, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5177355408668518, + "rewards/margins": 0.7411140203475952, + "rewards/rejected": -1.2588495016098022, + "step": 850 + }, + { + "epoch": 0.20633397312859886, + "grad_norm": 6.495429597968071, + "learning_rate": 4.829887897519974e-07, + "logits/chosen": -1.4964102506637573, + "logits/rejected": -1.5037615299224854, + "logps/chosen": -321.86236572265625, + "logps/rejected": -395.5457763671875, + "loss": 0.5886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7702068090438843, + "rewards/margins": 0.5501433610916138, + "rewards/rejected": -1.3203500509262085, + "step": 860 + }, + { + "epoch": 0.20873320537428022, + "grad_norm": 5.5808817971322116, + "learning_rate": 4.82221457179368e-07, + "logits/chosen": -1.488948941230774, + "logits/rejected": -1.5117526054382324, + "logps/chosen": -384.2512512207031, + "logps/rejected": -485.3260803222656, + "loss": 0.5547, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9392452239990234, + "rewards/margins": 1.2166972160339355, + "rewards/rejected": -2.155942440032959, + "step": 870 + }, + { + "epoch": 0.21113243761996162, + "grad_norm": 7.0272013657172625, + "learning_rate": 4.814378351941206e-07, + "logits/chosen": -1.520959734916687, + "logits/rejected": -1.5359563827514648, + "logps/chosen": -357.6517333984375, + "logps/rejected": -377.56085205078125, + "loss": 0.5854, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8206228017807007, + "rewards/margins": 0.3496705889701843, + "rewards/rejected": -1.1702934503555298, + "step": 880 + }, + { + "epoch": 0.21353166986564298, + "grad_norm": 6.3594913416138885, + "learning_rate": 4.806379787642241e-07, + "logits/chosen": -1.466625452041626, + "logits/rejected": -1.4915995597839355, + "logps/chosen": -374.0470886230469, + "logps/rejected": -476.73828125, + "loss": 0.6061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0663881301879883, + "rewards/margins": 1.0952132940292358, + "rewards/rejected": -2.1616015434265137, + "step": 890 + }, + { + "epoch": 0.21593090211132437, + "grad_norm": 6.116338715106648, + "learning_rate": 4.798219439964293e-07, + "logits/chosen": -1.5042366981506348, + "logits/rejected": -1.5534061193466187, + "logps/chosen": -364.03472900390625, + "logps/rejected": -392.703857421875, + "loss": 0.5509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9239017367362976, + "rewards/margins": 0.20848917961120605, + "rewards/rejected": -1.1323908567428589, + "step": 900 + }, + { + "epoch": 0.21833013435700577, + "grad_norm": 34.61975290546189, + "learning_rate": 4.78989788132333e-07, + "logits/chosen": -1.5566984415054321, + "logits/rejected": -1.5171282291412354, + "logps/chosen": -326.6298828125, + "logps/rejected": -439.526123046875, + "loss": 0.5439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8856453895568848, + "rewards/margins": 1.1186928749084473, + "rewards/rejected": -2.004338264465332, + "step": 910 + }, + { + "epoch": 0.22072936660268713, + "grad_norm": 5.399099459014794, + "learning_rate": 4.781415695443631e-07, + "logits/chosen": -1.4662867784500122, + "logits/rejected": -1.3914134502410889, + "logps/chosen": -505.794921875, + "logps/rejected": -595.918701171875, + "loss": 0.5759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.901526689529419, + "rewards/margins": 0.8914464116096497, + "rewards/rejected": -2.792973041534424, + "step": 920 + }, + { + "epoch": 0.22312859884836853, + "grad_norm": 5.819110199820003, + "learning_rate": 4.772773477316836e-07, + "logits/chosen": -1.547885537147522, + "logits/rejected": -1.5537118911743164, + "logps/chosen": -381.8018493652344, + "logps/rejected": -429.70758056640625, + "loss": 0.5617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.960909366607666, + "rewards/margins": 0.4732838273048401, + "rewards/rejected": -1.4341931343078613, + "step": 930 + }, + { + "epoch": 0.2255278310940499, + "grad_norm": 7.075329262381567, + "learning_rate": 4.7639718331602117e-07, + "logits/chosen": -1.4385401010513306, + "logits/rejected": -1.4047901630401611, + "logps/chosen": -439.00054931640625, + "logps/rejected": -563.8079833984375, + "loss": 0.5454, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4129924774169922, + "rewards/margins": 1.386842966079712, + "rewards/rejected": -2.799834966659546, + "step": 940 + }, + { + "epoch": 0.22792706333973128, + "grad_norm": 117.88806086127256, + "learning_rate": 4.7550113803741275e-07, + "logits/chosen": -1.4240310192108154, + "logits/rejected": -1.3947417736053467, + "logps/chosen": -479.65008544921875, + "logps/rejected": -550.0196533203125, + "loss": 0.6165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7085025310516357, + "rewards/margins": 1.4387762546539307, + "rewards/rejected": -3.1472787857055664, + "step": 950 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 7.753373606542773, + "learning_rate": 4.7458927474987454e-07, + "logits/chosen": -1.4834281206130981, + "logits/rejected": -1.4571282863616943, + "logps/chosen": -435.71551513671875, + "logps/rejected": -423.3853454589844, + "loss": 0.5621, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0432417392730713, + "rewards/margins": 0.4899858832359314, + "rewards/rejected": -1.533227562904358, + "step": 960 + }, + { + "epoch": 0.23272552783109404, + "grad_norm": 35.84391506181939, + "learning_rate": 4.7366165741699347e-07, + "logits/chosen": -1.540824055671692, + "logits/rejected": -1.5162460803985596, + "logps/chosen": -473.71514892578125, + "logps/rejected": -511.4437561035156, + "loss": 0.5601, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2902333736419678, + "rewards/margins": 0.7524299025535583, + "rewards/rejected": -2.042663097381592, + "step": 970 + }, + { + "epoch": 0.23512476007677544, + "grad_norm": 6.711770568150169, + "learning_rate": 4.727183511074401e-07, + "logits/chosen": -1.5611612796783447, + "logits/rejected": -1.5459405183792114, + "logps/chosen": -390.70880126953125, + "logps/rejected": -402.97979736328125, + "loss": 0.5557, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9223060607910156, + "rewards/margins": 0.2812274992465973, + "rewards/rejected": -1.20353364944458, + "step": 980 + }, + { + "epoch": 0.2375239923224568, + "grad_norm": 7.356819549245005, + "learning_rate": 4.717594219904043e-07, + "logits/chosen": -1.5465433597564697, + "logits/rejected": -1.4800186157226562, + "logps/chosen": -394.327880859375, + "logps/rejected": -407.1432189941406, + "loss": 0.5799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.086284875869751, + "rewards/margins": 0.5457164645195007, + "rewards/rejected": -1.632001519203186, + "step": 990 + }, + { + "epoch": 0.2399232245681382, + "grad_norm": 6.180011768389308, + "learning_rate": 4.7078493733095393e-07, + "logits/chosen": -1.4510798454284668, + "logits/rejected": -1.4394136667251587, + "logps/chosen": -414.8174743652344, + "logps/rejected": -487.3868713378906, + "loss": 0.5292, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4004895687103271, + "rewards/margins": 0.7286332845687866, + "rewards/rejected": -2.1291232109069824, + "step": 1000 + }, + { + "epoch": 0.2423224568138196, + "grad_norm": 8.973107903148879, + "learning_rate": 4.6979496548531614e-07, + "logits/chosen": -1.4409250020980835, + "logits/rejected": -1.4732897281646729, + "logps/chosen": -402.0752258300781, + "logps/rejected": -518.8154907226562, + "loss": 0.5541, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2081267833709717, + "rewards/margins": 0.7229002714157104, + "rewards/rejected": -1.9310270547866821, + "step": 1010 + }, + { + "epoch": 0.24472168905950095, + "grad_norm": 6.834796795848274, + "learning_rate": 4.6878957589608293e-07, + "logits/chosen": -1.385780930519104, + "logits/rejected": -1.3392270803451538, + "logps/chosen": -520.9588012695312, + "logps/rejected": -768.1580200195312, + "loss": 0.5945, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.352682590484619, + "rewards/margins": 2.0763027667999268, + "rewards/rejected": -4.428984642028809, + "step": 1020 + }, + { + "epoch": 0.24712092130518235, + "grad_norm": 6.338350339612117, + "learning_rate": 4.6776883908733956e-07, + "logits/chosen": -1.4341570138931274, + "logits/rejected": -1.3487894535064697, + "logps/chosen": -448.89727783203125, + "logps/rejected": -498.39654541015625, + "loss": 0.5344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3556662797927856, + "rewards/margins": 1.197887897491455, + "rewards/rejected": -2.553554058074951, + "step": 1030 + }, + { + "epoch": 0.2495201535508637, + "grad_norm": 9.762031523337113, + "learning_rate": 4.667328266597178e-07, + "logits/chosen": -1.5152060985565186, + "logits/rejected": -1.4758002758026123, + "logps/chosen": -388.54815673828125, + "logps/rejected": -449.3814392089844, + "loss": 0.5325, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1459699869155884, + "rewards/margins": 0.7172293663024902, + "rewards/rejected": -1.863199234008789, + "step": 1040 + }, + { + "epoch": 0.2519193857965451, + "grad_norm": 6.587552980986566, + "learning_rate": 4.6568161128537354e-07, + "logits/chosen": -1.5234332084655762, + "logits/rejected": -1.397935152053833, + "logps/chosen": -427.96697998046875, + "logps/rejected": -441.1510314941406, + "loss": 0.548, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.4824414253234863, + "rewards/margins": 0.7462078332901001, + "rewards/rejected": -2.228649139404297, + "step": 1050 + }, + { + "epoch": 0.2543186180422265, + "grad_norm": 7.250799528397824, + "learning_rate": 4.6461526670288877e-07, + "logits/chosen": -1.513885259628296, + "logits/rejected": -1.4797332286834717, + "logps/chosen": -404.1684265136719, + "logps/rejected": -435.52703857421875, + "loss": 0.5982, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1321500539779663, + "rewards/margins": 0.6284357905387878, + "rewards/rejected": -1.7605857849121094, + "step": 1060 + }, + { + "epoch": 0.2567178502879079, + "grad_norm": 6.968262543987022, + "learning_rate": 4.635338677120994e-07, + "logits/chosen": -1.4521477222442627, + "logits/rejected": -1.448061466217041, + "logps/chosen": -391.4552917480469, + "logps/rejected": -503.6328125, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.092247486114502, + "rewards/margins": 0.9845672845840454, + "rewards/rejected": -2.076814889907837, + "step": 1070 + }, + { + "epoch": 0.2591170825335892, + "grad_norm": 6.982005120781178, + "learning_rate": 4.6243749016884835e-07, + "logits/chosen": -1.3785518407821655, + "logits/rejected": -1.3828446865081787, + "logps/chosen": -446.35888671875, + "logps/rejected": -595.894775390625, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5676217079162598, + "rewards/margins": 1.0973035097122192, + "rewards/rejected": -2.6649250984191895, + "step": 1080 + }, + { + "epoch": 0.2615163147792706, + "grad_norm": 10.40340346607264, + "learning_rate": 4.613262109796645e-07, + "logits/chosen": -1.5188038349151611, + "logits/rejected": -1.5022757053375244, + "logps/chosen": -400.90130615234375, + "logps/rejected": -553.9241943359375, + "loss": 0.5458, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2826495170593262, + "rewards/margins": 1.2281033992767334, + "rewards/rejected": -2.5107529163360596, + "step": 1090 + }, + { + "epoch": 0.263915547024952, + "grad_norm": 6.972298855628279, + "learning_rate": 4.602001080963678e-07, + "logits/chosen": -1.5422290563583374, + "logits/rejected": -1.467437505722046, + "logps/chosen": -410.7484436035156, + "logps/rejected": -512.1973876953125, + "loss": 0.5535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1985065937042236, + "rewards/margins": 1.2838784456253052, + "rewards/rejected": -2.4823849201202393, + "step": 1100 + }, + { + "epoch": 0.2663147792706334, + "grad_norm": 8.549949049117258, + "learning_rate": 4.590592605106017e-07, + "logits/chosen": -1.5908862352371216, + "logits/rejected": -1.5656800270080566, + "logps/chosen": -441.91448974609375, + "logps/rejected": -512.890869140625, + "loss": 0.5669, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3182880878448486, + "rewards/margins": 1.022831678390503, + "rewards/rejected": -2.3411195278167725, + "step": 1110 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 7.404227636225907, + "learning_rate": 4.5790374824829165e-07, + "logits/chosen": -1.3316807746887207, + "logits/rejected": -1.3402780294418335, + "logps/chosen": -334.657958984375, + "logps/rejected": -431.4007873535156, + "loss": 0.561, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3125879764556885, + "rewards/margins": 0.8587444424629211, + "rewards/rejected": -2.1713321208953857, + "step": 1120 + }, + { + "epoch": 0.27111324376199614, + "grad_norm": 9.086124099951459, + "learning_rate": 4.5673365236403216e-07, + "logits/chosen": -1.373974323272705, + "logits/rejected": -1.3391624689102173, + "logps/chosen": -451.37176513671875, + "logps/rejected": -618.1118774414062, + "loss": 0.5369, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3380093574523926, + "rewards/margins": 1.4104362726211548, + "rewards/rejected": -3.748445987701416, + "step": 1130 + }, + { + "epoch": 0.27351247600767753, + "grad_norm": 8.245489550138963, + "learning_rate": 4.5554905493540075e-07, + "logits/chosen": -1.3963029384613037, + "logits/rejected": -1.3131446838378906, + "logps/chosen": -397.52484130859375, + "logps/rejected": -545.176025390625, + "loss": 0.5366, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6280912160873413, + "rewards/margins": 1.4453803300857544, + "rewards/rejected": -3.0734715461730957, + "step": 1140 + }, + { + "epoch": 0.2759117082533589, + "grad_norm": 7.262761099946982, + "learning_rate": 4.5435003905720074e-07, + "logits/chosen": -1.4503087997436523, + "logits/rejected": -1.3999178409576416, + "logps/chosen": -486.7837829589844, + "logps/rejected": -546.538818359375, + "loss": 0.5492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.865114450454712, + "rewards/margins": 0.9383285641670227, + "rewards/rejected": -2.803443193435669, + "step": 1150 + }, + { + "epoch": 0.2783109404990403, + "grad_norm": 10.467347582561494, + "learning_rate": 4.531366888356324e-07, + "logits/chosen": -1.413732886314392, + "logits/rejected": -1.3654654026031494, + "logps/chosen": -368.93231201171875, + "logps/rejected": -577.9190673828125, + "loss": 0.527, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6492884159088135, + "rewards/margins": 1.7214257717132568, + "rewards/rejected": -3.370713710784912, + "step": 1160 + }, + { + "epoch": 0.2807101727447217, + "grad_norm": 7.656835918634615, + "learning_rate": 4.519090893823931e-07, + "logits/chosen": -1.4396826028823853, + "logits/rejected": -1.4221911430358887, + "logps/chosen": -438.3218688964844, + "logps/rejected": -529.28759765625, + "loss": 0.5347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.745650291442871, + "rewards/margins": 1.0542502403259277, + "rewards/rejected": -2.799900531768799, + "step": 1170 + }, + { + "epoch": 0.28310940499040305, + "grad_norm": 10.339237287269262, + "learning_rate": 4.5066732680870734e-07, + "logits/chosen": -1.3462697267532349, + "logits/rejected": -1.3036195039749146, + "logps/chosen": -428.9185485839844, + "logps/rejected": -522.67236328125, + "loss": 0.5053, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5868905782699585, + "rewards/margins": 1.3584994077682495, + "rewards/rejected": -2.945390224456787, + "step": 1180 + }, + { + "epoch": 0.28550863723608444, + "grad_norm": 10.424964890520823, + "learning_rate": 4.494114882192862e-07, + "logits/chosen": -1.4587008953094482, + "logits/rejected": -1.3789886236190796, + "logps/chosen": -418.345458984375, + "logps/rejected": -561.0228271484375, + "loss": 0.5412, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4498519897460938, + "rewards/margins": 1.7318111658096313, + "rewards/rejected": -3.1816630363464355, + "step": 1190 + }, + { + "epoch": 0.28790786948176583, + "grad_norm": 9.033374903725427, + "learning_rate": 4.4814166170621735e-07, + "logits/chosen": -1.4350335597991943, + "logits/rejected": -1.4181472063064575, + "logps/chosen": -448.14141845703125, + "logps/rejected": -524.2359619140625, + "loss": 0.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7904564142227173, + "rewards/margins": 1.0193628072738647, + "rewards/rejected": -2.809818983078003, + "step": 1200 + }, + { + "epoch": 0.2903071017274472, + "grad_norm": 12.051469646958232, + "learning_rate": 4.468579363427858e-07, + "logits/chosen": -1.5547723770141602, + "logits/rejected": -1.5221188068389893, + "logps/chosen": -415.83349609375, + "logps/rejected": -476.7373962402344, + "loss": 0.5325, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3881736993789673, + "rewards/margins": 0.9286211133003235, + "rewards/rejected": -2.3167946338653564, + "step": 1210 + }, + { + "epoch": 0.2927063339731286, + "grad_norm": 8.844148732841079, + "learning_rate": 4.4556040217722555e-07, + "logits/chosen": -1.574505090713501, + "logits/rejected": -1.5706756114959717, + "logps/chosen": -370.81610107421875, + "logps/rejected": -481.24993896484375, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1100866794586182, + "rewards/margins": 0.9274276494979858, + "rewards/rejected": -2.0375144481658936, + "step": 1220 + }, + { + "epoch": 0.29510556621880996, + "grad_norm": 9.594644147905797, + "learning_rate": 4.442491502264033e-07, + "logits/chosen": -1.4629461765289307, + "logits/rejected": -1.4520565271377563, + "logps/chosen": -367.5827331542969, + "logps/rejected": -431.8439025878906, + "loss": 0.5347, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2434320449829102, + "rewards/margins": 0.7836133241653442, + "rewards/rejected": -2.0270450115203857, + "step": 1230 + }, + { + "epoch": 0.29750479846449135, + "grad_norm": 6.780108062586533, + "learning_rate": 4.429242724694338e-07, + "logits/chosen": -1.5323419570922852, + "logits/rejected": -1.5191363096237183, + "logps/chosen": -375.8507995605469, + "logps/rejected": -497.87921142578125, + "loss": 0.5549, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.157468557357788, + "rewards/margins": 1.0888302326202393, + "rewards/rejected": -2.2462985515594482, + "step": 1240 + }, + { + "epoch": 0.29990403071017274, + "grad_norm": 7.668018750014516, + "learning_rate": 4.4158586184122817e-07, + "logits/chosen": -1.468441128730774, + "logits/rejected": -1.3979756832122803, + "logps/chosen": -467.69976806640625, + "logps/rejected": -525.319091796875, + "loss": 0.5323, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5909717082977295, + "rewards/margins": 0.947952926158905, + "rewards/rejected": -2.5389246940612793, + "step": 1250 + }, + { + "epoch": 0.30230326295585414, + "grad_norm": 8.293797053598022, + "learning_rate": 4.4023401222597443e-07, + "logits/chosen": -1.4755961894989014, + "logits/rejected": -1.4515053033828735, + "logps/chosen": -433.07147216796875, + "logps/rejected": -502.43963623046875, + "loss": 0.5421, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.423656702041626, + "rewards/margins": 0.8972675204277039, + "rewards/rejected": -2.3209242820739746, + "step": 1260 + }, + { + "epoch": 0.30470249520153553, + "grad_norm": 11.368693082147297, + "learning_rate": 4.3886881845055235e-07, + "logits/chosen": -1.4548556804656982, + "logits/rejected": -1.4107099771499634, + "logps/chosen": -393.0867614746094, + "logps/rejected": -568.4356689453125, + "loss": 0.5143, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3003679513931274, + "rewards/margins": 1.8714444637298584, + "rewards/rejected": -3.1718125343322754, + "step": 1270 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 8.372672642590855, + "learning_rate": 4.374903762778814e-07, + "logits/chosen": -1.449552297592163, + "logits/rejected": -1.375659704208374, + "logps/chosen": -478.12615966796875, + "logps/rejected": -589.36376953125, + "loss": 0.5178, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0235326290130615, + "rewards/margins": 1.4111472368240356, + "rewards/rejected": -3.4346795082092285, + "step": 1280 + }, + { + "epoch": 0.30950095969289826, + "grad_norm": 10.643231523467081, + "learning_rate": 4.3609878240020356e-07, + "logits/chosen": -1.5441666841506958, + "logits/rejected": -1.4388468265533447, + "logps/chosen": -532.8125, + "logps/rejected": -604.3128662109375, + "loss": 0.5416, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.138451337814331, + "rewards/margins": 1.39248788356781, + "rewards/rejected": -3.5309395790100098, + "step": 1290 + }, + { + "epoch": 0.31190019193857965, + "grad_norm": 8.214989448750508, + "learning_rate": 4.346941344323005e-07, + "logits/chosen": -1.5427402257919312, + "logits/rejected": -1.4345190525054932, + "logps/chosen": -433.6434631347656, + "logps/rejected": -459.6392517089844, + "loss": 0.5528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6685816049575806, + "rewards/margins": 0.8387784957885742, + "rewards/rejected": -2.5073604583740234, + "step": 1300 + }, + { + "epoch": 0.31429942418426104, + "grad_norm": 7.657845271019461, + "learning_rate": 4.332765309046467e-07, + "logits/chosen": -1.3856598138809204, + "logits/rejected": -1.3218709230422974, + "logps/chosen": -462.48175048828125, + "logps/rejected": -564.1286010742188, + "loss": 0.5528, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8233705759048462, + "rewards/margins": 1.4314063787460327, + "rewards/rejected": -3.2547767162323, + "step": 1310 + }, + { + "epoch": 0.31669865642994244, + "grad_norm": 12.669727176995098, + "learning_rate": 4.3184607125649754e-07, + "logits/chosen": -1.5354691743850708, + "logits/rejected": -1.452030062675476, + "logps/chosen": -461.422607421875, + "logps/rejected": -695.4813842773438, + "loss": 0.5574, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5948652029037476, + "rewards/margins": 2.2517476081848145, + "rewards/rejected": -3.8466124534606934, + "step": 1320 + }, + { + "epoch": 0.3190978886756238, + "grad_norm": 7.428351039897863, + "learning_rate": 4.304028558289141e-07, + "logits/chosen": -1.5226895809173584, + "logits/rejected": -1.4397214651107788, + "logps/chosen": -448.7567443847656, + "logps/rejected": -654.158447265625, + "loss": 0.5026, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4266167879104614, + "rewards/margins": 2.2771382331848145, + "rewards/rejected": -3.7037551403045654, + "step": 1330 + }, + { + "epoch": 0.32149712092130517, + "grad_norm": 8.35394394188628, + "learning_rate": 4.28946985857725e-07, + "logits/chosen": -1.5027117729187012, + "logits/rejected": -1.4603252410888672, + "logps/chosen": -464.45770263671875, + "logps/rejected": -600.5919189453125, + "loss": 0.517, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7185165882110596, + "rewards/margins": 1.4774185419082642, + "rewards/rejected": -3.195935010910034, + "step": 1340 + }, + { + "epoch": 0.32389635316698656, + "grad_norm": 8.547450000172507, + "learning_rate": 4.2747856346642445e-07, + "logits/chosen": -1.6239745616912842, + "logits/rejected": -1.5768134593963623, + "logps/chosen": -385.5645751953125, + "logps/rejected": -461.92413330078125, + "loss": 0.498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4032585620880127, + "rewards/margins": 0.9687395095825195, + "rewards/rejected": -2.3719983100891113, + "step": 1350 + }, + { + "epoch": 0.32629558541266795, + "grad_norm": 10.46357929901763, + "learning_rate": 4.2599769165900933e-07, + "logits/chosen": -1.5189791917800903, + "logits/rejected": -1.4443309307098389, + "logps/chosen": -452.8273010253906, + "logps/rejected": -588.3678588867188, + "loss": 0.5361, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9200900793075562, + "rewards/margins": 1.5941896438598633, + "rewards/rejected": -3.51427960395813, + "step": 1360 + }, + { + "epoch": 0.32869481765834935, + "grad_norm": 7.921189286394596, + "learning_rate": 4.245044743127535e-07, + "logits/chosen": -1.5018115043640137, + "logits/rejected": -1.5035383701324463, + "logps/chosen": -435.9285583496094, + "logps/rejected": -535.82275390625, + "loss": 0.5308, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5959597826004028, + "rewards/margins": 1.0554875135421753, + "rewards/rejected": -2.651447057723999, + "step": 1370 + }, + { + "epoch": 0.3310940499040307, + "grad_norm": 9.780614080293216, + "learning_rate": 4.229990161709214e-07, + "logits/chosen": -1.4760569334030151, + "logits/rejected": -1.4197697639465332, + "logps/chosen": -389.590087890625, + "logps/rejected": -585.2882690429688, + "loss": 0.5347, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.401724100112915, + "rewards/margins": 1.7929754257202148, + "rewards/rejected": -3.1946990489959717, + "step": 1380 + }, + { + "epoch": 0.3334932821497121, + "grad_norm": 8.139127977792159, + "learning_rate": 4.214814228354204e-07, + "logits/chosen": -1.5709788799285889, + "logits/rejected": -1.4670157432556152, + "logps/chosen": -426.033203125, + "logps/rejected": -606.0979614257812, + "loss": 0.5143, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.373948335647583, + "rewards/margins": 1.9848015308380127, + "rewards/rejected": -3.358750104904175, + "step": 1390 + }, + { + "epoch": 0.33589251439539347, + "grad_norm": 8.460928737411663, + "learning_rate": 4.1995180075939375e-07, + "logits/chosen": -1.550615668296814, + "logits/rejected": -1.5216505527496338, + "logps/chosen": -427.553955078125, + "logps/rejected": -545.8343505859375, + "loss": 0.4946, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3485326766967773, + "rewards/margins": 1.4248372316360474, + "rewards/rejected": -2.773369789123535, + "step": 1400 + }, + { + "epoch": 0.33829174664107486, + "grad_norm": 8.81270929184022, + "learning_rate": 4.1841025723975297e-07, + "logits/chosen": -1.5789110660552979, + "logits/rejected": -1.5154634714126587, + "logps/chosen": -434.64849853515625, + "logps/rejected": -600.957763671875, + "loss": 0.5008, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3257498741149902, + "rewards/margins": 1.8993675708770752, + "rewards/rejected": -3.2251172065734863, + "step": 1410 + }, + { + "epoch": 0.34069097888675626, + "grad_norm": 10.770138588745654, + "learning_rate": 4.168569004096516e-07, + "logits/chosen": -1.4905095100402832, + "logits/rejected": -1.4610494375228882, + "logps/chosen": -411.388427734375, + "logps/rejected": -600.2252807617188, + "loss": 0.5136, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6003555059432983, + "rewards/margins": 1.7819187641143799, + "rewards/rejected": -3.3822741508483887, + "step": 1420 + }, + { + "epoch": 0.3430902111324376, + "grad_norm": 7.293720682350634, + "learning_rate": 4.152918392308997e-07, + "logits/chosen": -1.4742928743362427, + "logits/rejected": -1.4396989345550537, + "logps/chosen": -440.1182556152344, + "logps/rejected": -565.8653564453125, + "loss": 0.5199, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.718984603881836, + "rewards/margins": 1.4051283597946167, + "rewards/rejected": -3.124112606048584, + "step": 1430 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 12.227998192379331, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": -1.5375521183013916, + "logits/rejected": -1.4570263624191284, + "logps/chosen": -437.3187561035156, + "logps/rejected": -679.8615112304688, + "loss": 0.5442, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.817325234413147, + "rewards/margins": 2.153637409210205, + "rewards/rejected": -3.9709632396698, + "step": 1440 + }, + { + "epoch": 0.3478886756238004, + "grad_norm": 8.40519372330152, + "learning_rate": 4.121270437720526e-07, + "logits/chosen": -1.6291801929473877, + "logits/rejected": -1.6403745412826538, + "logps/chosen": -378.547119140625, + "logps/rejected": -465.722412109375, + "loss": 0.5377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5566104650497437, + "rewards/margins": 0.5313079953193665, + "rewards/rejected": -2.087918519973755, + "step": 1450 + }, + { + "epoch": 0.3502879078694818, + "grad_norm": 7.576024138002559, + "learning_rate": 4.105275314897852e-07, + "logits/chosen": -1.4507944583892822, + "logits/rejected": -1.3628849983215332, + "logps/chosen": -424.55401611328125, + "logps/rejected": -754.7704467773438, + "loss": 0.5183, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7864032983779907, + "rewards/margins": 3.053657293319702, + "rewards/rejected": -4.840060710906982, + "step": 1460 + }, + { + "epoch": 0.35268714011516317, + "grad_norm": 9.120638907797822, + "learning_rate": 4.089167588389508e-07, + "logits/chosen": -1.6330862045288086, + "logits/rejected": -1.5513877868652344, + "logps/chosen": -524.4133911132812, + "logps/rejected": -615.8099975585938, + "loss": 0.5284, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.665891408920288, + "rewards/margins": 1.4321410655975342, + "rewards/rejected": -3.0980327129364014, + "step": 1470 + }, + { + "epoch": 0.3550863723608445, + "grad_norm": 8.831999384046886, + "learning_rate": 4.072948388088515e-07, + "logits/chosen": -1.5293561220169067, + "logits/rejected": -1.4742053747177124, + "logps/chosen": -455.7305603027344, + "logps/rejected": -572.6296997070312, + "loss": 0.5715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.669014573097229, + "rewards/margins": 1.1838295459747314, + "rewards/rejected": -2.852843999862671, + "step": 1480 + }, + { + "epoch": 0.3574856046065259, + "grad_norm": 8.125729954588683, + "learning_rate": 4.056618851707334e-07, + "logits/chosen": -1.5556094646453857, + "logits/rejected": -1.544398307800293, + "logps/chosen": -411.40631103515625, + "logps/rejected": -525.8662719726562, + "loss": 0.4887, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1296329498291016, + "rewards/margins": 1.2118812799453735, + "rewards/rejected": -2.3415141105651855, + "step": 1490 + }, + { + "epoch": 0.3598848368522073, + "grad_norm": 7.32928329168995, + "learning_rate": 4.0401801246980675e-07, + "logits/chosen": -1.5943670272827148, + "logits/rejected": -1.550929307937622, + "logps/chosen": -381.600830078125, + "logps/rejected": -475.3746643066406, + "loss": 0.5565, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4997680187225342, + "rewards/margins": 1.1621274948120117, + "rewards/rejected": -2.661895513534546, + "step": 1500 + }, + { + "epoch": 0.3622840690978887, + "grad_norm": 8.147323008767865, + "learning_rate": 4.0236333601721043e-07, + "logits/chosen": -1.5589003562927246, + "logits/rejected": -1.6057322025299072, + "logps/chosen": -448.15496826171875, + "logps/rejected": -532.5565185546875, + "loss": 0.5517, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.489591121673584, + "rewards/margins": 0.731250524520874, + "rewards/rejected": -2.220841646194458, + "step": 1510 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 7.889436233563884, + "learning_rate": 4.0069797188192364e-07, + "logits/chosen": -1.482542872428894, + "logits/rejected": -1.3949836492538452, + "logps/chosen": -495.69140625, + "logps/rejected": -660.7445678710938, + "loss": 0.5172, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8629268407821655, + "rewards/margins": 2.0722007751464844, + "rewards/rejected": -3.9351277351379395, + "step": 1520 + }, + { + "epoch": 0.3670825335892514, + "grad_norm": 10.641539880656538, + "learning_rate": 3.9902203688262417e-07, + "logits/chosen": -1.554213285446167, + "logits/rejected": -1.5207319259643555, + "logps/chosen": -438.8204650878906, + "logps/rejected": -519.6439208984375, + "loss": 0.5133, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5820989608764648, + "rewards/margins": 0.9530105590820312, + "rewards/rejected": -2.535109281539917, + "step": 1530 + }, + { + "epoch": 0.3694817658349328, + "grad_norm": 8.725526436669323, + "learning_rate": 3.9733564857949365e-07, + "logits/chosen": -1.5051288604736328, + "logits/rejected": -1.4673130512237549, + "logps/chosen": -474.5401306152344, + "logps/rejected": -572.7054443359375, + "loss": 0.4857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5164769887924194, + "rewards/margins": 1.4391937255859375, + "rewards/rejected": -2.9556708335876465, + "step": 1540 + }, + { + "epoch": 0.3718809980806142, + "grad_norm": 19.17908292592815, + "learning_rate": 3.9563892526597177e-07, + "logits/chosen": -1.5526840686798096, + "logits/rejected": -1.5369585752487183, + "logps/chosen": -372.50921630859375, + "logps/rejected": -465.1087951660156, + "loss": 0.5281, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3534399271011353, + "rewards/margins": 0.4993162155151367, + "rewards/rejected": -1.852756142616272, + "step": 1550 + }, + { + "epoch": 0.3742802303262956, + "grad_norm": 7.678163461265528, + "learning_rate": 3.9393198596045795e-07, + "logits/chosen": -1.5254640579223633, + "logits/rejected": -1.5481865406036377, + "logps/chosen": -403.49542236328125, + "logps/rejected": -521.7468872070312, + "loss": 0.5212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5229865312576294, + "rewards/margins": 1.0962460041046143, + "rewards/rejected": -2.619232416152954, + "step": 1560 + }, + { + "epoch": 0.376679462571977, + "grad_norm": 6.910401313727213, + "learning_rate": 3.922149503979628e-07, + "logits/chosen": -1.4570752382278442, + "logits/rejected": -1.3976576328277588, + "logps/chosen": -587.8316040039062, + "logps/rejected": -922.5247802734375, + "loss": 0.4951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9414281845092773, + "rewards/margins": 3.3815834522247314, + "rewards/rejected": -6.323011875152588, + "step": 1570 + }, + { + "epoch": 0.3790786948176583, + "grad_norm": 10.822868640544474, + "learning_rate": 3.904879390217095e-07, + "logits/chosen": -1.605613112449646, + "logits/rejected": -1.5582932233810425, + "logps/chosen": -432.2767028808594, + "logps/rejected": -528.9737548828125, + "loss": 0.5277, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6335880756378174, + "rewards/margins": 1.2116552591323853, + "rewards/rejected": -2.845243215560913, + "step": 1580 + }, + { + "epoch": 0.3814779270633397, + "grad_norm": 9.531595564715918, + "learning_rate": 3.8875107297468463e-07, + "logits/chosen": -1.5569547414779663, + "logits/rejected": -1.5293956995010376, + "logps/chosen": -408.19317626953125, + "logps/rejected": -679.7545166015625, + "loss": 0.5166, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.506165862083435, + "rewards/margins": 2.2621097564697266, + "rewards/rejected": -3.768275499343872, + "step": 1590 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 9.016181079029822, + "learning_rate": 3.87004474091141e-07, + "logits/chosen": -1.5378257036209106, + "logits/rejected": -1.5579806566238403, + "logps/chosen": -399.8911437988281, + "logps/rejected": -520.0226440429688, + "loss": 0.5094, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.619594931602478, + "rewards/margins": 1.0523192882537842, + "rewards/rejected": -2.6719141006469727, + "step": 1600 + }, + { + "epoch": 0.3862763915547025, + "grad_norm": 9.569743956165809, + "learning_rate": 3.8524826488805114e-07, + "logits/chosen": -1.5677305459976196, + "logits/rejected": -1.5042650699615479, + "logps/chosen": -488.653564453125, + "logps/rejected": -553.4172973632812, + "loss": 0.57, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8398549556732178, + "rewards/margins": 1.1457983255386353, + "rewards/rejected": -2.9856534004211426, + "step": 1610 + }, + { + "epoch": 0.3886756238003839, + "grad_norm": 7.953020161460846, + "learning_rate": 3.834825685565133e-07, + "logits/chosen": -1.5813748836517334, + "logits/rejected": -1.5757431983947754, + "logps/chosen": -361.91192626953125, + "logps/rejected": -377.3770446777344, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.096151351928711, + "rewards/margins": 0.5215452313423157, + "rewards/rejected": -1.6176965236663818, + "step": 1620 + }, + { + "epoch": 0.39107485604606523, + "grad_norm": 12.112559070938909, + "learning_rate": 3.8170750895311007e-07, + "logits/chosen": -1.661161184310913, + "logits/rejected": -1.604376196861267, + "logps/chosen": -426.1370544433594, + "logps/rejected": -517.6278076171875, + "loss": 0.4749, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2490683794021606, + "rewards/margins": 1.1713922023773193, + "rewards/rejected": -2.4204607009887695, + "step": 1630 + }, + { + "epoch": 0.3934740882917466, + "grad_norm": 10.613017352814472, + "learning_rate": 3.7992321059122045e-07, + "logits/chosen": -1.477107286453247, + "logits/rejected": -1.435372233390808, + "logps/chosen": -473.62750244140625, + "logps/rejected": -581.3477172851562, + "loss": 0.5086, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0801384449005127, + "rewards/margins": 1.3069989681243896, + "rewards/rejected": -3.3871371746063232, + "step": 1640 + }, + { + "epoch": 0.395873320537428, + "grad_norm": 8.03177266316891, + "learning_rate": 3.7812979863228576e-07, + "logits/chosen": -1.4891306161880493, + "logits/rejected": -1.524371862411499, + "logps/chosen": -451.3934631347656, + "logps/rejected": -560.3739013671875, + "loss": 0.4753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.225372314453125, + "rewards/margins": 0.9865644574165344, + "rewards/rejected": -3.2119364738464355, + "step": 1650 + }, + { + "epoch": 0.3982725527831094, + "grad_norm": 14.176933508927005, + "learning_rate": 3.763273988770296e-07, + "logits/chosen": -1.4202697277069092, + "logits/rejected": -1.3632880449295044, + "logps/chosen": -510.245361328125, + "logps/rejected": -665.1488037109375, + "loss": 0.4931, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.517984628677368, + "rewards/margins": 1.563971757888794, + "rewards/rejected": -4.081956386566162, + "step": 1660 + }, + { + "epoch": 0.4006717850287908, + "grad_norm": 10.323789347042394, + "learning_rate": 3.7451613775663405e-07, + "logits/chosen": -1.5507957935333252, + "logits/rejected": -1.4645434617996216, + "logps/chosen": -424.00042724609375, + "logps/rejected": -677.9902954101562, + "loss": 0.5348, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6596157550811768, + "rewards/margins": 2.5375232696533203, + "rewards/rejected": -4.197138786315918, + "step": 1670 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 13.602368967903537, + "learning_rate": 3.726961423238706e-07, + "logits/chosen": -1.5984694957733154, + "logits/rejected": -1.5915277004241943, + "logps/chosen": -382.999755859375, + "logps/rejected": -552.7100830078125, + "loss": 0.5241, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.413910150527954, + "rewards/margins": 1.5153096914291382, + "rewards/rejected": -2.9292197227478027, + "step": 1680 + }, + { + "epoch": 0.40547024952015354, + "grad_norm": 7.502258208680551, + "learning_rate": 3.708675402441882e-07, + "logits/chosen": -1.6568260192871094, + "logits/rejected": -1.5876600742340088, + "logps/chosen": -440.16357421875, + "logps/rejected": -474.075439453125, + "loss": 0.5576, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4147294759750366, + "rewards/margins": 0.780114471912384, + "rewards/rejected": -2.1948440074920654, + "step": 1690 + }, + { + "epoch": 0.40786948176583493, + "grad_norm": 8.66494523712904, + "learning_rate": 3.6903045978675775e-07, + "logits/chosen": -1.562082052230835, + "logits/rejected": -1.4948246479034424, + "logps/chosen": -393.52508544921875, + "logps/rejected": -548.5087280273438, + "loss": 0.5081, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3889000415802002, + "rewards/margins": 1.800432562828064, + "rewards/rejected": -3.1893324851989746, + "step": 1700 + }, + { + "epoch": 0.4102687140115163, + "grad_norm": 8.12061625541492, + "learning_rate": 3.6718502981547474e-07, + "logits/chosen": -1.6216672658920288, + "logits/rejected": -1.6438817977905273, + "logps/chosen": -408.8639221191406, + "logps/rejected": -543.5513916015625, + "loss": 0.5151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3934214115142822, + "rewards/margins": 0.9635807275772095, + "rewards/rejected": -2.357002019882202, + "step": 1710 + }, + { + "epoch": 0.4126679462571977, + "grad_norm": 7.83467861319349, + "learning_rate": 3.6533137977991986e-07, + "logits/chosen": -1.6941850185394287, + "logits/rejected": -1.6553080081939697, + "logps/chosen": -461.6195373535156, + "logps/rejected": -582.6620483398438, + "loss": 0.5341, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6303510665893555, + "rewards/margins": 0.9588174819946289, + "rewards/rejected": -2.5891685485839844, + "step": 1720 + }, + { + "epoch": 0.41506717850287905, + "grad_norm": 7.244089308316078, + "learning_rate": 3.6346963970627865e-07, + "logits/chosen": -1.5541958808898926, + "logits/rejected": -1.5294215679168701, + "logps/chosen": -429.1705017089844, + "logps/rejected": -555.658447265625, + "loss": 0.5078, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.70223867893219, + "rewards/margins": 1.1579748392105103, + "rewards/rejected": -2.8602135181427, + "step": 1730 + }, + { + "epoch": 0.41746641074856045, + "grad_norm": 22.58474812752272, + "learning_rate": 3.615999401882207e-07, + "logits/chosen": -1.4961183071136475, + "logits/rejected": -1.4009946584701538, + "logps/chosen": -451.9063415527344, + "logps/rejected": -685.244140625, + "loss": 0.5014, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.072859764099121, + "rewards/margins": 2.2618536949157715, + "rewards/rejected": -4.334712982177734, + "step": 1740 + }, + { + "epoch": 0.41986564299424184, + "grad_norm": 8.32162712301176, + "learning_rate": 3.597224123777389e-07, + "logits/chosen": -1.5286868810653687, + "logits/rejected": -1.387584924697876, + "logps/chosen": -498.414306640625, + "logps/rejected": -741.4631958007812, + "loss": 0.5284, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.277275800704956, + "rewards/margins": 2.3793492317199707, + "rewards/rejected": -4.656625747680664, + "step": 1750 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 8.974293009829895, + "learning_rate": 3.5783718797595e-07, + "logits/chosen": -1.6080338954925537, + "logits/rejected": -1.4449987411499023, + "logps/chosen": -483.98486328125, + "logps/rejected": -583.08056640625, + "loss": 0.4899, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6723836660385132, + "rewards/margins": 1.4705579280853271, + "rewards/rejected": -3.1429412364959717, + "step": 1760 + }, + { + "epoch": 0.4246641074856046, + "grad_norm": 9.046062404092567, + "learning_rate": 3.559443992238558e-07, + "logits/chosen": -1.6072918176651, + "logits/rejected": -1.5334550142288208, + "logps/chosen": -404.91387939453125, + "logps/rejected": -659.3243408203125, + "loss": 0.5256, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4262123107910156, + "rewards/margins": 2.3509132862091064, + "rewards/rejected": -3.777125835418701, + "step": 1770 + }, + { + "epoch": 0.42706333973128596, + "grad_norm": 7.551979886612425, + "learning_rate": 3.540441788930673e-07, + "logits/chosen": -1.5570323467254639, + "logits/rejected": -1.5010316371917725, + "logps/chosen": -499.6627502441406, + "logps/rejected": -655.69775390625, + "loss": 0.4806, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.816043496131897, + "rewards/margins": 2.008720636367798, + "rewards/rejected": -3.8247642517089844, + "step": 1780 + }, + { + "epoch": 0.42946257197696736, + "grad_norm": 8.307725059400688, + "learning_rate": 3.5213666027649123e-07, + "logits/chosen": -1.5870680809020996, + "logits/rejected": -1.5261785984039307, + "logps/chosen": -484.54364013671875, + "logps/rejected": -549.2987060546875, + "loss": 0.5261, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8930203914642334, + "rewards/margins": 1.094689965248108, + "rewards/rejected": -2.987710475921631, + "step": 1790 + }, + { + "epoch": 0.43186180422264875, + "grad_norm": 8.582762739992743, + "learning_rate": 3.5022197717898017e-07, + "logits/chosen": -1.580644130706787, + "logits/rejected": -1.3994085788726807, + "logps/chosen": -401.8323059082031, + "logps/rejected": -603.8817138671875, + "loss": 0.4704, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.661513328552246, + "rewards/margins": 2.428684949874878, + "rewards/rejected": -4.090198516845703, + "step": 1800 + }, + { + "epoch": 0.43426103646833014, + "grad_norm": 10.93719877269778, + "learning_rate": 3.4830026390794633e-07, + "logits/chosen": -1.607084035873413, + "logits/rejected": -1.5272046327590942, + "logps/chosen": -510.1648864746094, + "logps/rejected": -667.7628173828125, + "loss": 0.4909, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0123143196105957, + "rewards/margins": 2.094109535217285, + "rewards/rejected": -4.106423377990723, + "step": 1810 + }, + { + "epoch": 0.43666026871401153, + "grad_norm": 8.551853777479481, + "learning_rate": 3.4637165526394104e-07, + "logits/chosen": -1.6525242328643799, + "logits/rejected": -1.6364984512329102, + "logps/chosen": -405.4945983886719, + "logps/rejected": -535.1358032226562, + "loss": 0.4973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5821750164031982, + "rewards/margins": 1.2321652173995972, + "rewards/rejected": -2.814340114593506, + "step": 1820 + }, + { + "epoch": 0.43905950095969287, + "grad_norm": 7.385716657081044, + "learning_rate": 3.4443628653119814e-07, + "logits/chosen": -1.517671823501587, + "logits/rejected": -1.4017354249954224, + "logps/chosen": -517.3212890625, + "logps/rejected": -859.2703247070312, + "loss": 0.5639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.307370662689209, + "rewards/margins": 3.0285544395446777, + "rewards/rejected": -5.335925102233887, + "step": 1830 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 9.147598141998557, + "learning_rate": 3.424942934681453e-07, + "logits/chosen": -1.603632926940918, + "logits/rejected": -1.50962233543396, + "logps/chosen": -405.5295715332031, + "logps/rejected": -603.0396728515625, + "loss": 0.4915, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5191326141357422, + "rewards/margins": 2.0283052921295166, + "rewards/rejected": -3.5474376678466797, + "step": 1840 + }, + { + "epoch": 0.44385796545105566, + "grad_norm": 10.988746391986139, + "learning_rate": 3.405458122978804e-07, + "logits/chosen": -1.5768574476242065, + "logits/rejected": -1.5998786687850952, + "logps/chosen": -428.5262756347656, + "logps/rejected": -487.67987060546875, + "loss": 0.4836, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3609421253204346, + "rewards/margins": 0.9485089182853699, + "rewards/rejected": -2.309451103210449, + "step": 1850 + }, + { + "epoch": 0.44625719769673705, + "grad_norm": 15.944164556320738, + "learning_rate": 3.3859097969861633e-07, + "logits/chosen": -1.625605583190918, + "logits/rejected": -1.5874695777893066, + "logps/chosen": -494.60919189453125, + "logps/rejected": -607.7193603515625, + "loss": 0.5338, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8808681964874268, + "rewards/margins": 1.5208300352096558, + "rewards/rejected": -3.401698350906372, + "step": 1860 + }, + { + "epoch": 0.44865642994241844, + "grad_norm": 8.268100089151192, + "learning_rate": 3.366299327940936e-07, + "logits/chosen": -1.6645658016204834, + "logits/rejected": -1.648751974105835, + "logps/chosen": -499.26336669921875, + "logps/rejected": -698.3704833984375, + "loss": 0.4915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9141197204589844, + "rewards/margins": 1.823472261428833, + "rewards/rejected": -3.7375919818878174, + "step": 1870 + }, + { + "epoch": 0.4510556621880998, + "grad_norm": 8.804078070830371, + "learning_rate": 3.3466280914396117e-07, + "logits/chosen": -1.5872663259506226, + "logits/rejected": -1.5196198225021362, + "logps/chosen": -456.85992431640625, + "logps/rejected": -658.8201904296875, + "loss": 0.4869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.035572052001953, + "rewards/margins": 1.8471441268920898, + "rewards/rejected": -3.882716417312622, + "step": 1880 + }, + { + "epoch": 0.4534548944337812, + "grad_norm": 11.092976917744208, + "learning_rate": 3.326897467341281e-07, + "logits/chosen": -1.5509458780288696, + "logits/rejected": -1.4662238359451294, + "logps/chosen": -485.05230712890625, + "logps/rejected": -712.2886962890625, + "loss": 0.4878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6332812309265137, + "rewards/margins": 2.179011821746826, + "rewards/rejected": -4.81229305267334, + "step": 1890 + }, + { + "epoch": 0.45585412667946257, + "grad_norm": 12.839651291896095, + "learning_rate": 3.3071088396708335e-07, + "logits/chosen": -1.532083511352539, + "logits/rejected": -1.428544282913208, + "logps/chosen": -468.669921875, + "logps/rejected": -751.4425048828125, + "loss": 0.5366, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.547771692276001, + "rewards/margins": 2.576319932937622, + "rewards/rejected": -5.124091148376465, + "step": 1900 + }, + { + "epoch": 0.45825335892514396, + "grad_norm": 10.082888820884584, + "learning_rate": 3.2872635965218824e-07, + "logits/chosen": -1.407777190208435, + "logits/rejected": -1.3572627305984497, + "logps/chosen": -552.7352294921875, + "logps/rejected": -779.3690185546875, + "loss": 0.5304, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7334177494049072, + "rewards/margins": 2.2001953125, + "rewards/rejected": -4.93361234664917, + "step": 1910 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 8.22503563553893, + "learning_rate": 3.2673631299593905e-07, + "logits/chosen": -1.5405181646347046, + "logits/rejected": -1.3945229053497314, + "logps/chosen": -509.95916748046875, + "logps/rejected": -725.9969482421875, + "loss": 0.4924, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.202409267425537, + "rewards/margins": 2.3135194778442383, + "rewards/rejected": -4.515929222106934, + "step": 1920 + }, + { + "epoch": 0.4630518234165067, + "grad_norm": 11.990528680107209, + "learning_rate": 3.247408835922024e-07, + "logits/chosen": -1.4919646978378296, + "logits/rejected": -1.400061845779419, + "logps/chosen": -584.2794189453125, + "logps/rejected": -807.6492919921875, + "loss": 0.5074, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.5771946907043457, + "rewards/margins": 2.1979637145996094, + "rewards/rejected": -4.775158882141113, + "step": 1930 + }, + { + "epoch": 0.4654510556621881, + "grad_norm": 11.277957093878648, + "learning_rate": 3.2274021141242306e-07, + "logits/chosen": -1.4834940433502197, + "logits/rejected": -1.4324702024459839, + "logps/chosen": -476.18499755859375, + "logps/rejected": -635.02783203125, + "loss": 0.501, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0429248809814453, + "rewards/margins": 1.4896948337554932, + "rewards/rejected": -3.5326199531555176, + "step": 1940 + }, + { + "epoch": 0.4678502879078695, + "grad_norm": 15.301957388985223, + "learning_rate": 3.2073443679580613e-07, + "logits/chosen": -1.6484432220458984, + "logits/rejected": -1.6361116170883179, + "logps/chosen": -451.64276123046875, + "logps/rejected": -508.2003479003906, + "loss": 0.5063, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6862659454345703, + "rewards/margins": 0.613060712814331, + "rewards/rejected": -2.2993264198303223, + "step": 1950 + }, + { + "epoch": 0.47024952015355087, + "grad_norm": 9.017778030476915, + "learning_rate": 3.1872370043947194e-07, + "logits/chosen": -1.6414830684661865, + "logits/rejected": -1.564360499382019, + "logps/chosen": -421.6163024902344, + "logps/rejected": -599.665283203125, + "loss": 0.4565, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4122569561004639, + "rewards/margins": 1.8497397899627686, + "rewards/rejected": -3.2619965076446533, + "step": 1960 + }, + { + "epoch": 0.47264875239923226, + "grad_norm": 13.966059992110349, + "learning_rate": 3.167081433885874e-07, + "logits/chosen": -1.4791144132614136, + "logits/rejected": -1.4647550582885742, + "logps/chosen": -573.87353515625, + "logps/rejected": -760.268798828125, + "loss": 0.4637, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4180257320404053, + "rewards/margins": 1.517680287361145, + "rewards/rejected": -3.935706377029419, + "step": 1970 + }, + { + "epoch": 0.4750479846449136, + "grad_norm": 14.922819489110463, + "learning_rate": 3.14687907026472e-07, + "logits/chosen": -1.5097310543060303, + "logits/rejected": -1.4952431917190552, + "logps/chosen": -426.15045166015625, + "logps/rejected": -578.3543090820312, + "loss": 0.4975, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8940229415893555, + "rewards/margins": 1.3460915088653564, + "rewards/rejected": -3.240114212036133, + "step": 1980 + }, + { + "epoch": 0.477447216890595, + "grad_norm": 8.988836306290604, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -1.6298195123672485, + "logits/rejected": -1.6404377222061157, + "logps/chosen": -538.1989135742188, + "logps/rejected": -648.9851684570312, + "loss": 0.5374, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.212888240814209, + "rewards/margins": 1.0854294300079346, + "rewards/rejected": -3.2983174324035645, + "step": 1990 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 8.750744135615523, + "learning_rate": 3.1063396353306097e-07, + "logits/chosen": -1.643333077430725, + "logits/rejected": -1.610515832901001, + "logps/chosen": -407.2403869628906, + "logps/rejected": -473.98858642578125, + "loss": 0.5034, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2611894607543945, + "rewards/margins": 1.0861375331878662, + "rewards/rejected": -2.3473267555236816, + "step": 2000 + }, + { + "epoch": 0.4798464491362764, + "eval_logits/chosen": -1.5855597257614136, + "eval_logits/rejected": -1.54659903049469, + "eval_logps/chosen": -422.9024963378906, + "eval_logps/rejected": -588.136474609375, + "eval_loss": 0.4988311231136322, + "eval_rewards/accuracies": 0.7982142567634583, + "eval_rewards/chosen": -1.505989670753479, + "eval_rewards/margins": 1.6387678384780884, + "eval_rewards/rejected": -3.1447572708129883, + "eval_runtime": 52.378, + "eval_samples_per_second": 85.169, + "eval_steps_per_second": 1.336, + "step": 2000 + }, + { + "epoch": 0.4822456813819578, + "grad_norm": 13.733129427049343, + "learning_rate": 3.0860054076979535e-07, + "logits/chosen": -1.6123840808868408, + "logits/rejected": -1.5536229610443115, + "logps/chosen": -480.535888671875, + "logps/rejected": -624.6920166015625, + "loss": 0.4772, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8841956853866577, + "rewards/margins": 1.7770442962646484, + "rewards/rejected": -3.6612396240234375, + "step": 2010 + }, + { + "epoch": 0.4846449136276392, + "grad_norm": 9.057968429745728, + "learning_rate": 3.065630074114115e-07, + "logits/chosen": -1.6468747854232788, + "logits/rejected": -1.5576668977737427, + "logps/chosen": -458.7217712402344, + "logps/rejected": -609.1342163085938, + "loss": 0.5354, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5887041091918945, + "rewards/margins": 1.9728997945785522, + "rewards/rejected": -3.5616040229797363, + "step": 2020 + }, + { + "epoch": 0.4870441458733205, + "grad_norm": 9.014043112611292, + "learning_rate": 3.0452150638277947e-07, + "logits/chosen": -1.602269172668457, + "logits/rejected": -1.5746511220932007, + "logps/chosen": -426.79180908203125, + "logps/rejected": -550.3716430664062, + "loss": 0.4984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.901236891746521, + "rewards/margins": 1.115099310874939, + "rewards/rejected": -3.016335964202881, + "step": 2030 + }, + { + "epoch": 0.4894433781190019, + "grad_norm": 10.239650017211535, + "learning_rate": 3.024761808870856e-07, + "logits/chosen": -1.537592887878418, + "logits/rejected": -1.5135307312011719, + "logps/chosen": -394.8808898925781, + "logps/rejected": -632.0445556640625, + "loss": 0.4599, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3750698566436768, + "rewards/margins": 2.4639463424682617, + "rewards/rejected": -3.8390164375305176, + "step": 2040 + }, + { + "epoch": 0.4918426103646833, + "grad_norm": 15.750043165793306, + "learning_rate": 3.004271743957875e-07, + "logits/chosen": -1.6159226894378662, + "logits/rejected": -1.655556321144104, + "logps/chosen": -525.2373657226562, + "logps/rejected": -622.0975341796875, + "loss": 0.5105, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.482973098754883, + "rewards/margins": 0.6949129700660706, + "rewards/rejected": -3.1778860092163086, + "step": 2050 + }, + { + "epoch": 0.4942418426103647, + "grad_norm": 8.499109074782012, + "learning_rate": 2.983746306385499e-07, + "logits/chosen": -1.5946094989776611, + "logits/rejected": -1.534623622894287, + "logps/chosen": -453.0174255371094, + "logps/rejected": -686.3258056640625, + "loss": 0.521, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9426515102386475, + "rewards/margins": 2.2058510780334473, + "rewards/rejected": -4.148502349853516, + "step": 2060 + }, + { + "epoch": 0.4966410748560461, + "grad_norm": 10.800111792066579, + "learning_rate": 2.963186935931628e-07, + "logits/chosen": -1.6709175109863281, + "logits/rejected": -1.6887776851654053, + "logps/chosen": -438.7301330566406, + "logps/rejected": -554.9039306640625, + "loss": 0.4837, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6398264169692993, + "rewards/margins": 1.2201743125915527, + "rewards/rejected": -2.8600010871887207, + "step": 2070 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 8.988674035333007, + "learning_rate": 2.9425950747544176e-07, + "logits/chosen": -1.5756020545959473, + "logits/rejected": -1.5436617136001587, + "logps/chosen": -561.7062377929688, + "logps/rejected": -795.3497314453125, + "loss": 0.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4821224212646484, + "rewards/margins": 2.639726161956787, + "rewards/rejected": -5.121848106384277, + "step": 2080 + }, + { + "epoch": 0.5014395393474088, + "grad_norm": 10.085891400781119, + "learning_rate": 2.921972167291119e-07, + "logits/chosen": -1.6471210718154907, + "logits/rejected": -1.6008189916610718, + "logps/chosen": -478.99835205078125, + "logps/rejected": -686.2216186523438, + "loss": 0.4919, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7925002574920654, + "rewards/margins": 1.8971786499023438, + "rewards/rejected": -3.689678192138672, + "step": 2090 + }, + { + "epoch": 0.5038387715930902, + "grad_norm": 10.421882065221322, + "learning_rate": 2.9013196601567567e-07, + "logits/chosen": -1.6900784969329834, + "logits/rejected": -1.6556028127670288, + "logps/chosen": -423.0419921875, + "logps/rejected": -528.6862182617188, + "loss": 0.5545, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5560649633407593, + "rewards/margins": 0.938676655292511, + "rewards/rejected": -2.494741916656494, + "step": 2100 + }, + { + "epoch": 0.5062380038387716, + "grad_norm": 8.821766288439926, + "learning_rate": 2.8806390020426555e-07, + "logits/chosen": -1.662415862083435, + "logits/rejected": -1.6548817157745361, + "logps/chosen": -453.2157287597656, + "logps/rejected": -591.32470703125, + "loss": 0.5049, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6261285543441772, + "rewards/margins": 1.4008034467697144, + "rewards/rejected": -3.0269322395324707, + "step": 2110 + }, + { + "epoch": 0.508637236084453, + "grad_norm": 12.186841933449609, + "learning_rate": 2.8599316436148187e-07, + "logits/chosen": -1.5856435298919678, + "logits/rejected": -1.5153669118881226, + "logps/chosen": -424.302490234375, + "logps/rejected": -558.4691162109375, + "loss": 0.4916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5989717245101929, + "rewards/margins": 1.3942369222640991, + "rewards/rejected": -2.993208646774292, + "step": 2120 + }, + { + "epoch": 0.5110364683301344, + "grad_norm": 9.950542312444133, + "learning_rate": 2.8391990374121723e-07, + "logits/chosen": -1.58291494846344, + "logits/rejected": -1.506280541419983, + "logps/chosen": -473.59130859375, + "logps/rejected": -741.8995361328125, + "loss": 0.4944, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1417229175567627, + "rewards/margins": 2.387235641479492, + "rewards/rejected": -4.528958320617676, + "step": 2130 + }, + { + "epoch": 0.5134357005758158, + "grad_norm": 9.848738060883088, + "learning_rate": 2.818442637744669e-07, + "logits/chosen": -1.5625739097595215, + "logits/rejected": -1.5479421615600586, + "logps/chosen": -466.6238708496094, + "logps/rejected": -644.1817626953125, + "loss": 0.4903, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0364062786102295, + "rewards/margins": 1.7278406620025635, + "rewards/rejected": -3.764247417449951, + "step": 2140 + }, + { + "epoch": 0.5158349328214972, + "grad_norm": 15.060228171309086, + "learning_rate": 2.797663900591284e-07, + "logits/chosen": -1.6554969549179077, + "logits/rejected": -1.590504765510559, + "logps/chosen": -499.38653564453125, + "logps/rejected": -656.7408447265625, + "loss": 0.4641, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2184956073760986, + "rewards/margins": 1.8517191410064697, + "rewards/rejected": -4.07021427154541, + "step": 2150 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 11.311863584668101, + "learning_rate": 2.776864283497874e-07, + "logits/chosen": -1.6083641052246094, + "logits/rejected": -1.4705779552459717, + "logps/chosen": -442.3028869628906, + "logps/rejected": -744.4788818359375, + "loss": 0.4928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9584490060806274, + "rewards/margins": 3.1203114986419678, + "rewards/rejected": -5.078760623931885, + "step": 2160 + }, + { + "epoch": 0.5206333973128598, + "grad_norm": 10.28461247325414, + "learning_rate": 2.756045245474943e-07, + "logits/chosen": -1.7539141178131104, + "logits/rejected": -1.7315905094146729, + "logps/chosen": -480.287353515625, + "logps/rejected": -616.8678588867188, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0116958618164062, + "rewards/margins": 1.2387454509735107, + "rewards/rejected": -3.250441312789917, + "step": 2170 + }, + { + "epoch": 0.5230326295585412, + "grad_norm": 9.31477755841552, + "learning_rate": 2.7352082468952977e-07, + "logits/chosen": -1.5500714778900146, + "logits/rejected": -1.497689962387085, + "logps/chosen": -472.05908203125, + "logps/rejected": -784.3190307617188, + "loss": 0.5352, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0980050563812256, + "rewards/margins": 2.975919485092163, + "rewards/rejected": -5.073924541473389, + "step": 2180 + }, + { + "epoch": 0.5254318618042226, + "grad_norm": 9.270464819675132, + "learning_rate": 2.7143547493916e-07, + "logits/chosen": -1.610926866531372, + "logits/rejected": -1.4779975414276123, + "logps/chosen": -438.76495361328125, + "logps/rejected": -772.690673828125, + "loss": 0.4729, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.654784917831421, + "rewards/margins": 3.3569869995117188, + "rewards/rejected": -5.011772632598877, + "step": 2190 + }, + { + "epoch": 0.527831094049904, + "grad_norm": 11.881300504373025, + "learning_rate": 2.693486215753853e-07, + "logits/chosen": -1.6387016773223877, + "logits/rejected": -1.5379244089126587, + "logps/chosen": -487.00250244140625, + "logps/rejected": -804.3506469726562, + "loss": 0.5173, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.203137159347534, + "rewards/margins": 3.4220173358917236, + "rewards/rejected": -5.625154495239258, + "step": 2200 + }, + { + "epoch": 0.5302303262955854, + "grad_norm": 11.209078276132704, + "learning_rate": 2.6726041098267805e-07, + "logits/chosen": -1.7330009937286377, + "logits/rejected": -1.610637903213501, + "logps/chosen": -492.36883544921875, + "logps/rejected": -618.7637329101562, + "loss": 0.5552, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8613355159759521, + "rewards/margins": 1.652645468711853, + "rewards/rejected": -3.5139803886413574, + "step": 2210 + }, + { + "epoch": 0.5326295585412668, + "grad_norm": 18.552236297372197, + "learning_rate": 2.6517098964071507e-07, + "logits/chosen": -1.6331803798675537, + "logits/rejected": -1.648450493812561, + "logps/chosen": -422.33184814453125, + "logps/rejected": -502.978271484375, + "loss": 0.5321, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5346616506576538, + "rewards/margins": 0.6287662386894226, + "rewards/rejected": -2.1634278297424316, + "step": 2220 + }, + { + "epoch": 0.5350287907869482, + "grad_norm": 21.4582048998974, + "learning_rate": 2.630805041141023e-07, + "logits/chosen": -1.5773918628692627, + "logits/rejected": -1.5001894235610962, + "logps/chosen": -385.9847412109375, + "logps/rejected": -727.9683837890625, + "loss": 0.489, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4480630159378052, + "rewards/margins": 3.2581539154052734, + "rewards/rejected": -4.706217288970947, + "step": 2230 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 26.843224623928677, + "learning_rate": 2.609891010420941e-07, + "logits/chosen": -1.6596715450286865, + "logits/rejected": -1.5660836696624756, + "logps/chosen": -512.3265380859375, + "logps/rejected": -711.9317016601562, + "loss": 0.4656, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2569899559020996, + "rewards/margins": 1.9923429489135742, + "rewards/rejected": -4.249333381652832, + "step": 2240 + }, + { + "epoch": 0.539827255278311, + "grad_norm": 10.63742659021482, + "learning_rate": 2.5889692712830674e-07, + "logits/chosen": -1.7071069478988647, + "logits/rejected": -1.6586761474609375, + "logps/chosen": -430.1871643066406, + "logps/rejected": -572.591064453125, + "loss": 0.4918, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8596538305282593, + "rewards/margins": 1.5033780336380005, + "rewards/rejected": -3.363032102584839, + "step": 2250 + }, + { + "epoch": 0.5422264875239923, + "grad_norm": 14.567934039227358, + "learning_rate": 2.5680412913042843e-07, + "logits/chosen": -1.5422347784042358, + "logits/rejected": -1.4246468544006348, + "logps/chosen": -479.8147888183594, + "logps/rejected": -750.625, + "loss": 0.5143, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.149322986602783, + "rewards/margins": 2.762144088745117, + "rewards/rejected": -4.911467552185059, + "step": 2260 + }, + { + "epoch": 0.5446257197696737, + "grad_norm": 17.356455187998073, + "learning_rate": 2.5471085384992404e-07, + "logits/chosen": -1.5986369848251343, + "logits/rejected": -1.5308504104614258, + "logps/chosen": -437.1161193847656, + "logps/rejected": -754.64453125, + "loss": 0.4815, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.717659592628479, + "rewards/margins": 3.181501626968384, + "rewards/rejected": -4.899160861968994, + "step": 2270 + }, + { + "epoch": 0.5470249520153551, + "grad_norm": 9.28953605758801, + "learning_rate": 2.526172481217381e-07, + "logits/chosen": -1.5438369512557983, + "logits/rejected": -1.5630390644073486, + "logps/chosen": -416.0315856933594, + "logps/rejected": -573.1238403320312, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.984635591506958, + "rewards/margins": 1.3827749490737915, + "rewards/rejected": -3.367410182952881, + "step": 2280 + }, + { + "epoch": 0.5494241842610365, + "grad_norm": 8.596639384497214, + "learning_rate": 2.5052345880399456e-07, + "logits/chosen": -1.6020433902740479, + "logits/rejected": -1.558334469795227, + "logps/chosen": -431.74029541015625, + "logps/rejected": -551.8709106445312, + "loss": 0.4808, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8813968896865845, + "rewards/margins": 1.2201679944992065, + "rewards/rejected": -3.10156512260437, + "step": 2290 + }, + { + "epoch": 0.5518234165067178, + "grad_norm": 11.753794107565938, + "learning_rate": 2.4842963276769555e-07, + "logits/chosen": -1.5615367889404297, + "logits/rejected": -1.5330339670181274, + "logps/chosen": -437.43084716796875, + "logps/rejected": -621.0929565429688, + "loss": 0.5155, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.09666109085083, + "rewards/margins": 1.3752198219299316, + "rewards/rejected": -3.4718806743621826, + "step": 2300 + }, + { + "epoch": 0.5542226487523992, + "grad_norm": 10.071023503443175, + "learning_rate": 2.463359168864189e-07, + "logits/chosen": -1.6465524435043335, + "logits/rejected": -1.646805763244629, + "logps/chosen": -486.73565673828125, + "logps/rejected": -566.8058471679688, + "loss": 0.5368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7878824472427368, + "rewards/margins": 1.157956838607788, + "rewards/rejected": -2.9458394050598145, + "step": 2310 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 11.942512663088241, + "learning_rate": 2.4424245802601555e-07, + "logits/chosen": -1.6439406871795654, + "logits/rejected": -1.6925067901611328, + "logps/chosen": -371.7730407714844, + "logps/rejected": -524.0623168945312, + "loss": 0.4857, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3829593658447266, + "rewards/margins": 0.9715530276298523, + "rewards/rejected": -2.3545122146606445, + "step": 2320 + }, + { + "epoch": 0.559021113243762, + "grad_norm": 12.373131204862757, + "learning_rate": 2.421494030343072e-07, + "logits/chosen": -1.5384362936019897, + "logits/rejected": -1.480912446975708, + "logps/chosen": -452.34100341796875, + "logps/rejected": -508.7037048339844, + "loss": 0.5744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7134408950805664, + "rewards/margins": 1.0949541330337524, + "rewards/rejected": -2.808394432067871, + "step": 2330 + }, + { + "epoch": 0.5614203454894434, + "grad_norm": 12.389004819032923, + "learning_rate": 2.400568987307861e-07, + "logits/chosen": -1.5702178478240967, + "logits/rejected": -1.5472772121429443, + "logps/chosen": -392.81146240234375, + "logps/rejected": -411.69317626953125, + "loss": 0.4819, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4531047344207764, + "rewards/margins": 0.3802509605884552, + "rewards/rejected": -1.8333555459976196, + "step": 2340 + }, + { + "epoch": 0.5638195777351248, + "grad_norm": 10.648864731231448, + "learning_rate": 2.379650918963156e-07, + "logits/chosen": -1.6462970972061157, + "logits/rejected": -1.6218713521957397, + "logps/chosen": -369.7726745605469, + "logps/rejected": -520.507080078125, + "loss": 0.5095, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6180322170257568, + "rewards/margins": 1.4091498851776123, + "rewards/rejected": -3.027182102203369, + "step": 2350 + }, + { + "epoch": 0.5662188099808061, + "grad_norm": 21.383140181787045, + "learning_rate": 2.3587412926283438e-07, + "logits/chosen": -1.6451104879379272, + "logits/rejected": -1.5843571424484253, + "logps/chosen": -529.2950439453125, + "logps/rejected": -677.9429321289062, + "loss": 0.5167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.010671615600586, + "rewards/margins": 1.9736932516098022, + "rewards/rejected": -3.9843647480010986, + "step": 2360 + }, + { + "epoch": 0.5686180422264875, + "grad_norm": 11.060278026039455, + "learning_rate": 2.337841575030642e-07, + "logits/chosen": -1.6365985870361328, + "logits/rejected": -1.6005491018295288, + "logps/chosen": -461.49749755859375, + "logps/rejected": -656.2427978515625, + "loss": 0.4837, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6665138006210327, + "rewards/margins": 1.8031669855117798, + "rewards/rejected": -3.4696803092956543, + "step": 2370 + }, + { + "epoch": 0.5710172744721689, + "grad_norm": 12.434301470105313, + "learning_rate": 2.316953232202206e-07, + "logits/chosen": -1.5425972938537598, + "logits/rejected": -1.5300174951553345, + "logps/chosen": -429.36444091796875, + "logps/rejected": -472.57568359375, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7053802013397217, + "rewards/margins": 1.1300846338272095, + "rewards/rejected": -2.8354649543762207, + "step": 2380 + }, + { + "epoch": 0.5734165067178503, + "grad_norm": 11.159233566131416, + "learning_rate": 2.2960777293772958e-07, + "logits/chosen": -1.4974268674850464, + "logits/rejected": -1.412022590637207, + "logps/chosen": -410.587890625, + "logps/rejected": -676.4757690429688, + "loss": 0.4907, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8208236694335938, + "rewards/margins": 2.8905091285705566, + "rewards/rejected": -4.71133279800415, + "step": 2390 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 8.941122859434415, + "learning_rate": 2.2752165308894974e-07, + "logits/chosen": -1.5879501104354858, + "logits/rejected": -1.538260579109192, + "logps/chosen": -387.39886474609375, + "logps/rejected": -541.6805419921875, + "loss": 0.4716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7686420679092407, + "rewards/margins": 1.6267540454864502, + "rewards/rejected": -3.3953964710235596, + "step": 2400 + }, + { + "epoch": 0.5782149712092131, + "grad_norm": 12.044861445889701, + "learning_rate": 2.254371100069005e-07, + "logits/chosen": -1.5681109428405762, + "logits/rejected": -1.5912139415740967, + "logps/chosen": -399.2314453125, + "logps/rejected": -558.3780517578125, + "loss": 0.4731, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4787284135818481, + "rewards/margins": 1.385317325592041, + "rewards/rejected": -2.8640456199645996, + "step": 2410 + }, + { + "epoch": 0.5806142034548945, + "grad_norm": 14.345741676788231, + "learning_rate": 2.2335428991399725e-07, + "logits/chosen": -1.4331409931182861, + "logits/rejected": -1.3503481149673462, + "logps/chosen": -473.1424255371094, + "logps/rejected": -837.6438598632812, + "loss": 0.4977, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4872748851776123, + "rewards/margins": 3.6537222862243652, + "rewards/rejected": -6.140997409820557, + "step": 2420 + }, + { + "epoch": 0.5830134357005758, + "grad_norm": 13.4432658994951, + "learning_rate": 2.2127333891179458e-07, + "logits/chosen": -1.574540138244629, + "logits/rejected": -1.505368709564209, + "logps/chosen": -423.1871032714844, + "logps/rejected": -710.91845703125, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.972616195678711, + "rewards/margins": 2.7154417037963867, + "rewards/rejected": -4.6880574226379395, + "step": 2430 + }, + { + "epoch": 0.5854126679462572, + "grad_norm": 11.856242883233412, + "learning_rate": 2.1919440297073782e-07, + "logits/chosen": -1.5872819423675537, + "logits/rejected": -1.4878933429718018, + "logps/chosen": -429.8773498535156, + "logps/rejected": -675.3303833007812, + "loss": 0.5143, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9643665552139282, + "rewards/margins": 2.4596877098083496, + "rewards/rejected": -4.424054145812988, + "step": 2440 + }, + { + "epoch": 0.5878119001919386, + "grad_norm": 12.96744179023995, + "learning_rate": 2.1711762791992368e-07, + "logits/chosen": -1.5757300853729248, + "logits/rejected": -1.498997449874878, + "logps/chosen": -521.7322998046875, + "logps/rejected": -615.30712890625, + "loss": 0.5374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.17085337638855, + "rewards/margins": 1.284227967262268, + "rewards/rejected": -3.4550812244415283, + "step": 2450 + }, + { + "epoch": 0.5902111324376199, + "grad_norm": 10.542916107819408, + "learning_rate": 2.1504315943687114e-07, + "logits/chosen": -1.6688730716705322, + "logits/rejected": -1.6244138479232788, + "logps/chosen": -413.05352783203125, + "logps/rejected": -669.3853149414062, + "loss": 0.5043, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.667035698890686, + "rewards/margins": 2.1492269039154053, + "rewards/rejected": -3.8162624835968018, + "step": 2460 + }, + { + "epoch": 0.5926103646833013, + "grad_norm": 13.515470462935513, + "learning_rate": 2.1297114303730248e-07, + "logits/chosen": -1.5288382768630981, + "logits/rejected": -1.4609134197235107, + "logps/chosen": -422.8729553222656, + "logps/rejected": -724.0157470703125, + "loss": 0.5295, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.968060851097107, + "rewards/margins": 2.4587647914886475, + "rewards/rejected": -4.426825523376465, + "step": 2470 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 14.338489620286177, + "learning_rate": 2.1090172406493616e-07, + "logits/chosen": -1.5649030208587646, + "logits/rejected": -1.5623984336853027, + "logps/chosen": -399.79290771484375, + "logps/rejected": -583.4332275390625, + "loss": 0.4501, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5557973384857178, + "rewards/margins": 1.7626543045043945, + "rewards/rejected": -3.3184516429901123, + "step": 2480 + }, + { + "epoch": 0.5974088291746641, + "grad_norm": 17.510238929607876, + "learning_rate": 2.0883504768129146e-07, + "logits/chosen": -1.6265462636947632, + "logits/rejected": -1.598730444908142, + "logps/chosen": -497.2835998535156, + "logps/rejected": -648.0611572265625, + "loss": 0.5081, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.131727695465088, + "rewards/margins": 1.5764387845993042, + "rewards/rejected": -3.7081668376922607, + "step": 2490 + }, + { + "epoch": 0.5998080614203455, + "grad_norm": 11.33500534140664, + "learning_rate": 2.0677125885550571e-07, + "logits/chosen": -1.4689067602157593, + "logits/rejected": -1.453774094581604, + "logps/chosen": -439.34075927734375, + "logps/rejected": -537.5209350585938, + "loss": 0.4906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8917917013168335, + "rewards/margins": 1.3580577373504639, + "rewards/rejected": -3.249849319458008, + "step": 2500 + }, + { + "epoch": 0.6022072936660269, + "grad_norm": 13.216366762387246, + "learning_rate": 2.0471050235416587e-07, + "logits/chosen": -1.578136920928955, + "logits/rejected": -1.5401719808578491, + "logps/chosen": -500.56072998046875, + "logps/rejected": -603.341796875, + "loss": 0.4649, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2012569904327393, + "rewards/margins": 1.514261245727539, + "rewards/rejected": -3.71551775932312, + "step": 2510 + }, + { + "epoch": 0.6046065259117083, + "grad_norm": 13.494163188740151, + "learning_rate": 2.026529227311532e-07, + "logits/chosen": -1.5589288473129272, + "logits/rejected": -1.4520585536956787, + "logps/chosen": -436.919189453125, + "logps/rejected": -646.716796875, + "loss": 0.5339, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0600430965423584, + "rewards/margins": 2.0313327312469482, + "rewards/rejected": -4.091375827789307, + "step": 2520 + }, + { + "epoch": 0.6070057581573897, + "grad_norm": 13.27727238694429, + "learning_rate": 2.005986643175036e-07, + "logits/chosen": -1.5554237365722656, + "logits/rejected": -1.5433388948440552, + "logps/chosen": -461.245849609375, + "logps/rejected": -686.7439575195312, + "loss": 0.4396, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.842258095741272, + "rewards/margins": 2.4114012718200684, + "rewards/rejected": -4.253658771514893, + "step": 2530 + }, + { + "epoch": 0.6094049904030711, + "grad_norm": 13.0139085843487, + "learning_rate": 1.9854787121128328e-07, + "logits/chosen": -1.51650071144104, + "logits/rejected": -1.4218528270721436, + "logps/chosen": -408.762939453125, + "logps/rejected": -523.1329956054688, + "loss": 0.5186, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.722747802734375, + "rewards/margins": 1.6003834009170532, + "rewards/rejected": -3.3231310844421387, + "step": 2540 + }, + { + "epoch": 0.6118042226487524, + "grad_norm": 10.279174762722011, + "learning_rate": 1.9650068726748106e-07, + "logits/chosen": -1.4751393795013428, + "logits/rejected": -1.4030897617340088, + "logps/chosen": -480.7040100097656, + "logps/rejected": -683.616943359375, + "loss": 0.5435, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9335578680038452, + "rewards/margins": 2.1118619441986084, + "rewards/rejected": -4.045419692993164, + "step": 2550 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 10.58914230166321, + "learning_rate": 1.9445725608791718e-07, + "logits/chosen": -1.4881677627563477, + "logits/rejected": -1.339155673980713, + "logps/chosen": -489.5857849121094, + "logps/rejected": -993.9429931640625, + "loss": 0.4928, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1533727645874023, + "rewards/margins": 4.9657697677612305, + "rewards/rejected": -7.119143009185791, + "step": 2560 + }, + { + "epoch": 0.6166026871401151, + "grad_norm": 10.044783509073032, + "learning_rate": 1.924177210111705e-07, + "logits/chosen": -1.5943437814712524, + "logits/rejected": -1.4688080549240112, + "logps/chosen": -434.68804931640625, + "logps/rejected": -752.3925170898438, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8741295337677002, + "rewards/margins": 3.1029107570648193, + "rewards/rejected": -4.977039813995361, + "step": 2570 + }, + { + "epoch": 0.6190019193857965, + "grad_norm": 10.803603559596482, + "learning_rate": 1.9038222510252364e-07, + "logits/chosen": -1.63632071018219, + "logits/rejected": -1.5772790908813477, + "logps/chosen": -425.29705810546875, + "logps/rejected": -547.4097900390625, + "loss": 0.4988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6209042072296143, + "rewards/margins": 1.3503049612045288, + "rewards/rejected": -2.9712090492248535, + "step": 2580 + }, + { + "epoch": 0.6214011516314779, + "grad_norm": 13.398493396695924, + "learning_rate": 1.883509111439277e-07, + "logits/chosen": -1.5346853733062744, + "logits/rejected": -1.4362837076187134, + "logps/chosen": -431.0760192871094, + "logps/rejected": -812.218017578125, + "loss": 0.5172, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9112809896469116, + "rewards/margins": 3.1443681716918945, + "rewards/rejected": -5.055649757385254, + "step": 2590 + }, + { + "epoch": 0.6238003838771593, + "grad_norm": 8.974066504769196, + "learning_rate": 1.8632392162398665e-07, + "logits/chosen": -1.6221952438354492, + "logits/rejected": -1.5492918491363525, + "logps/chosen": -503.0997009277344, + "logps/rejected": -713.653564453125, + "loss": 0.4848, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9071382284164429, + "rewards/margins": 2.196667432785034, + "rewards/rejected": -4.1038055419921875, + "step": 2600 + }, + { + "epoch": 0.6261996161228407, + "grad_norm": 10.465480641447618, + "learning_rate": 1.84301398727962e-07, + "logits/chosen": -1.463478922843933, + "logits/rejected": -1.373296856880188, + "logps/chosen": -358.58990478515625, + "logps/rejected": -702.1085205078125, + "loss": 0.5041, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6315553188323975, + "rewards/margins": 3.0978779792785645, + "rewards/rejected": -4.729433536529541, + "step": 2610 + }, + { + "epoch": 0.6285988483685221, + "grad_norm": 16.447249304360692, + "learning_rate": 1.8228348432779966e-07, + "logits/chosen": -1.601949691772461, + "logits/rejected": -1.5340659618377686, + "logps/chosen": -446.01934814453125, + "logps/rejected": -602.1727294921875, + "loss": 0.5133, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0065500736236572, + "rewards/margins": 1.6265771389007568, + "rewards/rejected": -3.633127212524414, + "step": 2620 + }, + { + "epoch": 0.6309980806142035, + "grad_norm": 9.975684368561806, + "learning_rate": 1.8027031997217773e-07, + "logits/chosen": -1.496711015701294, + "logits/rejected": -1.3451837301254272, + "logps/chosen": -453.6783752441406, + "logps/rejected": -909.4461059570312, + "loss": 0.4633, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1523375511169434, + "rewards/margins": 4.429135799407959, + "rewards/rejected": -6.581473350524902, + "step": 2630 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 9.551241278071267, + "learning_rate": 1.7826204687657758e-07, + "logits/chosen": -1.5323810577392578, + "logits/rejected": -1.4816257953643799, + "logps/chosen": -476.60723876953125, + "logps/rejected": -552.1718139648438, + "loss": 0.4779, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8675925731658936, + "rewards/margins": 1.1629068851470947, + "rewards/rejected": -3.0304996967315674, + "step": 2640 + }, + { + "epoch": 0.6357965451055663, + "grad_norm": 11.460448418061953, + "learning_rate": 1.762588059133781e-07, + "logits/chosen": -1.5307328701019287, + "logits/rejected": -1.4167407751083374, + "logps/chosen": -492.13885498046875, + "logps/rejected": -644.9505004882812, + "loss": 0.4905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7892935276031494, + "rewards/margins": 1.8879598379135132, + "rewards/rejected": -3.6772537231445312, + "step": 2650 + }, + { + "epoch": 0.6381957773512476, + "grad_norm": 18.293188421454047, + "learning_rate": 1.7426073760197406e-07, + "logits/chosen": -1.6222972869873047, + "logits/rejected": -1.5244884490966797, + "logps/chosen": -476.624267578125, + "logps/rejected": -844.6385498046875, + "loss": 0.5186, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1062722206115723, + "rewards/margins": 3.3689582347869873, + "rewards/rejected": -5.4752302169799805, + "step": 2660 + }, + { + "epoch": 0.6405950095969289, + "grad_norm": 9.733158102478546, + "learning_rate": 1.7226798209891935e-07, + "logits/chosen": -1.5991920232772827, + "logits/rejected": -1.5383590459823608, + "logps/chosen": -447.33721923828125, + "logps/rejected": -570.9419555664062, + "loss": 0.4602, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.72260320186615, + "rewards/margins": 1.8290891647338867, + "rewards/rejected": -3.551692247390747, + "step": 2670 + }, + { + "epoch": 0.6429942418426103, + "grad_norm": 13.946180043271863, + "learning_rate": 1.7028067918809535e-07, + "logits/chosen": -1.5528197288513184, + "logits/rejected": -1.3879868984222412, + "logps/chosen": -410.92718505859375, + "logps/rejected": -823.4285888671875, + "loss": 0.4851, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.739851713180542, + "rewards/margins": 3.7594234943389893, + "rewards/rejected": -5.499274730682373, + "step": 2680 + }, + { + "epoch": 0.6453934740882917, + "grad_norm": 10.290377075371278, + "learning_rate": 1.6829896827090584e-07, + "logits/chosen": -1.6512863636016846, + "logits/rejected": -1.6360222101211548, + "logps/chosen": -465.19073486328125, + "logps/rejected": -523.7138061523438, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8953874111175537, + "rewards/margins": 0.9017803072929382, + "rewards/rejected": -2.797168254852295, + "step": 2690 + }, + { + "epoch": 0.6477927063339731, + "grad_norm": 8.491674840989798, + "learning_rate": 1.6632298835649844e-07, + "logits/chosen": -1.6026594638824463, + "logits/rejected": -1.5701754093170166, + "logps/chosen": -467.7550354003906, + "logps/rejected": -753.0574951171875, + "loss": 0.4577, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8194019794464111, + "rewards/margins": 2.601040840148926, + "rewards/rejected": -4.420442581176758, + "step": 2700 + }, + { + "epoch": 0.6501919385796545, + "grad_norm": 31.306228191814924, + "learning_rate": 1.6435287805201364e-07, + "logits/chosen": -1.5099729299545288, + "logits/rejected": -1.4581714868545532, + "logps/chosen": -468.90753173828125, + "logps/rejected": -623.4779663085938, + "loss": 0.515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9226620197296143, + "rewards/margins": 1.5821655988693237, + "rewards/rejected": -3.5048279762268066, + "step": 2710 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 11.64208369065543, + "learning_rate": 1.6238877555286207e-07, + "logits/chosen": -1.6213748455047607, + "logits/rejected": -1.5881431102752686, + "logps/chosen": -452.04132080078125, + "logps/rejected": -652.6304931640625, + "loss": 0.463, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6790266036987305, + "rewards/margins": 1.9346497058868408, + "rewards/rejected": -3.6136765480041504, + "step": 2720 + }, + { + "epoch": 0.6549904030710173, + "grad_norm": 13.434697867289191, + "learning_rate": 1.60430818633031e-07, + "logits/chosen": -1.717829704284668, + "logits/rejected": -1.6711931228637695, + "logps/chosen": -415.7367248535156, + "logps/rejected": -573.2308959960938, + "loss": 0.4395, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4689064025878906, + "rewards/margins": 1.6664180755615234, + "rewards/rejected": -3.135324239730835, + "step": 2730 + }, + { + "epoch": 0.6573896353166987, + "grad_norm": 9.195466035459773, + "learning_rate": 1.5847914463541939e-07, + "logits/chosen": -1.4839345216751099, + "logits/rejected": -1.4070708751678467, + "logps/chosen": -408.98046875, + "logps/rejected": -607.1571044921875, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.960526704788208, + "rewards/margins": 1.7776432037353516, + "rewards/rejected": -3.7381699085235596, + "step": 2740 + }, + { + "epoch": 0.6597888675623801, + "grad_norm": 8.168397030598346, + "learning_rate": 1.5653389046220427e-07, + "logits/chosen": -1.5618484020233154, + "logits/rejected": -1.5674700736999512, + "logps/chosen": -410.63739013671875, + "logps/rejected": -545.4283447265625, + "loss": 0.5017, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5986849069595337, + "rewards/margins": 1.2204779386520386, + "rewards/rejected": -2.8191628456115723, + "step": 2750 + }, + { + "epoch": 0.6621880998080614, + "grad_norm": 15.795620020388752, + "learning_rate": 1.545951925652375e-07, + "logits/chosen": -1.574741244316101, + "logits/rejected": -1.5150604248046875, + "logps/chosen": -502.17840576171875, + "logps/rejected": -597.1779174804688, + "loss": 0.4903, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.739625334739685, + "rewards/margins": 1.5817149877548218, + "rewards/rejected": -3.3213400840759277, + "step": 2760 + }, + { + "epoch": 0.6645873320537428, + "grad_norm": 12.564082860552478, + "learning_rate": 1.5266318693647423e-07, + "logits/chosen": -1.604776382446289, + "logits/rejected": -1.5733304023742676, + "logps/chosen": -460.91168212890625, + "logps/rejected": -548.9085083007812, + "loss": 0.4572, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.717616081237793, + "rewards/margins": 0.9698230624198914, + "rewards/rejected": -2.68743896484375, + "step": 2770 + }, + { + "epoch": 0.6669865642994242, + "grad_norm": 12.201358595051783, + "learning_rate": 1.5073800909843353e-07, + "logits/chosen": -1.6035501956939697, + "logits/rejected": -1.4797896146774292, + "logps/chosen": -501.05548095703125, + "logps/rejected": -667.8190307617188, + "loss": 0.4558, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.150601863861084, + "rewards/margins": 2.3036932945251465, + "rewards/rejected": -4.4542951583862305, + "step": 2780 + }, + { + "epoch": 0.6693857965451055, + "grad_norm": 10.795967175298328, + "learning_rate": 1.488197940946922e-07, + "logits/chosen": -1.5843151807785034, + "logits/rejected": -1.5326802730560303, + "logps/chosen": -501.5411682128906, + "logps/rejected": -611.32568359375, + "loss": 0.4764, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1024060249328613, + "rewards/margins": 1.6799418926239014, + "rewards/rejected": -3.7823474407196045, + "step": 2790 + }, + { + "epoch": 0.6717850287907869, + "grad_norm": 28.011002170203856, + "learning_rate": 1.4690867648041167e-07, + "logits/chosen": -1.5918710231781006, + "logits/rejected": -1.4779281616210938, + "logps/chosen": -473.7042541503906, + "logps/rejected": -662.2276000976562, + "loss": 0.5129, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0022711753845215, + "rewards/margins": 2.1550307273864746, + "rewards/rejected": -4.157301902770996, + "step": 2800 + }, + { + "epoch": 0.6741842610364683, + "grad_norm": 12.528624583800916, + "learning_rate": 1.4500479031289987e-07, + "logits/chosen": -1.6873347759246826, + "logits/rejected": -1.596543312072754, + "logps/chosen": -460.88909912109375, + "logps/rejected": -586.3327026367188, + "loss": 0.5198, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7791045904159546, + "rewards/margins": 1.3285664319992065, + "rewards/rejected": -3.1076712608337402, + "step": 2810 + }, + { + "epoch": 0.6765834932821497, + "grad_norm": 11.065013602236313, + "learning_rate": 1.4310826914220747e-07, + "logits/chosen": -1.7246005535125732, + "logits/rejected": -1.7012087106704712, + "logps/chosen": -493.4828186035156, + "logps/rejected": -576.1470336914062, + "loss": 0.5056, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8482357263565063, + "rewards/margins": 0.951060950756073, + "rewards/rejected": -2.7992968559265137, + "step": 2820 + }, + { + "epoch": 0.6789827255278311, + "grad_norm": 11.565962251192635, + "learning_rate": 1.412192460017597e-07, + "logits/chosen": -1.5370794534683228, + "logits/rejected": -1.459987759590149, + "logps/chosen": -463.569580078125, + "logps/rejected": -711.2032470703125, + "loss": 0.5085, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.087894916534424, + "rewards/margins": 2.450357675552368, + "rewards/rejected": -4.538252353668213, + "step": 2830 + }, + { + "epoch": 0.6813819577735125, + "grad_norm": 9.118437836656124, + "learning_rate": 1.3933785339902504e-07, + "logits/chosen": -1.5175681114196777, + "logits/rejected": -1.5240638256072998, + "logps/chosen": -382.7988586425781, + "logps/rejected": -606.9676513671875, + "loss": 0.5099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6014810800552368, + "rewards/margins": 1.8436975479125977, + "rewards/rejected": -3.445178508758545, + "step": 2840 + }, + { + "epoch": 0.6837811900191939, + "grad_norm": 8.445711443707477, + "learning_rate": 1.374642233062197e-07, + "logits/chosen": -1.6221126317977905, + "logits/rejected": -1.5964024066925049, + "logps/chosen": -470.7940368652344, + "logps/rejected": -608.9495239257812, + "loss": 0.5176, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7363065481185913, + "rewards/margins": 1.7359542846679688, + "rewards/rejected": -3.4722609519958496, + "step": 2850 + }, + { + "epoch": 0.6861804222648752, + "grad_norm": 8.500425735666235, + "learning_rate": 1.355984871510511e-07, + "logits/chosen": -1.5512058734893799, + "logits/rejected": -1.5746601819992065, + "logps/chosen": -498.8207092285156, + "logps/rejected": -657.9988403320312, + "loss": 0.4596, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9174222946166992, + "rewards/margins": 1.5131709575653076, + "rewards/rejected": -3.430593490600586, + "step": 2860 + }, + { + "epoch": 0.6885796545105566, + "grad_norm": 12.852665923928024, + "learning_rate": 1.3374077580749783e-07, + "logits/chosen": -1.6059240102767944, + "logits/rejected": -1.5994579792022705, + "logps/chosen": -344.4139404296875, + "logps/rejected": -512.6463623046875, + "loss": 0.4934, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4501264095306396, + "rewards/margins": 1.4970743656158447, + "rewards/rejected": -2.9472010135650635, + "step": 2870 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 17.194251207513553, + "learning_rate": 1.3189121958663024e-07, + "logits/chosen": -1.643771767616272, + "logits/rejected": -1.5831727981567383, + "logps/chosen": -502.8651428222656, + "logps/rejected": -554.9192504882812, + "loss": 0.493, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0862865447998047, + "rewards/margins": 0.8170648813247681, + "rewards/rejected": -2.903351306915283, + "step": 2880 + }, + { + "epoch": 0.6933781190019194, + "grad_norm": 12.906100640591236, + "learning_rate": 1.3004994822746895e-07, + "logits/chosen": -1.7391399145126343, + "logits/rejected": -1.6716740131378174, + "logps/chosen": -431.04180908203125, + "logps/rejected": -561.5888671875, + "loss": 0.5147, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6912224292755127, + "rewards/margins": 1.2724682092666626, + "rewards/rejected": -2.9636902809143066, + "step": 2890 + }, + { + "epoch": 0.6957773512476008, + "grad_norm": 9.565619443461362, + "learning_rate": 1.2821709088788434e-07, + "logits/chosen": -1.5313981771469116, + "logits/rejected": -1.5281095504760742, + "logps/chosen": -398.6699523925781, + "logps/rejected": -540.1404418945312, + "loss": 0.5184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8083369731903076, + "rewards/margins": 1.3905736207962036, + "rewards/rejected": -3.198910713195801, + "step": 2900 + }, + { + "epoch": 0.6981765834932822, + "grad_norm": 15.033651183019835, + "learning_rate": 1.2639277613553736e-07, + "logits/chosen": -1.469612717628479, + "logits/rejected": -1.414353609085083, + "logps/chosen": -363.5370178222656, + "logps/rejected": -500.765380859375, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.468360185623169, + "rewards/margins": 1.3389683961868286, + "rewards/rejected": -2.807328701019287, + "step": 2910 + }, + { + "epoch": 0.7005758157389635, + "grad_norm": 9.435304971614766, + "learning_rate": 1.2457713193885975e-07, + "logits/chosen": -1.461320161819458, + "logits/rejected": -1.4299747943878174, + "logps/chosen": -376.7994079589844, + "logps/rejected": -629.6224975585938, + "loss": 0.4765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.896950364112854, + "rewards/margins": 2.1872236728668213, + "rewards/rejected": -4.084174156188965, + "step": 2920 + }, + { + "epoch": 0.7029750479846449, + "grad_norm": 19.51031494795868, + "learning_rate": 1.2277028565807838e-07, + "logits/chosen": -1.6188557147979736, + "logits/rejected": -1.5686419010162354, + "logps/chosen": -430.37677001953125, + "logps/rejected": -559.7628173828125, + "loss": 0.4852, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6697213649749756, + "rewards/margins": 1.4008890390396118, + "rewards/rejected": -3.070610523223877, + "step": 2930 + }, + { + "epoch": 0.7053742802303263, + "grad_norm": 9.121829865634552, + "learning_rate": 1.209723640362815e-07, + "logits/chosen": -1.6373344659805298, + "logits/rejected": -1.5528385639190674, + "logps/chosen": -470.95147705078125, + "logps/rejected": -721.1168212890625, + "loss": 0.5449, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9253082275390625, + "rewards/margins": 2.59212327003479, + "rewards/rejected": -4.517431259155273, + "step": 2940 + }, + { + "epoch": 0.7077735124760077, + "grad_norm": 10.746753751922695, + "learning_rate": 1.191834931905277e-07, + "logits/chosen": -1.6567723751068115, + "logits/rejected": -1.6191829442977905, + "logps/chosen": -511.308349609375, + "logps/rejected": -676.2968139648438, + "loss": 0.5049, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9652636051177979, + "rewards/margins": 1.6685765981674194, + "rewards/rejected": -3.6338400840759277, + "step": 2950 + }, + { + "epoch": 0.710172744721689, + "grad_norm": 9.111246533138369, + "learning_rate": 1.1740379860299988e-07, + "logits/chosen": -1.605023741722107, + "logits/rejected": -1.6224443912506104, + "logps/chosen": -493.97021484375, + "logps/rejected": -663.3172607421875, + "loss": 0.517, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9065659046173096, + "rewards/margins": 1.4981725215911865, + "rewards/rejected": -3.404738664627075, + "step": 2960 + }, + { + "epoch": 0.7125719769673704, + "grad_norm": 19.195509902532088, + "learning_rate": 1.1563340511220254e-07, + "logits/chosen": -1.5975620746612549, + "logits/rejected": -1.5121543407440186, + "logps/chosen": -490.51123046875, + "logps/rejected": -638.4617309570312, + "loss": 0.5129, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8694756031036377, + "rewards/margins": 1.672284722328186, + "rewards/rejected": -3.541760206222534, + "step": 2970 + }, + { + "epoch": 0.7149712092130518, + "grad_norm": 7.617751594360228, + "learning_rate": 1.1387243690420556e-07, + "logits/chosen": -1.6257362365722656, + "logits/rejected": -1.556630253791809, + "logps/chosen": -514.2935180664062, + "logps/rejected": -719.3594360351562, + "loss": 0.5041, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7584432363510132, + "rewards/margins": 2.219550371170044, + "rewards/rejected": -3.9779934883117676, + "step": 2980 + }, + { + "epoch": 0.7173704414587332, + "grad_norm": 13.715441326355377, + "learning_rate": 1.1212101750393235e-07, + "logits/chosen": -1.5823287963867188, + "logits/rejected": -1.4760550260543823, + "logps/chosen": -454.1546936035156, + "logps/rejected": -674.0308837890625, + "loss": 0.4824, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.85116446018219, + "rewards/margins": 2.426201581954956, + "rewards/rejected": -4.277366638183594, + "step": 2990 + }, + { + "epoch": 0.7197696737044146, + "grad_norm": 11.870152015817894, + "learning_rate": 1.1037926976649562e-07, + "logits/chosen": -1.6472456455230713, + "logits/rejected": -1.5713512897491455, + "logps/chosen": -454.6564025878906, + "logps/rejected": -658.142578125, + "loss": 0.5097, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7928310632705688, + "rewards/margins": 1.8045374155044556, + "rewards/rejected": -3.5973682403564453, + "step": 3000 + }, + { + "epoch": 0.722168905950096, + "grad_norm": 12.133762665442388, + "learning_rate": 1.0864731586857936e-07, + "logits/chosen": -1.5800144672393799, + "logits/rejected": -1.4943416118621826, + "logps/chosen": -462.8450622558594, + "logps/rejected": -622.0472412109375, + "loss": 0.4795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.633138656616211, + "rewards/margins": 1.8899364471435547, + "rewards/rejected": -3.5230751037597656, + "step": 3010 + }, + { + "epoch": 0.7245681381957774, + "grad_norm": 15.184681556970155, + "learning_rate": 1.0692527729986839e-07, + "logits/chosen": -1.6487934589385986, + "logits/rejected": -1.5611571073532104, + "logps/chosen": -443.3125, + "logps/rejected": -629.9989624023438, + "loss": 0.4486, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7226934432983398, + "rewards/margins": 2.09248948097229, + "rewards/rejected": -3.81518292427063, + "step": 3020 + }, + { + "epoch": 0.7269673704414588, + "grad_norm": 8.881606654301349, + "learning_rate": 1.0521327485452692e-07, + "logits/chosen": -1.50355064868927, + "logits/rejected": -1.4316844940185547, + "logps/chosen": -454.41375732421875, + "logps/rejected": -705.1302490234375, + "loss": 0.4685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0102181434631348, + "rewards/margins": 2.672572612762451, + "rewards/rejected": -4.682791233062744, + "step": 3030 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 12.06878430856091, + "learning_rate": 1.0351142862272468e-07, + "logits/chosen": -1.5602006912231445, + "logits/rejected": -1.397483229637146, + "logps/chosen": -426.56011962890625, + "logps/rejected": -804.3447265625, + "loss": 0.488, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9955031871795654, + "rewards/margins": 3.9034297466278076, + "rewards/rejected": -5.898933410644531, + "step": 3040 + }, + { + "epoch": 0.7317658349328215, + "grad_norm": 12.298414208256554, + "learning_rate": 1.0181985798221343e-07, + "logits/chosen": -1.5111182928085327, + "logits/rejected": -1.4830925464630127, + "logps/chosen": -453.55706787109375, + "logps/rejected": -682.9276123046875, + "loss": 0.4944, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8569743633270264, + "rewards/margins": 2.276150703430176, + "rewards/rejected": -4.133125305175781, + "step": 3050 + }, + { + "epoch": 0.7341650671785028, + "grad_norm": 11.302209741727303, + "learning_rate": 1.0013868158995329e-07, + "logits/chosen": -1.479041337966919, + "logits/rejected": -1.4589459896087646, + "logps/chosen": -488.8414001464844, + "logps/rejected": -664.2630615234375, + "loss": 0.4896, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.093553066253662, + "rewards/margins": 1.9455658197402954, + "rewards/rejected": -4.039118766784668, + "step": 3060 + }, + { + "epoch": 0.7365642994241842, + "grad_norm": 9.425382101973028, + "learning_rate": 9.84680173737887e-08, + "logits/chosen": -1.6392253637313843, + "logits/rejected": -1.6131207942962646, + "logps/chosen": -453.7015686035156, + "logps/rejected": -568.0528564453125, + "loss": 0.505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7246614694595337, + "rewards/margins": 1.5568969249725342, + "rewards/rejected": -3.2815582752227783, + "step": 3070 + }, + { + "epoch": 0.7389635316698656, + "grad_norm": 10.214651228125971, + "learning_rate": 9.680798252417713e-08, + "logits/chosen": -1.5981972217559814, + "logits/rejected": -1.5456653833389282, + "logps/chosen": -404.54217529296875, + "logps/rejected": -615.52490234375, + "loss": 0.4716, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8212268352508545, + "rewards/margins": 1.7736434936523438, + "rewards/rejected": -3.5948708057403564, + "step": 3080 + }, + { + "epoch": 0.741362763915547, + "grad_norm": 9.96347940855632, + "learning_rate": 9.515869348596808e-08, + "logits/chosen": -1.7033554315567017, + "logits/rejected": -1.689077615737915, + "logps/chosen": -479.61602783203125, + "logps/rejected": -600.86083984375, + "loss": 0.4722, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7726186513900757, + "rewards/margins": 1.4355189800262451, + "rewards/rejected": -3.2081375122070312, + "step": 3090 + }, + { + "epoch": 0.7437619961612284, + "grad_norm": 9.182568453922713, + "learning_rate": 9.352026595023493e-08, + "logits/chosen": -1.7283750772476196, + "logits/rejected": -1.651894211769104, + "logps/chosen": -472.06890869140625, + "logps/rejected": -567.2432250976562, + "loss": 0.4948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7672761678695679, + "rewards/margins": 1.1923803091049194, + "rewards/rejected": -2.9596564769744873, + "step": 3100 + }, + { + "epoch": 0.7461612284069098, + "grad_norm": 13.775785773649671, + "learning_rate": 9.189281484616004e-08, + "logits/chosen": -1.5656628608703613, + "logits/rejected": -1.5381602048873901, + "logps/chosen": -405.3626403808594, + "logps/rejected": -634.8448486328125, + "loss": 0.5248, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9110819101333618, + "rewards/margins": 1.8859049081802368, + "rewards/rejected": -3.7969868183135986, + "step": 3110 + }, + { + "epoch": 0.7485604606525912, + "grad_norm": 18.73925450329653, + "learning_rate": 9.027645433297249e-08, + "logits/chosen": -1.5921382904052734, + "logits/rejected": -1.3741363286972046, + "logps/chosen": -594.7786865234375, + "logps/rejected": -781.3391723632812, + "loss": 0.5287, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.647385597229004, + "rewards/margins": 2.289605140686035, + "rewards/rejected": -4.936990737915039, + "step": 3120 + }, + { + "epoch": 0.7509596928982726, + "grad_norm": 24.661577248495657, + "learning_rate": 8.867129779194066e-08, + "logits/chosen": -1.5586223602294922, + "logits/rejected": -1.4272655248641968, + "logps/chosen": -389.38580322265625, + "logps/rejected": -706.0331420898438, + "loss": 0.5109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.627242088317871, + "rewards/margins": 3.178426742553711, + "rewards/rejected": -4.805668830871582, + "step": 3130 + }, + { + "epoch": 0.753358925143954, + "grad_norm": 13.393978248198778, + "learning_rate": 8.707745781841866e-08, + "logits/chosen": -1.5590431690216064, + "logits/rejected": -1.4452258348464966, + "logps/chosen": -470.12890625, + "logps/rejected": -711.6036376953125, + "loss": 0.5168, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2957184314727783, + "rewards/margins": 2.5038769245147705, + "rewards/rejected": -4.799595355987549, + "step": 3140 + }, + { + "epoch": 0.7557581573896354, + "grad_norm": 15.326596868212043, + "learning_rate": 8.549504621394831e-08, + "logits/chosen": -1.6833692789077759, + "logits/rejected": -1.5196136236190796, + "logps/chosen": -434.9501953125, + "logps/rejected": -797.6722412109375, + "loss": 0.4405, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8721044063568115, + "rewards/margins": 3.692800998687744, + "rewards/rejected": -5.564905166625977, + "step": 3150 + }, + { + "epoch": 0.7581573896353166, + "grad_norm": 15.888863470601404, + "learning_rate": 8.392417397841703e-08, + "logits/chosen": -1.576700210571289, + "logits/rejected": -1.5513032674789429, + "logps/chosen": -457.85125732421875, + "logps/rejected": -624.4636840820312, + "loss": 0.4774, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7959423065185547, + "rewards/margins": 1.5467565059661865, + "rewards/rejected": -3.3426990509033203, + "step": 3160 + }, + { + "epoch": 0.760556621880998, + "grad_norm": 80.57455805524923, + "learning_rate": 8.236495130227083e-08, + "logits/chosen": -1.5804119110107422, + "logits/rejected": -1.4564083814620972, + "logps/chosen": -529.6447143554688, + "logps/rejected": -735.8713989257812, + "loss": 0.5082, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.244884967803955, + "rewards/margins": 2.5058138370513916, + "rewards/rejected": -4.750698566436768, + "step": 3170 + }, + { + "epoch": 0.7629558541266794, + "grad_norm": 11.057339252820107, + "learning_rate": 8.081748755878612e-08, + "logits/chosen": -1.6222680807113647, + "logits/rejected": -1.5655839443206787, + "logps/chosen": -490.6512756347656, + "logps/rejected": -579.689208984375, + "loss": 0.494, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9746372699737549, + "rewards/margins": 1.4921201467514038, + "rewards/rejected": -3.466757297515869, + "step": 3180 + }, + { + "epoch": 0.7653550863723608, + "grad_norm": 10.618176488071374, + "learning_rate": 7.928189129639632e-08, + "logits/chosen": -1.5830059051513672, + "logits/rejected": -1.5720051527023315, + "logps/chosen": -406.87188720703125, + "logps/rejected": -586.8067016601562, + "loss": 0.4807, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.686078429222107, + "rewards/margins": 1.659711241722107, + "rewards/rejected": -3.345789670944214, + "step": 3190 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 13.680452637120762, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -1.6163785457611084, + "logits/rejected": -1.5762126445770264, + "logps/chosen": -446.047607421875, + "logps/rejected": -623.6538696289062, + "loss": 0.5153, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9947631359100342, + "rewards/margins": 1.5673038959503174, + "rewards/rejected": -3.5620665550231934, + "step": 3200 + }, + { + "epoch": 0.7701535508637236, + "grad_norm": 14.433630119675083, + "learning_rate": 7.624673123879682e-08, + "logits/chosen": -1.7464625835418701, + "logits/rejected": -1.6586641073226929, + "logps/chosen": -430.14776611328125, + "logps/rejected": -545.7342529296875, + "loss": 0.5204, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7657415866851807, + "rewards/margins": 1.3537575006484985, + "rewards/rejected": -3.1194987297058105, + "step": 3210 + }, + { + "epoch": 0.772552783109405, + "grad_norm": 12.66091896209343, + "learning_rate": 7.474738034800663e-08, + "logits/chosen": -1.5930930376052856, + "logits/rejected": -1.4509809017181396, + "logps/chosen": -376.0271301269531, + "logps/rejected": -682.3421630859375, + "loss": 0.5118, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4719572067260742, + "rewards/margins": 3.3040995597839355, + "rewards/rejected": -4.776057243347168, + "step": 3220 + }, + { + "epoch": 0.7749520153550864, + "grad_norm": 10.980304828194955, + "learning_rate": 7.326032273221606e-08, + "logits/chosen": -1.6833438873291016, + "logits/rejected": -1.622230887413025, + "logps/chosen": -491.75860595703125, + "logps/rejected": -625.6595458984375, + "loss": 0.4872, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9277355670928955, + "rewards/margins": 1.5761892795562744, + "rewards/rejected": -3.503924608230591, + "step": 3230 + }, + { + "epoch": 0.7773512476007678, + "grad_norm": 10.749977033774096, + "learning_rate": 7.178566270260872e-08, + "logits/chosen": -1.5899261236190796, + "logits/rejected": -1.5829485654830933, + "logps/chosen": -477.94805908203125, + "logps/rejected": -673.7672729492188, + "loss": 0.5126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0385262966156006, + "rewards/margins": 1.790924072265625, + "rewards/rejected": -3.8294501304626465, + "step": 3240 + }, + { + "epoch": 0.7797504798464492, + "grad_norm": 8.484431263168627, + "learning_rate": 7.032350370072709e-08, + "logits/chosen": -1.6738132238388062, + "logits/rejected": -1.6105749607086182, + "logps/chosen": -458.83331298828125, + "logps/rejected": -622.7803955078125, + "loss": 0.4562, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.730473518371582, + "rewards/margins": 1.7545102834701538, + "rewards/rejected": -3.4849839210510254, + "step": 3250 + }, + { + "epoch": 0.7821497120921305, + "grad_norm": 9.387892269267574, + "learning_rate": 6.887394829121596e-08, + "logits/chosen": -1.5848913192749023, + "logits/rejected": -1.4476993083953857, + "logps/chosen": -471.938720703125, + "logps/rejected": -831.1727294921875, + "loss": 0.4836, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8803211450576782, + "rewards/margins": 3.8900272846221924, + "rewards/rejected": -5.77034854888916, + "step": 3260 + }, + { + "epoch": 0.7845489443378119, + "grad_norm": 18.641825226968827, + "learning_rate": 6.743709815462833e-08, + "logits/chosen": -1.6654167175292969, + "logits/rejected": -1.5044206380844116, + "logps/chosen": -483.08917236328125, + "logps/rejected": -693.2271728515625, + "loss": 0.4752, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.000235080718994, + "rewards/margins": 2.46742582321167, + "rewards/rejected": -4.467661380767822, + "step": 3270 + }, + { + "epoch": 0.7869481765834933, + "grad_norm": 12.349301851226299, + "learning_rate": 6.601305408029287e-08, + "logits/chosen": -1.5291095972061157, + "logits/rejected": -1.419311285018921, + "logps/chosen": -436.09454345703125, + "logps/rejected": -668.6248779296875, + "loss": 0.4694, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8284947872161865, + "rewards/margins": 2.354825496673584, + "rewards/rejected": -4.18332052230835, + "step": 3280 + }, + { + "epoch": 0.7893474088291746, + "grad_norm": 9.060603124592086, + "learning_rate": 6.460191595924366e-08, + "logits/chosen": -1.6744062900543213, + "logits/rejected": -1.6133877038955688, + "logps/chosen": -455.89495849609375, + "logps/rejected": -635.6976318359375, + "loss": 0.4573, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.951505422592163, + "rewards/margins": 1.7754148244857788, + "rewards/rejected": -3.7269206047058105, + "step": 3290 + }, + { + "epoch": 0.791746641074856, + "grad_norm": 12.585511956386624, + "learning_rate": 6.320378277721342e-08, + "logits/chosen": -1.598135232925415, + "logits/rejected": -1.6028814315795898, + "logps/chosen": -483.92535400390625, + "logps/rejected": -569.0828247070312, + "loss": 0.4735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1339850425720215, + "rewards/margins": 1.0029327869415283, + "rewards/rejected": -3.136918067932129, + "step": 3300 + }, + { + "epoch": 0.7941458733205374, + "grad_norm": 13.389579263658327, + "learning_rate": 6.181875260769032e-08, + "logits/chosen": -1.640591025352478, + "logits/rejected": -1.5519497394561768, + "logps/chosen": -507.2039489746094, + "logps/rejected": -642.580322265625, + "loss": 0.4724, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0453391075134277, + "rewards/margins": 2.058948040008545, + "rewards/rejected": -4.104287147521973, + "step": 3310 + }, + { + "epoch": 0.7965451055662188, + "grad_norm": 13.291226613775438, + "learning_rate": 6.044692260503797e-08, + "logits/chosen": -1.569854736328125, + "logits/rejected": -1.5137242078781128, + "logps/chosen": -525.5565185546875, + "logps/rejected": -748.2379150390625, + "loss": 0.443, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.161888360977173, + "rewards/margins": 2.5372633934020996, + "rewards/rejected": -4.699151515960693, + "step": 3320 + }, + { + "epoch": 0.7989443378119002, + "grad_norm": 11.742394321174341, + "learning_rate": 5.9088388997680984e-08, + "logits/chosen": -1.680760383605957, + "logits/rejected": -1.5625580549240112, + "logps/chosen": -540.314453125, + "logps/rejected": -675.5418090820312, + "loss": 0.4639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9422404766082764, + "rewards/margins": 2.1821341514587402, + "rewards/rejected": -4.1243743896484375, + "step": 3330 + }, + { + "epoch": 0.8013435700575816, + "grad_norm": 10.388328753668427, + "learning_rate": 5.774324708135439e-08, + "logits/chosen": -1.631049394607544, + "logits/rejected": -1.5399057865142822, + "logps/chosen": -395.74127197265625, + "logps/rejected": -528.3821411132812, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6879892349243164, + "rewards/margins": 1.5684130191802979, + "rewards/rejected": -3.2564022541046143, + "step": 3340 + }, + { + "epoch": 0.803742802303263, + "grad_norm": 8.566813415550278, + "learning_rate": 5.641159121241953e-08, + "logits/chosen": -1.5318087339401245, + "logits/rejected": -1.3939533233642578, + "logps/chosen": -455.18475341796875, + "logps/rejected": -784.6776123046875, + "loss": 0.4742, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1146061420440674, + "rewards/margins": 3.002748966217041, + "rewards/rejected": -5.1173553466796875, + "step": 3350 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 24.581271633624063, + "learning_rate": 5.5093514801245106e-08, + "logits/chosen": -1.572483777999878, + "logits/rejected": -1.5058854818344116, + "logps/chosen": -462.61798095703125, + "logps/rejected": -692.5750732421875, + "loss": 0.5167, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.017944812774658, + "rewards/margins": 2.093128204345703, + "rewards/rejected": -4.111073017120361, + "step": 3360 + }, + { + "epoch": 0.8085412667946257, + "grad_norm": 11.15477109986071, + "learning_rate": 5.378911030565453e-08, + "logits/chosen": -1.6517555713653564, + "logits/rejected": -1.6141020059585571, + "logps/chosen": -524.3145751953125, + "logps/rejected": -706.8885498046875, + "loss": 0.4856, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.254424571990967, + "rewards/margins": 1.5656322240829468, + "rewards/rejected": -3.820056438446045, + "step": 3370 + }, + { + "epoch": 0.8109404990403071, + "grad_norm": 14.053000697829326, + "learning_rate": 5.249846922444101e-08, + "logits/chosen": -1.565073013305664, + "logits/rejected": -1.3497178554534912, + "logps/chosen": -454.67254638671875, + "logps/rejected": -954.7060546875, + "loss": 0.4615, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2211482524871826, + "rewards/margins": 5.220716953277588, + "rewards/rejected": -7.441864967346191, + "step": 3380 + }, + { + "epoch": 0.8133397312859885, + "grad_norm": 14.873913908271929, + "learning_rate": 5.122168209094865e-08, + "logits/chosen": -1.5818357467651367, + "logits/rejected": -1.6091349124908447, + "logps/chosen": -396.847412109375, + "logps/rejected": -471.558837890625, + "loss": 0.4921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7213027477264404, + "rewards/margins": 0.7357383966445923, + "rewards/rejected": -2.457041025161743, + "step": 3390 + }, + { + "epoch": 0.8157389635316699, + "grad_norm": 7.662127553049077, + "learning_rate": 4.995883846672222e-08, + "logits/chosen": -1.7527958154678345, + "logits/rejected": -1.6081327199935913, + "logps/chosen": -597.8883666992188, + "logps/rejected": -634.3714599609375, + "loss": 0.4928, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.129807472229004, + "rewards/margins": 1.1348294019699097, + "rewards/rejected": -3.2646374702453613, + "step": 3400 + }, + { + "epoch": 0.8181381957773513, + "grad_norm": 9.295843025722531, + "learning_rate": 4.871002693522486e-08, + "logits/chosen": -1.6534563302993774, + "logits/rejected": -1.5655500888824463, + "logps/chosen": -467.03948974609375, + "logps/rejected": -591.6249389648438, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7798774242401123, + "rewards/margins": 1.6510403156280518, + "rewards/rejected": -3.4309182167053223, + "step": 3410 + }, + { + "epoch": 0.8205374280230326, + "grad_norm": 11.329859485907628, + "learning_rate": 4.7475335095623956e-08, + "logits/chosen": -1.6167590618133545, + "logits/rejected": -1.5307334661483765, + "logps/chosen": -470.58270263671875, + "logps/rejected": -668.56640625, + "loss": 0.4725, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9665504693984985, + "rewards/margins": 2.131230592727661, + "rewards/rejected": -4.097781181335449, + "step": 3420 + }, + { + "epoch": 0.822936660268714, + "grad_norm": 20.11235991117846, + "learning_rate": 4.6254849556646714e-08, + "logits/chosen": -1.670371413230896, + "logits/rejected": -1.4920861721038818, + "logps/chosen": -548.06787109375, + "logps/rejected": -802.3192138671875, + "loss": 0.5013, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2579779624938965, + "rewards/margins": 3.0247139930725098, + "rewards/rejected": -5.282691955566406, + "step": 3430 + }, + { + "epoch": 0.8253358925143954, + "grad_norm": 11.445798775826706, + "learning_rate": 4.504865593050483e-08, + "logits/chosen": -1.6630195379257202, + "logits/rejected": -1.5998250246047974, + "logps/chosen": -498.11968994140625, + "logps/rejected": -625.1238403320312, + "loss": 0.5091, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1429293155670166, + "rewards/margins": 1.2725276947021484, + "rewards/rejected": -3.415456771850586, + "step": 3440 + }, + { + "epoch": 0.8277351247600768, + "grad_norm": 11.346669302814137, + "learning_rate": 4.385683882688895e-08, + "logits/chosen": -1.723249077796936, + "logits/rejected": -1.665723443031311, + "logps/chosen": -476.0176696777344, + "logps/rejected": -490.48779296875, + "loss": 0.5342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8500604629516602, + "rewards/margins": 0.6172209978103638, + "rewards/rejected": -2.4672813415527344, + "step": 3450 + }, + { + "epoch": 0.8301343570057581, + "grad_norm": 15.969215547104008, + "learning_rate": 4.2679481847033985e-08, + "logits/chosen": -1.588076114654541, + "logits/rejected": -1.5232843160629272, + "logps/chosen": -461.88848876953125, + "logps/rejected": -622.911865234375, + "loss": 0.5174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8116493225097656, + "rewards/margins": 1.6616909503936768, + "rewards/rejected": -3.4733402729034424, + "step": 3460 + }, + { + "epoch": 0.8325335892514395, + "grad_norm": 11.072153331210435, + "learning_rate": 4.151666757785435e-08, + "logits/chosen": -1.5928449630737305, + "logits/rejected": -1.5169405937194824, + "logps/chosen": -427.0491638183594, + "logps/rejected": -672.35107421875, + "loss": 0.4681, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7070322036743164, + "rewards/margins": 2.5327906608581543, + "rewards/rejected": -4.239823341369629, + "step": 3470 + }, + { + "epoch": 0.8349328214971209, + "grad_norm": 9.49570882884042, + "learning_rate": 4.036847758615136e-08, + "logits/chosen": -1.6075379848480225, + "logits/rejected": -1.5888822078704834, + "logps/chosen": -491.94390869140625, + "logps/rejected": -652.8501586914062, + "loss": 0.5274, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3509058952331543, + "rewards/margins": 1.588134765625, + "rewards/rejected": -3.939040422439575, + "step": 3480 + }, + { + "epoch": 0.8373320537428023, + "grad_norm": 11.923530456024624, + "learning_rate": 3.923499241289113e-08, + "logits/chosen": -1.6823714971542358, + "logits/rejected": -1.593367338180542, + "logps/chosen": -547.1072998046875, + "logps/rejected": -672.8248901367188, + "loss": 0.5441, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.304713249206543, + "rewards/margins": 1.8532909154891968, + "rewards/rejected": -4.158003807067871, + "step": 3490 + }, + { + "epoch": 0.8397312859884837, + "grad_norm": 10.522005050954938, + "learning_rate": 3.811629156755541e-08, + "logits/chosen": -1.6868102550506592, + "logits/rejected": -1.6846917867660522, + "logps/chosen": -485.7550354003906, + "logps/rejected": -589.5763549804688, + "loss": 0.4966, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7846243381500244, + "rewards/margins": 1.1891510486602783, + "rewards/rejected": -2.9737753868103027, + "step": 3500 + }, + { + "epoch": 0.8421305182341651, + "grad_norm": 9.525223827769382, + "learning_rate": 3.701245352256391e-08, + "logits/chosen": -1.6697797775268555, + "logits/rejected": -1.6646827459335327, + "logps/chosen": -471.4710998535156, + "logps/rejected": -512.9531860351562, + "loss": 0.4984, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.646978735923767, + "rewards/margins": 0.6669288873672485, + "rewards/rejected": -2.3139073848724365, + "step": 3510 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 12.160402315077537, + "learning_rate": 3.592355570776984e-08, + "logits/chosen": -1.6833436489105225, + "logits/rejected": -1.6594688892364502, + "logps/chosen": -362.41400146484375, + "logps/rejected": -500.51666259765625, + "loss": 0.472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2686312198638916, + "rewards/margins": 1.350454568862915, + "rewards/rejected": -2.6190860271453857, + "step": 3520 + }, + { + "epoch": 0.8469289827255279, + "grad_norm": 8.805306061647622, + "learning_rate": 3.484967450502904e-08, + "logits/chosen": -1.5704456567764282, + "logits/rejected": -1.5578100681304932, + "logps/chosen": -376.8490905761719, + "logps/rejected": -579.5418701171875, + "loss": 0.4634, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.635333776473999, + "rewards/margins": 1.6225509643554688, + "rewards/rejected": -3.257884979248047, + "step": 3530 + }, + { + "epoch": 0.8493282149712092, + "grad_norm": 12.572003529621947, + "learning_rate": 3.3790885242841296e-08, + "logits/chosen": -1.6139347553253174, + "logits/rejected": -1.5461299419403076, + "logps/chosen": -473.8414611816406, + "logps/rejected": -705.4661865234375, + "loss": 0.4651, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.090013027191162, + "rewards/margins": 2.4070258140563965, + "rewards/rejected": -4.4970383644104, + "step": 3540 + }, + { + "epoch": 0.8517274472168906, + "grad_norm": 14.913426265894294, + "learning_rate": 3.274726219106677e-08, + "logits/chosen": -1.657772421836853, + "logits/rejected": -1.6069705486297607, + "logps/chosen": -506.9501037597656, + "logps/rejected": -699.5079345703125, + "loss": 0.4897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0737128257751465, + "rewards/margins": 2.0126349925994873, + "rewards/rejected": -4.086348056793213, + "step": 3550 + }, + { + "epoch": 0.8541266794625719, + "grad_norm": 13.051269160073966, + "learning_rate": 3.171887855571642e-08, + "logits/chosen": -1.714739203453064, + "logits/rejected": -1.664184808731079, + "logps/chosen": -409.457763671875, + "logps/rejected": -502.5277404785156, + "loss": 0.484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7403171062469482, + "rewards/margins": 0.9889610409736633, + "rewards/rejected": -2.729278087615967, + "step": 3560 + }, + { + "epoch": 0.8565259117082533, + "grad_norm": 9.061162365072834, + "learning_rate": 3.070580647381643e-08, + "logits/chosen": -1.6095638275146484, + "logits/rejected": -1.5605108737945557, + "logps/chosen": -442.99029541015625, + "logps/rejected": -652.7391357421875, + "loss": 0.5072, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9627918004989624, + "rewards/margins": 2.100388288497925, + "rewards/rejected": -4.063180446624756, + "step": 3570 + }, + { + "epoch": 0.8589251439539347, + "grad_norm": 13.034931697815875, + "learning_rate": 2.9708117008348576e-08, + "logits/chosen": -1.6348292827606201, + "logits/rejected": -1.5857656002044678, + "logps/chosen": -506.25128173828125, + "logps/rejected": -560.7322387695312, + "loss": 0.4512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7951558828353882, + "rewards/margins": 1.0923255681991577, + "rewards/rejected": -2.887481212615967, + "step": 3580 + }, + { + "epoch": 0.8613243761996161, + "grad_norm": 16.47510836794499, + "learning_rate": 2.8725880143264992e-08, + "logits/chosen": -1.652269959449768, + "logits/rejected": -1.6364988088607788, + "logps/chosen": -473.0938415527344, + "logps/rejected": -605.9546508789062, + "loss": 0.5383, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1653454303741455, + "rewards/margins": 0.9054538011550903, + "rewards/rejected": -3.070798873901367, + "step": 3590 + }, + { + "epoch": 0.8637236084452975, + "grad_norm": 16.558002870622833, + "learning_rate": 2.775916477857948e-08, + "logits/chosen": -1.5733797550201416, + "logits/rejected": -1.5531814098358154, + "logps/chosen": -409.64007568359375, + "logps/rejected": -545.078857421875, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9042761325836182, + "rewards/margins": 1.3297529220581055, + "rewards/rejected": -3.2340290546417236, + "step": 3600 + }, + { + "epoch": 0.8661228406909789, + "grad_norm": 17.718409075113666, + "learning_rate": 2.680803872553408e-08, + "logits/chosen": -1.6162738800048828, + "logits/rejected": -1.422507643699646, + "logps/chosen": -433.74639892578125, + "logps/rejected": -798.6219482421875, + "loss": 0.4981, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6543105840682983, + "rewards/margins": 3.9157538414001465, + "rewards/rejected": -5.570064067840576, + "step": 3610 + }, + { + "epoch": 0.8685220729366603, + "grad_norm": 19.479387945921662, + "learning_rate": 2.5872568701842706e-08, + "logits/chosen": -1.5644371509552002, + "logits/rejected": -1.5578614473342896, + "logps/chosen": -391.2760314941406, + "logps/rejected": -588.4986572265625, + "loss": 0.4988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6809015274047852, + "rewards/margins": 1.7512578964233398, + "rewards/rejected": -3.432159423828125, + "step": 3620 + }, + { + "epoch": 0.8709213051823417, + "grad_norm": 15.579421392343054, + "learning_rate": 2.495282032701096e-08, + "logits/chosen": -1.626412034034729, + "logits/rejected": -1.508430004119873, + "logps/chosen": -362.73089599609375, + "logps/rejected": -497.5302734375, + "loss": 0.4601, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5209667682647705, + "rewards/margins": 1.7109506130218506, + "rewards/rejected": -3.231917142868042, + "step": 3630 + }, + { + "epoch": 0.8733205374280231, + "grad_norm": 12.975221010220178, + "learning_rate": 2.4048858117733133e-08, + "logits/chosen": -1.6329269409179688, + "logits/rejected": -1.472723126411438, + "logps/chosen": -473.1459045410156, + "logps/rejected": -683.5482788085938, + "loss": 0.4439, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.913275957107544, + "rewards/margins": 2.6152966022491455, + "rewards/rejected": -4.528572082519531, + "step": 3640 + }, + { + "epoch": 0.8757197696737045, + "grad_norm": 14.423256562483388, + "learning_rate": 2.3160745483366938e-08, + "logits/chosen": -1.6391162872314453, + "logits/rejected": -1.671979546546936, + "logps/chosen": -443.46258544921875, + "logps/rejected": -672.6021118164062, + "loss": 0.476, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9398761987686157, + "rewards/margins": 2.000032901763916, + "rewards/rejected": -3.9399094581604004, + "step": 3650 + }, + { + "epoch": 0.8781190019193857, + "grad_norm": 9.425763927010763, + "learning_rate": 2.2288544721485197e-08, + "logits/chosen": -1.6494081020355225, + "logits/rejected": -1.5929086208343506, + "logps/chosen": -396.45013427734375, + "logps/rejected": -661.6722412109375, + "loss": 0.4608, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6658204793930054, + "rewards/margins": 2.458636999130249, + "rewards/rejected": -4.124457359313965, + "step": 3660 + }, + { + "epoch": 0.8805182341650671, + "grad_norm": 21.417485675065244, + "learning_rate": 2.1432317013506117e-08, + "logits/chosen": -1.740401268005371, + "logits/rejected": -1.6570911407470703, + "logps/chosen": -494.31964111328125, + "logps/rejected": -612.0262451171875, + "loss": 0.5057, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1772966384887695, + "rewards/margins": 1.6181838512420654, + "rewards/rejected": -3.795480728149414, + "step": 3670 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 16.00309629191891, + "learning_rate": 2.0592122420401704e-08, + "logits/chosen": -1.5891798734664917, + "logits/rejected": -1.5248258113861084, + "logps/chosen": -467.4602966308594, + "logps/rejected": -596.62060546875, + "loss": 0.526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2219550609588623, + "rewards/margins": 1.2901359796524048, + "rewards/rejected": -3.5120906829833984, + "step": 3680 + }, + { + "epoch": 0.8853166986564299, + "grad_norm": 13.429038672487438, + "learning_rate": 1.976801987848459e-08, + "logits/chosen": -1.6623175144195557, + "logits/rejected": -1.6009843349456787, + "logps/chosen": -486.54345703125, + "logps/rejected": -719.8893432617188, + "loss": 0.5037, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.023263931274414, + "rewards/margins": 2.204184055328369, + "rewards/rejected": -4.227447986602783, + "step": 3690 + }, + { + "epoch": 0.8877159309021113, + "grad_norm": 26.48164665804949, + "learning_rate": 1.8960067195273987e-08, + "logits/chosen": -1.6708219051361084, + "logits/rejected": -1.5749927759170532, + "logps/chosen": -400.6322937011719, + "logps/rejected": -630.9937133789062, + "loss": 0.5018, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7153675556182861, + "rewards/margins": 2.3315227031707764, + "rewards/rejected": -4.0468902587890625, + "step": 3700 + }, + { + "epoch": 0.8901151631477927, + "grad_norm": 11.411704364971511, + "learning_rate": 1.816832104544072e-08, + "logits/chosen": -1.564396619796753, + "logits/rejected": -1.4874539375305176, + "logps/chosen": -481.55242919921875, + "logps/rejected": -635.627685546875, + "loss": 0.4544, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.952723503112793, + "rewards/margins": 1.7845569849014282, + "rewards/rejected": -3.7372806072235107, + "step": 3710 + }, + { + "epoch": 0.8925143953934741, + "grad_norm": 15.030315396145902, + "learning_rate": 1.7392836966831553e-08, + "logits/chosen": -1.5914599895477295, + "logits/rejected": -1.4839370250701904, + "logps/chosen": -488.05621337890625, + "logps/rejected": -683.0465698242188, + "loss": 0.465, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0653412342071533, + "rewards/margins": 2.3046443462371826, + "rewards/rejected": -4.369986057281494, + "step": 3720 + }, + { + "epoch": 0.8949136276391555, + "grad_norm": 11.782758949527297, + "learning_rate": 1.663366935657373e-08, + "logits/chosen": -1.6213829517364502, + "logits/rejected": -1.4937806129455566, + "logps/chosen": -420.52911376953125, + "logps/rejected": -647.6318969726562, + "loss": 0.514, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7656971216201782, + "rewards/margins": 2.258690357208252, + "rewards/rejected": -4.024387836456299, + "step": 3730 + }, + { + "epoch": 0.8973128598848369, + "grad_norm": 19.85256790289952, + "learning_rate": 1.5890871467258898e-08, + "logits/chosen": -1.6818307638168335, + "logits/rejected": -1.6265443563461304, + "logps/chosen": -564.2450561523438, + "logps/rejected": -681.9746704101562, + "loss": 0.5019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.256525754928589, + "rewards/margins": 1.55268394947052, + "rewards/rejected": -3.8092098236083984, + "step": 3740 + }, + { + "epoch": 0.8997120921305183, + "grad_norm": 12.586313589978424, + "learning_rate": 1.5164495403207967e-08, + "logits/chosen": -1.700171709060669, + "logits/rejected": -1.6361474990844727, + "logps/chosen": -475.1417541503906, + "logps/rejected": -746.2467041015625, + "loss": 0.4623, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0661838054656982, + "rewards/margins": 2.443232297897339, + "rewards/rejected": -4.509416103363037, + "step": 3750 + }, + { + "epoch": 0.9021113243761996, + "grad_norm": 12.787222537511584, + "learning_rate": 1.4454592116815962e-08, + "logits/chosen": -1.593766450881958, + "logits/rejected": -1.5601489543914795, + "logps/chosen": -449.85858154296875, + "logps/rejected": -639.9388427734375, + "loss": 0.445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.833129644393921, + "rewards/margins": 1.7945473194122314, + "rewards/rejected": -3.6276767253875732, + "step": 3760 + }, + { + "epoch": 0.904510556621881, + "grad_norm": 10.429284153734484, + "learning_rate": 1.3761211404977934e-08, + "logits/chosen": -1.5752068758010864, + "logits/rejected": -1.422378659248352, + "logps/chosen": -471.98876953125, + "logps/rejected": -817.8197021484375, + "loss": 0.442, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2775814533233643, + "rewards/margins": 3.5992565155029297, + "rewards/rejected": -5.876837730407715, + "step": 3770 + }, + { + "epoch": 0.9069097888675623, + "grad_norm": 12.741709586667612, + "learning_rate": 1.3084401905596177e-08, + "logits/chosen": -1.7176425457000732, + "logits/rejected": -1.6414988040924072, + "logps/chosen": -503.9371643066406, + "logps/rejected": -640.4576416015625, + "loss": 0.4707, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8939365148544312, + "rewards/margins": 1.9764692783355713, + "rewards/rejected": -3.870405912399292, + "step": 3780 + }, + { + "epoch": 0.9093090211132437, + "grad_norm": 11.675903630631732, + "learning_rate": 1.2424211094168053e-08, + "logits/chosen": -1.6118358373641968, + "logits/rejected": -1.5552772283554077, + "logps/chosen": -522.3504638671875, + "logps/rejected": -701.8868408203125, + "loss": 0.5099, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.871604323387146, + "rewards/margins": 1.9561458826065063, + "rewards/rejected": -3.8277504444122314, + "step": 3790 + }, + { + "epoch": 0.9117082533589251, + "grad_norm": 11.816100782837214, + "learning_rate": 1.1780685280456143e-08, + "logits/chosen": -1.6721569299697876, + "logits/rejected": -1.6179077625274658, + "logps/chosen": -539.4227905273438, + "logps/rejected": -888.7571411132812, + "loss": 0.4959, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.298069477081299, + "rewards/margins": 3.5056235790252686, + "rewards/rejected": -5.803693771362305, + "step": 3800 + }, + { + "epoch": 0.9141074856046065, + "grad_norm": 11.727624985469124, + "learning_rate": 1.1153869605239564e-08, + "logits/chosen": -1.6193329095840454, + "logits/rejected": -1.5656651258468628, + "logps/chosen": -435.56158447265625, + "logps/rejected": -478.83770751953125, + "loss": 0.4958, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5856086015701294, + "rewards/margins": 0.8730261921882629, + "rewards/rejected": -2.458634614944458, + "step": 3810 + }, + { + "epoch": 0.9165067178502879, + "grad_norm": 16.85961078114398, + "learning_rate": 1.0543808037147606e-08, + "logits/chosen": -1.6750271320343018, + "logits/rejected": -1.6107738018035889, + "logps/chosen": -503.64471435546875, + "logps/rejected": -785.1126708984375, + "loss": 0.4807, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2831358909606934, + "rewards/margins": 2.812958240509033, + "rewards/rejected": -5.096093654632568, + "step": 3820 + }, + { + "epoch": 0.9189059500959693, + "grad_norm": 12.461972445162298, + "learning_rate": 9.95054336957557e-09, + "logits/chosen": -1.734662652015686, + "logits/rejected": -1.739912748336792, + "logps/chosen": -434.0250549316406, + "logps/rejected": -562.7544555664062, + "loss": 0.4761, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7552568912506104, + "rewards/margins": 1.2200746536254883, + "rewards/rejected": -2.9753317832946777, + "step": 3830 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 12.522523399418205, + "learning_rate": 9.37411721768286e-09, + "logits/chosen": -1.5948150157928467, + "logits/rejected": -1.5764580965042114, + "logps/chosen": -513.3563232421875, + "logps/rejected": -767.2200927734375, + "loss": 0.474, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.226046085357666, + "rewards/margins": 2.2817482948303223, + "rewards/rejected": -4.507794380187988, + "step": 3840 + }, + { + "epoch": 0.9237044145873321, + "grad_norm": 10.9488041002692, + "learning_rate": 8.81457001547392e-09, + "logits/chosen": -1.6462711095809937, + "logits/rejected": -1.6491082906723022, + "logps/chosen": -459.8772888183594, + "logps/rejected": -560.5117797851562, + "loss": 0.4729, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.011331558227539, + "rewards/margins": 0.9149211645126343, + "rewards/rejected": -2.9262523651123047, + "step": 3850 + }, + { + "epoch": 0.9261036468330134, + "grad_norm": 9.649161518168588, + "learning_rate": 8.271941012961942e-09, + "logits/chosen": -1.5380823612213135, + "logits/rejected": -1.412889838218689, + "logps/chosen": -441.87335205078125, + "logps/rejected": -816.4763793945312, + "loss": 0.4613, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1320416927337646, + "rewards/margins": 3.2392337322235107, + "rewards/rejected": -5.371275424957275, + "step": 3860 + }, + { + "epoch": 0.9285028790786948, + "grad_norm": 11.267633686817604, + "learning_rate": 7.746268273415568e-09, + "logits/chosen": -1.648723840713501, + "logits/rejected": -1.6860382556915283, + "logps/chosen": -460.56036376953125, + "logps/rejected": -567.8131103515625, + "loss": 0.4806, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.851995825767517, + "rewards/margins": 0.5843394994735718, + "rewards/rejected": -2.436335325241089, + "step": 3870 + }, + { + "epoch": 0.9309021113243762, + "grad_norm": 11.04068659513372, + "learning_rate": 7.237588670689076e-09, + "logits/chosen": -1.7109369039535522, + "logits/rejected": -1.5930227041244507, + "logps/chosen": -468.35186767578125, + "logps/rejected": -685.10986328125, + "loss": 0.4614, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9170551300048828, + "rewards/margins": 2.5844249725341797, + "rewards/rejected": -4.5014801025390625, + "step": 3880 + }, + { + "epoch": 0.9333013435700576, + "grad_norm": 14.964168805394552, + "learning_rate": 6.745937886635606e-09, + "logits/chosen": -1.6948446035385132, + "logits/rejected": -1.5991394519805908, + "logps/chosen": -517.13623046875, + "logps/rejected": -857.3069458007812, + "loss": 0.4527, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1508491039276123, + "rewards/margins": 3.447594404220581, + "rewards/rejected": -5.598443508148193, + "step": 3890 + }, + { + "epoch": 0.935700575815739, + "grad_norm": 14.883238902792032, + "learning_rate": 6.271350408604409e-09, + "logits/chosen": -1.6298729181289673, + "logits/rejected": -1.5697710514068604, + "logps/chosen": -407.47216796875, + "logps/rejected": -590.4325561523438, + "loss": 0.4973, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.779510259628296, + "rewards/margins": 1.6561663150787354, + "rewards/rejected": -3.4356765747070312, + "step": 3900 + }, + { + "epoch": 0.9380998080614203, + "grad_norm": 11.184941495697121, + "learning_rate": 5.813859527021487e-09, + "logits/chosen": -1.5353864431381226, + "logits/rejected": -1.5042493343353271, + "logps/chosen": -483.96435546875, + "logps/rejected": -689.84130859375, + "loss": 0.4324, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.045846462249756, + "rewards/margins": 2.3413102626800537, + "rewards/rejected": -4.387156963348389, + "step": 3910 + }, + { + "epoch": 0.9404990403071017, + "grad_norm": 14.299734951667778, + "learning_rate": 5.373497333054616e-09, + "logits/chosen": -1.6646445989608765, + "logits/rejected": -1.649578332901001, + "logps/chosen": -482.82843017578125, + "logps/rejected": -551.9417724609375, + "loss": 0.489, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0101306438446045, + "rewards/margins": 0.7671645879745483, + "rewards/rejected": -2.777294874191284, + "step": 3920 + }, + { + "epoch": 0.9428982725527831, + "grad_norm": 10.602051511979361, + "learning_rate": 4.950294716362213e-09, + "logits/chosen": -1.6949056386947632, + "logits/rejected": -1.670143723487854, + "logps/chosen": -504.10821533203125, + "logps/rejected": -612.98876953125, + "loss": 0.4873, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.004544734954834, + "rewards/margins": 1.1068131923675537, + "rewards/rejected": -3.111358165740967, + "step": 3930 + }, + { + "epoch": 0.9452975047984645, + "grad_norm": 10.528232336553208, + "learning_rate": 4.544281362926422e-09, + "logits/chosen": -1.7100117206573486, + "logits/rejected": -1.6325490474700928, + "logps/chosen": -508.181640625, + "logps/rejected": -681.7391967773438, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9281724691390991, + "rewards/margins": 1.9070625305175781, + "rewards/rejected": -3.835235595703125, + "step": 3940 + }, + { + "epoch": 0.9476967370441459, + "grad_norm": 11.26583083013792, + "learning_rate": 4.15548575297095e-09, + "logits/chosen": -1.668421745300293, + "logits/rejected": -1.5268778800964355, + "logps/chosen": -452.11077880859375, + "logps/rejected": -725.6536254882812, + "loss": 0.4537, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9140384197235107, + "rewards/margins": 2.8626558780670166, + "rewards/rejected": -4.776694297790527, + "step": 3950 + }, + { + "epoch": 0.9500959692898272, + "grad_norm": 9.866263268327208, + "learning_rate": 3.7839351589631366e-09, + "logits/chosen": -1.6612355709075928, + "logits/rejected": -1.623468041419983, + "logps/chosen": -477.5087890625, + "logps/rejected": -704.5999755859375, + "loss": 0.5035, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.534862518310547, + "rewards/margins": 1.8013521432876587, + "rewards/rejected": -4.336214065551758, + "step": 3960 + }, + { + "epoch": 0.9524952015355086, + "grad_norm": 11.105179609469939, + "learning_rate": 3.4296556437010405e-09, + "logits/chosen": -1.6372588872909546, + "logits/rejected": -1.5610383749008179, + "logps/chosen": -448.084228515625, + "logps/rejected": -620.5638427734375, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.273430585861206, + "rewards/margins": 1.8218492269515991, + "rewards/rejected": -4.095280170440674, + "step": 3970 + }, + { + "epoch": 0.95489443378119, + "grad_norm": 15.432439336051313, + "learning_rate": 3.092672058485124e-09, + "logits/chosen": -1.6299177408218384, + "logits/rejected": -1.5185959339141846, + "logps/chosen": -480.63623046875, + "logps/rejected": -749.26171875, + "loss": 0.5051, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3645920753479004, + "rewards/margins": 2.5585074424743652, + "rewards/rejected": -4.923099517822266, + "step": 3980 + }, + { + "epoch": 0.9572936660268714, + "grad_norm": 13.295737344758477, + "learning_rate": 2.7730080413750356e-09, + "logits/chosen": -1.6487003564834595, + "logits/rejected": -1.616842269897461, + "logps/chosen": -471.357421875, + "logps/rejected": -603.6805419921875, + "loss": 0.4891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8420718908309937, + "rewards/margins": 1.3825440406799316, + "rewards/rejected": -3.2246158123016357, + "step": 3990 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 11.019669520177573, + "learning_rate": 2.4706860155316033e-09, + "logits/chosen": -1.641543984413147, + "logits/rejected": -1.5944894552230835, + "logps/chosen": -602.7939453125, + "logps/rejected": -756.920166015625, + "loss": 0.4894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.417752504348755, + "rewards/margins": 1.6268641948699951, + "rewards/rejected": -4.04461669921875, + "step": 4000 + }, + { + "epoch": 0.9596928982725528, + "eval_logits/chosen": -1.5951924324035645, + "eval_logits/rejected": -1.5356674194335938, + "eval_logps/chosen": -472.6584167480469, + "eval_logps/rejected": -691.1746215820312, + "eval_loss": 0.4801708161830902, + "eval_rewards/accuracies": 0.7928571701049805, + "eval_rewards/chosen": -2.003549337387085, + "eval_rewards/margins": 2.17158842086792, + "eval_rewards/rejected": -4.175137996673584, + "eval_runtime": 45.9357, + "eval_samples_per_second": 97.114, + "eval_steps_per_second": 1.524, + "step": 4000 + }, + { + "epoch": 0.9620921305182342, + "grad_norm": 11.233710733457718, + "learning_rate": 2.185727187643843e-09, + "logits/chosen": -1.6332000494003296, + "logits/rejected": -1.5429205894470215, + "logps/chosen": -419.66815185546875, + "logps/rejected": -734.4024658203125, + "loss": 0.4842, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.8838298320770264, + "rewards/margins": 3.1948580741882324, + "rewards/rejected": -5.078688144683838, + "step": 4010 + }, + { + "epoch": 0.9644913627639156, + "grad_norm": 15.328579684902264, + "learning_rate": 1.9181515464413434e-09, + "logits/chosen": -1.670598030090332, + "logits/rejected": -1.6298954486846924, + "logps/chosen": -588.2853393554688, + "logps/rejected": -813.2299194335938, + "loss": 0.4958, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.263362407684326, + "rewards/margins": 2.3236560821533203, + "rewards/rejected": -4.5870184898376465, + "step": 4020 + }, + { + "epoch": 0.966890595009597, + "grad_norm": 13.812423991968044, + "learning_rate": 1.6679778612923302e-09, + "logits/chosen": -1.752912163734436, + "logits/rejected": -1.669989824295044, + "logps/chosen": -523.4584350585938, + "logps/rejected": -659.4483642578125, + "loss": 0.4615, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.229353427886963, + "rewards/margins": 1.3204247951507568, + "rewards/rejected": -3.549778699874878, + "step": 4030 + }, + { + "epoch": 0.9692898272552783, + "grad_norm": 19.733776466762233, + "learning_rate": 1.43522368088686e-09, + "logits/chosen": -1.619080901145935, + "logits/rejected": -1.4254459142684937, + "logps/chosen": -500.8919982910156, + "logps/rejected": -871.8958740234375, + "loss": 0.5482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.26896595954895, + "rewards/margins": 3.7620201110839844, + "rewards/rejected": -6.030986309051514, + "step": 4040 + }, + { + "epoch": 0.9716890595009597, + "grad_norm": 13.902504420645041, + "learning_rate": 1.2199053320059993e-09, + "logits/chosen": -1.6571025848388672, + "logits/rejected": -1.6325149536132812, + "logps/chosen": -482.8365783691406, + "logps/rejected": -686.3687744140625, + "loss": 0.4742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.980038046836853, + "rewards/margins": 1.896106481552124, + "rewards/rejected": -3.8761448860168457, + "step": 4050 + }, + { + "epoch": 0.974088291746641, + "grad_norm": 10.386888230304015, + "learning_rate": 1.0220379183764338e-09, + "logits/chosen": -1.6559547185897827, + "logits/rejected": -1.545379638671875, + "logps/chosen": -393.8249206542969, + "logps/rejected": -653.940185546875, + "loss": 0.4873, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7048404216766357, + "rewards/margins": 2.559105396270752, + "rewards/rejected": -4.263945579528809, + "step": 4060 + }, + { + "epoch": 0.9764875239923224, + "grad_norm": 13.972760200119874, + "learning_rate": 8.416353196111503e-10, + "logits/chosen": -1.4547879695892334, + "logits/rejected": -1.3284496068954468, + "logps/chosen": -491.6461486816406, + "logps/rejected": -743.1737670898438, + "loss": 0.5082, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.368063449859619, + "rewards/margins": 2.6352427005767822, + "rewards/rejected": -5.0033063888549805, + "step": 4070 + }, + { + "epoch": 0.9788867562380038, + "grad_norm": 11.880689150494451, + "learning_rate": 6.787101902356873e-10, + "logits/chosen": -1.5535688400268555, + "logits/rejected": -1.5120995044708252, + "logps/chosen": -489.68719482421875, + "logps/rejected": -716.9655151367188, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1152749061584473, + "rewards/margins": 2.0652334690093994, + "rewards/rejected": -4.180508613586426, + "step": 4080 + }, + { + "epoch": 0.9812859884836852, + "grad_norm": 18.79541252367771, + "learning_rate": 5.332739588005953e-10, + "logits/chosen": -1.630657434463501, + "logits/rejected": -1.5946576595306396, + "logps/chosen": -391.0518493652344, + "logps/rejected": -655.5036010742188, + "loss": 0.4848, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7571265697479248, + "rewards/margins": 2.442540407180786, + "rewards/rejected": -4.199666976928711, + "step": 4090 + }, + { + "epoch": 0.9836852207293666, + "grad_norm": 18.62380663413524, + "learning_rate": 4.053368270797164e-10, + "logits/chosen": -1.5710262060165405, + "logits/rejected": -1.495965838432312, + "logps/chosen": -468.63946533203125, + "logps/rejected": -698.932373046875, + "loss": 0.4696, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.210425853729248, + "rewards/margins": 2.375892162322998, + "rewards/rejected": -4.586318016052246, + "step": 4100 + }, + { + "epoch": 0.986084452975048, + "grad_norm": 10.138418324575126, + "learning_rate": 2.949077693545354e-10, + "logits/chosen": -1.5962473154067993, + "logits/rejected": -1.61128830909729, + "logps/chosen": -506.21630859375, + "logps/rejected": -674.5911865234375, + "loss": 0.5628, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2012012004852295, + "rewards/margins": 1.3675811290740967, + "rewards/rejected": -3.568782091140747, + "step": 4110 + }, + { + "epoch": 0.9884836852207294, + "grad_norm": 11.981782992200987, + "learning_rate": 2.0199453178471047e-10, + "logits/chosen": -1.6453485488891602, + "logits/rejected": -1.6295568943023682, + "logps/chosen": -540.22607421875, + "logps/rejected": -574.7904663085938, + "loss": 0.4926, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1138644218444824, + "rewards/margins": 0.7879716157913208, + "rewards/rejected": -2.9018359184265137, + "step": 4120 + }, + { + "epoch": 0.9908829174664108, + "grad_norm": 15.974370111697047, + "learning_rate": 1.266036318647301e-10, + "logits/chosen": -1.632550835609436, + "logits/rejected": -1.550283670425415, + "logps/chosen": -528.643310546875, + "logps/rejected": -716.7777709960938, + "loss": 0.49, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1183180809020996, + "rewards/margins": 2.1329312324523926, + "rewards/rejected": -4.25124979019165, + "step": 4130 + }, + { + "epoch": 0.9932821497120922, + "grad_norm": 15.469000968932196, + "learning_rate": 6.874035796672339e-11, + "logits/chosen": -1.6695518493652344, + "logits/rejected": -1.5455235242843628, + "logps/chosen": -510.795654296875, + "logps/rejected": -794.1751708984375, + "loss": 0.4936, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.969916582107544, + "rewards/margins": 3.434035539627075, + "rewards/rejected": -5.403952121734619, + "step": 4140 + }, + { + "epoch": 0.9956813819577736, + "grad_norm": 13.801035998962295, + "learning_rate": 2.8408768969423458e-11, + "logits/chosen": -1.7181600332260132, + "logits/rejected": -1.696080207824707, + "logps/chosen": -485.26495361328125, + "logps/rejected": -656.0448608398438, + "loss": 0.4756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.976780652999878, + "rewards/margins": 1.6538575887680054, + "rewards/rejected": -3.6306381225585938, + "step": 4150 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 11.770076455666835, + "learning_rate": 5.611693973617271e-12, + "logits/chosen": -1.54105544090271, + "logits/rejected": -1.4928066730499268, + "logps/chosen": -425.23748779296875, + "logps/rejected": -606.849853515625, + "loss": 0.5225, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8429193496704102, + "rewards/margins": 1.6649690866470337, + "rewards/rejected": -3.5078887939453125, + "step": 4160 + }, + { + "epoch": 1.0, + "step": 4168, + "total_flos": 0.0, + "train_loss": 0.5327433187535995, + "train_runtime": 4228.4167, + "train_samples_per_second": 31.541, + "train_steps_per_second": 0.986 + } + ], + "logging_steps": 10, + "max_steps": 4168, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}