{ "best_global_step": 815, "best_metric": 1.3573591709136963, "best_model_checkpoint": "/tmp/svadugur/39823/wnr_change_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=False-contexts=medium-39823/checkpoint-815", "epoch": 0.9996167113836719, "eval_steps": 163, "global_step": 815, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.2216827869415283, "eval_logits/rejected": -2.231610059738159, "eval_logps/chosen": -57.6313591003418, "eval_logps/rejected": -56.49857711791992, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 1611.6963, "eval_samples_per_second": 0.547, "eval_steps_per_second": 0.274, "step": 0 }, { "epoch": 0.0012265235722499041, "grad_norm": 1.6502304077148438, "learning_rate": 1e-06, "logits/chosen": -2.2543065547943115, "logits/rejected": -2.279301881790161, "logps/chosen": -55.43144989013672, "logps/rejected": -52.21455383300781, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0024530471444998083, "grad_norm": 1.9496572017669678, "learning_rate": 9.998773006134968e-07, "logits/chosen": -2.2541019916534424, "logits/rejected": -2.27193546295166, "logps/chosen": -55.03106689453125, "logps/rejected": -53.453155517578125, "loss": 1.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.025432538241147995, "rewards/margins": -7.317215204238892e-05, "rewards/rejected": -0.025359369814395905, "step": 2 }, { "epoch": 0.0036795707167497126, "grad_norm": 1.8870964050292969, "learning_rate": 9.99754601226994e-07, "logits/chosen": -2.271968364715576, "logits/rejected": -2.2829127311706543, "logps/chosen": -55.85136032104492, "logps/rejected": -58.49956512451172, "loss": 1.0036, "rewards/accuracies": 0.53125, "rewards/chosen": -0.013036608695983887, "rewards/margins": -0.014526653103530407, "rewards/rejected": 0.0014900458045303822, "step": 3 }, { "epoch": 0.0049060942889996165, "grad_norm": 1.887001633644104, "learning_rate": 9.996319018404908e-07, "logits/chosen": -2.2055845260620117, "logits/rejected": -2.200967311859131, "logps/chosen": -58.82973861694336, "logps/rejected": -57.85545349121094, "loss": 0.9967, "rewards/accuracies": 0.53125, "rewards/chosen": 0.02053203620016575, "rewards/margins": 0.013359952718019485, "rewards/rejected": 0.007172084413468838, "step": 4 }, { "epoch": 0.006132617861249521, "grad_norm": 1.814192771911621, "learning_rate": 9.995092024539877e-07, "logits/chosen": -2.250117301940918, "logits/rejected": -2.2246193885803223, "logps/chosen": -58.971839904785156, "logps/rejected": -59.7353515625, "loss": 1.0058, "rewards/accuracies": 0.375, "rewards/chosen": -0.006158745847642422, "rewards/margins": -0.023273468017578125, "rewards/rejected": 0.017114723101258278, "step": 5 }, { "epoch": 0.007359141433499425, "grad_norm": 2.0333399772644043, "learning_rate": 9.993865030674846e-07, "logits/chosen": -2.2327892780303955, "logits/rejected": -2.2579684257507324, "logps/chosen": -54.45442199707031, "logps/rejected": -51.91791915893555, "loss": 1.0011, "rewards/accuracies": 0.4375, "rewards/chosen": -0.003721976652741432, "rewards/margins": -0.004465007223188877, "rewards/rejected": 0.0007430319674313068, "step": 6 }, { "epoch": 0.008585665005749329, "grad_norm": 2.2606923580169678, "learning_rate": 9.992638036809814e-07, "logits/chosen": -2.2253775596618652, "logits/rejected": -2.2605338096618652, "logps/chosen": -56.56025695800781, "logps/rejected": -55.42701721191406, "loss": 1.002, "rewards/accuracies": 0.46875, "rewards/chosen": -0.034399211406707764, "rewards/margins": -0.008065998554229736, "rewards/rejected": -0.026333212852478027, "step": 7 }, { "epoch": 0.009812188577999233, "grad_norm": 1.8747289180755615, "learning_rate": 9.991411042944785e-07, "logits/chosen": -2.244736671447754, "logits/rejected": -2.247849225997925, "logps/chosen": -54.06218719482422, "logps/rejected": -54.61011505126953, "loss": 0.9987, "rewards/accuracies": 0.53125, "rewards/chosen": 0.010196995921432972, "rewards/margins": 0.00538643728941679, "rewards/rejected": 0.004810560029000044, "step": 8 }, { "epoch": 0.011038712150249137, "grad_norm": 1.9319449663162231, "learning_rate": 9.990184049079754e-07, "logits/chosen": -2.189882755279541, "logits/rejected": -2.207411766052246, "logps/chosen": -58.723846435546875, "logps/rejected": -59.417457580566406, "loss": 0.9964, "rewards/accuracies": 0.53125, "rewards/chosen": 0.010770226828753948, "rewards/margins": 0.014559279195964336, "rewards/rejected": -0.003789055161178112, "step": 9 }, { "epoch": 0.012265235722499043, "grad_norm": 1.913153052330017, "learning_rate": 9.988957055214723e-07, "logits/chosen": -2.2045443058013916, "logits/rejected": -2.255871295928955, "logps/chosen": -56.52876663208008, "logps/rejected": -57.77139663696289, "loss": 1.0023, "rewards/accuracies": 0.46875, "rewards/chosen": 0.009393095970153809, "rewards/margins": -0.009203792549669743, "rewards/rejected": 0.018596887588500977, "step": 10 }, { "epoch": 0.013491759294748946, "grad_norm": 2.020563840866089, "learning_rate": 9.987730061349692e-07, "logits/chosen": -2.2348456382751465, "logits/rejected": -2.249089241027832, "logps/chosen": -56.953704833984375, "logps/rejected": -56.97029113769531, "loss": 1.0007, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02881915681064129, "rewards/margins": -0.0028015035204589367, "rewards/rejected": -0.026017654687166214, "step": 11 }, { "epoch": 0.01471828286699885, "grad_norm": 2.2782418727874756, "learning_rate": 9.986503067484663e-07, "logits/chosen": -2.21018123626709, "logits/rejected": -2.223275661468506, "logps/chosen": -57.53064727783203, "logps/rejected": -57.27278137207031, "loss": 0.9982, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010295471176505089, "rewards/margins": 0.0073087275959551334, "rewards/rejected": 0.002986741252243519, "step": 12 }, { "epoch": 0.015944806439248754, "grad_norm": 1.7818843126296997, "learning_rate": 9.985276073619632e-07, "logits/chosen": -2.2613790035247803, "logits/rejected": -2.1912052631378174, "logps/chosen": -58.226043701171875, "logps/rejected": -59.55123519897461, "loss": 1.0114, "rewards/accuracies": 0.25, "rewards/chosen": -0.04176298528909683, "rewards/margins": -0.04557749629020691, "rewards/rejected": 0.0038145065773278475, "step": 13 }, { "epoch": 0.017171330011498658, "grad_norm": 1.9084312915802002, "learning_rate": 9.9840490797546e-07, "logits/chosen": -2.2789173126220703, "logits/rejected": -2.2848687171936035, "logps/chosen": -55.16374588012695, "logps/rejected": -55.189430236816406, "loss": 1.0052, "rewards/accuracies": 0.5, "rewards/chosen": -0.028668129816651344, "rewards/margins": -0.02083817683160305, "rewards/rejected": -0.007829951122403145, "step": 14 }, { "epoch": 0.018397853583748562, "grad_norm": 2.1165359020233154, "learning_rate": 9.982822085889571e-07, "logits/chosen": -2.277003288269043, "logits/rejected": -2.269909620285034, "logps/chosen": -54.23345947265625, "logps/rejected": -54.20281982421875, "loss": 0.995, "rewards/accuracies": 0.59375, "rewards/chosen": 0.011915469542145729, "rewards/margins": 0.020113814622163773, "rewards/rejected": -0.008198344148695469, "step": 15 }, { "epoch": 0.019624377155998466, "grad_norm": 2.3525915145874023, "learning_rate": 9.98159509202454e-07, "logits/chosen": -2.258922576904297, "logits/rejected": -2.26751708984375, "logps/chosen": -55.792724609375, "logps/rejected": -56.430294036865234, "loss": 1.0017, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007940614596009254, "rewards/margins": -0.006644833832979202, "rewards/rejected": -0.001295781577937305, "step": 16 }, { "epoch": 0.02085090072824837, "grad_norm": 2.1162328720092773, "learning_rate": 9.980368098159509e-07, "logits/chosen": -2.1972813606262207, "logits/rejected": -2.1965432167053223, "logps/chosen": -54.49195098876953, "logps/rejected": -55.56049346923828, "loss": 0.9969, "rewards/accuracies": 0.4375, "rewards/chosen": 0.025996636599302292, "rewards/margins": 0.012510632164776325, "rewards/rejected": 0.013486005365848541, "step": 17 }, { "epoch": 0.022077424300498274, "grad_norm": 2.0335235595703125, "learning_rate": 9.979141104294478e-07, "logits/chosen": -2.218026638031006, "logits/rejected": -2.208946466445923, "logps/chosen": -56.58424377441406, "logps/rejected": -53.21271896362305, "loss": 1.0008, "rewards/accuracies": 0.46875, "rewards/chosen": -0.010871140286326408, "rewards/margins": -0.0033205454237759113, "rewards/rejected": -0.007550596725195646, "step": 18 }, { "epoch": 0.023303947872748178, "grad_norm": 1.8876171112060547, "learning_rate": 9.977914110429446e-07, "logits/chosen": -2.214871883392334, "logits/rejected": -2.2447235584259033, "logps/chosen": -58.27252197265625, "logps/rejected": -56.25828170776367, "loss": 0.9971, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01484462060034275, "rewards/margins": 0.011532879434525967, "rewards/rejected": -0.026377499103546143, "step": 19 }, { "epoch": 0.024530471444998085, "grad_norm": 1.9685484170913696, "learning_rate": 9.976687116564415e-07, "logits/chosen": -2.2437856197357178, "logits/rejected": -2.223264455795288, "logps/chosen": -56.683170318603516, "logps/rejected": -59.687618255615234, "loss": 0.9936, "rewards/accuracies": 0.53125, "rewards/chosen": -0.003348434576764703, "rewards/margins": 0.025717690587043762, "rewards/rejected": -0.029066121205687523, "step": 20 }, { "epoch": 0.02575699501724799, "grad_norm": 2.3109028339385986, "learning_rate": 9.975460122699386e-07, "logits/chosen": -2.2612545490264893, "logits/rejected": -2.264728546142578, "logps/chosen": -60.70183181762695, "logps/rejected": -57.77599334716797, "loss": 1.0, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0027068022172898054, "rewards/margins": 7.451837882399559e-05, "rewards/rejected": -0.0027813194319605827, "step": 21 }, { "epoch": 0.026983518589497893, "grad_norm": 1.9125782251358032, "learning_rate": 9.974233128834355e-07, "logits/chosen": -2.286746025085449, "logits/rejected": -2.2887911796569824, "logps/chosen": -58.28277587890625, "logps/rejected": -55.428123474121094, "loss": 0.9955, "rewards/accuracies": 0.65625, "rewards/chosen": 0.009252572432160378, "rewards/margins": 0.018188752233982086, "rewards/rejected": -0.008936180733144283, "step": 22 }, { "epoch": 0.028210042161747797, "grad_norm": 2.007206439971924, "learning_rate": 9.973006134969326e-07, "logits/chosen": -2.347057819366455, "logits/rejected": -2.365382194519043, "logps/chosen": -57.627532958984375, "logps/rejected": -57.5980224609375, "loss": 0.9877, "rewards/accuracies": 0.6875, "rewards/chosen": 0.019937027245759964, "rewards/margins": 0.04918154329061508, "rewards/rejected": -0.029244519770145416, "step": 23 }, { "epoch": 0.0294365657339977, "grad_norm": 2.034442186355591, "learning_rate": 9.971779141104295e-07, "logits/chosen": -2.257885456085205, "logits/rejected": -2.285210609436035, "logps/chosen": -56.48740005493164, "logps/rejected": -57.884315490722656, "loss": 1.0022, "rewards/accuracies": 0.5, "rewards/chosen": -0.024035489186644554, "rewards/margins": -0.008854364976286888, "rewards/rejected": -0.015181124210357666, "step": 24 }, { "epoch": 0.030663089306247605, "grad_norm": 1.9139817953109741, "learning_rate": 9.970552147239264e-07, "logits/chosen": -2.257434129714966, "logits/rejected": -2.237396240234375, "logps/chosen": -58.906803131103516, "logps/rejected": -58.86420440673828, "loss": 1.0026, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01337047852575779, "rewards/margins": -0.010468291118741035, "rewards/rejected": -0.0029021864756941795, "step": 25 }, { "epoch": 0.03188961287849751, "grad_norm": 1.8657565116882324, "learning_rate": 9.969325153374232e-07, "logits/chosen": -2.2437727451324463, "logits/rejected": -2.2863457202911377, "logps/chosen": -57.03628921508789, "logps/rejected": -58.25613021850586, "loss": 0.9998, "rewards/accuracies": 0.46875, "rewards/chosen": 0.010653482750058174, "rewards/margins": 0.0008586505427956581, "rewards/rejected": 0.009794832207262516, "step": 26 }, { "epoch": 0.033116136450747416, "grad_norm": 2.3325679302215576, "learning_rate": 9.968098159509201e-07, "logits/chosen": -2.21683406829834, "logits/rejected": -2.260067939758301, "logps/chosen": -61.77953338623047, "logps/rejected": -59.52372741699219, "loss": 0.9982, "rewards/accuracies": 0.5, "rewards/chosen": 0.007733452599495649, "rewards/margins": 0.007248271256685257, "rewards/rejected": 0.00048518116818740964, "step": 27 }, { "epoch": 0.034342660022997316, "grad_norm": 2.416856050491333, "learning_rate": 9.966871165644172e-07, "logits/chosen": -2.276193380355835, "logits/rejected": -2.288438081741333, "logps/chosen": -54.636444091796875, "logps/rejected": -55.1606330871582, "loss": 0.9994, "rewards/accuracies": 0.4375, "rewards/chosen": -0.012577496469020844, "rewards/margins": 0.0025397762656211853, "rewards/rejected": -0.015117276459932327, "step": 28 }, { "epoch": 0.035569183595247224, "grad_norm": 2.1659514904022217, "learning_rate": 9.96564417177914e-07, "logits/chosen": -2.156749963760376, "logits/rejected": -2.1809792518615723, "logps/chosen": -56.62805938720703, "logps/rejected": -56.7884635925293, "loss": 0.9984, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01458813063800335, "rewards/margins": 0.0064969900995492935, "rewards/rejected": -0.021085122600197792, "step": 29 }, { "epoch": 0.036795707167497124, "grad_norm": 1.8536784648895264, "learning_rate": 9.96441717791411e-07, "logits/chosen": -2.2663216590881348, "logits/rejected": -2.2836265563964844, "logps/chosen": -56.308746337890625, "logps/rejected": -54.5359001159668, "loss": 0.9976, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01676856353878975, "rewards/margins": 0.0097883939743042, "rewards/rejected": 0.006980167701840401, "step": 30 }, { "epoch": 0.03802223073974703, "grad_norm": 2.3641104698181152, "learning_rate": 9.963190184049078e-07, "logits/chosen": -2.2422733306884766, "logits/rejected": -2.2931559085845947, "logps/chosen": -58.076961517333984, "logps/rejected": -52.7287712097168, "loss": 0.9991, "rewards/accuracies": 0.5, "rewards/chosen": 0.020653333514928818, "rewards/margins": 0.0036722319200634956, "rewards/rejected": 0.016981102526187897, "step": 31 }, { "epoch": 0.03924875431199693, "grad_norm": 2.743985652923584, "learning_rate": 9.96196319018405e-07, "logits/chosen": -2.1930863857269287, "logits/rejected": -2.209193468093872, "logps/chosen": -56.367637634277344, "logps/rejected": -56.3955078125, "loss": 0.9937, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0066816918551921844, "rewards/margins": 0.025072097778320312, "rewards/rejected": -0.0317537896335125, "step": 32 }, { "epoch": 0.04047527788424684, "grad_norm": 2.10446834564209, "learning_rate": 9.960736196319018e-07, "logits/chosen": -2.3093085289001465, "logits/rejected": -2.2977101802825928, "logps/chosen": -57.12293243408203, "logps/rejected": -56.22285079956055, "loss": 1.005, "rewards/accuracies": 0.46875, "rewards/chosen": 0.010873698629438877, "rewards/margins": -0.019869647920131683, "rewards/rejected": 0.030743349343538284, "step": 33 }, { "epoch": 0.04170180145649674, "grad_norm": 2.094043016433716, "learning_rate": 9.959509202453987e-07, "logits/chosen": -2.2226099967956543, "logits/rejected": -2.228837013244629, "logps/chosen": -58.128578186035156, "logps/rejected": -56.803165435791016, "loss": 1.0011, "rewards/accuracies": 0.5, "rewards/chosen": -0.01998429372906685, "rewards/margins": -0.004333687014877796, "rewards/rejected": -0.015650607645511627, "step": 34 }, { "epoch": 0.04292832502874665, "grad_norm": 2.2499382495880127, "learning_rate": 9.958282208588956e-07, "logits/chosen": -2.268580913543701, "logits/rejected": -2.254255533218384, "logps/chosen": -58.368534088134766, "logps/rejected": -58.83753204345703, "loss": 0.9926, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004348576068878174, "rewards/margins": 0.029439734295010567, "rewards/rejected": -0.025091160088777542, "step": 35 }, { "epoch": 0.04415484860099655, "grad_norm": 1.9523760080337524, "learning_rate": 9.957055214723927e-07, "logits/chosen": -2.253394603729248, "logits/rejected": -2.237548351287842, "logps/chosen": -56.00032424926758, "logps/rejected": -55.11772918701172, "loss": 0.9968, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0041685812175273895, "rewards/margins": 0.012901712208986282, "rewards/rejected": -0.01707029528915882, "step": 36 }, { "epoch": 0.045381372173246455, "grad_norm": 1.9436532258987427, "learning_rate": 9.955828220858896e-07, "logits/chosen": -2.2186617851257324, "logits/rejected": -2.2172160148620605, "logps/chosen": -56.150367736816406, "logps/rejected": -58.27134704589844, "loss": 1.001, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01626422442495823, "rewards/margins": -0.004146600607782602, "rewards/rejected": -0.012117625214159489, "step": 37 }, { "epoch": 0.046607895745496355, "grad_norm": 2.1912147998809814, "learning_rate": 9.954601226993864e-07, "logits/chosen": -2.2081079483032227, "logits/rejected": -2.2306151390075684, "logps/chosen": -61.017005920410156, "logps/rejected": -58.84674835205078, "loss": 1.0002, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002624755725264549, "rewards/margins": -0.0009063859470188618, "rewards/rejected": 0.0011688594240695238, "step": 38 }, { "epoch": 0.04783441931774626, "grad_norm": 2.213470458984375, "learning_rate": 9.953374233128833e-07, "logits/chosen": -2.2412030696868896, "logits/rejected": -2.242182970046997, "logps/chosen": -60.18362808227539, "logps/rejected": -60.83038330078125, "loss": 0.9998, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00011015927884727716, "rewards/margins": 0.0007920251227915287, "rewards/rejected": -0.0006818640977144241, "step": 39 }, { "epoch": 0.04906094288999617, "grad_norm": 2.159013271331787, "learning_rate": 9.952147239263802e-07, "logits/chosen": -2.209129810333252, "logits/rejected": -2.1785292625427246, "logps/chosen": -57.59700012207031, "logps/rejected": -54.92272186279297, "loss": 1.0037, "rewards/accuracies": 0.46875, "rewards/chosen": -0.002740645781159401, "rewards/margins": -0.015059580095112324, "rewards/rejected": 0.012318935245275497, "step": 40 }, { "epoch": 0.05028746646224607, "grad_norm": 2.219646692276001, "learning_rate": 9.950920245398773e-07, "logits/chosen": -2.2795159816741943, "logits/rejected": -2.2575933933258057, "logps/chosen": -59.43921661376953, "logps/rejected": -56.843841552734375, "loss": 1.0061, "rewards/accuracies": 0.40625, "rewards/chosen": -0.021708477288484573, "rewards/margins": -0.02448113262653351, "rewards/rejected": 0.0027726534754037857, "step": 41 }, { "epoch": 0.05151399003449598, "grad_norm": 2.1003174781799316, "learning_rate": 9.949693251533742e-07, "logits/chosen": -2.2343554496765137, "logits/rejected": -2.2434566020965576, "logps/chosen": -57.88468933105469, "logps/rejected": -53.46592330932617, "loss": 0.9982, "rewards/accuracies": 0.5, "rewards/chosen": -0.009445357136428356, "rewards/margins": 0.007013438735157251, "rewards/rejected": -0.016458798199892044, "step": 42 }, { "epoch": 0.05274051360674588, "grad_norm": 3.928990364074707, "learning_rate": 9.948466257668713e-07, "logits/chosen": -2.2342703342437744, "logits/rejected": -2.281601667404175, "logps/chosen": -57.62091064453125, "logps/rejected": -56.6435661315918, "loss": 0.9957, "rewards/accuracies": 0.53125, "rewards/chosen": -0.015808403491973877, "rewards/margins": 0.0171426422894001, "rewards/rejected": -0.03295104205608368, "step": 43 }, { "epoch": 0.053967037178995786, "grad_norm": 1.701369285583496, "learning_rate": 9.947239263803681e-07, "logits/chosen": -2.2416775226593018, "logits/rejected": -2.2885074615478516, "logps/chosen": -55.75666427612305, "logps/rejected": -53.847068786621094, "loss": 0.994, "rewards/accuracies": 0.625, "rewards/chosen": 0.012281466275453568, "rewards/margins": 0.023988571017980576, "rewards/rejected": -0.011707105673849583, "step": 44 }, { "epoch": 0.055193560751245686, "grad_norm": 2.0544931888580322, "learning_rate": 9.94601226993865e-07, "logits/chosen": -2.254025936126709, "logits/rejected": -2.2693798542022705, "logps/chosen": -56.8564567565918, "logps/rejected": -56.0155029296875, "loss": 0.9981, "rewards/accuracies": 0.5, "rewards/chosen": 0.017573246732354164, "rewards/margins": 0.007478821091353893, "rewards/rejected": 0.01009442750364542, "step": 45 }, { "epoch": 0.056420084323495594, "grad_norm": 2.037625312805176, "learning_rate": 9.94478527607362e-07, "logits/chosen": -2.1812431812286377, "logits/rejected": -2.1654458045959473, "logps/chosen": -59.36729049682617, "logps/rejected": -57.4759407043457, "loss": 0.9985, "rewards/accuracies": 0.5, "rewards/chosen": 0.024743519723415375, "rewards/margins": 0.0061497557908296585, "rewards/rejected": 0.018593765795230865, "step": 46 }, { "epoch": 0.057646607895745494, "grad_norm": 3.0114245414733887, "learning_rate": 9.943558282208588e-07, "logits/chosen": -2.2033047676086426, "logits/rejected": -2.2196996212005615, "logps/chosen": -60.54022979736328, "logps/rejected": -57.22377395629883, "loss": 0.9988, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0018752932082861662, "rewards/margins": 0.0047811768017709255, "rewards/rejected": -0.002905883127823472, "step": 47 }, { "epoch": 0.0588731314679954, "grad_norm": 1.9805961847305298, "learning_rate": 9.942331288343559e-07, "logits/chosen": -2.297060489654541, "logits/rejected": -2.304605722427368, "logps/chosen": -56.84038543701172, "logps/rejected": -55.31877136230469, "loss": 0.9953, "rewards/accuracies": 0.5625, "rewards/chosen": 0.011200978420674801, "rewards/margins": 0.018666958436369896, "rewards/rejected": -0.007465982809662819, "step": 48 }, { "epoch": 0.0600996550402453, "grad_norm": 2.2337071895599365, "learning_rate": 9.941104294478528e-07, "logits/chosen": -2.250645637512207, "logits/rejected": -2.2615787982940674, "logps/chosen": -58.05844497680664, "logps/rejected": -55.51797103881836, "loss": 0.9944, "rewards/accuracies": 0.75, "rewards/chosen": 0.009682191535830498, "rewards/margins": 0.022527826949954033, "rewards/rejected": -0.012845635414123535, "step": 49 }, { "epoch": 0.06132617861249521, "grad_norm": 2.162181854248047, "learning_rate": 9.939877300613496e-07, "logits/chosen": -2.200345039367676, "logits/rejected": -2.198975086212158, "logps/chosen": -57.39371871948242, "logps/rejected": -57.28093338012695, "loss": 0.99, "rewards/accuracies": 0.71875, "rewards/chosen": 0.027413060888648033, "rewards/margins": 0.03983667865395546, "rewards/rejected": -0.012423623353242874, "step": 50 }, { "epoch": 0.06255270218474511, "grad_norm": 2.034470558166504, "learning_rate": 9.938650306748465e-07, "logits/chosen": -2.2101895809173584, "logits/rejected": -2.244197368621826, "logps/chosen": -57.20738983154297, "logps/rejected": -57.939361572265625, "loss": 0.9872, "rewards/accuracies": 0.59375, "rewards/chosen": 0.021804500371217728, "rewards/margins": 0.051183320581912994, "rewards/rejected": -0.029378822073340416, "step": 51 }, { "epoch": 0.06377922575699502, "grad_norm": 3.147287130355835, "learning_rate": 9.937423312883436e-07, "logits/chosen": -2.2286336421966553, "logits/rejected": -2.204883098602295, "logps/chosen": -60.0673828125, "logps/rejected": -54.68867492675781, "loss": 1.0082, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0011117109097540379, "rewards/margins": -0.03300487995147705, "rewards/rejected": 0.03189317137002945, "step": 52 }, { "epoch": 0.06500574932924492, "grad_norm": 2.0518081188201904, "learning_rate": 9.936196319018405e-07, "logits/chosen": -2.2065370082855225, "logits/rejected": -2.2323343753814697, "logps/chosen": -57.904754638671875, "logps/rejected": -55.75952911376953, "loss": 0.9935, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0010091906879097223, "rewards/margins": 0.026194598525762558, "rewards/rejected": -0.02518540434539318, "step": 53 }, { "epoch": 0.06623227290149483, "grad_norm": 1.8963403701782227, "learning_rate": 9.934969325153374e-07, "logits/chosen": -2.2428646087646484, "logits/rejected": -2.235370635986328, "logps/chosen": -56.63778305053711, "logps/rejected": -56.15742492675781, "loss": 0.9929, "rewards/accuracies": 0.625, "rewards/chosen": 0.004592918325215578, "rewards/margins": 0.028263721615076065, "rewards/rejected": -0.023670803755521774, "step": 54 }, { "epoch": 0.06745879647374473, "grad_norm": 1.9178003072738647, "learning_rate": 9.933742331288343e-07, "logits/chosen": -2.2449121475219727, "logits/rejected": -2.2477476596832275, "logps/chosen": -57.41055679321289, "logps/rejected": -58.86910629272461, "loss": 1.0031, "rewards/accuracies": 0.375, "rewards/chosen": -0.02249237336218357, "rewards/margins": -0.012546278536319733, "rewards/rejected": -0.009946094825863838, "step": 55 }, { "epoch": 0.06868532004599463, "grad_norm": 2.1461381912231445, "learning_rate": 9.932515337423313e-07, "logits/chosen": -2.183260917663574, "logits/rejected": -2.225081205368042, "logps/chosen": -58.020790100097656, "logps/rejected": -56.98574447631836, "loss": 1.0021, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0032472724560648203, "rewards/margins": -0.008251474238932133, "rewards/rejected": 0.005004202947020531, "step": 56 }, { "epoch": 0.06991184361824454, "grad_norm": 2.2452950477600098, "learning_rate": 9.931288343558282e-07, "logits/chosen": -2.244309186935425, "logits/rejected": -2.2415380477905273, "logps/chosen": -55.967689514160156, "logps/rejected": -57.158355712890625, "loss": 0.9988, "rewards/accuracies": 0.5, "rewards/chosen": -0.02299964614212513, "rewards/margins": 0.004892921075224876, "rewards/rejected": -0.027892569079995155, "step": 57 }, { "epoch": 0.07113836719049445, "grad_norm": 2.1530628204345703, "learning_rate": 9.93006134969325e-07, "logits/chosen": -2.2580480575561523, "logits/rejected": -2.234008312225342, "logps/chosen": -58.39783477783203, "logps/rejected": -57.55308532714844, "loss": 1.0038, "rewards/accuracies": 0.53125, "rewards/chosen": -0.020692924037575722, "rewards/margins": -0.01517806202173233, "rewards/rejected": -0.005514861084520817, "step": 58 }, { "epoch": 0.07236489076274434, "grad_norm": 2.255979537963867, "learning_rate": 9.92883435582822e-07, "logits/chosen": -2.247986316680908, "logits/rejected": -2.2259864807128906, "logps/chosen": -59.8474006652832, "logps/rejected": -55.13397979736328, "loss": 1.006, "rewards/accuracies": 0.5, "rewards/chosen": 0.013217128813266754, "rewards/margins": -0.024193499237298965, "rewards/rejected": 0.03741062805056572, "step": 59 }, { "epoch": 0.07359141433499425, "grad_norm": 2.024268388748169, "learning_rate": 9.927607361963189e-07, "logits/chosen": -2.250873565673828, "logits/rejected": -2.260991096496582, "logps/chosen": -57.24102020263672, "logps/rejected": -56.05735778808594, "loss": 0.9963, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01204775646328926, "rewards/margins": 0.014907587319612503, "rewards/rejected": -0.002859831787645817, "step": 60 }, { "epoch": 0.07481793790724416, "grad_norm": 2.946913480758667, "learning_rate": 9.92638036809816e-07, "logits/chosen": -2.2987818717956543, "logits/rejected": -2.266106367111206, "logps/chosen": -57.15242004394531, "logps/rejected": -55.597957611083984, "loss": 1.0074, "rewards/accuracies": 0.46875, "rewards/chosen": -0.02158566750586033, "rewards/margins": -0.02950589545071125, "rewards/rejected": 0.007920228876173496, "step": 61 }, { "epoch": 0.07604446147949406, "grad_norm": 2.210632085800171, "learning_rate": 9.925153374233128e-07, "logits/chosen": -2.266141414642334, "logits/rejected": -2.279477834701538, "logps/chosen": -60.119319915771484, "logps/rejected": -57.743526458740234, "loss": 0.9994, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0016003381460905075, "rewards/margins": 0.0023128269240260124, "rewards/rejected": -0.0007124899420887232, "step": 62 }, { "epoch": 0.07727098505174396, "grad_norm": 1.9800574779510498, "learning_rate": 9.923926380368097e-07, "logits/chosen": -2.2023539543151855, "logits/rejected": -2.208343267440796, "logps/chosen": -57.435882568359375, "logps/rejected": -55.59637451171875, "loss": 0.9891, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02280469983816147, "rewards/margins": 0.04351503774523735, "rewards/rejected": -0.020710337907075882, "step": 63 }, { "epoch": 0.07849750862399386, "grad_norm": 2.1346797943115234, "learning_rate": 9.922699386503068e-07, "logits/chosen": -2.2990403175354004, "logits/rejected": -2.2595291137695312, "logps/chosen": -54.42109680175781, "logps/rejected": -55.48493957519531, "loss": 0.9933, "rewards/accuracies": 0.6875, "rewards/chosen": 0.025529885664582253, "rewards/margins": 0.026759063825011253, "rewards/rejected": -0.0012291790917515755, "step": 64 }, { "epoch": 0.07972403219624377, "grad_norm": 2.247443675994873, "learning_rate": 9.921472392638037e-07, "logits/chosen": -2.1848700046539307, "logits/rejected": -2.203855276107788, "logps/chosen": -61.42689895629883, "logps/rejected": -56.18310546875, "loss": 0.998, "rewards/accuracies": 0.5625, "rewards/chosen": 0.020248759537935257, "rewards/margins": 0.007847142405807972, "rewards/rejected": 0.012401617132127285, "step": 65 }, { "epoch": 0.08095055576849368, "grad_norm": 1.849217414855957, "learning_rate": 9.920245398773006e-07, "logits/chosen": -2.2037224769592285, "logits/rejected": -2.2081668376922607, "logps/chosen": -56.08262252807617, "logps/rejected": -57.061073303222656, "loss": 0.9953, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0024696355685591698, "rewards/margins": 0.018924225121736526, "rewards/rejected": -0.02139385975897312, "step": 66 }, { "epoch": 0.08217707934074359, "grad_norm": 2.1660242080688477, "learning_rate": 9.919018404907975e-07, "logits/chosen": -2.250087261199951, "logits/rejected": -2.2883150577545166, "logps/chosen": -55.6312370300293, "logps/rejected": -54.87425231933594, "loss": 0.9992, "rewards/accuracies": 0.46875, "rewards/chosen": 0.026813924312591553, "rewards/margins": 0.0034438390284776688, "rewards/rejected": 0.023370087146759033, "step": 67 }, { "epoch": 0.08340360291299348, "grad_norm": 3.668339490890503, "learning_rate": 9.917791411042943e-07, "logits/chosen": -2.228538990020752, "logits/rejected": -2.2596051692962646, "logps/chosen": -63.04720687866211, "logps/rejected": -63.28890609741211, "loss": 0.9955, "rewards/accuracies": 0.59375, "rewards/chosen": 0.037978220731019974, "rewards/margins": 0.01782071590423584, "rewards/rejected": 0.020157504826784134, "step": 68 }, { "epoch": 0.08463012648524339, "grad_norm": 2.7660553455352783, "learning_rate": 9.916564417177914e-07, "logits/chosen": -2.1827054023742676, "logits/rejected": -2.2031989097595215, "logps/chosen": -58.06291580200195, "logps/rejected": -54.29638671875, "loss": 0.9962, "rewards/accuracies": 0.5, "rewards/chosen": 0.022410273551940918, "rewards/margins": 0.01532666850835085, "rewards/rejected": 0.007083607371896505, "step": 69 }, { "epoch": 0.0858566500574933, "grad_norm": 2.203369617462158, "learning_rate": 9.915337423312883e-07, "logits/chosen": -2.255747079849243, "logits/rejected": -2.268655776977539, "logps/chosen": -57.09893035888672, "logps/rejected": -56.395023345947266, "loss": 0.9979, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06413205713033676, "rewards/margins": 0.008275414817035198, "rewards/rejected": 0.05585664510726929, "step": 70 }, { "epoch": 0.0870831736297432, "grad_norm": 2.2016663551330566, "learning_rate": 9.914110429447852e-07, "logits/chosen": -2.166123390197754, "logits/rejected": -2.1980714797973633, "logps/chosen": -57.98768615722656, "logps/rejected": -56.80472183227539, "loss": 1.0047, "rewards/accuracies": 0.40625, "rewards/chosen": -0.020271241664886475, "rewards/margins": -0.01899779587984085, "rewards/rejected": -0.001273446250706911, "step": 71 }, { "epoch": 0.0883096972019931, "grad_norm": 2.423419237136841, "learning_rate": 9.91288343558282e-07, "logits/chosen": -2.217855453491211, "logits/rejected": -2.242621898651123, "logps/chosen": -57.96121597290039, "logps/rejected": -56.31623840332031, "loss": 1.0007, "rewards/accuracies": 0.40625, "rewards/chosen": 0.015578508377075195, "rewards/margins": -0.002946578897535801, "rewards/rejected": 0.01852509006857872, "step": 72 }, { "epoch": 0.089536220774243, "grad_norm": 2.086735725402832, "learning_rate": 9.911656441717792e-07, "logits/chosen": -2.2793188095092773, "logits/rejected": -2.2732226848602295, "logps/chosen": -59.148040771484375, "logps/rejected": -56.863861083984375, "loss": 0.9984, "rewards/accuracies": 0.53125, "rewards/chosen": -0.012432479299604893, "rewards/margins": 0.006603384390473366, "rewards/rejected": -0.019035864621400833, "step": 73 }, { "epoch": 0.09076274434649291, "grad_norm": 2.1298177242279053, "learning_rate": 9.91042944785276e-07, "logits/chosen": -2.207160711288452, "logits/rejected": -2.1873273849487305, "logps/chosen": -58.6773567199707, "logps/rejected": -58.31796646118164, "loss": 0.9996, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01672659069299698, "rewards/margins": 0.0015267468988895416, "rewards/rejected": -0.01825333759188652, "step": 74 }, { "epoch": 0.09198926791874282, "grad_norm": 2.3379173278808594, "learning_rate": 9.90920245398773e-07, "logits/chosen": -2.2984378337860107, "logits/rejected": -2.3080859184265137, "logps/chosen": -57.556129455566406, "logps/rejected": -53.37626266479492, "loss": 0.9953, "rewards/accuracies": 0.5, "rewards/chosen": 0.013839926570653915, "rewards/margins": 0.018947675824165344, "rewards/rejected": -0.005107749253511429, "step": 75 }, { "epoch": 0.09321579149099271, "grad_norm": 2.2884349822998047, "learning_rate": 9.9079754601227e-07, "logits/chosen": -2.220034122467041, "logits/rejected": -2.241994857788086, "logps/chosen": -58.631813049316406, "logps/rejected": -56.61625671386719, "loss": 1.0023, "rewards/accuracies": 0.5, "rewards/chosen": 0.002953995717689395, "rewards/margins": -0.009158171713352203, "rewards/rejected": 0.012112165801227093, "step": 76 }, { "epoch": 0.09444231506324262, "grad_norm": 2.5483505725860596, "learning_rate": 9.906748466257669e-07, "logits/chosen": -2.260030508041382, "logits/rejected": -2.2778549194335938, "logps/chosen": -58.821998596191406, "logps/rejected": -56.599647521972656, "loss": 0.9989, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01094834879040718, "rewards/margins": 0.0044662365689873695, "rewards/rejected": -0.015414584428071976, "step": 77 }, { "epoch": 0.09566883863549253, "grad_norm": 2.298429012298584, "learning_rate": 9.905521472392638e-07, "logits/chosen": -2.2159628868103027, "logits/rejected": -2.2128615379333496, "logps/chosen": -56.14712142944336, "logps/rejected": -54.143409729003906, "loss": 0.9825, "rewards/accuracies": 0.8125, "rewards/chosen": 0.057406555861234665, "rewards/margins": 0.07014358043670654, "rewards/rejected": -0.012737023644149303, "step": 78 }, { "epoch": 0.09689536220774243, "grad_norm": 2.1310760974884033, "learning_rate": 9.904294478527607e-07, "logits/chosen": -2.2696566581726074, "logits/rejected": -2.2660999298095703, "logps/chosen": -58.931419372558594, "logps/rejected": -58.12592315673828, "loss": 0.9947, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010283865965902805, "rewards/margins": 0.021106790751218796, "rewards/rejected": -0.010822927579283714, "step": 79 }, { "epoch": 0.09812188577999234, "grad_norm": 2.2274513244628906, "learning_rate": 9.903067484662575e-07, "logits/chosen": -2.2328317165374756, "logits/rejected": -2.2461678981781006, "logps/chosen": -54.33456802368164, "logps/rejected": -55.776119232177734, "loss": 0.9954, "rewards/accuracies": 0.5625, "rewards/chosen": 0.018447507172822952, "rewards/margins": 0.018564498052001, "rewards/rejected": -0.00011699134483933449, "step": 80 }, { "epoch": 0.09934840935224223, "grad_norm": 2.6776108741760254, "learning_rate": 9.901840490797544e-07, "logits/chosen": -2.175020456314087, "logits/rejected": -2.2328999042510986, "logps/chosen": -57.85050964355469, "logps/rejected": -55.94074630737305, "loss": 0.9867, "rewards/accuracies": 0.625, "rewards/chosen": 0.04216725751757622, "rewards/margins": 0.05337193235754967, "rewards/rejected": -0.0112046729773283, "step": 81 }, { "epoch": 0.10057493292449214, "grad_norm": 2.322758436203003, "learning_rate": 9.900613496932515e-07, "logits/chosen": -2.1618313789367676, "logits/rejected": -2.16957950592041, "logps/chosen": -54.00907897949219, "logps/rejected": -55.38515853881836, "loss": 0.9954, "rewards/accuracies": 0.65625, "rewards/chosen": 0.006587133277207613, "rewards/margins": 0.018577858805656433, "rewards/rejected": -0.011990725062787533, "step": 82 }, { "epoch": 0.10180145649674205, "grad_norm": 2.4458887577056885, "learning_rate": 9.899386503067484e-07, "logits/chosen": -2.1982951164245605, "logits/rejected": -2.2292418479919434, "logps/chosen": -61.07661437988281, "logps/rejected": -57.382266998291016, "loss": 0.9849, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07361408323049545, "rewards/margins": 0.06050998717546463, "rewards/rejected": 0.013104092329740524, "step": 83 }, { "epoch": 0.10302798006899196, "grad_norm": 2.169191360473633, "learning_rate": 9.898159509202455e-07, "logits/chosen": -2.2593331336975098, "logits/rejected": -2.277379274368286, "logps/chosen": -59.36576843261719, "logps/rejected": -55.62268829345703, "loss": 0.9927, "rewards/accuracies": 0.65625, "rewards/chosen": 0.045865047723054886, "rewards/margins": 0.029055632650852203, "rewards/rejected": 0.016809415072202682, "step": 84 }, { "epoch": 0.10425450364124185, "grad_norm": 2.3098719120025635, "learning_rate": 9.896932515337424e-07, "logits/chosen": -2.207205295562744, "logits/rejected": -2.2169880867004395, "logps/chosen": -59.23664093017578, "logps/rejected": -60.24756622314453, "loss": 0.9936, "rewards/accuracies": 0.625, "rewards/chosen": 0.024843264371156693, "rewards/margins": 0.02543553151190281, "rewards/rejected": -0.0005922680720686913, "step": 85 }, { "epoch": 0.10548102721349176, "grad_norm": 2.17486572265625, "learning_rate": 9.895705521472392e-07, "logits/chosen": -2.196558952331543, "logits/rejected": -2.2169086933135986, "logps/chosen": -57.28391647338867, "logps/rejected": -60.034297943115234, "loss": 1.0012, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0006826035678386688, "rewards/margins": -0.0051172152161598206, "rewards/rejected": 0.005799818783998489, "step": 86 }, { "epoch": 0.10670755078574166, "grad_norm": 2.0904834270477295, "learning_rate": 9.894478527607361e-07, "logits/chosen": -2.279557466506958, "logits/rejected": -2.262892961502075, "logps/chosen": -58.00033950805664, "logps/rejected": -56.42808532714844, "loss": 0.9973, "rewards/accuracies": 0.625, "rewards/chosen": 0.0045289164409041405, "rewards/margins": 0.010521973483264446, "rewards/rejected": -0.005993055645376444, "step": 87 }, { "epoch": 0.10793407435799157, "grad_norm": 2.0575339794158936, "learning_rate": 9.89325153374233e-07, "logits/chosen": -2.2200710773468018, "logits/rejected": -2.2469797134399414, "logps/chosen": -56.09221649169922, "logps/rejected": -55.29115295410156, "loss": 0.9887, "rewards/accuracies": 0.65625, "rewards/chosen": 0.027255523949861526, "rewards/margins": 0.04522604122757912, "rewards/rejected": -0.01797051541507244, "step": 88 }, { "epoch": 0.10916059793024147, "grad_norm": 2.484066963195801, "learning_rate": 9.8920245398773e-07, "logits/chosen": -2.208803653717041, "logits/rejected": -2.2626304626464844, "logps/chosen": -56.95098114013672, "logps/rejected": -54.60477066040039, "loss": 0.9943, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04773905500769615, "rewards/margins": 0.023007024079561234, "rewards/rejected": 0.024732030928134918, "step": 89 }, { "epoch": 0.11038712150249137, "grad_norm": 2.2320594787597656, "learning_rate": 9.89079754601227e-07, "logits/chosen": -2.2233641147613525, "logits/rejected": -2.222987651824951, "logps/chosen": -54.6097297668457, "logps/rejected": -55.74415969848633, "loss": 0.9922, "rewards/accuracies": 0.59375, "rewards/chosen": 0.018405044451355934, "rewards/margins": 0.031208183616399765, "rewards/rejected": -0.012803137302398682, "step": 90 }, { "epoch": 0.11161364507474128, "grad_norm": 2.1084811687469482, "learning_rate": 9.889570552147239e-07, "logits/chosen": -2.149574041366577, "logits/rejected": -2.2055718898773193, "logps/chosen": -57.45439910888672, "logps/rejected": -56.21996307373047, "loss": 0.9992, "rewards/accuracies": 0.34375, "rewards/chosen": 0.03996549919247627, "rewards/margins": 0.003164814319461584, "rewards/rejected": 0.03680068254470825, "step": 91 }, { "epoch": 0.11284016864699119, "grad_norm": 2.525714874267578, "learning_rate": 9.888343558282207e-07, "logits/chosen": -2.184903621673584, "logits/rejected": -2.1881496906280518, "logps/chosen": -54.82279968261719, "logps/rejected": -54.727813720703125, "loss": 0.9941, "rewards/accuracies": 0.5, "rewards/chosen": 0.040510378777980804, "rewards/margins": 0.02368241548538208, "rewards/rejected": 0.016827967017889023, "step": 92 }, { "epoch": 0.1140666922192411, "grad_norm": 2.515986680984497, "learning_rate": 9.887116564417178e-07, "logits/chosen": -2.1623003482818604, "logits/rejected": -2.1887543201446533, "logps/chosen": -62.05735397338867, "logps/rejected": -59.28530502319336, "loss": 0.9924, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07816734910011292, "rewards/margins": 0.030774114653468132, "rewards/rejected": 0.047393228858709335, "step": 93 }, { "epoch": 0.11529321579149099, "grad_norm": 2.33927059173584, "learning_rate": 9.885889570552147e-07, "logits/chosen": -2.2295618057250977, "logits/rejected": -2.1955881118774414, "logps/chosen": -56.86563491821289, "logps/rejected": -54.18097686767578, "loss": 0.9936, "rewards/accuracies": 0.65625, "rewards/chosen": 0.038958266377449036, "rewards/margins": 0.02557814121246338, "rewards/rejected": 0.013380123302340508, "step": 94 }, { "epoch": 0.1165197393637409, "grad_norm": 5.572426795959473, "learning_rate": 9.884662576687116e-07, "logits/chosen": -2.2410361766815186, "logits/rejected": -2.2341580390930176, "logps/chosen": -61.374454498291016, "logps/rejected": -58.64364242553711, "loss": 0.9967, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0718660056591034, "rewards/margins": 0.01319122314453125, "rewards/rejected": 0.05867477506399155, "step": 95 }, { "epoch": 0.1177462629359908, "grad_norm": 2.1333658695220947, "learning_rate": 9.883435582822085e-07, "logits/chosen": -2.231740951538086, "logits/rejected": -2.2465732097625732, "logps/chosen": -56.48893737792969, "logps/rejected": -55.25089645385742, "loss": 0.9873, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07201765477657318, "rewards/margins": 0.051066622138023376, "rewards/rejected": 0.020951032638549805, "step": 96 }, { "epoch": 0.11897278650824071, "grad_norm": 2.160738468170166, "learning_rate": 9.882208588957056e-07, "logits/chosen": -2.178673028945923, "logits/rejected": -2.19962477684021, "logps/chosen": -56.706851959228516, "logps/rejected": -56.46717834472656, "loss": 1.0033, "rewards/accuracies": 0.53125, "rewards/chosen": 0.02494976483285427, "rewards/margins": -0.01324467547237873, "rewards/rejected": 0.0381944440305233, "step": 97 }, { "epoch": 0.1201993100804906, "grad_norm": 2.653397560119629, "learning_rate": 9.880981595092024e-07, "logits/chosen": -2.2528443336486816, "logits/rejected": -2.246095657348633, "logps/chosen": -56.53608703613281, "logps/rejected": -55.942665100097656, "loss": 0.9974, "rewards/accuracies": 0.59375, "rewards/chosen": 0.018698800355196, "rewards/margins": 0.010416649281978607, "rewards/rejected": 0.008282149210572243, "step": 98 }, { "epoch": 0.12142583365274051, "grad_norm": 2.2374987602233887, "learning_rate": 9.879754601226993e-07, "logits/chosen": -2.176450729370117, "logits/rejected": -2.212080717086792, "logps/chosen": -60.38703918457031, "logps/rejected": -57.68577575683594, "loss": 0.9856, "rewards/accuracies": 0.75, "rewards/chosen": 0.04828538000583649, "rewards/margins": 0.05785039812326431, "rewards/rejected": -0.00956502091139555, "step": 99 }, { "epoch": 0.12265235722499042, "grad_norm": 2.4510154724121094, "learning_rate": 9.878527607361962e-07, "logits/chosen": -2.2310104370117188, "logits/rejected": -2.244642972946167, "logps/chosen": -53.5712890625, "logps/rejected": -53.99269104003906, "loss": 0.9899, "rewards/accuracies": 0.59375, "rewards/chosen": 0.061242200434207916, "rewards/margins": 0.04057997465133667, "rewards/rejected": 0.020662225782871246, "step": 100 }, { "epoch": 0.12387888079724033, "grad_norm": 2.36272931098938, "learning_rate": 9.87730061349693e-07, "logits/chosen": -2.2213730812072754, "logits/rejected": -2.216052770614624, "logps/chosen": -55.230369567871094, "logps/rejected": -59.08475112915039, "loss": 0.9927, "rewards/accuracies": 0.53125, "rewards/chosen": 0.046291567385196686, "rewards/margins": 0.029249893501400948, "rewards/rejected": 0.01704167202115059, "step": 101 }, { "epoch": 0.12510540436949022, "grad_norm": 2.544464349746704, "learning_rate": 9.876073619631902e-07, "logits/chosen": -2.2369418144226074, "logits/rejected": -2.2387819290161133, "logps/chosen": -54.81662368774414, "logps/rejected": -56.570465087890625, "loss": 0.9967, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04191000759601593, "rewards/margins": 0.013410964049398899, "rewards/rejected": 0.028499042615294456, "step": 102 }, { "epoch": 0.12633192794174014, "grad_norm": 2.3228518962860107, "learning_rate": 9.87484662576687e-07, "logits/chosen": -2.2635767459869385, "logits/rejected": -2.2348244190216064, "logps/chosen": -58.016258239746094, "logps/rejected": -56.697669982910156, "loss": 0.994, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06556857377290726, "rewards/margins": 0.024065425619482994, "rewards/rejected": 0.041503142565488815, "step": 103 }, { "epoch": 0.12755845151399003, "grad_norm": 2.479665756225586, "learning_rate": 9.873619631901841e-07, "logits/chosen": -2.234452962875366, "logits/rejected": -2.2672739028930664, "logps/chosen": -58.04557800292969, "logps/rejected": -56.31081771850586, "loss": 0.9867, "rewards/accuracies": 0.65625, "rewards/chosen": 0.06356494128704071, "rewards/margins": 0.05371653288602829, "rewards/rejected": 0.009848416782915592, "step": 104 }, { "epoch": 0.12878497508623993, "grad_norm": 2.571824550628662, "learning_rate": 9.87239263803681e-07, "logits/chosen": -2.1995973587036133, "logits/rejected": -2.2214748859405518, "logps/chosen": -59.16331481933594, "logps/rejected": -57.684234619140625, "loss": 0.9892, "rewards/accuracies": 0.53125, "rewards/chosen": 0.05362558364868164, "rewards/margins": 0.04347953945398331, "rewards/rejected": 0.010146046057343483, "step": 105 }, { "epoch": 0.13001149865848985, "grad_norm": 2.809504747390747, "learning_rate": 9.87116564417178e-07, "logits/chosen": -2.235966920852661, "logits/rejected": -2.244922637939453, "logps/chosen": -57.78858184814453, "logps/rejected": -57.50777053833008, "loss": 0.9961, "rewards/accuracies": 0.5, "rewards/chosen": 0.00335839856415987, "rewards/margins": 0.01587199978530407, "rewards/rejected": -0.012513602152466774, "step": 106 }, { "epoch": 0.13123802223073974, "grad_norm": 2.7958109378814697, "learning_rate": 9.869938650306748e-07, "logits/chosen": -2.222673177719116, "logits/rejected": -2.234426259994507, "logps/chosen": -56.88990783691406, "logps/rejected": -54.719078063964844, "loss": 0.9887, "rewards/accuracies": 0.625, "rewards/chosen": 0.0446627140045166, "rewards/margins": 0.04535232484340668, "rewards/rejected": -0.0006896136328577995, "step": 107 }, { "epoch": 0.13246454580298966, "grad_norm": 2.574723243713379, "learning_rate": 9.868711656441717e-07, "logits/chosen": -2.1953139305114746, "logits/rejected": -2.1432156562805176, "logps/chosen": -60.49640655517578, "logps/rejected": -58.23777770996094, "loss": 0.992, "rewards/accuracies": 0.625, "rewards/chosen": 0.06475293636322021, "rewards/margins": 0.03213369473814964, "rewards/rejected": 0.03261923789978027, "step": 108 }, { "epoch": 0.13369106937523956, "grad_norm": 2.315592050552368, "learning_rate": 9.867484662576688e-07, "logits/chosen": -2.2771315574645996, "logits/rejected": -2.265462875366211, "logps/chosen": -57.736019134521484, "logps/rejected": -60.58026123046875, "loss": 0.9973, "rewards/accuracies": 0.59375, "rewards/chosen": 0.06294280290603638, "rewards/margins": 0.01102366391569376, "rewards/rejected": 0.05191914364695549, "step": 109 }, { "epoch": 0.13491759294748945, "grad_norm": 2.376894950866699, "learning_rate": 9.866257668711656e-07, "logits/chosen": -2.2408008575439453, "logits/rejected": -2.2582507133483887, "logps/chosen": -56.430381774902344, "logps/rejected": -56.97280502319336, "loss": 1.0001, "rewards/accuracies": 0.46875, "rewards/chosen": 0.040258362889289856, "rewards/margins": -0.0003696102648973465, "rewards/rejected": 0.04062797129154205, "step": 110 }, { "epoch": 0.13614411651973937, "grad_norm": 2.724923610687256, "learning_rate": 9.865030674846625e-07, "logits/chosen": -2.2472739219665527, "logits/rejected": -2.263814687728882, "logps/chosen": -56.9896354675293, "logps/rejected": -57.801082611083984, "loss": 0.9869, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08349210023880005, "rewards/margins": 0.0523139126598835, "rewards/rejected": 0.03117818757891655, "step": 111 }, { "epoch": 0.13737064009198927, "grad_norm": 2.1500728130340576, "learning_rate": 9.863803680981594e-07, "logits/chosen": -2.2139368057250977, "logits/rejected": -2.2630324363708496, "logps/chosen": -56.345802307128906, "logps/rejected": -55.407535552978516, "loss": 0.986, "rewards/accuracies": 0.6875, "rewards/chosen": 0.022429345175623894, "rewards/margins": 0.05631376802921295, "rewards/rejected": -0.03388442099094391, "step": 112 }, { "epoch": 0.13859716366423916, "grad_norm": 2.681560516357422, "learning_rate": 9.862576687116565e-07, "logits/chosen": -2.175405263900757, "logits/rejected": -2.1887223720550537, "logps/chosen": -58.09208679199219, "logps/rejected": -60.38833999633789, "loss": 0.9991, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06196744740009308, "rewards/margins": 0.0033333320170640945, "rewards/rejected": 0.058634113520383835, "step": 113 }, { "epoch": 0.13982368723648908, "grad_norm": 2.291261672973633, "learning_rate": 9.861349693251534e-07, "logits/chosen": -2.1797096729278564, "logits/rejected": -2.204566478729248, "logps/chosen": -55.352237701416016, "logps/rejected": -55.777008056640625, "loss": 0.9927, "rewards/accuracies": 0.5, "rewards/chosen": 0.07177495211362839, "rewards/margins": 0.02960854582488537, "rewards/rejected": 0.04216640442609787, "step": 114 }, { "epoch": 0.14105021080873897, "grad_norm": 2.375121831893921, "learning_rate": 9.860122699386503e-07, "logits/chosen": -2.195547580718994, "logits/rejected": -2.228475570678711, "logps/chosen": -60.18072509765625, "logps/rejected": -57.79286193847656, "loss": 0.9942, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0625758171081543, "rewards/margins": 0.023038040846586227, "rewards/rejected": 0.03953777253627777, "step": 115 }, { "epoch": 0.1422767343809889, "grad_norm": 2.6364407539367676, "learning_rate": 9.858895705521471e-07, "logits/chosen": -2.2608048915863037, "logits/rejected": -2.296656370162964, "logps/chosen": -58.288516998291016, "logps/rejected": -55.40946578979492, "loss": 0.9838, "rewards/accuracies": 0.625, "rewards/chosen": 0.04393213987350464, "rewards/margins": 0.06514745950698853, "rewards/rejected": -0.021215317770838737, "step": 116 }, { "epoch": 0.1435032579532388, "grad_norm": 2.7200825214385986, "learning_rate": 9.857668711656442e-07, "logits/chosen": -2.207314968109131, "logits/rejected": -2.2015769481658936, "logps/chosen": -59.263545989990234, "logps/rejected": -59.332557678222656, "loss": 0.9954, "rewards/accuracies": 0.46875, "rewards/chosen": 0.05077667161822319, "rewards/margins": 0.018545519560575485, "rewards/rejected": 0.032231152057647705, "step": 117 }, { "epoch": 0.14472978152548868, "grad_norm": 2.584563970565796, "learning_rate": 9.85644171779141e-07, "logits/chosen": -2.2309558391571045, "logits/rejected": -2.2383785247802734, "logps/chosen": -54.49922180175781, "logps/rejected": -53.10736846923828, "loss": 0.9896, "rewards/accuracies": 0.59375, "rewards/chosen": 0.08011982589960098, "rewards/margins": 0.04168505594134331, "rewards/rejected": 0.038434769958257675, "step": 118 }, { "epoch": 0.1459563050977386, "grad_norm": 8.95164966583252, "learning_rate": 9.85521472392638e-07, "logits/chosen": -2.1912689208984375, "logits/rejected": -2.2389862537384033, "logps/chosen": -52.9278564453125, "logps/rejected": -53.10137176513672, "loss": 0.9934, "rewards/accuracies": 0.625, "rewards/chosen": 0.05337803438305855, "rewards/margins": 0.026281848549842834, "rewards/rejected": 0.027096185833215714, "step": 119 }, { "epoch": 0.1471828286699885, "grad_norm": 2.2694010734558105, "learning_rate": 9.853987730061349e-07, "logits/chosen": -2.2265522480010986, "logits/rejected": -2.268355131149292, "logps/chosen": -55.63481140136719, "logps/rejected": -53.0485954284668, "loss": 0.989, "rewards/accuracies": 0.625, "rewards/chosen": 0.051280271261930466, "rewards/margins": 0.04450855031609535, "rewards/rejected": 0.0067717209458351135, "step": 120 }, { "epoch": 0.14840935224223842, "grad_norm": 2.2574188709259033, "learning_rate": 9.852760736196317e-07, "logits/chosen": -2.2216715812683105, "logits/rejected": -2.2591936588287354, "logps/chosen": -57.210670471191406, "logps/rejected": -56.993080139160156, "loss": 0.9905, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09743428230285645, "rewards/margins": 0.03849995136260986, "rewards/rejected": 0.058934327214956284, "step": 121 }, { "epoch": 0.1496358758144883, "grad_norm": 2.3650128841400146, "learning_rate": 9.851533742331288e-07, "logits/chosen": -2.1989071369171143, "logits/rejected": -2.25303316116333, "logps/chosen": -57.50617218017578, "logps/rejected": -56.5103759765625, "loss": 0.9985, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06112518534064293, "rewards/margins": 0.006160021293908358, "rewards/rejected": 0.05496516078710556, "step": 122 }, { "epoch": 0.1508623993867382, "grad_norm": 2.1873087882995605, "learning_rate": 9.850306748466257e-07, "logits/chosen": -2.2880170345306396, "logits/rejected": -2.2699735164642334, "logps/chosen": -58.51746368408203, "logps/rejected": -56.17219161987305, "loss": 0.9892, "rewards/accuracies": 0.59375, "rewards/chosen": 0.05767405033111572, "rewards/margins": 0.04341989755630493, "rewards/rejected": 0.014254152774810791, "step": 123 }, { "epoch": 0.15208892295898813, "grad_norm": 2.183410167694092, "learning_rate": 9.849079754601226e-07, "logits/chosen": -2.1657543182373047, "logits/rejected": -2.1955530643463135, "logps/chosen": -54.47672653198242, "logps/rejected": -53.32976150512695, "loss": 0.9891, "rewards/accuracies": 0.59375, "rewards/chosen": 0.059931229799985886, "rewards/margins": 0.044036440551280975, "rewards/rejected": 0.01589479297399521, "step": 124 }, { "epoch": 0.15331544653123802, "grad_norm": 2.5096638202667236, "learning_rate": 9.847852760736197e-07, "logits/chosen": -2.2062809467315674, "logits/rejected": -2.2234623432159424, "logps/chosen": -53.89178466796875, "logps/rejected": -55.75862503051758, "loss": 0.9844, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09670574963092804, "rewards/margins": 0.06273829936981201, "rewards/rejected": 0.03396744653582573, "step": 125 }, { "epoch": 0.1545419701034879, "grad_norm": 2.2523341178894043, "learning_rate": 9.846625766871166e-07, "logits/chosen": -2.1977427005767822, "logits/rejected": -2.2162673473358154, "logps/chosen": -54.018497467041016, "logps/rejected": -55.56656265258789, "loss": 0.9898, "rewards/accuracies": 0.625, "rewards/chosen": 0.0520973838865757, "rewards/margins": 0.04138605669140816, "rewards/rejected": 0.010711323469877243, "step": 126 }, { "epoch": 0.15576849367573783, "grad_norm": 3.35132098197937, "learning_rate": 9.845398773006135e-07, "logits/chosen": -2.1842472553253174, "logits/rejected": -2.2284340858459473, "logps/chosen": -59.681678771972656, "logps/rejected": -59.82383728027344, "loss": 0.9931, "rewards/accuracies": 0.5625, "rewards/chosen": 0.014240515418350697, "rewards/margins": 0.02799929678440094, "rewards/rejected": -0.01375877857208252, "step": 127 }, { "epoch": 0.15699501724798773, "grad_norm": 2.2769100666046143, "learning_rate": 9.844171779141103e-07, "logits/chosen": -2.2515292167663574, "logits/rejected": -2.254279136657715, "logps/chosen": -58.19978713989258, "logps/rejected": -54.0025634765625, "loss": 0.9903, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10145643353462219, "rewards/margins": 0.03951321914792061, "rewards/rejected": 0.06194321811199188, "step": 128 }, { "epoch": 0.15822154082023765, "grad_norm": 2.1471824645996094, "learning_rate": 9.842944785276072e-07, "logits/chosen": -2.226419448852539, "logits/rejected": -2.234541177749634, "logps/chosen": -56.09847640991211, "logps/rejected": -54.35316848754883, "loss": 0.9815, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10661139339208603, "rewards/margins": 0.07435860484838486, "rewards/rejected": 0.03225278854370117, "step": 129 }, { "epoch": 0.15944806439248754, "grad_norm": 2.2908971309661865, "learning_rate": 9.841717791411043e-07, "logits/chosen": -2.2280728816986084, "logits/rejected": -2.258737087249756, "logps/chosen": -53.751548767089844, "logps/rejected": -55.801368713378906, "loss": 0.9891, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06417969614267349, "rewards/margins": 0.044179901480674744, "rewards/rejected": 0.01999979093670845, "step": 130 }, { "epoch": 0.16067458796473744, "grad_norm": 2.1481316089630127, "learning_rate": 9.840490797546012e-07, "logits/chosen": -2.1888716220855713, "logits/rejected": -2.213876724243164, "logps/chosen": -58.329795837402344, "logps/rejected": -56.56123733520508, "loss": 0.9909, "rewards/accuracies": 0.625, "rewards/chosen": 0.09066462516784668, "rewards/margins": 0.03607446327805519, "rewards/rejected": 0.05459016561508179, "step": 131 }, { "epoch": 0.16190111153698736, "grad_norm": 2.401381015777588, "learning_rate": 9.83926380368098e-07, "logits/chosen": -2.2287466526031494, "logits/rejected": -2.2693333625793457, "logps/chosen": -58.62920379638672, "logps/rejected": -56.46797180175781, "loss": 0.9959, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07409276813268661, "rewards/margins": 0.016451334580779076, "rewards/rejected": 0.05764143168926239, "step": 132 }, { "epoch": 0.16312763510923725, "grad_norm": 2.7756752967834473, "learning_rate": 9.83803680981595e-07, "logits/chosen": -2.2317628860473633, "logits/rejected": -2.205078601837158, "logps/chosen": -57.25608444213867, "logps/rejected": -58.12001037597656, "loss": 0.9876, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0683147981762886, "rewards/margins": 0.05003764480352402, "rewards/rejected": 0.018277153372764587, "step": 133 }, { "epoch": 0.16435415868148717, "grad_norm": 2.802351236343384, "learning_rate": 9.83680981595092e-07, "logits/chosen": -2.2430224418640137, "logits/rejected": -2.1937472820281982, "logps/chosen": -58.681060791015625, "logps/rejected": -58.03921127319336, "loss": 0.9955, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05588003247976303, "rewards/margins": 0.018292095512151718, "rewards/rejected": 0.03758794069290161, "step": 134 }, { "epoch": 0.16558068225373707, "grad_norm": 2.4453868865966797, "learning_rate": 9.83558282208589e-07, "logits/chosen": -2.2197189331054688, "logits/rejected": -2.229646921157837, "logps/chosen": -59.12512969970703, "logps/rejected": -57.99552917480469, "loss": 0.9966, "rewards/accuracies": 0.5, "rewards/chosen": 0.06769350916147232, "rewards/margins": 0.013790988363325596, "rewards/rejected": 0.0539025217294693, "step": 135 }, { "epoch": 0.16680720582598696, "grad_norm": 2.2878997325897217, "learning_rate": 9.834355828220858e-07, "logits/chosen": -2.1811161041259766, "logits/rejected": -2.1995491981506348, "logps/chosen": -55.00980758666992, "logps/rejected": -57.432518005371094, "loss": 0.9905, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08240540325641632, "rewards/margins": 0.03852502256631851, "rewards/rejected": 0.04388038069009781, "step": 136 }, { "epoch": 0.16803372939823688, "grad_norm": 2.4435226917266846, "learning_rate": 9.833128834355829e-07, "logits/chosen": -2.23633074760437, "logits/rejected": -2.2034530639648438, "logps/chosen": -56.184303283691406, "logps/rejected": -54.66830825805664, "loss": 0.9947, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04367038235068321, "rewards/margins": 0.021599210798740387, "rewards/rejected": 0.022071169689297676, "step": 137 }, { "epoch": 0.16926025297048677, "grad_norm": 2.4164674282073975, "learning_rate": 9.831901840490798e-07, "logits/chosen": -2.201392650604248, "logits/rejected": -2.2477214336395264, "logps/chosen": -54.45841979980469, "logps/rejected": -54.44917678833008, "loss": 0.9852, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09267554432153702, "rewards/margins": 0.05993448570370674, "rewards/rejected": 0.03274105489253998, "step": 138 }, { "epoch": 0.17048677654273667, "grad_norm": 2.3794777393341064, "learning_rate": 9.830674846625767e-07, "logits/chosen": -2.145068645477295, "logits/rejected": -2.174128293991089, "logps/chosen": -58.46399688720703, "logps/rejected": -60.2099609375, "loss": 1.0053, "rewards/accuracies": 0.4375, "rewards/chosen": 0.09949616342782974, "rewards/margins": -0.02160632610321045, "rewards/rejected": 0.1211024820804596, "step": 139 }, { "epoch": 0.1717133001149866, "grad_norm": 2.6040258407592773, "learning_rate": 9.829447852760735e-07, "logits/chosen": -2.217118501663208, "logits/rejected": -2.231038808822632, "logps/chosen": -58.50202178955078, "logps/rejected": -55.580909729003906, "loss": 0.9811, "rewards/accuracies": 0.75, "rewards/chosen": 0.14643412828445435, "rewards/margins": 0.07589123398065567, "rewards/rejected": 0.07054289430379868, "step": 140 }, { "epoch": 0.17293982368723648, "grad_norm": 2.3755552768707275, "learning_rate": 9.828220858895704e-07, "logits/chosen": -2.245476484298706, "logits/rejected": -2.296142101287842, "logps/chosen": -57.928619384765625, "logps/rejected": -55.81462097167969, "loss": 0.9796, "rewards/accuracies": 0.625, "rewards/chosen": 0.096255823969841, "rewards/margins": 0.08241868019104004, "rewards/rejected": 0.013837147504091263, "step": 141 }, { "epoch": 0.1741663472594864, "grad_norm": 2.5702269077301025, "learning_rate": 9.826993865030675e-07, "logits/chosen": -2.1908488273620605, "logits/rejected": -2.2073888778686523, "logps/chosen": -60.644500732421875, "logps/rejected": -59.73516845703125, "loss": 0.9872, "rewards/accuracies": 0.625, "rewards/chosen": 0.1100820004940033, "rewards/margins": 0.051587238907814026, "rewards/rejected": 0.058494772762060165, "step": 142 }, { "epoch": 0.1753928708317363, "grad_norm": 2.594838857650757, "learning_rate": 9.825766871165644e-07, "logits/chosen": -2.2596473693847656, "logits/rejected": -2.267533302307129, "logps/chosen": -58.92604446411133, "logps/rejected": -54.342803955078125, "loss": 0.9886, "rewards/accuracies": 0.625, "rewards/chosen": 0.08729663491249084, "rewards/margins": 0.046381011605262756, "rewards/rejected": 0.040915630757808685, "step": 143 }, { "epoch": 0.1766193944039862, "grad_norm": 2.859229803085327, "learning_rate": 9.824539877300613e-07, "logits/chosen": -2.2296152114868164, "logits/rejected": -2.2551681995391846, "logps/chosen": -52.635765075683594, "logps/rejected": -54.99678421020508, "loss": 0.99, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09545709192752838, "rewards/margins": 0.04059610515832901, "rewards/rejected": 0.05486098676919937, "step": 144 }, { "epoch": 0.1778459179762361, "grad_norm": 2.8999664783477783, "learning_rate": 9.823312883435584e-07, "logits/chosen": -2.1824698448181152, "logits/rejected": -2.214694023132324, "logps/chosen": -57.30475997924805, "logps/rejected": -62.58025360107422, "loss": 0.9726, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10895805060863495, "rewards/margins": 0.11109095811843872, "rewards/rejected": -0.0021329005248844624, "step": 145 }, { "epoch": 0.179072441548486, "grad_norm": 2.76519513130188, "learning_rate": 9.822085889570552e-07, "logits/chosen": -2.2820982933044434, "logits/rejected": -2.2959887981414795, "logps/chosen": -53.73223114013672, "logps/rejected": -57.74559020996094, "loss": 0.9828, "rewards/accuracies": 0.625, "rewards/chosen": 0.09124395996332169, "rewards/margins": 0.06923318654298782, "rewards/rejected": 0.02201075665652752, "step": 146 }, { "epoch": 0.18029896512073593, "grad_norm": 2.7754321098327637, "learning_rate": 9.820858895705521e-07, "logits/chosen": -2.216785430908203, "logits/rejected": -2.2032089233398438, "logps/chosen": -57.019927978515625, "logps/rejected": -54.001800537109375, "loss": 0.9714, "rewards/accuracies": 0.75, "rewards/chosen": 0.06412450969219208, "rewards/margins": 0.11565132439136505, "rewards/rejected": -0.05152680724859238, "step": 147 }, { "epoch": 0.18152548869298582, "grad_norm": 2.650658369064331, "learning_rate": 9.81963190184049e-07, "logits/chosen": -2.241488456726074, "logits/rejected": -2.268728256225586, "logps/chosen": -55.24047088623047, "logps/rejected": -56.47922897338867, "loss": 0.982, "rewards/accuracies": 0.53125, "rewards/chosen": 0.09739203006029129, "rewards/margins": 0.07272262126207352, "rewards/rejected": 0.024669408798217773, "step": 148 }, { "epoch": 0.1827520122652357, "grad_norm": 2.386186122894287, "learning_rate": 9.818404907975459e-07, "logits/chosen": -2.2438740730285645, "logits/rejected": -2.263995409011841, "logps/chosen": -59.36705017089844, "logps/rejected": -58.36250305175781, "loss": 0.9752, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12194383144378662, "rewards/margins": 0.10036958754062653, "rewards/rejected": 0.021574245765805244, "step": 149 }, { "epoch": 0.18397853583748563, "grad_norm": 2.8518385887145996, "learning_rate": 9.81717791411043e-07, "logits/chosen": -2.2438344955444336, "logits/rejected": -2.259779930114746, "logps/chosen": -55.192230224609375, "logps/rejected": -56.074440002441406, "loss": 0.9779, "rewards/accuracies": 0.46875, "rewards/chosen": 0.019891325384378433, "rewards/margins": 0.08973806351423264, "rewards/rejected": -0.0698467344045639, "step": 150 }, { "epoch": 0.18520505940973553, "grad_norm": 3.0229647159576416, "learning_rate": 9.815950920245399e-07, "logits/chosen": -2.1480438709259033, "logits/rejected": -2.1699814796447754, "logps/chosen": -58.01171875, "logps/rejected": -57.28547668457031, "loss": 0.9867, "rewards/accuracies": 0.625, "rewards/chosen": 0.09477907419204712, "rewards/margins": 0.05457184463739395, "rewards/rejected": 0.04020722955465317, "step": 151 }, { "epoch": 0.18643158298198542, "grad_norm": 2.5755248069763184, "learning_rate": 9.814723926380367e-07, "logits/chosen": -2.2418699264526367, "logits/rejected": -2.249640941619873, "logps/chosen": -59.16203308105469, "logps/rejected": -58.711570739746094, "loss": 0.9869, "rewards/accuracies": 0.59375, "rewards/chosen": 0.01043933629989624, "rewards/margins": 0.052786752581596375, "rewards/rejected": -0.042347416281700134, "step": 152 }, { "epoch": 0.18765810655423534, "grad_norm": 2.7426867485046387, "learning_rate": 9.813496932515336e-07, "logits/chosen": -2.194112777709961, "logits/rejected": -2.2099950313568115, "logps/chosen": -61.45913314819336, "logps/rejected": -61.123329162597656, "loss": 0.988, "rewards/accuracies": 0.59375, "rewards/chosen": 0.017197277396917343, "rewards/margins": 0.04814939573407173, "rewards/rejected": -0.030952120199799538, "step": 153 }, { "epoch": 0.18888463012648524, "grad_norm": 2.7948834896087646, "learning_rate": 9.812269938650307e-07, "logits/chosen": -2.1770310401916504, "logits/rejected": -2.2386837005615234, "logps/chosen": -57.249420166015625, "logps/rejected": -54.179771423339844, "loss": 0.9713, "rewards/accuracies": 0.71875, "rewards/chosen": 0.09679090231657028, "rewards/margins": 0.11725848913192749, "rewards/rejected": -0.020467594265937805, "step": 154 }, { "epoch": 0.19011115369873516, "grad_norm": 2.796980142593384, "learning_rate": 9.811042944785276e-07, "logits/chosen": -2.1702048778533936, "logits/rejected": -2.2019731998443604, "logps/chosen": -57.56647491455078, "logps/rejected": -57.86715316772461, "loss": 0.9755, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08506407588720322, "rewards/margins": 0.10060884058475494, "rewards/rejected": -0.015544760040938854, "step": 155 }, { "epoch": 0.19133767727098505, "grad_norm": 2.6829190254211426, "learning_rate": 9.809815950920245e-07, "logits/chosen": -2.1655702590942383, "logits/rejected": -2.2207183837890625, "logps/chosen": -60.129615783691406, "logps/rejected": -56.46141052246094, "loss": 0.9792, "rewards/accuracies": 0.75, "rewards/chosen": 0.09802919626235962, "rewards/margins": 0.08529338985681534, "rewards/rejected": 0.01273580826818943, "step": 156 }, { "epoch": 0.19256420084323495, "grad_norm": 2.58064866065979, "learning_rate": 9.808588957055213e-07, "logits/chosen": -2.231826066970825, "logits/rejected": -2.271695375442505, "logps/chosen": -57.42947769165039, "logps/rejected": -55.180389404296875, "loss": 0.9645, "rewards/accuracies": 0.75, "rewards/chosen": 0.049931600689888, "rewards/margins": 0.1435912698507309, "rewards/rejected": -0.09365968406200409, "step": 157 }, { "epoch": 0.19379072441548487, "grad_norm": 2.5633466243743896, "learning_rate": 9.807361963190184e-07, "logits/chosen": -2.1823580265045166, "logits/rejected": -2.1825265884399414, "logps/chosen": -56.74522018432617, "logps/rejected": -55.9505729675293, "loss": 0.9764, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05331714451313019, "rewards/margins": 0.09532207995653152, "rewards/rejected": -0.04200493544340134, "step": 158 }, { "epoch": 0.19501724798773476, "grad_norm": 2.5302388668060303, "learning_rate": 9.806134969325153e-07, "logits/chosen": -2.269841432571411, "logits/rejected": -2.242398977279663, "logps/chosen": -52.30609130859375, "logps/rejected": -53.969886779785156, "loss": 0.9916, "rewards/accuracies": 0.625, "rewards/chosen": 0.04476612061262131, "rewards/margins": 0.034403856843709946, "rewards/rejected": 0.010362265631556511, "step": 159 }, { "epoch": 0.19624377155998468, "grad_norm": 3.7683491706848145, "learning_rate": 9.804907975460122e-07, "logits/chosen": -2.185666561126709, "logits/rejected": -2.1586339473724365, "logps/chosen": -53.192291259765625, "logps/rejected": -56.08320617675781, "loss": 0.9678, "rewards/accuracies": 0.53125, "rewards/chosen": 0.02662961557507515, "rewards/margins": 0.1303706169128418, "rewards/rejected": -0.10374101996421814, "step": 160 }, { "epoch": 0.19747029513223457, "grad_norm": 2.619354009628296, "learning_rate": 9.80368098159509e-07, "logits/chosen": -2.1803154945373535, "logits/rejected": -2.213895797729492, "logps/chosen": -55.64251708984375, "logps/rejected": -51.261924743652344, "loss": 0.9609, "rewards/accuracies": 0.75, "rewards/chosen": 0.06498942524194717, "rewards/margins": 0.1586255133152008, "rewards/rejected": -0.09363611042499542, "step": 161 }, { "epoch": 0.19869681870448447, "grad_norm": 2.5554697513580322, "learning_rate": 9.80245398773006e-07, "logits/chosen": -2.167788505554199, "logits/rejected": -2.241943359375, "logps/chosen": -53.11471176147461, "logps/rejected": -53.53303146362305, "loss": 0.9591, "rewards/accuracies": 0.78125, "rewards/chosen": 0.11010933667421341, "rewards/margins": 0.16509054601192474, "rewards/rejected": -0.05498121678829193, "step": 162 }, { "epoch": 0.1999233422767344, "grad_norm": 2.990478754043579, "learning_rate": 9.80122699386503e-07, "logits/chosen": -2.149427652359009, "logits/rejected": -2.2006771564483643, "logps/chosen": -55.94937515258789, "logps/rejected": -56.10092544555664, "loss": 0.959, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11224197596311569, "rewards/margins": 0.16702181100845337, "rewards/rejected": -0.054779838770627975, "step": 163 }, { "epoch": 0.1999233422767344, "eval_logits/chosen": -2.2129478454589844, "eval_logits/rejected": -2.225248336791992, "eval_logps/chosen": -57.53826141357422, "eval_logps/rejected": -57.381797790527344, "eval_loss": 0.9761220812797546, "eval_rewards/accuracies": 0.646258533000946, "eval_rewards/chosen": 0.009310076013207436, "eval_rewards/margins": 0.09763213247060776, "eval_rewards/rejected": -0.08832206577062607, "eval_runtime": 1585.6931, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.278, "step": 163 }, { "epoch": 0.20114986584898428, "grad_norm": 2.6460392475128174, "learning_rate": 9.8e-07, "logits/chosen": -2.2092154026031494, "logits/rejected": -2.217618227005005, "logps/chosen": -59.06272888183594, "logps/rejected": -60.265628814697266, "loss": 0.9715, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08902056515216827, "rewards/margins": 0.11626417189836502, "rewards/rejected": -0.0272436011582613, "step": 164 }, { "epoch": 0.20237638942123418, "grad_norm": 3.1753146648406982, "learning_rate": 9.79877300613497e-07, "logits/chosen": -2.235238790512085, "logits/rejected": -2.23298978805542, "logps/chosen": -57.10411071777344, "logps/rejected": -58.78346633911133, "loss": 0.955, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08687426894903183, "rewards/margins": 0.18466125428676605, "rewards/rejected": -0.09778700768947601, "step": 165 }, { "epoch": 0.2036029129934841, "grad_norm": 2.8554906845092773, "learning_rate": 9.79754601226994e-07, "logits/chosen": -2.2152259349823, "logits/rejected": -2.257423162460327, "logps/chosen": -53.647560119628906, "logps/rejected": -55.49702453613281, "loss": 0.9519, "rewards/accuracies": 0.75, "rewards/chosen": 0.05370066687464714, "rewards/margins": 0.19547612965106964, "rewards/rejected": -0.1417754739522934, "step": 166 }, { "epoch": 0.204829436565734, "grad_norm": 3.4446067810058594, "learning_rate": 9.796319018404908e-07, "logits/chosen": -2.2020535469055176, "logits/rejected": -2.178781747817993, "logps/chosen": -58.669921875, "logps/rejected": -60.094268798828125, "loss": 0.9569, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05982861667871475, "rewards/margins": 0.17847979068756104, "rewards/rejected": -0.11865116655826569, "step": 167 }, { "epoch": 0.2060559601379839, "grad_norm": 5.114363670349121, "learning_rate": 9.795092024539877e-07, "logits/chosen": -2.2483749389648438, "logits/rejected": -2.2573306560516357, "logps/chosen": -58.537960052490234, "logps/rejected": -56.726234436035156, "loss": 0.9657, "rewards/accuracies": 0.65625, "rewards/chosen": 0.11836929619312286, "rewards/margins": 0.1396087408065796, "rewards/rejected": -0.021239448338747025, "step": 168 }, { "epoch": 0.2072824837102338, "grad_norm": 2.656541585922241, "learning_rate": 9.793865030674845e-07, "logits/chosen": -2.235407590866089, "logits/rejected": -2.2420473098754883, "logps/chosen": -58.852149963378906, "logps/rejected": -56.03770065307617, "loss": 0.9651, "rewards/accuracies": 0.71875, "rewards/chosen": 0.029968097805976868, "rewards/margins": 0.14266854524612427, "rewards/rejected": -0.11270047724246979, "step": 169 }, { "epoch": 0.2085090072824837, "grad_norm": 5.633166790008545, "learning_rate": 9.792638036809816e-07, "logits/chosen": -2.1976752281188965, "logits/rejected": -2.1900010108947754, "logps/chosen": -61.7386360168457, "logps/rejected": -57.482147216796875, "loss": 0.931, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12938669323921204, "rewards/margins": 0.28162121772766113, "rewards/rejected": -0.1522345095872879, "step": 170 }, { "epoch": 0.20973553085473362, "grad_norm": 3.1398255825042725, "learning_rate": 9.791411042944785e-07, "logits/chosen": -2.2786359786987305, "logits/rejected": -2.272172689437866, "logps/chosen": -61.98131561279297, "logps/rejected": -59.20221710205078, "loss": 0.9731, "rewards/accuracies": 0.65625, "rewards/chosen": 0.057893067598342896, "rewards/margins": 0.11025725305080414, "rewards/rejected": -0.052364181727170944, "step": 171 }, { "epoch": 0.21096205442698351, "grad_norm": 2.5013763904571533, "learning_rate": 9.790184049079754e-07, "logits/chosen": -2.2581822872161865, "logits/rejected": -2.256866455078125, "logps/chosen": -56.44049072265625, "logps/rejected": -56.40041732788086, "loss": 0.9914, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008894088678061962, "rewards/margins": 0.03535670042037964, "rewards/rejected": -0.044250793755054474, "step": 172 }, { "epoch": 0.21218857799923344, "grad_norm": 2.955842971801758, "learning_rate": 9.788957055214723e-07, "logits/chosen": -2.2498655319213867, "logits/rejected": -2.247792959213257, "logps/chosen": -54.22645568847656, "logps/rejected": -59.28506851196289, "loss": 0.9694, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02972128428518772, "rewards/margins": 0.1251320242881775, "rewards/rejected": -0.15485329926013947, "step": 173 }, { "epoch": 0.21341510157148333, "grad_norm": 2.7105515003204346, "learning_rate": 9.787730061349694e-07, "logits/chosen": -2.226548671722412, "logits/rejected": -2.2621331214904785, "logps/chosen": -56.11246871948242, "logps/rejected": -57.4040412902832, "loss": 0.9777, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027367137372493744, "rewards/margins": 0.08850687742233276, "rewards/rejected": -0.08577016741037369, "step": 174 }, { "epoch": 0.21464162514373322, "grad_norm": 3.3812475204467773, "learning_rate": 9.786503067484663e-07, "logits/chosen": -2.244912624359131, "logits/rejected": -2.2518646717071533, "logps/chosen": -60.56603240966797, "logps/rejected": -59.26036071777344, "loss": 0.9699, "rewards/accuracies": 0.75, "rewards/chosen": 0.06383205950260162, "rewards/margins": 0.1219523698091507, "rewards/rejected": -0.05812031030654907, "step": 175 }, { "epoch": 0.21586814871598314, "grad_norm": 3.0488972663879395, "learning_rate": 9.785276073619631e-07, "logits/chosen": -2.2561447620391846, "logits/rejected": -2.260584831237793, "logps/chosen": -55.08391571044922, "logps/rejected": -57.42425537109375, "loss": 0.9716, "rewards/accuracies": 0.625, "rewards/chosen": 0.047162048518657684, "rewards/margins": 0.11768978834152222, "rewards/rejected": -0.07052773237228394, "step": 176 }, { "epoch": 0.21709467228823304, "grad_norm": 2.90256404876709, "learning_rate": 9.7840490797546e-07, "logits/chosen": -2.150135040283203, "logits/rejected": -2.197086811065674, "logps/chosen": -56.93759536743164, "logps/rejected": -56.39280700683594, "loss": 0.975, "rewards/accuracies": 0.65625, "rewards/chosen": -0.020964521914720535, "rewards/margins": 0.10114824026823044, "rewards/rejected": -0.12211278080940247, "step": 177 }, { "epoch": 0.21832119586048293, "grad_norm": 2.8120458126068115, "learning_rate": 9.782822085889571e-07, "logits/chosen": -2.1985299587249756, "logits/rejected": -2.2368083000183105, "logps/chosen": -59.523311614990234, "logps/rejected": -59.032814025878906, "loss": 0.9506, "rewards/accuracies": 0.75, "rewards/chosen": 0.019097672775387764, "rewards/margins": 0.20274242758750916, "rewards/rejected": -0.18364477157592773, "step": 178 }, { "epoch": 0.21954771943273285, "grad_norm": 3.110067844390869, "learning_rate": 9.78159509202454e-07, "logits/chosen": -2.223315954208374, "logits/rejected": -2.2307987213134766, "logps/chosen": -56.80317687988281, "logps/rejected": -57.8143310546875, "loss": 0.9768, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09834960103034973, "rewards/margins": 0.09608380496501923, "rewards/rejected": -0.19443340599536896, "step": 179 }, { "epoch": 0.22077424300498275, "grad_norm": 5.240647315979004, "learning_rate": 9.780368098159509e-07, "logits/chosen": -2.266916275024414, "logits/rejected": -2.2506120204925537, "logps/chosen": -56.801944732666016, "logps/rejected": -58.52326202392578, "loss": 0.9639, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011133585125207901, "rewards/margins": 0.15869037806987762, "rewards/rejected": -0.16982395946979523, "step": 180 }, { "epoch": 0.22200076657723267, "grad_norm": 2.8173818588256836, "learning_rate": 9.779141104294477e-07, "logits/chosen": -2.207204580307007, "logits/rejected": -2.2629497051239014, "logps/chosen": -56.61592102050781, "logps/rejected": -55.65678787231445, "loss": 0.9559, "rewards/accuracies": 0.625, "rewards/chosen": 0.05698895826935768, "rewards/margins": 0.18964850902557373, "rewards/rejected": -0.13265955448150635, "step": 181 }, { "epoch": 0.22322729014948256, "grad_norm": 2.760131359100342, "learning_rate": 9.777914110429446e-07, "logits/chosen": -2.198199510574341, "logits/rejected": -2.1901369094848633, "logps/chosen": -58.194766998291016, "logps/rejected": -59.7773323059082, "loss": 0.9673, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005784513428807259, "rewards/margins": 0.13405558466911316, "rewards/rejected": -0.12827105820178986, "step": 182 }, { "epoch": 0.22445381372173245, "grad_norm": 2.61710524559021, "learning_rate": 9.776687116564417e-07, "logits/chosen": -2.2404873371124268, "logits/rejected": -2.269984245300293, "logps/chosen": -57.435420989990234, "logps/rejected": -57.751991271972656, "loss": 0.9538, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03843864053487778, "rewards/margins": 0.1978977918624878, "rewards/rejected": -0.1594591736793518, "step": 183 }, { "epoch": 0.22568033729398237, "grad_norm": 3.688959836959839, "learning_rate": 9.775460122699386e-07, "logits/chosen": -2.185727834701538, "logits/rejected": -2.235908031463623, "logps/chosen": -57.21923828125, "logps/rejected": -55.81343078613281, "loss": 0.9864, "rewards/accuracies": 0.5, "rewards/chosen": -0.07866362482309341, "rewards/margins": 0.055820152163505554, "rewards/rejected": -0.13448378443717957, "step": 184 }, { "epoch": 0.22690686086623227, "grad_norm": 3.638516664505005, "learning_rate": 9.774233128834355e-07, "logits/chosen": -2.2555794715881348, "logits/rejected": -2.2523157596588135, "logps/chosen": -57.54905700683594, "logps/rejected": -57.363765716552734, "loss": 0.9559, "rewards/accuracies": 0.65625, "rewards/chosen": 0.014249105006456375, "rewards/margins": 0.1830272376537323, "rewards/rejected": -0.16877812147140503, "step": 185 }, { "epoch": 0.2281333844384822, "grad_norm": 2.9124600887298584, "learning_rate": 9.773006134969326e-07, "logits/chosen": -2.1960678100585938, "logits/rejected": -2.232022762298584, "logps/chosen": -57.83955383300781, "logps/rejected": -56.843807220458984, "loss": 0.9682, "rewards/accuracies": 0.5, "rewards/chosen": 0.007589265704154968, "rewards/margins": 0.1291288286447525, "rewards/rejected": -0.12153957039117813, "step": 186 }, { "epoch": 0.22935990801073208, "grad_norm": 3.3667824268341064, "learning_rate": 9.771779141104295e-07, "logits/chosen": -2.259428024291992, "logits/rejected": -2.2755112648010254, "logps/chosen": -59.53873825073242, "logps/rejected": -58.205787658691406, "loss": 0.9984, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07007473707199097, "rewards/margins": 0.006462054327130318, "rewards/rejected": -0.07653679698705673, "step": 187 }, { "epoch": 0.23058643158298198, "grad_norm": 3.0116724967956543, "learning_rate": 9.770552147239263e-07, "logits/chosen": -2.21445631980896, "logits/rejected": -2.2228782176971436, "logps/chosen": -58.90726089477539, "logps/rejected": -58.67171859741211, "loss": 0.9551, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0027824053540825844, "rewards/margins": 0.18528792262077332, "rewards/rejected": -0.1825055032968521, "step": 188 }, { "epoch": 0.2318129551552319, "grad_norm": 3.129948616027832, "learning_rate": 9.769325153374232e-07, "logits/chosen": -2.2377285957336426, "logits/rejected": -2.2183685302734375, "logps/chosen": -57.06877899169922, "logps/rejected": -58.55497741699219, "loss": 0.9641, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004181228578090668, "rewards/margins": 0.1498926877975464, "rewards/rejected": -0.15407393872737885, "step": 189 }, { "epoch": 0.2330394787274818, "grad_norm": 3.6961565017700195, "learning_rate": 9.7680981595092e-07, "logits/chosen": -2.1786794662475586, "logits/rejected": -2.224048137664795, "logps/chosen": -58.763336181640625, "logps/rejected": -59.62835693359375, "loss": 0.9488, "rewards/accuracies": 0.625, "rewards/chosen": 0.15978723764419556, "rewards/margins": 0.21867629885673523, "rewards/rejected": -0.05888907611370087, "step": 190 }, { "epoch": 0.23426600229973168, "grad_norm": 3.248612642288208, "learning_rate": 9.766871165644172e-07, "logits/chosen": -2.1803698539733887, "logits/rejected": -2.218970775604248, "logps/chosen": -56.76003646850586, "logps/rejected": -57.63624572753906, "loss": 0.9672, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03911640867590904, "rewards/margins": 0.13103064894676208, "rewards/rejected": -0.09191423654556274, "step": 191 }, { "epoch": 0.2354925258719816, "grad_norm": 3.669145345687866, "learning_rate": 9.76564417177914e-07, "logits/chosen": -2.209909200668335, "logits/rejected": -2.2092323303222656, "logps/chosen": -57.03281021118164, "logps/rejected": -61.9040641784668, "loss": 0.9408, "rewards/accuracies": 0.78125, "rewards/chosen": 0.19508101046085358, "rewards/margins": 0.24065914750099182, "rewards/rejected": -0.04557815566658974, "step": 192 }, { "epoch": 0.2367190494442315, "grad_norm": 3.7126755714416504, "learning_rate": 9.76441717791411e-07, "logits/chosen": -2.185610055923462, "logits/rejected": -2.218942642211914, "logps/chosen": -57.036956787109375, "logps/rejected": -61.144447326660156, "loss": 0.9155, "rewards/accuracies": 0.71875, "rewards/chosen": -0.030093764886260033, "rewards/margins": 0.36061975359916687, "rewards/rejected": -0.39071348309516907, "step": 193 }, { "epoch": 0.23794557301648142, "grad_norm": 3.3099308013916016, "learning_rate": 9.763190184049078e-07, "logits/chosen": -2.2588722705841064, "logits/rejected": -2.3074452877044678, "logps/chosen": -58.27874755859375, "logps/rejected": -60.657554626464844, "loss": 0.9599, "rewards/accuracies": 0.65625, "rewards/chosen": -0.030434321612119675, "rewards/margins": 0.17197512090206146, "rewards/rejected": -0.20240944623947144, "step": 194 }, { "epoch": 0.23917209658873131, "grad_norm": 3.395725727081299, "learning_rate": 9.76196319018405e-07, "logits/chosen": -2.16856050491333, "logits/rejected": -2.1644654273986816, "logps/chosen": -58.567100524902344, "logps/rejected": -58.83918762207031, "loss": 0.9458, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06793972849845886, "rewards/margins": 0.22787326574325562, "rewards/rejected": -0.15993353724479675, "step": 195 }, { "epoch": 0.2403986201609812, "grad_norm": 3.4618706703186035, "learning_rate": 9.760736196319018e-07, "logits/chosen": -2.2301125526428223, "logits/rejected": -2.24570631980896, "logps/chosen": -58.30298614501953, "logps/rejected": -58.256553649902344, "loss": 0.9449, "rewards/accuracies": 0.75, "rewards/chosen": 0.05555347725749016, "rewards/margins": 0.2316940724849701, "rewards/rejected": -0.17614060640335083, "step": 196 }, { "epoch": 0.24162514373323113, "grad_norm": 2.8252718448638916, "learning_rate": 9.759509202453987e-07, "logits/chosen": -2.22204852104187, "logits/rejected": -2.2003345489501953, "logps/chosen": -57.873046875, "logps/rejected": -58.02743148803711, "loss": 0.9523, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005425143986940384, "rewards/margins": 0.20013655722141266, "rewards/rejected": -0.19471138715744019, "step": 197 }, { "epoch": 0.24285166730548102, "grad_norm": 3.2738723754882812, "learning_rate": 9.758282208588958e-07, "logits/chosen": -2.163856029510498, "logits/rejected": -2.178149938583374, "logps/chosen": -54.342689514160156, "logps/rejected": -55.0627555847168, "loss": 0.9561, "rewards/accuracies": 0.625, "rewards/chosen": 0.1267026960849762, "rewards/margins": 0.1832016408443451, "rewards/rejected": -0.05649895966053009, "step": 198 }, { "epoch": 0.24407819087773094, "grad_norm": 4.096073150634766, "learning_rate": 9.757055214723927e-07, "logits/chosen": -2.190136432647705, "logits/rejected": -2.1902408599853516, "logps/chosen": -56.53943634033203, "logps/rejected": -57.29685974121094, "loss": 0.9452, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03251629322767258, "rewards/margins": 0.23593159019947052, "rewards/rejected": -0.20341527462005615, "step": 199 }, { "epoch": 0.24530471444998084, "grad_norm": 3.006974697113037, "learning_rate": 9.755828220858895e-07, "logits/chosen": -2.2688982486724854, "logits/rejected": -2.282849073410034, "logps/chosen": -59.24298095703125, "logps/rejected": -59.90022277832031, "loss": 0.9525, "rewards/accuracies": 0.75, "rewards/chosen": -0.03408081457018852, "rewards/margins": 0.1992078423500061, "rewards/rejected": -0.23328866064548492, "step": 200 }, { "epoch": 0.24653123802223073, "grad_norm": 3.297804355621338, "learning_rate": 9.754601226993864e-07, "logits/chosen": -2.235957622528076, "logits/rejected": -2.268630266189575, "logps/chosen": -57.307437896728516, "logps/rejected": -59.16643524169922, "loss": 0.9388, "rewards/accuracies": 0.71875, "rewards/chosen": -0.013687558472156525, "rewards/margins": 0.26264241337776184, "rewards/rejected": -0.27632996439933777, "step": 201 }, { "epoch": 0.24775776159448065, "grad_norm": 3.362529993057251, "learning_rate": 9.753374233128833e-07, "logits/chosen": -2.2266108989715576, "logits/rejected": -2.2425527572631836, "logps/chosen": -60.57048797607422, "logps/rejected": -58.51158142089844, "loss": 0.9558, "rewards/accuracies": 0.6875, "rewards/chosen": 0.021241512149572372, "rewards/margins": 0.19015629589557648, "rewards/rejected": -0.1689147651195526, "step": 202 }, { "epoch": 0.24898428516673055, "grad_norm": 3.247051477432251, "learning_rate": 9.752147239263804e-07, "logits/chosen": -2.283604621887207, "logits/rejected": -2.291098117828369, "logps/chosen": -56.62052917480469, "logps/rejected": -58.35918426513672, "loss": 0.9479, "rewards/accuracies": 0.75, "rewards/chosen": -0.14385659992694855, "rewards/margins": 0.22683292627334595, "rewards/rejected": -0.3706895112991333, "step": 203 }, { "epoch": 0.25021080873898044, "grad_norm": 3.5565848350524902, "learning_rate": 9.750920245398773e-07, "logits/chosen": -2.19114089012146, "logits/rejected": -2.187793731689453, "logps/chosen": -60.276390075683594, "logps/rejected": -59.527923583984375, "loss": 0.9313, "rewards/accuracies": 0.75, "rewards/chosen": -0.08685958385467529, "rewards/margins": 0.3019901216030121, "rewards/rejected": -0.38884973526000977, "step": 204 }, { "epoch": 0.25143733231123033, "grad_norm": 3.691121816635132, "learning_rate": 9.749693251533741e-07, "logits/chosen": -2.2393879890441895, "logits/rejected": -2.2364895343780518, "logps/chosen": -58.11058044433594, "logps/rejected": -61.639671325683594, "loss": 0.9232, "rewards/accuracies": 0.71875, "rewards/chosen": 0.01997845247387886, "rewards/margins": 0.32267695665359497, "rewards/rejected": -0.3026984632015228, "step": 205 }, { "epoch": 0.2526638558834803, "grad_norm": 3.7057456970214844, "learning_rate": 9.748466257668712e-07, "logits/chosen": -2.2227134704589844, "logits/rejected": -2.268899917602539, "logps/chosen": -57.86807632446289, "logps/rejected": -59.45074462890625, "loss": 0.9128, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08089522272348404, "rewards/margins": 0.36189043521881104, "rewards/rejected": -0.2809952199459076, "step": 206 }, { "epoch": 0.2538903794557302, "grad_norm": 3.76484751701355, "learning_rate": 9.747239263803681e-07, "logits/chosen": -2.2104272842407227, "logits/rejected": -2.297198534011841, "logps/chosen": -58.101226806640625, "logps/rejected": -61.22675323486328, "loss": 0.9061, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12297047674655914, "rewards/margins": 0.429110050201416, "rewards/rejected": -0.552080512046814, "step": 207 }, { "epoch": 0.25511690302798007, "grad_norm": 3.234133243560791, "learning_rate": 9.74601226993865e-07, "logits/chosen": -2.2871522903442383, "logits/rejected": -2.312046766281128, "logps/chosen": -56.74910354614258, "logps/rejected": -58.80845642089844, "loss": 0.9595, "rewards/accuracies": 0.625, "rewards/chosen": -0.14409607648849487, "rewards/margins": 0.18296119570732117, "rewards/rejected": -0.32705727219581604, "step": 208 }, { "epoch": 0.25634342660022996, "grad_norm": 3.498549461364746, "learning_rate": 9.744785276073619e-07, "logits/chosen": -2.2026689052581787, "logits/rejected": -2.2521421909332275, "logps/chosen": -58.10637664794922, "logps/rejected": -58.632911682128906, "loss": 0.9301, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0027103805914521217, "rewards/margins": 0.30863475799560547, "rewards/rejected": -0.3059243857860565, "step": 209 }, { "epoch": 0.25756995017247986, "grad_norm": 3.3813164234161377, "learning_rate": 9.743558282208588e-07, "logits/chosen": -2.2135415077209473, "logits/rejected": -2.246603012084961, "logps/chosen": -55.76283645629883, "logps/rejected": -56.66757583618164, "loss": 0.9425, "rewards/accuracies": 0.625, "rewards/chosen": 0.1332480013370514, "rewards/margins": 0.2591882646083832, "rewards/rejected": -0.1259402334690094, "step": 210 }, { "epoch": 0.2587964737447298, "grad_norm": 3.978489875793457, "learning_rate": 9.742331288343559e-07, "logits/chosen": -2.19888973236084, "logits/rejected": -2.2517151832580566, "logps/chosen": -59.376182556152344, "logps/rejected": -60.62775802612305, "loss": 0.9316, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09080745279788971, "rewards/margins": 0.3166324496269226, "rewards/rejected": -0.4074398875236511, "step": 211 }, { "epoch": 0.2600229973169797, "grad_norm": 3.216719150543213, "learning_rate": 9.741104294478527e-07, "logits/chosen": -2.2373225688934326, "logits/rejected": -2.2398176193237305, "logps/chosen": -57.774375915527344, "logps/rejected": -56.62223434448242, "loss": 0.9806, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14872559905052185, "rewards/margins": 0.08711370080709457, "rewards/rejected": -0.23583927750587463, "step": 212 }, { "epoch": 0.2612495208892296, "grad_norm": 3.1411540508270264, "learning_rate": 9.739877300613496e-07, "logits/chosen": -2.26993727684021, "logits/rejected": -2.2648701667785645, "logps/chosen": -59.716453552246094, "logps/rejected": -59.15172576904297, "loss": 0.9198, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017406854778528214, "rewards/margins": 0.35276708006858826, "rewards/rejected": -0.37017399072647095, "step": 213 }, { "epoch": 0.2624760444614795, "grad_norm": 3.658346176147461, "learning_rate": 9.738650306748465e-07, "logits/chosen": -2.2591300010681152, "logits/rejected": -2.266083240509033, "logps/chosen": -58.9105224609375, "logps/rejected": -60.776092529296875, "loss": 0.9535, "rewards/accuracies": 0.5625, "rewards/chosen": -0.053758762776851654, "rewards/margins": 0.19229605793952942, "rewards/rejected": -0.24605482816696167, "step": 214 }, { "epoch": 0.2637025680337294, "grad_norm": 3.9971182346343994, "learning_rate": 9.737423312883436e-07, "logits/chosen": -2.188084602355957, "logits/rejected": -2.2595696449279785, "logps/chosen": -61.072139739990234, "logps/rejected": -62.047481536865234, "loss": 0.8752, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06062617525458336, "rewards/margins": 0.5581847429275513, "rewards/rejected": -0.6188108921051025, "step": 215 }, { "epoch": 0.2649290916059793, "grad_norm": 8.051568031311035, "learning_rate": 9.736196319018405e-07, "logits/chosen": -2.225024938583374, "logits/rejected": -2.263540744781494, "logps/chosen": -59.481719970703125, "logps/rejected": -57.7908935546875, "loss": 0.9512, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10534758120775223, "rewards/margins": 0.22585362195968628, "rewards/rejected": -0.3312011957168579, "step": 216 }, { "epoch": 0.2661556151782292, "grad_norm": 3.696871519088745, "learning_rate": 9.734969325153373e-07, "logits/chosen": -2.2863621711730957, "logits/rejected": -2.2790451049804688, "logps/chosen": -56.36181640625, "logps/rejected": -59.61421203613281, "loss": 0.925, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08676356077194214, "rewards/margins": 0.318224161863327, "rewards/rejected": -0.23146063089370728, "step": 217 }, { "epoch": 0.2673821387504791, "grad_norm": 3.445866107940674, "learning_rate": 9.733742331288342e-07, "logits/chosen": -2.2227556705474854, "logits/rejected": -2.2539656162261963, "logps/chosen": -59.5650634765625, "logps/rejected": -60.24578094482422, "loss": 0.9557, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0391082689166069, "rewards/margins": 0.1887841820716858, "rewards/rejected": -0.14967593550682068, "step": 218 }, { "epoch": 0.268608662322729, "grad_norm": 3.47672700881958, "learning_rate": 9.732515337423313e-07, "logits/chosen": -2.245398998260498, "logits/rejected": -2.264543294906616, "logps/chosen": -58.734954833984375, "logps/rejected": -58.01295471191406, "loss": 0.9253, "rewards/accuracies": 0.59375, "rewards/chosen": 0.039022620767354965, "rewards/margins": 0.3387625813484192, "rewards/rejected": -0.2997399568557739, "step": 219 }, { "epoch": 0.2698351858949789, "grad_norm": 4.151553153991699, "learning_rate": 9.731288343558282e-07, "logits/chosen": -2.289560556411743, "logits/rejected": -2.362332582473755, "logps/chosen": -57.84265899658203, "logps/rejected": -60.99207305908203, "loss": 0.9291, "rewards/accuracies": 0.75, "rewards/chosen": -0.01757587306201458, "rewards/margins": 0.30879294872283936, "rewards/rejected": -0.3263688087463379, "step": 220 }, { "epoch": 0.27106170946722885, "grad_norm": 3.7229840755462646, "learning_rate": 9.73006134969325e-07, "logits/chosen": -2.2625715732574463, "logits/rejected": -2.2621009349823, "logps/chosen": -58.919822692871094, "logps/rejected": -61.701438903808594, "loss": 0.9241, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12901857495307922, "rewards/margins": 0.3394571840763092, "rewards/rejected": -0.46847572922706604, "step": 221 }, { "epoch": 0.27228823303947874, "grad_norm": 5.530328750610352, "learning_rate": 9.72883435582822e-07, "logits/chosen": -2.2486302852630615, "logits/rejected": -2.2469983100891113, "logps/chosen": -57.90180969238281, "logps/rejected": -62.573631286621094, "loss": 0.9335, "rewards/accuracies": 0.65625, "rewards/chosen": -0.026311371475458145, "rewards/margins": 0.29492172598838806, "rewards/rejected": -0.3212330639362335, "step": 222 }, { "epoch": 0.27351475661172864, "grad_norm": 3.591033935546875, "learning_rate": 9.727607361963188e-07, "logits/chosen": -2.194331169128418, "logits/rejected": -2.2325127124786377, "logps/chosen": -56.67311096191406, "logps/rejected": -62.58027267456055, "loss": 0.8814, "rewards/accuracies": 0.84375, "rewards/chosen": 0.04217422753572464, "rewards/margins": 0.5165954232215881, "rewards/rejected": -0.4744212329387665, "step": 223 }, { "epoch": 0.27474128018397853, "grad_norm": 5.745713233947754, "learning_rate": 9.72638036809816e-07, "logits/chosen": -2.2230923175811768, "logits/rejected": -2.233288288116455, "logps/chosen": -56.83674240112305, "logps/rejected": -62.9018440246582, "loss": 0.9145, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12289882451295853, "rewards/margins": 0.44829830527305603, "rewards/rejected": -0.5711972117424011, "step": 224 }, { "epoch": 0.2759678037562284, "grad_norm": 3.6943321228027344, "learning_rate": 9.725153374233128e-07, "logits/chosen": -2.2736704349517822, "logits/rejected": -2.282390594482422, "logps/chosen": -58.74912643432617, "logps/rejected": -60.2332763671875, "loss": 0.9367, "rewards/accuracies": 0.625, "rewards/chosen": -0.0230700746178627, "rewards/margins": 0.3044799566268921, "rewards/rejected": -0.32755008339881897, "step": 225 }, { "epoch": 0.2771943273284783, "grad_norm": 4.162757396697998, "learning_rate": 9.7239263803681e-07, "logits/chosen": -2.2012736797332764, "logits/rejected": -2.2188100814819336, "logps/chosen": -52.64942932128906, "logps/rejected": -57.856990814208984, "loss": 0.948, "rewards/accuracies": 0.65625, "rewards/chosen": 0.06401514261960983, "rewards/margins": 0.21686241030693054, "rewards/rejected": -0.1528472602367401, "step": 226 }, { "epoch": 0.27842085090072827, "grad_norm": 4.367449760437012, "learning_rate": 9.722699386503068e-07, "logits/chosen": -2.225940704345703, "logits/rejected": -2.2422397136688232, "logps/chosen": -55.56187438964844, "logps/rejected": -56.574039459228516, "loss": 0.948, "rewards/accuracies": 0.625, "rewards/chosen": 0.106214240193367, "rewards/margins": 0.23287320137023926, "rewards/rejected": -0.12665894627571106, "step": 227 }, { "epoch": 0.27964737447297816, "grad_norm": 4.599761009216309, "learning_rate": 9.721472392638037e-07, "logits/chosen": -2.228322744369507, "logits/rejected": -2.239119052886963, "logps/chosen": -55.8618049621582, "logps/rejected": -60.0567626953125, "loss": 0.9206, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0019710883498191833, "rewards/margins": 0.3489982783794403, "rewards/rejected": -0.3470272123813629, "step": 228 }, { "epoch": 0.28087389804522805, "grad_norm": 3.820615768432617, "learning_rate": 9.720245398773006e-07, "logits/chosen": -2.264831066131592, "logits/rejected": -2.265393018722534, "logps/chosen": -58.034217834472656, "logps/rejected": -61.53338623046875, "loss": 0.9347, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15261074900627136, "rewards/margins": 0.31291890144348145, "rewards/rejected": -0.4655297100543976, "step": 229 }, { "epoch": 0.28210042161747795, "grad_norm": 3.3732309341430664, "learning_rate": 9.719018404907974e-07, "logits/chosen": -2.245795249938965, "logits/rejected": -2.27901291847229, "logps/chosen": -59.319297790527344, "logps/rejected": -61.40094757080078, "loss": 0.9825, "rewards/accuracies": 0.59375, "rewards/chosen": -0.34559139609336853, "rewards/margins": 0.07126484811306, "rewards/rejected": -0.4168562889099121, "step": 230 }, { "epoch": 0.28332694518972784, "grad_norm": 4.2923994064331055, "learning_rate": 9.717791411042945e-07, "logits/chosen": -2.277470588684082, "logits/rejected": -2.291841983795166, "logps/chosen": -57.52934646606445, "logps/rejected": -60.36398696899414, "loss": 0.9248, "rewards/accuracies": 0.625, "rewards/chosen": -0.1535995602607727, "rewards/margins": 0.3384145200252533, "rewards/rejected": -0.4920140504837036, "step": 231 }, { "epoch": 0.2845534687619778, "grad_norm": 3.848615884780884, "learning_rate": 9.716564417177914e-07, "logits/chosen": -2.2566235065460205, "logits/rejected": -2.2319722175598145, "logps/chosen": -61.475765228271484, "logps/rejected": -61.94943618774414, "loss": 0.9895, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31201082468032837, "rewards/margins": 0.026136714965105057, "rewards/rejected": -0.33814752101898193, "step": 232 }, { "epoch": 0.2857799923342277, "grad_norm": 4.549901962280273, "learning_rate": 9.715337423312883e-07, "logits/chosen": -2.219874382019043, "logits/rejected": -2.2646121978759766, "logps/chosen": -56.25140380859375, "logps/rejected": -58.60102462768555, "loss": 0.8969, "rewards/accuracies": 0.8125, "rewards/chosen": 0.030025988817214966, "rewards/margins": 0.45991280674934387, "rewards/rejected": -0.42988675832748413, "step": 233 }, { "epoch": 0.2870065159064776, "grad_norm": 3.947981834411621, "learning_rate": 9.714110429447852e-07, "logits/chosen": -2.2859320640563965, "logits/rejected": -2.2698135375976562, "logps/chosen": -60.64798355102539, "logps/rejected": -62.987998962402344, "loss": 0.9137, "rewards/accuracies": 0.625, "rewards/chosen": -0.34378644824028015, "rewards/margins": 0.42484748363494873, "rewards/rejected": -0.7686339616775513, "step": 234 }, { "epoch": 0.28823303947872747, "grad_norm": 4.328583240509033, "learning_rate": 9.712883435582823e-07, "logits/chosen": -2.249824047088623, "logits/rejected": -2.32157826423645, "logps/chosen": -59.14007568359375, "logps/rejected": -61.53468704223633, "loss": 0.9634, "rewards/accuracies": 0.625, "rewards/chosen": -0.29743945598602295, "rewards/margins": 0.20987047255039215, "rewards/rejected": -0.5073099136352539, "step": 235 }, { "epoch": 0.28945956305097736, "grad_norm": 4.364981174468994, "learning_rate": 9.711656441717791e-07, "logits/chosen": -2.206968307495117, "logits/rejected": -2.2377984523773193, "logps/chosen": -56.8301887512207, "logps/rejected": -61.22038269042969, "loss": 0.9404, "rewards/accuracies": 0.625, "rewards/chosen": -0.14873340725898743, "rewards/margins": 0.2960086762905121, "rewards/rejected": -0.4447421133518219, "step": 236 }, { "epoch": 0.2906860866232273, "grad_norm": 4.51679801940918, "learning_rate": 9.71042944785276e-07, "logits/chosen": -2.275890827178955, "logits/rejected": -2.332024335861206, "logps/chosen": -56.71741485595703, "logps/rejected": -60.95820617675781, "loss": 0.9741, "rewards/accuracies": 0.625, "rewards/chosen": -0.04775645583868027, "rewards/margins": 0.11398418992757797, "rewards/rejected": -0.16174064576625824, "step": 237 }, { "epoch": 0.2919126101954772, "grad_norm": 5.269367694854736, "learning_rate": 9.70920245398773e-07, "logits/chosen": -2.2773900032043457, "logits/rejected": -2.250969171524048, "logps/chosen": -56.84004211425781, "logps/rejected": -59.775230407714844, "loss": 0.9481, "rewards/accuracies": 0.625, "rewards/chosen": -0.200571671128273, "rewards/margins": 0.2764420509338379, "rewards/rejected": -0.4770137071609497, "step": 238 }, { "epoch": 0.2931391337677271, "grad_norm": 4.921009063720703, "learning_rate": 9.7079754601227e-07, "logits/chosen": -2.228773593902588, "logits/rejected": -2.276599407196045, "logps/chosen": -55.586082458496094, "logps/rejected": -58.63681411743164, "loss": 0.882, "rewards/accuracies": 0.625, "rewards/chosen": 0.27850446105003357, "rewards/margins": 0.5828676223754883, "rewards/rejected": -0.3043631911277771, "step": 239 }, { "epoch": 0.294365657339977, "grad_norm": 4.197547912597656, "learning_rate": 9.706748466257669e-07, "logits/chosen": -2.3063573837280273, "logits/rejected": -2.2763583660125732, "logps/chosen": -58.32592010498047, "logps/rejected": -62.1884651184082, "loss": 0.9241, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1647324413061142, "rewards/margins": 0.3322623670101166, "rewards/rejected": -0.4969947934150696, "step": 240 }, { "epoch": 0.2955921809122269, "grad_norm": 4.654932975769043, "learning_rate": 9.705521472392638e-07, "logits/chosen": -2.2958099842071533, "logits/rejected": -2.2902941703796387, "logps/chosen": -56.73625183105469, "logps/rejected": -60.02557373046875, "loss": 0.9212, "rewards/accuracies": 0.625, "rewards/chosen": -0.009961726143956184, "rewards/margins": 0.3634602725505829, "rewards/rejected": -0.37342196702957153, "step": 241 }, { "epoch": 0.29681870448447684, "grad_norm": 3.798475742340088, "learning_rate": 9.704294478527606e-07, "logits/chosen": -2.283158540725708, "logits/rejected": -2.2855658531188965, "logps/chosen": -60.43410110473633, "logps/rejected": -59.93269348144531, "loss": 0.9459, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09525088965892792, "rewards/margins": 0.26561716198921204, "rewards/rejected": -0.36086803674697876, "step": 242 }, { "epoch": 0.29804522805672673, "grad_norm": 4.3954362869262695, "learning_rate": 9.703067484662575e-07, "logits/chosen": -2.3357150554656982, "logits/rejected": -2.3176302909851074, "logps/chosen": -59.04026794433594, "logps/rejected": -61.33356475830078, "loss": 0.9108, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01788036897778511, "rewards/margins": 0.4248279333114624, "rewards/rejected": -0.4427083134651184, "step": 243 }, { "epoch": 0.2992717516289766, "grad_norm": 4.446166515350342, "learning_rate": 9.701840490797546e-07, "logits/chosen": -2.2559759616851807, "logits/rejected": -2.2629361152648926, "logps/chosen": -56.88330078125, "logps/rejected": -61.7509880065918, "loss": 0.942, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09456269443035126, "rewards/margins": 0.2627048194408417, "rewards/rejected": -0.35726749897003174, "step": 244 }, { "epoch": 0.3004982752012265, "grad_norm": 4.605600833892822, "learning_rate": 9.700613496932515e-07, "logits/chosen": -2.2604079246520996, "logits/rejected": -2.2715375423431396, "logps/chosen": -59.01872634887695, "logps/rejected": -62.61427688598633, "loss": 0.9591, "rewards/accuracies": 0.625, "rewards/chosen": -0.36565834283828735, "rewards/margins": 0.16839273273944855, "rewards/rejected": -0.5340511202812195, "step": 245 }, { "epoch": 0.3017247987734764, "grad_norm": 4.383885860443115, "learning_rate": 9.699386503067484e-07, "logits/chosen": -2.309216022491455, "logits/rejected": -2.3019886016845703, "logps/chosen": -60.15336608886719, "logps/rejected": -62.972511291503906, "loss": 0.9205, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1125607118010521, "rewards/margins": 0.35295483469963074, "rewards/rejected": -0.4655155539512634, "step": 246 }, { "epoch": 0.30295132234572636, "grad_norm": 4.6438517570495605, "learning_rate": 9.698159509202455e-07, "logits/chosen": -2.2037718296051025, "logits/rejected": -2.2283759117126465, "logps/chosen": -55.55849838256836, "logps/rejected": -58.099422454833984, "loss": 0.8979, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13340447843074799, "rewards/margins": 0.46507447957992554, "rewards/rejected": -0.33167001605033875, "step": 247 }, { "epoch": 0.30417784591797625, "grad_norm": 3.987764835357666, "learning_rate": 9.696932515337423e-07, "logits/chosen": -2.2458250522613525, "logits/rejected": -2.278627872467041, "logps/chosen": -57.42875671386719, "logps/rejected": -58.5342903137207, "loss": 0.9416, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020797960460186005, "rewards/margins": 0.2696303129196167, "rewards/rejected": -0.24883237481117249, "step": 248 }, { "epoch": 0.30540436949022615, "grad_norm": 5.809876918792725, "learning_rate": 9.695705521472392e-07, "logits/chosen": -2.259486675262451, "logits/rejected": -2.2814998626708984, "logps/chosen": -57.01399612426758, "logps/rejected": -59.96882247924805, "loss": 0.9203, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07107911258935928, "rewards/margins": 0.3810480833053589, "rewards/rejected": -0.4521271586418152, "step": 249 }, { "epoch": 0.30663089306247604, "grad_norm": 4.328128337860107, "learning_rate": 9.69447852760736e-07, "logits/chosen": -2.199211597442627, "logits/rejected": -2.2456343173980713, "logps/chosen": -58.20145034790039, "logps/rejected": -58.66518020629883, "loss": 0.9001, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09123198688030243, "rewards/margins": 0.4746793806552887, "rewards/rejected": -0.38344740867614746, "step": 250 }, { "epoch": 0.30785741663472593, "grad_norm": 4.288325786590576, "learning_rate": 9.69325153374233e-07, "logits/chosen": -2.299579381942749, "logits/rejected": -2.3152875900268555, "logps/chosen": -56.34951400756836, "logps/rejected": -57.440826416015625, "loss": 0.9703, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008303668349981308, "rewards/margins": 0.1444416642189026, "rewards/rejected": -0.1527453362941742, "step": 251 }, { "epoch": 0.3090839402069758, "grad_norm": 5.260382652282715, "learning_rate": 9.6920245398773e-07, "logits/chosen": -2.2401046752929688, "logits/rejected": -2.2984306812286377, "logps/chosen": -59.068058013916016, "logps/rejected": -61.444740295410156, "loss": 0.9446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08729809522628784, "rewards/margins": 0.25836896896362305, "rewards/rejected": -0.1710709035396576, "step": 252 }, { "epoch": 0.3103104637792258, "grad_norm": 4.337841510772705, "learning_rate": 9.69079754601227e-07, "logits/chosen": -2.2613070011138916, "logits/rejected": -2.2843682765960693, "logps/chosen": -53.977134704589844, "logps/rejected": -59.19281005859375, "loss": 0.901, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07697927206754684, "rewards/margins": 0.4929216504096985, "rewards/rejected": -0.41594240069389343, "step": 253 }, { "epoch": 0.31153698735147567, "grad_norm": 4.08612585067749, "learning_rate": 9.689570552147238e-07, "logits/chosen": -2.229125499725342, "logits/rejected": -2.2568728923797607, "logps/chosen": -55.75757598876953, "logps/rejected": -57.69784927368164, "loss": 0.8653, "rewards/accuracies": 0.90625, "rewards/chosen": 0.20269152522087097, "rewards/margins": 0.6764520406723022, "rewards/rejected": -0.47376054525375366, "step": 254 }, { "epoch": 0.31276351092372556, "grad_norm": 4.111661434173584, "learning_rate": 9.688343558282207e-07, "logits/chosen": -2.305729389190674, "logits/rejected": -2.3035943508148193, "logps/chosen": -58.2225456237793, "logps/rejected": -61.1976318359375, "loss": 0.8705, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3116147518157959, "rewards/margins": 0.6193372011184692, "rewards/rejected": -0.3077224791049957, "step": 255 }, { "epoch": 0.31399003449597546, "grad_norm": 4.4490814208984375, "learning_rate": 9.687116564417178e-07, "logits/chosen": -2.2239608764648438, "logits/rejected": -2.243239402770996, "logps/chosen": -56.804656982421875, "logps/rejected": -58.7873649597168, "loss": 0.9367, "rewards/accuracies": 0.75, "rewards/chosen": 0.14778520166873932, "rewards/margins": 0.27897730469703674, "rewards/rejected": -0.13119208812713623, "step": 256 }, { "epoch": 0.31521655806822535, "grad_norm": 4.49106502532959, "learning_rate": 9.685889570552147e-07, "logits/chosen": -2.2502222061157227, "logits/rejected": -2.30078387260437, "logps/chosen": -57.221458435058594, "logps/rejected": -61.315704345703125, "loss": 0.9097, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0466204397380352, "rewards/margins": 0.471771240234375, "rewards/rejected": -0.5183916687965393, "step": 257 }, { "epoch": 0.3164430816404753, "grad_norm": 4.670991897583008, "learning_rate": 9.684662576687116e-07, "logits/chosen": -2.230020523071289, "logits/rejected": -2.2742347717285156, "logps/chosen": -57.800960540771484, "logps/rejected": -58.16173553466797, "loss": 0.9089, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08284968882799149, "rewards/margins": 0.4228099584579468, "rewards/rejected": -0.3399602770805359, "step": 258 }, { "epoch": 0.3176696052127252, "grad_norm": 4.138726711273193, "learning_rate": 9.683435582822087e-07, "logits/chosen": -2.2733535766601562, "logits/rejected": -2.2447876930236816, "logps/chosen": -55.85380554199219, "logps/rejected": -58.3249626159668, "loss": 0.9338, "rewards/accuracies": 0.71875, "rewards/chosen": -0.005118235945701599, "rewards/margins": 0.3251428008079529, "rewards/rejected": -0.3302610516548157, "step": 259 }, { "epoch": 0.3188961287849751, "grad_norm": 8.742024421691895, "learning_rate": 9.682208588957055e-07, "logits/chosen": -2.324883222579956, "logits/rejected": -2.32619309425354, "logps/chosen": -58.47166442871094, "logps/rejected": -59.98308563232422, "loss": 0.9205, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07039004564285278, "rewards/margins": 0.3726171851158142, "rewards/rejected": -0.443007230758667, "step": 260 }, { "epoch": 0.320122652357225, "grad_norm": 4.584309101104736, "learning_rate": 9.680981595092024e-07, "logits/chosen": -2.2644636631011963, "logits/rejected": -2.260169267654419, "logps/chosen": -56.47004318237305, "logps/rejected": -59.350433349609375, "loss": 0.9391, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09065771102905273, "rewards/margins": 0.26301050186157227, "rewards/rejected": -0.353668212890625, "step": 261 }, { "epoch": 0.3213491759294749, "grad_norm": 4.518170356750488, "learning_rate": 9.679754601226993e-07, "logits/chosen": -2.2754123210906982, "logits/rejected": -2.308803081512451, "logps/chosen": -59.54153823852539, "logps/rejected": -59.82479476928711, "loss": 0.8436, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17954498529434204, "rewards/margins": 0.7185479402542114, "rewards/rejected": -0.5390029549598694, "step": 262 }, { "epoch": 0.3225756995017248, "grad_norm": 3.9556238651275635, "learning_rate": 9.678527607361962e-07, "logits/chosen": -2.3057289123535156, "logits/rejected": -2.3254480361938477, "logps/chosen": -59.24192810058594, "logps/rejected": -62.2436408996582, "loss": 0.9312, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15212482213974, "rewards/margins": 0.33396509289741516, "rewards/rejected": -0.48608991503715515, "step": 263 }, { "epoch": 0.3238022230739747, "grad_norm": 4.439394474029541, "learning_rate": 9.677300613496933e-07, "logits/chosen": -2.2526326179504395, "logits/rejected": -2.294677257537842, "logps/chosen": -59.58015441894531, "logps/rejected": -59.4432373046875, "loss": 0.9758, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18483731150627136, "rewards/margins": 0.10617592930793762, "rewards/rejected": -0.2910132110118866, "step": 264 }, { "epoch": 0.3250287466462246, "grad_norm": 5.038769721984863, "learning_rate": 9.676073619631902e-07, "logits/chosen": -2.2525813579559326, "logits/rejected": -2.2895257472991943, "logps/chosen": -61.63499450683594, "logps/rejected": -63.415863037109375, "loss": 0.9426, "rewards/accuracies": 0.625, "rewards/chosen": -0.12024715542793274, "rewards/margins": 0.2678435444831848, "rewards/rejected": -0.38809069991111755, "step": 265 }, { "epoch": 0.3262552702184745, "grad_norm": 4.367415904998779, "learning_rate": 9.67484662576687e-07, "logits/chosen": -2.213193416595459, "logits/rejected": -2.2327117919921875, "logps/chosen": -55.14818572998047, "logps/rejected": -62.340084075927734, "loss": 0.8932, "rewards/accuracies": 0.75, "rewards/chosen": 0.042829662561416626, "rewards/margins": 0.537898600101471, "rewards/rejected": -0.4950689673423767, "step": 266 }, { "epoch": 0.3274817937907244, "grad_norm": 4.601179599761963, "learning_rate": 9.673619631901841e-07, "logits/chosen": -2.254503011703491, "logits/rejected": -2.3084824085235596, "logps/chosen": -60.58690643310547, "logps/rejected": -61.037715911865234, "loss": 0.8699, "rewards/accuracies": 0.6875, "rewards/chosen": 0.028209976851940155, "rewards/margins": 0.6119553446769714, "rewards/rejected": -0.5837453007698059, "step": 267 }, { "epoch": 0.32870831736297434, "grad_norm": 4.606231689453125, "learning_rate": 9.67239263803681e-07, "logits/chosen": -2.2925612926483154, "logits/rejected": -2.28700590133667, "logps/chosen": -56.48763656616211, "logps/rejected": -59.03647232055664, "loss": 0.9431, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1095753014087677, "rewards/margins": 0.29157304763793945, "rewards/rejected": -0.40114834904670715, "step": 268 }, { "epoch": 0.32993484093522424, "grad_norm": 4.866989612579346, "learning_rate": 9.671165644171779e-07, "logits/chosen": -2.2916746139526367, "logits/rejected": -2.296541690826416, "logps/chosen": -57.08942794799805, "logps/rejected": -58.63545608520508, "loss": 0.9258, "rewards/accuracies": 0.65625, "rewards/chosen": 0.011854380369186401, "rewards/margins": 0.3956151306629181, "rewards/rejected": -0.3837607502937317, "step": 269 }, { "epoch": 0.33116136450747413, "grad_norm": 4.503044605255127, "learning_rate": 9.669938650306748e-07, "logits/chosen": -2.2635762691497803, "logits/rejected": -2.2487449645996094, "logps/chosen": -54.061729431152344, "logps/rejected": -53.95085144042969, "loss": 0.9405, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08299213647842407, "rewards/margins": 0.3117007613182068, "rewards/rejected": -0.22870860993862152, "step": 270 }, { "epoch": 0.332387888079724, "grad_norm": 4.621575355529785, "learning_rate": 9.668711656441716e-07, "logits/chosen": -2.230031967163086, "logits/rejected": -2.2622196674346924, "logps/chosen": -57.828086853027344, "logps/rejected": -58.73146438598633, "loss": 0.9209, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1894698292016983, "rewards/margins": 0.3708561360836029, "rewards/rejected": -0.1813863068819046, "step": 271 }, { "epoch": 0.3336144116519739, "grad_norm": 5.304027080535889, "learning_rate": 9.667484662576687e-07, "logits/chosen": -2.2386350631713867, "logits/rejected": -2.2784640789031982, "logps/chosen": -59.712520599365234, "logps/rejected": -62.31259536743164, "loss": 0.8508, "rewards/accuracies": 0.71875, "rewards/chosen": 0.28503113985061646, "rewards/margins": 0.711395800113678, "rewards/rejected": -0.42636460065841675, "step": 272 }, { "epoch": 0.33484093522422387, "grad_norm": 4.571116924285889, "learning_rate": 9.666257668711656e-07, "logits/chosen": -2.3060860633850098, "logits/rejected": -2.2957420349121094, "logps/chosen": -59.41524887084961, "logps/rejected": -59.44396209716797, "loss": 0.938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1571769118309021, "rewards/margins": 0.3159908950328827, "rewards/rejected": -0.4731677770614624, "step": 273 }, { "epoch": 0.33606745879647376, "grad_norm": 4.627274513244629, "learning_rate": 9.665030674846625e-07, "logits/chosen": -2.2733592987060547, "logits/rejected": -2.269455909729004, "logps/chosen": -58.66320037841797, "logps/rejected": -61.15357208251953, "loss": 0.8991, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09311029314994812, "rewards/margins": 0.45570454001426697, "rewards/rejected": -0.36259424686431885, "step": 274 }, { "epoch": 0.33729398236872365, "grad_norm": 4.352212905883789, "learning_rate": 9.663803680981594e-07, "logits/chosen": -2.2865898609161377, "logits/rejected": -2.310049533843994, "logps/chosen": -59.655826568603516, "logps/rejected": -59.72926712036133, "loss": 0.9954, "rewards/accuracies": 0.5, "rewards/chosen": -0.09490173310041428, "rewards/margins": 0.018382951617240906, "rewards/rejected": -0.11328467726707458, "step": 275 }, { "epoch": 0.33852050594097355, "grad_norm": 5.1424055099487305, "learning_rate": 9.662576687116565e-07, "logits/chosen": -2.1968963146209717, "logits/rejected": -2.2729907035827637, "logps/chosen": -59.578948974609375, "logps/rejected": -63.03211975097656, "loss": 0.8843, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07095566391944885, "rewards/margins": 0.6098583340644836, "rewards/rejected": -0.538902759552002, "step": 276 }, { "epoch": 0.33974702951322344, "grad_norm": 5.055881500244141, "learning_rate": 9.661349693251534e-07, "logits/chosen": -2.3024895191192627, "logits/rejected": -2.3220646381378174, "logps/chosen": -57.74109649658203, "logps/rejected": -62.41564178466797, "loss": 0.9183, "rewards/accuracies": 0.625, "rewards/chosen": -0.19584974646568298, "rewards/margins": 0.4318767786026001, "rewards/rejected": -0.6277265548706055, "step": 277 }, { "epoch": 0.34097355308547334, "grad_norm": 5.159303188323975, "learning_rate": 9.660122699386502e-07, "logits/chosen": -2.226483106613159, "logits/rejected": -2.253183364868164, "logps/chosen": -57.835777282714844, "logps/rejected": -62.87488555908203, "loss": 0.8677, "rewards/accuracies": 0.75, "rewards/chosen": 0.2619550824165344, "rewards/margins": 0.6436436176300049, "rewards/rejected": -0.38168850541114807, "step": 278 }, { "epoch": 0.3422000766577233, "grad_norm": 4.51404333114624, "learning_rate": 9.658895705521471e-07, "logits/chosen": -2.2677347660064697, "logits/rejected": -2.262878894805908, "logps/chosen": -58.026092529296875, "logps/rejected": -62.64053726196289, "loss": 0.8841, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07611749321222305, "rewards/margins": 0.5507797598838806, "rewards/rejected": -0.47466230392456055, "step": 279 }, { "epoch": 0.3434266002299732, "grad_norm": 7.042360782623291, "learning_rate": 9.657668711656442e-07, "logits/chosen": -2.2731974124908447, "logits/rejected": -2.2866547107696533, "logps/chosen": -61.629669189453125, "logps/rejected": -64.06119537353516, "loss": 0.9479, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1631237417459488, "rewards/margins": 0.2541826367378235, "rewards/rejected": -0.41730639338493347, "step": 280 }, { "epoch": 0.34465312380222307, "grad_norm": 4.949885845184326, "learning_rate": 9.65644171779141e-07, "logits/chosen": -2.25923490524292, "logits/rejected": -2.2651968002319336, "logps/chosen": -57.292236328125, "logps/rejected": -57.38854217529297, "loss": 0.9652, "rewards/accuracies": 0.53125, "rewards/chosen": -0.030940599739551544, "rewards/margins": 0.1490982174873352, "rewards/rejected": -0.18003880977630615, "step": 281 }, { "epoch": 0.34587964737447296, "grad_norm": 5.051362991333008, "learning_rate": 9.65521472392638e-07, "logits/chosen": -2.267876148223877, "logits/rejected": -2.2795422077178955, "logps/chosen": -57.629493713378906, "logps/rejected": -62.19847106933594, "loss": 0.8906, "rewards/accuracies": 0.84375, "rewards/chosen": -0.04149208217859268, "rewards/margins": 0.5112411975860596, "rewards/rejected": -0.5527332425117493, "step": 282 }, { "epoch": 0.34710617094672286, "grad_norm": 5.138825416564941, "learning_rate": 9.653987730061348e-07, "logits/chosen": -2.2946910858154297, "logits/rejected": -2.3034820556640625, "logps/chosen": -59.99066925048828, "logps/rejected": -61.14039611816406, "loss": 0.9367, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08844490349292755, "rewards/margins": 0.2811166048049927, "rewards/rejected": -0.3695615231990814, "step": 283 }, { "epoch": 0.3483326945189728, "grad_norm": 4.645948886871338, "learning_rate": 9.652760736196317e-07, "logits/chosen": -2.23396372795105, "logits/rejected": -2.251310348510742, "logps/chosen": -59.6790771484375, "logps/rejected": -62.55779266357422, "loss": 0.9252, "rewards/accuracies": 0.625, "rewards/chosen": -0.3468037545681, "rewards/margins": 0.3404688835144043, "rewards/rejected": -0.6872726678848267, "step": 284 }, { "epoch": 0.3495592180912227, "grad_norm": 6.048091888427734, "learning_rate": 9.651533742331288e-07, "logits/chosen": -2.259645938873291, "logits/rejected": -2.299731492996216, "logps/chosen": -62.819183349609375, "logps/rejected": -63.33023452758789, "loss": 0.9209, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31190043687820435, "rewards/margins": 0.36286458373069763, "rewards/rejected": -0.6747650504112244, "step": 285 }, { "epoch": 0.3507857416634726, "grad_norm": 5.028835296630859, "learning_rate": 9.650306748466257e-07, "logits/chosen": -2.2714896202087402, "logits/rejected": -2.259305238723755, "logps/chosen": -61.727195739746094, "logps/rejected": -67.84199523925781, "loss": 0.951, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2639613747596741, "rewards/margins": 0.26541537046432495, "rewards/rejected": -0.5293768048286438, "step": 286 }, { "epoch": 0.3520122652357225, "grad_norm": 5.370972633361816, "learning_rate": 9.649079754601228e-07, "logits/chosen": -2.2695472240448, "logits/rejected": -2.2875986099243164, "logps/chosen": -57.71488952636719, "logps/rejected": -63.3268928527832, "loss": 0.8927, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20833493769168854, "rewards/margins": 0.4668562114238739, "rewards/rejected": -0.6751911044120789, "step": 287 }, { "epoch": 0.3532387888079724, "grad_norm": 4.633190155029297, "learning_rate": 9.647852760736197e-07, "logits/chosen": -2.2661640644073486, "logits/rejected": -2.3063206672668457, "logps/chosen": -58.841163635253906, "logps/rejected": -59.515525817871094, "loss": 0.9173, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1402028501033783, "rewards/margins": 0.38784757256507874, "rewards/rejected": -0.5280503034591675, "step": 288 }, { "epoch": 0.35446531238022233, "grad_norm": 5.5257463455200195, "learning_rate": 9.646625766871166e-07, "logits/chosen": -2.2394015789031982, "logits/rejected": -2.2902824878692627, "logps/chosen": -60.190731048583984, "logps/rejected": -67.81196594238281, "loss": 0.8646, "rewards/accuracies": 0.75, "rewards/chosen": -0.21409864723682404, "rewards/margins": 0.7535218596458435, "rewards/rejected": -0.9676204323768616, "step": 289 }, { "epoch": 0.3556918359524722, "grad_norm": 4.410475254058838, "learning_rate": 9.645398773006134e-07, "logits/chosen": -2.2721309661865234, "logits/rejected": -2.3144900798797607, "logps/chosen": -58.86229705810547, "logps/rejected": -64.7719497680664, "loss": 0.8879, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12373369932174683, "rewards/margins": 0.5500935316085815, "rewards/rejected": -0.6738272309303284, "step": 290 }, { "epoch": 0.3569183595247221, "grad_norm": 5.142406940460205, "learning_rate": 9.644171779141103e-07, "logits/chosen": -2.263277769088745, "logits/rejected": -2.2991833686828613, "logps/chosen": -61.28825378417969, "logps/rejected": -63.450138092041016, "loss": 0.931, "rewards/accuracies": 0.625, "rewards/chosen": -0.42305099964141846, "rewards/margins": 0.41614192724227905, "rewards/rejected": -0.8391929864883423, "step": 291 }, { "epoch": 0.358144883096972, "grad_norm": 5.0392022132873535, "learning_rate": 9.642944785276074e-07, "logits/chosen": -2.2549898624420166, "logits/rejected": -2.250610589981079, "logps/chosen": -59.5991325378418, "logps/rejected": -64.50733184814453, "loss": 0.8587, "rewards/accuracies": 0.625, "rewards/chosen": -0.3141891658306122, "rewards/margins": 0.6401875615119934, "rewards/rejected": -0.9543766975402832, "step": 292 }, { "epoch": 0.3593714066692219, "grad_norm": 4.386933326721191, "learning_rate": 9.641717791411043e-07, "logits/chosen": -2.2796010971069336, "logits/rejected": -2.3258299827575684, "logps/chosen": -60.77928161621094, "logps/rejected": -62.49866485595703, "loss": 0.924, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3995177447795868, "rewards/margins": 0.3828296363353729, "rewards/rejected": -0.7823473811149597, "step": 293 }, { "epoch": 0.36059793024147185, "grad_norm": 5.0107550621032715, "learning_rate": 9.640490797546012e-07, "logits/chosen": -2.2652997970581055, "logits/rejected": -2.3034865856170654, "logps/chosen": -60.92121124267578, "logps/rejected": -68.28327178955078, "loss": 0.8734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3608587384223938, "rewards/margins": 0.7625435590744019, "rewards/rejected": -1.1234022378921509, "step": 294 }, { "epoch": 0.36182445381372175, "grad_norm": 4.341152667999268, "learning_rate": 9.63926380368098e-07, "logits/chosen": -2.2905359268188477, "logits/rejected": -2.313666582107544, "logps/chosen": -58.832672119140625, "logps/rejected": -64.42063903808594, "loss": 0.8766, "rewards/accuracies": 0.75, "rewards/chosen": 0.02145368978381157, "rewards/margins": 0.6073563694953918, "rewards/rejected": -0.5859026312828064, "step": 295 }, { "epoch": 0.36305097738597164, "grad_norm": 5.447764873504639, "learning_rate": 9.638036809815951e-07, "logits/chosen": -2.2571663856506348, "logits/rejected": -2.2782206535339355, "logps/chosen": -60.8035774230957, "logps/rejected": -60.190799713134766, "loss": 0.9651, "rewards/accuracies": 0.625, "rewards/chosen": -0.2569868266582489, "rewards/margins": 0.20976987481117249, "rewards/rejected": -0.4667567014694214, "step": 296 }, { "epoch": 0.36427750095822153, "grad_norm": 4.809421062469482, "learning_rate": 9.63680981595092e-07, "logits/chosen": -2.2691502571105957, "logits/rejected": -2.2698230743408203, "logps/chosen": -63.319923400878906, "logps/rejected": -63.671051025390625, "loss": 0.8981, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3127763271331787, "rewards/margins": 0.4674210250377655, "rewards/rejected": -0.7801973223686218, "step": 297 }, { "epoch": 0.3655040245304714, "grad_norm": 5.561209201812744, "learning_rate": 9.63558282208589e-07, "logits/chosen": -2.1613101959228516, "logits/rejected": -2.172337770462036, "logps/chosen": -55.8950080871582, "logps/rejected": -58.55757141113281, "loss": 0.8838, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06346851587295532, "rewards/margins": 0.5887071490287781, "rewards/rejected": -0.5252386331558228, "step": 298 }, { "epoch": 0.3667305481027214, "grad_norm": 5.078677654266357, "learning_rate": 9.634355828220858e-07, "logits/chosen": -2.2148823738098145, "logits/rejected": -2.237443208694458, "logps/chosen": -57.715816497802734, "logps/rejected": -62.31996154785156, "loss": 0.9068, "rewards/accuracies": 0.625, "rewards/chosen": -0.2684835195541382, "rewards/margins": 0.4224012494087219, "rewards/rejected": -0.6908847689628601, "step": 299 }, { "epoch": 0.36795707167497127, "grad_norm": 5.15749454498291, "learning_rate": 9.633128834355829e-07, "logits/chosen": -2.2562739849090576, "logits/rejected": -2.3123409748077393, "logps/chosen": -64.98746490478516, "logps/rejected": -63.343536376953125, "loss": 0.9696, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5903486013412476, "rewards/margins": 0.14043837785720825, "rewards/rejected": -0.730786919593811, "step": 300 }, { "epoch": 0.36918359524722116, "grad_norm": 6.072709083557129, "learning_rate": 9.631901840490798e-07, "logits/chosen": -2.207216739654541, "logits/rejected": -2.229341506958008, "logps/chosen": -58.236366271972656, "logps/rejected": -62.82260513305664, "loss": 0.9252, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06720946729183197, "rewards/margins": 0.4086737632751465, "rewards/rejected": -0.47588324546813965, "step": 301 }, { "epoch": 0.37041011881947106, "grad_norm": 8.476287841796875, "learning_rate": 9.630674846625766e-07, "logits/chosen": -2.335984468460083, "logits/rejected": -2.342963457107544, "logps/chosen": -61.51187515258789, "logps/rejected": -65.24760437011719, "loss": 0.9324, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3836430311203003, "rewards/margins": 0.36576980352401733, "rewards/rejected": -0.7494128346443176, "step": 302 }, { "epoch": 0.37163664239172095, "grad_norm": 6.4086527824401855, "learning_rate": 9.629447852760735e-07, "logits/chosen": -2.2756829261779785, "logits/rejected": -2.304960250854492, "logps/chosen": -63.25932312011719, "logps/rejected": -69.33010864257812, "loss": 0.8299, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25865763425827026, "rewards/margins": 0.9358510375022888, "rewards/rejected": -1.194508671760559, "step": 303 }, { "epoch": 0.37286316596397084, "grad_norm": 5.386398792266846, "learning_rate": 9.628220858895704e-07, "logits/chosen": -2.2294530868530273, "logits/rejected": -2.2463862895965576, "logps/chosen": -58.35031509399414, "logps/rejected": -62.686580657958984, "loss": 0.8919, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3221118748188019, "rewards/margins": 0.6183806657791138, "rewards/rejected": -0.9404925107955933, "step": 304 }, { "epoch": 0.3740896895362208, "grad_norm": 5.488770008087158, "learning_rate": 9.626993865030675e-07, "logits/chosen": -2.294910430908203, "logits/rejected": -2.3539907932281494, "logps/chosen": -60.23822021484375, "logps/rejected": -66.43999481201172, "loss": 0.9402, "rewards/accuracies": 0.75, "rewards/chosen": -0.06411068141460419, "rewards/margins": 0.30105552077293396, "rewards/rejected": -0.36516615748405457, "step": 305 }, { "epoch": 0.3753162131084707, "grad_norm": 5.080131530761719, "learning_rate": 9.625766871165644e-07, "logits/chosen": -2.2053005695343018, "logits/rejected": -2.2649598121643066, "logps/chosen": -59.257057189941406, "logps/rejected": -63.20817565917969, "loss": 0.9136, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18790186941623688, "rewards/margins": 0.4092288315296173, "rewards/rejected": -0.5971307158470154, "step": 306 }, { "epoch": 0.3765427366807206, "grad_norm": 5.404940605163574, "learning_rate": 9.624539877300612e-07, "logits/chosen": -2.2219369411468506, "logits/rejected": -2.2532639503479004, "logps/chosen": -53.412017822265625, "logps/rejected": -59.50637435913086, "loss": 0.8562, "rewards/accuracies": 0.75, "rewards/chosen": 0.32181236147880554, "rewards/margins": 0.7066063284873962, "rewards/rejected": -0.38479387760162354, "step": 307 }, { "epoch": 0.3777692602529705, "grad_norm": 5.867960453033447, "learning_rate": 9.623312883435583e-07, "logits/chosen": -2.2400460243225098, "logits/rejected": -2.279888868331909, "logps/chosen": -62.10943603515625, "logps/rejected": -68.826904296875, "loss": 0.8637, "rewards/accuracies": 0.75, "rewards/chosen": -0.18190330266952515, "rewards/margins": 0.7362865209579468, "rewards/rejected": -0.9181898236274719, "step": 308 }, { "epoch": 0.37899578382522037, "grad_norm": 4.435805797576904, "learning_rate": 9.622085889570552e-07, "logits/chosen": -2.2550230026245117, "logits/rejected": -2.310394287109375, "logps/chosen": -58.27505874633789, "logps/rejected": -61.65081024169922, "loss": 0.9066, "rewards/accuracies": 0.625, "rewards/chosen": -0.22738240659236908, "rewards/margins": 0.4316173791885376, "rewards/rejected": -0.6589998006820679, "step": 309 }, { "epoch": 0.3802223073974703, "grad_norm": 5.253418445587158, "learning_rate": 9.62085889570552e-07, "logits/chosen": -2.268730640411377, "logits/rejected": -2.298671245574951, "logps/chosen": -63.576560974121094, "logps/rejected": -66.01861572265625, "loss": 0.9035, "rewards/accuracies": 0.5625, "rewards/chosen": -0.36101362109184265, "rewards/margins": 0.5211525559425354, "rewards/rejected": -0.8821661472320557, "step": 310 }, { "epoch": 0.3814488309697202, "grad_norm": 5.9543609619140625, "learning_rate": 9.61963190184049e-07, "logits/chosen": -2.2072668075561523, "logits/rejected": -2.2582879066467285, "logps/chosen": -55.696205139160156, "logps/rejected": -61.2053337097168, "loss": 0.8293, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19528734683990479, "rewards/margins": 0.7981851100921631, "rewards/rejected": -0.6028978824615479, "step": 311 }, { "epoch": 0.3826753545419701, "grad_norm": 5.692485332489014, "learning_rate": 9.618404907975459e-07, "logits/chosen": -2.3043253421783447, "logits/rejected": -2.3045525550842285, "logps/chosen": -61.561302185058594, "logps/rejected": -64.9303970336914, "loss": 0.8726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2352408468723297, "rewards/margins": 0.663221001625061, "rewards/rejected": -0.8984617590904236, "step": 312 }, { "epoch": 0.38390187811422, "grad_norm": 5.43686580657959, "learning_rate": 9.61717791411043e-07, "logits/chosen": -2.2170252799987793, "logits/rejected": -2.2370553016662598, "logps/chosen": -55.700469970703125, "logps/rejected": -59.00535583496094, "loss": 0.89, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17604407668113708, "rewards/margins": 0.565276026725769, "rewards/rejected": -0.38923197984695435, "step": 313 }, { "epoch": 0.3851284016864699, "grad_norm": 5.712646484375, "learning_rate": 9.615950920245398e-07, "logits/chosen": -2.3209924697875977, "logits/rejected": -2.324291467666626, "logps/chosen": -54.92711639404297, "logps/rejected": -59.73457717895508, "loss": 0.8795, "rewards/accuracies": 0.75, "rewards/chosen": 0.08657313138246536, "rewards/margins": 0.6093297004699707, "rewards/rejected": -0.5227566361427307, "step": 314 }, { "epoch": 0.38635492525871984, "grad_norm": 4.543511390686035, "learning_rate": 9.614723926380367e-07, "logits/chosen": -2.2758731842041016, "logits/rejected": -2.2777857780456543, "logps/chosen": -56.0162353515625, "logps/rejected": -63.459564208984375, "loss": 0.8814, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19839617609977722, "rewards/margins": 0.6037540435791016, "rewards/rejected": -0.40535783767700195, "step": 315 }, { "epoch": 0.38758144883096973, "grad_norm": 5.313583850860596, "learning_rate": 9.613496932515336e-07, "logits/chosen": -2.280726671218872, "logits/rejected": -2.288475751876831, "logps/chosen": -57.96955108642578, "logps/rejected": -62.462806701660156, "loss": 0.9059, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09790588915348053, "rewards/margins": 0.43596065044403076, "rewards/rejected": -0.5338666439056396, "step": 316 }, { "epoch": 0.3888079724032196, "grad_norm": 5.393964767456055, "learning_rate": 9.612269938650307e-07, "logits/chosen": -2.2506136894226074, "logits/rejected": -2.2755141258239746, "logps/chosen": -55.01011276245117, "logps/rejected": -59.893333435058594, "loss": 0.8443, "rewards/accuracies": 0.65625, "rewards/chosen": 0.12435395270586014, "rewards/margins": 0.7459656596183777, "rewards/rejected": -0.6216117739677429, "step": 317 }, { "epoch": 0.3900344959754695, "grad_norm": 5.572841644287109, "learning_rate": 9.611042944785276e-07, "logits/chosen": -2.2329459190368652, "logits/rejected": -2.2183151245117188, "logps/chosen": -57.50254821777344, "logps/rejected": -64.10252380371094, "loss": 0.8441, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0017463825643062592, "rewards/margins": 0.7768816947937012, "rewards/rejected": -0.7751352787017822, "step": 318 }, { "epoch": 0.3912610195477194, "grad_norm": 5.935307025909424, "learning_rate": 9.609815950920244e-07, "logits/chosen": -2.218738317489624, "logits/rejected": -2.265686273574829, "logps/chosen": -56.405479431152344, "logps/rejected": -62.92597961425781, "loss": 0.8914, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10485661029815674, "rewards/margins": 0.6045342087745667, "rewards/rejected": -0.7093907594680786, "step": 319 }, { "epoch": 0.39248754311996936, "grad_norm": 6.495408058166504, "learning_rate": 9.608588957055215e-07, "logits/chosen": -2.2289376258850098, "logits/rejected": -2.2445452213287354, "logps/chosen": -52.83599853515625, "logps/rejected": -61.05306625366211, "loss": 0.8181, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34156283736228943, "rewards/margins": 0.8953173160552979, "rewards/rejected": -0.553754448890686, "step": 320 }, { "epoch": 0.39371406669221926, "grad_norm": 5.4851531982421875, "learning_rate": 9.607361963190184e-07, "logits/chosen": -2.2440476417541504, "logits/rejected": -2.2513086795806885, "logps/chosen": -54.20941925048828, "logps/rejected": -61.36855697631836, "loss": 0.851, "rewards/accuracies": 0.65625, "rewards/chosen": 0.14473937451839447, "rewards/margins": 0.7295917868614197, "rewards/rejected": -0.584852397441864, "step": 321 }, { "epoch": 0.39494059026446915, "grad_norm": 5.815973281860352, "learning_rate": 9.606134969325153e-07, "logits/chosen": -2.2567269802093506, "logits/rejected": -2.326104164123535, "logps/chosen": -59.60626983642578, "logps/rejected": -63.1359977722168, "loss": 0.9161, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0603538379073143, "rewards/margins": 0.44831031560897827, "rewards/rejected": -0.5086641311645508, "step": 322 }, { "epoch": 0.39616711383671904, "grad_norm": 5.561234951019287, "learning_rate": 9.604907975460122e-07, "logits/chosen": -2.2569925785064697, "logits/rejected": -2.292222499847412, "logps/chosen": -59.242332458496094, "logps/rejected": -62.44320297241211, "loss": 0.8986, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22041264176368713, "rewards/margins": 0.4848427176475525, "rewards/rejected": -0.705255389213562, "step": 323 }, { "epoch": 0.39739363740896894, "grad_norm": 6.051816940307617, "learning_rate": 9.60368098159509e-07, "logits/chosen": -2.2072722911834717, "logits/rejected": -2.2696456909179688, "logps/chosen": -56.534576416015625, "logps/rejected": -58.48869323730469, "loss": 0.9016, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0866236463189125, "rewards/margins": 0.5258961915969849, "rewards/rejected": -0.43927252292633057, "step": 324 }, { "epoch": 0.3986201609812189, "grad_norm": 5.0884294509887695, "learning_rate": 9.602453987730062e-07, "logits/chosen": -2.226076364517212, "logits/rejected": -2.2758688926696777, "logps/chosen": -54.35595703125, "logps/rejected": -58.597564697265625, "loss": 0.8602, "rewards/accuracies": 0.65625, "rewards/chosen": 0.09474647045135498, "rewards/margins": 0.746481716632843, "rewards/rejected": -0.651735246181488, "step": 325 }, { "epoch": 0.3998466845534688, "grad_norm": 5.85272216796875, "learning_rate": 9.60122699386503e-07, "logits/chosen": -2.324272632598877, "logits/rejected": -2.344353199005127, "logps/chosen": -59.79692459106445, "logps/rejected": -62.32032775878906, "loss": 0.8936, "rewards/accuracies": 0.625, "rewards/chosen": -0.2216869294643402, "rewards/margins": 0.5467104315757751, "rewards/rejected": -0.7683973908424377, "step": 326 }, { "epoch": 0.3998466845534688, "eval_logits/chosen": -2.2578561305999756, "eval_logits/rejected": -2.2782931327819824, "eval_logps/chosen": -61.85074234008789, "eval_logps/rejected": -65.34583282470703, "eval_loss": 0.9158653020858765, "eval_rewards/accuracies": 0.6258503198623657, "eval_rewards/chosen": -0.4219382405281067, "eval_rewards/margins": 0.46278753876686096, "eval_rewards/rejected": -0.8847259283065796, "eval_runtime": 1580.2682, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.279, "step": 326 }, { "epoch": 0.40107320812571867, "grad_norm": 6.735551357269287, "learning_rate": 9.6e-07, "logits/chosen": -2.225128173828125, "logits/rejected": -2.27980899810791, "logps/chosen": -63.45716857910156, "logps/rejected": -68.03233337402344, "loss": 0.8745, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3806808590888977, "rewards/margins": 0.6705489754676819, "rewards/rejected": -1.0512298345565796, "step": 327 }, { "epoch": 0.40229973169796857, "grad_norm": 7.1713714599609375, "learning_rate": 9.59877300613497e-07, "logits/chosen": -2.2745864391326904, "logits/rejected": -2.2727513313293457, "logps/chosen": -56.85821533203125, "logps/rejected": -62.52754592895508, "loss": 0.839, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04626765474677086, "rewards/margins": 0.8480675220489502, "rewards/rejected": -0.8943350315093994, "step": 328 }, { "epoch": 0.40352625527021846, "grad_norm": 6.440000534057617, "learning_rate": 9.597546012269939e-07, "logits/chosen": -2.2571496963500977, "logits/rejected": -2.310878038406372, "logps/chosen": -56.6897087097168, "logps/rejected": -61.632957458496094, "loss": 0.8722, "rewards/accuracies": 0.59375, "rewards/chosen": -0.017249591648578644, "rewards/margins": 0.6291567087173462, "rewards/rejected": -0.6464062333106995, "step": 329 }, { "epoch": 0.40475277884246835, "grad_norm": 5.822798252105713, "learning_rate": 9.596319018404908e-07, "logits/chosen": -2.2555994987487793, "logits/rejected": -2.2553188800811768, "logps/chosen": -56.54348373413086, "logps/rejected": -60.91563415527344, "loss": 0.9013, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02374185249209404, "rewards/margins": 0.5290673971176147, "rewards/rejected": -0.5528092384338379, "step": 330 }, { "epoch": 0.4059793024147183, "grad_norm": 5.260415554046631, "learning_rate": 9.595092024539876e-07, "logits/chosen": -2.2957749366760254, "logits/rejected": -2.2680916786193848, "logps/chosen": -60.62851333618164, "logps/rejected": -61.1120719909668, "loss": 0.9146, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3224893808364868, "rewards/margins": 0.4584183692932129, "rewards/rejected": -0.7809076905250549, "step": 331 }, { "epoch": 0.4072058259869682, "grad_norm": 6.539710521697998, "learning_rate": 9.593865030674845e-07, "logits/chosen": -2.268965482711792, "logits/rejected": -2.2951090335845947, "logps/chosen": -59.35078048706055, "logps/rejected": -63.25654602050781, "loss": 0.8656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22039532661437988, "rewards/margins": 0.6067897081375122, "rewards/rejected": -0.8271851539611816, "step": 332 }, { "epoch": 0.4084323495592181, "grad_norm": 6.047589302062988, "learning_rate": 9.592638036809816e-07, "logits/chosen": -2.2455317974090576, "logits/rejected": -2.2553417682647705, "logps/chosen": -60.62687683105469, "logps/rejected": -61.952842712402344, "loss": 0.9335, "rewards/accuracies": 0.59375, "rewards/chosen": -0.29581230878829956, "rewards/margins": 0.41588151454925537, "rewards/rejected": -0.7116938233375549, "step": 333 }, { "epoch": 0.409658873131468, "grad_norm": 7.686211109161377, "learning_rate": 9.591411042944785e-07, "logits/chosen": -2.301391363143921, "logits/rejected": -2.2982077598571777, "logps/chosen": -60.507484436035156, "logps/rejected": -64.45115661621094, "loss": 0.8153, "rewards/accuracies": 0.75, "rewards/chosen": -0.09978879988193512, "rewards/margins": 1.0116045475006104, "rewards/rejected": -1.1113933324813843, "step": 334 }, { "epoch": 0.4108853967037179, "grad_norm": 5.895122528076172, "learning_rate": 9.590184049079754e-07, "logits/chosen": -2.283168315887451, "logits/rejected": -2.2728240489959717, "logps/chosen": -60.51919174194336, "logps/rejected": -61.80083465576172, "loss": 0.949, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1391884982585907, "rewards/margins": 0.31353089213371277, "rewards/rejected": -0.4527193605899811, "step": 335 }, { "epoch": 0.4121119202759678, "grad_norm": 6.892177581787109, "learning_rate": 9.588957055214723e-07, "logits/chosen": -2.240821599960327, "logits/rejected": -2.2708239555358887, "logps/chosen": -54.262691497802734, "logps/rejected": -62.91630935668945, "loss": 0.8607, "rewards/accuracies": 0.75, "rewards/chosen": 0.3299197256565094, "rewards/margins": 0.7435592412948608, "rewards/rejected": -0.41363954544067383, "step": 336 }, { "epoch": 0.4133384438482177, "grad_norm": 6.025763034820557, "learning_rate": 9.587730061349694e-07, "logits/chosen": -2.2480039596557617, "logits/rejected": -2.2774691581726074, "logps/chosen": -61.790077209472656, "logps/rejected": -63.088165283203125, "loss": 0.9683, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4165221154689789, "rewards/margins": 0.2566930055618286, "rewards/rejected": -0.6732151508331299, "step": 337 }, { "epoch": 0.4145649674204676, "grad_norm": 5.529298782348633, "learning_rate": 9.586503067484662e-07, "logits/chosen": -2.280643939971924, "logits/rejected": -2.3382320404052734, "logps/chosen": -60.973609924316406, "logps/rejected": -62.901187896728516, "loss": 0.952, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01367207057774067, "rewards/margins": 0.2091694176197052, "rewards/rejected": -0.19549734890460968, "step": 338 }, { "epoch": 0.4157914909927175, "grad_norm": 5.845704078674316, "learning_rate": 9.585276073619631e-07, "logits/chosen": -2.257638931274414, "logits/rejected": -2.2790331840515137, "logps/chosen": -55.60069274902344, "logps/rejected": -62.08099365234375, "loss": 0.9047, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10929032415151596, "rewards/margins": 0.48547330498695374, "rewards/rejected": -0.5947636365890503, "step": 339 }, { "epoch": 0.4170180145649674, "grad_norm": 6.410233497619629, "learning_rate": 9.5840490797546e-07, "logits/chosen": -2.257439374923706, "logits/rejected": -2.2756187915802, "logps/chosen": -52.63230895996094, "logps/rejected": -61.06159210205078, "loss": 0.8554, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0243537537753582, "rewards/margins": 0.7777584791183472, "rewards/rejected": -0.8021122217178345, "step": 340 }, { "epoch": 0.41824453813721735, "grad_norm": 8.01363468170166, "learning_rate": 9.58282208588957e-07, "logits/chosen": -2.251455068588257, "logits/rejected": -2.2973461151123047, "logps/chosen": -54.18112564086914, "logps/rejected": -62.16909408569336, "loss": 0.8488, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18671204149723053, "rewards/margins": 0.7777460813522339, "rewards/rejected": -0.5910340547561646, "step": 341 }, { "epoch": 0.41947106170946724, "grad_norm": 8.153672218322754, "learning_rate": 9.58159509202454e-07, "logits/chosen": -2.2595598697662354, "logits/rejected": -2.313063621520996, "logps/chosen": -56.081180572509766, "logps/rejected": -60.93899917602539, "loss": 0.8423, "rewards/accuracies": 0.6875, "rewards/chosen": 0.30729562044143677, "rewards/margins": 0.8074788451194763, "rewards/rejected": -0.5001832246780396, "step": 342 }, { "epoch": 0.42069758528171713, "grad_norm": 5.869141578674316, "learning_rate": 9.580368098159508e-07, "logits/chosen": -2.261685848236084, "logits/rejected": -2.344125270843506, "logps/chosen": -57.993247985839844, "logps/rejected": -59.398468017578125, "loss": 0.9173, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1975701004266739, "rewards/margins": 0.46788379549980164, "rewards/rejected": -0.27031373977661133, "step": 343 }, { "epoch": 0.42192410885396703, "grad_norm": 6.116200923919678, "learning_rate": 9.579141104294477e-07, "logits/chosen": -2.262150287628174, "logits/rejected": -2.300358295440674, "logps/chosen": -58.06559753417969, "logps/rejected": -58.340084075927734, "loss": 0.9524, "rewards/accuracies": 0.4375, "rewards/chosen": -0.22115015983581543, "rewards/margins": 0.1825559288263321, "rewards/rejected": -0.4037061035633087, "step": 344 }, { "epoch": 0.4231506324262169, "grad_norm": 5.858030796051025, "learning_rate": 9.577914110429446e-07, "logits/chosen": -2.2387049198150635, "logits/rejected": -2.274326801300049, "logps/chosen": -58.75651550292969, "logps/rejected": -62.590415954589844, "loss": 0.8358, "rewards/accuracies": 0.78125, "rewards/chosen": 0.36701202392578125, "rewards/margins": 0.8775097131729126, "rewards/rejected": -0.5104976296424866, "step": 345 }, { "epoch": 0.42437715599846687, "grad_norm": 7.0314531326293945, "learning_rate": 9.576687116564417e-07, "logits/chosen": -2.2200586795806885, "logits/rejected": -2.2299749851226807, "logps/chosen": -57.3939094543457, "logps/rejected": -62.9317512512207, "loss": 0.8991, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0013783350586891174, "rewards/margins": 0.5758277177810669, "rewards/rejected": -0.577206015586853, "step": 346 }, { "epoch": 0.42560367957071676, "grad_norm": 5.805675983428955, "learning_rate": 9.575460122699386e-07, "logits/chosen": -2.266303539276123, "logits/rejected": -2.3128786087036133, "logps/chosen": -53.5830078125, "logps/rejected": -59.6632194519043, "loss": 0.8349, "rewards/accuracies": 0.75, "rewards/chosen": 0.32730376720428467, "rewards/margins": 0.844002366065979, "rewards/rejected": -0.5166985988616943, "step": 347 }, { "epoch": 0.42683020314296666, "grad_norm": 6.728042125701904, "learning_rate": 9.574233128834357e-07, "logits/chosen": -2.2377984523773193, "logits/rejected": -2.240304470062256, "logps/chosen": -58.96595001220703, "logps/rejected": -64.84738159179688, "loss": 0.9057, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09390954673290253, "rewards/margins": 0.4543970823287964, "rewards/rejected": -0.5483066439628601, "step": 348 }, { "epoch": 0.42805672671521655, "grad_norm": 6.252343654632568, "learning_rate": 9.573006134969326e-07, "logits/chosen": -2.253002882003784, "logits/rejected": -2.267817497253418, "logps/chosen": -55.76494598388672, "logps/rejected": -56.405975341796875, "loss": 0.9874, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14544077217578888, "rewards/margins": 0.06178515404462814, "rewards/rejected": -0.20722590386867523, "step": 349 }, { "epoch": 0.42928325028746644, "grad_norm": 4.908788681030273, "learning_rate": 9.571779141104294e-07, "logits/chosen": -2.2721009254455566, "logits/rejected": -2.290973424911499, "logps/chosen": -58.63761520385742, "logps/rejected": -63.823387145996094, "loss": 0.9129, "rewards/accuracies": 0.625, "rewards/chosen": -0.48862168192863464, "rewards/margins": 0.5092076063156128, "rewards/rejected": -0.9978291988372803, "step": 350 }, { "epoch": 0.4305097738597164, "grad_norm": 9.224908828735352, "learning_rate": 9.570552147239263e-07, "logits/chosen": -2.2322282791137695, "logits/rejected": -2.2747817039489746, "logps/chosen": -58.84535217285156, "logps/rejected": -67.65483093261719, "loss": 0.8494, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06732411682605743, "rewards/margins": 0.8475061058998108, "rewards/rejected": -0.9148302674293518, "step": 351 }, { "epoch": 0.4317362974319663, "grad_norm": 6.261470317840576, "learning_rate": 9.569325153374232e-07, "logits/chosen": -2.2373032569885254, "logits/rejected": -2.2577075958251953, "logps/chosen": -58.68605041503906, "logps/rejected": -64.94111633300781, "loss": 0.9017, "rewards/accuracies": 0.625, "rewards/chosen": -0.20734649896621704, "rewards/margins": 0.4429803788661957, "rewards/rejected": -0.6503268480300903, "step": 352 }, { "epoch": 0.4329628210042162, "grad_norm": 5.820001602172852, "learning_rate": 9.568098159509203e-07, "logits/chosen": -2.302152395248413, "logits/rejected": -2.3101248741149902, "logps/chosen": -56.58473205566406, "logps/rejected": -59.832794189453125, "loss": 0.9298, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005854994058609009, "rewards/margins": 0.39435726404190063, "rewards/rejected": -0.388502299785614, "step": 353 }, { "epoch": 0.4341893445764661, "grad_norm": 5.8163743019104, "learning_rate": 9.566871165644172e-07, "logits/chosen": -2.253797769546509, "logits/rejected": -2.3208184242248535, "logps/chosen": -55.45524978637695, "logps/rejected": -65.34232330322266, "loss": 0.8138, "rewards/accuracies": 0.65625, "rewards/chosen": 0.21711471676826477, "rewards/margins": 0.9892571568489075, "rewards/rejected": -0.7721425294876099, "step": 354 }, { "epoch": 0.43541586814871597, "grad_norm": 7.221750736236572, "learning_rate": 9.56564417177914e-07, "logits/chosen": -2.282639265060425, "logits/rejected": -2.3101742267608643, "logps/chosen": -64.12615966796875, "logps/rejected": -67.7265396118164, "loss": 0.8735, "rewards/accuracies": 0.71875, "rewards/chosen": -0.30292582511901855, "rewards/margins": 0.6626452207565308, "rewards/rejected": -0.9655711054801941, "step": 355 }, { "epoch": 0.43664239172096586, "grad_norm": 5.39942741394043, "learning_rate": 9.56441717791411e-07, "logits/chosen": -2.2432615756988525, "logits/rejected": -2.257458209991455, "logps/chosen": -59.81240463256836, "logps/rejected": -62.33307647705078, "loss": 0.9185, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07470391690731049, "rewards/margins": 0.4726608693599701, "rewards/rejected": -0.5473647713661194, "step": 356 }, { "epoch": 0.4378689152932158, "grad_norm": 6.498956680297852, "learning_rate": 9.56319018404908e-07, "logits/chosen": -2.221082925796509, "logits/rejected": -2.233565330505371, "logps/chosen": -58.57361602783203, "logps/rejected": -63.7485237121582, "loss": 0.8829, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04309704899787903, "rewards/margins": 0.5931733846664429, "rewards/rejected": -0.6362704634666443, "step": 357 }, { "epoch": 0.4390954388654657, "grad_norm": 5.7523908615112305, "learning_rate": 9.56196319018405e-07, "logits/chosen": -2.230121374130249, "logits/rejected": -2.2716240882873535, "logps/chosen": -56.08293151855469, "logps/rejected": -62.59969711303711, "loss": 0.8237, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27548861503601074, "rewards/margins": 1.009591817855835, "rewards/rejected": -0.7341032028198242, "step": 358 }, { "epoch": 0.4403219624377156, "grad_norm": 6.7718729972839355, "learning_rate": 9.560736196319018e-07, "logits/chosen": -2.233549118041992, "logits/rejected": -2.248908042907715, "logps/chosen": -56.9510612487793, "logps/rejected": -61.38718032836914, "loss": 0.8905, "rewards/accuracies": 0.59375, "rewards/chosen": -0.019146114587783813, "rewards/margins": 0.5963788032531738, "rewards/rejected": -0.6155248880386353, "step": 359 }, { "epoch": 0.4415484860099655, "grad_norm": 7.959458351135254, "learning_rate": 9.559509202453987e-07, "logits/chosen": -2.2330245971679688, "logits/rejected": -2.261286735534668, "logps/chosen": -56.65803146362305, "logps/rejected": -66.13904571533203, "loss": 0.8029, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09742365777492523, "rewards/margins": 1.028261423110962, "rewards/rejected": -0.9308377504348755, "step": 360 }, { "epoch": 0.4427750095822154, "grad_norm": 5.358109474182129, "learning_rate": 9.558282208588958e-07, "logits/chosen": -2.297281265258789, "logits/rejected": -2.3284435272216797, "logps/chosen": -59.765708923339844, "logps/rejected": -65.95001220703125, "loss": 0.8691, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08303938806056976, "rewards/margins": 0.7269620299339294, "rewards/rejected": -0.8100014925003052, "step": 361 }, { "epoch": 0.44400153315446533, "grad_norm": 5.940330982208252, "learning_rate": 9.557055214723926e-07, "logits/chosen": -2.27439022064209, "logits/rejected": -2.2728052139282227, "logps/chosen": -61.33500671386719, "logps/rejected": -64.12138366699219, "loss": 0.8717, "rewards/accuracies": 0.625, "rewards/chosen": -0.5155642628669739, "rewards/margins": 0.6097787022590637, "rewards/rejected": -1.1253429651260376, "step": 362 }, { "epoch": 0.4452280567267152, "grad_norm": 6.875891208648682, "learning_rate": 9.555828220858895e-07, "logits/chosen": -2.221226215362549, "logits/rejected": -2.2659640312194824, "logps/chosen": -56.13221740722656, "logps/rejected": -60.5903434753418, "loss": 0.854, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21329468488693237, "rewards/margins": 0.719907283782959, "rewards/rejected": -0.9332020282745361, "step": 363 }, { "epoch": 0.4464545802989651, "grad_norm": 6.3353047370910645, "learning_rate": 9.554601226993864e-07, "logits/chosen": -2.2849671840667725, "logits/rejected": -2.3114113807678223, "logps/chosen": -57.82180404663086, "logps/rejected": -63.958927154541016, "loss": 0.8304, "rewards/accuracies": 0.75, "rewards/chosen": -0.13215047121047974, "rewards/margins": 0.8269644379615784, "rewards/rejected": -0.9591149091720581, "step": 364 }, { "epoch": 0.447681103871215, "grad_norm": 6.329079627990723, "learning_rate": 9.553374233128833e-07, "logits/chosen": -2.2564756870269775, "logits/rejected": -2.267540454864502, "logps/chosen": -60.297786712646484, "logps/rejected": -65.87538146972656, "loss": 0.9058, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2123645693063736, "rewards/margins": 0.5380512475967407, "rewards/rejected": -0.7504159212112427, "step": 365 }, { "epoch": 0.4489076274434649, "grad_norm": 7.176088333129883, "learning_rate": 9.552147239263804e-07, "logits/chosen": -2.230278491973877, "logits/rejected": -2.2725162506103516, "logps/chosen": -59.26995086669922, "logps/rejected": -64.77888488769531, "loss": 0.857, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0653059184551239, "rewards/margins": 0.8258097171783447, "rewards/rejected": -0.8911155462265015, "step": 366 }, { "epoch": 0.45013415101571486, "grad_norm": 6.4880218505859375, "learning_rate": 9.550920245398772e-07, "logits/chosen": -2.284770965576172, "logits/rejected": -2.3278207778930664, "logps/chosen": -59.2244987487793, "logps/rejected": -62.19206237792969, "loss": 0.8822, "rewards/accuracies": 0.625, "rewards/chosen": -0.016896329820156097, "rewards/margins": 0.7260303497314453, "rewards/rejected": -0.7429265975952148, "step": 367 }, { "epoch": 0.45136067458796475, "grad_norm": 7.253659248352051, "learning_rate": 9.549693251533741e-07, "logits/chosen": -2.2368974685668945, "logits/rejected": -2.2309353351593018, "logps/chosen": -57.81060028076172, "logps/rejected": -63.99787902832031, "loss": 0.921, "rewards/accuracies": 0.625, "rewards/chosen": -0.23136478662490845, "rewards/margins": 0.534680187702179, "rewards/rejected": -0.7660449743270874, "step": 368 }, { "epoch": 0.45258719816021464, "grad_norm": 6.386548042297363, "learning_rate": 9.548466257668712e-07, "logits/chosen": -2.2211663722991943, "logits/rejected": -2.267158031463623, "logps/chosen": -57.77837371826172, "logps/rejected": -61.943077087402344, "loss": 0.8712, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0708756148815155, "rewards/margins": 0.6376176476478577, "rewards/rejected": -0.7084932923316956, "step": 369 }, { "epoch": 0.45381372173246454, "grad_norm": 8.060728073120117, "learning_rate": 9.54723926380368e-07, "logits/chosen": -2.2486441135406494, "logits/rejected": -2.2758307456970215, "logps/chosen": -63.18366241455078, "logps/rejected": -66.260986328125, "loss": 0.8874, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3213299512863159, "rewards/margins": 0.5848391056060791, "rewards/rejected": -0.906169056892395, "step": 370 }, { "epoch": 0.45504024530471443, "grad_norm": 6.4959211349487305, "learning_rate": 9.54601226993865e-07, "logits/chosen": -2.287379503250122, "logits/rejected": -2.2938265800476074, "logps/chosen": -58.000755310058594, "logps/rejected": -62.933265686035156, "loss": 0.8605, "rewards/accuracies": 0.75, "rewards/chosen": 0.05017751082777977, "rewards/margins": 0.7693619132041931, "rewards/rejected": -0.7191843390464783, "step": 371 }, { "epoch": 0.4562667688769644, "grad_norm": 6.152864456176758, "learning_rate": 9.544785276073619e-07, "logits/chosen": -2.259780168533325, "logits/rejected": -2.2771401405334473, "logps/chosen": -60.50145721435547, "logps/rejected": -67.7324447631836, "loss": 0.8331, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26231446862220764, "rewards/margins": 0.8161107301712036, "rewards/rejected": -1.0784251689910889, "step": 372 }, { "epoch": 0.4574932924492143, "grad_norm": 6.374204158782959, "learning_rate": 9.543558282208587e-07, "logits/chosen": -2.2074193954467773, "logits/rejected": -2.2588040828704834, "logps/chosen": -58.08346939086914, "logps/rejected": -61.410770416259766, "loss": 0.8874, "rewards/accuracies": 0.59375, "rewards/chosen": 0.008480317890644073, "rewards/margins": 0.5797194242477417, "rewards/rejected": -0.5712391138076782, "step": 373 }, { "epoch": 0.45871981602146417, "grad_norm": 6.4423418045043945, "learning_rate": 9.542331288343558e-07, "logits/chosen": -2.266986608505249, "logits/rejected": -2.2556777000427246, "logps/chosen": -59.111915588378906, "logps/rejected": -62.538108825683594, "loss": 0.9688, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2638198733329773, "rewards/margins": 0.2542833685874939, "rewards/rejected": -0.5181032419204712, "step": 374 }, { "epoch": 0.45994633959371406, "grad_norm": 6.614043712615967, "learning_rate": 9.541104294478527e-07, "logits/chosen": -2.282231330871582, "logits/rejected": -2.3119194507598877, "logps/chosen": -56.19338607788086, "logps/rejected": -63.83730697631836, "loss": 0.8578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0018987134099006653, "rewards/margins": 0.7401984333992004, "rewards/rejected": -0.7420971989631653, "step": 375 }, { "epoch": 0.46117286316596395, "grad_norm": 6.448078155517578, "learning_rate": 9.539877300613496e-07, "logits/chosen": -2.262692928314209, "logits/rejected": -2.2725257873535156, "logps/chosen": -57.09231948852539, "logps/rejected": -63.919429779052734, "loss": 0.8559, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08282239735126495, "rewards/margins": 0.7602407932281494, "rewards/rejected": -0.6774184107780457, "step": 376 }, { "epoch": 0.4623993867382139, "grad_norm": 5.631490230560303, "learning_rate": 9.538650306748467e-07, "logits/chosen": -2.2404775619506836, "logits/rejected": -2.2847225666046143, "logps/chosen": -57.469966888427734, "logps/rejected": -66.8936767578125, "loss": 0.826, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0953945517539978, "rewards/margins": 0.982181966304779, "rewards/rejected": -0.8867873549461365, "step": 377 }, { "epoch": 0.4636259103104638, "grad_norm": 8.133557319641113, "learning_rate": 9.537423312883436e-07, "logits/chosen": -2.2324650287628174, "logits/rejected": -2.2906742095947266, "logps/chosen": -58.11244201660156, "logps/rejected": -68.59379577636719, "loss": 0.7636, "rewards/accuracies": 0.84375, "rewards/chosen": -0.019939452409744263, "rewards/margins": 1.3991029262542725, "rewards/rejected": -1.4190423488616943, "step": 378 }, { "epoch": 0.4648524338827137, "grad_norm": 6.3818464279174805, "learning_rate": 9.536196319018404e-07, "logits/chosen": -2.2519447803497314, "logits/rejected": -2.321873903274536, "logps/chosen": -51.90510940551758, "logps/rejected": -60.423301696777344, "loss": 0.8219, "rewards/accuracies": 0.875, "rewards/chosen": 0.43170666694641113, "rewards/margins": 1.1032767295837402, "rewards/rejected": -0.6715701222419739, "step": 379 }, { "epoch": 0.4660789574549636, "grad_norm": 5.354892730712891, "learning_rate": 9.534969325153373e-07, "logits/chosen": -2.2798702716827393, "logits/rejected": -2.3404455184936523, "logps/chosen": -57.51261520385742, "logps/rejected": -60.95307922363281, "loss": 0.8717, "rewards/accuracies": 0.75, "rewards/chosen": -0.05786110833287239, "rewards/margins": 0.7104107141494751, "rewards/rejected": -0.768271803855896, "step": 380 }, { "epoch": 0.4673054810272135, "grad_norm": 6.147059917449951, "learning_rate": 9.533742331288343e-07, "logits/chosen": -2.260438919067383, "logits/rejected": -2.2734103202819824, "logps/chosen": -59.617225646972656, "logps/rejected": -61.30101013183594, "loss": 0.976, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4098794460296631, "rewards/margins": 0.24075110256671906, "rewards/rejected": -0.6506305932998657, "step": 381 }, { "epoch": 0.46853200459946337, "grad_norm": 6.859673976898193, "learning_rate": 9.532515337423312e-07, "logits/chosen": -2.211235761642456, "logits/rejected": -2.2480220794677734, "logps/chosen": -58.41211700439453, "logps/rejected": -61.024986267089844, "loss": 0.9433, "rewards/accuracies": 0.59375, "rewards/chosen": 0.1388070434331894, "rewards/margins": 0.2726331651210785, "rewards/rejected": -0.1338261514902115, "step": 382 }, { "epoch": 0.4697585281717133, "grad_norm": 7.5865478515625, "learning_rate": 9.531288343558282e-07, "logits/chosen": -2.216745376586914, "logits/rejected": -2.2861852645874023, "logps/chosen": -56.24812698364258, "logps/rejected": -64.11015319824219, "loss": 0.7687, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4574652910232544, "rewards/margins": 1.170404314994812, "rewards/rejected": -0.7129390239715576, "step": 383 }, { "epoch": 0.4709850517439632, "grad_norm": 11.046635627746582, "learning_rate": 9.530061349693252e-07, "logits/chosen": -2.278231143951416, "logits/rejected": -2.285125494003296, "logps/chosen": -59.5843391418457, "logps/rejected": -64.12098693847656, "loss": 0.8859, "rewards/accuracies": 0.46875, "rewards/chosen": 0.11732444167137146, "rewards/margins": 0.6343826055526733, "rewards/rejected": -0.5170581340789795, "step": 384 }, { "epoch": 0.4722115753162131, "grad_norm": 8.087364196777344, "learning_rate": 9.52883435582822e-07, "logits/chosen": -2.2351412773132324, "logits/rejected": -2.2553904056549072, "logps/chosen": -51.55535125732422, "logps/rejected": -58.38505172729492, "loss": 0.7953, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6417253017425537, "rewards/margins": 1.096475601196289, "rewards/rejected": -0.45475029945373535, "step": 385 }, { "epoch": 0.473438098888463, "grad_norm": 6.271114826202393, "learning_rate": 9.52760736196319e-07, "logits/chosen": -2.256366491317749, "logits/rejected": -2.294539451599121, "logps/chosen": -55.261322021484375, "logps/rejected": -61.32661056518555, "loss": 0.8577, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33206069469451904, "rewards/margins": 0.6559973955154419, "rewards/rejected": -0.32393670082092285, "step": 386 }, { "epoch": 0.4746646224607129, "grad_norm": 6.610971927642822, "learning_rate": 9.526380368098159e-07, "logits/chosen": -2.2771153450012207, "logits/rejected": -2.3000659942626953, "logps/chosen": -57.43661880493164, "logps/rejected": -67.27749633789062, "loss": 0.8397, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08738642930984497, "rewards/margins": 0.8639270663261414, "rewards/rejected": -0.9513134360313416, "step": 387 }, { "epoch": 0.47589114603296284, "grad_norm": 6.53591251373291, "learning_rate": 9.525153374233128e-07, "logits/chosen": -2.223543167114258, "logits/rejected": -2.2854061126708984, "logps/chosen": -56.573307037353516, "logps/rejected": -62.65263366699219, "loss": 0.8568, "rewards/accuracies": 0.625, "rewards/chosen": 0.2849651277065277, "rewards/margins": 0.7885950803756714, "rewards/rejected": -0.5036299228668213, "step": 388 }, { "epoch": 0.47711766960521274, "grad_norm": 7.4255900382995605, "learning_rate": 9.523926380368098e-07, "logits/chosen": -2.3113625049591064, "logits/rejected": -2.2993385791778564, "logps/chosen": -54.729705810546875, "logps/rejected": -62.153385162353516, "loss": 0.8766, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3284587860107422, "rewards/margins": 0.6703734993934631, "rewards/rejected": -0.3419147729873657, "step": 389 }, { "epoch": 0.47834419317746263, "grad_norm": 6.143249988555908, "learning_rate": 9.522699386503067e-07, "logits/chosen": -2.2327046394348145, "logits/rejected": -2.297597646713257, "logps/chosen": -59.42250061035156, "logps/rejected": -63.931190490722656, "loss": 0.9469, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07612080872058868, "rewards/margins": 0.3372560441493988, "rewards/rejected": -0.41337689757347107, "step": 390 }, { "epoch": 0.4795707167497125, "grad_norm": 8.60274600982666, "learning_rate": 9.521472392638036e-07, "logits/chosen": -2.215038776397705, "logits/rejected": -2.242554187774658, "logps/chosen": -53.3232421875, "logps/rejected": -65.06977081298828, "loss": 0.8172, "rewards/accuracies": 0.75, "rewards/chosen": 0.37148892879486084, "rewards/margins": 0.9947307109832764, "rewards/rejected": -0.6232417821884155, "step": 391 }, { "epoch": 0.4807972403219624, "grad_norm": 8.637670516967773, "learning_rate": 9.520245398773005e-07, "logits/chosen": -2.277238607406616, "logits/rejected": -2.370837688446045, "logps/chosen": -56.227298736572266, "logps/rejected": -57.234039306640625, "loss": 0.8749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.14828750491142273, "rewards/margins": 0.6878867149353027, "rewards/rejected": -0.5395991802215576, "step": 392 }, { "epoch": 0.48202376389421236, "grad_norm": 7.6016974449157715, "learning_rate": 9.519018404907975e-07, "logits/chosen": -2.255810260772705, "logits/rejected": -2.288639783859253, "logps/chosen": -54.48923110961914, "logps/rejected": -61.64469909667969, "loss": 0.784, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4049600064754486, "rewards/margins": 1.2064282894134521, "rewards/rejected": -0.8014682531356812, "step": 393 }, { "epoch": 0.48325028746646226, "grad_norm": 7.2595038414001465, "learning_rate": 9.517791411042945e-07, "logits/chosen": -2.2522735595703125, "logits/rejected": -2.265486001968384, "logps/chosen": -50.06846237182617, "logps/rejected": -57.480812072753906, "loss": 0.8196, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6086874604225159, "rewards/margins": 0.9205632209777832, "rewards/rejected": -0.31187573075294495, "step": 394 }, { "epoch": 0.48447681103871215, "grad_norm": 6.213270664215088, "learning_rate": 9.516564417177914e-07, "logits/chosen": -2.3016273975372314, "logits/rejected": -2.301999568939209, "logps/chosen": -55.87017822265625, "logps/rejected": -59.44801330566406, "loss": 0.9093, "rewards/accuracies": 0.625, "rewards/chosen": -0.04923781752586365, "rewards/margins": 0.5869473218917847, "rewards/rejected": -0.6361851692199707, "step": 395 }, { "epoch": 0.48570333461096205, "grad_norm": 6.591039180755615, "learning_rate": 9.515337423312883e-07, "logits/chosen": -2.2771334648132324, "logits/rejected": -2.2795560359954834, "logps/chosen": -55.69614791870117, "logps/rejected": -66.2806625366211, "loss": 0.8108, "rewards/accuracies": 0.78125, "rewards/chosen": 0.18114736676216125, "rewards/margins": 1.0921123027801514, "rewards/rejected": -0.910964846611023, "step": 396 }, { "epoch": 0.48692985818321194, "grad_norm": 6.158771514892578, "learning_rate": 9.514110429447852e-07, "logits/chosen": -2.230806827545166, "logits/rejected": -2.29724383354187, "logps/chosen": -54.85066223144531, "logps/rejected": -61.976768493652344, "loss": 0.8579, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1307825744152069, "rewards/margins": 0.7162766456604004, "rewards/rejected": -0.5854940414428711, "step": 397 }, { "epoch": 0.4881563817554619, "grad_norm": 6.451861381530762, "learning_rate": 9.512883435582821e-07, "logits/chosen": -2.2799768447875977, "logits/rejected": -2.2490439414978027, "logps/chosen": -52.34040832519531, "logps/rejected": -58.817710876464844, "loss": 0.9073, "rewards/accuracies": 0.59375, "rewards/chosen": 0.4397425055503845, "rewards/margins": 0.4862903952598572, "rewards/rejected": -0.04654790088534355, "step": 398 }, { "epoch": 0.4893829053277118, "grad_norm": 8.339524269104004, "learning_rate": 9.511656441717791e-07, "logits/chosen": -2.2588391304016113, "logits/rejected": -2.2848639488220215, "logps/chosen": -54.65864181518555, "logps/rejected": -63.820281982421875, "loss": 0.8021, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5225483179092407, "rewards/margins": 1.1008210182189941, "rewards/rejected": -0.5782725811004639, "step": 399 }, { "epoch": 0.4906094288999617, "grad_norm": 5.869194507598877, "learning_rate": 9.51042944785276e-07, "logits/chosen": -2.2808756828308105, "logits/rejected": -2.299875020980835, "logps/chosen": -57.685794830322266, "logps/rejected": -66.4992904663086, "loss": 0.8617, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05788746476173401, "rewards/margins": 0.7290492653846741, "rewards/rejected": -0.6711616516113281, "step": 400 }, { "epoch": 0.49183595247221157, "grad_norm": 6.7202301025390625, "learning_rate": 9.509202453987729e-07, "logits/chosen": -2.2623395919799805, "logits/rejected": -2.2891573905944824, "logps/chosen": -55.63900375366211, "logps/rejected": -62.142311096191406, "loss": 0.9281, "rewards/accuracies": 0.5, "rewards/chosen": -0.20376357436180115, "rewards/margins": 0.3615686893463135, "rewards/rejected": -0.5653322339057922, "step": 401 }, { "epoch": 0.49306247604446146, "grad_norm": 8.711150169372559, "learning_rate": 9.507975460122699e-07, "logits/chosen": -2.2892167568206787, "logits/rejected": -2.315486192703247, "logps/chosen": -54.35600280761719, "logps/rejected": -61.126731872558594, "loss": 0.8691, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41067129373550415, "rewards/margins": 0.7283985614776611, "rewards/rejected": -0.317727267742157, "step": 402 }, { "epoch": 0.4942889996167114, "grad_norm": 6.578520774841309, "learning_rate": 9.506748466257668e-07, "logits/chosen": -2.2330052852630615, "logits/rejected": -2.2602782249450684, "logps/chosen": -58.1407470703125, "logps/rejected": -59.31973648071289, "loss": 0.9562, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0010797977447509766, "rewards/margins": 0.22693797945976257, "rewards/rejected": -0.22585821151733398, "step": 403 }, { "epoch": 0.4955155231889613, "grad_norm": 8.73897933959961, "learning_rate": 9.505521472392638e-07, "logits/chosen": -2.248413562774658, "logits/rejected": -2.2404284477233887, "logps/chosen": -55.78387451171875, "logps/rejected": -62.33340835571289, "loss": 0.8728, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05282985419034958, "rewards/margins": 0.7476255297660828, "rewards/rejected": -0.6947957277297974, "step": 404 }, { "epoch": 0.4967420467612112, "grad_norm": 6.73830509185791, "learning_rate": 9.504294478527607e-07, "logits/chosen": -2.2505850791931152, "logits/rejected": -2.270467758178711, "logps/chosen": -60.287147521972656, "logps/rejected": -68.59928131103516, "loss": 0.855, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27257487177848816, "rewards/margins": 0.7244172096252441, "rewards/rejected": -0.9969919919967651, "step": 405 }, { "epoch": 0.4979685703334611, "grad_norm": 6.247476100921631, "learning_rate": 9.503067484662576e-07, "logits/chosen": -2.2929420471191406, "logits/rejected": -2.294867753982544, "logps/chosen": -67.34335327148438, "logps/rejected": -72.70507049560547, "loss": 0.9137, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6090299487113953, "rewards/margins": 0.4632168710231781, "rewards/rejected": -1.072246789932251, "step": 406 }, { "epoch": 0.499195093905711, "grad_norm": 5.951550483703613, "learning_rate": 9.501840490797546e-07, "logits/chosen": -2.245431900024414, "logits/rejected": -2.312390089035034, "logps/chosen": -58.80941390991211, "logps/rejected": -68.43197631835938, "loss": 0.7997, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20629091560840607, "rewards/margins": 1.0975618362426758, "rewards/rejected": -1.3038526773452759, "step": 407 }, { "epoch": 0.5004216174779609, "grad_norm": 7.4349846839904785, "learning_rate": 9.500613496932515e-07, "logits/chosen": -2.273897171020508, "logits/rejected": -2.294863700866699, "logps/chosen": -59.51185989379883, "logps/rejected": -64.61485290527344, "loss": 0.8868, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13955430686473846, "rewards/margins": 0.6815007925033569, "rewards/rejected": -0.8210550546646118, "step": 408 }, { "epoch": 0.5016481410502108, "grad_norm": 7.635817527770996, "learning_rate": 9.499386503067485e-07, "logits/chosen": -2.2956581115722656, "logits/rejected": -2.3236024379730225, "logps/chosen": -61.80683898925781, "logps/rejected": -71.2758560180664, "loss": 0.875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25912168622016907, "rewards/margins": 0.7619784474372864, "rewards/rejected": -1.0211001634597778, "step": 409 }, { "epoch": 0.5028746646224607, "grad_norm": 7.235538482666016, "learning_rate": 9.498159509202453e-07, "logits/chosen": -2.249835252761841, "logits/rejected": -2.260240316390991, "logps/chosen": -57.4884033203125, "logps/rejected": -69.96846008300781, "loss": 0.7904, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12947684526443481, "rewards/margins": 1.2230929136276245, "rewards/rejected": -1.352569818496704, "step": 410 }, { "epoch": 0.5041011881947106, "grad_norm": 6.496146202087402, "learning_rate": 9.496932515337422e-07, "logits/chosen": -2.2711594104766846, "logits/rejected": -2.30186128616333, "logps/chosen": -66.38610076904297, "logps/rejected": -71.63774871826172, "loss": 0.9103, "rewards/accuracies": 0.46875, "rewards/chosen": -0.21779224276542664, "rewards/margins": 0.5668827295303345, "rewards/rejected": -0.7846750020980835, "step": 411 }, { "epoch": 0.5053277117669606, "grad_norm": 6.316473960876465, "learning_rate": 9.495705521472392e-07, "logits/chosen": -2.2560136318206787, "logits/rejected": -2.2930006980895996, "logps/chosen": -62.23032760620117, "logps/rejected": -65.09061431884766, "loss": 0.9405, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4847107231616974, "rewards/margins": 0.36682575941085815, "rewards/rejected": -0.8515364527702332, "step": 412 }, { "epoch": 0.5065542353392104, "grad_norm": 12.051758766174316, "learning_rate": 9.494478527607362e-07, "logits/chosen": -2.2520244121551514, "logits/rejected": -2.265550374984741, "logps/chosen": -64.60804748535156, "logps/rejected": -68.3055419921875, "loss": 0.984, "rewards/accuracies": 0.46875, "rewards/chosen": -0.75027996301651, "rewards/margins": 0.1224619597196579, "rewards/rejected": -0.8727419376373291, "step": 413 }, { "epoch": 0.5077807589114604, "grad_norm": 8.05204963684082, "learning_rate": 9.493251533742332e-07, "logits/chosen": -2.2436163425445557, "logits/rejected": -2.2340214252471924, "logps/chosen": -60.23933029174805, "logps/rejected": -72.43930053710938, "loss": 0.8095, "rewards/accuracies": 0.75, "rewards/chosen": -0.3584403991699219, "rewards/margins": 1.136427402496338, "rewards/rejected": -1.4948678016662598, "step": 414 }, { "epoch": 0.5090072824837102, "grad_norm": 10.823376655578613, "learning_rate": 9.4920245398773e-07, "logits/chosen": -2.3392715454101562, "logits/rejected": -2.3630290031433105, "logps/chosen": -60.55192947387695, "logps/rejected": -71.6903076171875, "loss": 0.8301, "rewards/accuracies": 0.625, "rewards/chosen": -0.4181186854839325, "rewards/margins": 0.910157322883606, "rewards/rejected": -1.3282761573791504, "step": 415 }, { "epoch": 0.5102338060559601, "grad_norm": 6.344428539276123, "learning_rate": 9.490797546012269e-07, "logits/chosen": -2.27579402923584, "logits/rejected": -2.288632869720459, "logps/chosen": -64.83291625976562, "logps/rejected": -69.90808868408203, "loss": 0.9089, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7750338912010193, "rewards/margins": 0.5413833856582642, "rewards/rejected": -1.3164170980453491, "step": 416 }, { "epoch": 0.5114603296282101, "grad_norm": 8.29255199432373, "learning_rate": 9.489570552147239e-07, "logits/chosen": -2.2594640254974365, "logits/rejected": -2.3134944438934326, "logps/chosen": -60.178367614746094, "logps/rejected": -69.78460693359375, "loss": 0.8322, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2807827591896057, "rewards/margins": 0.9648816585540771, "rewards/rejected": -1.245664358139038, "step": 417 }, { "epoch": 0.5126868532004599, "grad_norm": 8.423164367675781, "learning_rate": 9.488343558282208e-07, "logits/chosen": -2.2563931941986084, "logits/rejected": -2.2507801055908203, "logps/chosen": -62.89977264404297, "logps/rejected": -71.68342590332031, "loss": 0.8097, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6325929164886475, "rewards/margins": 1.2178970575332642, "rewards/rejected": -1.8504899740219116, "step": 418 }, { "epoch": 0.5139133767727099, "grad_norm": 7.771142482757568, "learning_rate": 9.487116564417178e-07, "logits/chosen": -2.2595176696777344, "logits/rejected": -2.2839901447296143, "logps/chosen": -62.37128829956055, "logps/rejected": -69.1981201171875, "loss": 0.8331, "rewards/accuracies": 0.75, "rewards/chosen": -0.3114216923713684, "rewards/margins": 0.931984007358551, "rewards/rejected": -1.243405818939209, "step": 419 }, { "epoch": 0.5151399003449597, "grad_norm": 6.879971981048584, "learning_rate": 9.485889570552147e-07, "logits/chosen": -2.294011116027832, "logits/rejected": -2.3129515647888184, "logps/chosen": -64.31007385253906, "logps/rejected": -68.09830474853516, "loss": 0.9071, "rewards/accuracies": 0.625, "rewards/chosen": -0.4341059923171997, "rewards/margins": 0.3777979612350464, "rewards/rejected": -0.8119040131568909, "step": 420 }, { "epoch": 0.5163664239172097, "grad_norm": 7.26153039932251, "learning_rate": 9.484662576687115e-07, "logits/chosen": -2.3217966556549072, "logits/rejected": -2.2653419971466064, "logps/chosen": -59.94739532470703, "logps/rejected": -65.02067565917969, "loss": 0.9275, "rewards/accuracies": 0.5, "rewards/chosen": -0.39647433161735535, "rewards/margins": 0.31786301732063293, "rewards/rejected": -0.7143373489379883, "step": 421 }, { "epoch": 0.5175929474894596, "grad_norm": 7.268307685852051, "learning_rate": 9.483435582822085e-07, "logits/chosen": -2.2445709705352783, "logits/rejected": -2.2579092979431152, "logps/chosen": -56.981178283691406, "logps/rejected": -63.8418083190918, "loss": 0.8461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025759443640708923, "rewards/margins": 0.9624531269073486, "rewards/rejected": -0.9882127046585083, "step": 422 }, { "epoch": 0.5188194710617094, "grad_norm": 7.420912265777588, "learning_rate": 9.482208588957055e-07, "logits/chosen": -2.2623491287231445, "logits/rejected": -2.352717161178589, "logps/chosen": -59.582969665527344, "logps/rejected": -71.22115325927734, "loss": 0.7981, "rewards/accuracies": 0.875, "rewards/chosen": 0.15692737698554993, "rewards/margins": 1.397423505783081, "rewards/rejected": -1.240496039390564, "step": 423 }, { "epoch": 0.5200459946339594, "grad_norm": 8.376972198486328, "learning_rate": 9.480981595092025e-07, "logits/chosen": -2.303360939025879, "logits/rejected": -2.3048949241638184, "logps/chosen": -59.53324890136719, "logps/rejected": -68.381103515625, "loss": 0.82, "rewards/accuracies": 0.75, "rewards/chosen": -0.08282776176929474, "rewards/margins": 1.097623348236084, "rewards/rejected": -1.1804511547088623, "step": 424 }, { "epoch": 0.5212725182062092, "grad_norm": 7.210766792297363, "learning_rate": 9.479754601226994e-07, "logits/chosen": -2.2094650268554688, "logits/rejected": -2.266850471496582, "logps/chosen": -54.360801696777344, "logps/rejected": -63.9296875, "loss": 0.7635, "rewards/accuracies": 0.75, "rewards/chosen": 0.47482678294181824, "rewards/margins": 1.3408757448196411, "rewards/rejected": -0.8660489320755005, "step": 425 }, { "epoch": 0.5224990417784592, "grad_norm": 6.378595352172852, "learning_rate": 9.478527607361963e-07, "logits/chosen": -2.321707248687744, "logits/rejected": -2.3259809017181396, "logps/chosen": -60.329830169677734, "logps/rejected": -67.35899353027344, "loss": 0.8767, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3202356696128845, "rewards/margins": 0.8628843426704407, "rewards/rejected": -1.1831200122833252, "step": 426 }, { "epoch": 0.5237255653507091, "grad_norm": 7.97865104675293, "learning_rate": 9.477300613496933e-07, "logits/chosen": -2.2851855754852295, "logits/rejected": -2.3211045265197754, "logps/chosen": -56.50232696533203, "logps/rejected": -61.114898681640625, "loss": 0.9394, "rewards/accuracies": 0.59375, "rewards/chosen": 0.11606843769550323, "rewards/margins": 0.4502440392971039, "rewards/rejected": -0.3341755270957947, "step": 427 }, { "epoch": 0.524952088922959, "grad_norm": 6.690507411956787, "learning_rate": 9.476073619631901e-07, "logits/chosen": -2.2776427268981934, "logits/rejected": -2.3424177169799805, "logps/chosen": -56.18839645385742, "logps/rejected": -61.82408142089844, "loss": 0.8392, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4412633776664734, "rewards/margins": 0.976090669631958, "rewards/rejected": -0.5348272919654846, "step": 428 }, { "epoch": 0.5261786124952089, "grad_norm": 10.4157075881958, "learning_rate": 9.47484662576687e-07, "logits/chosen": -2.246428966522217, "logits/rejected": -2.327258348464966, "logps/chosen": -53.831512451171875, "logps/rejected": -61.25825500488281, "loss": 0.8064, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5522216558456421, "rewards/margins": 1.0766193866729736, "rewards/rejected": -0.5243977308273315, "step": 429 }, { "epoch": 0.5274051360674588, "grad_norm": 8.476786613464355, "learning_rate": 9.47361963190184e-07, "logits/chosen": -2.206359386444092, "logits/rejected": -2.2257587909698486, "logps/chosen": -56.289833068847656, "logps/rejected": -66.46004486083984, "loss": 0.7886, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6085401773452759, "rewards/margins": 1.1558600664138794, "rewards/rejected": -0.5473198294639587, "step": 430 }, { "epoch": 0.5286316596397087, "grad_norm": 6.770932197570801, "learning_rate": 9.472392638036809e-07, "logits/chosen": -2.3039181232452393, "logits/rejected": -2.330779552459717, "logps/chosen": -56.62083053588867, "logps/rejected": -63.417724609375, "loss": 0.8521, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3275827467441559, "rewards/margins": 0.7997776865959167, "rewards/rejected": -0.47219496965408325, "step": 431 }, { "epoch": 0.5298581832119587, "grad_norm": 5.882030963897705, "learning_rate": 9.471165644171779e-07, "logits/chosen": -2.339186191558838, "logits/rejected": -2.3692028522491455, "logps/chosen": -56.16205596923828, "logps/rejected": -62.89286804199219, "loss": 0.8986, "rewards/accuracies": 0.625, "rewards/chosen": -0.08595012873411179, "rewards/margins": 0.648918628692627, "rewards/rejected": -0.7348687648773193, "step": 432 }, { "epoch": 0.5310847067842085, "grad_norm": 7.077145099639893, "learning_rate": 9.469938650306749e-07, "logits/chosen": -2.2147727012634277, "logits/rejected": -2.270742416381836, "logps/chosen": -54.70964431762695, "logps/rejected": -66.58744812011719, "loss": 0.81, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11499188095331192, "rewards/margins": 1.1444189548492432, "rewards/rejected": -1.0294270515441895, "step": 433 }, { "epoch": 0.5323112303564584, "grad_norm": 7.042121410369873, "learning_rate": 9.468711656441717e-07, "logits/chosen": -2.2500786781311035, "logits/rejected": -2.2962608337402344, "logps/chosen": -56.32948303222656, "logps/rejected": -68.78521728515625, "loss": 0.8508, "rewards/accuracies": 0.71875, "rewards/chosen": -0.006680868566036224, "rewards/margins": 1.0078204870224, "rewards/rejected": -1.0145013332366943, "step": 434 }, { "epoch": 0.5335377539287083, "grad_norm": 10.830611228942871, "learning_rate": 9.467484662576687e-07, "logits/chosen": -2.334223508834839, "logits/rejected": -2.319321632385254, "logps/chosen": -58.526695251464844, "logps/rejected": -63.991371154785156, "loss": 0.8889, "rewards/accuracies": 0.71875, "rewards/chosen": 0.004643574357032776, "rewards/margins": 0.5995644330978394, "rewards/rejected": -0.5949208736419678, "step": 435 }, { "epoch": 0.5347642775009582, "grad_norm": 7.957648754119873, "learning_rate": 9.466257668711656e-07, "logits/chosen": -2.297636032104492, "logits/rejected": -2.342691659927368, "logps/chosen": -54.92702865600586, "logps/rejected": -67.25183868408203, "loss": 0.7828, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0009310301393270493, "rewards/margins": 1.2762905359268188, "rewards/rejected": -1.2772215604782104, "step": 436 }, { "epoch": 0.5359908010732082, "grad_norm": 8.157051086425781, "learning_rate": 9.465030674846626e-07, "logits/chosen": -2.316720724105835, "logits/rejected": -2.379434585571289, "logps/chosen": -53.646324157714844, "logps/rejected": -63.33824157714844, "loss": 0.8145, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2282305657863617, "rewards/margins": 1.0705374479293823, "rewards/rejected": -0.8423067927360535, "step": 437 }, { "epoch": 0.537217324645458, "grad_norm": 6.802351474761963, "learning_rate": 9.463803680981595e-07, "logits/chosen": -2.2570340633392334, "logits/rejected": -2.30102801322937, "logps/chosen": -60.23997116088867, "logps/rejected": -69.00668334960938, "loss": 0.8568, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10066071152687073, "rewards/margins": 0.9106139540672302, "rewards/rejected": -1.0112745761871338, "step": 438 }, { "epoch": 0.538443848217708, "grad_norm": 7.031404495239258, "learning_rate": 9.462576687116563e-07, "logits/chosen": -2.300276756286621, "logits/rejected": -2.3169052600860596, "logps/chosen": -54.59074401855469, "logps/rejected": -68.30851745605469, "loss": 0.7771, "rewards/accuracies": 0.75, "rewards/chosen": -0.1520187258720398, "rewards/margins": 1.31582510471344, "rewards/rejected": -1.467843770980835, "step": 439 }, { "epoch": 0.5396703717899578, "grad_norm": 9.602483749389648, "learning_rate": 9.461349693251533e-07, "logits/chosen": -2.248798131942749, "logits/rejected": -2.289296865463257, "logps/chosen": -57.07331466674805, "logps/rejected": -69.68745422363281, "loss": 0.757, "rewards/accuracies": 0.78125, "rewards/chosen": 0.16103336215019226, "rewards/margins": 1.4242743253707886, "rewards/rejected": -1.2632410526275635, "step": 440 }, { "epoch": 0.5408968953622078, "grad_norm": 9.485013008117676, "learning_rate": 9.460122699386502e-07, "logits/chosen": -2.2918968200683594, "logits/rejected": -2.2876482009887695, "logps/chosen": -59.068809509277344, "logps/rejected": -70.19426727294922, "loss": 0.8689, "rewards/accuracies": 0.625, "rewards/chosen": -0.5371472835540771, "rewards/margins": 0.9623684883117676, "rewards/rejected": -1.4995157718658447, "step": 441 }, { "epoch": 0.5421234189344577, "grad_norm": 10.968368530273438, "learning_rate": 9.458895705521472e-07, "logits/chosen": -2.1884422302246094, "logits/rejected": -2.2422943115234375, "logps/chosen": -55.65364074707031, "logps/rejected": -64.3056640625, "loss": 0.8504, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2682414948940277, "rewards/margins": 0.9263307452201843, "rewards/rejected": -1.1945720911026, "step": 442 }, { "epoch": 0.5433499425067075, "grad_norm": 7.772372722625732, "learning_rate": 9.457668711656441e-07, "logits/chosen": -2.2671053409576416, "logits/rejected": -2.2651281356811523, "logps/chosen": -58.681915283203125, "logps/rejected": -66.40718841552734, "loss": 0.8188, "rewards/accuracies": 0.75, "rewards/chosen": -0.12968884408473969, "rewards/margins": 1.0583629608154297, "rewards/rejected": -1.1880519390106201, "step": 443 }, { "epoch": 0.5445764660789575, "grad_norm": 5.902589797973633, "learning_rate": 9.456441717791411e-07, "logits/chosen": -2.2687418460845947, "logits/rejected": -2.255512237548828, "logps/chosen": -61.407901763916016, "logps/rejected": -74.56150817871094, "loss": 0.7873, "rewards/accuracies": 0.75, "rewards/chosen": -0.16061213612556458, "rewards/margins": 1.3091222047805786, "rewards/rejected": -1.4697343111038208, "step": 444 }, { "epoch": 0.5458029896512073, "grad_norm": 6.87794828414917, "learning_rate": 9.45521472392638e-07, "logits/chosen": -2.32125186920166, "logits/rejected": -2.340721368789673, "logps/chosen": -58.937191009521484, "logps/rejected": -66.15801239013672, "loss": 0.8366, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18301545083522797, "rewards/margins": 0.9522894024848938, "rewards/rejected": -1.1353048086166382, "step": 445 }, { "epoch": 0.5470295132234573, "grad_norm": 8.411458015441895, "learning_rate": 9.453987730061349e-07, "logits/chosen": -2.2134487628936768, "logits/rejected": -2.241319179534912, "logps/chosen": -56.98176574707031, "logps/rejected": -65.11886596679688, "loss": 0.8343, "rewards/accuracies": 0.65625, "rewards/chosen": 0.09855490922927856, "rewards/margins": 0.9255926012992859, "rewards/rejected": -0.8270378112792969, "step": 446 }, { "epoch": 0.5482560367957071, "grad_norm": 7.300044536590576, "learning_rate": 9.452760736196319e-07, "logits/chosen": -2.254962921142578, "logits/rejected": -2.274074077606201, "logps/chosen": -54.63302230834961, "logps/rejected": -63.60352325439453, "loss": 0.8587, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28453099727630615, "rewards/margins": 0.990146279335022, "rewards/rejected": -1.2746772766113281, "step": 447 }, { "epoch": 0.5494825603679571, "grad_norm": 7.147546768188477, "learning_rate": 9.451533742331288e-07, "logits/chosen": -2.294856548309326, "logits/rejected": -2.2780375480651855, "logps/chosen": -59.350341796875, "logps/rejected": -67.92460632324219, "loss": 0.8763, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09458594024181366, "rewards/margins": 0.7318333983421326, "rewards/rejected": -0.8264192938804626, "step": 448 }, { "epoch": 0.550709083940207, "grad_norm": 8.234169960021973, "learning_rate": 9.450306748466257e-07, "logits/chosen": -2.3081884384155273, "logits/rejected": -2.3238329887390137, "logps/chosen": -57.05027389526367, "logps/rejected": -66.39736938476562, "loss": 0.7876, "rewards/accuracies": 0.71875, "rewards/chosen": 0.13702994585037231, "rewards/margins": 1.2414002418518066, "rewards/rejected": -1.104370355606079, "step": 449 }, { "epoch": 0.5519356075124568, "grad_norm": 8.66989803314209, "learning_rate": 9.449079754601227e-07, "logits/chosen": -2.348820686340332, "logits/rejected": -2.365694761276245, "logps/chosen": -64.1188735961914, "logps/rejected": -70.36621856689453, "loss": 0.8594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8697726726531982, "rewards/margins": 0.8498080968856812, "rewards/rejected": -1.7195807695388794, "step": 450 }, { "epoch": 0.5531621310847068, "grad_norm": 6.30914306640625, "learning_rate": 9.447852760736195e-07, "logits/chosen": -2.2871928215026855, "logits/rejected": -2.3261265754699707, "logps/chosen": -62.40272903442383, "logps/rejected": -70.42495727539062, "loss": 0.824, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25437840819358826, "rewards/margins": 1.0467660427093506, "rewards/rejected": -1.3011445999145508, "step": 451 }, { "epoch": 0.5543886546569566, "grad_norm": 6.532035827636719, "learning_rate": 9.446625766871165e-07, "logits/chosen": -2.2452104091644287, "logits/rejected": -2.2376832962036133, "logps/chosen": -59.14329528808594, "logps/rejected": -66.50386047363281, "loss": 0.8202, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2409188449382782, "rewards/margins": 0.984151303768158, "rewards/rejected": -1.2250701189041138, "step": 452 }, { "epoch": 0.5556151782292066, "grad_norm": 7.764927864074707, "learning_rate": 9.445398773006134e-07, "logits/chosen": -2.231947898864746, "logits/rejected": -2.269866704940796, "logps/chosen": -63.29151916503906, "logps/rejected": -70.97196960449219, "loss": 0.8382, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24942408502101898, "rewards/margins": 1.166555643081665, "rewards/rejected": -1.4159797430038452, "step": 453 }, { "epoch": 0.5568417018014565, "grad_norm": 6.175893783569336, "learning_rate": 9.444171779141104e-07, "logits/chosen": -2.2840447425842285, "logits/rejected": -2.2793376445770264, "logps/chosen": -68.88481903076172, "logps/rejected": -75.91865539550781, "loss": 0.8915, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0685126781463623, "rewards/margins": 0.563274621963501, "rewards/rejected": -1.6317874193191528, "step": 454 }, { "epoch": 0.5580682253737064, "grad_norm": 7.711546421051025, "learning_rate": 9.442944785276074e-07, "logits/chosen": -2.248222827911377, "logits/rejected": -2.3055002689361572, "logps/chosen": -53.214202880859375, "logps/rejected": -62.72934341430664, "loss": 0.8205, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21043938398361206, "rewards/margins": 1.0466605424880981, "rewards/rejected": -0.8362212777137756, "step": 455 }, { "epoch": 0.5592947489459563, "grad_norm": 7.2043538093566895, "learning_rate": 9.441717791411043e-07, "logits/chosen": -2.263814687728882, "logits/rejected": -2.3005435466766357, "logps/chosen": -59.69619369506836, "logps/rejected": -66.66452026367188, "loss": 0.9013, "rewards/accuracies": 0.625, "rewards/chosen": -0.05287301540374756, "rewards/margins": 0.667680561542511, "rewards/rejected": -0.7205535173416138, "step": 456 }, { "epoch": 0.5605212725182062, "grad_norm": 9.33926010131836, "learning_rate": 9.440490797546011e-07, "logits/chosen": -2.233004331588745, "logits/rejected": -2.2764620780944824, "logps/chosen": -60.248870849609375, "logps/rejected": -67.76306915283203, "loss": 0.8952, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6866747140884399, "rewards/margins": 0.6698508262634277, "rewards/rejected": -1.3565255403518677, "step": 457 }, { "epoch": 0.5617477960904561, "grad_norm": 9.60228157043457, "learning_rate": 9.439263803680981e-07, "logits/chosen": -2.2463254928588867, "logits/rejected": -2.2464921474456787, "logps/chosen": -55.39384078979492, "logps/rejected": -63.73194122314453, "loss": 0.8219, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10126971453428268, "rewards/margins": 1.0614291429519653, "rewards/rejected": -0.9601595997810364, "step": 458 }, { "epoch": 0.5629743196627061, "grad_norm": 8.384932518005371, "learning_rate": 9.43803680981595e-07, "logits/chosen": -2.263528347015381, "logits/rejected": -2.2837276458740234, "logps/chosen": -55.60050582885742, "logps/rejected": -62.353206634521484, "loss": 0.8915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13068372011184692, "rewards/margins": 0.6447063684463501, "rewards/rejected": -0.7753900289535522, "step": 459 }, { "epoch": 0.5642008432349559, "grad_norm": 8.355484008789062, "learning_rate": 9.43680981595092e-07, "logits/chosen": -2.2515130043029785, "logits/rejected": -2.2495625019073486, "logps/chosen": -56.52301025390625, "logps/rejected": -69.64836883544922, "loss": 0.7613, "rewards/accuracies": 0.75, "rewards/chosen": 0.2335338294506073, "rewards/margins": 1.4476215839385986, "rewards/rejected": -1.2140878438949585, "step": 460 }, { "epoch": 0.5654273668072058, "grad_norm": 9.715960502624512, "learning_rate": 9.435582822085889e-07, "logits/chosen": -2.271583318710327, "logits/rejected": -2.3029112815856934, "logps/chosen": -56.190040588378906, "logps/rejected": -66.96440124511719, "loss": 0.7984, "rewards/accuracies": 0.75, "rewards/chosen": 0.10797876119613647, "rewards/margins": 1.2545342445373535, "rewards/rejected": -1.1465554237365723, "step": 461 }, { "epoch": 0.5666538903794557, "grad_norm": 7.31341552734375, "learning_rate": 9.434355828220858e-07, "logits/chosen": -2.2148609161376953, "logits/rejected": -2.254788637161255, "logps/chosen": -54.10359573364258, "logps/rejected": -63.97199630737305, "loss": 0.8272, "rewards/accuracies": 0.625, "rewards/chosen": 0.223434180021286, "rewards/margins": 0.9386733770370483, "rewards/rejected": -0.7152392864227295, "step": 462 }, { "epoch": 0.5678804139517056, "grad_norm": 10.369959831237793, "learning_rate": 9.433128834355827e-07, "logits/chosen": -2.228191614151001, "logits/rejected": -2.2688698768615723, "logps/chosen": -59.007347106933594, "logps/rejected": -62.32741928100586, "loss": 0.8687, "rewards/accuracies": 0.78125, "rewards/chosen": 0.02629992365837097, "rewards/margins": 0.8142344951629639, "rewards/rejected": -0.7879346013069153, "step": 463 }, { "epoch": 0.5691069375239556, "grad_norm": 8.375624656677246, "learning_rate": 9.431901840490797e-07, "logits/chosen": -2.2353827953338623, "logits/rejected": -2.2431106567382812, "logps/chosen": -47.01676940917969, "logps/rejected": -56.50434494018555, "loss": 0.8042, "rewards/accuracies": 0.75, "rewards/chosen": 0.8198598027229309, "rewards/margins": 0.9845418334007263, "rewards/rejected": -0.1646820604801178, "step": 464 }, { "epoch": 0.5703334610962054, "grad_norm": 7.5505242347717285, "learning_rate": 9.430674846625767e-07, "logits/chosen": -2.2322328090667725, "logits/rejected": -2.263131856918335, "logps/chosen": -51.13544464111328, "logps/rejected": -56.48833084106445, "loss": 0.8588, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4567839205265045, "rewards/margins": 0.7899047136306763, "rewards/rejected": -0.333120733499527, "step": 465 }, { "epoch": 0.5715599846684554, "grad_norm": 12.765416145324707, "learning_rate": 9.429447852760736e-07, "logits/chosen": -2.2349233627319336, "logits/rejected": -2.2564871311187744, "logps/chosen": -54.85421371459961, "logps/rejected": -66.76243591308594, "loss": 0.8662, "rewards/accuracies": 0.625, "rewards/chosen": 0.14743907749652863, "rewards/margins": 0.7371246218681335, "rewards/rejected": -0.5896855592727661, "step": 466 }, { "epoch": 0.5727865082407052, "grad_norm": 12.507558822631836, "learning_rate": 9.428220858895705e-07, "logits/chosen": -2.2741482257843018, "logits/rejected": -2.3211071491241455, "logps/chosen": -49.987918853759766, "logps/rejected": -56.610023498535156, "loss": 0.8396, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6461174488067627, "rewards/margins": 0.873958170413971, "rewards/rejected": -0.22784075140953064, "step": 467 }, { "epoch": 0.5740130318129552, "grad_norm": 10.339900016784668, "learning_rate": 9.426993865030675e-07, "logits/chosen": -2.263425350189209, "logits/rejected": -2.2923755645751953, "logps/chosen": -53.152305603027344, "logps/rejected": -60.91898727416992, "loss": 0.8153, "rewards/accuracies": 0.625, "rewards/chosen": 0.5293306708335876, "rewards/margins": 1.0361099243164062, "rewards/rejected": -0.5067791938781738, "step": 468 }, { "epoch": 0.5752395553852051, "grad_norm": 7.0871124267578125, "learning_rate": 9.425766871165643e-07, "logits/chosen": -2.301584243774414, "logits/rejected": -2.3619816303253174, "logps/chosen": -56.7019157409668, "logps/rejected": -65.23438262939453, "loss": 0.815, "rewards/accuracies": 0.75, "rewards/chosen": 0.2973739206790924, "rewards/margins": 1.1466999053955078, "rewards/rejected": -0.8493260145187378, "step": 469 }, { "epoch": 0.5764660789574549, "grad_norm": 7.589236259460449, "learning_rate": 9.424539877300613e-07, "logits/chosen": -2.3563072681427, "logits/rejected": -2.355325937271118, "logps/chosen": -59.13172912597656, "logps/rejected": -61.64639663696289, "loss": 0.9168, "rewards/accuracies": 0.65625, "rewards/chosen": 0.23689976334571838, "rewards/margins": 0.5385808348655701, "rewards/rejected": -0.3016811013221741, "step": 470 }, { "epoch": 0.5776926025297049, "grad_norm": 7.487234592437744, "learning_rate": 9.423312883435582e-07, "logits/chosen": -2.2653372287750244, "logits/rejected": -2.2596211433410645, "logps/chosen": -53.598873138427734, "logps/rejected": -62.75775146484375, "loss": 0.8573, "rewards/accuracies": 0.625, "rewards/chosen": 0.2040303647518158, "rewards/margins": 1.027284026145935, "rewards/rejected": -0.8232535123825073, "step": 471 }, { "epoch": 0.5789191261019547, "grad_norm": 6.7769246101379395, "learning_rate": 9.422085889570551e-07, "logits/chosen": -2.287304162979126, "logits/rejected": -2.3013134002685547, "logps/chosen": -54.84058380126953, "logps/rejected": -65.95204162597656, "loss": 0.7614, "rewards/accuracies": 0.625, "rewards/chosen": 0.3851194977760315, "rewards/margins": 1.2751848697662354, "rewards/rejected": -0.8900652527809143, "step": 472 }, { "epoch": 0.5801456496742047, "grad_norm": 7.20180082321167, "learning_rate": 9.420858895705521e-07, "logits/chosen": -2.2565579414367676, "logits/rejected": -2.2785959243774414, "logps/chosen": -52.80379104614258, "logps/rejected": -60.722713470458984, "loss": 0.8688, "rewards/accuracies": 0.59375, "rewards/chosen": 0.18611352145671844, "rewards/margins": 0.7368323802947998, "rewards/rejected": -0.5507187843322754, "step": 473 }, { "epoch": 0.5813721732464546, "grad_norm": 8.702348709106445, "learning_rate": 9.419631901840491e-07, "logits/chosen": -2.2520639896392822, "logits/rejected": -2.2879793643951416, "logps/chosen": -57.101600646972656, "logps/rejected": -60.451072692871094, "loss": 0.9574, "rewards/accuracies": 0.5625, "rewards/chosen": 0.10747543722391129, "rewards/margins": 0.36778098344802856, "rewards/rejected": -0.2603054642677307, "step": 474 }, { "epoch": 0.5825986968187045, "grad_norm": 7.20154333114624, "learning_rate": 9.41840490797546e-07, "logits/chosen": -2.2976410388946533, "logits/rejected": -2.3390908241271973, "logps/chosen": -55.43368911743164, "logps/rejected": -65.81669616699219, "loss": 0.8173, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5082273483276367, "rewards/margins": 1.1648374795913696, "rewards/rejected": -0.6566101908683777, "step": 475 }, { "epoch": 0.5838252203909544, "grad_norm": 8.843618392944336, "learning_rate": 9.417177914110429e-07, "logits/chosen": -2.312837839126587, "logits/rejected": -2.3161332607269287, "logps/chosen": -63.08095932006836, "logps/rejected": -66.41377258300781, "loss": 0.9586, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4317748546600342, "rewards/margins": 0.3075202405452728, "rewards/rejected": -0.7392950654029846, "step": 476 }, { "epoch": 0.5850517439632043, "grad_norm": 7.919190406799316, "learning_rate": 9.415950920245398e-07, "logits/chosen": -2.3477840423583984, "logits/rejected": -2.370555877685547, "logps/chosen": -58.79057693481445, "logps/rejected": -69.2091064453125, "loss": 0.782, "rewards/accuracies": 0.71875, "rewards/chosen": 0.011688552796840668, "rewards/margins": 1.470576524734497, "rewards/rejected": -1.4588879346847534, "step": 477 }, { "epoch": 0.5862782675354542, "grad_norm": 10.13640308380127, "learning_rate": 9.414723926380368e-07, "logits/chosen": -2.2566792964935303, "logits/rejected": -2.3062920570373535, "logps/chosen": -57.967987060546875, "logps/rejected": -72.13760375976562, "loss": 0.7797, "rewards/accuracies": 0.78125, "rewards/chosen": 0.07898198813199997, "rewards/margins": 1.365771770477295, "rewards/rejected": -1.286789894104004, "step": 478 }, { "epoch": 0.5875047911077041, "grad_norm": 7.042830467224121, "learning_rate": 9.413496932515337e-07, "logits/chosen": -2.326111316680908, "logits/rejected": -2.387498617172241, "logps/chosen": -58.72688293457031, "logps/rejected": -64.79391479492188, "loss": 0.8488, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1776149868965149, "rewards/margins": 0.9456638693809509, "rewards/rejected": -1.1232788562774658, "step": 479 }, { "epoch": 0.588731314679954, "grad_norm": 7.877779483795166, "learning_rate": 9.412269938650307e-07, "logits/chosen": -2.3323159217834473, "logits/rejected": -2.355072021484375, "logps/chosen": -57.998477935791016, "logps/rejected": -66.14955139160156, "loss": 0.8599, "rewards/accuracies": 0.625, "rewards/chosen": -0.17268389463424683, "rewards/margins": 0.8830462098121643, "rewards/rejected": -1.0557299852371216, "step": 480 }, { "epoch": 0.5899578382522039, "grad_norm": 6.396887302398682, "learning_rate": 9.411042944785275e-07, "logits/chosen": -2.2990047931671143, "logits/rejected": -2.305612087249756, "logps/chosen": -61.4998779296875, "logps/rejected": -64.2196044921875, "loss": 0.8888, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3583609461784363, "rewards/margins": 0.5440621972084045, "rewards/rejected": -0.9024231433868408, "step": 481 }, { "epoch": 0.5911843618244538, "grad_norm": 7.6181840896606445, "learning_rate": 9.409815950920244e-07, "logits/chosen": -2.305072784423828, "logits/rejected": -2.3136072158813477, "logps/chosen": -57.54285430908203, "logps/rejected": -71.6136703491211, "loss": 0.746, "rewards/accuracies": 0.84375, "rewards/chosen": 0.29328984022140503, "rewards/margins": 1.7106332778930664, "rewards/rejected": -1.4173434972763062, "step": 482 }, { "epoch": 0.5924108853967037, "grad_norm": 8.488936424255371, "learning_rate": 9.408588957055214e-07, "logits/chosen": -2.2682154178619385, "logits/rejected": -2.3035728931427, "logps/chosen": -53.804786682128906, "logps/rejected": -62.34993362426758, "loss": 0.856, "rewards/accuracies": 0.5625, "rewards/chosen": 0.29464638233184814, "rewards/margins": 0.9410322904586792, "rewards/rejected": -0.646385908126831, "step": 483 }, { "epoch": 0.5936374089689537, "grad_norm": 8.168787002563477, "learning_rate": 9.407361963190184e-07, "logits/chosen": -2.3266642093658447, "logits/rejected": -2.4046437740325928, "logps/chosen": -55.19828796386719, "logps/rejected": -66.11417388916016, "loss": 0.7656, "rewards/accuracies": 0.78125, "rewards/chosen": 0.504202663898468, "rewards/margins": 1.3809432983398438, "rewards/rejected": -0.876740574836731, "step": 484 }, { "epoch": 0.5948639325412035, "grad_norm": 9.925137519836426, "learning_rate": 9.406134969325154e-07, "logits/chosen": -2.2860100269317627, "logits/rejected": -2.305166482925415, "logps/chosen": -58.04314422607422, "logps/rejected": -66.462158203125, "loss": 0.8224, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3054043650627136, "rewards/margins": 1.0488672256469727, "rewards/rejected": -0.7434629797935486, "step": 485 }, { "epoch": 0.5960904561134535, "grad_norm": 7.65476131439209, "learning_rate": 9.404907975460123e-07, "logits/chosen": -2.3154098987579346, "logits/rejected": -2.3465166091918945, "logps/chosen": -59.791542053222656, "logps/rejected": -68.88844299316406, "loss": 0.8076, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18628311157226562, "rewards/margins": 0.952250063419342, "rewards/rejected": -1.1385332345962524, "step": 486 }, { "epoch": 0.5973169796857033, "grad_norm": 9.594647407531738, "learning_rate": 9.403680981595091e-07, "logits/chosen": -2.2848212718963623, "logits/rejected": -2.280564069747925, "logps/chosen": -54.801204681396484, "logps/rejected": -62.67829895019531, "loss": 0.9203, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3435087203979492, "rewards/margins": 0.4562467634677887, "rewards/rejected": -0.11273801326751709, "step": 487 }, { "epoch": 0.5985435032579532, "grad_norm": 10.465896606445312, "learning_rate": 9.402453987730061e-07, "logits/chosen": -2.241337299346924, "logits/rejected": -2.2809245586395264, "logps/chosen": -54.399051666259766, "logps/rejected": -71.46255493164062, "loss": 0.6574, "rewards/accuracies": 0.875, "rewards/chosen": 0.6948339939117432, "rewards/margins": 2.2624270915985107, "rewards/rejected": -1.5675930976867676, "step": 488 }, { "epoch": 0.5997700268302032, "grad_norm": 14.360000610351562, "learning_rate": 9.40122699386503e-07, "logits/chosen": -2.2549006938934326, "logits/rejected": -2.2502601146698, "logps/chosen": -61.297542572021484, "logps/rejected": -72.54485321044922, "loss": 0.7943, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03869972378015518, "rewards/margins": 1.3630552291870117, "rewards/rejected": -1.3243556022644043, "step": 489 }, { "epoch": 0.5997700268302032, "eval_logits/chosen": -2.29717755317688, "eval_logits/rejected": -2.3168087005615234, "eval_logps/chosen": -67.22518920898438, "eval_logps/rejected": -73.97340393066406, "eval_loss": 0.8763387203216553, "eval_rewards/accuracies": 0.6235827803611755, "eval_rewards/chosen": -0.9593832492828369, "eval_rewards/margins": 0.7880995869636536, "eval_rewards/rejected": -1.7474830150604248, "eval_runtime": 1611.2238, "eval_samples_per_second": 0.547, "eval_steps_per_second": 0.274, "step": 489 }, { "epoch": 0.600996550402453, "grad_norm": 7.475056171417236, "learning_rate": 9.399999999999999e-07, "logits/chosen": -2.304518461227417, "logits/rejected": -2.305793285369873, "logps/chosen": -64.23602294921875, "logps/rejected": -65.5586929321289, "loss": 0.9423, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2674991488456726, "rewards/margins": 0.42695099115371704, "rewards/rejected": -0.6944500803947449, "step": 490 }, { "epoch": 0.602223073974703, "grad_norm": 9.027681350708008, "learning_rate": 9.398773006134969e-07, "logits/chosen": -2.2492332458496094, "logits/rejected": -2.30997371673584, "logps/chosen": -58.621009826660156, "logps/rejected": -66.6835708618164, "loss": 0.8494, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08259116113185883, "rewards/margins": 1.2322828769683838, "rewards/rejected": -1.3148740530014038, "step": 491 }, { "epoch": 0.6034495975469528, "grad_norm": 7.166437149047852, "learning_rate": 9.397546012269938e-07, "logits/chosen": -2.246070146560669, "logits/rejected": -2.2773098945617676, "logps/chosen": -66.23756408691406, "logps/rejected": -76.8200912475586, "loss": 0.7934, "rewards/accuracies": 0.625, "rewards/chosen": -0.5667884349822998, "rewards/margins": 1.2842686176300049, "rewards/rejected": -1.8510571718215942, "step": 492 }, { "epoch": 0.6046761211192028, "grad_norm": 9.620685577392578, "learning_rate": 9.396319018404907e-07, "logits/chosen": -2.33306622505188, "logits/rejected": -2.32975435256958, "logps/chosen": -57.606597900390625, "logps/rejected": -63.28506088256836, "loss": 0.8592, "rewards/accuracies": 0.59375, "rewards/chosen": -0.013468928635120392, "rewards/margins": 0.9038291573524475, "rewards/rejected": -0.9172980189323425, "step": 493 }, { "epoch": 0.6059026446914527, "grad_norm": 7.5516743659973145, "learning_rate": 9.395092024539877e-07, "logits/chosen": -2.3085689544677734, "logits/rejected": -2.3734755516052246, "logps/chosen": -64.30180358886719, "logps/rejected": -79.38433837890625, "loss": 0.7387, "rewards/accuracies": 0.75, "rewards/chosen": -0.6678398847579956, "rewards/margins": 1.6162909269332886, "rewards/rejected": -2.284130811691284, "step": 494 }, { "epoch": 0.6071291682637026, "grad_norm": 12.040207862854004, "learning_rate": 9.393865030674846e-07, "logits/chosen": -2.3355205059051514, "logits/rejected": -2.402440071105957, "logps/chosen": -61.868568420410156, "logps/rejected": -76.41902160644531, "loss": 0.7593, "rewards/accuracies": 0.75, "rewards/chosen": -0.05758626386523247, "rewards/margins": 1.4964171648025513, "rewards/rejected": -1.554003357887268, "step": 495 }, { "epoch": 0.6083556918359525, "grad_norm": 8.863550186157227, "learning_rate": 9.392638036809816e-07, "logits/chosen": -2.2872297763824463, "logits/rejected": -2.304588556289673, "logps/chosen": -66.34400177001953, "logps/rejected": -72.24598693847656, "loss": 0.8914, "rewards/accuracies": 0.625, "rewards/chosen": -0.930029034614563, "rewards/margins": 0.5948925614356995, "rewards/rejected": -1.5249214172363281, "step": 496 }, { "epoch": 0.6095822154082023, "grad_norm": 7.4206318855285645, "learning_rate": 9.391411042944785e-07, "logits/chosen": -2.282766580581665, "logits/rejected": -2.319981813430786, "logps/chosen": -59.175228118896484, "logps/rejected": -74.34059143066406, "loss": 0.73, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31639766693115234, "rewards/margins": 1.5311264991760254, "rewards/rejected": -1.8475244045257568, "step": 497 }, { "epoch": 0.6108087389804523, "grad_norm": 9.688732147216797, "learning_rate": 9.390184049079755e-07, "logits/chosen": -2.333238363265991, "logits/rejected": -2.339139461517334, "logps/chosen": -68.89982604980469, "logps/rejected": -76.65499114990234, "loss": 0.8718, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1516207456588745, "rewards/margins": 0.7137281894683838, "rewards/rejected": -1.8653488159179688, "step": 498 }, { "epoch": 0.6120352625527021, "grad_norm": 9.678208351135254, "learning_rate": 9.388957055214723e-07, "logits/chosen": -2.293522357940674, "logits/rejected": -2.340304374694824, "logps/chosen": -64.45527648925781, "logps/rejected": -67.37825012207031, "loss": 0.9161, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7027865648269653, "rewards/margins": 0.36215779185295105, "rewards/rejected": -1.0649445056915283, "step": 499 }, { "epoch": 0.6132617861249521, "grad_norm": 7.340187072753906, "learning_rate": 9.387730061349692e-07, "logits/chosen": -2.2807698249816895, "logits/rejected": -2.27744460105896, "logps/chosen": -65.49180603027344, "logps/rejected": -66.77400970458984, "loss": 0.8943, "rewards/accuracies": 0.625, "rewards/chosen": -0.7645108699798584, "rewards/margins": 0.5055097937583923, "rewards/rejected": -1.2700207233428955, "step": 500 }, { "epoch": 0.614488309697202, "grad_norm": 6.436230659484863, "learning_rate": 9.386503067484662e-07, "logits/chosen": -2.2920548915863037, "logits/rejected": -2.3125176429748535, "logps/chosen": -56.600196838378906, "logps/rejected": -73.1631851196289, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": 0.08495943248271942, "rewards/margins": 1.9660378694534302, "rewards/rejected": -1.8810783624649048, "step": 501 }, { "epoch": 0.6157148332694519, "grad_norm": 10.911918640136719, "learning_rate": 9.385276073619631e-07, "logits/chosen": -2.33774733543396, "logits/rejected": -2.3140676021575928, "logps/chosen": -64.02788543701172, "logps/rejected": -70.08441925048828, "loss": 0.92, "rewards/accuracies": 0.53125, "rewards/chosen": -0.30434998869895935, "rewards/margins": 0.3667747378349304, "rewards/rejected": -0.6711247563362122, "step": 502 }, { "epoch": 0.6169413568417018, "grad_norm": 10.192265510559082, "learning_rate": 9.384049079754601e-07, "logits/chosen": -2.28568959236145, "logits/rejected": -2.320172071456909, "logps/chosen": -57.473655700683594, "logps/rejected": -68.86564636230469, "loss": 0.8319, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0789114236831665, "rewards/margins": 1.2444202899932861, "rewards/rejected": -1.323331594467163, "step": 503 }, { "epoch": 0.6181678804139517, "grad_norm": 7.910482883453369, "learning_rate": 9.38282208588957e-07, "logits/chosen": -2.317164659500122, "logits/rejected": -2.3172318935394287, "logps/chosen": -58.62229919433594, "logps/rejected": -68.04517364501953, "loss": 0.8263, "rewards/accuracies": 0.65625, "rewards/chosen": -0.14745984971523285, "rewards/margins": 1.2266747951507568, "rewards/rejected": -1.3741344213485718, "step": 504 }, { "epoch": 0.6193944039862016, "grad_norm": 8.998490333557129, "learning_rate": 9.38159509202454e-07, "logits/chosen": -2.3776700496673584, "logits/rejected": -2.3571321964263916, "logps/chosen": -63.228153228759766, "logps/rejected": -67.66138458251953, "loss": 0.9099, "rewards/accuracies": 0.5625, "rewards/chosen": -0.49053430557250977, "rewards/margins": 0.5078294277191162, "rewards/rejected": -0.9983636736869812, "step": 505 }, { "epoch": 0.6206209275584516, "grad_norm": 10.861506462097168, "learning_rate": 9.380368098159509e-07, "logits/chosen": -2.325732946395874, "logits/rejected": -2.358751058578491, "logps/chosen": -59.595848083496094, "logps/rejected": -70.27770233154297, "loss": 0.8008, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16461604833602905, "rewards/margins": 1.224802017211914, "rewards/rejected": -1.0601861476898193, "step": 506 }, { "epoch": 0.6218474511307014, "grad_norm": 9.546052932739258, "learning_rate": 9.379141104294478e-07, "logits/chosen": -2.3134560585021973, "logits/rejected": -2.3169870376586914, "logps/chosen": -60.689151763916016, "logps/rejected": -72.8441162109375, "loss": 0.7551, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09094296395778656, "rewards/margins": 1.4575059413909912, "rewards/rejected": -1.366563081741333, "step": 507 }, { "epoch": 0.6230739747029513, "grad_norm": 11.332620620727539, "learning_rate": 9.377914110429448e-07, "logits/chosen": -2.304103374481201, "logits/rejected": -2.303180694580078, "logps/chosen": -57.032955169677734, "logps/rejected": -65.58280944824219, "loss": 0.886, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10437946021556854, "rewards/margins": 0.7097046375274658, "rewards/rejected": -0.8140840530395508, "step": 508 }, { "epoch": 0.6243004982752012, "grad_norm": 10.058517456054688, "learning_rate": 9.376687116564417e-07, "logits/chosen": -2.3030593395233154, "logits/rejected": -2.369968891143799, "logps/chosen": -60.394954681396484, "logps/rejected": -68.4745864868164, "loss": 0.767, "rewards/accuracies": 0.625, "rewards/chosen": -0.05471440404653549, "rewards/margins": 1.3138551712036133, "rewards/rejected": -1.3685694932937622, "step": 509 }, { "epoch": 0.6255270218474511, "grad_norm": 11.601090431213379, "learning_rate": 9.375460122699386e-07, "logits/chosen": -2.3589861392974854, "logits/rejected": -2.3591065406799316, "logps/chosen": -58.51319122314453, "logps/rejected": -68.90160369873047, "loss": 0.8326, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3551309406757355, "rewards/margins": 0.9747313261032104, "rewards/rejected": -1.3298622369766235, "step": 510 }, { "epoch": 0.6267535454197011, "grad_norm": 9.314105033874512, "learning_rate": 9.374233128834355e-07, "logits/chosen": -2.377114772796631, "logits/rejected": -2.3697092533111572, "logps/chosen": -60.8753662109375, "logps/rejected": -74.31483459472656, "loss": 0.7864, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3091081380844116, "rewards/margins": 1.6102204322814941, "rewards/rejected": -1.9193285703659058, "step": 511 }, { "epoch": 0.6279800689919509, "grad_norm": 6.812248229980469, "learning_rate": 9.373006134969324e-07, "logits/chosen": -2.3404693603515625, "logits/rejected": -2.4002881050109863, "logps/chosen": -64.98336791992188, "logps/rejected": -68.0872802734375, "loss": 0.941, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8650385737419128, "rewards/margins": 0.329955518245697, "rewards/rejected": -1.1949940919876099, "step": 512 }, { "epoch": 0.6292065925642009, "grad_norm": 7.714319705963135, "learning_rate": 9.371779141104294e-07, "logits/chosen": -2.3051276206970215, "logits/rejected": -2.3465652465820312, "logps/chosen": -57.63951110839844, "logps/rejected": -68.60336303710938, "loss": 0.7879, "rewards/accuracies": 0.65625, "rewards/chosen": 0.07532000541687012, "rewards/margins": 1.2222557067871094, "rewards/rejected": -1.1469355821609497, "step": 513 }, { "epoch": 0.6304331161364507, "grad_norm": 7.830132007598877, "learning_rate": 9.370552147239263e-07, "logits/chosen": -2.2208759784698486, "logits/rejected": -2.279721736907959, "logps/chosen": -54.21980285644531, "logps/rejected": -68.75199127197266, "loss": 0.8177, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4271034598350525, "rewards/margins": 1.4343509674072266, "rewards/rejected": -1.0072473287582397, "step": 514 }, { "epoch": 0.6316596397087006, "grad_norm": 9.109386444091797, "learning_rate": 9.369325153374233e-07, "logits/chosen": -2.2918694019317627, "logits/rejected": -2.307260513305664, "logps/chosen": -58.21902084350586, "logps/rejected": -73.90743255615234, "loss": 0.7863, "rewards/accuracies": 0.75, "rewards/chosen": -0.26842257380485535, "rewards/margins": 1.4215513467788696, "rewards/rejected": -1.6899739503860474, "step": 515 }, { "epoch": 0.6328861632809506, "grad_norm": 8.998930931091309, "learning_rate": 9.368098159509203e-07, "logits/chosen": -2.2812063694000244, "logits/rejected": -2.3244223594665527, "logps/chosen": -61.283329010009766, "logps/rejected": -69.80229187011719, "loss": 0.8416, "rewards/accuracies": 0.625, "rewards/chosen": -0.360363245010376, "rewards/margins": 1.0343570709228516, "rewards/rejected": -1.3947203159332275, "step": 516 }, { "epoch": 0.6341126868532004, "grad_norm": 7.92228364944458, "learning_rate": 9.366871165644171e-07, "logits/chosen": -2.282447338104248, "logits/rejected": -2.3312110900878906, "logps/chosen": -60.14985656738281, "logps/rejected": -68.28052520751953, "loss": 0.7567, "rewards/accuracies": 0.84375, "rewards/chosen": -0.09147713333368301, "rewards/margins": 1.3892053365707397, "rewards/rejected": -1.480682373046875, "step": 517 }, { "epoch": 0.6353392104254504, "grad_norm": 10.249625205993652, "learning_rate": 9.365644171779141e-07, "logits/chosen": -2.293452739715576, "logits/rejected": -2.3576266765594482, "logps/chosen": -65.19003295898438, "logps/rejected": -73.6274642944336, "loss": 0.802, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1353892982006073, "rewards/margins": 1.4064077138900757, "rewards/rejected": -1.5417969226837158, "step": 518 }, { "epoch": 0.6365657339977002, "grad_norm": 16.336259841918945, "learning_rate": 9.36441717791411e-07, "logits/chosen": -2.3036608695983887, "logits/rejected": -2.3478970527648926, "logps/chosen": -67.85183715820312, "logps/rejected": -79.09296417236328, "loss": 0.8143, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2022764682769775, "rewards/margins": 1.3588935136795044, "rewards/rejected": -2.5611698627471924, "step": 519 }, { "epoch": 0.6377922575699502, "grad_norm": 9.658611297607422, "learning_rate": 9.363190184049079e-07, "logits/chosen": -2.320300579071045, "logits/rejected": -2.316096544265747, "logps/chosen": -64.4505844116211, "logps/rejected": -73.89893341064453, "loss": 0.8679, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7934815883636475, "rewards/margins": 0.8449189066886902, "rewards/rejected": -1.6384005546569824, "step": 520 }, { "epoch": 0.6390187811422001, "grad_norm": 9.122262001037598, "learning_rate": 9.361963190184049e-07, "logits/chosen": -2.31026029586792, "logits/rejected": -2.3249545097351074, "logps/chosen": -60.04098129272461, "logps/rejected": -66.65489196777344, "loss": 0.8321, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13142742216587067, "rewards/margins": 1.043819785118103, "rewards/rejected": -1.175247073173523, "step": 521 }, { "epoch": 0.64024530471445, "grad_norm": 9.968196868896484, "learning_rate": 9.360736196319018e-07, "logits/chosen": -2.37369704246521, "logits/rejected": -2.39310359954834, "logps/chosen": -61.077816009521484, "logps/rejected": -73.01761627197266, "loss": 0.8191, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7069434523582458, "rewards/margins": 1.0987541675567627, "rewards/rejected": -1.8056976795196533, "step": 522 }, { "epoch": 0.6414718282866999, "grad_norm": 14.934799194335938, "learning_rate": 9.359509202453986e-07, "logits/chosen": -2.2715725898742676, "logits/rejected": -2.27085018157959, "logps/chosen": -61.32576370239258, "logps/rejected": -67.18350982666016, "loss": 0.9035, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4123188555240631, "rewards/margins": 0.5447914004325867, "rewards/rejected": -0.9571102857589722, "step": 523 }, { "epoch": 0.6426983518589497, "grad_norm": 8.621955871582031, "learning_rate": 9.358282208588956e-07, "logits/chosen": -2.3329122066497803, "logits/rejected": -2.3703055381774902, "logps/chosen": -58.54581832885742, "logps/rejected": -66.6397705078125, "loss": 0.8552, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3254621624946594, "rewards/margins": 0.8464565277099609, "rewards/rejected": -1.1719186305999756, "step": 524 }, { "epoch": 0.6439248754311997, "grad_norm": 8.706058502197266, "learning_rate": 9.357055214723926e-07, "logits/chosen": -2.2788145542144775, "logits/rejected": -2.2865538597106934, "logps/chosen": -61.63462829589844, "logps/rejected": -69.77930450439453, "loss": 0.8675, "rewards/accuracies": 0.625, "rewards/chosen": -0.5619584321975708, "rewards/margins": 0.836825966835022, "rewards/rejected": -1.3987843990325928, "step": 525 }, { "epoch": 0.6451513990034496, "grad_norm": 6.611227512359619, "learning_rate": 9.355828220858896e-07, "logits/chosen": -2.3449997901916504, "logits/rejected": -2.3574118614196777, "logps/chosen": -65.54649353027344, "logps/rejected": -71.9680404663086, "loss": 0.9255, "rewards/accuracies": 0.5, "rewards/chosen": -0.7515543699264526, "rewards/margins": 0.43873268365859985, "rewards/rejected": -1.1902869939804077, "step": 526 }, { "epoch": 0.6463779225756995, "grad_norm": 9.459365844726562, "learning_rate": 9.354601226993865e-07, "logits/chosen": -2.319378137588501, "logits/rejected": -2.3240904808044434, "logps/chosen": -54.33362579345703, "logps/rejected": -62.5583610534668, "loss": 0.8674, "rewards/accuracies": 0.78125, "rewards/chosen": 0.14649325609207153, "rewards/margins": 0.9114359021186829, "rewards/rejected": -0.7649425268173218, "step": 527 }, { "epoch": 0.6476044461479494, "grad_norm": 6.799969673156738, "learning_rate": 9.353374233128834e-07, "logits/chosen": -2.2876853942871094, "logits/rejected": -2.316082000732422, "logps/chosen": -56.43767166137695, "logps/rejected": -63.403106689453125, "loss": 0.7998, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4048154354095459, "rewards/margins": 1.0093402862548828, "rewards/rejected": -0.6045249104499817, "step": 528 }, { "epoch": 0.6488309697201993, "grad_norm": 8.903447151184082, "learning_rate": 9.352147239263803e-07, "logits/chosen": -2.2747819423675537, "logits/rejected": -2.325678825378418, "logps/chosen": -55.720184326171875, "logps/rejected": -69.83671569824219, "loss": 0.7871, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10743777453899384, "rewards/margins": 1.243768572807312, "rewards/rejected": -1.1363308429718018, "step": 529 }, { "epoch": 0.6500574932924492, "grad_norm": 12.171642303466797, "learning_rate": 9.350920245398772e-07, "logits/chosen": -2.2579517364501953, "logits/rejected": -2.305494546890259, "logps/chosen": -55.29380798339844, "logps/rejected": -66.92892456054688, "loss": 0.7858, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4856160879135132, "rewards/margins": 1.5581791400909424, "rewards/rejected": -1.0725629329681396, "step": 530 }, { "epoch": 0.6512840168646992, "grad_norm": 7.342774868011475, "learning_rate": 9.349693251533742e-07, "logits/chosen": -2.3204874992370605, "logits/rejected": -2.340167760848999, "logps/chosen": -61.5841064453125, "logps/rejected": -72.48159790039062, "loss": 0.8141, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3872118592262268, "rewards/margins": 1.117339015007019, "rewards/rejected": -1.5045509338378906, "step": 531 }, { "epoch": 0.652510540436949, "grad_norm": 14.705004692077637, "learning_rate": 9.348466257668711e-07, "logits/chosen": -2.273967981338501, "logits/rejected": -2.349405288696289, "logps/chosen": -51.370601654052734, "logps/rejected": -61.5272216796875, "loss": 0.8069, "rewards/accuracies": 0.625, "rewards/chosen": 0.7982270121574402, "rewards/margins": 1.3256323337554932, "rewards/rejected": -0.527405321598053, "step": 532 }, { "epoch": 0.653737064009199, "grad_norm": 7.366089344024658, "learning_rate": 9.34723926380368e-07, "logits/chosen": -2.3034393787384033, "logits/rejected": -2.2638604640960693, "logps/chosen": -60.83547592163086, "logps/rejected": -60.34566116333008, "loss": 1.0176, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5092983841896057, "rewards/margins": 0.03620046749711037, "rewards/rejected": -0.5454988479614258, "step": 533 }, { "epoch": 0.6549635875814488, "grad_norm": 9.508506774902344, "learning_rate": 9.34601226993865e-07, "logits/chosen": -2.2231247425079346, "logits/rejected": -2.280707836151123, "logps/chosen": -51.81614303588867, "logps/rejected": -67.51631164550781, "loss": 0.7241, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5936034917831421, "rewards/margins": 1.6903691291809082, "rewards/rejected": -1.0967656373977661, "step": 534 }, { "epoch": 0.6561901111536987, "grad_norm": 12.258294105529785, "learning_rate": 9.34478527607362e-07, "logits/chosen": -2.261167049407959, "logits/rejected": -2.311980724334717, "logps/chosen": -65.0634994506836, "logps/rejected": -71.78377532958984, "loss": 0.8331, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07974690198898315, "rewards/margins": 0.8806390762329102, "rewards/rejected": -0.9603859186172485, "step": 535 }, { "epoch": 0.6574166347259487, "grad_norm": 6.395153045654297, "learning_rate": 9.343558282208589e-07, "logits/chosen": -2.2484140396118164, "logits/rejected": -2.28145432472229, "logps/chosen": -56.29123306274414, "logps/rejected": -70.28652954101562, "loss": 0.8185, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0838388055562973, "rewards/margins": 1.2096343040466309, "rewards/rejected": -1.293473243713379, "step": 536 }, { "epoch": 0.6586431582981985, "grad_norm": 7.218222618103027, "learning_rate": 9.342331288343558e-07, "logits/chosen": -2.300482749938965, "logits/rejected": -2.295657157897949, "logps/chosen": -54.90458679199219, "logps/rejected": -61.710975646972656, "loss": 0.8177, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2187550663948059, "rewards/margins": 1.1848353147506714, "rewards/rejected": -0.966080367565155, "step": 537 }, { "epoch": 0.6598696818704485, "grad_norm": 7.576307773590088, "learning_rate": 9.341104294478527e-07, "logits/chosen": -2.363396406173706, "logits/rejected": -2.372523546218872, "logps/chosen": -61.639404296875, "logps/rejected": -73.47854614257812, "loss": 0.7871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19264459609985352, "rewards/margins": 1.320235252380371, "rewards/rejected": -1.5128798484802246, "step": 538 }, { "epoch": 0.6610962054426983, "grad_norm": 8.431209564208984, "learning_rate": 9.339877300613497e-07, "logits/chosen": -2.2830936908721924, "logits/rejected": -2.3301503658294678, "logps/chosen": -59.0533447265625, "logps/rejected": -65.53759765625, "loss": 0.8714, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21554651856422424, "rewards/margins": 0.7767370939254761, "rewards/rejected": -0.9922837018966675, "step": 539 }, { "epoch": 0.6623227290149483, "grad_norm": 8.392796516418457, "learning_rate": 9.338650306748466e-07, "logits/chosen": -2.2809882164001465, "logits/rejected": -2.3019661903381348, "logps/chosen": -58.86069107055664, "logps/rejected": -67.2827377319336, "loss": 0.8666, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1313830018043518, "rewards/margins": 0.7039623260498047, "rewards/rejected": -0.8353453874588013, "step": 540 }, { "epoch": 0.6635492525871982, "grad_norm": 8.523146629333496, "learning_rate": 9.337423312883435e-07, "logits/chosen": -2.3001253604888916, "logits/rejected": -2.3113207817077637, "logps/chosen": -67.42888641357422, "logps/rejected": -76.84024047851562, "loss": 0.8511, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9930042028427124, "rewards/margins": 0.8418569564819336, "rewards/rejected": -1.8348612785339355, "step": 541 }, { "epoch": 0.664775776159448, "grad_norm": 8.020461082458496, "learning_rate": 9.336196319018404e-07, "logits/chosen": -2.271003007888794, "logits/rejected": -2.3570289611816406, "logps/chosen": -62.23624038696289, "logps/rejected": -72.72364044189453, "loss": 0.8155, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19013161957263947, "rewards/margins": 1.180936574935913, "rewards/rejected": -1.3710681200027466, "step": 542 }, { "epoch": 0.666002299731698, "grad_norm": 8.546735763549805, "learning_rate": 9.334969325153373e-07, "logits/chosen": -2.298731803894043, "logits/rejected": -2.2901618480682373, "logps/chosen": -64.0213851928711, "logps/rejected": -69.20173645019531, "loss": 0.9338, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5981302261352539, "rewards/margins": 0.41084954142570496, "rewards/rejected": -1.0089796781539917, "step": 543 }, { "epoch": 0.6672288233039478, "grad_norm": 11.216336250305176, "learning_rate": 9.333742331288343e-07, "logits/chosen": -2.2612195014953613, "logits/rejected": -2.321162223815918, "logps/chosen": -58.27053451538086, "logps/rejected": -69.30536651611328, "loss": 0.8073, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2580223083496094, "rewards/margins": 1.3227269649505615, "rewards/rejected": -1.0647046566009521, "step": 544 }, { "epoch": 0.6684553468761978, "grad_norm": 6.955873489379883, "learning_rate": 9.332515337423313e-07, "logits/chosen": -2.338756561279297, "logits/rejected": -2.3822021484375, "logps/chosen": -56.777015686035156, "logps/rejected": -72.55117797851562, "loss": 0.7532, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11143322288990021, "rewards/margins": 1.6517027616500854, "rewards/rejected": -1.7631359100341797, "step": 545 }, { "epoch": 0.6696818704484477, "grad_norm": 18.006446838378906, "learning_rate": 9.331288343558283e-07, "logits/chosen": -2.3473522663116455, "logits/rejected": -2.3481101989746094, "logps/chosen": -60.006629943847656, "logps/rejected": -74.81230926513672, "loss": 0.7698, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29161855578422546, "rewards/margins": 1.5053942203521729, "rewards/rejected": -1.7970128059387207, "step": 546 }, { "epoch": 0.6709083940206976, "grad_norm": 6.726373195648193, "learning_rate": 9.330061349693251e-07, "logits/chosen": -2.2934632301330566, "logits/rejected": -2.3265790939331055, "logps/chosen": -59.37623977661133, "logps/rejected": -79.25657653808594, "loss": 0.7337, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23699524998664856, "rewards/margins": 1.9415342807769775, "rewards/rejected": -2.1785292625427246, "step": 547 }, { "epoch": 0.6721349175929475, "grad_norm": 10.909767150878906, "learning_rate": 9.32883435582822e-07, "logits/chosen": -2.28757643699646, "logits/rejected": -2.3686161041259766, "logps/chosen": -65.412109375, "logps/rejected": -71.95783233642578, "loss": 0.9189, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9346727728843689, "rewards/margins": 0.5954035520553589, "rewards/rejected": -1.530076503753662, "step": 548 }, { "epoch": 0.6733614411651974, "grad_norm": 10.899640083312988, "learning_rate": 9.32760736196319e-07, "logits/chosen": -2.3565733432769775, "logits/rejected": -2.3507132530212402, "logps/chosen": -69.80231475830078, "logps/rejected": -71.61231231689453, "loss": 0.9155, "rewards/accuracies": 0.46875, "rewards/chosen": -1.0889759063720703, "rewards/margins": 0.44892776012420654, "rewards/rejected": -1.5379037857055664, "step": 549 }, { "epoch": 0.6745879647374473, "grad_norm": 9.266079902648926, "learning_rate": 9.326380368098159e-07, "logits/chosen": -2.2602126598358154, "logits/rejected": -2.3363146781921387, "logps/chosen": -59.884674072265625, "logps/rejected": -72.76403045654297, "loss": 0.7437, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06003255397081375, "rewards/margins": 1.4780374765396118, "rewards/rejected": -1.5380700826644897, "step": 550 }, { "epoch": 0.6758144883096971, "grad_norm": 6.0118632316589355, "learning_rate": 9.325153374233128e-07, "logits/chosen": -2.347285270690918, "logits/rejected": -2.3831470012664795, "logps/chosen": -60.226295471191406, "logps/rejected": -70.59164428710938, "loss": 0.8493, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2926024794578552, "rewards/margins": 1.1575523614883423, "rewards/rejected": -1.4501547813415527, "step": 551 }, { "epoch": 0.6770410118819471, "grad_norm": 9.114612579345703, "learning_rate": 9.323926380368098e-07, "logits/chosen": -2.276625394821167, "logits/rejected": -2.314908981323242, "logps/chosen": -63.22563552856445, "logps/rejected": -74.66020202636719, "loss": 0.7863, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5005452632904053, "rewards/margins": 1.3911653757095337, "rewards/rejected": -1.891710638999939, "step": 552 }, { "epoch": 0.678267535454197, "grad_norm": 9.589200019836426, "learning_rate": 9.322699386503066e-07, "logits/chosen": -2.339280843734741, "logits/rejected": -2.3872439861297607, "logps/chosen": -55.66611862182617, "logps/rejected": -69.98146057128906, "loss": 0.7391, "rewards/accuracies": 0.75, "rewards/chosen": 0.44867265224456787, "rewards/margins": 1.8423982858657837, "rewards/rejected": -1.3937256336212158, "step": 553 }, { "epoch": 0.6794940590264469, "grad_norm": 7.912712574005127, "learning_rate": 9.321472392638036e-07, "logits/chosen": -2.310548782348633, "logits/rejected": -2.325474262237549, "logps/chosen": -68.89805603027344, "logps/rejected": -75.16880798339844, "loss": 0.8637, "rewards/accuracies": 0.75, "rewards/chosen": -0.6054129004478455, "rewards/margins": 0.8984012007713318, "rewards/rejected": -1.5038142204284668, "step": 554 }, { "epoch": 0.6807205825986968, "grad_norm": 14.996952056884766, "learning_rate": 9.320245398773006e-07, "logits/chosen": -2.347393751144409, "logits/rejected": -2.376823902130127, "logps/chosen": -58.68550109863281, "logps/rejected": -74.77680969238281, "loss": 0.7465, "rewards/accuracies": 0.8125, "rewards/chosen": -0.27234649658203125, "rewards/margins": 1.7905319929122925, "rewards/rejected": -2.0628786087036133, "step": 555 }, { "epoch": 0.6819471061709467, "grad_norm": 8.974493026733398, "learning_rate": 9.319018404907975e-07, "logits/chosen": -2.313666820526123, "logits/rejected": -2.335261583328247, "logps/chosen": -71.22733306884766, "logps/rejected": -83.0995864868164, "loss": 0.8667, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5345067977905273, "rewards/margins": 1.0851296186447144, "rewards/rejected": -2.6196365356445312, "step": 556 }, { "epoch": 0.6831736297431966, "grad_norm": 12.487804412841797, "learning_rate": 9.317791411042945e-07, "logits/chosen": -2.2803525924682617, "logits/rejected": -2.3004984855651855, "logps/chosen": -68.32247161865234, "logps/rejected": -73.96334838867188, "loss": 0.8526, "rewards/accuracies": 0.65625, "rewards/chosen": -0.792393147945404, "rewards/margins": 1.05362868309021, "rewards/rejected": -1.8460218906402588, "step": 557 }, { "epoch": 0.6844001533154466, "grad_norm": 10.464531898498535, "learning_rate": 9.316564417177914e-07, "logits/chosen": -2.330113410949707, "logits/rejected": -2.341836452484131, "logps/chosen": -56.89804458618164, "logps/rejected": -75.28050994873047, "loss": 0.7355, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14135949313640594, "rewards/margins": 1.6553622484207153, "rewards/rejected": -1.7967216968536377, "step": 558 }, { "epoch": 0.6856266768876964, "grad_norm": 7.770110130310059, "learning_rate": 9.315337423312883e-07, "logits/chosen": -2.2711360454559326, "logits/rejected": -2.324342727661133, "logps/chosen": -57.81732177734375, "logps/rejected": -72.7045669555664, "loss": 0.8163, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18666993081569672, "rewards/margins": 1.494633436203003, "rewards/rejected": -1.6813031435012817, "step": 559 }, { "epoch": 0.6868532004599464, "grad_norm": 9.039891242980957, "learning_rate": 9.314110429447852e-07, "logits/chosen": -2.2801454067230225, "logits/rejected": -2.3108372688293457, "logps/chosen": -61.730403900146484, "logps/rejected": -75.92930603027344, "loss": 0.7666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35674673318862915, "rewards/margins": 1.608339786529541, "rewards/rejected": -1.9650864601135254, "step": 560 }, { "epoch": 0.6880797240321962, "grad_norm": 12.437756538391113, "learning_rate": 9.312883435582821e-07, "logits/chosen": -2.322504997253418, "logits/rejected": -2.3400299549102783, "logps/chosen": -55.98720169067383, "logps/rejected": -74.43323516845703, "loss": 0.6811, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06951531767845154, "rewards/margins": 2.0016298294067383, "rewards/rejected": -2.0711452960968018, "step": 561 }, { "epoch": 0.6893062476044461, "grad_norm": 10.027499198913574, "learning_rate": 9.311656441717791e-07, "logits/chosen": -2.343214750289917, "logits/rejected": -2.3717124462127686, "logps/chosen": -63.2491569519043, "logps/rejected": -69.61153411865234, "loss": 0.9205, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7083663940429688, "rewards/margins": 0.41066569089889526, "rewards/rejected": -1.1190321445465088, "step": 562 }, { "epoch": 0.6905327711766961, "grad_norm": 8.508244514465332, "learning_rate": 9.31042944785276e-07, "logits/chosen": -2.36301326751709, "logits/rejected": -2.380441665649414, "logps/chosen": -55.348445892333984, "logps/rejected": -68.50688934326172, "loss": 0.7515, "rewards/accuracies": 0.75, "rewards/chosen": 0.18429362773895264, "rewards/margins": 1.4770528078079224, "rewards/rejected": -1.2927590608596802, "step": 563 }, { "epoch": 0.6917592947489459, "grad_norm": 13.109759330749512, "learning_rate": 9.30920245398773e-07, "logits/chosen": -2.2779011726379395, "logits/rejected": -2.317551374435425, "logps/chosen": -60.74443817138672, "logps/rejected": -68.96379089355469, "loss": 0.8815, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5420994758605957, "rewards/margins": 0.8151593804359436, "rewards/rejected": -1.357258915901184, "step": 564 }, { "epoch": 0.6929858183211959, "grad_norm": 9.56418228149414, "learning_rate": 9.3079754601227e-07, "logits/chosen": -2.2768077850341797, "logits/rejected": -2.329249858856201, "logps/chosen": -54.165077209472656, "logps/rejected": -60.947059631347656, "loss": 0.8594, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23767870664596558, "rewards/margins": 0.8708788156509399, "rewards/rejected": -0.6332000494003296, "step": 565 }, { "epoch": 0.6942123418934457, "grad_norm": 8.238395690917969, "learning_rate": 9.306748466257668e-07, "logits/chosen": -2.338639736175537, "logits/rejected": -2.3390090465545654, "logps/chosen": -56.59130859375, "logps/rejected": -64.36544036865234, "loss": 0.8762, "rewards/accuracies": 0.59375, "rewards/chosen": 0.12171059101819992, "rewards/margins": 0.6640492677688599, "rewards/rejected": -0.5423387289047241, "step": 566 }, { "epoch": 0.6954388654656957, "grad_norm": 11.253479957580566, "learning_rate": 9.305521472392638e-07, "logits/chosen": -2.2902207374572754, "logits/rejected": -2.31223201751709, "logps/chosen": -56.953575134277344, "logps/rejected": -65.74649810791016, "loss": 0.8497, "rewards/accuracies": 0.5625, "rewards/chosen": 0.444801390171051, "rewards/margins": 0.8412935733795166, "rewards/rejected": -0.3964921832084656, "step": 567 }, { "epoch": 0.6966653890379456, "grad_norm": 14.654374122619629, "learning_rate": 9.304294478527607e-07, "logits/chosen": -2.339343547821045, "logits/rejected": -2.3221681118011475, "logps/chosen": -54.7830810546875, "logps/rejected": -61.035133361816406, "loss": 0.9136, "rewards/accuracies": 0.625, "rewards/chosen": -0.009711645543575287, "rewards/margins": 0.5581769943237305, "rewards/rejected": -0.5678887367248535, "step": 568 }, { "epoch": 0.6978919126101955, "grad_norm": 9.897828102111816, "learning_rate": 9.303067484662577e-07, "logits/chosen": -2.3032116889953613, "logits/rejected": -2.3286995887756348, "logps/chosen": -51.05723571777344, "logps/rejected": -61.756622314453125, "loss": 0.8501, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5315821170806885, "rewards/margins": 0.711736261844635, "rewards/rejected": -0.18015410006046295, "step": 569 }, { "epoch": 0.6991184361824454, "grad_norm": 12.673943519592285, "learning_rate": 9.301840490797546e-07, "logits/chosen": -2.265469789505005, "logits/rejected": -2.3137779235839844, "logps/chosen": -49.38782501220703, "logps/rejected": -61.382381439208984, "loss": 0.7477, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6954943537712097, "rewards/margins": 1.6536672115325928, "rewards/rejected": -0.9581727981567383, "step": 570 }, { "epoch": 0.7003449597546952, "grad_norm": 13.5719575881958, "learning_rate": 9.300613496932514e-07, "logits/chosen": -2.284740686416626, "logits/rejected": -2.3013055324554443, "logps/chosen": -50.19514465332031, "logps/rejected": -60.164730072021484, "loss": 0.839, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6267763376235962, "rewards/margins": 1.1616934537887573, "rewards/rejected": -0.5349169969558716, "step": 571 }, { "epoch": 0.7015714833269452, "grad_norm": 10.688520431518555, "learning_rate": 9.299386503067484e-07, "logits/chosen": -2.2534451484680176, "logits/rejected": -2.2989323139190674, "logps/chosen": -49.78384780883789, "logps/rejected": -65.59477996826172, "loss": 0.7604, "rewards/accuracies": 0.75, "rewards/chosen": 0.6940033435821533, "rewards/margins": 1.684749722480774, "rewards/rejected": -0.9907464981079102, "step": 572 }, { "epoch": 0.7027980068991951, "grad_norm": 8.775924682617188, "learning_rate": 9.298159509202453e-07, "logits/chosen": -2.2955751419067383, "logits/rejected": -2.243931531906128, "logps/chosen": -55.985198974609375, "logps/rejected": -61.88511276245117, "loss": 0.9259, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006951808929443359, "rewards/margins": 0.5380268692970276, "rewards/rejected": -0.5449787378311157, "step": 573 }, { "epoch": 0.704024530471445, "grad_norm": 7.818479537963867, "learning_rate": 9.296932515337423e-07, "logits/chosen": -2.3468329906463623, "logits/rejected": -2.353578805923462, "logps/chosen": -58.906715393066406, "logps/rejected": -66.89838409423828, "loss": 0.8997, "rewards/accuracies": 0.59375, "rewards/chosen": -0.43746820092201233, "rewards/margins": 0.4854050278663635, "rewards/rejected": -0.9228732585906982, "step": 574 }, { "epoch": 0.7052510540436949, "grad_norm": 7.286082744598389, "learning_rate": 9.295705521472392e-07, "logits/chosen": -2.3233015537261963, "logits/rejected": -2.378835439682007, "logps/chosen": -54.49592590332031, "logps/rejected": -63.3937873840332, "loss": 0.847, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2009173184633255, "rewards/margins": 0.8631978034973145, "rewards/rejected": -0.6622805595397949, "step": 575 }, { "epoch": 0.7064775776159448, "grad_norm": 9.59925365447998, "learning_rate": 9.294478527607362e-07, "logits/chosen": -2.338947296142578, "logits/rejected": -2.34936261177063, "logps/chosen": -56.98419952392578, "logps/rejected": -67.15357208251953, "loss": 0.8955, "rewards/accuracies": 0.625, "rewards/chosen": 0.04107164591550827, "rewards/margins": 0.734626054763794, "rewards/rejected": -0.6935545206069946, "step": 576 }, { "epoch": 0.7077041011881947, "grad_norm": 11.59819507598877, "learning_rate": 9.293251533742331e-07, "logits/chosen": -2.3054168224334717, "logits/rejected": -2.2933928966522217, "logps/chosen": -54.94304656982422, "logps/rejected": -64.98521423339844, "loss": 0.8326, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5610406398773193, "rewards/margins": 1.154592514038086, "rewards/rejected": -0.5935518741607666, "step": 577 }, { "epoch": 0.7089306247604447, "grad_norm": 16.26985740661621, "learning_rate": 9.2920245398773e-07, "logits/chosen": -2.292741060256958, "logits/rejected": -2.309478759765625, "logps/chosen": -48.45525360107422, "logps/rejected": -62.87601852416992, "loss": 0.7668, "rewards/accuracies": 0.75, "rewards/chosen": 0.5688651204109192, "rewards/margins": 1.4158092737197876, "rewards/rejected": -0.8469440937042236, "step": 578 }, { "epoch": 0.7101571483326945, "grad_norm": 8.031174659729004, "learning_rate": 9.29079754601227e-07, "logits/chosen": -2.279045820236206, "logits/rejected": -2.301210880279541, "logps/chosen": -57.885475158691406, "logps/rejected": -66.00077819824219, "loss": 0.861, "rewards/accuracies": 0.6875, "rewards/chosen": 0.025983083993196487, "rewards/margins": 0.8840608596801758, "rewards/rejected": -0.8580777049064636, "step": 579 }, { "epoch": 0.7113836719049444, "grad_norm": 9.65632152557373, "learning_rate": 9.289570552147239e-07, "logits/chosen": -2.3422274589538574, "logits/rejected": -2.422367572784424, "logps/chosen": -59.20549011230469, "logps/rejected": -68.67304229736328, "loss": 0.8243, "rewards/accuracies": 0.625, "rewards/chosen": 0.33174896240234375, "rewards/margins": 1.2424843311309814, "rewards/rejected": -0.9107352495193481, "step": 580 }, { "epoch": 0.7126101954771943, "grad_norm": 9.67186450958252, "learning_rate": 9.288343558282208e-07, "logits/chosen": -2.2359960079193115, "logits/rejected": -2.248699903488159, "logps/chosen": -57.58419418334961, "logps/rejected": -70.75463104248047, "loss": 0.7792, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06215418875217438, "rewards/margins": 1.5010356903076172, "rewards/rejected": -1.4388816356658936, "step": 581 }, { "epoch": 0.7138367190494442, "grad_norm": 7.048551082611084, "learning_rate": 9.287116564417178e-07, "logits/chosen": -2.2980613708496094, "logits/rejected": -2.3465449810028076, "logps/chosen": -67.60566711425781, "logps/rejected": -76.85855865478516, "loss": 0.7868, "rewards/accuracies": 0.75, "rewards/chosen": -0.7559199929237366, "rewards/margins": 1.1968106031417847, "rewards/rejected": -1.9527308940887451, "step": 582 }, { "epoch": 0.7150632426216942, "grad_norm": 10.24864673614502, "learning_rate": 9.285889570552146e-07, "logits/chosen": -2.2674543857574463, "logits/rejected": -2.2874555587768555, "logps/chosen": -59.59565734863281, "logps/rejected": -70.73409271240234, "loss": 0.8008, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2942139506340027, "rewards/margins": 1.1383801698684692, "rewards/rejected": -1.4325940608978271, "step": 583 }, { "epoch": 0.716289766193944, "grad_norm": 8.916784286499023, "learning_rate": 9.284662576687115e-07, "logits/chosen": -2.2622292041778564, "logits/rejected": -2.3056252002716064, "logps/chosen": -53.95337677001953, "logps/rejected": -69.65558624267578, "loss": 0.7314, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04953154921531677, "rewards/margins": 1.6330455541610718, "rewards/rejected": -1.5835139751434326, "step": 584 }, { "epoch": 0.717516289766194, "grad_norm": 11.77900505065918, "learning_rate": 9.283435582822085e-07, "logits/chosen": -2.2871623039245605, "logits/rejected": -2.331449270248413, "logps/chosen": -62.624053955078125, "logps/rejected": -75.9681625366211, "loss": 0.7424, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1969836801290512, "rewards/margins": 1.7484643459320068, "rewards/rejected": -1.9454480409622192, "step": 585 }, { "epoch": 0.7187428133384438, "grad_norm": 17.21878433227539, "learning_rate": 9.282208588957055e-07, "logits/chosen": -2.3660521507263184, "logits/rejected": -2.407336950302124, "logps/chosen": -61.580413818359375, "logps/rejected": -79.77962493896484, "loss": 0.7, "rewards/accuracies": 0.84375, "rewards/chosen": 0.21815960109233856, "rewards/margins": 2.232773780822754, "rewards/rejected": -2.0146141052246094, "step": 586 }, { "epoch": 0.7199693369106938, "grad_norm": 11.29532527923584, "learning_rate": 9.280981595092025e-07, "logits/chosen": -2.30912184715271, "logits/rejected": -2.3292322158813477, "logps/chosen": -61.20800018310547, "logps/rejected": -72.18270111083984, "loss": 0.8224, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3649390935897827, "rewards/margins": 1.3378161191940308, "rewards/rejected": -1.702755093574524, "step": 587 }, { "epoch": 0.7211958604829437, "grad_norm": 9.75147533416748, "learning_rate": 9.279754601226994e-07, "logits/chosen": -2.2454025745391846, "logits/rejected": -2.2498621940612793, "logps/chosen": -56.570465087890625, "logps/rejected": -62.893516540527344, "loss": 0.8538, "rewards/accuracies": 0.65625, "rewards/chosen": 0.14859585464000702, "rewards/margins": 0.9561804533004761, "rewards/rejected": -0.8075845241546631, "step": 588 }, { "epoch": 0.7224223840551935, "grad_norm": 8.748492240905762, "learning_rate": 9.278527607361962e-07, "logits/chosen": -2.28096079826355, "logits/rejected": -2.308866024017334, "logps/chosen": -58.8023567199707, "logps/rejected": -67.5240478515625, "loss": 0.8472, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5051662921905518, "rewards/margins": 0.8952624797821045, "rewards/rejected": -1.4004287719726562, "step": 589 }, { "epoch": 0.7236489076274435, "grad_norm": 9.136466979980469, "learning_rate": 9.277300613496932e-07, "logits/chosen": -2.289551258087158, "logits/rejected": -2.3106579780578613, "logps/chosen": -62.372093200683594, "logps/rejected": -73.7970962524414, "loss": 0.8117, "rewards/accuracies": 0.625, "rewards/chosen": -0.8020513653755188, "rewards/margins": 1.318994164466858, "rewards/rejected": -2.1210455894470215, "step": 590 }, { "epoch": 0.7248754311996933, "grad_norm": 9.042577743530273, "learning_rate": 9.276073619631901e-07, "logits/chosen": -2.3610782623291016, "logits/rejected": -2.330918073654175, "logps/chosen": -58.60077667236328, "logps/rejected": -72.74961853027344, "loss": 0.7708, "rewards/accuracies": 0.625, "rewards/chosen": -0.06865046918392181, "rewards/margins": 1.3468554019927979, "rewards/rejected": -1.4155058860778809, "step": 591 }, { "epoch": 0.7261019547719433, "grad_norm": 10.695420265197754, "learning_rate": 9.274846625766871e-07, "logits/chosen": -2.330453395843506, "logits/rejected": -2.3255245685577393, "logps/chosen": -64.05906677246094, "logps/rejected": -68.36270904541016, "loss": 0.9148, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1932646930217743, "rewards/margins": 0.6923441886901855, "rewards/rejected": -0.8856087923049927, "step": 592 }, { "epoch": 0.7273284783441932, "grad_norm": 9.541699409484863, "learning_rate": 9.27361963190184e-07, "logits/chosen": -2.267338991165161, "logits/rejected": -2.297095775604248, "logps/chosen": -58.46726989746094, "logps/rejected": -73.53872680664062, "loss": 0.7343, "rewards/accuracies": 0.71875, "rewards/chosen": -0.13814926147460938, "rewards/margins": 1.821387767791748, "rewards/rejected": -1.959537148475647, "step": 593 }, { "epoch": 0.7285550019164431, "grad_norm": 9.166854858398438, "learning_rate": 9.272392638036809e-07, "logits/chosen": -2.323471784591675, "logits/rejected": -2.3528032302856445, "logps/chosen": -65.6761703491211, "logps/rejected": -72.436279296875, "loss": 0.9356, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9933082461357117, "rewards/margins": 0.577534019947052, "rewards/rejected": -1.5708421468734741, "step": 594 }, { "epoch": 0.729781525488693, "grad_norm": 13.955548286437988, "learning_rate": 9.271165644171778e-07, "logits/chosen": -2.3555002212524414, "logits/rejected": -2.328615427017212, "logps/chosen": -56.25914001464844, "logps/rejected": -68.89152526855469, "loss": 0.8351, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10349279642105103, "rewards/margins": 1.0956017971038818, "rewards/rejected": -0.9921090602874756, "step": 595 }, { "epoch": 0.7310080490609429, "grad_norm": 8.703801155090332, "learning_rate": 9.269938650306748e-07, "logits/chosen": -2.355661392211914, "logits/rejected": -2.370736837387085, "logps/chosen": -64.705810546875, "logps/rejected": -71.20306396484375, "loss": 0.9029, "rewards/accuracies": 0.625, "rewards/chosen": -0.7409550547599792, "rewards/margins": 0.6706516146659851, "rewards/rejected": -1.4116069078445435, "step": 596 }, { "epoch": 0.7322345726331928, "grad_norm": 16.2983341217041, "learning_rate": 9.268711656441718e-07, "logits/chosen": -2.3084723949432373, "logits/rejected": -2.340944766998291, "logps/chosen": -61.51966857910156, "logps/rejected": -66.68075561523438, "loss": 0.8971, "rewards/accuracies": 0.6875, "rewards/chosen": -0.421392023563385, "rewards/margins": 0.5807924270629883, "rewards/rejected": -1.0021843910217285, "step": 597 }, { "epoch": 0.7334610962054428, "grad_norm": 8.216326713562012, "learning_rate": 9.267484662576687e-07, "logits/chosen": -2.342155933380127, "logits/rejected": -2.3320271968841553, "logps/chosen": -59.3330192565918, "logps/rejected": -67.18372344970703, "loss": 0.8949, "rewards/accuracies": 0.59375, "rewards/chosen": -0.30557116866111755, "rewards/margins": 0.6768689751625061, "rewards/rejected": -0.9824402332305908, "step": 598 }, { "epoch": 0.7346876197776926, "grad_norm": 124.4095458984375, "learning_rate": 9.266257668711656e-07, "logits/chosen": -2.254607677459717, "logits/rejected": -2.2987399101257324, "logps/chosen": -56.345237731933594, "logps/rejected": -68.88853454589844, "loss": 0.8089, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15247464179992676, "rewards/margins": 1.3912099599838257, "rewards/rejected": -1.5436848402023315, "step": 599 }, { "epoch": 0.7359141433499425, "grad_norm": 8.805765151977539, "learning_rate": 9.265030674846626e-07, "logits/chosen": -2.3126349449157715, "logits/rejected": -2.361355781555176, "logps/chosen": -59.40022659301758, "logps/rejected": -69.08999633789062, "loss": 0.8423, "rewards/accuracies": 0.625, "rewards/chosen": 0.023345351219177246, "rewards/margins": 1.067705750465393, "rewards/rejected": -1.0443603992462158, "step": 600 }, { "epoch": 0.7371406669221924, "grad_norm": 9.193358421325684, "learning_rate": 9.263803680981594e-07, "logits/chosen": -2.241170644760132, "logits/rejected": -2.2844552993774414, "logps/chosen": -56.55282974243164, "logps/rejected": -71.90237426757812, "loss": 0.744, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0026469752192497253, "rewards/margins": 1.6321651935577393, "rewards/rejected": -1.634812355041504, "step": 601 }, { "epoch": 0.7383671904944423, "grad_norm": 10.571754455566406, "learning_rate": 9.262576687116564e-07, "logits/chosen": -2.326824188232422, "logits/rejected": -2.3311009407043457, "logps/chosen": -56.109806060791016, "logps/rejected": -70.64253997802734, "loss": 0.7887, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2661699652671814, "rewards/margins": 1.6315428018569946, "rewards/rejected": -1.3653727769851685, "step": 602 }, { "epoch": 0.7395937140666922, "grad_norm": 9.528412818908691, "learning_rate": 9.261349693251533e-07, "logits/chosen": -2.2502899169921875, "logits/rejected": -2.2989346981048584, "logps/chosen": -51.65226745605469, "logps/rejected": -67.70036315917969, "loss": 0.6868, "rewards/accuracies": 0.75, "rewards/chosen": 0.6657217144966125, "rewards/margins": 2.003209114074707, "rewards/rejected": -1.3374874591827393, "step": 603 }, { "epoch": 0.7408202376389421, "grad_norm": 8.67426872253418, "learning_rate": 9.260122699386502e-07, "logits/chosen": -2.3085267543792725, "logits/rejected": -2.3567490577697754, "logps/chosen": -59.510013580322266, "logps/rejected": -66.33341217041016, "loss": 0.9251, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12120608240365982, "rewards/margins": 0.5616217255592346, "rewards/rejected": -0.6828278303146362, "step": 604 }, { "epoch": 0.7420467612111921, "grad_norm": 8.4693021774292, "learning_rate": 9.258895705521472e-07, "logits/chosen": -2.2986528873443604, "logits/rejected": -2.3213789463043213, "logps/chosen": -54.316017150878906, "logps/rejected": -67.63763427734375, "loss": 0.7932, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6563230156898499, "rewards/margins": 1.505735158920288, "rewards/rejected": -0.8494119048118591, "step": 605 }, { "epoch": 0.7432732847834419, "grad_norm": 16.728315353393555, "learning_rate": 9.257668711656442e-07, "logits/chosen": -2.3109536170959473, "logits/rejected": -2.359126567840576, "logps/chosen": -53.0015983581543, "logps/rejected": -65.63568115234375, "loss": 0.7242, "rewards/accuracies": 0.75, "rewards/chosen": 0.5579187870025635, "rewards/margins": 1.6330440044403076, "rewards/rejected": -1.0751253366470337, "step": 606 }, { "epoch": 0.7444998083556919, "grad_norm": 13.779340744018555, "learning_rate": 9.256441717791412e-07, "logits/chosen": -2.374664545059204, "logits/rejected": -2.3782272338867188, "logps/chosen": -60.107887268066406, "logps/rejected": -72.2346420288086, "loss": 0.8018, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5751191973686218, "rewards/margins": 1.1060092449188232, "rewards/rejected": -1.6811285018920898, "step": 607 }, { "epoch": 0.7457263319279417, "grad_norm": 6.956925868988037, "learning_rate": 9.25521472392638e-07, "logits/chosen": -2.2798571586608887, "logits/rejected": -2.3189640045166016, "logps/chosen": -59.346946716308594, "logps/rejected": -64.20388793945312, "loss": 0.901, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2715712785720825, "rewards/margins": 0.5851571559906006, "rewards/rejected": -0.8567284941673279, "step": 608 }, { "epoch": 0.7469528555001916, "grad_norm": 8.355161666870117, "learning_rate": 9.253987730061349e-07, "logits/chosen": -2.248718500137329, "logits/rejected": -2.2826671600341797, "logps/chosen": -56.843238830566406, "logps/rejected": -66.55265808105469, "loss": 0.8564, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27943891286849976, "rewards/margins": 0.9543172717094421, "rewards/rejected": -1.233756184577942, "step": 609 }, { "epoch": 0.7481793790724416, "grad_norm": 9.179821968078613, "learning_rate": 9.252760736196319e-07, "logits/chosen": -2.3134729862213135, "logits/rejected": -2.3477115631103516, "logps/chosen": -58.95569610595703, "logps/rejected": -69.17179870605469, "loss": 0.8212, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16953644156455994, "rewards/margins": 1.1357407569885254, "rewards/rejected": -1.3052773475646973, "step": 610 }, { "epoch": 0.7494059026446914, "grad_norm": 9.645809173583984, "learning_rate": 9.251533742331288e-07, "logits/chosen": -2.3459768295288086, "logits/rejected": -2.377971649169922, "logps/chosen": -64.52494812011719, "logps/rejected": -70.13710021972656, "loss": 0.9332, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6593866944313049, "rewards/margins": 0.5279346704483032, "rewards/rejected": -1.1873213052749634, "step": 611 }, { "epoch": 0.7506324262169414, "grad_norm": 9.233060836791992, "learning_rate": 9.250306748466258e-07, "logits/chosen": -2.3302621841430664, "logits/rejected": -2.3396694660186768, "logps/chosen": -58.17101287841797, "logps/rejected": -67.82484436035156, "loss": 0.8483, "rewards/accuracies": 0.59375, "rewards/chosen": 0.011935852468013763, "rewards/margins": 0.8615016341209412, "rewards/rejected": -0.849565863609314, "step": 612 }, { "epoch": 0.7518589497891912, "grad_norm": 16.324188232421875, "learning_rate": 9.249079754601226e-07, "logits/chosen": -2.270725727081299, "logits/rejected": -2.281228542327881, "logps/chosen": -53.242767333984375, "logps/rejected": -68.18978118896484, "loss": 0.7728, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6905674934387207, "rewards/margins": 1.5801177024841309, "rewards/rejected": -0.8895503878593445, "step": 613 }, { "epoch": 0.7530854733614412, "grad_norm": 13.188577651977539, "learning_rate": 9.247852760736195e-07, "logits/chosen": -2.2829837799072266, "logits/rejected": -2.316638708114624, "logps/chosen": -57.61945343017578, "logps/rejected": -69.58167266845703, "loss": 0.7889, "rewards/accuracies": 0.65625, "rewards/chosen": 0.013192079961299896, "rewards/margins": 1.4717566967010498, "rewards/rejected": -1.4585644006729126, "step": 614 }, { "epoch": 0.7543119969336911, "grad_norm": 9.094612121582031, "learning_rate": 9.246625766871165e-07, "logits/chosen": -2.2740771770477295, "logits/rejected": -2.285521984100342, "logps/chosen": -64.8430404663086, "logps/rejected": -73.77095794677734, "loss": 0.8641, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06585628539323807, "rewards/margins": 1.0599260330200195, "rewards/rejected": -1.1257823705673218, "step": 615 }, { "epoch": 0.755538520505941, "grad_norm": 8.067927360534668, "learning_rate": 9.245398773006135e-07, "logits/chosen": -2.2817254066467285, "logits/rejected": -2.340090274810791, "logps/chosen": -61.21546936035156, "logps/rejected": -68.266845703125, "loss": 0.8362, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3770674169063568, "rewards/margins": 1.0377012491226196, "rewards/rejected": -1.4147685766220093, "step": 616 }, { "epoch": 0.7567650440781909, "grad_norm": 10.948524475097656, "learning_rate": 9.244171779141104e-07, "logits/chosen": -2.3250327110290527, "logits/rejected": -2.379302740097046, "logps/chosen": -55.47987365722656, "logps/rejected": -70.46961975097656, "loss": 0.8238, "rewards/accuracies": 0.59375, "rewards/chosen": 0.1901741921901703, "rewards/margins": 1.642844557762146, "rewards/rejected": -1.4526704549789429, "step": 617 }, { "epoch": 0.7579915676504407, "grad_norm": 13.443299293518066, "learning_rate": 9.242944785276074e-07, "logits/chosen": -2.253659248352051, "logits/rejected": -2.2468349933624268, "logps/chosen": -48.950218200683594, "logps/rejected": -64.4928970336914, "loss": 0.7738, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7114500999450684, "rewards/margins": 1.596569299697876, "rewards/rejected": -0.8851190805435181, "step": 618 }, { "epoch": 0.7592180912226907, "grad_norm": 15.44863510131836, "learning_rate": 9.241717791411042e-07, "logits/chosen": -2.2365634441375732, "logits/rejected": -2.2815165519714355, "logps/chosen": -57.9228515625, "logps/rejected": -66.09293365478516, "loss": 0.8166, "rewards/accuracies": 0.75, "rewards/chosen": 0.2595008909702301, "rewards/margins": 1.0835331678390503, "rewards/rejected": -0.8240323662757874, "step": 619 }, { "epoch": 0.7604446147949406, "grad_norm": 9.181641578674316, "learning_rate": 9.240490797546012e-07, "logits/chosen": -2.356034755706787, "logits/rejected": -2.379232168197632, "logps/chosen": -55.226478576660156, "logps/rejected": -75.8466796875, "loss": 0.6489, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09115689992904663, "rewards/margins": 2.2597639560699463, "rewards/rejected": -2.168607234954834, "step": 620 }, { "epoch": 0.7616711383671905, "grad_norm": 10.257709503173828, "learning_rate": 9.239263803680981e-07, "logits/chosen": -2.2926383018493652, "logits/rejected": -2.2988476753234863, "logps/chosen": -58.321800231933594, "logps/rejected": -75.39716339111328, "loss": 0.7446, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13473576307296753, "rewards/margins": 1.9244401454925537, "rewards/rejected": -2.059175729751587, "step": 621 }, { "epoch": 0.7628976619394404, "grad_norm": 7.467625141143799, "learning_rate": 9.23803680981595e-07, "logits/chosen": -2.3506364822387695, "logits/rejected": -2.3407959938049316, "logps/chosen": -65.13848876953125, "logps/rejected": -77.0218505859375, "loss": 0.8688, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9116711616516113, "rewards/margins": 1.1322132349014282, "rewards/rejected": -2.04388427734375, "step": 622 }, { "epoch": 0.7641241855116903, "grad_norm": 9.4278564453125, "learning_rate": 9.23680981595092e-07, "logits/chosen": -2.366086959838867, "logits/rejected": -2.3850927352905273, "logps/chosen": -60.757328033447266, "logps/rejected": -79.63356018066406, "loss": 0.7925, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4868384599685669, "rewards/margins": 1.913658857345581, "rewards/rejected": -2.4004974365234375, "step": 623 }, { "epoch": 0.7653507090839402, "grad_norm": 9.837169647216797, "learning_rate": 9.235582822085889e-07, "logits/chosen": -2.2890892028808594, "logits/rejected": -2.3439981937408447, "logps/chosen": -68.0261459350586, "logps/rejected": -72.4471435546875, "loss": 0.8884, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8592469692230225, "rewards/margins": 0.8715381622314453, "rewards/rejected": -1.7307851314544678, "step": 624 }, { "epoch": 0.7665772326561902, "grad_norm": 18.205265045166016, "learning_rate": 9.234355828220858e-07, "logits/chosen": -2.266136646270752, "logits/rejected": -2.2778990268707275, "logps/chosen": -64.07193756103516, "logps/rejected": -76.11946105957031, "loss": 0.7734, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6243966221809387, "rewards/margins": 1.3314253091812134, "rewards/rejected": -1.9558218717575073, "step": 625 }, { "epoch": 0.76780375622844, "grad_norm": 9.891305923461914, "learning_rate": 9.233128834355828e-07, "logits/chosen": -2.3492648601531982, "logits/rejected": -2.331272602081299, "logps/chosen": -75.21408081054688, "logps/rejected": -81.37236785888672, "loss": 0.8986, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6378452777862549, "rewards/margins": 0.7052940130233765, "rewards/rejected": -2.343139171600342, "step": 626 }, { "epoch": 0.7690302798006899, "grad_norm": 8.38474178314209, "learning_rate": 9.231901840490797e-07, "logits/chosen": -2.355069160461426, "logits/rejected": -2.371952533721924, "logps/chosen": -64.55348205566406, "logps/rejected": -79.42249298095703, "loss": 0.8089, "rewards/accuracies": 0.75, "rewards/chosen": -0.6147032380104065, "rewards/margins": 1.3977017402648926, "rewards/rejected": -2.0124051570892334, "step": 627 }, { "epoch": 0.7702568033729398, "grad_norm": 8.130574226379395, "learning_rate": 9.230674846625767e-07, "logits/chosen": -2.324636936187744, "logits/rejected": -2.3173563480377197, "logps/chosen": -70.31655883789062, "logps/rejected": -84.13267517089844, "loss": 0.8074, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4617235660552979, "rewards/margins": 1.2765254974365234, "rewards/rejected": -2.7382490634918213, "step": 628 }, { "epoch": 0.7714833269451897, "grad_norm": 10.013291358947754, "learning_rate": 9.229447852760736e-07, "logits/chosen": -2.253293752670288, "logits/rejected": -2.295081853866577, "logps/chosen": -60.77599334716797, "logps/rejected": -79.71411895751953, "loss": 0.7129, "rewards/accuracies": 0.8125, "rewards/chosen": -0.26797884702682495, "rewards/margins": 2.2072227001190186, "rewards/rejected": -2.475201368331909, "step": 629 }, { "epoch": 0.7727098505174397, "grad_norm": 7.210981369018555, "learning_rate": 9.228220858895706e-07, "logits/chosen": -2.336528778076172, "logits/rejected": -2.360374689102173, "logps/chosen": -69.75347900390625, "logps/rejected": -80.6923828125, "loss": 0.8398, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2790226936340332, "rewards/margins": 1.2707695960998535, "rewards/rejected": -2.549792528152466, "step": 630 }, { "epoch": 0.7739363740896895, "grad_norm": 11.960291862487793, "learning_rate": 9.226993865030674e-07, "logits/chosen": -2.3119423389434814, "logits/rejected": -2.326443672180176, "logps/chosen": -61.088966369628906, "logps/rejected": -72.40455627441406, "loss": 0.8368, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6418159604072571, "rewards/margins": 1.2352416515350342, "rewards/rejected": -1.8770577907562256, "step": 631 }, { "epoch": 0.7751628976619395, "grad_norm": 9.553863525390625, "learning_rate": 9.225766871165643e-07, "logits/chosen": -2.316941738128662, "logits/rejected": -2.342557907104492, "logps/chosen": -60.28474044799805, "logps/rejected": -75.63448333740234, "loss": 0.7295, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23342381417751312, "rewards/margins": 1.7851743698120117, "rewards/rejected": -2.0185983180999756, "step": 632 }, { "epoch": 0.7763894212341893, "grad_norm": 11.671259880065918, "learning_rate": 9.224539877300613e-07, "logits/chosen": -2.352478504180908, "logits/rejected": -2.3669419288635254, "logps/chosen": -59.25769805908203, "logps/rejected": -79.21047973632812, "loss": 0.7341, "rewards/accuracies": 0.75, "rewards/chosen": -0.2571795582771301, "rewards/margins": 1.9435014724731445, "rewards/rejected": -2.200681209564209, "step": 633 }, { "epoch": 0.7776159448064393, "grad_norm": 12.602333068847656, "learning_rate": 9.223312883435582e-07, "logits/chosen": -2.3711252212524414, "logits/rejected": -2.393437623977661, "logps/chosen": -60.48626708984375, "logps/rejected": -68.75515747070312, "loss": 0.8731, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6428142786026001, "rewards/margins": 0.7193955183029175, "rewards/rejected": -1.362209677696228, "step": 634 }, { "epoch": 0.7788424683786892, "grad_norm": 7.879671573638916, "learning_rate": 9.222085889570552e-07, "logits/chosen": -2.2944881916046143, "logits/rejected": -2.3086676597595215, "logps/chosen": -58.739524841308594, "logps/rejected": -71.62300109863281, "loss": 0.8276, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3182264268398285, "rewards/margins": 1.3196823596954346, "rewards/rejected": -1.6379088163375854, "step": 635 }, { "epoch": 0.780068991950939, "grad_norm": 16.746726989746094, "learning_rate": 9.220858895705521e-07, "logits/chosen": -2.311828136444092, "logits/rejected": -2.2836525440216064, "logps/chosen": -59.33317947387695, "logps/rejected": -67.35277557373047, "loss": 0.8854, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24580083787441254, "rewards/margins": 0.6782973408699036, "rewards/rejected": -0.9240981340408325, "step": 636 }, { "epoch": 0.781295515523189, "grad_norm": 9.017732620239258, "learning_rate": 9.21963190184049e-07, "logits/chosen": -2.2848613262176514, "logits/rejected": -2.2981655597686768, "logps/chosen": -53.84661102294922, "logps/rejected": -69.15458679199219, "loss": 0.7275, "rewards/accuracies": 0.75, "rewards/chosen": 0.5681328773498535, "rewards/margins": 1.8856914043426514, "rewards/rejected": -1.3175584077835083, "step": 637 }, { "epoch": 0.7825220390954388, "grad_norm": 8.931150436401367, "learning_rate": 9.21840490797546e-07, "logits/chosen": -2.3962345123291016, "logits/rejected": -2.422126531600952, "logps/chosen": -61.04222869873047, "logps/rejected": -77.06233978271484, "loss": 0.7531, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3114934265613556, "rewards/margins": 1.7265459299087524, "rewards/rejected": -2.0380396842956543, "step": 638 }, { "epoch": 0.7837485626676888, "grad_norm": 9.73668098449707, "learning_rate": 9.217177914110429e-07, "logits/chosen": -2.30734920501709, "logits/rejected": -2.318844795227051, "logps/chosen": -60.004608154296875, "logps/rejected": -72.73016357421875, "loss": 0.8155, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3416459858417511, "rewards/margins": 1.337454080581665, "rewards/rejected": -1.6791001558303833, "step": 639 }, { "epoch": 0.7849750862399387, "grad_norm": 10.73846435546875, "learning_rate": 9.215950920245399e-07, "logits/chosen": -2.289323091506958, "logits/rejected": -2.311591386795044, "logps/chosen": -53.41640090942383, "logps/rejected": -62.24925994873047, "loss": 0.8358, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6064504981040955, "rewards/margins": 1.2128429412841797, "rewards/rejected": -0.6063923835754395, "step": 640 }, { "epoch": 0.7862016098121886, "grad_norm": 10.354142189025879, "learning_rate": 9.214723926380368e-07, "logits/chosen": -2.3831498622894287, "logits/rejected": -2.4162230491638184, "logps/chosen": -52.547767639160156, "logps/rejected": -66.86243438720703, "loss": 0.7031, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6024008989334106, "rewards/margins": 1.873653769493103, "rewards/rejected": -1.271253228187561, "step": 641 }, { "epoch": 0.7874281333844385, "grad_norm": 12.618638038635254, "learning_rate": 9.213496932515337e-07, "logits/chosen": -2.3345301151275635, "logits/rejected": -2.332193613052368, "logps/chosen": -53.545082092285156, "logps/rejected": -66.56002044677734, "loss": 0.8441, "rewards/accuracies": 0.625, "rewards/chosen": 0.5056248307228088, "rewards/margins": 1.2084946632385254, "rewards/rejected": -0.7028698325157166, "step": 642 }, { "epoch": 0.7886546569566883, "grad_norm": 9.207320213317871, "learning_rate": 9.212269938650306e-07, "logits/chosen": -2.3522441387176514, "logits/rejected": -2.3699965476989746, "logps/chosen": -50.67286682128906, "logps/rejected": -61.02608871459961, "loss": 0.8567, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4983188509941101, "rewards/margins": 1.0573824644088745, "rewards/rejected": -0.5590636134147644, "step": 643 }, { "epoch": 0.7898811805289383, "grad_norm": 9.658195495605469, "learning_rate": 9.211042944785275e-07, "logits/chosen": -2.332854986190796, "logits/rejected": -2.3805460929870605, "logps/chosen": -49.711456298828125, "logps/rejected": -62.388675689697266, "loss": 0.7301, "rewards/accuracies": 0.75, "rewards/chosen": 0.7734567523002625, "rewards/margins": 1.9059326648712158, "rewards/rejected": -1.1324758529663086, "step": 644 }, { "epoch": 0.7911077041011882, "grad_norm": 18.80950927734375, "learning_rate": 9.209815950920244e-07, "logits/chosen": -2.333230972290039, "logits/rejected": -2.343088150024414, "logps/chosen": -48.53990173339844, "logps/rejected": -59.230445861816406, "loss": 0.8312, "rewards/accuracies": 0.625, "rewards/chosen": 0.8591504693031311, "rewards/margins": 1.0173981189727783, "rewards/rejected": -0.15824764966964722, "step": 645 }, { "epoch": 0.7923342276734381, "grad_norm": 9.177109718322754, "learning_rate": 9.208588957055214e-07, "logits/chosen": -2.327000856399536, "logits/rejected": -2.3499996662139893, "logps/chosen": -57.14579391479492, "logps/rejected": -62.19219207763672, "loss": 0.9044, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2156064808368683, "rewards/margins": 0.6850378513336182, "rewards/rejected": -0.4694312810897827, "step": 646 }, { "epoch": 0.793560751245688, "grad_norm": 11.935958862304688, "learning_rate": 9.207361963190184e-07, "logits/chosen": -2.273545026779175, "logits/rejected": -2.315734624862671, "logps/chosen": -56.60676574707031, "logps/rejected": -66.57262420654297, "loss": 0.8439, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22770240902900696, "rewards/margins": 1.291101098060608, "rewards/rejected": -1.0633985996246338, "step": 647 }, { "epoch": 0.7947872748179379, "grad_norm": 16.81027603149414, "learning_rate": 9.206134969325154e-07, "logits/chosen": -2.3297150135040283, "logits/rejected": -2.332322597503662, "logps/chosen": -51.223873138427734, "logps/rejected": -64.61618041992188, "loss": 0.8178, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6156766414642334, "rewards/margins": 1.2744529247283936, "rewards/rejected": -0.6587762832641602, "step": 648 }, { "epoch": 0.7960137983901878, "grad_norm": 8.570659637451172, "learning_rate": 9.204907975460122e-07, "logits/chosen": -2.308121681213379, "logits/rejected": -2.3053600788116455, "logps/chosen": -56.63975524902344, "logps/rejected": -68.54670715332031, "loss": 0.7982, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1842343509197235, "rewards/margins": 1.2914600372314453, "rewards/rejected": -1.1072256565093994, "step": 649 }, { "epoch": 0.7972403219624378, "grad_norm": 14.253768920898438, "learning_rate": 9.203680981595091e-07, "logits/chosen": -2.288888931274414, "logits/rejected": -2.3204281330108643, "logps/chosen": -49.050968170166016, "logps/rejected": -61.67433547973633, "loss": 0.7816, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8045152425765991, "rewards/margins": 1.3725816011428833, "rewards/rejected": -0.5680663585662842, "step": 650 }, { "epoch": 0.7984668455346876, "grad_norm": 13.81462287902832, "learning_rate": 9.202453987730061e-07, "logits/chosen": -2.317599296569824, "logits/rejected": -2.3477015495300293, "logps/chosen": -50.36981201171875, "logps/rejected": -60.42034149169922, "loss": 0.8146, "rewards/accuracies": 0.71875, "rewards/chosen": 0.996171772480011, "rewards/margins": 1.3375723361968994, "rewards/rejected": -0.3414003551006317, "step": 651 }, { "epoch": 0.7996933691069376, "grad_norm": 9.512079238891602, "learning_rate": 9.20122699386503e-07, "logits/chosen": -2.3515892028808594, "logits/rejected": -2.349236488342285, "logps/chosen": -59.5420036315918, "logps/rejected": -72.1475601196289, "loss": 0.7943, "rewards/accuracies": 0.625, "rewards/chosen": -0.4420780837535858, "rewards/margins": 1.3437901735305786, "rewards/rejected": -1.7858681678771973, "step": 652 }, { "epoch": 0.7996933691069376, "eval_logits/chosen": -2.3324570655822754, "eval_logits/rejected": -2.350707530975342, "eval_logps/chosen": -66.52630615234375, "eval_logps/rejected": -76.05001831054688, "eval_loss": 0.8434896469116211, "eval_rewards/accuracies": 0.6303854584693909, "eval_rewards/chosen": -0.889494001865387, "eval_rewards/margins": 1.0656503438949585, "eval_rewards/rejected": -1.9551442861557007, "eval_runtime": 1582.337, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.279, "step": 652 }, { "epoch": 0.8009198926791874, "grad_norm": 7.935880184173584, "learning_rate": 9.2e-07, "logits/chosen": -2.245551109313965, "logits/rejected": -2.3176581859588623, "logps/chosen": -60.180904388427734, "logps/rejected": -69.36451721191406, "loss": 0.7994, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10076242685317993, "rewards/margins": 1.239267110824585, "rewards/rejected": -1.1385047435760498, "step": 653 }, { "epoch": 0.8021464162514373, "grad_norm": 10.104310035705566, "learning_rate": 9.198773006134969e-07, "logits/chosen": -2.342941999435425, "logits/rejected": -2.383258819580078, "logps/chosen": -51.622650146484375, "logps/rejected": -71.16558074951172, "loss": 0.7043, "rewards/accuracies": 0.875, "rewards/chosen": 0.602125883102417, "rewards/margins": 2.3484513759613037, "rewards/rejected": -1.7463257312774658, "step": 654 }, { "epoch": 0.8033729398236872, "grad_norm": 9.247900009155273, "learning_rate": 9.197546012269937e-07, "logits/chosen": -2.2995903491973877, "logits/rejected": -2.2969772815704346, "logps/chosen": -59.535400390625, "logps/rejected": -67.57411193847656, "loss": 0.8338, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1230362206697464, "rewards/margins": 1.1259372234344482, "rewards/rejected": -1.2489733695983887, "step": 655 }, { "epoch": 0.8045994633959371, "grad_norm": 9.67653751373291, "learning_rate": 9.196319018404907e-07, "logits/chosen": -2.3247318267822266, "logits/rejected": -2.3865914344787598, "logps/chosen": -58.21672058105469, "logps/rejected": -70.30341339111328, "loss": 0.7492, "rewards/accuracies": 0.59375, "rewards/chosen": 0.16979065537452698, "rewards/margins": 1.6141681671142578, "rewards/rejected": -1.4443774223327637, "step": 656 }, { "epoch": 0.8058259869681871, "grad_norm": 9.164427757263184, "learning_rate": 9.195092024539877e-07, "logits/chosen": -2.361299991607666, "logits/rejected": -2.420964241027832, "logps/chosen": -56.72093200683594, "logps/rejected": -77.4923095703125, "loss": 0.6679, "rewards/accuracies": 0.75, "rewards/chosen": 0.04184906929731369, "rewards/margins": 2.490784168243408, "rewards/rejected": -2.448935031890869, "step": 657 }, { "epoch": 0.8070525105404369, "grad_norm": 8.266878128051758, "learning_rate": 9.193865030674847e-07, "logits/chosen": -2.3537139892578125, "logits/rejected": -2.392237424850464, "logps/chosen": -58.15681838989258, "logps/rejected": -69.86741638183594, "loss": 0.8052, "rewards/accuracies": 0.625, "rewards/chosen": 0.35418501496315, "rewards/margins": 1.4182847738265991, "rewards/rejected": -1.064099907875061, "step": 658 }, { "epoch": 0.8082790341126869, "grad_norm": 11.567789077758789, "learning_rate": 9.192638036809816e-07, "logits/chosen": -2.355281352996826, "logits/rejected": -2.3708550930023193, "logps/chosen": -60.026145935058594, "logps/rejected": -72.51702880859375, "loss": 0.8444, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6385226249694824, "rewards/margins": 1.2536392211914062, "rewards/rejected": -1.8921618461608887, "step": 659 }, { "epoch": 0.8095055576849367, "grad_norm": 7.988635540008545, "learning_rate": 9.191411042944785e-07, "logits/chosen": -2.2689192295074463, "logits/rejected": -2.2988295555114746, "logps/chosen": -69.2240219116211, "logps/rejected": -79.57112121582031, "loss": 0.8196, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8796970844268799, "rewards/margins": 1.278885006904602, "rewards/rejected": -2.1585822105407715, "step": 660 }, { "epoch": 0.8107320812571867, "grad_norm": 10.638254165649414, "learning_rate": 9.190184049079754e-07, "logits/chosen": -2.357827663421631, "logits/rejected": -2.354454755783081, "logps/chosen": -55.55390167236328, "logps/rejected": -71.6357650756836, "loss": 0.6829, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6074456572532654, "rewards/margins": 1.9937398433685303, "rewards/rejected": -1.3862941265106201, "step": 661 }, { "epoch": 0.8119586048294366, "grad_norm": 8.820642471313477, "learning_rate": 9.188957055214723e-07, "logits/chosen": -2.3452768325805664, "logits/rejected": -2.3525283336639404, "logps/chosen": -59.371822357177734, "logps/rejected": -72.44430541992188, "loss": 0.807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06352090835571289, "rewards/margins": 1.542394757270813, "rewards/rejected": -1.6059157848358154, "step": 662 }, { "epoch": 0.8131851284016864, "grad_norm": 9.331399917602539, "learning_rate": 9.187730061349693e-07, "logits/chosen": -2.3274729251861572, "logits/rejected": -2.3472890853881836, "logps/chosen": -61.57952880859375, "logps/rejected": -68.89891815185547, "loss": 0.8974, "rewards/accuracies": 0.5, "rewards/chosen": -0.15721729397773743, "rewards/margins": 0.7139896750450134, "rewards/rejected": -0.8712069988250732, "step": 663 }, { "epoch": 0.8144116519739364, "grad_norm": 13.246220588684082, "learning_rate": 9.186503067484662e-07, "logits/chosen": -2.346414804458618, "logits/rejected": -2.3156962394714355, "logps/chosen": -61.66350555419922, "logps/rejected": -65.44638061523438, "loss": 0.9089, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4287817180156708, "rewards/margins": 0.5043349266052246, "rewards/rejected": -0.933116614818573, "step": 664 }, { "epoch": 0.8156381755461862, "grad_norm": 13.028084754943848, "learning_rate": 9.185276073619631e-07, "logits/chosen": -2.361319065093994, "logits/rejected": -2.3906843662261963, "logps/chosen": -58.87308883666992, "logps/rejected": -74.79901885986328, "loss": 0.7754, "rewards/accuracies": 0.59375, "rewards/chosen": -0.30407875776290894, "rewards/margins": 1.5028196573257446, "rewards/rejected": -1.8068983554840088, "step": 665 }, { "epoch": 0.8168646991184362, "grad_norm": 7.783277988433838, "learning_rate": 9.184049079754601e-07, "logits/chosen": -2.2676913738250732, "logits/rejected": -2.260277032852173, "logps/chosen": -62.00727081298828, "logps/rejected": -71.17218780517578, "loss": 0.8716, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0778605118393898, "rewards/margins": 0.9771950244903564, "rewards/rejected": -1.0550554990768433, "step": 666 }, { "epoch": 0.8180912226906861, "grad_norm": 6.5222578048706055, "learning_rate": 9.18282208588957e-07, "logits/chosen": -2.331296443939209, "logits/rejected": -2.3365015983581543, "logps/chosen": -55.6722526550293, "logps/rejected": -68.10961151123047, "loss": 0.798, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3688492178916931, "rewards/margins": 1.4425398111343384, "rewards/rejected": -1.0736905336380005, "step": 667 }, { "epoch": 0.819317746262936, "grad_norm": 8.224742889404297, "learning_rate": 9.18159509202454e-07, "logits/chosen": -2.3652453422546387, "logits/rejected": -2.4026899337768555, "logps/chosen": -62.09703826904297, "logps/rejected": -75.64712524414062, "loss": 0.7834, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3665889501571655, "rewards/margins": 1.5998753309249878, "rewards/rejected": -1.9664642810821533, "step": 668 }, { "epoch": 0.8205442698351859, "grad_norm": 13.405926704406738, "learning_rate": 9.180368098159509e-07, "logits/chosen": -2.3254141807556152, "logits/rejected": -2.3457133769989014, "logps/chosen": -50.34049606323242, "logps/rejected": -63.522300720214844, "loss": 0.7626, "rewards/accuracies": 0.71875, "rewards/chosen": 1.156459927558899, "rewards/margins": 1.736420750617981, "rewards/rejected": -0.579960823059082, "step": 669 }, { "epoch": 0.8217707934074358, "grad_norm": 9.727025985717773, "learning_rate": 9.179141104294478e-07, "logits/chosen": -2.3545634746551514, "logits/rejected": -2.337174892425537, "logps/chosen": -62.820655822753906, "logps/rejected": -76.49797821044922, "loss": 0.8105, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2618141174316406, "rewards/margins": 1.3496042490005493, "rewards/rejected": -1.6114182472229004, "step": 670 }, { "epoch": 0.8229973169796857, "grad_norm": 9.549312591552734, "learning_rate": 9.177914110429448e-07, "logits/chosen": -2.2900173664093018, "logits/rejected": -2.340515375137329, "logps/chosen": -58.80765914916992, "logps/rejected": -74.0993881225586, "loss": 0.7638, "rewards/accuracies": 0.78125, "rewards/chosen": -0.14064088463783264, "rewards/margins": 1.630147933959961, "rewards/rejected": -1.7707889080047607, "step": 671 }, { "epoch": 0.8242238405519356, "grad_norm": 9.624863624572754, "learning_rate": 9.176687116564417e-07, "logits/chosen": -2.338285207748413, "logits/rejected": -2.327646255493164, "logps/chosen": -55.44289779663086, "logps/rejected": -66.77896118164062, "loss": 0.7961, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4407651722431183, "rewards/margins": 1.256903052330017, "rewards/rejected": -0.8161377906799316, "step": 672 }, { "epoch": 0.8254503641241855, "grad_norm": 8.527990341186523, "learning_rate": 9.175460122699386e-07, "logits/chosen": -2.364675521850586, "logits/rejected": -2.4031949043273926, "logps/chosen": -59.026588439941406, "logps/rejected": -75.5904541015625, "loss": 0.7705, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1009804904460907, "rewards/margins": 1.9053632020950317, "rewards/rejected": -2.0063436031341553, "step": 673 }, { "epoch": 0.8266768876964354, "grad_norm": 14.691767692565918, "learning_rate": 9.174233128834355e-07, "logits/chosen": -2.320371389389038, "logits/rejected": -2.297663927078247, "logps/chosen": -60.47084045410156, "logps/rejected": -81.15025329589844, "loss": 0.6718, "rewards/accuracies": 0.875, "rewards/chosen": -0.06357944011688232, "rewards/margins": 2.249911308288574, "rewards/rejected": -2.313490867614746, "step": 674 }, { "epoch": 0.8279034112686853, "grad_norm": 10.5551176071167, "learning_rate": 9.173006134969324e-07, "logits/chosen": -2.326432704925537, "logits/rejected": -2.367621421813965, "logps/chosen": -63.59779739379883, "logps/rejected": -75.82540130615234, "loss": 0.8192, "rewards/accuracies": 0.625, "rewards/chosen": -0.7845696210861206, "rewards/margins": 0.9785676598548889, "rewards/rejected": -1.7631373405456543, "step": 675 }, { "epoch": 0.8291299348409352, "grad_norm": 10.108527183532715, "learning_rate": 9.171779141104294e-07, "logits/chosen": -2.2656137943267822, "logits/rejected": -2.305607557296753, "logps/chosen": -53.53995132446289, "logps/rejected": -70.40386199951172, "loss": 0.7105, "rewards/accuracies": 0.6875, "rewards/chosen": 0.44142866134643555, "rewards/margins": 1.7380285263061523, "rewards/rejected": -1.2965998649597168, "step": 676 }, { "epoch": 0.8303564584131852, "grad_norm": 7.305680751800537, "learning_rate": 9.170552147239264e-07, "logits/chosen": -2.3045783042907715, "logits/rejected": -2.339338541030884, "logps/chosen": -56.312198638916016, "logps/rejected": -67.6728515625, "loss": 0.7982, "rewards/accuracies": 0.65625, "rewards/chosen": 0.18669281899929047, "rewards/margins": 1.2422897815704346, "rewards/rejected": -1.0555970668792725, "step": 677 }, { "epoch": 0.831582981985435, "grad_norm": 10.628191947937012, "learning_rate": 9.169325153374233e-07, "logits/chosen": -2.3603570461273193, "logits/rejected": -2.384246826171875, "logps/chosen": -69.0494613647461, "logps/rejected": -79.61172485351562, "loss": 0.837, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3604917526245117, "rewards/margins": 1.0412753820419312, "rewards/rejected": -2.4017672538757324, "step": 678 }, { "epoch": 0.832809505557685, "grad_norm": 7.967296123504639, "learning_rate": 9.168098159509202e-07, "logits/chosen": -2.3288686275482178, "logits/rejected": -2.3678152561187744, "logps/chosen": -63.077823638916016, "logps/rejected": -85.35736083984375, "loss": 0.6964, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5120095610618591, "rewards/margins": 2.2989578247070312, "rewards/rejected": -2.810967206954956, "step": 679 }, { "epoch": 0.8340360291299348, "grad_norm": 14.004111289978027, "learning_rate": 9.166871165644171e-07, "logits/chosen": -2.300139904022217, "logits/rejected": -2.3409671783447266, "logps/chosen": -58.78809356689453, "logps/rejected": -73.5428237915039, "loss": 0.8125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.43862712383270264, "rewards/margins": 1.3998923301696777, "rewards/rejected": -1.8385194540023804, "step": 680 }, { "epoch": 0.8352625527021847, "grad_norm": 9.023746490478516, "learning_rate": 9.165644171779141e-07, "logits/chosen": -2.3164591789245605, "logits/rejected": -2.3278820514678955, "logps/chosen": -67.07087707519531, "logps/rejected": -80.67027282714844, "loss": 0.788, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9547200798988342, "rewards/margins": 1.5520657300949097, "rewards/rejected": -2.5067858695983887, "step": 681 }, { "epoch": 0.8364890762744347, "grad_norm": 13.48436164855957, "learning_rate": 9.16441717791411e-07, "logits/chosen": -2.3089404106140137, "logits/rejected": -2.322652578353882, "logps/chosen": -73.64990997314453, "logps/rejected": -90.0468978881836, "loss": 0.7957, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4596253633499146, "rewards/margins": 1.5739610195159912, "rewards/rejected": -3.033586263656616, "step": 682 }, { "epoch": 0.8377155998466845, "grad_norm": 10.410265922546387, "learning_rate": 9.163190184049079e-07, "logits/chosen": -2.3296289443969727, "logits/rejected": -2.3607325553894043, "logps/chosen": -63.1534309387207, "logps/rejected": -80.67068481445312, "loss": 0.7789, "rewards/accuracies": 0.75, "rewards/chosen": -0.5047922730445862, "rewards/margins": 1.8234668970108032, "rewards/rejected": -2.328259229660034, "step": 683 }, { "epoch": 0.8389421234189345, "grad_norm": 8.571969985961914, "learning_rate": 9.161963190184049e-07, "logits/chosen": -2.3215503692626953, "logits/rejected": -2.3543591499328613, "logps/chosen": -65.14662170410156, "logps/rejected": -76.84249114990234, "loss": 0.7945, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6037508845329285, "rewards/margins": 1.4830594062805176, "rewards/rejected": -2.086810350418091, "step": 684 }, { "epoch": 0.8401686469911843, "grad_norm": 13.792708396911621, "learning_rate": 9.160736196319017e-07, "logits/chosen": -2.2816529273986816, "logits/rejected": -2.312856674194336, "logps/chosen": -56.81477355957031, "logps/rejected": -75.33580780029297, "loss": 0.739, "rewards/accuracies": 0.78125, "rewards/chosen": 0.023847848176956177, "rewards/margins": 2.0666415691375732, "rewards/rejected": -2.0427937507629395, "step": 685 }, { "epoch": 0.8413951705634343, "grad_norm": 9.823832511901855, "learning_rate": 9.159509202453987e-07, "logits/chosen": -2.3315229415893555, "logits/rejected": -2.3553099632263184, "logps/chosen": -64.5879135131836, "logps/rejected": -72.19095611572266, "loss": 0.8903, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1202638149261475, "rewards/margins": 0.8255153894424438, "rewards/rejected": -1.9457790851593018, "step": 686 }, { "epoch": 0.8426216941356842, "grad_norm": 10.029295921325684, "learning_rate": 9.158282208588957e-07, "logits/chosen": -2.3377745151519775, "logits/rejected": -2.343043565750122, "logps/chosen": -64.05613708496094, "logps/rejected": -74.95272064208984, "loss": 0.7795, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6258469820022583, "rewards/margins": 1.1708076000213623, "rewards/rejected": -1.796654462814331, "step": 687 }, { "epoch": 0.8438482177079341, "grad_norm": 10.698575973510742, "learning_rate": 9.157055214723926e-07, "logits/chosen": -2.2474451065063477, "logits/rejected": -2.293134927749634, "logps/chosen": -59.265541076660156, "logps/rejected": -76.81659698486328, "loss": 0.7102, "rewards/accuracies": 0.75, "rewards/chosen": 0.06361949443817139, "rewards/margins": 1.866489052772522, "rewards/rejected": -1.8028695583343506, "step": 688 }, { "epoch": 0.845074741280184, "grad_norm": 14.971747398376465, "learning_rate": 9.155828220858896e-07, "logits/chosen": -2.3153462409973145, "logits/rejected": -2.3518624305725098, "logps/chosen": -59.71057891845703, "logps/rejected": -77.99291229248047, "loss": 0.6953, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4103856384754181, "rewards/margins": 1.9152450561523438, "rewards/rejected": -2.3256311416625977, "step": 689 }, { "epoch": 0.8463012648524338, "grad_norm": 11.37988567352295, "learning_rate": 9.154601226993865e-07, "logits/chosen": -2.284646987915039, "logits/rejected": -2.317906141281128, "logps/chosen": -56.3682861328125, "logps/rejected": -74.65428924560547, "loss": 0.6675, "rewards/accuracies": 0.71875, "rewards/chosen": -0.016318194568157196, "rewards/margins": 2.1844418048858643, "rewards/rejected": -2.2007601261138916, "step": 690 }, { "epoch": 0.8475277884246838, "grad_norm": 10.605772018432617, "learning_rate": 9.153374233128834e-07, "logits/chosen": -2.2736973762512207, "logits/rejected": -2.3436293601989746, "logps/chosen": -55.613121032714844, "logps/rejected": -69.29090118408203, "loss": 0.7607, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10005934536457062, "rewards/margins": 1.951346755027771, "rewards/rejected": -1.8512873649597168, "step": 691 }, { "epoch": 0.8487543119969337, "grad_norm": 10.8259859085083, "learning_rate": 9.152147239263803e-07, "logits/chosen": -2.2922797203063965, "logits/rejected": -2.3251540660858154, "logps/chosen": -61.71345138549805, "logps/rejected": -73.17768859863281, "loss": 0.8111, "rewards/accuracies": 0.59375, "rewards/chosen": -0.47549837827682495, "rewards/margins": 1.260640025138855, "rewards/rejected": -1.7361385822296143, "step": 692 }, { "epoch": 0.8499808355691836, "grad_norm": 10.764558792114258, "learning_rate": 9.150920245398772e-07, "logits/chosen": -2.3045079708099365, "logits/rejected": -2.3422904014587402, "logps/chosen": -66.6361312866211, "logps/rejected": -79.66236877441406, "loss": 0.8002, "rewards/accuracies": 0.6875, "rewards/chosen": -0.706358790397644, "rewards/margins": 1.4603681564331055, "rewards/rejected": -2.16672682762146, "step": 693 }, { "epoch": 0.8512073591414335, "grad_norm": 13.751595497131348, "learning_rate": 9.149693251533742e-07, "logits/chosen": -2.3406124114990234, "logits/rejected": -2.3362462520599365, "logps/chosen": -64.75001525878906, "logps/rejected": -78.5385513305664, "loss": 0.8142, "rewards/accuracies": 0.6875, "rewards/chosen": -1.048129916191101, "rewards/margins": 1.2539513111114502, "rewards/rejected": -2.3020811080932617, "step": 694 }, { "epoch": 0.8524338827136834, "grad_norm": 13.781304359436035, "learning_rate": 9.148466257668711e-07, "logits/chosen": -2.270991325378418, "logits/rejected": -2.3376729488372803, "logps/chosen": -59.48948669433594, "logps/rejected": -73.97107696533203, "loss": 0.8084, "rewards/accuracies": 0.75, "rewards/chosen": -0.35606637597084045, "rewards/margins": 1.488366961479187, "rewards/rejected": -1.844433307647705, "step": 695 }, { "epoch": 0.8536604062859333, "grad_norm": 11.496192932128906, "learning_rate": 9.147239263803681e-07, "logits/chosen": -2.310849666595459, "logits/rejected": -2.286806583404541, "logps/chosen": -54.54897689819336, "logps/rejected": -73.6068115234375, "loss": 0.8146, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4909363090991974, "rewards/margins": 1.6663697957992554, "rewards/rejected": -1.1754335165023804, "step": 696 }, { "epoch": 0.8548869298581833, "grad_norm": 8.980607986450195, "learning_rate": 9.146012269938649e-07, "logits/chosen": -2.265573024749756, "logits/rejected": -2.3161027431488037, "logps/chosen": -48.839378356933594, "logps/rejected": -60.452735900878906, "loss": 0.7996, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1909191608428955, "rewards/margins": 1.4424948692321777, "rewards/rejected": -0.2515758275985718, "step": 697 }, { "epoch": 0.8561134534304331, "grad_norm": 11.576333999633789, "learning_rate": 9.144785276073619e-07, "logits/chosen": -2.3208396434783936, "logits/rejected": -2.336918354034424, "logps/chosen": -48.864952087402344, "logps/rejected": -64.31371307373047, "loss": 0.7808, "rewards/accuracies": 0.625, "rewards/chosen": 0.6271963119506836, "rewards/margins": 1.5646122694015503, "rewards/rejected": -0.9374160766601562, "step": 698 }, { "epoch": 0.857339977002683, "grad_norm": 9.86825180053711, "learning_rate": 9.143558282208589e-07, "logits/chosen": -2.3259811401367188, "logits/rejected": -2.3451180458068848, "logps/chosen": -60.317039489746094, "logps/rejected": -67.65389251708984, "loss": 0.9224, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2248253971338272, "rewards/margins": 0.6106698513031006, "rewards/rejected": -0.8354954123497009, "step": 699 }, { "epoch": 0.8585665005749329, "grad_norm": 21.551706314086914, "learning_rate": 9.142331288343558e-07, "logits/chosen": -2.297441005706787, "logits/rejected": -2.305910110473633, "logps/chosen": -56.20954513549805, "logps/rejected": -62.5340576171875, "loss": 0.8289, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0374508872628212, "rewards/margins": 1.136623740196228, "rewards/rejected": -1.1740747690200806, "step": 700 }, { "epoch": 0.8597930241471828, "grad_norm": 11.656436920166016, "learning_rate": 9.141104294478528e-07, "logits/chosen": -2.301535129547119, "logits/rejected": -2.3291776180267334, "logps/chosen": -61.98308563232422, "logps/rejected": -66.45418548583984, "loss": 0.9348, "rewards/accuracies": 0.625, "rewards/chosen": -0.3453504741191864, "rewards/margins": 0.39318668842315674, "rewards/rejected": -0.7385370135307312, "step": 701 }, { "epoch": 0.8610195477194328, "grad_norm": 13.653353691101074, "learning_rate": 9.139877300613497e-07, "logits/chosen": -2.2792248725891113, "logits/rejected": -2.3375115394592285, "logps/chosen": -47.96044921875, "logps/rejected": -67.19918823242188, "loss": 0.6923, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1441549062728882, "rewards/margins": 1.9820733070373535, "rewards/rejected": -0.8379185199737549, "step": 702 }, { "epoch": 0.8622460712916826, "grad_norm": 10.435065269470215, "learning_rate": 9.138650306748465e-07, "logits/chosen": -2.269679307937622, "logits/rejected": -2.3595504760742188, "logps/chosen": -51.500953674316406, "logps/rejected": -64.60509490966797, "loss": 0.7785, "rewards/accuracies": 0.75, "rewards/chosen": 0.6834092140197754, "rewards/margins": 1.6113933324813843, "rewards/rejected": -0.9279839992523193, "step": 703 }, { "epoch": 0.8634725948639326, "grad_norm": 7.1248698234558105, "learning_rate": 9.137423312883435e-07, "logits/chosen": -2.3062572479248047, "logits/rejected": -2.3261160850524902, "logps/chosen": -52.59947967529297, "logps/rejected": -66.35884094238281, "loss": 0.7901, "rewards/accuracies": 0.65625, "rewards/chosen": 0.21269404888153076, "rewards/margins": 1.328313946723938, "rewards/rejected": -1.1156200170516968, "step": 704 }, { "epoch": 0.8646991184361824, "grad_norm": 9.11876392364502, "learning_rate": 9.136196319018404e-07, "logits/chosen": -2.312817096710205, "logits/rejected": -2.3849947452545166, "logps/chosen": -47.312381744384766, "logps/rejected": -65.68283081054688, "loss": 0.7448, "rewards/accuracies": 0.75, "rewards/chosen": 0.9207842946052551, "rewards/margins": 1.77652907371521, "rewards/rejected": -0.8557448983192444, "step": 705 }, { "epoch": 0.8659256420084324, "grad_norm": 12.090575218200684, "learning_rate": 9.134969325153374e-07, "logits/chosen": -2.3620219230651855, "logits/rejected": -2.3644819259643555, "logps/chosen": -54.60433578491211, "logps/rejected": -67.56256866455078, "loss": 0.7638, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7085028290748596, "rewards/margins": 1.5781995058059692, "rewards/rejected": -0.8696965575218201, "step": 706 }, { "epoch": 0.8671521655806822, "grad_norm": 10.156705856323242, "learning_rate": 9.133742331288343e-07, "logits/chosen": -2.3268544673919678, "logits/rejected": -2.3229057788848877, "logps/chosen": -62.92507553100586, "logps/rejected": -71.60334014892578, "loss": 0.8359, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3999979794025421, "rewards/margins": 1.0236289501190186, "rewards/rejected": -1.4236267805099487, "step": 707 }, { "epoch": 0.8683786891529321, "grad_norm": 10.420570373535156, "learning_rate": 9.132515337423313e-07, "logits/chosen": -2.299114465713501, "logits/rejected": -2.3477935791015625, "logps/chosen": -60.78331756591797, "logps/rejected": -72.19308471679688, "loss": 0.7834, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46073323488235474, "rewards/margins": 1.5286749601364136, "rewards/rejected": -1.989408254623413, "step": 708 }, { "epoch": 0.8696052127251821, "grad_norm": 10.5252103805542, "learning_rate": 9.131288343558282e-07, "logits/chosen": -2.3269667625427246, "logits/rejected": -2.347744941711426, "logps/chosen": -55.975257873535156, "logps/rejected": -71.75910949707031, "loss": 0.7365, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06929029524326324, "rewards/margins": 1.7891627550125122, "rewards/rejected": -1.7198723554611206, "step": 709 }, { "epoch": 0.8708317362974319, "grad_norm": 9.494606018066406, "learning_rate": 9.130061349693251e-07, "logits/chosen": -2.307861089706421, "logits/rejected": -2.344870090484619, "logps/chosen": -59.635135650634766, "logps/rejected": -72.28292846679688, "loss": 0.7788, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32084256410598755, "rewards/margins": 1.340873122215271, "rewards/rejected": -1.6617157459259033, "step": 710 }, { "epoch": 0.8720582598696819, "grad_norm": 11.173823356628418, "learning_rate": 9.12883435582822e-07, "logits/chosen": -2.2649741172790527, "logits/rejected": -2.2688276767730713, "logps/chosen": -67.1307373046875, "logps/rejected": -76.18175506591797, "loss": 0.8534, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9996108412742615, "rewards/margins": 0.8008675575256348, "rewards/rejected": -1.8004783391952515, "step": 711 }, { "epoch": 0.8732847834419317, "grad_norm": 10.336365699768066, "learning_rate": 9.12760736196319e-07, "logits/chosen": -2.295780897140503, "logits/rejected": -2.3434691429138184, "logps/chosen": -59.70408630371094, "logps/rejected": -78.51313781738281, "loss": 0.7397, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2186414897441864, "rewards/margins": 1.922384262084961, "rewards/rejected": -2.1410255432128906, "step": 712 }, { "epoch": 0.8745113070141817, "grad_norm": 9.00507926940918, "learning_rate": 9.126380368098159e-07, "logits/chosen": -2.361680030822754, "logits/rejected": -2.338592052459717, "logps/chosen": -63.39106750488281, "logps/rejected": -82.07675170898438, "loss": 0.7959, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7654085755348206, "rewards/margins": 1.7298614978790283, "rewards/rejected": -2.495270252227783, "step": 713 }, { "epoch": 0.8757378305864316, "grad_norm": 11.08970832824707, "learning_rate": 9.125153374233129e-07, "logits/chosen": -2.286147356033325, "logits/rejected": -2.3423407077789307, "logps/chosen": -55.46036148071289, "logps/rejected": -77.00863647460938, "loss": 0.7112, "rewards/accuracies": 0.75, "rewards/chosen": 0.14290344715118408, "rewards/margins": 2.140028953552246, "rewards/rejected": -1.9971256256103516, "step": 714 }, { "epoch": 0.8769643541586815, "grad_norm": 7.086929798126221, "learning_rate": 9.123926380368097e-07, "logits/chosen": -2.368665933609009, "logits/rejected": -2.3442869186401367, "logps/chosen": -64.54269409179688, "logps/rejected": -70.31712341308594, "loss": 0.911, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6941922903060913, "rewards/margins": 0.7551091909408569, "rewards/rejected": -1.4493014812469482, "step": 715 }, { "epoch": 0.8781908777309314, "grad_norm": 12.788636207580566, "learning_rate": 9.122699386503066e-07, "logits/chosen": -2.3675642013549805, "logits/rejected": -2.339548110961914, "logps/chosen": -58.250972747802734, "logps/rejected": -79.6556396484375, "loss": 0.7339, "rewards/accuracies": 0.71875, "rewards/chosen": -0.028475582599639893, "rewards/margins": 2.1960368156433105, "rewards/rejected": -2.2245123386383057, "step": 716 }, { "epoch": 0.8794174013031812, "grad_norm": 6.501903533935547, "learning_rate": 9.121472392638036e-07, "logits/chosen": -2.296774387359619, "logits/rejected": -2.3105099201202393, "logps/chosen": -71.44734954833984, "logps/rejected": -84.30469512939453, "loss": 0.7861, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3967498540878296, "rewards/margins": 1.4726066589355469, "rewards/rejected": -2.869356632232666, "step": 717 }, { "epoch": 0.8806439248754312, "grad_norm": 17.958892822265625, "learning_rate": 9.120245398773006e-07, "logits/chosen": -2.241018295288086, "logits/rejected": -2.325758934020996, "logps/chosen": -59.325870513916016, "logps/rejected": -74.18983459472656, "loss": 0.8692, "rewards/accuracies": 0.71875, "rewards/chosen": -0.29590511322021484, "rewards/margins": 1.4323290586471558, "rewards/rejected": -1.728234052658081, "step": 718 }, { "epoch": 0.8818704484476811, "grad_norm": 8.928686141967773, "learning_rate": 9.119018404907976e-07, "logits/chosen": -2.270104169845581, "logits/rejected": -2.279447078704834, "logps/chosen": -60.831260681152344, "logps/rejected": -78.3927993774414, "loss": 0.6973, "rewards/accuracies": 0.75, "rewards/chosen": -0.09286296367645264, "rewards/margins": 1.947009801864624, "rewards/rejected": -2.039872646331787, "step": 719 }, { "epoch": 0.883096972019931, "grad_norm": 15.369510650634766, "learning_rate": 9.117791411042945e-07, "logits/chosen": -2.380671501159668, "logits/rejected": -2.413609027862549, "logps/chosen": -65.94934844970703, "logps/rejected": -74.60470581054688, "loss": 0.8532, "rewards/accuracies": 0.625, "rewards/chosen": -1.0490599870681763, "rewards/margins": 0.9093424081802368, "rewards/rejected": -1.958402395248413, "step": 720 }, { "epoch": 0.8843234955921809, "grad_norm": 8.66001033782959, "learning_rate": 9.116564417177913e-07, "logits/chosen": -2.270859718322754, "logits/rejected": -2.3015365600585938, "logps/chosen": -64.42475128173828, "logps/rejected": -81.15049743652344, "loss": 0.7775, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5835793614387512, "rewards/margins": 1.5086205005645752, "rewards/rejected": -2.0921998023986816, "step": 721 }, { "epoch": 0.8855500191644308, "grad_norm": 9.589869499206543, "learning_rate": 9.115337423312883e-07, "logits/chosen": -2.378425121307373, "logits/rejected": -2.4081177711486816, "logps/chosen": -61.30906295776367, "logps/rejected": -73.06535339355469, "loss": 0.8099, "rewards/accuracies": 0.75, "rewards/chosen": -0.4025610089302063, "rewards/margins": 1.3340879678726196, "rewards/rejected": -1.7366489171981812, "step": 722 }, { "epoch": 0.8867765427366807, "grad_norm": 9.504730224609375, "learning_rate": 9.114110429447852e-07, "logits/chosen": -2.276024103164673, "logits/rejected": -2.3315563201904297, "logps/chosen": -61.44717788696289, "logps/rejected": -75.08296966552734, "loss": 0.8271, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2499653846025467, "rewards/margins": 1.4858808517456055, "rewards/rejected": -1.7358462810516357, "step": 723 }, { "epoch": 0.8880030663089307, "grad_norm": 10.034958839416504, "learning_rate": 9.112883435582822e-07, "logits/chosen": -2.296215534210205, "logits/rejected": -2.3126652240753174, "logps/chosen": -69.22993469238281, "logps/rejected": -82.36970520019531, "loss": 0.8284, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1011055707931519, "rewards/margins": 1.3188526630401611, "rewards/rejected": -2.4199578762054443, "step": 724 }, { "epoch": 0.8892295898811805, "grad_norm": 9.814302444458008, "learning_rate": 9.111656441717791e-07, "logits/chosen": -2.316377878189087, "logits/rejected": -2.351818084716797, "logps/chosen": -65.67301940917969, "logps/rejected": -77.27143859863281, "loss": 0.791, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4489126205444336, "rewards/margins": 1.3703700304031372, "rewards/rejected": -1.8192826509475708, "step": 725 }, { "epoch": 0.8904561134534305, "grad_norm": 7.65138578414917, "learning_rate": 9.11042944785276e-07, "logits/chosen": -2.3018131256103516, "logits/rejected": -2.382079601287842, "logps/chosen": -71.636962890625, "logps/rejected": -78.0294189453125, "loss": 0.8441, "rewards/accuracies": 0.625, "rewards/chosen": -1.3524165153503418, "rewards/margins": 0.9212237000465393, "rewards/rejected": -2.2736401557922363, "step": 726 }, { "epoch": 0.8916826370256803, "grad_norm": 11.024555206298828, "learning_rate": 9.109202453987729e-07, "logits/chosen": -2.408489227294922, "logits/rejected": -2.4048471450805664, "logps/chosen": -69.4646224975586, "logps/rejected": -80.96134948730469, "loss": 0.899, "rewards/accuracies": 0.71875, "rewards/chosen": -1.369322419166565, "rewards/margins": 1.0668628215789795, "rewards/rejected": -2.436185121536255, "step": 727 }, { "epoch": 0.8929091605979302, "grad_norm": 12.496420860290527, "learning_rate": 9.107975460122699e-07, "logits/chosen": -2.3277428150177, "logits/rejected": -2.3894622325897217, "logps/chosen": -69.44269561767578, "logps/rejected": -80.91560363769531, "loss": 0.8358, "rewards/accuracies": 0.625, "rewards/chosen": -1.4259084463119507, "rewards/margins": 1.2707183361053467, "rewards/rejected": -2.696626663208008, "step": 728 }, { "epoch": 0.8941356841701802, "grad_norm": 9.671626091003418, "learning_rate": 9.106748466257669e-07, "logits/chosen": -2.305280923843384, "logits/rejected": -2.3667705059051514, "logps/chosen": -66.1238021850586, "logps/rejected": -77.69881439208984, "loss": 0.8075, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5886647701263428, "rewards/margins": 1.2933601140975952, "rewards/rejected": -1.8820250034332275, "step": 729 }, { "epoch": 0.89536220774243, "grad_norm": 8.43654727935791, "learning_rate": 9.105521472392638e-07, "logits/chosen": -2.3553497791290283, "logits/rejected": -2.4036736488342285, "logps/chosen": -69.24310302734375, "logps/rejected": -85.69400024414062, "loss": 0.749, "rewards/accuracies": 0.625, "rewards/chosen": -1.1425443887710571, "rewards/margins": 1.7914124727249146, "rewards/rejected": -2.9339568614959717, "step": 730 }, { "epoch": 0.89658873131468, "grad_norm": 12.371342658996582, "learning_rate": 9.104294478527607e-07, "logits/chosen": -2.3697526454925537, "logits/rejected": -2.385972261428833, "logps/chosen": -68.31834411621094, "logps/rejected": -82.81649780273438, "loss": 0.8256, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7599461078643799, "rewards/margins": 1.652598261833191, "rewards/rejected": -2.4125444889068604, "step": 731 }, { "epoch": 0.8978152548869298, "grad_norm": 10.83505916595459, "learning_rate": 9.103067484662577e-07, "logits/chosen": -2.3310909271240234, "logits/rejected": -2.3230018615722656, "logps/chosen": -75.28482055664062, "logps/rejected": -78.33267211914062, "loss": 0.9179, "rewards/accuracies": 0.46875, "rewards/chosen": -2.169229507446289, "rewards/margins": 0.011897042393684387, "rewards/rejected": -2.181126594543457, "step": 732 }, { "epoch": 0.8990417784591798, "grad_norm": 9.10014533996582, "learning_rate": 9.101840490797545e-07, "logits/chosen": -2.290858745574951, "logits/rejected": -2.316220998764038, "logps/chosen": -57.51649475097656, "logps/rejected": -78.10677337646484, "loss": 0.7011, "rewards/accuracies": 0.625, "rewards/chosen": -0.18609276413917542, "rewards/margins": 2.1448662281036377, "rewards/rejected": -2.3309590816497803, "step": 733 }, { "epoch": 0.9002683020314297, "grad_norm": 15.017547607421875, "learning_rate": 9.100613496932515e-07, "logits/chosen": -2.260244131088257, "logits/rejected": -2.305507183074951, "logps/chosen": -47.628700256347656, "logps/rejected": -65.74173736572266, "loss": 0.7322, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8648556470870972, "rewards/margins": 2.011693000793457, "rewards/rejected": -1.1468372344970703, "step": 734 }, { "epoch": 0.9014948256036796, "grad_norm": 12.382323265075684, "learning_rate": 9.099386503067484e-07, "logits/chosen": -2.316304922103882, "logits/rejected": -2.3339784145355225, "logps/chosen": -60.274742126464844, "logps/rejected": -75.65333557128906, "loss": 0.8174, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6622899770736694, "rewards/margins": 1.5029091835021973, "rewards/rejected": -2.1651992797851562, "step": 735 }, { "epoch": 0.9027213491759295, "grad_norm": 10.360733985900879, "learning_rate": 9.098159509202453e-07, "logits/chosen": -2.249326467514038, "logits/rejected": -2.2792341709136963, "logps/chosen": -60.169891357421875, "logps/rejected": -78.90335083007812, "loss": 0.7869, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6172305941581726, "rewards/margins": 1.8192617893218994, "rewards/rejected": -2.436492443084717, "step": 736 }, { "epoch": 0.9039478727481793, "grad_norm": 11.223026275634766, "learning_rate": 9.096932515337423e-07, "logits/chosen": -2.3832902908325195, "logits/rejected": -2.4318807125091553, "logps/chosen": -58.6498908996582, "logps/rejected": -71.6484146118164, "loss": 0.7304, "rewards/accuracies": 0.78125, "rewards/chosen": -0.13463807106018066, "rewards/margins": 1.5646555423736572, "rewards/rejected": -1.699293613433838, "step": 737 }, { "epoch": 0.9051743963204293, "grad_norm": 12.742450714111328, "learning_rate": 9.095705521472393e-07, "logits/chosen": -2.356459617614746, "logits/rejected": -2.408928394317627, "logps/chosen": -58.39771270751953, "logps/rejected": -80.47499084472656, "loss": 0.6881, "rewards/accuracies": 0.75, "rewards/chosen": -0.3753722310066223, "rewards/margins": 2.1750636100769043, "rewards/rejected": -2.550435781478882, "step": 738 }, { "epoch": 0.9064009198926792, "grad_norm": 10.624967575073242, "learning_rate": 9.094478527607361e-07, "logits/chosen": -2.353365898132324, "logits/rejected": -2.369328022003174, "logps/chosen": -56.018375396728516, "logps/rejected": -78.39376831054688, "loss": 0.7063, "rewards/accuracies": 0.875, "rewards/chosen": 0.2498396784067154, "rewards/margins": 2.592072010040283, "rewards/rejected": -2.3422319889068604, "step": 739 }, { "epoch": 0.9076274434649291, "grad_norm": 8.746869087219238, "learning_rate": 9.093251533742331e-07, "logits/chosen": -2.3284382820129395, "logits/rejected": -2.363675832748413, "logps/chosen": -60.679649353027344, "logps/rejected": -79.2003173828125, "loss": 0.7445, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26891636848449707, "rewards/margins": 2.001103401184082, "rewards/rejected": -2.270019769668579, "step": 740 }, { "epoch": 0.908853967037179, "grad_norm": 20.159706115722656, "learning_rate": 9.0920245398773e-07, "logits/chosen": -2.372281074523926, "logits/rejected": -2.3754098415374756, "logps/chosen": -69.38038635253906, "logps/rejected": -74.97349548339844, "loss": 0.903, "rewards/accuracies": 0.59375, "rewards/chosen": -1.053619146347046, "rewards/margins": 0.5106468200683594, "rewards/rejected": -1.5642659664154053, "step": 741 }, { "epoch": 0.9100804906094289, "grad_norm": 13.480059623718262, "learning_rate": 9.09079754601227e-07, "logits/chosen": -2.415121555328369, "logits/rejected": -2.4519224166870117, "logps/chosen": -65.62371826171875, "logps/rejected": -77.20606994628906, "loss": 0.8015, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8546415567398071, "rewards/margins": 1.3754794597625732, "rewards/rejected": -2.230120897293091, "step": 742 }, { "epoch": 0.9113070141816788, "grad_norm": 8.489531517028809, "learning_rate": 9.089570552147239e-07, "logits/chosen": -2.3454434871673584, "logits/rejected": -2.3522932529449463, "logps/chosen": -69.520751953125, "logps/rejected": -74.14189910888672, "loss": 0.959, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6312227249145508, "rewards/margins": 0.2881169319152832, "rewards/rejected": -1.9193397760391235, "step": 743 }, { "epoch": 0.9125335377539288, "grad_norm": 10.224541664123535, "learning_rate": 9.088343558282208e-07, "logits/chosen": -2.3359575271606445, "logits/rejected": -2.3561456203460693, "logps/chosen": -60.797855377197266, "logps/rejected": -73.16397857666016, "loss": 0.8058, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42618051171302795, "rewards/margins": 1.4103715419769287, "rewards/rejected": -1.8365520238876343, "step": 744 }, { "epoch": 0.9137600613261786, "grad_norm": 10.91353988647461, "learning_rate": 9.087116564417177e-07, "logits/chosen": -2.3587684631347656, "logits/rejected": -2.385871171951294, "logps/chosen": -59.356876373291016, "logps/rejected": -70.5888671875, "loss": 0.8074, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1955077350139618, "rewards/margins": 1.1078457832336426, "rewards/rejected": -1.3033535480499268, "step": 745 }, { "epoch": 0.9149865848984285, "grad_norm": 8.696556091308594, "learning_rate": 9.085889570552146e-07, "logits/chosen": -2.3367767333984375, "logits/rejected": -2.3575711250305176, "logps/chosen": -65.71466827392578, "logps/rejected": -80.75625610351562, "loss": 0.7962, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7565900087356567, "rewards/margins": 1.5387977361679077, "rewards/rejected": -2.2953877449035645, "step": 746 }, { "epoch": 0.9162131084706784, "grad_norm": 8.612192153930664, "learning_rate": 9.084662576687116e-07, "logits/chosen": -2.275575637817383, "logits/rejected": -2.321540594100952, "logps/chosen": -51.45393371582031, "logps/rejected": -68.800048828125, "loss": 0.7499, "rewards/accuracies": 0.65625, "rewards/chosen": 0.42221546173095703, "rewards/margins": 1.8348344564437866, "rewards/rejected": -1.4126191139221191, "step": 747 }, { "epoch": 0.9174396320429283, "grad_norm": 10.837723731994629, "learning_rate": 9.083435582822086e-07, "logits/chosen": -2.37855863571167, "logits/rejected": -2.3894591331481934, "logps/chosen": -65.48775482177734, "logps/rejected": -81.79073333740234, "loss": 0.7704, "rewards/accuracies": 0.625, "rewards/chosen": -0.8559983372688293, "rewards/margins": 1.5441632270812988, "rewards/rejected": -2.4001617431640625, "step": 748 }, { "epoch": 0.9186661556151783, "grad_norm": 8.538366317749023, "learning_rate": 9.082208588957055e-07, "logits/chosen": -2.3740365505218506, "logits/rejected": -2.388507604598999, "logps/chosen": -64.95053100585938, "logps/rejected": -75.89096069335938, "loss": 0.8407, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7071806192398071, "rewards/margins": 0.998566210269928, "rewards/rejected": -1.7057468891143799, "step": 749 }, { "epoch": 0.9198926791874281, "grad_norm": 13.889049530029297, "learning_rate": 9.080981595092025e-07, "logits/chosen": -2.239367723464966, "logits/rejected": -2.2640295028686523, "logps/chosen": -60.125877380371094, "logps/rejected": -75.65338134765625, "loss": 0.7758, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1920228749513626, "rewards/margins": 1.7307740449905396, "rewards/rejected": -1.9227967262268066, "step": 750 }, { "epoch": 0.9211192027596781, "grad_norm": 12.196105003356934, "learning_rate": 9.079754601226993e-07, "logits/chosen": -2.399749517440796, "logits/rejected": -2.441824436187744, "logps/chosen": -66.96810150146484, "logps/rejected": -81.9935302734375, "loss": 0.7465, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9475656151771545, "rewards/margins": 1.6566448211669922, "rewards/rejected": -2.604210615158081, "step": 751 }, { "epoch": 0.9223457263319279, "grad_norm": 11.199895858764648, "learning_rate": 9.078527607361963e-07, "logits/chosen": -2.3608362674713135, "logits/rejected": -2.3709664344787598, "logps/chosen": -60.762508392333984, "logps/rejected": -75.62083435058594, "loss": 0.7917, "rewards/accuracies": 0.625, "rewards/chosen": 0.029484480619430542, "rewards/margins": 1.6456962823867798, "rewards/rejected": -1.6162116527557373, "step": 752 }, { "epoch": 0.9235722499041779, "grad_norm": 18.097686767578125, "learning_rate": 9.077300613496932e-07, "logits/chosen": -2.334876537322998, "logits/rejected": -2.3175837993621826, "logps/chosen": -63.76424026489258, "logps/rejected": -72.82670593261719, "loss": 0.8878, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7179352641105652, "rewards/margins": 1.0711995363235474, "rewards/rejected": -1.7891349792480469, "step": 753 }, { "epoch": 0.9247987734764278, "grad_norm": 11.954986572265625, "learning_rate": 9.076073619631901e-07, "logits/chosen": -2.3646535873413086, "logits/rejected": -2.365323066711426, "logps/chosen": -62.370094299316406, "logps/rejected": -73.98262786865234, "loss": 0.7959, "rewards/accuracies": 0.75, "rewards/chosen": -0.4507000744342804, "rewards/margins": 1.3001272678375244, "rewards/rejected": -1.7508275508880615, "step": 754 }, { "epoch": 0.9260252970486776, "grad_norm": 8.335708618164062, "learning_rate": 9.074846625766871e-07, "logits/chosen": -2.3123817443847656, "logits/rejected": -2.324650764465332, "logps/chosen": -58.69358825683594, "logps/rejected": -77.26524353027344, "loss": 0.7369, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30385279655456543, "rewards/margins": 2.081165313720703, "rewards/rejected": -2.3850181102752686, "step": 755 }, { "epoch": 0.9272518206209276, "grad_norm": 9.895817756652832, "learning_rate": 9.07361963190184e-07, "logits/chosen": -2.2566823959350586, "logits/rejected": -2.286426544189453, "logps/chosen": -56.908447265625, "logps/rejected": -70.65885925292969, "loss": 0.8221, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21259000897407532, "rewards/margins": 1.6865596771240234, "rewards/rejected": -1.4739696979522705, "step": 756 }, { "epoch": 0.9284783441931774, "grad_norm": 8.405856132507324, "learning_rate": 9.072392638036809e-07, "logits/chosen": -2.315112352371216, "logits/rejected": -2.351963996887207, "logps/chosen": -57.969181060791016, "logps/rejected": -78.91305541992188, "loss": 0.7181, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1907920092344284, "rewards/margins": 2.335632562637329, "rewards/rejected": -2.1448404788970947, "step": 757 }, { "epoch": 0.9297048677654274, "grad_norm": 7.407548904418945, "learning_rate": 9.071165644171778e-07, "logits/chosen": -2.2973294258117676, "logits/rejected": -2.34576678276062, "logps/chosen": -60.38761520385742, "logps/rejected": -69.013427734375, "loss": 0.8981, "rewards/accuracies": 0.625, "rewards/chosen": -0.5154203176498413, "rewards/margins": 0.652461588382721, "rewards/rejected": -1.1678818464279175, "step": 758 }, { "epoch": 0.9309313913376772, "grad_norm": 11.808429718017578, "learning_rate": 9.069938650306748e-07, "logits/chosen": -2.3065192699432373, "logits/rejected": -2.3361763954162598, "logps/chosen": -55.363773345947266, "logps/rejected": -77.1197509765625, "loss": 0.6913, "rewards/accuracies": 0.75, "rewards/chosen": 0.050882965326309204, "rewards/margins": 2.23587965965271, "rewards/rejected": -2.184997081756592, "step": 759 }, { "epoch": 0.9321579149099272, "grad_norm": 10.664855003356934, "learning_rate": 9.068711656441718e-07, "logits/chosen": -2.322208881378174, "logits/rejected": -2.306084156036377, "logps/chosen": -71.1731185913086, "logps/rejected": -87.74295806884766, "loss": 0.8032, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7957788705825806, "rewards/margins": 1.362748622894287, "rewards/rejected": -3.1585276126861572, "step": 760 }, { "epoch": 0.9333844384821771, "grad_norm": 12.454619407653809, "learning_rate": 9.067484662576687e-07, "logits/chosen": -2.314021110534668, "logits/rejected": -2.3380627632141113, "logps/chosen": -55.83209991455078, "logps/rejected": -71.0593490600586, "loss": 0.7467, "rewards/accuracies": 0.75, "rewards/chosen": 0.20418056845664978, "rewards/margins": 1.6838805675506592, "rewards/rejected": -1.4796998500823975, "step": 761 }, { "epoch": 0.934610962054427, "grad_norm": 9.874329566955566, "learning_rate": 9.066257668711657e-07, "logits/chosen": -2.3411989212036133, "logits/rejected": -2.3752241134643555, "logps/chosen": -57.216182708740234, "logps/rejected": -76.94351196289062, "loss": 0.6914, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04637928307056427, "rewards/margins": 2.179558753967285, "rewards/rejected": -2.133179187774658, "step": 762 }, { "epoch": 0.9358374856266769, "grad_norm": 13.662768363952637, "learning_rate": 9.065030674846625e-07, "logits/chosen": -2.3662283420562744, "logits/rejected": -2.405724287033081, "logps/chosen": -66.26388549804688, "logps/rejected": -80.18268585205078, "loss": 0.7609, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7277179956436157, "rewards/margins": 1.62358820438385, "rewards/rejected": -2.3513059616088867, "step": 763 }, { "epoch": 0.9370640091989267, "grad_norm": 15.576281547546387, "learning_rate": 9.063803680981594e-07, "logits/chosen": -2.2858328819274902, "logits/rejected": -2.353194236755371, "logps/chosen": -58.758480072021484, "logps/rejected": -75.209228515625, "loss": 0.7302, "rewards/accuracies": 0.78125, "rewards/chosen": -0.02344241738319397, "rewards/margins": 2.0252861976623535, "rewards/rejected": -2.0487287044525146, "step": 764 }, { "epoch": 0.9382905327711767, "grad_norm": 10.880400657653809, "learning_rate": 9.062576687116564e-07, "logits/chosen": -2.32763409614563, "logits/rejected": -2.335951805114746, "logps/chosen": -57.62969207763672, "logps/rejected": -74.20957946777344, "loss": 0.7458, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16682833433151245, "rewards/margins": 1.8015650510787964, "rewards/rejected": -1.968393325805664, "step": 765 }, { "epoch": 0.9395170563434266, "grad_norm": 9.16293716430664, "learning_rate": 9.061349693251533e-07, "logits/chosen": -2.3818085193634033, "logits/rejected": -2.3474180698394775, "logps/chosen": -61.386268615722656, "logps/rejected": -77.27691650390625, "loss": 0.7245, "rewards/accuracies": 0.625, "rewards/chosen": -0.2213079333305359, "rewards/margins": 1.8693524599075317, "rewards/rejected": -2.090660333633423, "step": 766 }, { "epoch": 0.9407435799156765, "grad_norm": 12.017632484436035, "learning_rate": 9.060122699386503e-07, "logits/chosen": -2.289222002029419, "logits/rejected": -2.300865888595581, "logps/chosen": -59.18625259399414, "logps/rejected": -74.3079833984375, "loss": 0.7894, "rewards/accuracies": 0.625, "rewards/chosen": -0.018375694751739502, "rewards/margins": 1.7154080867767334, "rewards/rejected": -1.7337837219238281, "step": 767 }, { "epoch": 0.9419701034879264, "grad_norm": 11.563502311706543, "learning_rate": 9.058895705521472e-07, "logits/chosen": -2.36908221244812, "logits/rejected": -2.374403476715088, "logps/chosen": -59.02870559692383, "logps/rejected": -70.79082489013672, "loss": 0.8098, "rewards/accuracies": 0.75, "rewards/chosen": -0.256335586309433, "rewards/margins": 1.5048751831054688, "rewards/rejected": -1.7612106800079346, "step": 768 }, { "epoch": 0.9431966270601763, "grad_norm": 14.290942192077637, "learning_rate": 9.057668711656441e-07, "logits/chosen": -2.3714113235473633, "logits/rejected": -2.383723020553589, "logps/chosen": -55.97398376464844, "logps/rejected": -71.98214721679688, "loss": 0.7276, "rewards/accuracies": 0.8125, "rewards/chosen": 0.414126992225647, "rewards/margins": 2.0440125465393066, "rewards/rejected": -1.6298856735229492, "step": 769 }, { "epoch": 0.9444231506324262, "grad_norm": 11.370984077453613, "learning_rate": 9.056441717791411e-07, "logits/chosen": -2.3045690059661865, "logits/rejected": -2.352675676345825, "logps/chosen": -62.694847106933594, "logps/rejected": -75.15155792236328, "loss": 0.8003, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4176429212093353, "rewards/margins": 1.4615519046783447, "rewards/rejected": -1.879194974899292, "step": 770 }, { "epoch": 0.9456496742046762, "grad_norm": 11.834383964538574, "learning_rate": 9.05521472392638e-07, "logits/chosen": -2.363884449005127, "logits/rejected": -2.3536972999572754, "logps/chosen": -59.63422393798828, "logps/rejected": -69.97554779052734, "loss": 0.826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10614447295665741, "rewards/margins": 1.0781512260437012, "rewards/rejected": -1.1842957735061646, "step": 771 }, { "epoch": 0.946876197776926, "grad_norm": 10.317729949951172, "learning_rate": 9.053987730061349e-07, "logits/chosen": -2.3566012382507324, "logits/rejected": -2.3487660884857178, "logps/chosen": -62.17597198486328, "logps/rejected": -81.35395812988281, "loss": 0.7382, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7061350345611572, "rewards/margins": 1.6009258031845093, "rewards/rejected": -2.307060718536377, "step": 772 }, { "epoch": 0.948102721349176, "grad_norm": 15.760668754577637, "learning_rate": 9.052760736196319e-07, "logits/chosen": -2.3871355056762695, "logits/rejected": -2.3808488845825195, "logps/chosen": -68.457275390625, "logps/rejected": -70.58596801757812, "loss": 0.9263, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9798476696014404, "rewards/margins": 0.36094728112220764, "rewards/rejected": -1.3407950401306152, "step": 773 }, { "epoch": 0.9493292449214258, "grad_norm": 17.174983978271484, "learning_rate": 9.051533742331288e-07, "logits/chosen": -2.3449504375457764, "logits/rejected": -2.35842227935791, "logps/chosen": -54.81658935546875, "logps/rejected": -72.34201049804688, "loss": 0.7615, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10419551283121109, "rewards/margins": 1.7681453227996826, "rewards/rejected": -1.6639500856399536, "step": 774 }, { "epoch": 0.9505557684936757, "grad_norm": 8.909220695495605, "learning_rate": 9.050306748466257e-07, "logits/chosen": -2.3454325199127197, "logits/rejected": -2.309173345565796, "logps/chosen": -69.05570983886719, "logps/rejected": -71.43368530273438, "loss": 0.9162, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7872940897941589, "rewards/margins": 0.26802951097488403, "rewards/rejected": -1.055323600769043, "step": 775 }, { "epoch": 0.9517822920659257, "grad_norm": 13.701456069946289, "learning_rate": 9.049079754601226e-07, "logits/chosen": -2.2828361988067627, "logits/rejected": -2.3076720237731934, "logps/chosen": -56.27594757080078, "logps/rejected": -71.8547134399414, "loss": 0.748, "rewards/accuracies": 0.75, "rewards/chosen": -0.0339437872171402, "rewards/margins": 1.813568353652954, "rewards/rejected": -1.8475122451782227, "step": 776 }, { "epoch": 0.9530088156381755, "grad_norm": 11.148427963256836, "learning_rate": 9.047852760736195e-07, "logits/chosen": -2.3533244132995605, "logits/rejected": -2.348036289215088, "logps/chosen": -57.48014450073242, "logps/rejected": -71.27723693847656, "loss": 0.7938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06901805102825165, "rewards/margins": 1.5466018915176392, "rewards/rejected": -1.6156200170516968, "step": 777 }, { "epoch": 0.9542353392104255, "grad_norm": 10.417884826660156, "learning_rate": 9.046625766871165e-07, "logits/chosen": -2.3721656799316406, "logits/rejected": -2.380429744720459, "logps/chosen": -60.848873138427734, "logps/rejected": -74.42229461669922, "loss": 0.8067, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34779295325279236, "rewards/margins": 1.4903984069824219, "rewards/rejected": -1.838191270828247, "step": 778 }, { "epoch": 0.9554618627826753, "grad_norm": 21.254568099975586, "learning_rate": 9.045398773006135e-07, "logits/chosen": -2.265517234802246, "logits/rejected": -2.316340208053589, "logps/chosen": -61.42648696899414, "logps/rejected": -70.74826049804688, "loss": 0.7537, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17096799612045288, "rewards/margins": 1.2823898792266846, "rewards/rejected": -1.4533579349517822, "step": 779 }, { "epoch": 0.9566883863549253, "grad_norm": 9.756060600280762, "learning_rate": 9.044171779141105e-07, "logits/chosen": -2.367671489715576, "logits/rejected": -2.4082837104797363, "logps/chosen": -65.62579345703125, "logps/rejected": -74.86209869384766, "loss": 0.8591, "rewards/accuracies": 0.53125, "rewards/chosen": -0.770451009273529, "rewards/margins": 0.8974118828773499, "rewards/rejected": -1.667863130569458, "step": 780 }, { "epoch": 0.9579149099271752, "grad_norm": 10.02858829498291, "learning_rate": 9.042944785276073e-07, "logits/chosen": -2.3607935905456543, "logits/rejected": -2.3907535076141357, "logps/chosen": -52.39589309692383, "logps/rejected": -74.60308074951172, "loss": 0.6891, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32686692476272583, "rewards/margins": 2.2292768955230713, "rewards/rejected": -1.9024100303649902, "step": 781 }, { "epoch": 0.959141433499425, "grad_norm": 11.613953590393066, "learning_rate": 9.041717791411042e-07, "logits/chosen": -2.320850372314453, "logits/rejected": -2.344444751739502, "logps/chosen": -63.25963592529297, "logps/rejected": -72.68120574951172, "loss": 0.8427, "rewards/accuracies": 0.625, "rewards/chosen": -0.8094865679740906, "rewards/margins": 1.0103000402450562, "rewards/rejected": -1.8197866678237915, "step": 782 }, { "epoch": 0.960367957071675, "grad_norm": 13.561912536621094, "learning_rate": 9.040490797546012e-07, "logits/chosen": -2.369262456893921, "logits/rejected": -2.3845555782318115, "logps/chosen": -59.35563659667969, "logps/rejected": -76.09325408935547, "loss": 0.8058, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010457605123519897, "rewards/margins": 1.506291389465332, "rewards/rejected": -1.5167487859725952, "step": 783 }, { "epoch": 0.9615944806439248, "grad_norm": 8.324163436889648, "learning_rate": 9.039263803680981e-07, "logits/chosen": -2.3172712326049805, "logits/rejected": -2.372357130050659, "logps/chosen": -48.88582992553711, "logps/rejected": -73.49968719482422, "loss": 0.6723, "rewards/accuracies": 0.75, "rewards/chosen": 1.3337041139602661, "rewards/margins": 2.58624267578125, "rewards/rejected": -1.2525384426116943, "step": 784 }, { "epoch": 0.9628210042161748, "grad_norm": 11.518498420715332, "learning_rate": 9.038036809815951e-07, "logits/chosen": -2.209005832672119, "logits/rejected": -2.2566068172454834, "logps/chosen": -42.70106506347656, "logps/rejected": -71.38939666748047, "loss": 0.5999, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5389304161071777, "rewards/margins": 3.003967761993408, "rewards/rejected": -1.4650375843048096, "step": 785 }, { "epoch": 0.9640475277884247, "grad_norm": 11.022368431091309, "learning_rate": 9.03680981595092e-07, "logits/chosen": -2.3720316886901855, "logits/rejected": -2.4021971225738525, "logps/chosen": -60.34259033203125, "logps/rejected": -75.59004211425781, "loss": 0.7806, "rewards/accuracies": 0.625, "rewards/chosen": -0.575713574886322, "rewards/margins": 1.1295971870422363, "rewards/rejected": -1.7053108215332031, "step": 786 }, { "epoch": 0.9652740513606746, "grad_norm": 12.663945198059082, "learning_rate": 9.035582822085888e-07, "logits/chosen": -2.278822660446167, "logits/rejected": -2.3031768798828125, "logps/chosen": -57.05033874511719, "logps/rejected": -74.2607421875, "loss": 0.7568, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1680455207824707, "rewards/margins": 1.6752582788467407, "rewards/rejected": -1.8433040380477905, "step": 787 }, { "epoch": 0.9665005749329245, "grad_norm": 10.659233093261719, "learning_rate": 9.034355828220858e-07, "logits/chosen": -2.381140947341919, "logits/rejected": -2.388798236846924, "logps/chosen": -64.1029052734375, "logps/rejected": -79.26260375976562, "loss": 0.7782, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6507426500320435, "rewards/margins": 1.621636986732483, "rewards/rejected": -2.2723796367645264, "step": 788 }, { "epoch": 0.9677270985051744, "grad_norm": 9.191034317016602, "learning_rate": 9.033128834355828e-07, "logits/chosen": -2.386543035507202, "logits/rejected": -2.3604676723480225, "logps/chosen": -62.91292953491211, "logps/rejected": -74.19570922851562, "loss": 0.8142, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4736517369747162, "rewards/margins": 1.3079372644424438, "rewards/rejected": -1.7815890312194824, "step": 789 }, { "epoch": 0.9689536220774243, "grad_norm": 12.349140167236328, "learning_rate": 9.031901840490798e-07, "logits/chosen": -2.384235382080078, "logits/rejected": -2.4182260036468506, "logps/chosen": -61.82942199707031, "logps/rejected": -84.33656311035156, "loss": 0.7374, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4466436803340912, "rewards/margins": 2.253688335418701, "rewards/rejected": -2.700331926345825, "step": 790 }, { "epoch": 0.9701801456496743, "grad_norm": 29.346948623657227, "learning_rate": 9.030674846625767e-07, "logits/chosen": -2.344172239303589, "logits/rejected": -2.3440282344818115, "logps/chosen": -67.29844665527344, "logps/rejected": -76.73138427734375, "loss": 0.8246, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8907421231269836, "rewards/margins": 1.130690574645996, "rewards/rejected": -2.021432876586914, "step": 791 }, { "epoch": 0.9714066692219241, "grad_norm": 22.67384910583496, "learning_rate": 9.029447852760736e-07, "logits/chosen": -2.3139803409576416, "logits/rejected": -2.337186098098755, "logps/chosen": -52.762535095214844, "logps/rejected": -71.66212463378906, "loss": 0.745, "rewards/accuracies": 0.65625, "rewards/chosen": 0.42220303416252136, "rewards/margins": 2.035590171813965, "rewards/rejected": -1.613387107849121, "step": 792 }, { "epoch": 0.972633192794174, "grad_norm": 16.127126693725586, "learning_rate": 9.028220858895705e-07, "logits/chosen": -2.4030871391296387, "logits/rejected": -2.430755138397217, "logps/chosen": -61.73321533203125, "logps/rejected": -76.83351135253906, "loss": 0.8159, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3617843985557556, "rewards/margins": 1.548261046409607, "rewards/rejected": -1.9100456237792969, "step": 793 }, { "epoch": 0.9738597163664239, "grad_norm": 9.452905654907227, "learning_rate": 9.026993865030674e-07, "logits/chosen": -2.290966272354126, "logits/rejected": -2.3319168090820312, "logps/chosen": -50.921165466308594, "logps/rejected": -74.64774322509766, "loss": 0.6695, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7986983060836792, "rewards/margins": 2.727611780166626, "rewards/rejected": -1.9289132356643677, "step": 794 }, { "epoch": 0.9750862399386738, "grad_norm": 11.072033882141113, "learning_rate": 9.025766871165644e-07, "logits/chosen": -2.26678204536438, "logits/rejected": -2.3226523399353027, "logps/chosen": -65.1519775390625, "logps/rejected": -80.00495910644531, "loss": 0.7311, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3985157608985901, "rewards/margins": 2.218921661376953, "rewards/rejected": -2.6174373626708984, "step": 795 }, { "epoch": 0.9763127635109238, "grad_norm": 18.543413162231445, "learning_rate": 9.024539877300613e-07, "logits/chosen": -2.3708112239837646, "logits/rejected": -2.3894219398498535, "logps/chosen": -53.42853546142578, "logps/rejected": -73.10462951660156, "loss": 0.704, "rewards/accuracies": 0.75, "rewards/chosen": 0.5159695148468018, "rewards/margins": 2.4036152362823486, "rewards/rejected": -1.8876457214355469, "step": 796 }, { "epoch": 0.9775392870831736, "grad_norm": 11.02979850769043, "learning_rate": 9.023312883435582e-07, "logits/chosen": -2.3430495262145996, "logits/rejected": -2.374324321746826, "logps/chosen": -61.82308578491211, "logps/rejected": -71.71858215332031, "loss": 0.8899, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41315406560897827, "rewards/margins": 0.9240970611572266, "rewards/rejected": -1.3372511863708496, "step": 797 }, { "epoch": 0.9787658106554236, "grad_norm": 13.412287712097168, "learning_rate": 9.022085889570552e-07, "logits/chosen": -2.2967867851257324, "logits/rejected": -2.3305251598358154, "logps/chosen": -63.95162582397461, "logps/rejected": -73.89507293701172, "loss": 0.8305, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36398014426231384, "rewards/margins": 1.1807594299316406, "rewards/rejected": -1.5447392463684082, "step": 798 }, { "epoch": 0.9799923342276734, "grad_norm": 12.901312828063965, "learning_rate": 9.020858895705521e-07, "logits/chosen": -2.320394515991211, "logits/rejected": -2.3544328212738037, "logps/chosen": -54.14912796020508, "logps/rejected": -69.55473327636719, "loss": 0.7491, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6922698020935059, "rewards/margins": 1.7827640771865845, "rewards/rejected": -1.0904940366744995, "step": 799 }, { "epoch": 0.9812188577999233, "grad_norm": 15.681968688964844, "learning_rate": 9.019631901840491e-07, "logits/chosen": -2.2907567024230957, "logits/rejected": -2.3262510299682617, "logps/chosen": -41.15876007080078, "logps/rejected": -57.378700256347656, "loss": 0.7689, "rewards/accuracies": 0.71875, "rewards/chosen": 1.5558489561080933, "rewards/margins": 1.5344440937042236, "rewards/rejected": 0.021404772996902466, "step": 800 }, { "epoch": 0.9824453813721733, "grad_norm": 9.08390998840332, "learning_rate": 9.01840490797546e-07, "logits/chosen": -2.37024188041687, "logits/rejected": -2.3864896297454834, "logps/chosen": -50.99842834472656, "logps/rejected": -65.64170837402344, "loss": 0.7668, "rewards/accuracies": 0.75, "rewards/chosen": 0.6697547435760498, "rewards/margins": 1.8013334274291992, "rewards/rejected": -1.1315786838531494, "step": 801 }, { "epoch": 0.9836719049444231, "grad_norm": 10.8651123046875, "learning_rate": 9.017177914110429e-07, "logits/chosen": -2.350895404815674, "logits/rejected": -2.384427070617676, "logps/chosen": -51.69807052612305, "logps/rejected": -68.07276153564453, "loss": 0.7854, "rewards/accuracies": 0.71875, "rewards/chosen": 0.11543992161750793, "rewards/margins": 1.5943543910980225, "rewards/rejected": -1.4789143800735474, "step": 802 }, { "epoch": 0.9848984285166731, "grad_norm": 19.659198760986328, "learning_rate": 9.015950920245399e-07, "logits/chosen": -2.3291993141174316, "logits/rejected": -2.3247060775756836, "logps/chosen": -54.37761306762695, "logps/rejected": -63.98621368408203, "loss": 0.8134, "rewards/accuracies": 0.625, "rewards/chosen": 0.2990967631340027, "rewards/margins": 1.19594407081604, "rewards/rejected": -0.8968473076820374, "step": 803 }, { "epoch": 0.9861249520889229, "grad_norm": 12.0306978225708, "learning_rate": 9.014723926380368e-07, "logits/chosen": -2.3488845825195312, "logits/rejected": -2.343942403793335, "logps/chosen": -57.11164855957031, "logps/rejected": -63.265830993652344, "loss": 0.8924, "rewards/accuracies": 0.5, "rewards/chosen": -0.25913286209106445, "rewards/margins": 0.5710494518280029, "rewards/rejected": -0.8301823735237122, "step": 804 }, { "epoch": 0.9873514756611729, "grad_norm": 14.435657501220703, "learning_rate": 9.013496932515336e-07, "logits/chosen": -2.325545310974121, "logits/rejected": -2.2926342487335205, "logps/chosen": -56.271995544433594, "logps/rejected": -70.674072265625, "loss": 0.7662, "rewards/accuracies": 0.71875, "rewards/chosen": 0.11584970355033875, "rewards/margins": 1.544112205505371, "rewards/rejected": -1.428262710571289, "step": 805 }, { "epoch": 0.9885779992334228, "grad_norm": 11.94987964630127, "learning_rate": 9.012269938650306e-07, "logits/chosen": -2.333202362060547, "logits/rejected": -2.3875362873077393, "logps/chosen": -62.899879455566406, "logps/rejected": -87.78916931152344, "loss": 0.6797, "rewards/accuracies": 0.75, "rewards/chosen": -0.5097256302833557, "rewards/margins": 2.373441696166992, "rewards/rejected": -2.8831675052642822, "step": 806 }, { "epoch": 0.9898045228056727, "grad_norm": 15.075984954833984, "learning_rate": 9.011042944785275e-07, "logits/chosen": -2.394197463989258, "logits/rejected": -2.3919410705566406, "logps/chosen": -62.68242645263672, "logps/rejected": -72.83601379394531, "loss": 0.8079, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3699432611465454, "rewards/margins": 1.3453470468521118, "rewards/rejected": -1.7152904272079468, "step": 807 }, { "epoch": 0.9910310463779226, "grad_norm": 9.110042572021484, "learning_rate": 9.009815950920245e-07, "logits/chosen": -2.3297367095947266, "logits/rejected": -2.3389945030212402, "logps/chosen": -56.19036102294922, "logps/rejected": -68.82630920410156, "loss": 0.8155, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3971952199935913, "rewards/margins": 1.516019344329834, "rewards/rejected": -1.1188240051269531, "step": 808 }, { "epoch": 0.9922575699501724, "grad_norm": 9.232560157775879, "learning_rate": 9.008588957055215e-07, "logits/chosen": -2.348766803741455, "logits/rejected": -2.330723285675049, "logps/chosen": -66.37715148925781, "logps/rejected": -78.89362335205078, "loss": 0.8172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.922232985496521, "rewards/margins": 1.3193747997283936, "rewards/rejected": -2.241607666015625, "step": 809 }, { "epoch": 0.9934840935224224, "grad_norm": 9.148397445678711, "learning_rate": 9.007361963190184e-07, "logits/chosen": -2.4108123779296875, "logits/rejected": -2.359874963760376, "logps/chosen": -67.57110595703125, "logps/rejected": -76.57615661621094, "loss": 0.8781, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9225595593452454, "rewards/margins": 0.9114837646484375, "rewards/rejected": -1.834043264389038, "step": 810 }, { "epoch": 0.9947106170946722, "grad_norm": 7.92744255065918, "learning_rate": 9.006134969325153e-07, "logits/chosen": -2.3478457927703857, "logits/rejected": -2.419086217880249, "logps/chosen": -53.332820892333984, "logps/rejected": -74.28766632080078, "loss": 0.7472, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5420712828636169, "rewards/margins": 2.052290201187134, "rewards/rejected": -1.5102190971374512, "step": 811 }, { "epoch": 0.9959371406669222, "grad_norm": 11.742538452148438, "learning_rate": 9.004907975460122e-07, "logits/chosen": -2.36745548248291, "logits/rejected": -2.3637330532073975, "logps/chosen": -57.895225524902344, "logps/rejected": -78.99217224121094, "loss": 0.7393, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0673568993806839, "rewards/margins": 2.1845169067382812, "rewards/rejected": -2.251873731613159, "step": 812 }, { "epoch": 0.9971636642391721, "grad_norm": 12.680534362792969, "learning_rate": 9.003680981595092e-07, "logits/chosen": -2.325202465057373, "logits/rejected": -2.3420193195343018, "logps/chosen": -53.33985900878906, "logps/rejected": -66.98403930664062, "loss": 0.8406, "rewards/accuracies": 0.59375, "rewards/chosen": 0.5992448329925537, "rewards/margins": 1.178383708000183, "rewards/rejected": -0.5791387557983398, "step": 813 }, { "epoch": 0.998390187811422, "grad_norm": 14.259099006652832, "learning_rate": 9.002453987730061e-07, "logits/chosen": -2.3358938694000244, "logits/rejected": -2.3470447063446045, "logps/chosen": -56.18610382080078, "logps/rejected": -68.64788055419922, "loss": 0.8002, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17106761038303375, "rewards/margins": 1.4755622148513794, "rewards/rejected": -1.3044945001602173, "step": 814 }, { "epoch": 0.9996167113836719, "grad_norm": 8.856546401977539, "learning_rate": 9.00122699386503e-07, "logits/chosen": -2.3426284790039062, "logits/rejected": -2.3489274978637695, "logps/chosen": -66.29077911376953, "logps/rejected": -73.11625671386719, "loss": 0.9016, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0109076499938965, "rewards/margins": 0.6612274050712585, "rewards/rejected": -1.6721348762512207, "step": 815 }, { "epoch": 0.9996167113836719, "eval_logits/chosen": -2.3501715660095215, "eval_logits/rejected": -2.366697072982788, "eval_logps/chosen": -71.5538558959961, "eval_logps/rejected": -83.99466705322266, "eval_loss": 0.8272071480751038, "eval_rewards/accuracies": 0.6360543966293335, "eval_rewards/chosen": -1.3922499418258667, "eval_rewards/margins": 1.3573591709136963, "eval_rewards/rejected": -2.7496092319488525, "eval_runtime": 1583.3455, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.279, "step": 815 } ], "logging_steps": 1.0, "max_steps": 8150, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 163, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }