{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8321659893760142, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005547773262506761, "grad_norm": 86.47028350830078, "learning_rate": 9.999992405676425e-08, "logits/chosen": 0.04256238415837288, "logits/rejected": -0.03134341910481453, "logps/chosen": -188.57350158691406, "logps/rejected": -175.35324096679688, "loss": 2.5855, "nll_loss": 1.9216177463531494, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0365375280380249, "rewards/margins": 0.013619521632790565, "rewards/rejected": 0.022918006405234337, "step": 10 }, { "epoch": 0.0011095546525013522, "grad_norm": 105.78651428222656, "learning_rate": 9.999969622728772e-08, "logits/chosen": -0.11089731752872467, "logits/rejected": -0.15940730273723602, "logps/chosen": -214.8294677734375, "logps/rejected": -232.4839324951172, "loss": 2.5074, "nll_loss": 1.7895090579986572, "rewards/accuracies": 0.625, "rewards/chosen": 0.16805259883403778, "rewards/margins": 0.01384829543530941, "rewards/rejected": 0.154204323887825, "step": 20 }, { "epoch": 0.0016643319787520284, "grad_norm": 114.77359771728516, "learning_rate": 9.99993165122625e-08, "logits/chosen": -0.24033674597740173, "logits/rejected": -0.35203057527542114, "logps/chosen": -260.21307373046875, "logps/rejected": -285.6676025390625, "loss": 2.2734, "nll_loss": 1.5780918598175049, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.5284448266029358, "rewards/margins": 0.022413188591599464, "rewards/rejected": 0.5060315728187561, "step": 30 }, { "epoch": 0.0022191093050027044, "grad_norm": 103.22981262207031, "learning_rate": 9.999878491284204e-08, "logits/chosen": -0.11801149696111679, "logits/rejected": -0.18273355066776276, "logps/chosen": -189.36073303222656, "logps/rejected": -205.96383666992188, "loss": 2.1295, "nll_loss": 1.3643723726272583, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.8830834627151489, "rewards/margins": -0.03812415525317192, "rewards/rejected": 0.9212075471878052, "step": 40 }, { "epoch": 0.002773886631253381, "grad_norm": 66.34463500976562, "learning_rate": 9.99981014306412e-08, "logits/chosen": -0.2742760479450226, "logits/rejected": -0.3168026804924011, "logps/chosen": -194.3908233642578, "logps/rejected": -215.14193725585938, "loss": 1.8903, "nll_loss": 1.2410157918930054, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.4482768774032593, "rewards/margins": 0.08231005072593689, "rewards/rejected": 1.365966796875, "step": 50 }, { "epoch": 0.003328663957504057, "grad_norm": 53.22880935668945, "learning_rate": 9.999726606773624e-08, "logits/chosen": -0.3148336708545685, "logits/rejected": -0.40042591094970703, "logps/chosen": -217.2548065185547, "logps/rejected": -261.5478210449219, "loss": 1.8681, "nll_loss": 1.2332053184509277, "rewards/accuracies": 0.5, "rewards/chosen": 1.531562089920044, "rewards/margins": 0.05171762779355049, "rewards/rejected": 1.4798444509506226, "step": 60 }, { "epoch": 0.003883441283754733, "grad_norm": 62.05373764038086, "learning_rate": 9.999627882666472e-08, "logits/chosen": -0.39523762464523315, "logits/rejected": -0.46755728125572205, "logps/chosen": -224.984130859375, "logps/rejected": -253.46240234375, "loss": 1.8248, "nll_loss": 1.3099695444107056, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.6396353244781494, "rewards/margins": 0.09223873913288116, "rewards/rejected": 1.5473965406417847, "step": 70 }, { "epoch": 0.004438218610005409, "grad_norm": 59.43037796020508, "learning_rate": 9.999513971042565e-08, "logits/chosen": -0.33986055850982666, "logits/rejected": -0.37254834175109863, "logps/chosen": -210.43704223632812, "logps/rejected": -217.45449829101562, "loss": 1.8836, "nll_loss": 1.2146633863449097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.6813926696777344, "rewards/margins": 0.15952646732330322, "rewards/rejected": 1.5218660831451416, "step": 80 }, { "epoch": 0.004992995936256085, "grad_norm": 62.62155532836914, "learning_rate": 9.999384872247934e-08, "logits/chosen": -0.17636926472187042, "logits/rejected": -0.25223809480667114, "logps/chosen": -157.65005493164062, "logps/rejected": -188.19406127929688, "loss": 1.792, "nll_loss": 1.0303140878677368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.56096613407135, "rewards/margins": 0.1818581521511078, "rewards/rejected": 1.37910795211792, "step": 90 }, { "epoch": 0.005547773262506762, "grad_norm": 56.658512115478516, "learning_rate": 9.999240586674747e-08, "logits/chosen": -0.32734403014183044, "logits/rejected": -0.40522679686546326, "logps/chosen": -182.51206970214844, "logps/rejected": -214.18594360351562, "loss": 1.8367, "nll_loss": 1.136197805404663, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.675817847251892, "rewards/margins": 0.09024398773908615, "rewards/rejected": 1.5855739116668701, "step": 100 }, { "epoch": 0.006102550588757438, "grad_norm": 59.043766021728516, "learning_rate": 9.999081114761303e-08, "logits/chosen": -0.28781935572624207, "logits/rejected": -0.3697580397129059, "logps/chosen": -198.2109375, "logps/rejected": -214.5436553955078, "loss": 1.8053, "nll_loss": 1.127979040145874, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7802499532699585, "rewards/margins": 0.28318971395492554, "rewards/rejected": 1.4970601797103882, "step": 110 }, { "epoch": 0.006657327915008114, "grad_norm": 50.7485466003418, "learning_rate": 9.998906456992035e-08, "logits/chosen": -0.35893505811691284, "logits/rejected": -0.46098464727401733, "logps/chosen": -207.06576538085938, "logps/rejected": -245.782958984375, "loss": 1.8427, "nll_loss": 1.1750494241714478, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.7762939929962158, "rewards/margins": 0.3124890923500061, "rewards/rejected": 1.463804841041565, "step": 120 }, { "epoch": 0.00721210524125879, "grad_norm": 59.86566162109375, "learning_rate": 9.998716613897509e-08, "logits/chosen": -0.29412102699279785, "logits/rejected": -0.38112324476242065, "logps/chosen": -203.17233276367188, "logps/rejected": -233.68563842773438, "loss": 1.8138, "nll_loss": 1.1727923154830933, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.8533748388290405, "rewards/margins": 0.33772191405296326, "rewards/rejected": 1.5156530141830444, "step": 130 }, { "epoch": 0.007766882567509466, "grad_norm": 53.81575393676758, "learning_rate": 9.998511586054413e-08, "logits/chosen": -0.310435950756073, "logits/rejected": -0.407255083322525, "logps/chosen": -203.92947387695312, "logps/rejected": -233.2860870361328, "loss": 1.7059, "nll_loss": 1.1747105121612549, "rewards/accuracies": 0.75, "rewards/chosen": 1.8009592294692993, "rewards/margins": 0.29193150997161865, "rewards/rejected": 1.5090277194976807, "step": 140 }, { "epoch": 0.008321659893760143, "grad_norm": 58.20719528198242, "learning_rate": 9.998291374085569e-08, "logits/chosen": -0.4028782844543457, "logits/rejected": -0.4347217082977295, "logps/chosen": -206.7960205078125, "logps/rejected": -217.45614624023438, "loss": 1.7235, "nll_loss": 1.26735520362854, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.8107702732086182, "rewards/margins": 0.17375993728637695, "rewards/rejected": 1.6370102167129517, "step": 150 }, { "epoch": 0.008876437220010818, "grad_norm": 77.8411865234375, "learning_rate": 9.99805597865992e-08, "logits/chosen": -0.4058915674686432, "logits/rejected": -0.5049432516098022, "logps/chosen": -199.4020233154297, "logps/rejected": -243.2211151123047, "loss": 1.7651, "nll_loss": 1.1955233812332153, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.9076124429702759, "rewards/margins": 0.31628042459487915, "rewards/rejected": 1.5913320779800415, "step": 160 }, { "epoch": 0.009431214546261495, "grad_norm": 66.69096374511719, "learning_rate": 9.997805400492532e-08, "logits/chosen": -0.34897083044052124, "logits/rejected": -0.4341156482696533, "logps/chosen": -185.31228637695312, "logps/rejected": -222.63546752929688, "loss": 1.7043, "nll_loss": 1.1404896974563599, "rewards/accuracies": 0.75, "rewards/chosen": 1.8629522323608398, "rewards/margins": 0.28871744871139526, "rewards/rejected": 1.5742347240447998, "step": 170 }, { "epoch": 0.00998599187251217, "grad_norm": 54.87003707885742, "learning_rate": 9.997539640344596e-08, "logits/chosen": -0.1554606407880783, "logits/rejected": -0.21996864676475525, "logps/chosen": -144.2554168701172, "logps/rejected": -139.35382080078125, "loss": 1.7168, "nll_loss": 0.9475277662277222, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.7399733066558838, "rewards/margins": 0.28856176137924194, "rewards/rejected": 1.451411485671997, "step": 180 }, { "epoch": 0.010540769198762847, "grad_norm": 69.4969482421875, "learning_rate": 9.99725869902342e-08, "logits/chosen": -0.37828877568244934, "logits/rejected": -0.38139861822128296, "logps/chosen": -236.5293426513672, "logps/rejected": -235.115966796875, "loss": 1.6938, "nll_loss": 1.2049442529678345, "rewards/accuracies": 0.75, "rewards/chosen": 2.005309581756592, "rewards/margins": 0.38655903935432434, "rewards/rejected": 1.6187505722045898, "step": 190 }, { "epoch": 0.011095546525013524, "grad_norm": 73.94921875, "learning_rate": 9.996962577382426e-08, "logits/chosen": -0.19098533689975739, "logits/rejected": -0.2874911427497864, "logps/chosen": -151.21435546875, "logps/rejected": -179.45016479492188, "loss": 1.6993, "nll_loss": 1.0086549520492554, "rewards/accuracies": 0.75, "rewards/chosen": 1.8682178258895874, "rewards/margins": 0.3940388262271881, "rewards/rejected": 1.4741789102554321, "step": 200 }, { "epoch": 0.011650323851264199, "grad_norm": 51.379154205322266, "learning_rate": 9.996651276321152e-08, "logits/chosen": -0.3097096383571625, "logits/rejected": -0.3990236222743988, "logps/chosen": -189.82765197753906, "logps/rejected": -216.7052764892578, "loss": 1.6636, "nll_loss": 1.052120327949524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.9751056432724, "rewards/margins": 0.4109489917755127, "rewards/rejected": 1.5641567707061768, "step": 210 }, { "epoch": 0.012205101177514876, "grad_norm": 55.620426177978516, "learning_rate": 9.996324796785246e-08, "logits/chosen": -0.4845232367515564, "logits/rejected": -0.5828734040260315, "logps/chosen": -235.7965850830078, "logps/rejected": -251.658935546875, "loss": 1.7518, "nll_loss": 1.1625698804855347, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.1334118843078613, "rewards/margins": 0.5587369203567505, "rewards/rejected": 1.5746748447418213, "step": 220 }, { "epoch": 0.01275987850376555, "grad_norm": 65.55672454833984, "learning_rate": 9.995983139766464e-08, "logits/chosen": -0.5563563704490662, "logits/rejected": -0.643342137336731, "logps/chosen": -214.36947631835938, "logps/rejected": -240.8154754638672, "loss": 1.7696, "nll_loss": 1.4078218936920166, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 2.1256346702575684, "rewards/margins": 0.37678810954093933, "rewards/rejected": 1.748846411705017, "step": 230 }, { "epoch": 0.013314655830016228, "grad_norm": 58.193077087402344, "learning_rate": 9.99562630630267e-08, "logits/chosen": -0.3747154176235199, "logits/rejected": -0.46808844804763794, "logps/chosen": -179.42970275878906, "logps/rejected": -219.7896270751953, "loss": 1.6669, "nll_loss": 1.1212873458862305, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.1608200073242188, "rewards/margins": 0.5892239809036255, "rewards/rejected": 1.5715959072113037, "step": 240 }, { "epoch": 0.013869433156266903, "grad_norm": 77.5068588256836, "learning_rate": 9.995254297477825e-08, "logits/chosen": -0.3856232464313507, "logits/rejected": -0.5023887753486633, "logps/chosen": -181.36752319335938, "logps/rejected": -235.1916046142578, "loss": 1.6287, "nll_loss": 1.062013030052185, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.17362904548645, "rewards/margins": 0.7540701031684875, "rewards/rejected": 1.4195587635040283, "step": 250 }, { "epoch": 0.01442421048251758, "grad_norm": 61.57682418823242, "learning_rate": 9.994867114421993e-08, "logits/chosen": -0.46952685713768005, "logits/rejected": -0.5864278078079224, "logps/chosen": -212.41775512695312, "logps/rejected": -256.9131774902344, "loss": 1.6886, "nll_loss": 1.1452209949493408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1298396587371826, "rewards/margins": 0.6472753286361694, "rewards/rejected": 1.4825643301010132, "step": 260 }, { "epoch": 0.014978987808768256, "grad_norm": 79.09966278076172, "learning_rate": 9.99446475831133e-08, "logits/chosen": -0.32850465178489685, "logits/rejected": -0.46790608763694763, "logps/chosen": -152.0748291015625, "logps/rejected": -170.48959350585938, "loss": 1.6692, "nll_loss": 0.9634621739387512, "rewards/accuracies": 0.75, "rewards/chosen": 1.94126296043396, "rewards/margins": 0.6225162148475647, "rewards/rejected": 1.31874680519104, "step": 270 }, { "epoch": 0.015533765135018932, "grad_norm": 72.3378677368164, "learning_rate": 9.994047230368086e-08, "logits/chosen": -0.4431266188621521, "logits/rejected": -0.5226654410362244, "logps/chosen": -200.44613647460938, "logps/rejected": -239.3496551513672, "loss": 1.6507, "nll_loss": 1.104430913925171, "rewards/accuracies": 0.75, "rewards/chosen": 2.2142229080200195, "rewards/margins": 0.6966776251792908, "rewards/rejected": 1.5175453424453735, "step": 280 }, { "epoch": 0.016088542461269607, "grad_norm": 61.740116119384766, "learning_rate": 9.993614531860596e-08, "logits/chosen": -0.46498972177505493, "logits/rejected": -0.5346661806106567, "logps/chosen": -194.7556610107422, "logps/rejected": -229.0714874267578, "loss": 1.6973, "nll_loss": 1.1759881973266602, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.4024055004119873, "rewards/margins": 0.8359010815620422, "rewards/rejected": 1.566504716873169, "step": 290 }, { "epoch": 0.016643319787520285, "grad_norm": 73.7476577758789, "learning_rate": 9.993166664103283e-08, "logits/chosen": -0.511029064655304, "logits/rejected": -0.6332219839096069, "logps/chosen": -206.7277374267578, "logps/rejected": -246.34201049804688, "loss": 1.6736, "nll_loss": 1.2045782804489136, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.3775553703308105, "rewards/margins": 0.7974990010261536, "rewards/rejected": 1.5800564289093018, "step": 300 }, { "epoch": 0.01719809711377096, "grad_norm": 91.33706665039062, "learning_rate": 9.992703628456647e-08, "logits/chosen": -0.5446246862411499, "logits/rejected": -0.5947648286819458, "logps/chosen": -202.0845489501953, "logps/rejected": -219.13375854492188, "loss": 1.6406, "nll_loss": 1.200272798538208, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.4559853076934814, "rewards/margins": 0.7933844327926636, "rewards/rejected": 1.6626008749008179, "step": 310 }, { "epoch": 0.017752874440021636, "grad_norm": 61.63032150268555, "learning_rate": 9.992225426327267e-08, "logits/chosen": -0.606124997138977, "logits/rejected": -0.6858624219894409, "logps/chosen": -198.80601501464844, "logps/rejected": -236.6532440185547, "loss": 1.5924, "nll_loss": 1.2124780416488647, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.247375011444092, "rewards/margins": 0.5193211436271667, "rewards/rejected": 1.7280542850494385, "step": 320 }, { "epoch": 0.018307651766272314, "grad_norm": 79.13629150390625, "learning_rate": 9.991732059167788e-08, "logits/chosen": -0.39404815435409546, "logits/rejected": -0.4788663387298584, "logps/chosen": -189.56886291503906, "logps/rejected": -212.91897583007812, "loss": 1.6604, "nll_loss": 1.1193745136260986, "rewards/accuracies": 0.75, "rewards/chosen": 2.2288386821746826, "rewards/margins": 0.758010983467102, "rewards/rejected": 1.4708276987075806, "step": 330 }, { "epoch": 0.01886242909252299, "grad_norm": 99.47632598876953, "learning_rate": 9.99122352847693e-08, "logits/chosen": -0.3280274271965027, "logits/rejected": -0.3921307325363159, "logps/chosen": -163.58642578125, "logps/rejected": -182.4820098876953, "loss": 1.6261, "nll_loss": 0.9385612607002258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.386798620223999, "rewards/margins": 0.7635468244552612, "rewards/rejected": 1.6232519149780273, "step": 340 }, { "epoch": 0.019417206418773664, "grad_norm": 78.503662109375, "learning_rate": 9.990699835799469e-08, "logits/chosen": -0.579135000705719, "logits/rejected": -0.6546199917793274, "logps/chosen": -209.2771759033203, "logps/rejected": -240.32766723632812, "loss": 1.6131, "nll_loss": 1.2728625535964966, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.6339731216430664, "rewards/margins": 0.9537647366523743, "rewards/rejected": 1.6802085638046265, "step": 350 }, { "epoch": 0.01997198374502434, "grad_norm": 69.08805084228516, "learning_rate": 9.99016098272624e-08, "logits/chosen": -0.49542540311813354, "logits/rejected": -0.6185147762298584, "logps/chosen": -178.43887329101562, "logps/rejected": -234.18759155273438, "loss": 1.6395, "nll_loss": 1.087710976600647, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.4325428009033203, "rewards/margins": 1.0164880752563477, "rewards/rejected": 1.4160544872283936, "step": 360 }, { "epoch": 0.020526761071275018, "grad_norm": 59.481170654296875, "learning_rate": 9.98960697089414e-08, "logits/chosen": -0.5423728227615356, "logits/rejected": -0.6052581071853638, "logps/chosen": -170.61349487304688, "logps/rejected": -195.85025024414062, "loss": 1.6007, "nll_loss": 1.1169687509536743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.3467555046081543, "rewards/margins": 0.7417221069335938, "rewards/rejected": 1.605033278465271, "step": 370 }, { "epoch": 0.021081538397525693, "grad_norm": 51.370872497558594, "learning_rate": 9.9890378019861e-08, "logits/chosen": -0.7442265748977661, "logits/rejected": -0.8148695826530457, "logps/chosen": -241.3246612548828, "logps/rejected": -275.6942443847656, "loss": 1.5691, "nll_loss": 1.3698346614837646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.58601450920105, "rewards/margins": 0.5415834188461304, "rewards/rejected": 2.044431209564209, "step": 380 }, { "epoch": 0.02163631572377637, "grad_norm": 57.66556167602539, "learning_rate": 9.988453477731102e-08, "logits/chosen": -0.5455671548843384, "logits/rejected": -0.7036603689193726, "logps/chosen": -176.88845825195312, "logps/rejected": -220.8589630126953, "loss": 1.5857, "nll_loss": 1.0153230428695679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.3425281047821045, "rewards/margins": 1.0104955434799194, "rewards/rejected": 1.3320326805114746, "step": 390 }, { "epoch": 0.022191093050027047, "grad_norm": 70.84477996826172, "learning_rate": 9.987853999904168e-08, "logits/chosen": -0.610481858253479, "logits/rejected": -0.6613737344741821, "logps/chosen": -218.7194366455078, "logps/rejected": -236.94515991210938, "loss": 1.6499, "nll_loss": 1.2476584911346436, "rewards/accuracies": 0.75, "rewards/chosen": 2.5958473682403564, "rewards/margins": 0.9840685725212097, "rewards/rejected": 1.6117788553237915, "step": 400 }, { "epoch": 0.022745870376277722, "grad_norm": 116.12479400634766, "learning_rate": 9.987239370326348e-08, "logits/chosen": -0.5972326993942261, "logits/rejected": -0.6860564947128296, "logps/chosen": -182.69808959960938, "logps/rejected": -217.4706268310547, "loss": 1.6819, "nll_loss": 1.1688315868377686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6170666217803955, "rewards/margins": 1.3096208572387695, "rewards/rejected": 1.3074455261230469, "step": 410 }, { "epoch": 0.023300647702528397, "grad_norm": 63.733253479003906, "learning_rate": 9.986609590864719e-08, "logits/chosen": -0.38191336393356323, "logits/rejected": -0.4844127297401428, "logps/chosen": -166.2962188720703, "logps/rejected": -198.14926147460938, "loss": 1.5327, "nll_loss": 1.038710117340088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.370504856109619, "rewards/margins": 0.9496327638626099, "rewards/rejected": 1.4208720922470093, "step": 420 }, { "epoch": 0.023855425028779072, "grad_norm": 75.02265167236328, "learning_rate": 9.985964663432382e-08, "logits/chosen": -0.6233263611793518, "logits/rejected": -0.6644175052642822, "logps/chosen": -193.20115661621094, "logps/rejected": -218.0725555419922, "loss": 1.6417, "nll_loss": 1.2874863147735596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6517550945281982, "rewards/margins": 0.9659770727157593, "rewards/rejected": 1.685778021812439, "step": 430 }, { "epoch": 0.02441020235502975, "grad_norm": 65.05445098876953, "learning_rate": 9.985304589988453e-08, "logits/chosen": -0.606865406036377, "logits/rejected": -0.6933251619338989, "logps/chosen": -217.9680633544922, "logps/rejected": -251.608642578125, "loss": 1.5824, "nll_loss": 1.2322168350219727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7905972003936768, "rewards/margins": 1.4538475275039673, "rewards/rejected": 1.3367496728897095, "step": 440 }, { "epoch": 0.024964979681280426, "grad_norm": 74.69596099853516, "learning_rate": 9.984629372538054e-08, "logits/chosen": -0.5925602912902832, "logits/rejected": -0.6561893224716187, "logps/chosen": -231.200439453125, "logps/rejected": -264.5725402832031, "loss": 1.6613, "nll_loss": 1.2283960580825806, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.739058256149292, "rewards/margins": 0.8342123031616211, "rewards/rejected": 1.904845952987671, "step": 450 }, { "epoch": 0.0255197570075311, "grad_norm": 75.31664276123047, "learning_rate": 9.983939013132314e-08, "logits/chosen": -0.44524669647216797, "logits/rejected": -0.5208362340927124, "logps/chosen": -167.56126403808594, "logps/rejected": -195.00735473632812, "loss": 1.5971, "nll_loss": 1.096200942993164, "rewards/accuracies": 0.75, "rewards/chosen": 2.3428120613098145, "rewards/margins": 0.7069460153579712, "rewards/rejected": 1.6358659267425537, "step": 460 }, { "epoch": 0.02607453433378178, "grad_norm": 63.74409484863281, "learning_rate": 9.98323351386836e-08, "logits/chosen": -0.5915762782096863, "logits/rejected": -0.6436539888381958, "logps/chosen": -196.46066284179688, "logps/rejected": -213.55349731445312, "loss": 1.5516, "nll_loss": 1.2401580810546875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.625981092453003, "rewards/margins": 1.0905460119247437, "rewards/rejected": 1.5354353189468384, "step": 470 }, { "epoch": 0.026629311660032455, "grad_norm": 87.08705139160156, "learning_rate": 9.982512876889306e-08, "logits/chosen": -0.4064413905143738, "logits/rejected": -0.5316869020462036, "logps/chosen": -157.93026733398438, "logps/rejected": -187.75381469726562, "loss": 1.5626, "nll_loss": 0.9701600074768066, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.28480863571167, "rewards/margins": 0.9264081120491028, "rewards/rejected": 1.3584007024765015, "step": 480 }, { "epoch": 0.02718408898628313, "grad_norm": 102.86444091796875, "learning_rate": 9.981777104384251e-08, "logits/chosen": -0.4961855411529541, "logits/rejected": -0.5452759265899658, "logps/chosen": -196.3448028564453, "logps/rejected": -235.82666015625, "loss": 1.6135, "nll_loss": 1.1173430681228638, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.8101401329040527, "rewards/margins": 1.1897588968276978, "rewards/rejected": 1.6203811168670654, "step": 490 }, { "epoch": 0.027738866312533805, "grad_norm": 93.84529876708984, "learning_rate": 9.981026198588274e-08, "logits/chosen": -0.6152495741844177, "logits/rejected": -0.6937671899795532, "logps/chosen": -199.5059356689453, "logps/rejected": -227.841552734375, "loss": 1.6164, "nll_loss": 1.1395609378814697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.346026659011841, "rewards/margins": 1.5802319049835205, "rewards/rejected": 1.765795111656189, "step": 500 }, { "epoch": 0.027738866312533805, "eval_logits/chosen": -0.6478434801101685, "eval_logits/rejected": -0.7143262028694153, "eval_logps/chosen": -226.89523315429688, "eval_logps/rejected": -264.53521728515625, "eval_loss": 1.531517505645752, "eval_nll_loss": 1.1829402446746826, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 3.1065964698791504, "eval_rewards/margins": 1.6522696018218994, "eval_rewards/rejected": 1.4543266296386719, "eval_runtime": 17.055, "eval_samples_per_second": 15.01, "eval_steps_per_second": 1.876, "step": 500 }, { "epoch": 0.028293643638784484, "grad_norm": 74.31314849853516, "learning_rate": 9.980260161782426e-08, "logits/chosen": -0.6298533082008362, "logits/rejected": -0.6791015863418579, "logps/chosen": -218.2083282470703, "logps/rejected": -267.1485290527344, "loss": 1.5716, "nll_loss": 1.2364524602890015, "rewards/accuracies": 0.75, "rewards/chosen": 3.068859577178955, "rewards/margins": 1.1550592184066772, "rewards/rejected": 1.9138002395629883, "step": 510 }, { "epoch": 0.02884842096503516, "grad_norm": 68.77310943603516, "learning_rate": 9.979478996293715e-08, "logits/chosen": -0.6073177456855774, "logits/rejected": -0.7440160512924194, "logps/chosen": -205.8466339111328, "logps/rejected": -260.246337890625, "loss": 1.602, "nll_loss": 1.171318769454956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.0000762939453125, "rewards/margins": 1.3454234600067139, "rewards/rejected": 1.6546528339385986, "step": 520 }, { "epoch": 0.029403198291285834, "grad_norm": 67.26824188232422, "learning_rate": 9.978682704495114e-08, "logits/chosen": -0.6818415522575378, "logits/rejected": -0.7456626892089844, "logps/chosen": -211.9142303466797, "logps/rejected": -255.11474609375, "loss": 1.5557, "nll_loss": 1.2860023975372314, "rewards/accuracies": 0.75, "rewards/chosen": 3.183370590209961, "rewards/margins": 1.4452582597732544, "rewards/rejected": 1.7381126880645752, "step": 530 }, { "epoch": 0.029957975617536513, "grad_norm": 56.54665756225586, "learning_rate": 9.977871288805541e-08, "logits/chosen": -0.5180370211601257, "logits/rejected": -0.6200538873672485, "logps/chosen": -194.35459899902344, "logps/rejected": -227.3801727294922, "loss": 1.5135, "nll_loss": 1.081203579902649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9965686798095703, "rewards/margins": 1.2237999439239502, "rewards/rejected": 1.7727687358856201, "step": 540 }, { "epoch": 0.030512752943787188, "grad_norm": 56.38474655151367, "learning_rate": 9.977044751689857e-08, "logits/chosen": -0.3500790297985077, "logits/rejected": -0.46968212723731995, "logps/chosen": -147.5152587890625, "logps/rejected": -186.73446655273438, "loss": 1.5259, "nll_loss": 0.9268163442611694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.580355167388916, "rewards/margins": 1.3773891925811768, "rewards/rejected": 1.2029658555984497, "step": 550 }, { "epoch": 0.031067530270037863, "grad_norm": 67.37399291992188, "learning_rate": 9.976203095658858e-08, "logits/chosen": -0.5463757514953613, "logits/rejected": -0.6025634407997131, "logps/chosen": -173.28712463378906, "logps/rejected": -226.4482421875, "loss": 1.541, "nll_loss": 1.0410239696502686, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.802887201309204, "rewards/margins": 1.544614553451538, "rewards/rejected": 1.2582728862762451, "step": 560 }, { "epoch": 0.03162230759628854, "grad_norm": 49.75868606567383, "learning_rate": 9.975346323269267e-08, "logits/chosen": -0.4543730616569519, "logits/rejected": -0.5487458109855652, "logps/chosen": -211.029541015625, "logps/rejected": -250.74288940429688, "loss": 1.498, "nll_loss": 1.0694835186004639, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.212559938430786, "rewards/margins": 1.8369373083114624, "rewards/rejected": 1.3756221532821655, "step": 570 }, { "epoch": 0.03217708492253921, "grad_norm": 61.089996337890625, "learning_rate": 9.974474437123729e-08, "logits/chosen": -0.540164053440094, "logits/rejected": -0.6340337991714478, "logps/chosen": -191.64480590820312, "logps/rejected": -229.88601684570312, "loss": 1.5814, "nll_loss": 1.1106784343719482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.882720947265625, "rewards/margins": 1.5230586528778076, "rewards/rejected": 1.3596618175506592, "step": 580 }, { "epoch": 0.03273186224878989, "grad_norm": 104.65751647949219, "learning_rate": 9.973587439870794e-08, "logits/chosen": -0.4297700822353363, "logits/rejected": -0.4823095202445984, "logps/chosen": -164.86154174804688, "logps/rejected": -193.41534423828125, "loss": 1.5013, "nll_loss": 1.0106995105743408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8032305240631104, "rewards/margins": 1.1382739543914795, "rewards/rejected": 1.6649566888809204, "step": 590 }, { "epoch": 0.03328663957504057, "grad_norm": 71.44564819335938, "learning_rate": 9.972685334204924e-08, "logits/chosen": -0.46265238523483276, "logits/rejected": -0.5998337864875793, "logps/chosen": -156.38107299804688, "logps/rejected": -203.08433532714844, "loss": 1.5262, "nll_loss": 0.9892654418945312, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.995356559753418, "rewards/margins": 1.4686530828475952, "rewards/rejected": 1.5267035961151123, "step": 600 }, { "epoch": 0.03384141690129124, "grad_norm": 56.26408767700195, "learning_rate": 9.97176812286647e-08, "logits/chosen": -0.5253828763961792, "logits/rejected": -0.6142227649688721, "logps/chosen": -191.7180938720703, "logps/rejected": -222.5738067626953, "loss": 1.5112, "nll_loss": 1.0993685722351074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0786499977111816, "rewards/margins": 1.657843828201294, "rewards/rejected": 1.4208059310913086, "step": 610 }, { "epoch": 0.03439619422754192, "grad_norm": 63.693519592285156, "learning_rate": 9.970835808641671e-08, "logits/chosen": -0.49338769912719727, "logits/rejected": -0.6104284524917603, "logps/chosen": -183.81399536132812, "logps/rejected": -230.7450408935547, "loss": 1.5753, "nll_loss": 1.096469521522522, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.137953042984009, "rewards/margins": 2.151078939437866, "rewards/rejected": 0.9868742227554321, "step": 620 }, { "epoch": 0.0349509715537926, "grad_norm": 88.63583374023438, "learning_rate": 9.969888394362646e-08, "logits/chosen": -0.509103536605835, "logits/rejected": -0.5807837843894958, "logps/chosen": -160.57984924316406, "logps/rejected": -197.91236877441406, "loss": 1.5844, "nll_loss": 1.0496104955673218, "rewards/accuracies": 0.75, "rewards/chosen": 2.8470611572265625, "rewards/margins": 1.0343971252441406, "rewards/rejected": 1.8126637935638428, "step": 630 }, { "epoch": 0.03550574888004327, "grad_norm": 86.9113540649414, "learning_rate": 9.968925882907385e-08, "logits/chosen": -0.5038835406303406, "logits/rejected": -0.5882635116577148, "logps/chosen": -213.25167846679688, "logps/rejected": -233.34841918945312, "loss": 1.4754, "nll_loss": 1.0951595306396484, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.2811102867126465, "rewards/margins": 1.6264727115631104, "rewards/rejected": 1.654637336730957, "step": 640 }, { "epoch": 0.03606052620629395, "grad_norm": 68.81783294677734, "learning_rate": 9.967948277199735e-08, "logits/chosen": -0.537903368473053, "logits/rejected": -0.5941354036331177, "logps/chosen": -180.93685913085938, "logps/rejected": -201.2696075439453, "loss": 1.5086, "nll_loss": 1.1625057458877563, "rewards/accuracies": 0.625, "rewards/chosen": 3.2426884174346924, "rewards/margins": 1.281341314315796, "rewards/rejected": 1.9613468647003174, "step": 650 }, { "epoch": 0.03661530353254463, "grad_norm": 46.247196197509766, "learning_rate": 9.966955580209398e-08, "logits/chosen": -0.536319375038147, "logits/rejected": -0.6314636468887329, "logps/chosen": -193.7196807861328, "logps/rejected": -213.14944458007812, "loss": 1.4659, "nll_loss": 1.107458472251892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1198184490203857, "rewards/margins": 1.1047157049179077, "rewards/rejected": 2.0151028633117676, "step": 660 }, { "epoch": 0.0371700808587953, "grad_norm": 46.85151672363281, "learning_rate": 9.96594779495192e-08, "logits/chosen": -0.43557968735694885, "logits/rejected": -0.531055748462677, "logps/chosen": -155.8391571044922, "logps/rejected": -196.74227905273438, "loss": 1.5626, "nll_loss": 1.0155750513076782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.0035159587860107, "rewards/margins": 1.622287392616272, "rewards/rejected": 1.3812288045883179, "step": 670 }, { "epoch": 0.03772485818504598, "grad_norm": 174.26632690429688, "learning_rate": 9.964924924488679e-08, "logits/chosen": -0.6362596750259399, "logits/rejected": -0.7269699573516846, "logps/chosen": -206.3799285888672, "logps/rejected": -230.740966796875, "loss": 1.4886, "nll_loss": 1.1488772630691528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.098313093185425, "rewards/margins": 1.6047836542129517, "rewards/rejected": 1.4935296773910522, "step": 680 }, { "epoch": 0.03827963551129665, "grad_norm": 80.29759216308594, "learning_rate": 9.963886971926878e-08, "logits/chosen": -0.6532067060470581, "logits/rejected": -0.7521528005599976, "logps/chosen": -216.44223022460938, "logps/rejected": -262.30413818359375, "loss": 1.5963, "nll_loss": 1.176645040512085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6967339515686035, "rewards/margins": 1.538691520690918, "rewards/rejected": 2.1580424308776855, "step": 690 }, { "epoch": 0.03883441283754733, "grad_norm": 84.83824157714844, "learning_rate": 9.96283394041954e-08, "logits/chosen": -0.5008292198181152, "logits/rejected": -0.6104961633682251, "logps/chosen": -166.3019561767578, "logps/rejected": -190.43441772460938, "loss": 1.5288, "nll_loss": 1.0193572044372559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.90037202835083, "rewards/margins": 1.5122241973876953, "rewards/rejected": 1.3881480693817139, "step": 700 }, { "epoch": 0.03938919016379801, "grad_norm": 57.603782653808594, "learning_rate": 9.961765833165484e-08, "logits/chosen": -0.7234429121017456, "logits/rejected": -0.7425190210342407, "logps/chosen": -206.9298553466797, "logps/rejected": -241.0787811279297, "loss": 1.5898, "nll_loss": 1.2404186725616455, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.3879165649414062, "rewards/margins": 1.2144347429275513, "rewards/rejected": 2.1734814643859863, "step": 710 }, { "epoch": 0.03994396749004868, "grad_norm": 55.41166687011719, "learning_rate": 9.960682653409335e-08, "logits/chosen": -0.4222160279750824, "logits/rejected": -0.5175357460975647, "logps/chosen": -190.08583068847656, "logps/rejected": -220.3984375, "loss": 1.5289, "nll_loss": 1.068023920059204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.291684627532959, "rewards/margins": 1.6787996292114258, "rewards/rejected": 1.6128854751586914, "step": 720 }, { "epoch": 0.04049874481629936, "grad_norm": 54.48349380493164, "learning_rate": 9.959584404441498e-08, "logits/chosen": -0.5538616180419922, "logits/rejected": -0.6187587380409241, "logps/chosen": -182.77786254882812, "logps/rejected": -196.11972045898438, "loss": 1.4448, "nll_loss": 1.0926969051361084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.199425220489502, "rewards/margins": 0.9773725271224976, "rewards/rejected": 2.222052812576294, "step": 730 }, { "epoch": 0.041053522142550036, "grad_norm": 114.1539077758789, "learning_rate": 9.958471089598157e-08, "logits/chosen": -0.4567710757255554, "logits/rejected": -0.5534912943840027, "logps/chosen": -155.90713500976562, "logps/rejected": -199.89883422851562, "loss": 1.4889, "nll_loss": 0.9103935360908508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.1568973064422607, "rewards/margins": 1.3936717510223389, "rewards/rejected": 1.7632255554199219, "step": 740 }, { "epoch": 0.04160829946880071, "grad_norm": 62.2696533203125, "learning_rate": 9.957342712261261e-08, "logits/chosen": -0.6232119798660278, "logits/rejected": -0.699394166469574, "logps/chosen": -175.486083984375, "logps/rejected": -237.1534881591797, "loss": 1.5239, "nll_loss": 1.146842122077942, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.41463041305542, "rewards/margins": 1.6498804092407227, "rewards/rejected": 1.7647498846054077, "step": 750 }, { "epoch": 0.04216307679505139, "grad_norm": 67.86518859863281, "learning_rate": 9.956199275858517e-08, "logits/chosen": -0.5560430288314819, "logits/rejected": -0.6693638563156128, "logps/chosen": -180.52297973632812, "logps/rejected": -226.023681640625, "loss": 1.5402, "nll_loss": 1.0728282928466797, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.674917697906494, "rewards/margins": 1.7028071880340576, "rewards/rejected": 1.9721105098724365, "step": 760 }, { "epoch": 0.042717854121302065, "grad_norm": 62.7828254699707, "learning_rate": 9.955040783863372e-08, "logits/chosen": -0.6319522857666016, "logits/rejected": -0.7485511302947998, "logps/chosen": -214.3317108154297, "logps/rejected": -242.33926391601562, "loss": 1.4777, "nll_loss": 1.2144583463668823, "rewards/accuracies": 0.875, "rewards/chosen": 3.5419113636016846, "rewards/margins": 1.8491764068603516, "rewards/rejected": 1.692734956741333, "step": 770 }, { "epoch": 0.04327263144755274, "grad_norm": 53.742855072021484, "learning_rate": 9.953867239795012e-08, "logits/chosen": -0.6601709127426147, "logits/rejected": -0.734355092048645, "logps/chosen": -194.30819702148438, "logps/rejected": -248.2646026611328, "loss": 1.4185, "nll_loss": 1.2157113552093506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3003029823303223, "rewards/margins": 1.761346459388733, "rewards/rejected": 1.5389564037322998, "step": 780 }, { "epoch": 0.043827408773803415, "grad_norm": 61.616336822509766, "learning_rate": 9.95267864721835e-08, "logits/chosen": -0.5370885729789734, "logits/rejected": -0.6669055223464966, "logps/chosen": -195.1478271484375, "logps/rejected": -231.2130889892578, "loss": 1.5329, "nll_loss": 1.0364364385604858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.594130754470825, "rewards/margins": 2.225724220275879, "rewards/rejected": 1.368406057357788, "step": 790 }, { "epoch": 0.044382186100054094, "grad_norm": 75.67598724365234, "learning_rate": 9.951475009744003e-08, "logits/chosen": -0.7143956422805786, "logits/rejected": -0.8007022142410278, "logps/chosen": -206.11630249023438, "logps/rejected": -242.289794921875, "loss": 1.5044, "nll_loss": 1.1931127309799194, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.010062217712402, "rewards/margins": 1.7878021001815796, "rewards/rejected": 2.222259998321533, "step": 800 }, { "epoch": 0.044936963426304766, "grad_norm": 61.18656539916992, "learning_rate": 9.950256331028302e-08, "logits/chosen": -0.5474633574485779, "logits/rejected": -0.6423521041870117, "logps/chosen": -190.60850524902344, "logps/rejected": -223.872802734375, "loss": 1.5677, "nll_loss": 1.1300718784332275, "rewards/accuracies": 0.75, "rewards/chosen": 3.4836227893829346, "rewards/margins": 1.1142834424972534, "rewards/rejected": 2.3693394660949707, "step": 810 }, { "epoch": 0.045491740752555444, "grad_norm": 47.220394134521484, "learning_rate": 9.949022614773256e-08, "logits/chosen": -0.3815156817436218, "logits/rejected": -0.5080620050430298, "logps/chosen": -151.33038330078125, "logps/rejected": -183.33843994140625, "loss": 1.4676, "nll_loss": 0.9675423502922058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.125697612762451, "rewards/margins": 1.7089370489120483, "rewards/rejected": 1.4167604446411133, "step": 820 }, { "epoch": 0.046046518078806116, "grad_norm": 57.33856964111328, "learning_rate": 9.94777386472657e-08, "logits/chosen": -0.702529788017273, "logits/rejected": -0.7468611598014832, "logps/chosen": -195.8870391845703, "logps/rejected": -253.25631713867188, "loss": 1.4441, "nll_loss": 1.1202274560928345, "rewards/accuracies": 0.875, "rewards/chosen": 3.815953016281128, "rewards/margins": 1.9469791650772095, "rewards/rejected": 1.8689740896224976, "step": 830 }, { "epoch": 0.046601295405056795, "grad_norm": 70.29081726074219, "learning_rate": 9.9465100846816e-08, "logits/chosen": -0.3773537278175354, "logits/rejected": -0.48507270216941833, "logps/chosen": -152.4845428466797, "logps/rejected": -187.5793914794922, "loss": 1.4435, "nll_loss": 0.8839551210403442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.077239513397217, "rewards/margins": 1.1600372791290283, "rewards/rejected": 1.9172027111053467, "step": 840 }, { "epoch": 0.04715607273130747, "grad_norm": 62.970558166503906, "learning_rate": 9.945231278477374e-08, "logits/chosen": -0.4544038772583008, "logits/rejected": -0.5889581441879272, "logps/chosen": -163.62313842773438, "logps/rejected": -207.5544891357422, "loss": 1.4982, "nll_loss": 1.0126638412475586, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.214735507965088, "rewards/margins": 2.0006070137023926, "rewards/rejected": 1.2141282558441162, "step": 850 }, { "epoch": 0.047710850057558145, "grad_norm": 136.8060760498047, "learning_rate": 9.943937449998556e-08, "logits/chosen": -0.697186291217804, "logits/rejected": -0.7817809581756592, "logps/chosen": -224.75277709960938, "logps/rejected": -257.46600341796875, "loss": 1.6197, "nll_loss": 1.2137398719787598, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.9525508880615234, "rewards/margins": 1.7223342657089233, "rewards/rejected": 2.2302165031433105, "step": 860 }, { "epoch": 0.04826562738380882, "grad_norm": 75.36298370361328, "learning_rate": 9.94262860317545e-08, "logits/chosen": -0.5289721488952637, "logits/rejected": -0.5983772873878479, "logps/chosen": -175.97885131835938, "logps/rejected": -205.8876953125, "loss": 1.4627, "nll_loss": 1.0002416372299194, "rewards/accuracies": 0.75, "rewards/chosen": 3.4056830406188965, "rewards/margins": 1.640223503112793, "rewards/rejected": 1.765459418296814, "step": 870 }, { "epoch": 0.0488204047100595, "grad_norm": 63.62454605102539, "learning_rate": 9.941304741983973e-08, "logits/chosen": -0.38287869095802307, "logits/rejected": -0.5381209254264832, "logps/chosen": -142.0276641845703, "logps/rejected": -179.9833984375, "loss": 1.4147, "nll_loss": 0.9833440780639648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.339392900466919, "rewards/margins": 1.7146053314208984, "rewards/rejected": 1.6247879266738892, "step": 880 }, { "epoch": 0.049375182036310174, "grad_norm": 61.91400146484375, "learning_rate": 9.939965870445664e-08, "logits/chosen": -0.40123963356018066, "logits/rejected": -0.5148253440856934, "logps/chosen": -160.2271270751953, "logps/rejected": -185.10690307617188, "loss": 1.5126, "nll_loss": 0.9827170372009277, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.2213611602783203, "rewards/margins": 1.576289415359497, "rewards/rejected": 1.6450717449188232, "step": 890 }, { "epoch": 0.04992995936256085, "grad_norm": 69.13755798339844, "learning_rate": 9.938611992627646e-08, "logits/chosen": -0.552926242351532, "logits/rejected": -0.6027761697769165, "logps/chosen": -182.66445922851562, "logps/rejected": -219.7320556640625, "loss": 1.5206, "nll_loss": 1.1046699285507202, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.8532652854919434, "rewards/margins": 1.7462717294692993, "rewards/rejected": 2.1069931983947754, "step": 900 }, { "epoch": 0.05048473668881153, "grad_norm": 54.19386291503906, "learning_rate": 9.937243112642638e-08, "logits/chosen": -0.29705414175987244, "logits/rejected": -0.4054291248321533, "logps/chosen": -164.974609375, "logps/rejected": -170.09359741210938, "loss": 1.5039, "nll_loss": 0.947147011756897, "rewards/accuracies": 0.75, "rewards/chosen": 3.277088165283203, "rewards/margins": 1.8471357822418213, "rewards/rejected": 1.4299525022506714, "step": 910 }, { "epoch": 0.0510395140150622, "grad_norm": 56.5627555847168, "learning_rate": 9.935859234648924e-08, "logits/chosen": -0.27827510237693787, "logits/rejected": -0.43911415338516235, "logps/chosen": -122.29353332519531, "logps/rejected": -162.04446411132812, "loss": 1.3927, "nll_loss": 0.7657750844955444, "rewards/accuracies": 0.75, "rewards/chosen": 2.9163410663604736, "rewards/margins": 1.802353858947754, "rewards/rejected": 1.1139872074127197, "step": 920 }, { "epoch": 0.05159429134131288, "grad_norm": 77.0843505859375, "learning_rate": 9.934460362850354e-08, "logits/chosen": -0.515708327293396, "logits/rejected": -0.5827298164367676, "logps/chosen": -187.35801696777344, "logps/rejected": -232.73379516601562, "loss": 1.5256, "nll_loss": 1.2366583347320557, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.5616250038146973, "rewards/margins": 1.882982611656189, "rewards/rejected": 1.6786420345306396, "step": 930 }, { "epoch": 0.05214906866756356, "grad_norm": 49.424930572509766, "learning_rate": 9.93304650149632e-08, "logits/chosen": -0.4492836892604828, "logits/rejected": -0.5456808805465698, "logps/chosen": -197.4065399169922, "logps/rejected": -237.48648071289062, "loss": 1.532, "nll_loss": 1.0610148906707764, "rewards/accuracies": 0.875, "rewards/chosen": 3.585148572921753, "rewards/margins": 2.1268129348754883, "rewards/rejected": 1.4583359956741333, "step": 940 }, { "epoch": 0.05270384599381423, "grad_norm": 85.44376373291016, "learning_rate": 9.931617654881752e-08, "logits/chosen": -0.3419482111930847, "logits/rejected": -0.45933622121810913, "logps/chosen": -134.19366455078125, "logps/rejected": -165.301025390625, "loss": 1.4692, "nll_loss": 0.9292080998420715, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.1901347637176514, "rewards/margins": 1.2426807880401611, "rewards/rejected": 1.9474540948867798, "step": 950 }, { "epoch": 0.05325862332006491, "grad_norm": 58.297786712646484, "learning_rate": 9.930173827347097e-08, "logits/chosen": -0.36291202902793884, "logits/rejected": -0.4757510721683502, "logps/chosen": -183.56942749023438, "logps/rejected": -212.82138061523438, "loss": 1.4557, "nll_loss": 0.9521719217300415, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.624375820159912, "rewards/margins": 1.6794826984405518, "rewards/rejected": 1.94489324092865, "step": 960 }, { "epoch": 0.05381340064631558, "grad_norm": 93.24034118652344, "learning_rate": 9.928715023278314e-08, "logits/chosen": -0.3988448977470398, "logits/rejected": -0.4805443286895752, "logps/chosen": -154.73455810546875, "logps/rejected": -181.87069702148438, "loss": 1.4925, "nll_loss": 1.0295617580413818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4048991203308105, "rewards/margins": 1.5122768878936768, "rewards/rejected": 1.8926219940185547, "step": 970 }, { "epoch": 0.05436817797256626, "grad_norm": 74.3322982788086, "learning_rate": 9.927241247106855e-08, "logits/chosen": -0.5332263708114624, "logits/rejected": -0.6788077354431152, "logps/chosen": -206.145751953125, "logps/rejected": -250.1248016357422, "loss": 1.4918, "nll_loss": 1.1099421977996826, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.051599025726318, "rewards/margins": 2.0857737064361572, "rewards/rejected": 1.9658254384994507, "step": 980 }, { "epoch": 0.05492295529881694, "grad_norm": 59.86268997192383, "learning_rate": 9.92575250330965e-08, "logits/chosen": -0.4802762567996979, "logits/rejected": -0.593323826789856, "logps/chosen": -199.94229125976562, "logps/rejected": -239.09506225585938, "loss": 1.4978, "nll_loss": 1.1220160722732544, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.9692130088806152, "rewards/margins": 1.8433116674423218, "rewards/rejected": 2.125900983810425, "step": 990 }, { "epoch": 0.05547773262506761, "grad_norm": 75.09992980957031, "learning_rate": 9.924248796409105e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -167.0904998779297, "logps/rejected": -193.38198852539062, "loss": 1.4847, "nll_loss": NaN, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 3.5240886211395264, "rewards/margins": 1.3706116676330566, "rewards/rejected": 2.1534767150878906, "step": 1000 }, { "epoch": 0.05547773262506761, "eval_logits/chosen": -0.5849028825759888, "eval_logits/rejected": -0.6528286933898926, "eval_logps/chosen": -216.97853088378906, "eval_logps/rejected": -263.5906066894531, "eval_loss": 1.4300326108932495, "eval_nll_loss": 1.1286273002624512, "eval_rewards/accuracies": 0.84375, "eval_rewards/chosen": 4.0982666015625, "eval_rewards/margins": 2.5494768619537354, "eval_rewards/rejected": 1.5487897396087646, "eval_runtime": 17.0359, "eval_samples_per_second": 15.027, "eval_steps_per_second": 1.878, "step": 1000 }, { "epoch": 0.05603250995131829, "grad_norm": 63.52638244628906, "learning_rate": 9.922730130973071e-08, "logits/chosen": -0.5006999969482422, "logits/rejected": -0.6237480044364929, "logps/chosen": -202.09140014648438, "logps/rejected": -257.4411926269531, "loss": 1.5155, "nll_loss": 1.095794439315796, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9617226123809814, "rewards/margins": 2.351763963699341, "rewards/rejected": 1.6099590063095093, "step": 1010 }, { "epoch": 0.05658728727756897, "grad_norm": 60.034427642822266, "learning_rate": 9.921196511614846e-08, "logits/chosen": -0.42120856046676636, "logits/rejected": -0.4760383665561676, "logps/chosen": -141.9215087890625, "logps/rejected": -157.95162963867188, "loss": 1.4648, "nll_loss": 0.9661208391189575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.336350679397583, "rewards/margins": 1.4146296977996826, "rewards/rejected": 1.92172110080719, "step": 1020 }, { "epoch": 0.05714206460381964, "grad_norm": 72.62284088134766, "learning_rate": 9.919647942993148e-08, "logits/chosen": -0.5252307057380676, "logits/rejected": -0.6019777655601501, "logps/chosen": -192.70838928222656, "logps/rejected": -247.6120147705078, "loss": 1.4889, "nll_loss": 1.0725551843643188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.920849561691284, "rewards/margins": 1.793335199356079, "rewards/rejected": 2.127514600753784, "step": 1030 }, { "epoch": 0.05769684193007032, "grad_norm": 65.8388671875, "learning_rate": 9.91808442981211e-08, "logits/chosen": -0.41704192757606506, "logits/rejected": -0.5714241862297058, "logps/chosen": -186.66038513183594, "logps/rejected": -229.0773468017578, "loss": 1.3864, "nll_loss": 0.9837188720703125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7575690746307373, "rewards/margins": 2.2268686294555664, "rewards/rejected": 1.5307005643844604, "step": 1040 }, { "epoch": 0.058251619256321, "grad_norm": 77.91301727294922, "learning_rate": 9.91650597682126e-08, "logits/chosen": -0.7347007989883423, "logits/rejected": -0.7379493713378906, "logps/chosen": -224.37423706054688, "logps/rejected": -263.5487976074219, "loss": 1.4952, "nll_loss": 1.2833170890808105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.219830513000488, "rewards/margins": 1.8176199197769165, "rewards/rejected": 2.4022104740142822, "step": 1050 }, { "epoch": 0.05880639658257167, "grad_norm": 50.89873123168945, "learning_rate": 9.914912588815517e-08, "logits/chosen": -0.4994504451751709, "logits/rejected": -0.6095007658004761, "logps/chosen": -169.8141326904297, "logps/rejected": -214.57492065429688, "loss": 1.4384, "nll_loss": 1.0103446245193481, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.483236789703369, "rewards/margins": 1.6239086389541626, "rewards/rejected": 1.859328031539917, "step": 1060 }, { "epoch": 0.05936117390882235, "grad_norm": 75.92971801757812, "learning_rate": 9.913304270635156e-08, "logits/chosen": -0.3685445785522461, "logits/rejected": -0.4870659410953522, "logps/chosen": -164.15875244140625, "logps/rejected": -191.3291778564453, "loss": 1.5313, "nll_loss": 0.9337444305419922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.4293224811553955, "rewards/margins": 1.4811770915985107, "rewards/rejected": 1.9481449127197266, "step": 1070 }, { "epoch": 0.059915951235073026, "grad_norm": 71.02056121826172, "learning_rate": 9.911681027165818e-08, "logits/chosen": -0.39040133357048035, "logits/rejected": -0.48705339431762695, "logps/chosen": -138.97422790527344, "logps/rejected": -162.4997100830078, "loss": 1.4756, "nll_loss": 0.9448060989379883, "rewards/accuracies": 0.875, "rewards/chosen": 3.3441505432128906, "rewards/margins": 1.700945258140564, "rewards/rejected": 1.643204927444458, "step": 1080 }, { "epoch": 0.0604707285613237, "grad_norm": 67.55992126464844, "learning_rate": 9.910042863338474e-08, "logits/chosen": -0.3325367569923401, "logits/rejected": -0.46114760637283325, "logps/chosen": -151.6166534423828, "logps/rejected": -196.2194366455078, "loss": 1.4461, "nll_loss": 0.9018818140029907, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.460953950881958, "rewards/margins": 2.2122249603271484, "rewards/rejected": 1.2487289905548096, "step": 1090 }, { "epoch": 0.061025505887574376, "grad_norm": 81.95460510253906, "learning_rate": 9.908389784129423e-08, "logits/chosen": -0.5383685231208801, "logits/rejected": -0.6315664052963257, "logps/chosen": -189.26133728027344, "logps/rejected": -218.469970703125, "loss": 1.4756, "nll_loss": 1.1009684801101685, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.154654502868652, "rewards/margins": 1.9512214660644531, "rewards/rejected": 2.203433036804199, "step": 1100 }, { "epoch": 0.061580283213825054, "grad_norm": 89.14315795898438, "learning_rate": 9.906721794560272e-08, "logits/chosen": -0.6819087862968445, "logits/rejected": -0.7490144968032837, "logps/chosen": -181.35693359375, "logps/rejected": -242.1012420654297, "loss": 1.4939, "nll_loss": 1.1502264738082886, "rewards/accuracies": 0.75, "rewards/chosen": 3.75810170173645, "rewards/margins": 1.9803187847137451, "rewards/rejected": 1.7777824401855469, "step": 1110 }, { "epoch": 0.062135060540075726, "grad_norm": 47.63044738769531, "learning_rate": 9.905038899697923e-08, "logits/chosen": -0.4637017846107483, "logits/rejected": -0.5964235067367554, "logps/chosen": -166.28646850585938, "logps/rejected": -200.87290954589844, "loss": 1.3853, "nll_loss": 0.9769280552864075, "rewards/accuracies": 0.75, "rewards/chosen": 3.4817116260528564, "rewards/margins": 1.9392492771148682, "rewards/rejected": 1.542462706565857, "step": 1120 }, { "epoch": 0.0626898378663264, "grad_norm": 115.41996765136719, "learning_rate": 9.903341104654555e-08, "logits/chosen": -0.6566618084907532, "logits/rejected": -0.7295863628387451, "logps/chosen": -207.2266082763672, "logps/rejected": -257.237548828125, "loss": 1.4687, "nll_loss": 1.2293752431869507, "rewards/accuracies": 0.75, "rewards/chosen": 4.08721923828125, "rewards/margins": 1.7286536693572998, "rewards/rejected": 2.358565330505371, "step": 1130 }, { "epoch": 0.06324461519257708, "grad_norm": 101.11539459228516, "learning_rate": 9.901628414587611e-08, "logits/chosen": -0.4132419228553772, "logits/rejected": -0.5075428485870361, "logps/chosen": -157.56349182128906, "logps/rejected": -176.73516845703125, "loss": 1.4391, "nll_loss": 1.0173633098602295, "rewards/accuracies": 0.75, "rewards/chosen": 3.75007700920105, "rewards/margins": 1.989717721939087, "rewards/rejected": 1.760359525680542, "step": 1140 }, { "epoch": 0.06379939251882775, "grad_norm": 74.02896118164062, "learning_rate": 9.899900834699777e-08, "logits/chosen": -0.4107815623283386, "logits/rejected": -0.5122383236885071, "logps/chosen": -172.81393432617188, "logps/rejected": -206.0386199951172, "loss": 1.5624, "nll_loss": 0.9882047772407532, "rewards/accuracies": 0.75, "rewards/chosen": 3.779421329498291, "rewards/margins": 2.0062572956085205, "rewards/rejected": 1.7731640338897705, "step": 1150 }, { "epoch": 0.06435416984507843, "grad_norm": 46.51359558105469, "learning_rate": 9.898158370238976e-08, "logits/chosen": -0.7199384570121765, "logits/rejected": -0.7897688746452332, "logps/chosen": -253.076171875, "logps/rejected": -301.18597412109375, "loss": 1.4927, "nll_loss": 1.3301750421524048, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.715771675109863, "rewards/margins": 2.7596588134765625, "rewards/rejected": 1.9561126232147217, "step": 1160 }, { "epoch": 0.06490894717132911, "grad_norm": 70.0849609375, "learning_rate": 9.896401026498343e-08, "logits/chosen": -0.5783329010009766, "logits/rejected": -0.6710189580917358, "logps/chosen": -198.61477661132812, "logps/rejected": -258.7814025878906, "loss": 1.5312, "nll_loss": 1.1829686164855957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.90598726272583, "rewards/margins": 2.0684738159179688, "rewards/rejected": 1.8375133275985718, "step": 1170 }, { "epoch": 0.06546372449757978, "grad_norm": 166.74185180664062, "learning_rate": 9.894628808816212e-08, "logits/chosen": -0.43889349699020386, "logits/rejected": -0.5390881299972534, "logps/chosen": -200.86856079101562, "logps/rejected": -249.177734375, "loss": 1.4699, "nll_loss": 1.1773768663406372, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 3.9578583240509033, "rewards/margins": 1.9102592468261719, "rewards/rejected": 2.0475986003875732, "step": 1180 }, { "epoch": 0.06601850182383046, "grad_norm": 67.43829345703125, "learning_rate": 9.892841722576102e-08, "logits/chosen": -0.4639360308647156, "logits/rejected": -0.5482727885246277, "logps/chosen": -206.2231903076172, "logps/rejected": -238.5017547607422, "loss": 1.4611, "nll_loss": 1.0525023937225342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9667439460754395, "rewards/margins": 1.8209073543548584, "rewards/rejected": 2.145836353302002, "step": 1190 }, { "epoch": 0.06657327915008114, "grad_norm": 52.553775787353516, "learning_rate": 9.891039773206698e-08, "logits/chosen": -0.26780468225479126, "logits/rejected": -0.4361448287963867, "logps/chosen": -144.7643585205078, "logps/rejected": -160.23165893554688, "loss": 1.4146, "nll_loss": 0.84807288646698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4257991313934326, "rewards/margins": 2.3991568088531494, "rewards/rejected": 1.026642084121704, "step": 1200 }, { "epoch": 0.06712805647633181, "grad_norm": 67.21038818359375, "learning_rate": 9.889222966181832e-08, "logits/chosen": -0.3794083595275879, "logits/rejected": -0.4984433650970459, "logps/chosen": -149.71102905273438, "logps/rejected": -184.6141357421875, "loss": 1.4207, "nll_loss": 0.8969683647155762, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.4109930992126465, "rewards/margins": 2.001525640487671, "rewards/rejected": 1.4094676971435547, "step": 1210 }, { "epoch": 0.06768283380258248, "grad_norm": 52.498809814453125, "learning_rate": 9.887391307020474e-08, "logits/chosen": -0.5022028684616089, "logits/rejected": -0.6018491983413696, "logps/chosen": -176.34487915039062, "logps/rejected": -228.7765350341797, "loss": 1.4615, "nll_loss": 1.0426609516143799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.007895469665527, "rewards/margins": 2.2430477142333984, "rewards/rejected": 1.7648475170135498, "step": 1220 }, { "epoch": 0.06823761112883317, "grad_norm": 87.10646057128906, "learning_rate": 9.885544801286707e-08, "logits/chosen": -0.5170990228652954, "logits/rejected": -0.6354082226753235, "logps/chosen": -222.16311645507812, "logps/rejected": -261.0590515136719, "loss": 1.4259, "nll_loss": 1.1672570705413818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.265625476837158, "rewards/margins": 1.9461997747421265, "rewards/rejected": 2.319425106048584, "step": 1230 }, { "epoch": 0.06879238845508384, "grad_norm": 51.09988784790039, "learning_rate": 9.883683454589719e-08, "logits/chosen": -0.5030714273452759, "logits/rejected": -0.515740156173706, "logps/chosen": -187.69720458984375, "logps/rejected": -203.89950561523438, "loss": 1.45, "nll_loss": 1.1443597078323364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7398152351379395, "rewards/margins": 1.2381267547607422, "rewards/rejected": 2.5016884803771973, "step": 1240 }, { "epoch": 0.06934716578133451, "grad_norm": 55.36213302612305, "learning_rate": 9.881807272583775e-08, "logits/chosen": -0.504234790802002, "logits/rejected": -0.6037534475326538, "logps/chosen": -199.28404235839844, "logps/rejected": -232.842529296875, "loss": 1.5279, "nll_loss": 1.1022770404815674, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.129483699798584, "rewards/margins": 2.379916191101074, "rewards/rejected": 1.7495676279067993, "step": 1250 }, { "epoch": 0.0699019431075852, "grad_norm": 81.34074401855469, "learning_rate": 9.87991626096821e-08, "logits/chosen": -0.39803606271743774, "logits/rejected": -0.5363016724586487, "logps/chosen": -169.8374786376953, "logps/rejected": -200.1742706298828, "loss": 1.4199, "nll_loss": 1.0287504196166992, "rewards/accuracies": 0.875, "rewards/chosen": 3.8434112071990967, "rewards/margins": 2.135047435760498, "rewards/rejected": 1.7083642482757568, "step": 1260 }, { "epoch": 0.07045672043383587, "grad_norm": 58.16438674926758, "learning_rate": 9.878010425487406e-08, "logits/chosen": -0.3032079339027405, "logits/rejected": -0.4739023745059967, "logps/chosen": -171.28402709960938, "logps/rejected": -232.79196166992188, "loss": 1.4122, "nll_loss": 0.9198330044746399, "rewards/accuracies": 0.875, "rewards/chosen": 3.7980828285217285, "rewards/margins": 2.5010132789611816, "rewards/rejected": 1.2970690727233887, "step": 1270 }, { "epoch": 0.07101149776008654, "grad_norm": 136.7597198486328, "learning_rate": 9.876089771930773e-08, "logits/chosen": -0.3859093189239502, "logits/rejected": -0.480471670627594, "logps/chosen": -202.7171173095703, "logps/rejected": -246.14987182617188, "loss": 1.4481, "nll_loss": 1.058734655380249, "rewards/accuracies": 0.875, "rewards/chosen": 4.078368186950684, "rewards/margins": 2.5813488960266113, "rewards/rejected": 1.497018814086914, "step": 1280 }, { "epoch": 0.07156627508633723, "grad_norm": 72.67900848388672, "learning_rate": 9.87415430613274e-08, "logits/chosen": -0.6447448134422302, "logits/rejected": -0.6883346438407898, "logps/chosen": -196.11752319335938, "logps/rejected": -229.57937622070312, "loss": 1.5414, "nll_loss": 1.1916755437850952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.364915370941162, "rewards/margins": 1.8597400188446045, "rewards/rejected": 2.505175828933716, "step": 1290 }, { "epoch": 0.0721210524125879, "grad_norm": 87.22559356689453, "learning_rate": 9.872204033972725e-08, "logits/chosen": -0.48257774114608765, "logits/rejected": -0.5261390209197998, "logps/chosen": -162.76255798339844, "logps/rejected": -173.9418182373047, "loss": 1.4177, "nll_loss": 1.1087771654129028, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.952287197113037, "rewards/margins": 0.9866729974746704, "rewards/rejected": 2.9656143188476562, "step": 1300 }, { "epoch": 0.07267582973883857, "grad_norm": 58.74521255493164, "learning_rate": 9.87023896137513e-08, "logits/chosen": -0.5969017744064331, "logits/rejected": -0.6603950262069702, "logps/chosen": -196.26272583007812, "logps/rejected": -233.4521484375, "loss": 1.4637, "nll_loss": 1.133876085281372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.516307830810547, "rewards/margins": 2.2669475078582764, "rewards/rejected": 2.2493605613708496, "step": 1310 }, { "epoch": 0.07323060706508926, "grad_norm": 71.52519226074219, "learning_rate": 9.868259094309312e-08, "logits/chosen": -0.4480765461921692, "logits/rejected": -0.5158835649490356, "logps/chosen": -180.5970916748047, "logps/rejected": -208.57308959960938, "loss": 1.4415, "nll_loss": 0.9754490852355957, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.8777382373809814, "rewards/margins": 2.0324997901916504, "rewards/rejected": 1.8452380895614624, "step": 1320 }, { "epoch": 0.07378538439133993, "grad_norm": 80.70097351074219, "learning_rate": 9.866264438789573e-08, "logits/chosen": -0.3521033823490143, "logits/rejected": -0.5131471157073975, "logps/chosen": -164.20703125, "logps/rejected": -206.5201873779297, "loss": 1.4401, "nll_loss": 0.8772087097167969, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5664222240448, "rewards/margins": 2.3574984073638916, "rewards/rejected": 1.2089238166809082, "step": 1330 }, { "epoch": 0.0743401617175906, "grad_norm": 82.3136215209961, "learning_rate": 9.864255000875135e-08, "logits/chosen": -0.6259538531303406, "logits/rejected": -0.6996973156929016, "logps/chosen": -176.74862670898438, "logps/rejected": -215.646728515625, "loss": 1.342, "nll_loss": 1.1308798789978027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.244176864624023, "rewards/margins": 2.25408935546875, "rewards/rejected": 1.9900877475738525, "step": 1340 }, { "epoch": 0.07489493904384127, "grad_norm": 53.10133361816406, "learning_rate": 9.862230786670127e-08, "logits/chosen": -0.4824862480163574, "logits/rejected": -0.5932101607322693, "logps/chosen": -173.16281127929688, "logps/rejected": -226.42526245117188, "loss": 1.4382, "nll_loss": 1.0056755542755127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.18387508392334, "rewards/margins": 2.5786943435668945, "rewards/rejected": 1.6051805019378662, "step": 1350 }, { "epoch": 0.07544971637009196, "grad_norm": 64.7579574584961, "learning_rate": 9.860191802323567e-08, "logits/chosen": -0.46298331022262573, "logits/rejected": -0.5535684823989868, "logps/chosen": -187.52774047851562, "logps/rejected": -206.24996948242188, "loss": 1.4767, "nll_loss": 1.0748491287231445, "rewards/accuracies": 0.75, "rewards/chosen": 3.873023271560669, "rewards/margins": 1.7396652698516846, "rewards/rejected": 2.1333580017089844, "step": 1360 }, { "epoch": 0.07600449369634263, "grad_norm": 70.06917572021484, "learning_rate": 9.858138054029334e-08, "logits/chosen": -0.4316721558570862, "logits/rejected": -0.5568591356277466, "logps/chosen": -165.09683227539062, "logps/rejected": -199.37698364257812, "loss": 1.5074, "nll_loss": 0.9594496488571167, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.6436493396759033, "rewards/margins": 1.799843430519104, "rewards/rejected": 1.8438060283660889, "step": 1370 }, { "epoch": 0.0765592710225933, "grad_norm": 64.8796157836914, "learning_rate": 9.85606954802616e-08, "logits/chosen": -0.49656182527542114, "logits/rejected": -0.6436377763748169, "logps/chosen": -179.42794799804688, "logps/rejected": -237.13687133789062, "loss": 1.4397, "nll_loss": 1.0689128637313843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.391759872436523, "rewards/margins": 2.9239375591278076, "rewards/rejected": 1.4678226709365845, "step": 1380 }, { "epoch": 0.07711404834884399, "grad_norm": 64.41740417480469, "learning_rate": 9.85398629059761e-08, "logits/chosen": -0.4562680125236511, "logits/rejected": -0.5946453809738159, "logps/chosen": -196.96487426757812, "logps/rejected": -232.92086791992188, "loss": 1.5122, "nll_loss": 1.0467199087142944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.238462924957275, "rewards/margins": 2.175060987472534, "rewards/rejected": 2.063401937484741, "step": 1390 }, { "epoch": 0.07766882567509466, "grad_norm": 75.7577133178711, "learning_rate": 9.851888288072053e-08, "logits/chosen": -0.49670663475990295, "logits/rejected": -0.5817729234695435, "logps/chosen": -221.2942352294922, "logps/rejected": -270.5111999511719, "loss": 1.5238, "nll_loss": 1.1094176769256592, "rewards/accuracies": 0.75, "rewards/chosen": 4.342289924621582, "rewards/margins": 1.7921016216278076, "rewards/rejected": 2.5501883029937744, "step": 1400 }, { "epoch": 0.07822360300134533, "grad_norm": 52.14347839355469, "learning_rate": 9.849775546822654e-08, "logits/chosen": -0.3518516719341278, "logits/rejected": -0.47065719962120056, "logps/chosen": -170.92910766601562, "logps/rejected": -220.5255584716797, "loss": 1.4916, "nll_loss": 1.0167186260223389, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.730181932449341, "rewards/margins": 2.0523428916931152, "rewards/rejected": 1.6778392791748047, "step": 1410 }, { "epoch": 0.07877838032759601, "grad_norm": 91.87435150146484, "learning_rate": 9.847648073267349e-08, "logits/chosen": -0.35065048933029175, "logits/rejected": -0.44443875551223755, "logps/chosen": -166.6290740966797, "logps/rejected": -195.34881591796875, "loss": 1.4514, "nll_loss": 1.0776584148406982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.8567519187927246, "rewards/margins": 1.3364388942718506, "rewards/rejected": 2.5203135013580322, "step": 1420 }, { "epoch": 0.07933315765384669, "grad_norm": 70.29676055908203, "learning_rate": 9.845505873868828e-08, "logits/chosen": -0.5298658609390259, "logits/rejected": -0.5880465507507324, "logps/chosen": -199.88330078125, "logps/rejected": -245.65884399414062, "loss": 1.4259, "nll_loss": 1.1225529909133911, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.7313690185546875, "rewards/margins": 2.4238762855529785, "rewards/rejected": 2.307492733001709, "step": 1430 }, { "epoch": 0.07988793498009736, "grad_norm": 48.4738883972168, "learning_rate": 9.843348955134512e-08, "logits/chosen": -0.24719564616680145, "logits/rejected": -0.36896735429763794, "logps/chosen": -125.46437072753906, "logps/rejected": -174.4749755859375, "loss": 1.3892, "nll_loss": 0.8810212016105652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6172351837158203, "rewards/margins": 1.618438482284546, "rewards/rejected": 1.9987967014312744, "step": 1440 }, { "epoch": 0.08044271230634804, "grad_norm": 53.75647735595703, "learning_rate": 9.841177323616539e-08, "logits/chosen": -0.38867291808128357, "logits/rejected": -0.45642417669296265, "logps/chosen": -171.5618896484375, "logps/rejected": -219.5745391845703, "loss": 1.4276, "nll_loss": 1.084578514099121, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.974804639816284, "rewards/margins": 1.9698415994644165, "rewards/rejected": 2.0049631595611572, "step": 1450 }, { "epoch": 0.08099748963259872, "grad_norm": 77.83335876464844, "learning_rate": 9.838990985911733e-08, "logits/chosen": -0.2802060842514038, "logits/rejected": -0.42082786560058594, "logps/chosen": -182.0515594482422, "logps/rejected": -234.0809783935547, "loss": 1.36, "nll_loss": 0.9822225570678711, "rewards/accuracies": 0.875, "rewards/chosen": 4.118023872375488, "rewards/margins": 2.575249671936035, "rewards/rejected": 1.5427742004394531, "step": 1460 }, { "epoch": 0.08155226695884939, "grad_norm": 102.21620178222656, "learning_rate": 9.836789948661601e-08, "logits/chosen": -0.525530993938446, "logits/rejected": -0.6287695169448853, "logps/chosen": -194.15786743164062, "logps/rejected": -247.2244873046875, "loss": 1.4288, "nll_loss": 1.122293472290039, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.391050338745117, "rewards/margins": 3.0546398162841797, "rewards/rejected": 1.3364105224609375, "step": 1470 }, { "epoch": 0.08210704428510007, "grad_norm": 45.071205139160156, "learning_rate": 9.834574218552296e-08, "logits/chosen": -0.47796911001205444, "logits/rejected": -0.5628957748413086, "logps/chosen": -219.58468627929688, "logps/rejected": -254.8559112548828, "loss": 1.4685, "nll_loss": 1.1712572574615479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.881595611572266, "rewards/margins": 2.209044933319092, "rewards/rejected": 2.6725502014160156, "step": 1480 }, { "epoch": 0.08266182161135074, "grad_norm": 55.145233154296875, "learning_rate": 9.832343802314609e-08, "logits/chosen": -0.4577816426753998, "logits/rejected": -0.5360755920410156, "logps/chosen": -166.1474151611328, "logps/rejected": -212.7266082763672, "loss": 1.5458, "nll_loss": 1.0354721546173096, "rewards/accuracies": 0.75, "rewards/chosen": 3.9975013732910156, "rewards/margins": 1.7976630926132202, "rewards/rejected": 2.1998379230499268, "step": 1490 }, { "epoch": 0.08321659893760142, "grad_norm": 60.92216491699219, "learning_rate": 9.830098706723939e-08, "logits/chosen": -0.4298805296421051, "logits/rejected": -0.5215967893600464, "logps/chosen": -168.5172576904297, "logps/rejected": -216.61776733398438, "loss": 1.4519, "nll_loss": 1.0098575353622437, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.8528778553009033, "rewards/margins": 2.0149474143981934, "rewards/rejected": 1.837930679321289, "step": 1500 }, { "epoch": 0.08321659893760142, "eval_logits/chosen": -0.5136753916740417, "eval_logits/rejected": -0.5830292105674744, "eval_logps/chosen": -210.95797729492188, "eval_logps/rejected": -261.8099060058594, "eval_loss": 1.3702356815338135, "eval_nll_loss": 1.0946283340454102, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": 4.700324535369873, "eval_rewards/margins": 2.973465919494629, "eval_rewards/rejected": 1.7268586158752441, "eval_runtime": 17.0904, "eval_samples_per_second": 14.979, "eval_steps_per_second": 1.872, "step": 1500 }, { "epoch": 0.0837713762638521, "grad_norm": 53.873538970947266, "learning_rate": 9.82783893860028e-08, "logits/chosen": -0.5111064314842224, "logits/rejected": -0.5315500497817993, "logps/chosen": -219.55331420898438, "logps/rejected": -232.83358764648438, "loss": 1.4566, "nll_loss": 1.1236168146133423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.430356502532959, "rewards/margins": 2.316408634185791, "rewards/rejected": 2.113948345184326, "step": 1510 }, { "epoch": 0.08432615359010277, "grad_norm": 71.6829605102539, "learning_rate": 9.825564504808194e-08, "logits/chosen": -0.2526777386665344, "logits/rejected": -0.4250311851501465, "logps/chosen": -171.2292938232422, "logps/rejected": -220.65261840820312, "loss": 1.3977, "nll_loss": 1.0460208654403687, "rewards/accuracies": 0.875, "rewards/chosen": 3.5906548500061035, "rewards/margins": 2.9093871116638184, "rewards/rejected": 0.6812671422958374, "step": 1520 }, { "epoch": 0.08488093091635344, "grad_norm": 113.11854553222656, "learning_rate": 9.8232754122568e-08, "logits/chosen": -0.40277594327926636, "logits/rejected": -0.535345733165741, "logps/chosen": -165.22146606445312, "logps/rejected": -204.9488525390625, "loss": 1.3966, "nll_loss": 0.9646211862564087, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.141239166259766, "rewards/margins": 2.4749937057495117, "rewards/rejected": 1.6662452220916748, "step": 1530 }, { "epoch": 0.08543570824260413, "grad_norm": 52.08089065551758, "learning_rate": 9.820971667899738e-08, "logits/chosen": -0.31119513511657715, "logits/rejected": -0.412311851978302, "logps/chosen": -157.13917541503906, "logps/rejected": -171.68382263183594, "loss": 1.4096, "nll_loss": 0.9384014010429382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6781508922576904, "rewards/margins": 1.3841872215270996, "rewards/rejected": 2.29396390914917, "step": 1540 }, { "epoch": 0.0859904855688548, "grad_norm": 47.32821273803711, "learning_rate": 9.81865327873516e-08, "logits/chosen": -0.38009652495384216, "logits/rejected": -0.46700453758239746, "logps/chosen": -166.48367309570312, "logps/rejected": -226.55978393554688, "loss": 1.4395, "nll_loss": 1.0256550312042236, "rewards/accuracies": 0.75, "rewards/chosen": 4.013241291046143, "rewards/margins": 2.4705066680908203, "rewards/rejected": 1.5427347421646118, "step": 1550 }, { "epoch": 0.08654526289510547, "grad_norm": 62.39463806152344, "learning_rate": 9.816320251805707e-08, "logits/chosen": -0.42687439918518066, "logits/rejected": -0.5247625708580017, "logps/chosen": -184.75782775878906, "logps/rejected": -218.777099609375, "loss": 1.3588, "nll_loss": 1.0533530712127686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.659199237823486, "rewards/margins": 2.8975088596343994, "rewards/rejected": 1.7616904973983765, "step": 1560 }, { "epoch": 0.08710004022135616, "grad_norm": 61.17539978027344, "learning_rate": 9.813972594198482e-08, "logits/chosen": -0.1231955885887146, "logits/rejected": -0.18562906980514526, "logps/chosen": -125.1419448852539, "logps/rejected": -158.606201171875, "loss": 1.52, "nll_loss": 0.7582105398178101, "rewards/accuracies": 0.625, "rewards/chosen": 3.6675257682800293, "rewards/margins": 1.8140029907226562, "rewards/rejected": 1.8535226583480835, "step": 1570 }, { "epoch": 0.08765481754760683, "grad_norm": 67.13497161865234, "learning_rate": 9.811610313045036e-08, "logits/chosen": -0.40413790941238403, "logits/rejected": -0.5055649280548096, "logps/chosen": -157.7107696533203, "logps/rejected": -210.97189331054688, "loss": 1.4775, "nll_loss": 0.965429425239563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.296413898468018, "rewards/margins": 2.041860342025757, "rewards/rejected": 2.2545535564422607, "step": 1580 }, { "epoch": 0.0882095948738575, "grad_norm": 56.057708740234375, "learning_rate": 9.809233415521336e-08, "logits/chosen": -0.5187186598777771, "logits/rejected": -0.649011492729187, "logps/chosen": -191.6517791748047, "logps/rejected": -231.47738647460938, "loss": 1.5193, "nll_loss": 1.0975863933563232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.417916297912598, "rewards/margins": 2.629579544067383, "rewards/rejected": 1.7883371114730835, "step": 1590 }, { "epoch": 0.08876437220010819, "grad_norm": 42.684879302978516, "learning_rate": 9.806841908847757e-08, "logits/chosen": -0.4185276925563812, "logits/rejected": -0.5338040590286255, "logps/chosen": -170.87448120117188, "logps/rejected": -228.2291259765625, "loss": 1.3534, "nll_loss": 0.9347308278083801, "rewards/accuracies": 0.875, "rewards/chosen": 4.035393238067627, "rewards/margins": 2.006787061691284, "rewards/rejected": 2.0286059379577637, "step": 1600 }, { "epoch": 0.08931914952635886, "grad_norm": 44.60700607299805, "learning_rate": 9.804435800289046e-08, "logits/chosen": -0.4805383086204529, "logits/rejected": -0.5604225397109985, "logps/chosen": -190.8730926513672, "logps/rejected": -237.6107940673828, "loss": 1.3941, "nll_loss": 1.0892784595489502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.903960227966309, "rewards/margins": 2.260164499282837, "rewards/rejected": 2.6437954902648926, "step": 1610 }, { "epoch": 0.08987392685260953, "grad_norm": 77.23152160644531, "learning_rate": 9.802015097154314e-08, "logits/chosen": -0.2898111641407013, "logits/rejected": -0.43970757722854614, "logps/chosen": -159.96951293945312, "logps/rejected": -189.67385864257812, "loss": 1.4362, "nll_loss": 0.9260392189025879, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.9348278045654297, "rewards/margins": 2.105313301086426, "rewards/rejected": 1.8295142650604248, "step": 1620 }, { "epoch": 0.0904287041788602, "grad_norm": 158.89340209960938, "learning_rate": 9.799579806796998e-08, "logits/chosen": -0.40225309133529663, "logits/rejected": -0.5593348741531372, "logps/chosen": -159.14244079589844, "logps/rejected": -219.98593139648438, "loss": 1.3974, "nll_loss": 0.9277356863021851, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.9359307289123535, "rewards/margins": 2.5688395500183105, "rewards/rejected": 1.3670909404754639, "step": 1630 }, { "epoch": 0.09098348150511089, "grad_norm": 59.240806579589844, "learning_rate": 9.797129936614854e-08, "logits/chosen": -0.3756170868873596, "logits/rejected": -0.5068528056144714, "logps/chosen": -181.37208557128906, "logps/rejected": -230.23306274414062, "loss": 1.3775, "nll_loss": 1.0865579843521118, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.105896472930908, "rewards/margins": 2.750957489013672, "rewards/rejected": 1.3549387454986572, "step": 1640 }, { "epoch": 0.09153825883136156, "grad_norm": 55.147117614746094, "learning_rate": 9.794665494049925e-08, "logits/chosen": -0.4884285032749176, "logits/rejected": -0.5975539088249207, "logps/chosen": -193.12168884277344, "logps/rejected": -249.9346160888672, "loss": 1.4176, "nll_loss": 1.0450865030288696, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.538626670837402, "rewards/margins": 2.036604404449463, "rewards/rejected": 2.5020222663879395, "step": 1650 }, { "epoch": 0.09209303615761223, "grad_norm": 81.85306549072266, "learning_rate": 9.792186486588518e-08, "logits/chosen": -0.4586809277534485, "logits/rejected": -0.5848284959793091, "logps/chosen": -202.53909301757812, "logps/rejected": -270.3190002441406, "loss": 1.4877, "nll_loss": 1.1120998859405518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.986132621765137, "rewards/margins": 2.992452383041382, "rewards/rejected": 1.9936797618865967, "step": 1660 }, { "epoch": 0.09264781348386292, "grad_norm": 55.15419387817383, "learning_rate": 9.789692921761188e-08, "logits/chosen": -0.4707298278808594, "logits/rejected": -0.6037745475769043, "logps/chosen": -176.55191040039062, "logps/rejected": -206.2583465576172, "loss": 1.5395, "nll_loss": 1.0647156238555908, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.59691858291626, "rewards/margins": 2.427427291870117, "rewards/rejected": 2.1694915294647217, "step": 1670 }, { "epoch": 0.09320259081011359, "grad_norm": 55.7188720703125, "learning_rate": 9.787184807142712e-08, "logits/chosen": -0.3414674699306488, "logits/rejected": -0.4763232171535492, "logps/chosen": -168.2011260986328, "logps/rejected": -199.79759216308594, "loss": 1.4015, "nll_loss": 0.9019485712051392, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.1074419021606445, "rewards/margins": 2.4492523670196533, "rewards/rejected": 1.6581900119781494, "step": 1680 }, { "epoch": 0.09375736813636426, "grad_norm": 109.35962677001953, "learning_rate": 9.784662150352062e-08, "logits/chosen": -0.4362913966178894, "logits/rejected": -0.4770258963108063, "logps/chosen": -187.99073791503906, "logps/rejected": -217.7333984375, "loss": 1.4942, "nll_loss": 1.0999925136566162, "rewards/accuracies": 0.75, "rewards/chosen": 4.209378242492676, "rewards/margins": 1.955529808998108, "rewards/rejected": 2.2538483142852783, "step": 1690 }, { "epoch": 0.09431214546261495, "grad_norm": 59.52936935424805, "learning_rate": 9.782124959052387e-08, "logits/chosen": -0.31050777435302734, "logits/rejected": -0.4834163784980774, "logps/chosen": -151.91183471679688, "logps/rejected": -206.20889282226562, "loss": 1.4293, "nll_loss": 1.0492300987243652, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.9366631507873535, "rewards/margins": 2.777130365371704, "rewards/rejected": 1.1595325469970703, "step": 1700 }, { "epoch": 0.09486692278886562, "grad_norm": 54.30078125, "learning_rate": 9.779573240950986e-08, "logits/chosen": -0.3345809876918793, "logits/rejected": -0.49536198377609253, "logps/chosen": -164.2360076904297, "logps/rejected": -208.17587280273438, "loss": 1.4578, "nll_loss": 0.9239109754562378, "rewards/accuracies": 0.875, "rewards/chosen": 4.0734710693359375, "rewards/margins": 2.3916687965393066, "rewards/rejected": 1.6818021535873413, "step": 1710 }, { "epoch": 0.09542170011511629, "grad_norm": 70.32289123535156, "learning_rate": 9.777007003799293e-08, "logits/chosen": -0.3356882631778717, "logits/rejected": -0.44931039214134216, "logps/chosen": -144.88253784179688, "logps/rejected": -189.18826293945312, "loss": 1.3894, "nll_loss": 0.9175260663032532, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.052679538726807, "rewards/margins": 2.4650509357452393, "rewards/rejected": 1.5876284837722778, "step": 1720 }, { "epoch": 0.09597647744136698, "grad_norm": 63.81721878051758, "learning_rate": 9.774426255392838e-08, "logits/chosen": -0.4146268367767334, "logits/rejected": -0.5499147772789001, "logps/chosen": -183.33480834960938, "logps/rejected": -236.8433837890625, "loss": 1.4114, "nll_loss": 1.0112346410751343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3223090171813965, "rewards/margins": 3.125511646270752, "rewards/rejected": 1.196797251701355, "step": 1730 }, { "epoch": 0.09653125476761765, "grad_norm": 62.25202560424805, "learning_rate": 9.771831003571235e-08, "logits/chosen": -0.38659048080444336, "logits/rejected": -0.5219605565071106, "logps/chosen": -150.69058227539062, "logps/rejected": -203.98416137695312, "loss": 1.4122, "nll_loss": 0.9478418231010437, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.085387229919434, "rewards/margins": 2.139543056488037, "rewards/rejected": 1.9458439350128174, "step": 1740 }, { "epoch": 0.09708603209386832, "grad_norm": 59.93496322631836, "learning_rate": 9.769221256218163e-08, "logits/chosen": -0.3459341526031494, "logits/rejected": -0.4226457476615906, "logps/chosen": -146.584228515625, "logps/rejected": -199.90530395507812, "loss": 1.351, "nll_loss": 1.013200283050537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9782562255859375, "rewards/margins": 1.7799739837646484, "rewards/rejected": 2.198282241821289, "step": 1750 }, { "epoch": 0.097640809420119, "grad_norm": 176.6165313720703, "learning_rate": 9.766597021261323e-08, "logits/chosen": -0.4375430643558502, "logits/rejected": -0.5497900247573853, "logps/chosen": -189.30075073242188, "logps/rejected": -244.5822296142578, "loss": 1.3991, "nll_loss": 1.0267442464828491, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.295384407043457, "rewards/margins": 2.189558744430542, "rewards/rejected": 2.105825185775757, "step": 1760 }, { "epoch": 0.09819558674636968, "grad_norm": 69.3628158569336, "learning_rate": 9.763958306672433e-08, "logits/chosen": -0.3780445456504822, "logits/rejected": -0.46087446808815, "logps/chosen": -152.63388061523438, "logps/rejected": -201.66612243652344, "loss": 1.4051, "nll_loss": 0.9626606702804565, "rewards/accuracies": 0.625, "rewards/chosen": 3.8893821239471436, "rewards/margins": 1.7870652675628662, "rewards/rejected": 2.1023168563842773, "step": 1770 }, { "epoch": 0.09875036407262035, "grad_norm": 75.25344848632812, "learning_rate": 9.761305120467192e-08, "logits/chosen": -0.5166088342666626, "logits/rejected": -0.6094520688056946, "logps/chosen": -200.65896606445312, "logps/rejected": -249.94650268554688, "loss": 1.4433, "nll_loss": 1.131493330001831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6476335525512695, "rewards/margins": 2.0360946655273438, "rewards/rejected": 2.6115384101867676, "step": 1780 }, { "epoch": 0.09930514139887103, "grad_norm": 152.2681884765625, "learning_rate": 9.758637470705263e-08, "logits/chosen": -0.4471518397331238, "logits/rejected": -0.5774090886116028, "logps/chosen": -179.9422149658203, "logps/rejected": -255.3101348876953, "loss": 1.4773, "nll_loss": 1.0336309671401978, "rewards/accuracies": 0.875, "rewards/chosen": 5.0185651779174805, "rewards/margins": 3.1911580562591553, "rewards/rejected": 1.827406883239746, "step": 1790 }, { "epoch": 0.0998599187251217, "grad_norm": 46.491920471191406, "learning_rate": 9.755955365490245e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.13861083984375, "logps/rejected": -212.89462280273438, "loss": 1.3892, "nll_loss": NaN, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.287023067474365, "rewards/margins": 2.0191829204559326, "rewards/rejected": 2.26784086227417, "step": 1800 }, { "epoch": 0.10041469605137238, "grad_norm": 48.82244873046875, "learning_rate": 9.753258812969647e-08, "logits/chosen": -0.3557376265525818, "logits/rejected": -0.5209270715713501, "logps/chosen": -165.89688110351562, "logps/rejected": -213.8446502685547, "loss": 1.4122, "nll_loss": 0.9721792340278625, "rewards/accuracies": 0.75, "rewards/chosen": 4.09226655960083, "rewards/margins": 2.568732976913452, "rewards/rejected": 1.5235334634780884, "step": 1810 }, { "epoch": 0.10096947337762306, "grad_norm": 55.496524810791016, "learning_rate": 9.750547821334867e-08, "logits/chosen": -0.4283338487148285, "logits/rejected": -0.5281612873077393, "logps/chosen": -168.81820678710938, "logps/rejected": -198.29931640625, "loss": 1.3929, "nll_loss": 1.0313835144042969, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.235574245452881, "rewards/margins": 1.6748672723770142, "rewards/rejected": 2.5607073307037354, "step": 1820 }, { "epoch": 0.10152425070387373, "grad_norm": 97.6830825805664, "learning_rate": 9.747822398821163e-08, "logits/chosen": -0.2920827269554138, "logits/rejected": -0.38235941529273987, "logps/chosen": -158.22317504882812, "logps/rejected": -206.8228302001953, "loss": 1.47, "nll_loss": 0.9028658866882324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.040580749511719, "rewards/margins": 2.828991413116455, "rewards/rejected": 1.2115892171859741, "step": 1830 }, { "epoch": 0.1020790280301244, "grad_norm": 58.198036193847656, "learning_rate": 9.74508255370763e-08, "logits/chosen": -0.32580018043518066, "logits/rejected": -0.4605080187320709, "logps/chosen": -179.208984375, "logps/rejected": -224.35549926757812, "loss": 1.4179, "nll_loss": 0.9562959671020508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.129389762878418, "rewards/margins": 2.2468628883361816, "rewards/rejected": 1.8825271129608154, "step": 1840 }, { "epoch": 0.10263380535637509, "grad_norm": 52.22254180908203, "learning_rate": 9.74232829431718e-08, "logits/chosen": -0.33645763993263245, "logits/rejected": -0.40842223167419434, "logps/chosen": -180.7502899169922, "logps/rejected": -201.20211791992188, "loss": 1.4376, "nll_loss": 0.9569438099861145, "rewards/accuracies": 0.75, "rewards/chosen": 4.07291316986084, "rewards/margins": 2.01356840133667, "rewards/rejected": 2.05934476852417, "step": 1850 }, { "epoch": 0.10318858268262576, "grad_norm": 71.80465698242188, "learning_rate": 9.739559629016504e-08, "logits/chosen": -0.40867680311203003, "logits/rejected": -0.5316632986068726, "logps/chosen": -166.32406616210938, "logps/rejected": -198.76620483398438, "loss": 1.3739, "nll_loss": 0.9971723556518555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.112570285797119, "rewards/margins": 2.156506299972534, "rewards/rejected": 1.9560636281967163, "step": 1860 }, { "epoch": 0.10374336000887643, "grad_norm": 81.6459732055664, "learning_rate": 9.73677656621606e-08, "logits/chosen": -0.2995353639125824, "logits/rejected": -0.47169438004493713, "logps/chosen": -158.6389923095703, "logps/rejected": -209.0341033935547, "loss": 1.3715, "nll_loss": 0.9142085313796997, "rewards/accuracies": 0.75, "rewards/chosen": 3.959867000579834, "rewards/margins": 2.00348162651062, "rewards/rejected": 1.9563853740692139, "step": 1870 }, { "epoch": 0.10429813733512712, "grad_norm": 129.5898895263672, "learning_rate": 9.733979114370039e-08, "logits/chosen": -0.36188822984695435, "logits/rejected": -0.49326688051223755, "logps/chosen": -160.85301208496094, "logps/rejected": -207.2004852294922, "loss": 1.4365, "nll_loss": 1.1323637962341309, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.231746196746826, "rewards/margins": 2.284331798553467, "rewards/rejected": 1.9474146366119385, "step": 1880 }, { "epoch": 0.10485291466137779, "grad_norm": 35.927913665771484, "learning_rate": 9.731167281976343e-08, "logits/chosen": -0.20025630295276642, "logits/rejected": -0.40075206756591797, "logps/chosen": -136.97433471679688, "logps/rejected": -193.73257446289062, "loss": 1.3541, "nll_loss": 0.7926868200302124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.765598773956299, "rewards/margins": 2.8811652660369873, "rewards/rejected": 0.8844332695007324, "step": 1890 }, { "epoch": 0.10540769198762846, "grad_norm": 52.5135498046875, "learning_rate": 9.728341077576558e-08, "logits/chosen": -0.3085986077785492, "logits/rejected": -0.5054864883422852, "logps/chosen": -144.28353881835938, "logps/rejected": -197.0537567138672, "loss": 1.4769, "nll_loss": 0.9166660308837891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9618003368377686, "rewards/margins": 1.8736820220947266, "rewards/rejected": 2.088118553161621, "step": 1900 }, { "epoch": 0.10596246931387915, "grad_norm": 67.79092407226562, "learning_rate": 9.725500509755928e-08, "logits/chosen": -0.5144957304000854, "logits/rejected": -0.6106340885162354, "logps/chosen": -197.23281860351562, "logps/rejected": -215.8070068359375, "loss": 1.4379, "nll_loss": 1.1905429363250732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.12790060043335, "rewards/margins": 1.6955938339233398, "rewards/rejected": 2.432307004928589, "step": 1910 }, { "epoch": 0.10651724664012982, "grad_norm": 51.10321807861328, "learning_rate": 9.722645587143332e-08, "logits/chosen": -0.4561639726161957, "logits/rejected": -0.5225346088409424, "logps/chosen": -188.32070922851562, "logps/rejected": -241.41073608398438, "loss": 1.3968, "nll_loss": 1.073650598526001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.139203071594238, "rewards/margins": 2.8594624996185303, "rewards/rejected": 2.279740571975708, "step": 1920 }, { "epoch": 0.10707202396638049, "grad_norm": 44.254398345947266, "learning_rate": 9.719776318411248e-08, "logits/chosen": -0.4358777105808258, "logits/rejected": -0.5370916724205017, "logps/chosen": -185.11911010742188, "logps/rejected": -222.16281127929688, "loss": 1.3303, "nll_loss": 1.0807971954345703, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.67964506149292, "rewards/margins": 2.1169772148132324, "rewards/rejected": 2.5626680850982666, "step": 1930 }, { "epoch": 0.10762680129263116, "grad_norm": 37.96709060668945, "learning_rate": 9.716892712275742e-08, "logits/chosen": -0.28701865673065186, "logits/rejected": -0.4266432821750641, "logps/chosen": -169.7532958984375, "logps/rejected": -221.8658905029297, "loss": 1.3887, "nll_loss": 1.0122158527374268, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.1084699630737305, "rewards/margins": 2.203993558883667, "rewards/rejected": 1.9044767618179321, "step": 1940 }, { "epoch": 0.10818157861888185, "grad_norm": 74.52854919433594, "learning_rate": 9.713994777496426e-08, "logits/chosen": -0.35312619805336, "logits/rejected": -0.44773387908935547, "logps/chosen": -211.1968536376953, "logps/rejected": -245.0993194580078, "loss": 1.3873, "nll_loss": 1.113231897354126, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.581801414489746, "rewards/margins": 1.4854530096054077, "rewards/rejected": 3.096348285675049, "step": 1950 }, { "epoch": 0.10873635594513252, "grad_norm": 54.876041412353516, "learning_rate": 9.711082522876444e-08, "logits/chosen": -0.32861536741256714, "logits/rejected": -0.44481348991394043, "logps/chosen": -170.75796508789062, "logps/rejected": -215.32333374023438, "loss": 1.431, "nll_loss": 0.9936912655830383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.210268020629883, "rewards/margins": 1.985047698020935, "rewards/rejected": 2.2252204418182373, "step": 1960 }, { "epoch": 0.10929113327138319, "grad_norm": 57.80482864379883, "learning_rate": 9.708155957262437e-08, "logits/chosen": -0.2759546935558319, "logits/rejected": -0.3380012512207031, "logps/chosen": -157.31033325195312, "logps/rejected": -198.11666870117188, "loss": 1.3951, "nll_loss": 1.0163196325302124, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.121042251586914, "rewards/margins": 1.612929105758667, "rewards/rejected": 2.508113145828247, "step": 1970 }, { "epoch": 0.10984591059763388, "grad_norm": 86.91264343261719, "learning_rate": 9.705215089544518e-08, "logits/chosen": -0.3782724142074585, "logits/rejected": -0.5245341062545776, "logps/chosen": -195.6573944091797, "logps/rejected": -244.4193878173828, "loss": 1.4257, "nll_loss": 1.0299656391143799, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.9395599365234375, "rewards/margins": 2.814535140991211, "rewards/rejected": 2.1250247955322266, "step": 1980 }, { "epoch": 0.11040068792388455, "grad_norm": 43.226234436035156, "learning_rate": 9.702259928656249e-08, "logits/chosen": -0.10544047504663467, "logits/rejected": -0.2132861167192459, "logps/chosen": -164.14208984375, "logps/rejected": -190.37362670898438, "loss": 1.427, "nll_loss": 1.0020391941070557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.172114372253418, "rewards/margins": 2.6576035022735596, "rewards/rejected": 1.5145113468170166, "step": 1990 }, { "epoch": 0.11095546525013522, "grad_norm": 64.444091796875, "learning_rate": 9.69929048357461e-08, "logits/chosen": -0.26225289702415466, "logits/rejected": -0.3790452182292938, "logps/chosen": -168.84194946289062, "logps/rejected": -215.24838256835938, "loss": 1.3836, "nll_loss": 1.0095182657241821, "rewards/accuracies": 0.875, "rewards/chosen": 4.499636650085449, "rewards/margins": 2.150195598602295, "rewards/rejected": 2.349440813064575, "step": 2000 }, { "epoch": 0.11095546525013522, "eval_logits/chosen": -0.4270898401737213, "eval_logits/rejected": -0.5021917819976807, "eval_logps/chosen": -208.1876983642578, "eval_logps/rejected": -262.03790283203125, "eval_loss": 1.341249704360962, "eval_nll_loss": 1.0813319683074951, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 4.977352142333984, "eval_rewards/margins": 3.27329158782959, "eval_rewards/rejected": 1.7040609121322632, "eval_runtime": 16.7677, "eval_samples_per_second": 15.267, "eval_steps_per_second": 1.908, "step": 2000 }, { "epoch": 0.1115102425763859, "grad_norm": 63.21308135986328, "learning_rate": 9.69630676331997e-08, "logits/chosen": -0.20330052077770233, "logits/rejected": -0.34869131445884705, "logps/chosen": -144.7039031982422, "logps/rejected": -180.5189666748047, "loss": 1.3936, "nll_loss": 0.895270049571991, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.967087984085083, "rewards/margins": 2.0332229137420654, "rewards/rejected": 1.933864951133728, "step": 2010 }, { "epoch": 0.11206501990263658, "grad_norm": 45.90394973754883, "learning_rate": 9.693308776956066e-08, "logits/chosen": -0.31239813566207886, "logits/rejected": -0.39418482780456543, "logps/chosen": -167.0019989013672, "logps/rejected": -191.33401489257812, "loss": 1.4304, "nll_loss": 1.0243754386901855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.358006954193115, "rewards/margins": 1.9015105962753296, "rewards/rejected": 2.456496238708496, "step": 2020 }, { "epoch": 0.11261979722888725, "grad_norm": 104.78388214111328, "learning_rate": 9.690296533589967e-08, "logits/chosen": -0.40280312299728394, "logits/rejected": -0.535481333732605, "logps/chosen": -194.77020263671875, "logps/rejected": -264.6238708496094, "loss": 1.3595, "nll_loss": 1.0842351913452148, "rewards/accuracies": 0.875, "rewards/chosen": 5.004550933837891, "rewards/margins": 3.121086597442627, "rewards/rejected": 1.8834642171859741, "step": 2030 }, { "epoch": 0.11317457455513794, "grad_norm": 76.62211608886719, "learning_rate": 9.687270042372054e-08, "logits/chosen": -0.4506068825721741, "logits/rejected": -0.540243923664093, "logps/chosen": -207.814208984375, "logps/rejected": -275.18365478515625, "loss": 1.4491, "nll_loss": 1.1151775121688843, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.382890224456787, "rewards/margins": 3.2836947441101074, "rewards/rejected": 2.0991952419281006, "step": 2040 }, { "epoch": 0.11372935188138861, "grad_norm": 56.325111389160156, "learning_rate": 9.684229312495988e-08, "logits/chosen": -0.37510785460472107, "logits/rejected": -0.4385475516319275, "logps/chosen": -180.5703582763672, "logps/rejected": -232.40567016601562, "loss": 1.4121, "nll_loss": 1.07108473777771, "rewards/accuracies": 0.875, "rewards/chosen": 4.931944847106934, "rewards/margins": 2.992361307144165, "rewards/rejected": 1.939583420753479, "step": 2050 }, { "epoch": 0.11428412920763928, "grad_norm": 70.96062469482422, "learning_rate": 9.681174353198685e-08, "logits/chosen": -0.28505033254623413, "logits/rejected": -0.40620937943458557, "logps/chosen": -187.11529541015625, "logps/rejected": -233.93344116210938, "loss": 1.4701, "nll_loss": 1.0873926877975464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.375863075256348, "rewards/margins": 2.9457461833953857, "rewards/rejected": 1.4301164150238037, "step": 2060 }, { "epoch": 0.11483890653388996, "grad_norm": 68.80590057373047, "learning_rate": 9.678105173760285e-08, "logits/chosen": -0.18282446265220642, "logits/rejected": -0.39009958505630493, "logps/chosen": -157.3737030029297, "logps/rejected": -227.16940307617188, "loss": 1.4138, "nll_loss": 0.8834668397903442, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.3074188232421875, "rewards/margins": 3.3912670612335205, "rewards/rejected": 0.9161517024040222, "step": 2070 }, { "epoch": 0.11539368386014064, "grad_norm": 70.03636932373047, "learning_rate": 9.675021783504122e-08, "logits/chosen": -0.30556461215019226, "logits/rejected": -0.35940033197402954, "logps/chosen": -158.53201293945312, "logps/rejected": -205.63998413085938, "loss": 1.3905, "nll_loss": 1.0669022798538208, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.223614692687988, "rewards/margins": 2.5228731632232666, "rewards/rejected": 1.7007417678833008, "step": 2080 }, { "epoch": 0.11594846118639131, "grad_norm": 55.57160949707031, "learning_rate": 9.671924191796705e-08, "logits/chosen": -0.2807254195213318, "logits/rejected": -0.4453394412994385, "logps/chosen": -152.5956268310547, "logps/rejected": -212.7981719970703, "loss": 1.4375, "nll_loss": 0.9396727681159973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.076318740844727, "rewards/margins": 2.426535129547119, "rewards/rejected": 1.649783730506897, "step": 2090 }, { "epoch": 0.116503238512642, "grad_norm": 69.33473205566406, "learning_rate": 9.668812408047677e-08, "logits/chosen": -0.3224617540836334, "logits/rejected": -0.4488009810447693, "logps/chosen": -158.0413818359375, "logps/rejected": -199.71896362304688, "loss": 1.3829, "nll_loss": 1.0075881481170654, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.9751713275909424, "rewards/margins": 2.181872844696045, "rewards/rejected": 1.7932977676391602, "step": 2100 }, { "epoch": 0.11705801583889267, "grad_norm": 60.37345504760742, "learning_rate": 9.665686441709795e-08, "logits/chosen": -0.2691134512424469, "logits/rejected": -0.3475096821784973, "logps/chosen": -143.7865753173828, "logps/rejected": -180.9481658935547, "loss": 1.3225, "nll_loss": 0.8925089836120605, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.228122711181641, "rewards/margins": 1.9724922180175781, "rewards/rejected": 2.2556302547454834, "step": 2110 }, { "epoch": 0.11761279316514334, "grad_norm": 75.00153350830078, "learning_rate": 9.6625463022789e-08, "logits/chosen": -0.5172563195228577, "logits/rejected": -0.5626317858695984, "logps/chosen": -240.8362274169922, "logps/rejected": -280.37213134765625, "loss": 1.4174, "nll_loss": 1.210435390472412, "rewards/accuracies": 0.875, "rewards/chosen": 5.2368035316467285, "rewards/margins": 2.980569839477539, "rewards/rejected": 2.2562336921691895, "step": 2120 }, { "epoch": 0.11816757049139402, "grad_norm": 92.12297058105469, "learning_rate": 9.659391999293887e-08, "logits/chosen": -0.3115905225276947, "logits/rejected": -0.41539478302001953, "logps/chosen": -164.57894897460938, "logps/rejected": -210.8973846435547, "loss": 1.425, "nll_loss": 1.0213747024536133, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.291318893432617, "rewards/margins": 2.2107698917388916, "rewards/rejected": 2.0805487632751465, "step": 2130 }, { "epoch": 0.1187223478176447, "grad_norm": 83.99625396728516, "learning_rate": 9.656223542336671e-08, "logits/chosen": -0.4168204367160797, "logits/rejected": -0.5383001565933228, "logps/chosen": -192.26571655273438, "logps/rejected": -239.5349578857422, "loss": 1.4688, "nll_loss": 1.14029860496521, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.050605297088623, "rewards/margins": 3.2355990409851074, "rewards/rejected": 1.8150064945220947, "step": 2140 }, { "epoch": 0.11927712514389537, "grad_norm": 30.861711502075195, "learning_rate": 9.65304094103217e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -168.80422973632812, "logps/rejected": -230.22152709960938, "loss": 1.4595, "nll_loss": NaN, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 4.314932346343994, "rewards/margins": 2.1812479496002197, "rewards/rejected": 2.1336848735809326, "step": 2150 }, { "epoch": 0.11983190247014605, "grad_norm": 57.356346130371094, "learning_rate": 9.649844205048267e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.3910369873047, "logps/rejected": -230.20474243164062, "loss": 1.4426, "nll_loss": NaN, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.695038795471191, "rewards/margins": 2.3562328815460205, "rewards/rejected": 2.338805913925171, "step": 2160 }, { "epoch": 0.12038667979639672, "grad_norm": 87.51946258544922, "learning_rate": 9.646633344095778e-08, "logits/chosen": -0.19644995033740997, "logits/rejected": -0.37703800201416016, "logps/chosen": -167.8843231201172, "logps/rejected": -216.3540802001953, "loss": 1.4262, "nll_loss": 0.916726291179657, "rewards/accuracies": 0.875, "rewards/chosen": 4.303994178771973, "rewards/margins": 2.7248735427856445, "rewards/rejected": 1.57912015914917, "step": 2170 }, { "epoch": 0.1209414571226474, "grad_norm": 49.67451858520508, "learning_rate": 9.643408367928432e-08, "logits/chosen": -0.2518579959869385, "logits/rejected": -0.43059998750686646, "logps/chosen": -168.8131103515625, "logps/rejected": -232.3626708984375, "loss": 1.353, "nll_loss": 0.9257919192314148, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.668224811553955, "rewards/margins": 3.628369092941284, "rewards/rejected": 1.0398554801940918, "step": 2180 }, { "epoch": 0.12149623444889808, "grad_norm": 34.286373138427734, "learning_rate": 9.640169286342832e-08, "logits/chosen": -0.2528410851955414, "logits/rejected": -0.3437945246696472, "logps/chosen": -153.04017639160156, "logps/rejected": -205.0538787841797, "loss": 1.389, "nll_loss": 1.110828161239624, "rewards/accuracies": 0.75, "rewards/chosen": 4.003229141235352, "rewards/margins": 1.8116796016693115, "rewards/rejected": 2.191549777984619, "step": 2190 }, { "epoch": 0.12205101177514875, "grad_norm": 55.41353988647461, "learning_rate": 9.636916109178433e-08, "logits/chosen": -0.251595675945282, "logits/rejected": -0.3192422389984131, "logps/chosen": -196.3855438232422, "logps/rejected": -246.07162475585938, "loss": 1.3982, "nll_loss": 1.0199682712554932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.79991340637207, "rewards/margins": 2.171217441558838, "rewards/rejected": 2.6286959648132324, "step": 2200 }, { "epoch": 0.12260578910139942, "grad_norm": 102.95879364013672, "learning_rate": 9.633648846317505e-08, "logits/chosen": -0.2962803542613983, "logits/rejected": -0.3776516020298004, "logps/chosen": -202.29464721679688, "logps/rejected": -251.30032348632812, "loss": 1.4217, "nll_loss": 1.0513694286346436, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.9984965324401855, "rewards/margins": 2.980966091156006, "rewards/rejected": 2.017530679702759, "step": 2210 }, { "epoch": 0.12316056642765011, "grad_norm": 114.79225158691406, "learning_rate": 9.630367507685111e-08, "logits/chosen": -0.10262326151132584, "logits/rejected": -0.22282084822654724, "logps/chosen": -125.6513900756836, "logps/rejected": -162.16390991210938, "loss": 1.396, "nll_loss": 0.80219566822052, "rewards/accuracies": 0.875, "rewards/chosen": 3.7719345092773438, "rewards/margins": 2.6604316234588623, "rewards/rejected": 1.1115028858184814, "step": 2220 }, { "epoch": 0.12371534375390078, "grad_norm": 68.96896362304688, "learning_rate": 9.627072103249068e-08, "logits/chosen": -0.30825161933898926, "logits/rejected": -0.41360312700271606, "logps/chosen": -183.67562866210938, "logps/rejected": -223.6616668701172, "loss": 1.4085, "nll_loss": 1.0462400913238525, "rewards/accuracies": 0.75, "rewards/chosen": 4.440855979919434, "rewards/margins": 2.040837049484253, "rewards/rejected": 2.4000189304351807, "step": 2230 }, { "epoch": 0.12427012108015145, "grad_norm": 57.11476516723633, "learning_rate": 9.623762643019926e-08, "logits/chosen": -0.11846674978733063, "logits/rejected": -0.26219338178634644, "logps/chosen": -150.19265747070312, "logps/rejected": -192.1765899658203, "loss": 1.3587, "nll_loss": 0.9004217982292175, "rewards/accuracies": 0.875, "rewards/chosen": 4.165079593658447, "rewards/margins": 2.5247955322265625, "rewards/rejected": 1.6402838230133057, "step": 2240 }, { "epoch": 0.12482489840640212, "grad_norm": 69.78447723388672, "learning_rate": 9.620439137050927e-08, "logits/chosen": -0.19403323531150818, "logits/rejected": -0.30210986733436584, "logps/chosen": -170.3800506591797, "logps/rejected": -195.099609375, "loss": 1.3113, "nll_loss": 0.9437860250473022, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.235895156860352, "rewards/margins": 2.4345595836639404, "rewards/rejected": 1.801335334777832, "step": 2250 }, { "epoch": 0.1253796757326528, "grad_norm": 77.17230987548828, "learning_rate": 9.617101595437982e-08, "logits/chosen": -0.05190909653902054, "logits/rejected": -0.2363159954547882, "logps/chosen": -140.75799560546875, "logps/rejected": -211.8653106689453, "loss": 1.3641, "nll_loss": 0.8239955902099609, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.866058826446533, "rewards/margins": 2.844568967819214, "rewards/rejected": 1.0214897394180298, "step": 2260 }, { "epoch": 0.1259344530589035, "grad_norm": 49.746421813964844, "learning_rate": 9.613750028319642e-08, "logits/chosen": -0.047062940895557404, "logits/rejected": -0.15455859899520874, "logps/chosen": -143.74386596679688, "logps/rejected": -195.1613311767578, "loss": 1.4324, "nll_loss": 0.9071475863456726, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.077420234680176, "rewards/margins": 2.103487730026245, "rewards/rejected": 1.9739322662353516, "step": 2270 }, { "epoch": 0.12648923038515417, "grad_norm": 51.56296920776367, "learning_rate": 9.61038444587706e-08, "logits/chosen": -0.13705101609230042, "logits/rejected": -0.2227870225906372, "logps/chosen": -129.62725830078125, "logps/rejected": -178.81573486328125, "loss": 1.3893, "nll_loss": 0.9397897720336914, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.89752197265625, "rewards/margins": 1.2923376560211182, "rewards/rejected": 2.6051838397979736, "step": 2280 }, { "epoch": 0.12704400771140484, "grad_norm": 42.245574951171875, "learning_rate": 9.607004858333964e-08, "logits/chosen": -0.44759708642959595, "logits/rejected": -0.49922627210617065, "logps/chosen": -196.9328155517578, "logps/rejected": -245.5009307861328, "loss": 1.4152, "nll_loss": 1.1975692510604858, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.185031890869141, "rewards/margins": 2.9058709144592285, "rewards/rejected": 2.279160976409912, "step": 2290 }, { "epoch": 0.1275987850376555, "grad_norm": 67.2354736328125, "learning_rate": 9.60361127595663e-08, "logits/chosen": -0.320967435836792, "logits/rejected": -0.4030250012874603, "logps/chosen": -155.604736328125, "logps/rejected": -204.4038848876953, "loss": 1.4695, "nll_loss": 0.9363700747489929, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.52934455871582, "rewards/margins": 2.116450548171997, "rewards/rejected": 2.4128942489624023, "step": 2300 }, { "epoch": 0.12815356236390618, "grad_norm": 63.65040969848633, "learning_rate": 9.600203709053839e-08, "logits/chosen": -0.3212184011936188, "logits/rejected": -0.4064360558986664, "logps/chosen": -176.45187377929688, "logps/rejected": -233.3936309814453, "loss": 1.4012, "nll_loss": 1.0256242752075195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.018517971038818, "rewards/margins": 2.8526389598846436, "rewards/rejected": 2.1658787727355957, "step": 2310 }, { "epoch": 0.12870833969015685, "grad_norm": 63.4724235534668, "learning_rate": 9.596782167976859e-08, "logits/chosen": -0.11651863902807236, "logits/rejected": -0.23722949624061584, "logps/chosen": -128.25538635253906, "logps/rejected": -159.47409057617188, "loss": 1.5035, "nll_loss": 0.8557698130607605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.848564624786377, "rewards/margins": 1.983266830444336, "rewards/rejected": 1.8652980327606201, "step": 2320 }, { "epoch": 0.12926311701640755, "grad_norm": 92.61750030517578, "learning_rate": 9.593346663119406e-08, "logits/chosen": -0.2534486651420593, "logits/rejected": -0.34855300188064575, "logps/chosen": -171.09487915039062, "logps/rejected": -218.22903442382812, "loss": 1.3338, "nll_loss": 0.996772289276123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.49443244934082, "rewards/margins": 2.554107666015625, "rewards/rejected": 1.9403250217437744, "step": 2330 }, { "epoch": 0.12981789434265822, "grad_norm": 55.44472122192383, "learning_rate": 9.589897204917612e-08, "logits/chosen": -0.21271423995494843, "logits/rejected": -0.30972006916999817, "logps/chosen": -167.2717742919922, "logps/rejected": -234.3817596435547, "loss": 1.4084, "nll_loss": 0.9619334936141968, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.373074531555176, "rewards/margins": 2.7067365646362305, "rewards/rejected": 1.6663382053375244, "step": 2340 }, { "epoch": 0.1303726716689089, "grad_norm": 47.895484924316406, "learning_rate": 9.586433803850002e-08, "logits/chosen": -0.3097013235092163, "logits/rejected": -0.4161096215248108, "logps/chosen": -190.05905151367188, "logps/rejected": -239.8685302734375, "loss": 1.3589, "nll_loss": 1.0496143102645874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.816398620605469, "rewards/margins": 2.6777281761169434, "rewards/rejected": 2.1386706829071045, "step": 2350 }, { "epoch": 0.13092744899515957, "grad_norm": 78.9901351928711, "learning_rate": 9.582956470437448e-08, "logits/chosen": -0.18011830747127533, "logits/rejected": -0.3116056025028229, "logps/chosen": -165.65756225585938, "logps/rejected": -230.0876007080078, "loss": 1.3762, "nll_loss": 0.921899676322937, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.897641658782959, "rewards/margins": 3.6547799110412598, "rewards/rejected": 1.2428618669509888, "step": 2360 }, { "epoch": 0.13148222632141024, "grad_norm": 59.10150909423828, "learning_rate": 9.57946521524315e-08, "logits/chosen": -0.18872778117656708, "logits/rejected": -0.2746146023273468, "logps/chosen": -181.9303436279297, "logps/rejected": -245.42684936523438, "loss": 1.4089, "nll_loss": 0.9592266082763672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.487532615661621, "rewards/margins": 2.240269899368286, "rewards/rejected": 2.247262477874756, "step": 2370 }, { "epoch": 0.1320370036476609, "grad_norm": 83.45726776123047, "learning_rate": 9.575960048872594e-08, "logits/chosen": -0.18271943926811218, "logits/rejected": -0.3004222512245178, "logps/chosen": -165.93307495117188, "logps/rejected": -194.65318298339844, "loss": 1.3263, "nll_loss": 0.9600454568862915, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.347784996032715, "rewards/margins": 2.3133761882781982, "rewards/rejected": 2.0344088077545166, "step": 2380 }, { "epoch": 0.13259178097391158, "grad_norm": 63.885406494140625, "learning_rate": 9.572440981973529e-08, "logits/chosen": -0.0731525644659996, "logits/rejected": -0.19846148788928986, "logps/chosen": -142.23365783691406, "logps/rejected": -195.64901733398438, "loss": 1.2997, "nll_loss": 0.7700817584991455, "rewards/accuracies": 0.875, "rewards/chosen": 4.073976039886475, "rewards/margins": 2.837172508239746, "rewards/rejected": 1.2368037700653076, "step": 2390 }, { "epoch": 0.13314655830016228, "grad_norm": 73.3870849609375, "learning_rate": 9.56890802523593e-08, "logits/chosen": -0.3486558198928833, "logits/rejected": -0.46232134103775024, "logps/chosen": -193.32313537597656, "logps/rejected": -259.9707336425781, "loss": 1.4372, "nll_loss": 1.0756404399871826, "rewards/accuracies": 0.875, "rewards/chosen": 4.954958915710449, "rewards/margins": 3.1835172176361084, "rewards/rejected": 1.7714424133300781, "step": 2400 }, { "epoch": 0.13370133562641295, "grad_norm": 63.42995834350586, "learning_rate": 9.565361189391958e-08, "logits/chosen": -0.26387467980384827, "logits/rejected": -0.3511542081832886, "logps/chosen": -138.95855712890625, "logps/rejected": -180.8175048828125, "loss": 1.4044, "nll_loss": 0.8946741819381714, "rewards/accuracies": 0.875, "rewards/chosen": 4.0163798332214355, "rewards/margins": 2.472100257873535, "rewards/rejected": 1.54427969455719, "step": 2410 }, { "epoch": 0.13425611295266363, "grad_norm": 60.89449691772461, "learning_rate": 9.561800485215947e-08, "logits/chosen": -0.24570202827453613, "logits/rejected": -0.354546457529068, "logps/chosen": -169.95260620117188, "logps/rejected": -213.67324829101562, "loss": 1.4563, "nll_loss": 0.9607990980148315, "rewards/accuracies": 0.75, "rewards/chosen": 4.3260698318481445, "rewards/margins": 2.637070655822754, "rewards/rejected": 1.6889995336532593, "step": 2420 }, { "epoch": 0.1348108902789143, "grad_norm": 66.38797760009766, "learning_rate": 9.55822592352435e-08, "logits/chosen": -0.406474769115448, "logits/rejected": -0.4907301962375641, "logps/chosen": -184.1668701171875, "logps/rejected": -223.18600463867188, "loss": 1.3542, "nll_loss": 1.0416371822357178, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.934833526611328, "rewards/margins": 3.2224936485290527, "rewards/rejected": 1.7123390436172485, "step": 2430 }, { "epoch": 0.13536566760516497, "grad_norm": 76.15042877197266, "learning_rate": 9.554637515175716e-08, "logits/chosen": -0.3425951600074768, "logits/rejected": -0.4833051264286041, "logps/chosen": -144.9541778564453, "logps/rejected": -199.95538330078125, "loss": 1.323, "nll_loss": 0.9206892251968384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.348945617675781, "rewards/margins": 2.3510971069335938, "rewards/rejected": 1.9978487491607666, "step": 2440 }, { "epoch": 0.13592044493141564, "grad_norm": 71.86099243164062, "learning_rate": 9.551035271070663e-08, "logits/chosen": -0.40942057967185974, "logits/rejected": -0.505861759185791, "logps/chosen": -166.58743286132812, "logps/rejected": -208.3878173828125, "loss": 1.3985, "nll_loss": 1.0548110008239746, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.600960731506348, "rewards/margins": 2.512728214263916, "rewards/rejected": 2.0882325172424316, "step": 2450 }, { "epoch": 0.13647522225766634, "grad_norm": 53.405128479003906, "learning_rate": 9.547419202151832e-08, "logits/chosen": -0.3199925720691681, "logits/rejected": -0.42514413595199585, "logps/chosen": -169.04751586914062, "logps/rejected": -202.2387237548828, "loss": 1.331, "nll_loss": 0.8953672647476196, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.623941898345947, "rewards/margins": 3.305065155029297, "rewards/rejected": 1.3188765048980713, "step": 2460 }, { "epoch": 0.137029999583917, "grad_norm": 49.56265640258789, "learning_rate": 9.54378931940386e-08, "logits/chosen": -0.2953924536705017, "logits/rejected": -0.3510599434375763, "logps/chosen": -169.89207458496094, "logps/rejected": -211.8363800048828, "loss": 1.3748, "nll_loss": 0.9912246465682983, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.64087438583374, "rewards/margins": 2.8930931091308594, "rewards/rejected": 1.7477811574935913, "step": 2470 }, { "epoch": 0.13758477691016768, "grad_norm": 119.32328033447266, "learning_rate": 9.540145633853352e-08, "logits/chosen": -0.2507792115211487, "logits/rejected": -0.3617081940174103, "logps/chosen": -166.5975799560547, "logps/rejected": -203.4107666015625, "loss": 1.4113, "nll_loss": 1.057375192642212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.210103511810303, "rewards/margins": 2.3813328742980957, "rewards/rejected": 1.8287712335586548, "step": 2480 }, { "epoch": 0.13813955423641835, "grad_norm": 49.55260467529297, "learning_rate": 9.536488156568836e-08, "logits/chosen": -0.25834280252456665, "logits/rejected": -0.3647618591785431, "logps/chosen": -181.38433837890625, "logps/rejected": -227.53884887695312, "loss": 1.3538, "nll_loss": 1.0067346096038818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.781300067901611, "rewards/margins": 3.069465160369873, "rewards/rejected": 1.7118346691131592, "step": 2490 }, { "epoch": 0.13869433156266903, "grad_norm": 72.44939422607422, "learning_rate": 9.53281689866074e-08, "logits/chosen": -0.3534695506095886, "logits/rejected": -0.46817415952682495, "logps/chosen": -177.7882843017578, "logps/rejected": -227.4732666015625, "loss": 1.3581, "nll_loss": 1.0279648303985596, "rewards/accuracies": 0.875, "rewards/chosen": 5.046257495880127, "rewards/margins": 3.195817232131958, "rewards/rejected": 1.850440263748169, "step": 2500 }, { "epoch": 0.13869433156266903, "eval_logits/chosen": -0.41355088353157043, "eval_logits/rejected": -0.4752618968486786, "eval_logps/chosen": -205.35128784179688, "eval_logps/rejected": -262.29425048828125, "eval_loss": 1.344786286354065, "eval_nll_loss": 1.066452980041504, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 5.260993003845215, "eval_rewards/margins": 3.582566976547241, "eval_rewards/rejected": 1.6784261465072632, "eval_runtime": 16.7354, "eval_samples_per_second": 15.297, "eval_steps_per_second": 1.912, "step": 2500 }, { "epoch": 0.1392491088889197, "grad_norm": 80.55164337158203, "learning_rate": 9.529131871281351e-08, "logits/chosen": -0.3231019079685211, "logits/rejected": -0.38222265243530273, "logps/chosen": -167.98793029785156, "logps/rejected": -213.79104614257812, "loss": 1.4011, "nll_loss": 1.0517648458480835, "rewards/accuracies": 0.875, "rewards/chosen": 4.999415397644043, "rewards/margins": 2.8201375007629395, "rewards/rejected": 2.1792776584625244, "step": 2510 }, { "epoch": 0.1398038862151704, "grad_norm": 69.63285827636719, "learning_rate": 9.525433085624788e-08, "logits/chosen": -0.24561266601085663, "logits/rejected": -0.31572413444519043, "logps/chosen": -184.41099548339844, "logps/rejected": -204.45999145507812, "loss": 1.3704, "nll_loss": 0.9846397638320923, "rewards/accuracies": 0.75, "rewards/chosen": 4.188200950622559, "rewards/margins": 2.166051149368286, "rewards/rejected": 2.0221495628356934, "step": 2520 }, { "epoch": 0.14035866354142107, "grad_norm": 53.4434700012207, "learning_rate": 9.521720552926957e-08, "logits/chosen": -0.4215124249458313, "logits/rejected": -0.4378494322299957, "logps/chosen": -199.27281188964844, "logps/rejected": -233.3812713623047, "loss": 1.43, "nll_loss": 1.1478370428085327, "rewards/accuracies": 0.75, "rewards/chosen": 4.725975036621094, "rewards/margins": 1.5603116750717163, "rewards/rejected": 3.165663242340088, "step": 2530 }, { "epoch": 0.14091344086767174, "grad_norm": 73.6417236328125, "learning_rate": 9.517994284465531e-08, "logits/chosen": -0.2891234755516052, "logits/rejected": -0.38554567098617554, "logps/chosen": -153.97988891601562, "logps/rejected": -211.9022674560547, "loss": 1.3697, "nll_loss": 0.9445359110832214, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.174930572509766, "rewards/margins": 2.6157615184783936, "rewards/rejected": 1.559168815612793, "step": 2540 }, { "epoch": 0.1414682181939224, "grad_norm": 59.96479034423828, "learning_rate": 9.514254291559905e-08, "logits/chosen": -0.1881057322025299, "logits/rejected": -0.2699630856513977, "logps/chosen": -128.26663208007812, "logps/rejected": -177.18075561523438, "loss": 1.3287, "nll_loss": 0.8441031575202942, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 4.137763977050781, "rewards/margins": 2.495912551879883, "rewards/rejected": 1.6418521404266357, "step": 2550 }, { "epoch": 0.14202299552017308, "grad_norm": 65.00444030761719, "learning_rate": 9.510500585571164e-08, "logits/chosen": -0.4077147841453552, "logits/rejected": -0.5147227048873901, "logps/chosen": -179.1188201904297, "logps/rejected": -270.15191650390625, "loss": 1.4109, "nll_loss": 1.0775953531265259, "rewards/accuracies": 0.875, "rewards/chosen": 5.214983940124512, "rewards/margins": 3.057213306427002, "rewards/rejected": 2.157771110534668, "step": 2560 }, { "epoch": 0.14257777284642376, "grad_norm": 60.63315200805664, "learning_rate": 9.506733177902051e-08, "logits/chosen": -0.3077200949192047, "logits/rejected": -0.4166482985019684, "logps/chosen": -184.0797119140625, "logps/rejected": -217.2776641845703, "loss": 1.4804, "nll_loss": 1.0299670696258545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.7556843757629395, "rewards/margins": 2.782696485519409, "rewards/rejected": 1.9729883670806885, "step": 2570 }, { "epoch": 0.14313255017267446, "grad_norm": 68.85614013671875, "learning_rate": 9.502952079996933e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -166.62692260742188, "logps/rejected": -220.95968627929688, "loss": 1.3492, "nll_loss": NaN, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6029486656188965, "rewards/margins": 2.756474494934082, "rewards/rejected": 1.8464739322662354, "step": 2580 }, { "epoch": 0.14368732749892513, "grad_norm": 37.320533752441406, "learning_rate": 9.49915730334176e-08, "logits/chosen": -0.3359619677066803, "logits/rejected": -0.41889262199401855, "logps/chosen": -158.69314575195312, "logps/rejected": -187.04347229003906, "loss": 1.3012, "nll_loss": 0.8397552371025085, "rewards/accuracies": 0.875, "rewards/chosen": 4.5668864250183105, "rewards/margins": 2.7107009887695312, "rewards/rejected": 1.8561855554580688, "step": 2590 }, { "epoch": 0.1442421048251758, "grad_norm": 81.52698516845703, "learning_rate": 9.495348859464041e-08, "logits/chosen": -0.29530245065689087, "logits/rejected": -0.4065031111240387, "logps/chosen": -196.07713317871094, "logps/rejected": -223.0460662841797, "loss": 1.4478, "nll_loss": 1.1197636127471924, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.687514305114746, "rewards/margins": 2.8510119915008545, "rewards/rejected": 1.8365027904510498, "step": 2600 }, { "epoch": 0.14479688215142647, "grad_norm": 67.42103576660156, "learning_rate": 9.491526759932793e-08, "logits/chosen": -0.3822721838951111, "logits/rejected": -0.4680250287055969, "logps/chosen": -198.443359375, "logps/rejected": -236.1912384033203, "loss": 1.4023, "nll_loss": 1.0499932765960693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.903171539306641, "rewards/margins": 1.9136698246002197, "rewards/rejected": 2.989501714706421, "step": 2610 }, { "epoch": 0.14535165947767714, "grad_norm": 62.30340576171875, "learning_rate": 9.487691016358524e-08, "logits/chosen": -0.26125961542129517, "logits/rejected": -0.42635488510131836, "logps/chosen": -184.85794067382812, "logps/rejected": -258.67816162109375, "loss": 1.3805, "nll_loss": 1.0284109115600586, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.900014400482178, "rewards/margins": 4.008078575134277, "rewards/rejected": 0.8919361233711243, "step": 2620 }, { "epoch": 0.1459064368039278, "grad_norm": 103.89203643798828, "learning_rate": 9.483841640393181e-08, "logits/chosen": -0.33772093057632446, "logits/rejected": -0.45589035749435425, "logps/chosen": -168.879150390625, "logps/rejected": -214.2537078857422, "loss": 1.4352, "nll_loss": 0.9708501100540161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.847733497619629, "rewards/margins": 2.6163687705993652, "rewards/rejected": 2.2313640117645264, "step": 2630 }, { "epoch": 0.1464612141301785, "grad_norm": 50.89051055908203, "learning_rate": 9.47997864373013e-08, "logits/chosen": -0.2704085409641266, "logits/rejected": -0.32763293385505676, "logps/chosen": -138.16543579101562, "logps/rejected": -175.11236572265625, "loss": 1.3185, "nll_loss": 0.8956116437911987, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.168116092681885, "rewards/margins": 2.5568606853485107, "rewards/rejected": 1.6112550497055054, "step": 2640 }, { "epoch": 0.14701599145642918, "grad_norm": 48.47789001464844, "learning_rate": 9.47610203810411e-08, "logits/chosen": -0.35066038370132446, "logits/rejected": -0.4566461145877838, "logps/chosen": -175.8914031982422, "logps/rejected": -235.3515625, "loss": 1.3196, "nll_loss": 0.9624403119087219, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.729374408721924, "rewards/margins": 2.690577983856201, "rewards/rejected": 2.038796901702881, "step": 2650 }, { "epoch": 0.14757076878267986, "grad_norm": 43.42479705810547, "learning_rate": 9.472211835291199e-08, "logits/chosen": -0.4881797432899475, "logits/rejected": -0.5933297872543335, "logps/chosen": -187.93209838867188, "logps/rejected": -237.4537353515625, "loss": 1.3203, "nll_loss": 1.1305028200149536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.392927169799805, "rewards/margins": 2.4824376106262207, "rewards/rejected": 2.910489559173584, "step": 2660 }, { "epoch": 0.14812554610893053, "grad_norm": 65.60295104980469, "learning_rate": 9.468308047108779e-08, "logits/chosen": -0.43689948320388794, "logits/rejected": -0.4873555302619934, "logps/chosen": -210.8523406982422, "logps/rejected": -237.1154327392578, "loss": 1.4299, "nll_loss": 1.1426368951797485, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.3748369216918945, "rewards/margins": 2.182204484939575, "rewards/rejected": 3.1926326751708984, "step": 2670 }, { "epoch": 0.1486803234351812, "grad_norm": 91.42697143554688, "learning_rate": 9.464390685415504e-08, "logits/chosen": -0.3554970324039459, "logits/rejected": -0.3768675923347473, "logps/chosen": -195.17538452148438, "logps/rejected": -215.8704071044922, "loss": 1.4193, "nll_loss": 1.110939860343933, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.654454231262207, "rewards/margins": 1.4705301523208618, "rewards/rejected": 3.183924436569214, "step": 2680 }, { "epoch": 0.14923510076143187, "grad_norm": 68.16394805908203, "learning_rate": 9.46045976211126e-08, "logits/chosen": -0.4355560839176178, "logits/rejected": -0.5083562135696411, "logps/chosen": -196.14529418945312, "logps/rejected": -241.79342651367188, "loss": 1.5208, "nll_loss": 1.1575534343719482, "rewards/accuracies": 0.75, "rewards/chosen": 4.816002368927002, "rewards/margins": 2.4721813201904297, "rewards/rejected": 2.343820333480835, "step": 2690 }, { "epoch": 0.14978987808768254, "grad_norm": 43.191001892089844, "learning_rate": 9.456515289137125e-08, "logits/chosen": -0.2441895306110382, "logits/rejected": -0.3482380509376526, "logps/chosen": -182.13026428222656, "logps/rejected": -232.80166625976562, "loss": 1.3585, "nll_loss": 0.9620084762573242, "rewards/accuracies": 0.875, "rewards/chosen": 4.937588214874268, "rewards/margins": 2.732166290283203, "rewards/rejected": 2.2054219245910645, "step": 2700 }, { "epoch": 0.15034465541393324, "grad_norm": 72.89177703857422, "learning_rate": 9.452557278475344e-08, "logits/chosen": -0.42393484711647034, "logits/rejected": -0.5233877301216125, "logps/chosen": -169.06707763671875, "logps/rejected": -248.55252075195312, "loss": 1.3353, "nll_loss": 1.0183719396591187, "rewards/accuracies": 0.875, "rewards/chosen": 5.008650779724121, "rewards/margins": 2.579192638397217, "rewards/rejected": 2.4294581413269043, "step": 2710 }, { "epoch": 0.15089943274018391, "grad_norm": 61.85404968261719, "learning_rate": 9.448585742149279e-08, "logits/chosen": -0.3834911584854126, "logits/rejected": -0.516968846321106, "logps/chosen": -186.55929565429688, "logps/rejected": -254.18295288085938, "loss": 1.4313, "nll_loss": 1.0471513271331787, "rewards/accuracies": 0.875, "rewards/chosen": 5.193806171417236, "rewards/margins": 2.868375301361084, "rewards/rejected": 2.3254313468933105, "step": 2720 }, { "epoch": 0.15145421006643459, "grad_norm": 58.406558990478516, "learning_rate": 9.444600692223388e-08, "logits/chosen": -0.41122984886169434, "logits/rejected": -0.5016804337501526, "logps/chosen": -190.3726348876953, "logps/rejected": -236.7998046875, "loss": 1.3765, "nll_loss": 1.0485427379608154, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.190579891204834, "rewards/margins": 2.7871081829071045, "rewards/rejected": 2.4034721851348877, "step": 2730 }, { "epoch": 0.15200898739268526, "grad_norm": 55.32042694091797, "learning_rate": 9.44060214080317e-08, "logits/chosen": -0.19496165215969086, "logits/rejected": -0.30803734064102173, "logps/chosen": -174.32630920410156, "logps/rejected": -217.2548370361328, "loss": 1.4081, "nll_loss": 0.9257704615592957, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.517314910888672, "rewards/margins": 2.4943456649780273, "rewards/rejected": 2.0229687690734863, "step": 2740 }, { "epoch": 0.15256376471893593, "grad_norm": 50.69704818725586, "learning_rate": 9.436590100035144e-08, "logits/chosen": -0.4139935076236725, "logits/rejected": -0.48557084798812866, "logps/chosen": -186.61923217773438, "logps/rejected": -255.1016387939453, "loss": 1.3999, "nll_loss": 1.157854676246643, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.923837661743164, "rewards/margins": 2.315887928009033, "rewards/rejected": 2.607950210571289, "step": 2750 }, { "epoch": 0.1531185420451866, "grad_norm": 71.0564956665039, "learning_rate": 9.432564582106803e-08, "logits/chosen": -0.34736424684524536, "logits/rejected": -0.397489994764328, "logps/chosen": -162.69871520996094, "logps/rejected": -206.42636108398438, "loss": 1.287, "nll_loss": 1.0692551136016846, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.872149467468262, "rewards/margins": 2.6135616302490234, "rewards/rejected": 2.2585878372192383, "step": 2760 }, { "epoch": 0.1536733193714373, "grad_norm": 119.2959213256836, "learning_rate": 9.428525599246582e-08, "logits/chosen": -0.4050524830818176, "logits/rejected": -0.4646620750427246, "logps/chosen": -180.15858459472656, "logps/rejected": -206.5132293701172, "loss": 1.3877, "nll_loss": 1.1630154848098755, "rewards/accuracies": 0.875, "rewards/chosen": 4.824166297912598, "rewards/margins": 2.219174385070801, "rewards/rejected": 2.6049914360046387, "step": 2770 }, { "epoch": 0.15422809669768797, "grad_norm": 66.43024444580078, "learning_rate": 9.424473163723818e-08, "logits/chosen": -0.2653568387031555, "logits/rejected": -0.3475129008293152, "logps/chosen": -180.5072021484375, "logps/rejected": -226.7493896484375, "loss": 1.3441, "nll_loss": 0.967044472694397, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.633328437805176, "rewards/margins": 3.564223527908325, "rewards/rejected": 1.0691044330596924, "step": 2780 }, { "epoch": 0.15478287402393864, "grad_norm": 64.4342269897461, "learning_rate": 9.420407287848716e-08, "logits/chosen": -0.30602386593818665, "logits/rejected": -0.35876408219337463, "logps/chosen": -184.25668334960938, "logps/rejected": -224.2547607421875, "loss": 1.3739, "nll_loss": 0.9765238761901855, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.992084980010986, "rewards/margins": 2.586191177368164, "rewards/rejected": 2.4058938026428223, "step": 2790 }, { "epoch": 0.15533765135018932, "grad_norm": 84.9169921875, "learning_rate": 9.416327983972303e-08, "logits/chosen": -0.45902055501937866, "logits/rejected": -0.5061752200126648, "logps/chosen": -180.96231079101562, "logps/rejected": -223.2859344482422, "loss": 1.4444, "nll_loss": 1.0957945585250854, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.965345859527588, "rewards/margins": 2.9676547050476074, "rewards/rejected": 1.9976913928985596, "step": 2800 }, { "epoch": 0.15589242867644, "grad_norm": 67.10375213623047, "learning_rate": 9.412235264486403e-08, "logits/chosen": -0.36773133277893066, "logits/rejected": -0.44952455163002014, "logps/chosen": -169.8708953857422, "logps/rejected": -204.29342651367188, "loss": 1.3693, "nll_loss": 1.1890907287597656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.539219379425049, "rewards/margins": 2.4898459911346436, "rewards/rejected": 2.049373149871826, "step": 2810 }, { "epoch": 0.15644720600269066, "grad_norm": 137.09971618652344, "learning_rate": 9.40812914182359e-08, "logits/chosen": -0.5243615508079529, "logits/rejected": -0.5689690709114075, "logps/chosen": -236.2031707763672, "logps/rejected": -265.97564697265625, "loss": 1.3745, "nll_loss": 1.4004731178283691, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.410025596618652, "rewards/margins": 2.4597747325897217, "rewards/rejected": 2.9502501487731934, "step": 2820 }, { "epoch": 0.15700198332894136, "grad_norm": 76.55516052246094, "learning_rate": 9.404009628457152e-08, "logits/chosen": -0.2582315504550934, "logits/rejected": -0.3453051447868347, "logps/chosen": -173.9943084716797, "logps/rejected": -205.97421264648438, "loss": 1.3869, "nll_loss": 1.0055551528930664, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.79890775680542, "rewards/margins": 2.4663453102111816, "rewards/rejected": 2.3325629234313965, "step": 2830 }, { "epoch": 0.15755676065519203, "grad_norm": 105.99337768554688, "learning_rate": 9.399876736901059e-08, "logits/chosen": -0.1436496376991272, "logits/rejected": -0.26093825697898865, "logps/chosen": -134.3251495361328, "logps/rejected": -169.06387329101562, "loss": 1.272, "nll_loss": 0.8987034559249878, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.7388052940368652, "rewards/margins": 2.1922247409820557, "rewards/rejected": 1.5465809106826782, "step": 2840 }, { "epoch": 0.1581115379814427, "grad_norm": 73.1646957397461, "learning_rate": 9.395730479709914e-08, "logits/chosen": -0.2593434154987335, "logits/rejected": -0.3618290424346924, "logps/chosen": -166.7423095703125, "logps/rejected": -228.65707397460938, "loss": 1.3584, "nll_loss": 0.9439231157302856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7212347984313965, "rewards/margins": 3.2248616218566895, "rewards/rejected": 1.496372938156128, "step": 2850 }, { "epoch": 0.15866631530769337, "grad_norm": 53.906005859375, "learning_rate": 9.391570869478928e-08, "logits/chosen": -0.3020009398460388, "logits/rejected": -0.40985146164894104, "logps/chosen": -196.04159545898438, "logps/rejected": -255.9525146484375, "loss": 1.4542, "nll_loss": 1.0601094961166382, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.943997383117676, "rewards/margins": 3.0306007862091064, "rewards/rejected": 1.9133962392807007, "step": 2860 }, { "epoch": 0.15922109263394404, "grad_norm": 50.61529541015625, "learning_rate": 9.387397918843868e-08, "logits/chosen": -0.387464702129364, "logits/rejected": -0.4730927050113678, "logps/chosen": -171.99435424804688, "logps/rejected": -221.2834930419922, "loss": 1.3183, "nll_loss": 1.0262539386749268, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.058302402496338, "rewards/margins": 2.2185654640197754, "rewards/rejected": 2.8397369384765625, "step": 2870 }, { "epoch": 0.15977586996019472, "grad_norm": 54.42586135864258, "learning_rate": 9.383211640481031e-08, "logits/chosen": -0.3841005563735962, "logits/rejected": -0.4482545852661133, "logps/chosen": -210.54629516601562, "logps/rejected": -233.26083374023438, "loss": 1.4826, "nll_loss": 1.2217636108398438, "rewards/accuracies": 0.75, "rewards/chosen": 5.09761905670166, "rewards/margins": 2.4193615913391113, "rewards/rejected": 2.678257703781128, "step": 2880 }, { "epoch": 0.16033064728644542, "grad_norm": 71.05906677246094, "learning_rate": 9.379012047107198e-08, "logits/chosen": -0.25542598962783813, "logits/rejected": -0.3733716905117035, "logps/chosen": -196.41189575195312, "logps/rejected": -223.8795928955078, "loss": 1.3641, "nll_loss": 0.9989587068557739, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.955661773681641, "rewards/margins": 2.755467176437378, "rewards/rejected": 2.2001938819885254, "step": 2890 }, { "epoch": 0.1608854246126961, "grad_norm": 64.01283264160156, "learning_rate": 9.374799151479595e-08, "logits/chosen": -0.16815884411334991, "logits/rejected": -0.23455043137073517, "logps/chosen": -159.39224243164062, "logps/rejected": -203.01136779785156, "loss": 1.3999, "nll_loss": 0.9491283297538757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.669705390930176, "rewards/margins": 1.8462598323822021, "rewards/rejected": 2.8234455585479736, "step": 2900 }, { "epoch": 0.16144020193894676, "grad_norm": 36.37623596191406, "learning_rate": 9.370572966395862e-08, "logits/chosen": -0.4131261706352234, "logits/rejected": -0.46238717436790466, "logps/chosen": -209.1028289794922, "logps/rejected": -246.9708251953125, "loss": 1.3781, "nll_loss": 1.1953654289245605, "rewards/accuracies": 0.875, "rewards/chosen": 5.529759407043457, "rewards/margins": 2.8865723609924316, "rewards/rejected": 2.6431870460510254, "step": 2910 }, { "epoch": 0.16199497926519743, "grad_norm": 104.28368377685547, "learning_rate": 9.366333504694005e-08, "logits/chosen": -0.3713774085044861, "logits/rejected": -0.426577627658844, "logps/chosen": -194.2244415283203, "logps/rejected": -248.76950073242188, "loss": 1.406, "nll_loss": 1.1306129693984985, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.217923641204834, "rewards/margins": 2.2736878395080566, "rewards/rejected": 2.9442355632781982, "step": 2920 }, { "epoch": 0.1625497565914481, "grad_norm": 71.3147964477539, "learning_rate": 9.36208077925236e-08, "logits/chosen": -0.2785489559173584, "logits/rejected": -0.3307238221168518, "logps/chosen": -167.78414916992188, "logps/rejected": -202.63185119628906, "loss": 1.3369, "nll_loss": 1.0394691228866577, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.59388542175293, "rewards/margins": 2.529740810394287, "rewards/rejected": 2.0641446113586426, "step": 2930 }, { "epoch": 0.16310453391769877, "grad_norm": 59.345767974853516, "learning_rate": 9.357814802989559e-08, "logits/chosen": -0.2544178366661072, "logits/rejected": -0.3156990706920624, "logps/chosen": -164.45431518554688, "logps/rejected": -215.3196258544922, "loss": 1.2965, "nll_loss": 0.9578148722648621, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.471480369567871, "rewards/margins": 2.459791898727417, "rewards/rejected": 2.011688232421875, "step": 2940 }, { "epoch": 0.16365931124394947, "grad_norm": 44.61174011230469, "learning_rate": 9.35353558886448e-08, "logits/chosen": -0.203196719288826, "logits/rejected": -0.27118998765945435, "logps/chosen": -191.95758056640625, "logps/rejected": -233.2113037109375, "loss": 1.3525, "nll_loss": 0.9398837089538574, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.863980770111084, "rewards/margins": 2.6459555625915527, "rewards/rejected": 2.2180254459381104, "step": 2950 }, { "epoch": 0.16421408857020015, "grad_norm": 116.35255432128906, "learning_rate": 9.349243149876222e-08, "logits/chosen": -0.12396907806396484, "logits/rejected": -0.23714527487754822, "logps/chosen": -148.67955017089844, "logps/rejected": -216.0026397705078, "loss": 1.4087, "nll_loss": 0.8972294926643372, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.648340702056885, "rewards/margins": 3.5599422454833984, "rewards/rejected": 1.0883982181549072, "step": 2960 }, { "epoch": 0.16476886589645082, "grad_norm": 80.23884582519531, "learning_rate": 9.34493749906405e-08, "logits/chosen": -0.23079581558704376, "logits/rejected": -0.3237621486186981, "logps/chosen": -154.03265380859375, "logps/rejected": -206.5592498779297, "loss": 1.2691, "nll_loss": 0.9200903177261353, "rewards/accuracies": 0.875, "rewards/chosen": 4.6948981285095215, "rewards/margins": 3.045192241668701, "rewards/rejected": 1.6497061252593994, "step": 2970 }, { "epoch": 0.1653236432227015, "grad_norm": 71.23548889160156, "learning_rate": 9.340618649507368e-08, "logits/chosen": -0.23892824351787567, "logits/rejected": -0.3380570411682129, "logps/chosen": -201.36761474609375, "logps/rejected": -257.79241943359375, "loss": 1.3058, "nll_loss": 1.0435470342636108, "rewards/accuracies": 0.875, "rewards/chosen": 5.308444023132324, "rewards/margins": 3.2355926036834717, "rewards/rejected": 2.0728511810302734, "step": 2980 }, { "epoch": 0.16587842054895216, "grad_norm": 57.883033752441406, "learning_rate": 9.33628661432567e-08, "logits/chosen": -0.18109621107578278, "logits/rejected": -0.27738335728645325, "logps/chosen": -195.41867065429688, "logps/rejected": -238.0283966064453, "loss": 1.2382, "nll_loss": 1.0464454889297485, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.014590263366699, "rewards/margins": 3.5181357860565186, "rewards/rejected": 1.4964540004730225, "step": 2990 }, { "epoch": 0.16643319787520283, "grad_norm": 51.56818771362305, "learning_rate": 9.331941406678509e-08, "logits/chosen": -0.23022404313087463, "logits/rejected": -0.3125895857810974, "logps/chosen": -166.96844482421875, "logps/rejected": -210.93698120117188, "loss": 1.375, "nll_loss": 0.9681900143623352, "rewards/accuracies": 0.75, "rewards/chosen": 4.88405179977417, "rewards/margins": 3.1794562339782715, "rewards/rejected": 1.7045953273773193, "step": 3000 }, { "epoch": 0.16643319787520283, "eval_logits/chosen": -0.3941521942615509, "eval_logits/rejected": -0.45799243450164795, "eval_logps/chosen": -203.202392578125, "eval_logps/rejected": -262.8951110839844, "eval_loss": 1.3165993690490723, "eval_nll_loss": 1.0564641952514648, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 5.475882530212402, "eval_rewards/margins": 3.857545852661133, "eval_rewards/rejected": 1.6183371543884277, "eval_runtime": 16.8723, "eval_samples_per_second": 15.173, "eval_steps_per_second": 1.897, "step": 3000 }, { "epoch": 0.1669879752014535, "grad_norm": 56.567138671875, "learning_rate": 9.327583039765452e-08, "logits/chosen": -0.05765485763549805, "logits/rejected": -0.15203312039375305, "logps/chosen": -147.2929229736328, "logps/rejected": -186.84970092773438, "loss": 1.3847, "nll_loss": 0.8762642741203308, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.501176357269287, "rewards/margins": 2.618312358856201, "rewards/rejected": 1.882863998413086, "step": 3010 }, { "epoch": 0.1675427525277042, "grad_norm": 134.11328125, "learning_rate": 9.323211526826034e-08, "logits/chosen": -0.33124592900276184, "logits/rejected": -0.46882420778274536, "logps/chosen": -182.77406311035156, "logps/rejected": -228.29013061523438, "loss": 1.3804, "nll_loss": 1.0590949058532715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.268399715423584, "rewards/margins": 3.4461803436279297, "rewards/rejected": 1.8222192525863647, "step": 3020 }, { "epoch": 0.16809752985395487, "grad_norm": 43.63470458984375, "learning_rate": 9.31882688113973e-08, "logits/chosen": -0.4445907175540924, "logits/rejected": -0.5267969369888306, "logps/chosen": -192.04876708984375, "logps/rejected": -261.2143249511719, "loss": 1.2969, "nll_loss": 1.1113297939300537, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.295589447021484, "rewards/margins": 3.152954339981079, "rewards/rejected": 2.1426353454589844, "step": 3030 }, { "epoch": 0.16865230718020555, "grad_norm": 65.57992553710938, "learning_rate": 9.314429116025908e-08, "logits/chosen": -0.30112895369529724, "logits/rejected": -0.3968796730041504, "logps/chosen": -181.86544799804688, "logps/rejected": -220.15896606445312, "loss": 1.3056, "nll_loss": 0.9839954376220703, "rewards/accuracies": 0.875, "rewards/chosen": 4.820133209228516, "rewards/margins": 3.0142996311187744, "rewards/rejected": 1.8058335781097412, "step": 3040 }, { "epoch": 0.16920708450645622, "grad_norm": 95.29837799072266, "learning_rate": 9.310018244843788e-08, "logits/chosen": -0.2917076051235199, "logits/rejected": -0.39179855585098267, "logps/chosen": -170.2184295654297, "logps/rejected": -213.0507049560547, "loss": 1.4588, "nll_loss": 0.9763672947883606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.969861030578613, "rewards/margins": 2.7968153953552246, "rewards/rejected": 2.1730453968048096, "step": 3050 }, { "epoch": 0.1697618618327069, "grad_norm": 125.24748992919922, "learning_rate": 9.305594280992403e-08, "logits/chosen": -0.3027943968772888, "logits/rejected": -0.40738552808761597, "logps/chosen": -187.3314971923828, "logps/rejected": -250.4483642578125, "loss": 1.3594, "nll_loss": 1.0154049396514893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.080492973327637, "rewards/margins": 2.5206997394561768, "rewards/rejected": 2.559792995452881, "step": 3060 }, { "epoch": 0.17031663915895756, "grad_norm": 45.90873718261719, "learning_rate": 9.301157237910559e-08, "logits/chosen": -0.3818047046661377, "logits/rejected": -0.46516746282577515, "logps/chosen": -188.83514404296875, "logps/rejected": -254.16354370117188, "loss": 1.3877, "nll_loss": 1.0332419872283936, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.420754432678223, "rewards/margins": 2.7936758995056152, "rewards/rejected": 2.6270785331726074, "step": 3070 }, { "epoch": 0.17087141648520826, "grad_norm": 90.6324691772461, "learning_rate": 9.296707129076793e-08, "logits/chosen": -0.3408610224723816, "logits/rejected": -0.459175169467926, "logps/chosen": -175.30142211914062, "logps/rejected": -227.79067993164062, "loss": 1.4235, "nll_loss": 1.0021774768829346, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.705409049987793, "rewards/margins": 2.9445652961730957, "rewards/rejected": 1.7608436346054077, "step": 3080 }, { "epoch": 0.17142619381145893, "grad_norm": 53.0914306640625, "learning_rate": 9.29224396800933e-08, "logits/chosen": -0.36489245295524597, "logits/rejected": -0.425426721572876, "logps/chosen": -183.64111328125, "logps/rejected": -202.63961791992188, "loss": 1.3559, "nll_loss": 1.0612441301345825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.426742076873779, "rewards/margins": 2.255833387374878, "rewards/rejected": 2.1709086894989014, "step": 3090 }, { "epoch": 0.1719809711377096, "grad_norm": 58.5355224609375, "learning_rate": 9.287767768266046e-08, "logits/chosen": -0.35046714544296265, "logits/rejected": -0.4076654314994812, "logps/chosen": -153.88473510742188, "logps/rejected": -202.56646728515625, "loss": 1.4322, "nll_loss": 1.020686388015747, "rewards/accuracies": 0.75, "rewards/chosen": 4.608212471008301, "rewards/margins": 2.0267574787139893, "rewards/rejected": 2.5814545154571533, "step": 3100 }, { "epoch": 0.17253574846396028, "grad_norm": 60.846561431884766, "learning_rate": 9.283278543444427e-08, "logits/chosen": -0.29373809695243835, "logits/rejected": -0.3485686182975769, "logps/chosen": -175.6277618408203, "logps/rejected": -191.9043426513672, "loss": 1.3505, "nll_loss": 1.0474942922592163, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 4.512275695800781, "rewards/margins": 1.951250672340393, "rewards/rejected": 2.5610251426696777, "step": 3110 }, { "epoch": 0.17309052579021095, "grad_norm": 63.5445671081543, "learning_rate": 9.278776307181517e-08, "logits/chosen": -0.4287452697753906, "logits/rejected": -0.4757808744907379, "logps/chosen": -212.8755340576172, "logps/rejected": -244.24636840820312, "loss": 1.4416, "nll_loss": 1.128204107284546, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.158940315246582, "rewards/margins": 2.630156993865967, "rewards/rejected": 2.528783082962036, "step": 3120 }, { "epoch": 0.17364530311646162, "grad_norm": 51.99097442626953, "learning_rate": 9.2742610731539e-08, "logits/chosen": -0.32245302200317383, "logits/rejected": -0.4503195285797119, "logps/chosen": -162.48129272460938, "logps/rejected": -205.0572052001953, "loss": 1.42, "nll_loss": 0.9195488691329956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.636639595031738, "rewards/margins": 2.9686594009399414, "rewards/rejected": 1.6679799556732178, "step": 3130 }, { "epoch": 0.17420008044271232, "grad_norm": 53.674537658691406, "learning_rate": 9.269732855077628e-08, "logits/chosen": -0.36281704902648926, "logits/rejected": -0.44908076524734497, "logps/chosen": -148.302734375, "logps/rejected": -188.53761291503906, "loss": 1.4262, "nll_loss": 0.9345152974128723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.75624418258667, "rewards/margins": 2.426478862762451, "rewards/rejected": 2.3297653198242188, "step": 3140 }, { "epoch": 0.174754857768963, "grad_norm": 32.641780853271484, "learning_rate": 9.265191666708207e-08, "logits/chosen": -0.27269551157951355, "logits/rejected": -0.3571647107601166, "logps/chosen": -205.01992797851562, "logps/rejected": -225.32373046875, "loss": 1.3487, "nll_loss": 1.0292689800262451, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.078030586242676, "rewards/margins": 2.4737343788146973, "rewards/rejected": 2.6042959690093994, "step": 3150 }, { "epoch": 0.17530963509521366, "grad_norm": 55.32010269165039, "learning_rate": 9.260637521840538e-08, "logits/chosen": -0.3078479766845703, "logits/rejected": -0.4543367028236389, "logps/chosen": -163.51068115234375, "logps/rejected": -225.3749542236328, "loss": 1.3688, "nll_loss": 0.9280654788017273, "rewards/accuracies": 0.875, "rewards/chosen": 4.966848373413086, "rewards/margins": 2.6937406063079834, "rewards/rejected": 2.2731080055236816, "step": 3160 }, { "epoch": 0.17586441242146433, "grad_norm": 95.10186767578125, "learning_rate": 9.256070434308878e-08, "logits/chosen": -0.25554990768432617, "logits/rejected": -0.3518657684326172, "logps/chosen": -154.96678161621094, "logps/rejected": -189.62200927734375, "loss": 1.3324, "nll_loss": 0.9391428232192993, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.641543388366699, "rewards/margins": 2.5734002590179443, "rewards/rejected": 2.068143367767334, "step": 3170 }, { "epoch": 0.176419189747715, "grad_norm": 79.28388977050781, "learning_rate": 9.251490417986807e-08, "logits/chosen": -0.13116374611854553, "logits/rejected": -0.24308066070079803, "logps/chosen": -138.0506591796875, "logps/rejected": -191.6248016357422, "loss": 1.4005, "nll_loss": 0.8182266354560852, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.467595100402832, "rewards/margins": 3.2676169872283936, "rewards/rejected": 1.1999778747558594, "step": 3180 }, { "epoch": 0.17697396707396568, "grad_norm": 62.85856628417969, "learning_rate": 9.246897486787172e-08, "logits/chosen": -0.15434524416923523, "logits/rejected": -0.3066304326057434, "logps/chosen": -148.60952758789062, "logps/rejected": -212.33450317382812, "loss": 1.3648, "nll_loss": 0.9511687159538269, "rewards/accuracies": 0.75, "rewards/chosen": 4.542972087860107, "rewards/margins": 3.7544562816619873, "rewards/rejected": 0.7885159254074097, "step": 3190 }, { "epoch": 0.17752874440021638, "grad_norm": 69.8587417602539, "learning_rate": 9.242291654662058e-08, "logits/chosen": -0.28641340136528015, "logits/rejected": -0.39799395203590393, "logps/chosen": -153.35104370117188, "logps/rejected": -213.3860321044922, "loss": 1.361, "nll_loss": 0.9252685308456421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.663729667663574, "rewards/margins": 3.158186435699463, "rewards/rejected": 1.5055434703826904, "step": 3200 }, { "epoch": 0.17808352172646705, "grad_norm": 65.60191345214844, "learning_rate": 9.237672935602734e-08, "logits/chosen": -0.3066278100013733, "logits/rejected": -0.3982074558734894, "logps/chosen": -216.09536743164062, "logps/rejected": -270.0486145019531, "loss": 1.3723, "nll_loss": 1.1032354831695557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.352534294128418, "rewards/margins": 3.3085875511169434, "rewards/rejected": 2.0439465045928955, "step": 3210 }, { "epoch": 0.17863829905271772, "grad_norm": 111.08596801757812, "learning_rate": 9.233041343639621e-08, "logits/chosen": -0.43451136350631714, "logits/rejected": -0.5363648533821106, "logps/chosen": -206.8416290283203, "logps/rejected": -263.03863525390625, "loss": 1.2986, "nll_loss": 1.0895191431045532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.949709892272949, "rewards/margins": 3.783278226852417, "rewards/rejected": 2.166430950164795, "step": 3220 }, { "epoch": 0.1791930763789684, "grad_norm": 85.79315948486328, "learning_rate": 9.228396892842243e-08, "logits/chosen": -0.3698079288005829, "logits/rejected": -0.5223512649536133, "logps/chosen": -154.75775146484375, "logps/rejected": -210.3184051513672, "loss": 1.4173, "nll_loss": 0.984754741191864, "rewards/accuracies": 0.875, "rewards/chosen": 4.734168529510498, "rewards/margins": 2.8217694759368896, "rewards/rejected": 1.9123990535736084, "step": 3230 }, { "epoch": 0.17974785370521906, "grad_norm": 55.84793472290039, "learning_rate": 9.223739597319182e-08, "logits/chosen": -0.3412432074546814, "logits/rejected": -0.42043352127075195, "logps/chosen": -166.50973510742188, "logps/rejected": -232.7230224609375, "loss": 1.3652, "nll_loss": 1.0250871181488037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.225703716278076, "rewards/margins": 3.070754051208496, "rewards/rejected": 2.1549501419067383, "step": 3240 }, { "epoch": 0.18030263103146973, "grad_norm": 53.27821731567383, "learning_rate": 9.219069471218044e-08, "logits/chosen": -0.3862474858760834, "logits/rejected": -0.47613492608070374, "logps/chosen": -147.55599975585938, "logps/rejected": -225.82376098632812, "loss": 1.2634, "nll_loss": 1.085354208946228, "rewards/accuracies": 0.75, "rewards/chosen": 5.058647632598877, "rewards/margins": 2.990551471710205, "rewards/rejected": 2.0680959224700928, "step": 3250 }, { "epoch": 0.1808574083577204, "grad_norm": 74.8572998046875, "learning_rate": 9.214386528725407e-08, "logits/chosen": -0.43352779746055603, "logits/rejected": -0.5267191529273987, "logps/chosen": -222.47268676757812, "logps/rejected": -283.19281005859375, "loss": 1.4565, "nll_loss": 1.1316627264022827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.734960079193115, "rewards/margins": 3.585192918777466, "rewards/rejected": 2.1497673988342285, "step": 3260 }, { "epoch": 0.1814121856839711, "grad_norm": 56.274078369140625, "learning_rate": 9.209690784066784e-08, "logits/chosen": -0.13440726697444916, "logits/rejected": -0.24160249531269073, "logps/chosen": -160.8746795654297, "logps/rejected": -216.3030242919922, "loss": 1.3005, "nll_loss": 0.9158796072006226, "rewards/accuracies": 0.875, "rewards/chosen": 4.995485305786133, "rewards/margins": 3.5055174827575684, "rewards/rejected": 1.489967942237854, "step": 3270 }, { "epoch": 0.18196696301022178, "grad_norm": 40.22864532470703, "learning_rate": 9.204982251506576e-08, "logits/chosen": -0.1497855931520462, "logits/rejected": -0.2931815981864929, "logps/chosen": -152.04678344726562, "logps/rejected": -208.2039337158203, "loss": 1.4019, "nll_loss": 0.9629203081130981, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.667003154754639, "rewards/margins": 3.2013118267059326, "rewards/rejected": 1.465691328048706, "step": 3280 }, { "epoch": 0.18252174033647245, "grad_norm": 70.88353729248047, "learning_rate": 9.200260945348033e-08, "logits/chosen": -0.14935025572776794, "logits/rejected": -0.2693983018398285, "logps/chosen": -145.81430053710938, "logps/rejected": -183.57470703125, "loss": 1.3995, "nll_loss": 0.9631720781326294, "rewards/accuracies": 0.75, "rewards/chosen": 4.340367317199707, "rewards/margins": 1.9793999195098877, "rewards/rejected": 2.3609676361083984, "step": 3290 }, { "epoch": 0.18307651766272312, "grad_norm": 52.66188430786133, "learning_rate": 9.195526879933205e-08, "logits/chosen": -0.2618695795536041, "logits/rejected": -0.33439141511917114, "logps/chosen": -146.67202758789062, "logps/rejected": -185.0167236328125, "loss": 1.4054, "nll_loss": 1.035718560218811, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.421456336975098, "rewards/margins": 1.8323252201080322, "rewards/rejected": 2.5891308784484863, "step": 3300 }, { "epoch": 0.1836312949889738, "grad_norm": 62.6783561706543, "learning_rate": 9.190780069642899e-08, "logits/chosen": -0.4468691349029541, "logits/rejected": -0.48922890424728394, "logps/chosen": -185.74270629882812, "logps/rejected": -238.2687530517578, "loss": 1.3825, "nll_loss": 1.0476751327514648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.556589126586914, "rewards/margins": 3.2610023021698, "rewards/rejected": 2.295586585998535, "step": 3310 }, { "epoch": 0.18418607231522446, "grad_norm": 119.20405578613281, "learning_rate": 9.186020528896643e-08, "logits/chosen": -0.27735182642936707, "logits/rejected": -0.3652539551258087, "logps/chosen": -179.13577270507812, "logps/rejected": -219.6503143310547, "loss": 1.351, "nll_loss": 1.0589101314544678, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.890892505645752, "rewards/margins": 2.9163360595703125, "rewards/rejected": 1.9745559692382812, "step": 3320 }, { "epoch": 0.18474084964147516, "grad_norm": 69.34968566894531, "learning_rate": 9.181248272152633e-08, "logits/chosen": -0.3122369647026062, "logits/rejected": -0.3962582051753998, "logps/chosen": -191.41275024414062, "logps/rejected": -223.7455291748047, "loss": 1.3463, "nll_loss": 1.0294417142868042, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.048547744750977, "rewards/margins": 2.3484997749328613, "rewards/rejected": 2.7000479698181152, "step": 3330 }, { "epoch": 0.18529562696772583, "grad_norm": 51.15849685668945, "learning_rate": 9.176463313907694e-08, "logits/chosen": -0.27559852600097656, "logits/rejected": -0.44314026832580566, "logps/chosen": -193.53561401367188, "logps/rejected": -257.9790344238281, "loss": 1.4233, "nll_loss": 0.9996274709701538, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.9851603507995605, "rewards/margins": 3.9858479499816895, "rewards/rejected": 0.9993122220039368, "step": 3340 }, { "epoch": 0.1858504042939765, "grad_norm": 65.05023193359375, "learning_rate": 9.171665668697234e-08, "logits/chosen": -0.14803513884544373, "logits/rejected": -0.20512041449546814, "logps/chosen": -157.99771118164062, "logps/rejected": -202.19908142089844, "loss": 1.3478, "nll_loss": 0.8801227807998657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.644710063934326, "rewards/margins": 3.1068365573883057, "rewards/rejected": 1.5378737449645996, "step": 3350 }, { "epoch": 0.18640518162022718, "grad_norm": 72.04387664794922, "learning_rate": 9.166855351095204e-08, "logits/chosen": -0.28203243017196655, "logits/rejected": -0.4075242578983307, "logps/chosen": -157.84893798828125, "logps/rejected": -190.1215057373047, "loss": 1.3512, "nll_loss": 0.9680745005607605, "rewards/accuracies": 0.75, "rewards/chosen": 4.681795597076416, "rewards/margins": 2.9883322715759277, "rewards/rejected": 1.6934630870819092, "step": 3360 }, { "epoch": 0.18695995894647785, "grad_norm": 135.1023406982422, "learning_rate": 9.162032375714044e-08, "logits/chosen": -0.27743062376976013, "logits/rejected": -0.4327467978000641, "logps/chosen": -176.78150939941406, "logps/rejected": -229.76913452148438, "loss": 1.4111, "nll_loss": 0.9727862477302551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.038991928100586, "rewards/margins": 4.594063758850098, "rewards/rejected": 0.4449283480644226, "step": 3370 }, { "epoch": 0.18751473627272852, "grad_norm": 94.94296264648438, "learning_rate": 9.157196757204649e-08, "logits/chosen": -0.2864229679107666, "logits/rejected": -0.4441998600959778, "logps/chosen": -181.42529296875, "logps/rejected": -223.7584228515625, "loss": 1.3273, "nll_loss": 1.0347633361816406, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.656604766845703, "rewards/margins": 3.1743931770324707, "rewards/rejected": 1.4822113513946533, "step": 3380 }, { "epoch": 0.18806951359897922, "grad_norm": 63.17582702636719, "learning_rate": 9.152348510256319e-08, "logits/chosen": -0.25547483563423157, "logits/rejected": -0.4117124080657959, "logps/chosen": -160.04031372070312, "logps/rejected": -231.7164764404297, "loss": 1.3238, "nll_loss": 0.9673099517822266, "rewards/accuracies": 0.875, "rewards/chosen": 4.919541835784912, "rewards/margins": 4.370184898376465, "rewards/rejected": 0.5493569374084473, "step": 3390 }, { "epoch": 0.1886242909252299, "grad_norm": 54.76870346069336, "learning_rate": 9.147487649596719e-08, "logits/chosen": -0.4662798047065735, "logits/rejected": -0.5804362297058105, "logps/chosen": -205.5302276611328, "logps/rejected": -257.89093017578125, "loss": 1.5078, "nll_loss": 1.1512181758880615, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.4895339012146, "rewards/margins": 3.2194790840148926, "rewards/rejected": 2.270055055618286, "step": 3400 }, { "epoch": 0.18917906825148056, "grad_norm": 52.00660705566406, "learning_rate": 9.142614189991827e-08, "logits/chosen": -0.3087933361530304, "logits/rejected": -0.44009774923324585, "logps/chosen": -206.9576416015625, "logps/rejected": -280.1715087890625, "loss": 1.3352, "nll_loss": 1.108897089958191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.316677570343018, "rewards/margins": 3.676166534423828, "rewards/rejected": 1.6405115127563477, "step": 3410 }, { "epoch": 0.18973384557773124, "grad_norm": 62.3161735534668, "learning_rate": 9.13772814624589e-08, "logits/chosen": -0.30704936385154724, "logits/rejected": -0.38141196966171265, "logps/chosen": -157.9688262939453, "logps/rejected": -221.52737426757812, "loss": 1.4271, "nll_loss": 0.9901927709579468, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.816000938415527, "rewards/margins": 3.078101634979248, "rewards/rejected": 1.7378990650177002, "step": 3420 }, { "epoch": 0.1902886229039819, "grad_norm": 83.45936584472656, "learning_rate": 9.132829533201396e-08, "logits/chosen": -0.44340506196022034, "logits/rejected": -0.5546245574951172, "logps/chosen": -176.84097290039062, "logps/rejected": -237.9415283203125, "loss": 1.4669, "nll_loss": 1.084262728691101, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.157680511474609, "rewards/margins": 2.991424083709717, "rewards/rejected": 2.1662566661834717, "step": 3430 }, { "epoch": 0.19084340023023258, "grad_norm": 88.83804321289062, "learning_rate": 9.127918365739001e-08, "logits/chosen": -0.3826572299003601, "logits/rejected": -0.505754828453064, "logps/chosen": -201.6626434326172, "logps/rejected": -257.3487243652344, "loss": 1.3895, "nll_loss": 1.059676170349121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.6069536209106445, "rewards/margins": 2.890880584716797, "rewards/rejected": 2.716073751449585, "step": 3440 }, { "epoch": 0.19139817755648328, "grad_norm": 56.37425994873047, "learning_rate": 9.122994658777503e-08, "logits/chosen": -0.2930208742618561, "logits/rejected": -0.45751482248306274, "logps/chosen": -163.3193817138672, "logps/rejected": -212.1604766845703, "loss": 1.3403, "nll_loss": 0.9310756921768188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.8973612785339355, "rewards/margins": 3.659257173538208, "rewards/rejected": 1.2381041049957275, "step": 3450 }, { "epoch": 0.19195295488273395, "grad_norm": 41.779510498046875, "learning_rate": 9.118058427273791e-08, "logits/chosen": -0.34141066670417786, "logits/rejected": -0.4689091742038727, "logps/chosen": -173.6635284423828, "logps/rejected": -234.62466430664062, "loss": 1.2819, "nll_loss": 0.981812596321106, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.877215385437012, "rewards/margins": 3.6084609031677246, "rewards/rejected": 1.2687546014785767, "step": 3460 }, { "epoch": 0.19250773220898462, "grad_norm": 30.11418914794922, "learning_rate": 9.113109686222802e-08, "logits/chosen": -0.4162047803401947, "logits/rejected": -0.5399635434150696, "logps/chosen": -196.7699737548828, "logps/rejected": -259.7813720703125, "loss": 1.2991, "nll_loss": 1.053601861000061, "rewards/accuracies": 0.875, "rewards/chosen": 5.753113746643066, "rewards/margins": 4.110931396484375, "rewards/rejected": 1.6421819925308228, "step": 3470 }, { "epoch": 0.1930625095352353, "grad_norm": 68.62322998046875, "learning_rate": 9.108148450657471e-08, "logits/chosen": -0.24979765713214874, "logits/rejected": -0.37531179189682007, "logps/chosen": -177.31985473632812, "logps/rejected": -224.9247283935547, "loss": 1.3996, "nll_loss": 1.179163932800293, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.5680108070373535, "rewards/margins": 2.4868268966674805, "rewards/rejected": 2.081183910369873, "step": 3480 }, { "epoch": 0.19361728686148597, "grad_norm": 65.12769317626953, "learning_rate": 9.103174735648692e-08, "logits/chosen": -0.34614425897598267, "logits/rejected": -0.49961429834365845, "logps/chosen": -150.73887634277344, "logps/rejected": -206.9713897705078, "loss": 1.341, "nll_loss": 0.9850081205368042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.518560886383057, "rewards/margins": 2.364466905593872, "rewards/rejected": 2.1540937423706055, "step": 3490 }, { "epoch": 0.19417206418773664, "grad_norm": 59.67082214355469, "learning_rate": 9.098188556305261e-08, "logits/chosen": -0.2790587544441223, "logits/rejected": -0.46283188462257385, "logps/chosen": -146.38931274414062, "logps/rejected": -194.45590209960938, "loss": 1.2778, "nll_loss": 0.9563590884208679, "rewards/accuracies": 0.875, "rewards/chosen": 4.381845951080322, "rewards/margins": 2.9258816242218018, "rewards/rejected": 1.4559640884399414, "step": 3500 }, { "epoch": 0.19417206418773664, "eval_logits/chosen": -0.4424753189086914, "eval_logits/rejected": -0.5204493403434753, "eval_logps/chosen": -201.9140167236328, "eval_logps/rejected": -256.9493103027344, "eval_loss": 1.3134431838989258, "eval_nll_loss": 1.0490245819091797, "eval_rewards/accuracies": 0.84375, "eval_rewards/chosen": 5.6047210693359375, "eval_rewards/margins": 3.391801118850708, "eval_rewards/rejected": 2.2129199504852295, "eval_runtime": 17.2468, "eval_samples_per_second": 14.843, "eval_steps_per_second": 1.855, "step": 3500 }, { "epoch": 0.19472684151398734, "grad_norm": 74.80335998535156, "learning_rate": 9.093189927773848e-08, "logits/chosen": -0.3249863088130951, "logits/rejected": -0.47237634658813477, "logps/chosen": -183.12828063964844, "logps/rejected": -219.3392333984375, "loss": 1.3372, "nll_loss": 0.9999750852584839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.970829486846924, "rewards/margins": 2.9720358848571777, "rewards/rejected": 1.998793601989746, "step": 3510 }, { "epoch": 0.195281618840238, "grad_norm": 54.24885177612305, "learning_rate": 9.088178865238928e-08, "logits/chosen": -0.23620446026325226, "logits/rejected": -0.33106979727745056, "logps/chosen": -144.73150634765625, "logps/rejected": -200.6682586669922, "loss": 1.2637, "nll_loss": 0.8744996786117554, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.980032444000244, "rewards/margins": 2.8522746562957764, "rewards/rejected": 2.127758264541626, "step": 3520 }, { "epoch": 0.19583639616648868, "grad_norm": 88.81790924072266, "learning_rate": 9.083155383922756e-08, "logits/chosen": -0.24368822574615479, "logits/rejected": -0.37356775999069214, "logps/chosen": -169.39254760742188, "logps/rejected": -211.821044921875, "loss": 1.4507, "nll_loss": 0.9627988934516907, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.646332740783691, "rewards/margins": 2.7353675365448, "rewards/rejected": 1.9109646081924438, "step": 3530 }, { "epoch": 0.19639117349273935, "grad_norm": 71.2747573852539, "learning_rate": 9.078119499085308e-08, "logits/chosen": -0.418663889169693, "logits/rejected": -0.5115020275115967, "logps/chosen": -181.2946319580078, "logps/rejected": -225.5801544189453, "loss": 1.3974, "nll_loss": 1.0265872478485107, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.286177635192871, "rewards/margins": 2.8992366790771484, "rewards/rejected": 2.3869409561157227, "step": 3540 }, { "epoch": 0.19694595081899002, "grad_norm": 39.39947509765625, "learning_rate": 9.07307122602424e-08, "logits/chosen": -0.3985862135887146, "logits/rejected": -0.46568727493286133, "logps/chosen": -187.3225860595703, "logps/rejected": -242.97030639648438, "loss": 1.3301, "nll_loss": 1.0382740497589111, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.31097936630249, "rewards/margins": 3.2747280597686768, "rewards/rejected": 2.0362515449523926, "step": 3550 }, { "epoch": 0.1975007281452407, "grad_norm": 29.353214263916016, "learning_rate": 9.06801058007484e-08, "logits/chosen": -0.16209930181503296, "logits/rejected": -0.29742684960365295, "logps/chosen": -153.34727478027344, "logps/rejected": -165.13827514648438, "loss": 1.3441, "nll_loss": 0.8343960642814636, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.275020599365234, "rewards/margins": 2.413543462753296, "rewards/rejected": 1.8614771366119385, "step": 3560 }, { "epoch": 0.19805550547149137, "grad_norm": 69.83041381835938, "learning_rate": 9.062937576609982e-08, "logits/chosen": -0.4704197347164154, "logits/rejected": -0.5258275270462036, "logps/chosen": -200.0384063720703, "logps/rejected": -245.9759063720703, "loss": 1.5084, "nll_loss": 1.0869133472442627, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.382255554199219, "rewards/margins": 2.419602632522583, "rewards/rejected": 2.9626529216766357, "step": 3570 }, { "epoch": 0.19861028279774207, "grad_norm": 56.682098388671875, "learning_rate": 9.057852231040075e-08, "logits/chosen": -0.2369583398103714, "logits/rejected": -0.3535544276237488, "logps/chosen": -155.8750457763672, "logps/rejected": -205.74368286132812, "loss": 1.3495, "nll_loss": 0.940041184425354, "rewards/accuracies": 0.75, "rewards/chosen": 4.6919403076171875, "rewards/margins": 2.4483542442321777, "rewards/rejected": 2.2435860633850098, "step": 3580 }, { "epoch": 0.19916506012399274, "grad_norm": 79.86531829833984, "learning_rate": 9.052754558813028e-08, "logits/chosen": -0.38561543822288513, "logits/rejected": -0.5354014039039612, "logps/chosen": -171.94937133789062, "logps/rejected": -215.95858764648438, "loss": 1.3566, "nll_loss": 1.0209826231002808, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.939810276031494, "rewards/margins": 2.798379898071289, "rewards/rejected": 2.141430616378784, "step": 3590 }, { "epoch": 0.1997198374502434, "grad_norm": 65.32847595214844, "learning_rate": 9.047644575414183e-08, "logits/chosen": -0.2097257375717163, "logits/rejected": -0.3981267809867859, "logps/chosen": -149.56321716308594, "logps/rejected": -175.35711669921875, "loss": 1.3656, "nll_loss": 0.8642207980155945, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.44227409362793, "rewards/margins": 2.8110909461975098, "rewards/rejected": 1.6311830282211304, "step": 3600 }, { "epoch": 0.20027461477649408, "grad_norm": 49.38480758666992, "learning_rate": 9.042522296366291e-08, "logits/chosen": -0.3963824212551117, "logits/rejected": -0.49302539229393005, "logps/chosen": -176.93246459960938, "logps/rejected": -230.24087524414062, "loss": 1.3291, "nll_loss": 1.04789137840271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.288444519042969, "rewards/margins": 3.1669602394104004, "rewards/rejected": 2.1214840412139893, "step": 3610 }, { "epoch": 0.20082939210274475, "grad_norm": 61.247657775878906, "learning_rate": 9.037387737229451e-08, "logits/chosen": -0.2520487308502197, "logits/rejected": -0.4036117494106293, "logps/chosen": -172.23992919921875, "logps/rejected": -202.6747283935547, "loss": 1.3291, "nll_loss": 0.9279230833053589, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.3425374031066895, "rewards/margins": 2.726191759109497, "rewards/rejected": 1.616346001625061, "step": 3620 }, { "epoch": 0.20138416942899542, "grad_norm": 61.63615798950195, "learning_rate": 9.032240913601062e-08, "logits/chosen": -0.27439185976982117, "logits/rejected": -0.3957839012145996, "logps/chosen": -159.75448608398438, "logps/rejected": -222.67611694335938, "loss": 1.4667, "nll_loss": 0.9911657571792603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.459640979766846, "rewards/margins": 2.8293843269348145, "rewards/rejected": 1.6302568912506104, "step": 3630 }, { "epoch": 0.20193894675524612, "grad_norm": 76.18391418457031, "learning_rate": 9.027081841115783e-08, "logits/chosen": -0.25864553451538086, "logits/rejected": -0.45542287826538086, "logps/chosen": -158.814453125, "logps/rejected": -234.5258331298828, "loss": 1.3452, "nll_loss": 0.8534186482429504, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.526062488555908, "rewards/margins": 3.695394992828369, "rewards/rejected": 0.8306673169136047, "step": 3640 }, { "epoch": 0.2024937240814968, "grad_norm": 69.41482543945312, "learning_rate": 9.021910535445479e-08, "logits/chosen": -0.3268418312072754, "logits/rejected": -0.42958277463912964, "logps/chosen": -173.4744110107422, "logps/rejected": -223.49221801757812, "loss": 1.2794, "nll_loss": 0.9873861074447632, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.12941837310791, "rewards/margins": 3.557917356491089, "rewards/rejected": 1.57150137424469, "step": 3650 }, { "epoch": 0.20304850140774747, "grad_norm": 75.36470794677734, "learning_rate": 9.01672701229918e-08, "logits/chosen": -0.23507532477378845, "logits/rejected": -0.40880221128463745, "logps/chosen": -144.60598754882812, "logps/rejected": -190.6197967529297, "loss": 1.3332, "nll_loss": 0.8716999292373657, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.289609432220459, "rewards/margins": 3.0599446296691895, "rewards/rejected": 1.229664921760559, "step": 3660 }, { "epoch": 0.20360327873399814, "grad_norm": 58.159019470214844, "learning_rate": 9.011531287423023e-08, "logits/chosen": -0.057216621935367584, "logits/rejected": -0.18780682981014252, "logps/chosen": -105.15031433105469, "logps/rejected": -145.9329833984375, "loss": 1.3099, "nll_loss": 0.7050063610076904, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.8075637817382812, "rewards/margins": 2.8922011852264404, "rewards/rejected": 0.9153624773025513, "step": 3670 }, { "epoch": 0.2041580560602488, "grad_norm": 36.62731170654297, "learning_rate": 9.006323376600215e-08, "logits/chosen": -0.4482879042625427, "logits/rejected": -0.5565906763076782, "logps/chosen": -187.14883422851562, "logps/rejected": -242.4229278564453, "loss": 1.3601, "nll_loss": 1.094943881034851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.3773579597473145, "rewards/margins": 3.112643003463745, "rewards/rejected": 2.2647151947021484, "step": 3680 }, { "epoch": 0.20471283338649948, "grad_norm": 91.01419067382812, "learning_rate": 9.001103295650985e-08, "logits/chosen": -0.3312477171421051, "logits/rejected": -0.4301799237728119, "logps/chosen": -177.8890380859375, "logps/rejected": -210.2823944091797, "loss": 1.2762, "nll_loss": 1.0129368305206299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.19062614440918, "rewards/margins": 2.9621591567993164, "rewards/rejected": 2.228466510772705, "step": 3690 }, { "epoch": 0.20526761071275018, "grad_norm": 66.5330810546875, "learning_rate": 8.99587106043252e-08, "logits/chosen": -0.2117491215467453, "logits/rejected": -0.3421010971069336, "logps/chosen": -177.11669921875, "logps/rejected": -205.1708221435547, "loss": 1.3927, "nll_loss": 0.9429581761360168, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.606531143188477, "rewards/margins": 2.5099503993988037, "rewards/rejected": 2.0965805053710938, "step": 3700 }, { "epoch": 0.20582238803900085, "grad_norm": 35.68384552001953, "learning_rate": 8.990626686838938e-08, "logits/chosen": -0.3598509430885315, "logits/rejected": -0.45173701643943787, "logps/chosen": -162.70602416992188, "logps/rejected": -220.9865264892578, "loss": 1.2909, "nll_loss": 0.9705358743667603, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.9746623039245605, "rewards/margins": 3.077357053756714, "rewards/rejected": 1.8973052501678467, "step": 3710 }, { "epoch": 0.20637716536525152, "grad_norm": 72.9140396118164, "learning_rate": 8.985370190801227e-08, "logits/chosen": -0.24999204277992249, "logits/rejected": -0.35079583525657654, "logps/chosen": -172.55189514160156, "logps/rejected": -207.9408416748047, "loss": 1.3975, "nll_loss": 0.9223276972770691, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.864678382873535, "rewards/margins": 3.5197761058807373, "rewards/rejected": 1.3449018001556396, "step": 3720 }, { "epoch": 0.2069319426915022, "grad_norm": 58.62187576293945, "learning_rate": 8.980101588287201e-08, "logits/chosen": -0.39123472571372986, "logits/rejected": -0.48360252380371094, "logps/chosen": -181.18020629882812, "logps/rejected": -233.19320678710938, "loss": 1.375, "nll_loss": 1.0049530267715454, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.3949456214904785, "rewards/margins": 3.387439012527466, "rewards/rejected": 2.0075063705444336, "step": 3730 }, { "epoch": 0.20748672001775287, "grad_norm": 104.83113861083984, "learning_rate": 8.974820895301444e-08, "logits/chosen": -0.40971869230270386, "logits/rejected": -0.5111854076385498, "logps/chosen": -171.76803588867188, "logps/rejected": -222.0377197265625, "loss": 1.3896, "nll_loss": 1.0767524242401123, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.4347333908081055, "rewards/margins": 3.326315402984619, "rewards/rejected": 2.1084187030792236, "step": 3740 }, { "epoch": 0.20804149734400354, "grad_norm": 131.49130249023438, "learning_rate": 8.96952812788528e-08, "logits/chosen": -0.4055160582065582, "logits/rejected": -0.5088625550270081, "logps/chosen": -180.02684020996094, "logps/rejected": -240.64077758789062, "loss": 1.3895, "nll_loss": 1.0220630168914795, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.497774124145508, "rewards/margins": 2.485865831375122, "rewards/rejected": 3.0119082927703857, "step": 3750 }, { "epoch": 0.20859627467025424, "grad_norm": 75.92974853515625, "learning_rate": 8.964223302116698e-08, "logits/chosen": -0.3788098692893982, "logits/rejected": -0.4966840147972107, "logps/chosen": -190.69357299804688, "logps/rejected": -241.46841430664062, "loss": 1.3045, "nll_loss": 1.0417449474334717, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.564336776733398, "rewards/margins": 3.94038462638855, "rewards/rejected": 1.6239522695541382, "step": 3760 }, { "epoch": 0.2091510519965049, "grad_norm": 112.6208267211914, "learning_rate": 8.958906434110325e-08, "logits/chosen": -0.27260252833366394, "logits/rejected": -0.4252198338508606, "logps/chosen": -157.2131805419922, "logps/rejected": -219.95974731445312, "loss": 1.3116, "nll_loss": 0.8680321574211121, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.7769927978515625, "rewards/margins": 3.1072120666503906, "rewards/rejected": 1.6697801351547241, "step": 3770 }, { "epoch": 0.20970582932275558, "grad_norm": 94.54066467285156, "learning_rate": 8.95357754001737e-08, "logits/chosen": -0.26051098108291626, "logits/rejected": -0.41779977083206177, "logps/chosen": -149.70997619628906, "logps/rejected": -197.0052032470703, "loss": 1.284, "nll_loss": 0.8986402750015259, "rewards/accuracies": 0.875, "rewards/chosen": 4.675868034362793, "rewards/margins": 3.015355348587036, "rewards/rejected": 1.660513162612915, "step": 3780 }, { "epoch": 0.21026060664900625, "grad_norm": 100.28746032714844, "learning_rate": 8.948236636025568e-08, "logits/chosen": -0.36036959290504456, "logits/rejected": -0.47816309332847595, "logps/chosen": -153.33084106445312, "logps/rejected": -204.0556640625, "loss": 1.454, "nll_loss": 0.9947422742843628, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.761590957641602, "rewards/margins": 2.528191089630127, "rewards/rejected": 2.2333998680114746, "step": 3790 }, { "epoch": 0.21081538397525693, "grad_norm": 67.10070037841797, "learning_rate": 8.942883738359142e-08, "logits/chosen": -0.3197064697742462, "logits/rejected": -0.43918901681900024, "logps/chosen": -173.57681274414062, "logps/rejected": -223.16213989257812, "loss": 1.2664, "nll_loss": 0.9543741941452026, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.900728702545166, "rewards/margins": 3.069725513458252, "rewards/rejected": 1.831003189086914, "step": 3800 }, { "epoch": 0.2113701613015076, "grad_norm": 49.583927154541016, "learning_rate": 8.937518863278746e-08, "logits/chosen": -0.43831509351730347, "logits/rejected": -0.5418807864189148, "logps/chosen": -170.5087432861328, "logps/rejected": -228.23348999023438, "loss": 1.3477, "nll_loss": 1.0029704570770264, "rewards/accuracies": 0.875, "rewards/chosen": 5.487241744995117, "rewards/margins": 2.9966530799865723, "rewards/rejected": 2.490588665008545, "step": 3810 }, { "epoch": 0.2119249386277583, "grad_norm": 105.17793273925781, "learning_rate": 8.932142027081419e-08, "logits/chosen": -0.3503844141960144, "logits/rejected": -0.5124867558479309, "logps/chosen": -184.93325805664062, "logps/rejected": -245.14987182617188, "loss": 1.3364, "nll_loss": 0.9998448491096497, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.2992753982543945, "rewards/margins": 3.449542284011841, "rewards/rejected": 1.8497333526611328, "step": 3820 }, { "epoch": 0.21247971595400897, "grad_norm": 69.32069396972656, "learning_rate": 8.926753246100536e-08, "logits/chosen": -0.30155476927757263, "logits/rejected": -0.35403114557266235, "logps/chosen": -171.98782348632812, "logps/rejected": -201.35647583007812, "loss": 1.3318, "nll_loss": 1.062892198562622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 4.722691059112549, "rewards/margins": 2.3339271545410156, "rewards/rejected": 2.388763666152954, "step": 3830 }, { "epoch": 0.21303449328025964, "grad_norm": 33.47819900512695, "learning_rate": 8.921352536705752e-08, "logits/chosen": -0.3153464198112488, "logits/rejected": -0.4027198851108551, "logps/chosen": -159.6675567626953, "logps/rejected": -198.66439819335938, "loss": 1.2908, "nll_loss": 0.9326319694519043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.927337646484375, "rewards/margins": 2.171002149581909, "rewards/rejected": 2.7563347816467285, "step": 3840 }, { "epoch": 0.2135892706065103, "grad_norm": 73.93592834472656, "learning_rate": 8.915939915302967e-08, "logits/chosen": -0.20850825309753418, "logits/rejected": -0.29826563596725464, "logps/chosen": -170.40194702148438, "logps/rejected": -181.62571716308594, "loss": 1.3736, "nll_loss": 0.9485648274421692, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.793632507324219, "rewards/margins": 3.0252745151519775, "rewards/rejected": 1.768358588218689, "step": 3850 }, { "epoch": 0.21414404793276098, "grad_norm": 73.56593322753906, "learning_rate": 8.910515398334255e-08, "logits/chosen": -0.4447970390319824, "logits/rejected": -0.5151744484901428, "logps/chosen": -174.51461791992188, "logps/rejected": -230.3745574951172, "loss": 1.3978, "nll_loss": 1.098456621170044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.449948310852051, "rewards/margins": 3.0486416816711426, "rewards/rejected": 2.4013073444366455, "step": 3860 }, { "epoch": 0.21469882525901166, "grad_norm": 48.50322341918945, "learning_rate": 8.905079002277832e-08, "logits/chosen": -0.22557875514030457, "logits/rejected": -0.3795395493507385, "logps/chosen": -138.54998779296875, "logps/rejected": -195.5319366455078, "loss": 1.2399, "nll_loss": 0.8102123141288757, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.490899085998535, "rewards/margins": 3.248749256134033, "rewards/rejected": 1.2421494722366333, "step": 3870 }, { "epoch": 0.21525360258526233, "grad_norm": 55.75164794921875, "learning_rate": 8.899630743648e-08, "logits/chosen": -0.31376057863235474, "logits/rejected": -0.43367958068847656, "logps/chosen": -150.63986206054688, "logps/rejected": -201.40696716308594, "loss": 1.2756, "nll_loss": 0.9280077815055847, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.763642311096191, "rewards/margins": 3.3326945304870605, "rewards/rejected": 1.4309481382369995, "step": 3880 }, { "epoch": 0.21580837991151303, "grad_norm": 82.2542495727539, "learning_rate": 8.894170638995092e-08, "logits/chosen": -0.2537023425102234, "logits/rejected": -0.35121041536331177, "logps/chosen": -159.0094757080078, "logps/rejected": -208.6914825439453, "loss": 1.4474, "nll_loss": 0.9805082082748413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.797659873962402, "rewards/margins": 2.736910581588745, "rewards/rejected": 2.0607495307922363, "step": 3890 }, { "epoch": 0.2163631572377637, "grad_norm": 43.822174072265625, "learning_rate": 8.888698704905431e-08, "logits/chosen": -0.21386781334877014, "logits/rejected": -0.3521370589733124, "logps/chosen": -144.4310302734375, "logps/rejected": -178.11569213867188, "loss": 1.2435, "nll_loss": 0.872231662273407, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.516562461853027, "rewards/margins": 2.044229030609131, "rewards/rejected": 2.4723331928253174, "step": 3900 }, { "epoch": 0.21691793456401437, "grad_norm": 64.06157684326172, "learning_rate": 8.88321495800127e-08, "logits/chosen": -0.38649290800094604, "logits/rejected": -0.4504520893096924, "logps/chosen": -201.0390167236328, "logps/rejected": -220.2386932373047, "loss": 1.5477, "nll_loss": 1.1234185695648193, "rewards/accuracies": 0.625, "rewards/chosen": 5.286801815032959, "rewards/margins": 2.4049549102783203, "rewards/rejected": 2.8818471431732178, "step": 3910 }, { "epoch": 0.21747271189026504, "grad_norm": 59.72712707519531, "learning_rate": 8.87771941494075e-08, "logits/chosen": -0.24501697719097137, "logits/rejected": -0.3806004822254181, "logps/chosen": -153.45704650878906, "logps/rejected": -185.648681640625, "loss": 1.3693, "nll_loss": 0.9162791967391968, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.498444557189941, "rewards/margins": 2.5761585235595703, "rewards/rejected": 1.9222854375839233, "step": 3920 }, { "epoch": 0.2180274892165157, "grad_norm": 74.50149536132812, "learning_rate": 8.872212092417844e-08, "logits/chosen": -0.2650856375694275, "logits/rejected": -0.34951329231262207, "logps/chosen": -147.10775756835938, "logps/rejected": -179.2303466796875, "loss": 1.3909, "nll_loss": 0.9994792938232422, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.621931076049805, "rewards/margins": 2.5670480728149414, "rewards/rejected": 2.0548832416534424, "step": 3930 }, { "epoch": 0.21858226654276638, "grad_norm": 34.133785247802734, "learning_rate": 8.866693007162307e-08, "logits/chosen": -0.29337453842163086, "logits/rejected": -0.40382012724876404, "logps/chosen": -188.85833740234375, "logps/rejected": -240.5803680419922, "loss": 1.3901, "nll_loss": 1.0012753009796143, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.997965335845947, "rewards/margins": 2.5151822566986084, "rewards/rejected": 2.4827828407287598, "step": 3940 }, { "epoch": 0.21913704386901708, "grad_norm": 33.943721771240234, "learning_rate": 8.861162175939625e-08, "logits/chosen": -0.2020225077867508, "logits/rejected": -0.330327570438385, "logps/chosen": -167.04249572753906, "logps/rejected": -215.63134765625, "loss": 1.3655, "nll_loss": 1.047996997833252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.789084434509277, "rewards/margins": 2.9170351028442383, "rewards/rejected": 1.87204909324646, "step": 3950 }, { "epoch": 0.21969182119526776, "grad_norm": 72.16616821289062, "learning_rate": 8.855619615550972e-08, "logits/chosen": -0.30495089292526245, "logits/rejected": -0.4407684803009033, "logps/chosen": -179.2265167236328, "logps/rejected": -260.6602478027344, "loss": 1.2582, "nll_loss": 0.9948042035102844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.850760459899902, "rewards/margins": 4.7629876136779785, "rewards/rejected": 1.087773084640503, "step": 3960 }, { "epoch": 0.22024659852151843, "grad_norm": 62.64510726928711, "learning_rate": 8.850065342833141e-08, "logits/chosen": -0.1904325932264328, "logits/rejected": -0.4101056158542633, "logps/chosen": -133.64218139648438, "logps/rejected": -197.58729553222656, "loss": 1.3394, "nll_loss": 0.8108884692192078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4841461181640625, "rewards/margins": 3.7502377033233643, "rewards/rejected": 0.7339082956314087, "step": 3970 }, { "epoch": 0.2208013758477691, "grad_norm": 55.91902160644531, "learning_rate": 8.844499374658512e-08, "logits/chosen": -0.4355524182319641, "logits/rejected": -0.5665684938430786, "logps/chosen": -178.23260498046875, "logps/rejected": -245.59194946289062, "loss": 1.413, "nll_loss": 1.092327356338501, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.46936559677124, "rewards/margins": 3.4504482746124268, "rewards/rejected": 2.018918037414551, "step": 3980 }, { "epoch": 0.22135615317401977, "grad_norm": 46.83743667602539, "learning_rate": 8.838921727934991e-08, "logits/chosen": -0.3046635389328003, "logits/rejected": -0.3972231447696686, "logps/chosen": -161.92007446289062, "logps/rejected": -226.1973876953125, "loss": 1.3331, "nll_loss": 1.0980288982391357, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.024444580078125, "rewards/margins": 3.8393218517303467, "rewards/rejected": 1.1851226091384888, "step": 3990 }, { "epoch": 0.22191093050027044, "grad_norm": 68.99779510498047, "learning_rate": 8.833332419605959e-08, "logits/chosen": -0.33155789971351624, "logits/rejected": -0.38299810886383057, "logps/chosen": -166.82423400878906, "logps/rejected": -200.9204864501953, "loss": 1.3213, "nll_loss": 1.010746955871582, "rewards/accuracies": 0.875, "rewards/chosen": 4.797917366027832, "rewards/margins": 2.3995375633239746, "rewards/rejected": 2.3983798027038574, "step": 4000 }, { "epoch": 0.22191093050027044, "eval_logits/chosen": -0.3887759745121002, "eval_logits/rejected": -0.4639728367328644, "eval_logps/chosen": -200.67454528808594, "eval_logps/rejected": -261.816162109375, "eval_loss": 1.2828993797302246, "eval_nll_loss": 1.0431652069091797, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 5.728669166564941, "eval_rewards/margins": 4.002435684204102, "eval_rewards/rejected": 1.726233720779419, "eval_runtime": 17.2188, "eval_samples_per_second": 14.867, "eval_steps_per_second": 1.858, "step": 4000 }, { "epoch": 0.22246570782652114, "grad_norm": 140.62405395507812, "learning_rate": 8.827731466650223e-08, "logits/chosen": -0.20445005595684052, "logits/rejected": -0.3240208625793457, "logps/chosen": -153.79769897460938, "logps/rejected": -195.5605926513672, "loss": 1.35, "nll_loss": 1.039541482925415, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.536130905151367, "rewards/margins": 2.774941921234131, "rewards/rejected": 1.7611888647079468, "step": 4010 }, { "epoch": 0.2230204851527718, "grad_norm": 75.44985961914062, "learning_rate": 8.822118886081961e-08, "logits/chosen": -0.16724136471748352, "logits/rejected": -0.3808498978614807, "logps/chosen": -160.11105346679688, "logps/rejected": -210.70327758789062, "loss": 1.3192, "nll_loss": 0.8742419481277466, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.984594345092773, "rewards/margins": 3.297489643096924, "rewards/rejected": 1.6871049404144287, "step": 4020 }, { "epoch": 0.22357526247902249, "grad_norm": 60.77824401855469, "learning_rate": 8.816494694950675e-08, "logits/chosen": -0.24743108451366425, "logits/rejected": -0.40931805968284607, "logps/chosen": -162.1366729736328, "logps/rejected": -207.17422485351562, "loss": 1.3964, "nll_loss": 0.8974650502204895, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.02827262878418, "rewards/margins": 3.0538411140441895, "rewards/rejected": 1.9744312763214111, "step": 4030 }, { "epoch": 0.22413003980527316, "grad_norm": 85.68519592285156, "learning_rate": 8.810858910341137e-08, "logits/chosen": -0.4329708218574524, "logits/rejected": -0.5223310589790344, "logps/chosen": -189.80859375, "logps/rejected": -210.829833984375, "loss": 1.3957, "nll_loss": 1.077449917793274, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.409401893615723, "rewards/margins": 2.2836270332336426, "rewards/rejected": 3.1257758140563965, "step": 4040 }, { "epoch": 0.22468481713152383, "grad_norm": 55.49538040161133, "learning_rate": 8.805211549373334e-08, "logits/chosen": -0.25056955218315125, "logits/rejected": -0.3673659861087799, "logps/chosen": -178.02268981933594, "logps/rejected": -230.04541015625, "loss": 1.3563, "nll_loss": 0.9556955099105835, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.048086166381836, "rewards/margins": 2.6971967220306396, "rewards/rejected": 2.350889205932617, "step": 4050 }, { "epoch": 0.2252395944577745, "grad_norm": 68.45481872558594, "learning_rate": 8.799552629202423e-08, "logits/chosen": -0.33979806303977966, "logits/rejected": -0.4661482274532318, "logps/chosen": -175.78135681152344, "logps/rejected": -208.96316528320312, "loss": 1.3225, "nll_loss": 0.9899007678031921, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.1168084144592285, "rewards/margins": 3.1300911903381348, "rewards/rejected": 1.9867169857025146, "step": 4060 }, { "epoch": 0.2257943717840252, "grad_norm": 36.38805389404297, "learning_rate": 8.793882167018671e-08, "logits/chosen": -0.33051663637161255, "logits/rejected": -0.37100648880004883, "logps/chosen": -189.0030517578125, "logps/rejected": -244.38217163085938, "loss": 1.2831, "nll_loss": 1.0483386516571045, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.193713188171387, "rewards/margins": 2.911698818206787, "rewards/rejected": 2.282015085220337, "step": 4070 }, { "epoch": 0.22634914911027587, "grad_norm": 108.84422302246094, "learning_rate": 8.788200180047407e-08, "logits/chosen": -0.20089511573314667, "logits/rejected": -0.32828986644744873, "logps/chosen": -148.81640625, "logps/rejected": -205.16873168945312, "loss": 1.3197, "nll_loss": 0.8446512222290039, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.87085485458374, "rewards/margins": 3.210014820098877, "rewards/rejected": 1.660840630531311, "step": 4080 }, { "epoch": 0.22690392643652654, "grad_norm": 129.64999389648438, "learning_rate": 8.78250668554897e-08, "logits/chosen": -0.2841266989707947, "logits/rejected": -0.37601885199546814, "logps/chosen": -163.07171630859375, "logps/rejected": -204.03134155273438, "loss": 1.296, "nll_loss": 0.9791350364685059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.949339866638184, "rewards/margins": 2.7364394664764404, "rewards/rejected": 2.2129006385803223, "step": 4090 }, { "epoch": 0.22745870376277721, "grad_norm": 68.07540130615234, "learning_rate": 8.776801700818656e-08, "logits/chosen": -0.3015419542789459, "logits/rejected": -0.38536086678504944, "logps/chosen": -148.27731323242188, "logps/rejected": -204.63931274414062, "loss": 1.3958, "nll_loss": 1.005378007888794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.849169731140137, "rewards/margins": 3.0641539096832275, "rewards/rejected": 1.7850162982940674, "step": 4100 }, { "epoch": 0.2280134810890279, "grad_norm": 73.26811981201172, "learning_rate": 8.771085243186669e-08, "logits/chosen": -0.12376414239406586, "logits/rejected": -0.23722615838050842, "logps/chosen": -138.64968872070312, "logps/rejected": -163.23028564453125, "loss": 1.3529, "nll_loss": 0.7532340288162231, "rewards/accuracies": 0.875, "rewards/chosen": 4.229341506958008, "rewards/margins": 2.931063175201416, "rewards/rejected": 1.2982782125473022, "step": 4110 }, { "epoch": 0.22856825841527856, "grad_norm": 97.30023956298828, "learning_rate": 8.765357330018055e-08, "logits/chosen": 0.011545022949576378, "logits/rejected": -0.09808467328548431, "logps/chosen": -129.2597198486328, "logps/rejected": -153.56124877929688, "loss": 1.3334, "nll_loss": 0.8299420475959778, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.067888259887695, "rewards/margins": 2.3037543296813965, "rewards/rejected": 1.7641338109970093, "step": 4120 }, { "epoch": 0.22912303574152926, "grad_norm": 89.39717102050781, "learning_rate": 8.759617978712666e-08, "logits/chosen": -0.40456628799438477, "logits/rejected": -0.4786996841430664, "logps/chosen": -192.72161865234375, "logps/rejected": -263.7474670410156, "loss": 1.375, "nll_loss": 1.0599608421325684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.777060031890869, "rewards/margins": 4.0243730545043945, "rewards/rejected": 1.7526872158050537, "step": 4130 }, { "epoch": 0.22967781306777993, "grad_norm": 55.39466094970703, "learning_rate": 8.753867206705098e-08, "logits/chosen": -0.3558950424194336, "logits/rejected": -0.42491593956947327, "logps/chosen": -179.45462036132812, "logps/rejected": -226.12252807617188, "loss": 1.3064, "nll_loss": 1.0498034954071045, "rewards/accuracies": 0.75, "rewards/chosen": 5.1217498779296875, "rewards/margins": 2.216548204421997, "rewards/rejected": 2.9052016735076904, "step": 4140 }, { "epoch": 0.2302325903940306, "grad_norm": 108.61188507080078, "learning_rate": 8.748105031464643e-08, "logits/chosen": -0.4373705983161926, "logits/rejected": -0.5036669969558716, "logps/chosen": -169.29861450195312, "logps/rejected": -204.67160034179688, "loss": 1.3576, "nll_loss": 1.0841724872589111, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.15322208404541, "rewards/margins": 2.883450984954834, "rewards/rejected": 2.269770622253418, "step": 4150 }, { "epoch": 0.23078736772028127, "grad_norm": 63.595794677734375, "learning_rate": 8.74233147049523e-08, "logits/chosen": -0.15530423820018768, "logits/rejected": -0.24546948075294495, "logps/chosen": -165.0889892578125, "logps/rejected": -188.34654235839844, "loss": 1.3574, "nll_loss": 0.9425103068351746, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.631152153015137, "rewards/margins": 2.0830862522125244, "rewards/rejected": 2.5480661392211914, "step": 4160 }, { "epoch": 0.23134214504653194, "grad_norm": 119.82685852050781, "learning_rate": 8.736546541335371e-08, "logits/chosen": -0.40301522612571716, "logits/rejected": -0.4933186173439026, "logps/chosen": -208.2997589111328, "logps/rejected": -272.7257385253906, "loss": 1.371, "nll_loss": 1.113821268081665, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.932578086853027, "rewards/margins": 3.926304578781128, "rewards/rejected": 2.006272554397583, "step": 4170 }, { "epoch": 0.23189692237278262, "grad_norm": 63.66457748413086, "learning_rate": 8.730750261558119e-08, "logits/chosen": -0.30639415979385376, "logits/rejected": -0.3842052221298218, "logps/chosen": -189.58531188964844, "logps/rejected": -267.81903076171875, "loss": 1.4157, "nll_loss": 1.135546326637268, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 5.254769802093506, "rewards/margins": 2.687643527984619, "rewards/rejected": 2.5671257972717285, "step": 4180 }, { "epoch": 0.2324516996990333, "grad_norm": 67.39790344238281, "learning_rate": 8.724942648771003e-08, "logits/chosen": -0.3820488154888153, "logits/rejected": -0.5085964202880859, "logps/chosen": -182.63333129882812, "logps/rejected": -258.7193908691406, "loss": 1.3241, "nll_loss": 1.1056026220321655, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.469270706176758, "rewards/margins": 3.692648410797119, "rewards/rejected": 1.7766224145889282, "step": 4190 }, { "epoch": 0.233006477025284, "grad_norm": 31.938941955566406, "learning_rate": 8.71912372061598e-08, "logits/chosen": -0.3462616801261902, "logits/rejected": -0.4664136469364166, "logps/chosen": -193.03187561035156, "logps/rejected": -243.20504760742188, "loss": 1.3471, "nll_loss": 1.0250604152679443, "rewards/accuracies": 0.875, "rewards/chosen": 5.235898017883301, "rewards/margins": 3.276707410812378, "rewards/rejected": 1.9591907262802124, "step": 4200 }, { "epoch": 0.23356125435153466, "grad_norm": 43.32523727416992, "learning_rate": 8.713293494769378e-08, "logits/chosen": -0.2849578261375427, "logits/rejected": -0.42158952355384827, "logps/chosen": -177.0216522216797, "logps/rejected": -234.6879425048828, "loss": 1.3577, "nll_loss": 1.0065664052963257, "rewards/accuracies": 0.75, "rewards/chosen": 5.119265079498291, "rewards/margins": 2.2945590019226074, "rewards/rejected": 2.8247063159942627, "step": 4210 }, { "epoch": 0.23411603167778533, "grad_norm": 81.4455337524414, "learning_rate": 8.707451988941846e-08, "logits/chosen": -0.23719966411590576, "logits/rejected": -0.3504584729671478, "logps/chosen": -195.0302276611328, "logps/rejected": -249.18551635742188, "loss": 1.3832, "nll_loss": 1.0661619901657104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.4248480796813965, "rewards/margins": 3.603041172027588, "rewards/rejected": 1.8218071460723877, "step": 4220 }, { "epoch": 0.234670809004036, "grad_norm": 60.50730895996094, "learning_rate": 8.701599220878297e-08, "logits/chosen": -0.27704206109046936, "logits/rejected": -0.41974037885665894, "logps/chosen": -190.57310485839844, "logps/rejected": -243.5283966064453, "loss": 1.4058, "nll_loss": 1.0585263967514038, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.695257663726807, "rewards/margins": 2.6654000282287598, "rewards/rejected": 3.0298571586608887, "step": 4230 }, { "epoch": 0.23522558633028667, "grad_norm": 129.30686950683594, "learning_rate": 8.695735208357859e-08, "logits/chosen": -0.37667936086654663, "logits/rejected": -0.4965333938598633, "logps/chosen": -202.59652709960938, "logps/rejected": -249.3206787109375, "loss": 1.3356, "nll_loss": 1.1458075046539307, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.792067527770996, "rewards/margins": 2.907867431640625, "rewards/rejected": 2.8842005729675293, "step": 4240 }, { "epoch": 0.23578036365653735, "grad_norm": 80.4828109741211, "learning_rate": 8.689859969193816e-08, "logits/chosen": -0.2171669751405716, "logits/rejected": -0.33526545763015747, "logps/chosen": -186.0350341796875, "logps/rejected": -253.03451538085938, "loss": 1.3282, "nll_loss": 0.947732150554657, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.438222885131836, "rewards/margins": 3.2459194660186768, "rewards/rejected": 2.192303419113159, "step": 4250 }, { "epoch": 0.23633514098278804, "grad_norm": 47.65826416015625, "learning_rate": 8.683973521233552e-08, "logits/chosen": -0.22899992763996124, "logits/rejected": -0.3872426152229309, "logps/chosen": -183.62936401367188, "logps/rejected": -225.7091827392578, "loss": 1.2983, "nll_loss": 0.975047767162323, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.47422456741333, "rewards/margins": 3.2760257720947266, "rewards/rejected": 2.1981987953186035, "step": 4260 }, { "epoch": 0.23688991830903872, "grad_norm": 166.94833374023438, "learning_rate": 8.678075882358505e-08, "logits/chosen": -0.3284236788749695, "logits/rejected": -0.40388602018356323, "logps/chosen": -186.22012329101562, "logps/rejected": -234.9075927734375, "loss": 1.4147, "nll_loss": 1.1497867107391357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.253143310546875, "rewards/margins": 2.696260452270508, "rewards/rejected": 2.5568830966949463, "step": 4270 }, { "epoch": 0.2374446956352894, "grad_norm": 53.439842224121094, "learning_rate": 8.672167070484104e-08, "logits/chosen": -0.0332237184047699, "logits/rejected": -0.19702832400798798, "logps/chosen": -143.75985717773438, "logps/rejected": -175.3542938232422, "loss": 1.2939, "nll_loss": 0.7661317586898804, "rewards/accuracies": 0.875, "rewards/chosen": 4.622331619262695, "rewards/margins": 2.994150161743164, "rewards/rejected": 1.6281816959381104, "step": 4280 }, { "epoch": 0.23799947296154006, "grad_norm": 62.4766731262207, "learning_rate": 8.666247103559725e-08, "logits/chosen": -0.17678096890449524, "logits/rejected": -0.2632465958595276, "logps/chosen": -160.861083984375, "logps/rejected": -194.93165588378906, "loss": 1.3603, "nll_loss": 0.9933874011039734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.4631524085998535, "rewards/margins": 1.8023935556411743, "rewards/rejected": 2.6607584953308105, "step": 4290 }, { "epoch": 0.23855425028779073, "grad_norm": 146.61288452148438, "learning_rate": 8.660315999568622e-08, "logits/chosen": 0.062420736998319626, "logits/rejected": -0.1173282116651535, "logps/chosen": -104.2728500366211, "logps/rejected": -138.85101318359375, "loss": 1.2621, "nll_loss": 0.6970239877700806, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7959773540496826, "rewards/margins": 2.769336223602295, "rewards/rejected": 1.0266412496566772, "step": 4300 }, { "epoch": 0.2391090276140414, "grad_norm": 86.5093994140625, "learning_rate": 8.654373776527886e-08, "logits/chosen": -0.4773966372013092, "logits/rejected": -0.5677574276924133, "logps/chosen": -207.92684936523438, "logps/rejected": -268.27423095703125, "loss": 1.3202, "nll_loss": 1.1020228862762451, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.070623397827148, "rewards/margins": 3.890537977218628, "rewards/rejected": 2.1800854206085205, "step": 4310 }, { "epoch": 0.2396638049402921, "grad_norm": 75.80883026123047, "learning_rate": 8.648420452488381e-08, "logits/chosen": -0.03923141211271286, "logits/rejected": -0.20453393459320068, "logps/chosen": -129.60968017578125, "logps/rejected": -183.7969207763672, "loss": 1.3065, "nll_loss": 0.7038072943687439, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.458006381988525, "rewards/margins": 2.861825466156006, "rewards/rejected": 1.59618079662323, "step": 4320 }, { "epoch": 0.24021858226654277, "grad_norm": 61.33818054199219, "learning_rate": 8.642456045534697e-08, "logits/chosen": -0.09462814033031464, "logits/rejected": -0.273030549287796, "logps/chosen": -146.52047729492188, "logps/rejected": -194.67807006835938, "loss": 1.3122, "nll_loss": 0.88775235414505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.361196994781494, "rewards/margins": 2.2622735500335693, "rewards/rejected": 2.0989232063293457, "step": 4330 }, { "epoch": 0.24077335959279345, "grad_norm": 95.40966033935547, "learning_rate": 8.636480573785088e-08, "logits/chosen": -0.06831637769937515, "logits/rejected": -0.19084826111793518, "logps/chosen": -132.2844696044922, "logps/rejected": -149.97573852539062, "loss": 1.2389, "nll_loss": 0.7682362794876099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.953458070755005, "rewards/margins": 2.6358141899108887, "rewards/rejected": 1.3176437616348267, "step": 4340 }, { "epoch": 0.24132813691904412, "grad_norm": 66.08728790283203, "learning_rate": 8.630494055391418e-08, "logits/chosen": -0.22335462272167206, "logits/rejected": -0.3700319826602936, "logps/chosen": -151.97804260253906, "logps/rejected": -237.0194549560547, "loss": 1.3026, "nll_loss": 0.9709617495536804, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.424733638763428, "rewards/margins": 3.8076565265655518, "rewards/rejected": 1.6170778274536133, "step": 4350 }, { "epoch": 0.2418829142452948, "grad_norm": 84.6449966430664, "learning_rate": 8.624496508539112e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -157.78152465820312, "logps/rejected": -205.28628540039062, "loss": 1.4204, "nll_loss": NaN, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.4751176834106445, "rewards/margins": 4.087596416473389, "rewards/rejected": 1.387520670890808, "step": 4360 }, { "epoch": 0.24243769157154546, "grad_norm": 94.63330078125, "learning_rate": 8.618487951447095e-08, "logits/chosen": -0.4150986075401306, "logits/rejected": -0.5466437339782715, "logps/chosen": -187.41586303710938, "logps/rejected": -238.2434539794922, "loss": 1.2492, "nll_loss": 1.0808436870574951, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.215986251831055, "rewards/margins": 3.6832032203674316, "rewards/rejected": 1.5327831506729126, "step": 4370 }, { "epoch": 0.24299246889779616, "grad_norm": 55.334938049316406, "learning_rate": 8.612468402367738e-08, "logits/chosen": -0.20146456360816956, "logits/rejected": -0.3176548182964325, "logps/chosen": -161.24691772460938, "logps/rejected": -197.61886596679688, "loss": 1.356, "nll_loss": 0.9338601231575012, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.561044216156006, "rewards/margins": 3.121908664703369, "rewards/rejected": 1.4391355514526367, "step": 4380 }, { "epoch": 0.24354724622404683, "grad_norm": 69.12451171875, "learning_rate": 8.606437879586799e-08, "logits/chosen": -0.22908329963684082, "logits/rejected": -0.352277934551239, "logps/chosen": -165.62339782714844, "logps/rejected": -209.05032348632812, "loss": 1.3846, "nll_loss": 0.9394363164901733, "rewards/accuracies": 0.875, "rewards/chosen": 5.0261735916137695, "rewards/margins": 3.0195279121398926, "rewards/rejected": 2.006645679473877, "step": 4390 }, { "epoch": 0.2441020235502975, "grad_norm": 102.31427764892578, "learning_rate": 8.60039640142338e-08, "logits/chosen": -0.2176387757062912, "logits/rejected": -0.31441378593444824, "logps/chosen": -168.29946899414062, "logps/rejected": -223.59280395507812, "loss": 1.4105, "nll_loss": 0.9233170747756958, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.863080024719238, "rewards/margins": 2.7318460941314697, "rewards/rejected": 2.1312336921691895, "step": 4400 }, { "epoch": 0.24465680087654817, "grad_norm": 68.1030502319336, "learning_rate": 8.594343986229853e-08, "logits/chosen": -0.14959707856178284, "logits/rejected": -0.2593737244606018, "logps/chosen": -169.5933074951172, "logps/rejected": -197.62954711914062, "loss": 1.2849, "nll_loss": 0.8743448257446289, "rewards/accuracies": 0.875, "rewards/chosen": 4.89533805847168, "rewards/margins": 2.8288931846618652, "rewards/rejected": 2.0664451122283936, "step": 4410 }, { "epoch": 0.24521157820279885, "grad_norm": 99.41896057128906, "learning_rate": 8.588280652391819e-08, "logits/chosen": -0.2931186556816101, "logits/rejected": -0.4342547357082367, "logps/chosen": -183.0137939453125, "logps/rejected": -246.53598022460938, "loss": 1.3999, "nll_loss": 0.9624223709106445, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.3488383293151855, "rewards/margins": 4.081454277038574, "rewards/rejected": 1.2673838138580322, "step": 4420 }, { "epoch": 0.24576635552904952, "grad_norm": 67.750732421875, "learning_rate": 8.582206418328044e-08, "logits/chosen": -0.3752570152282715, "logits/rejected": -0.4712037146091461, "logps/chosen": -182.9212646484375, "logps/rejected": -234.5699005126953, "loss": 1.3834, "nll_loss": 1.0797786712646484, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.707915782928467, "rewards/margins": 3.22369122505188, "rewards/rejected": 2.484224796295166, "step": 4430 }, { "epoch": 0.24632113285530022, "grad_norm": 47.93496322631836, "learning_rate": 8.57612130249041e-08, "logits/chosen": -0.3381286859512329, "logits/rejected": -0.43061742186546326, "logps/chosen": -175.91787719726562, "logps/rejected": -221.03335571289062, "loss": 1.2475, "nll_loss": 0.9814583659172058, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.677148342132568, "rewards/margins": 3.701653242111206, "rewards/rejected": 1.9754953384399414, "step": 4440 }, { "epoch": 0.2468759101815509, "grad_norm": 56.9894905090332, "learning_rate": 8.570025323363852e-08, "logits/chosen": -0.03819179907441139, "logits/rejected": -0.20645050704479218, "logps/chosen": -150.8126983642578, "logps/rejected": -158.15496826171875, "loss": 1.3705, "nll_loss": 0.8135994076728821, "rewards/accuracies": 0.875, "rewards/chosen": 4.05548620223999, "rewards/margins": 1.940616250038147, "rewards/rejected": 2.1148698329925537, "step": 4450 }, { "epoch": 0.24743068750780156, "grad_norm": 65.15169525146484, "learning_rate": 8.563918499466304e-08, "logits/chosen": -0.3688794672489166, "logits/rejected": -0.5100642442703247, "logps/chosen": -182.76837158203125, "logps/rejected": -246.40133666992188, "loss": 1.3296, "nll_loss": 1.0879732370376587, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.046154022216797, "rewards/margins": 3.9471893310546875, "rewards/rejected": 2.0989649295806885, "step": 4460 }, { "epoch": 0.24798546483405223, "grad_norm": 58.4198112487793, "learning_rate": 8.557800849348647e-08, "logits/chosen": -0.42738962173461914, "logits/rejected": -0.549644410610199, "logps/chosen": -210.28811645507812, "logps/rejected": -254.00711059570312, "loss": 1.3312, "nll_loss": 1.1513640880584717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.145230293273926, "rewards/margins": 3.2168631553649902, "rewards/rejected": 2.9283671379089355, "step": 4470 }, { "epoch": 0.2485402421603029, "grad_norm": 103.25559997558594, "learning_rate": 8.551672391594645e-08, "logits/chosen": -0.10428180545568466, "logits/rejected": -0.195042222738266, "logps/chosen": -157.71859741210938, "logps/rejected": -180.86099243164062, "loss": 1.3473, "nll_loss": 0.8617512583732605, "rewards/accuracies": 0.875, "rewards/chosen": 4.883713722229004, "rewards/margins": 2.410423994064331, "rewards/rejected": 2.473289966583252, "step": 4480 }, { "epoch": 0.24909501948655358, "grad_norm": 52.8361701965332, "learning_rate": 8.545533144820892e-08, "logits/chosen": -0.20854294300079346, "logits/rejected": -0.30969762802124023, "logps/chosen": -180.6295166015625, "logps/rejected": -237.1515350341797, "loss": 1.471, "nll_loss": 0.9408046007156372, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.4925456047058105, "rewards/margins": 2.8565022945404053, "rewards/rejected": 2.6360433101654053, "step": 4490 }, { "epoch": 0.24964979681280425, "grad_norm": 72.89659881591797, "learning_rate": 8.539383127676763e-08, "logits/chosen": -0.23576506972312927, "logits/rejected": -0.3891776502132416, "logps/chosen": -212.4176483154297, "logps/rejected": -269.67706298828125, "loss": 1.3893, "nll_loss": 1.0825352668762207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.811644077301025, "rewards/margins": 3.8783583641052246, "rewards/rejected": 1.9332859516143799, "step": 4500 }, { "epoch": 0.24964979681280425, "eval_logits/chosen": -0.3306578993797302, "eval_logits/rejected": -0.40762603282928467, "eval_logps/chosen": -199.13613891601562, "eval_logps/rejected": -259.206298828125, "eval_loss": 1.2888281345367432, "eval_nll_loss": 1.0351033210754395, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 5.882508277893066, "eval_rewards/margins": 3.895287036895752, "eval_rewards/rejected": 1.9872204065322876, "eval_runtime": 16.8043, "eval_samples_per_second": 15.234, "eval_steps_per_second": 1.904, "step": 4500 }, { "epoch": 0.25020457413905495, "grad_norm": 72.03968811035156, "learning_rate": 8.533222358844345e-08, "logits/chosen": -0.38020601868629456, "logits/rejected": -0.4878564774990082, "logps/chosen": -176.05238342285156, "logps/rejected": -223.0900115966797, "loss": 1.3742, "nll_loss": 1.063004970550537, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.97015905380249, "rewards/margins": 3.1073145866394043, "rewards/rejected": 1.8628448247909546, "step": 4510 }, { "epoch": 0.2507593514653056, "grad_norm": 25.4820613861084, "learning_rate": 8.527050857038385e-08, "logits/chosen": -0.037890512496232986, "logits/rejected": -0.22132185101509094, "logps/chosen": -138.64907836914062, "logps/rejected": -177.23446655273438, "loss": 1.2826, "nll_loss": 0.7740511298179626, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.439934253692627, "rewards/margins": 3.037219285964966, "rewards/rejected": 1.4027149677276611, "step": 4520 }, { "epoch": 0.2513141287915563, "grad_norm": 50.93412399291992, "learning_rate": 8.520868641006238e-08, "logits/chosen": -0.1365940272808075, "logits/rejected": -0.30282798409461975, "logps/chosen": -132.58665466308594, "logps/rejected": -169.67105102539062, "loss": 1.3555, "nll_loss": 0.7856622338294983, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.3856024742126465, "rewards/margins": 2.379370927810669, "rewards/rejected": 2.0062317848205566, "step": 4530 }, { "epoch": 0.251868906117807, "grad_norm": 46.51472091674805, "learning_rate": 8.514675729527801e-08, "logits/chosen": -0.34577757120132446, "logits/rejected": -0.4794695973396301, "logps/chosen": -205.43167114257812, "logps/rejected": -267.23724365234375, "loss": 1.2885, "nll_loss": 1.064664602279663, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.145432472229004, "rewards/margins": 3.6067302227020264, "rewards/rejected": 2.5387020111083984, "step": 4540 }, { "epoch": 0.25242368344405763, "grad_norm": 62.636844635009766, "learning_rate": 8.508472141415466e-08, "logits/chosen": -0.18032360076904297, "logits/rejected": -0.2991257905960083, "logps/chosen": -165.84439086914062, "logps/rejected": -233.81494140625, "loss": 1.3154, "nll_loss": 0.9324311017990112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.935610771179199, "rewards/margins": 3.13999080657959, "rewards/rejected": 1.7956199645996094, "step": 4550 }, { "epoch": 0.25297846077030833, "grad_norm": 73.33432006835938, "learning_rate": 8.502257895514053e-08, "logits/chosen": -0.30248206853866577, "logits/rejected": -0.442868709564209, "logps/chosen": -171.95774841308594, "logps/rejected": -244.6792449951172, "loss": 1.2763, "nll_loss": 0.9759060144424438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.5328264236450195, "rewards/margins": 4.3905439376831055, "rewards/rejected": 1.1422834396362305, "step": 4560 }, { "epoch": 0.253533238096559, "grad_norm": 36.83837127685547, "learning_rate": 8.496033010700761e-08, "logits/chosen": -0.2894170880317688, "logits/rejected": -0.44485601782798767, "logps/chosen": -197.3072509765625, "logps/rejected": -252.8368377685547, "loss": 1.379, "nll_loss": 1.0642839670181274, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.660626411437988, "rewards/margins": 4.005428314208984, "rewards/rejected": 1.6551977396011353, "step": 4570 }, { "epoch": 0.2540880154228097, "grad_norm": 65.84944915771484, "learning_rate": 8.489797505885105e-08, "logits/chosen": -0.18601150810718536, "logits/rejected": -0.3102690577507019, "logps/chosen": -180.2449493408203, "logps/rejected": -254.8947296142578, "loss": 1.2849, "nll_loss": 0.957818329334259, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.565386772155762, "rewards/margins": 3.5227177143096924, "rewards/rejected": 2.0426688194274902, "step": 4580 }, { "epoch": 0.2546427927490603, "grad_norm": 127.11788177490234, "learning_rate": 8.483551400008864e-08, "logits/chosen": -0.0906374454498291, "logits/rejected": -0.26230689883232117, "logps/chosen": -161.72662353515625, "logps/rejected": -198.145263671875, "loss": 1.3407, "nll_loss": 0.9006088972091675, "rewards/accuracies": 0.875, "rewards/chosen": 4.747382164001465, "rewards/margins": 2.776177167892456, "rewards/rejected": 1.9712049961090088, "step": 4590 }, { "epoch": 0.255197570075311, "grad_norm": 45.21589279174805, "learning_rate": 8.477294712046014e-08, "logits/chosen": -0.10336129367351532, "logits/rejected": -0.31331485509872437, "logps/chosen": -147.0296630859375, "logps/rejected": -198.47779846191406, "loss": 1.293, "nll_loss": 0.8451549410820007, "rewards/accuracies": 0.875, "rewards/chosen": 4.42185640335083, "rewards/margins": 2.895052433013916, "rewards/rejected": 1.5268040895462036, "step": 4600 }, { "epoch": 0.2557523474015617, "grad_norm": 189.2617950439453, "learning_rate": 8.471027461002683e-08, "logits/chosen": -0.19419385492801666, "logits/rejected": -0.3464725613594055, "logps/chosen": -148.30079650878906, "logps/rejected": -208.98556518554688, "loss": 1.313, "nll_loss": 0.8720955848693848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.144837379455566, "rewards/margins": 3.403660297393799, "rewards/rejected": 1.7411772012710571, "step": 4610 }, { "epoch": 0.25630712472781236, "grad_norm": 66.42888641357422, "learning_rate": 8.46474966591708e-08, "logits/chosen": -0.25781363248825073, "logits/rejected": -0.35114097595214844, "logps/chosen": -172.5027618408203, "logps/rejected": -205.27633666992188, "loss": 1.3957, "nll_loss": 0.9941679835319519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.990196704864502, "rewards/margins": 2.257720470428467, "rewards/rejected": 2.732475757598877, "step": 4620 }, { "epoch": 0.25686190205406306, "grad_norm": 56.031105041503906, "learning_rate": 8.458461345859453e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -166.62411499023438, "logps/rejected": -231.1585693359375, "loss": 1.2733, "nll_loss": NaN, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.820073127746582, "rewards/margins": 3.484143018722534, "rewards/rejected": 1.3359302282333374, "step": 4630 }, { "epoch": 0.2574166793803137, "grad_norm": 89.39034271240234, "learning_rate": 8.452162519932012e-08, "logits/chosen": -0.3336629867553711, "logits/rejected": -0.41933974623680115, "logps/chosen": -194.59652709960938, "logps/rejected": -244.4859161376953, "loss": 1.4353, "nll_loss": 1.1331466436386108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.238390922546387, "rewards/margins": 1.9931983947753906, "rewards/rejected": 3.245192766189575, "step": 4640 }, { "epoch": 0.2579714567065644, "grad_norm": 59.741416931152344, "learning_rate": 8.44585320726889e-08, "logits/chosen": -0.29769474267959595, "logits/rejected": -0.38778576254844666, "logps/chosen": -181.58316040039062, "logps/rejected": -249.23178100585938, "loss": 1.317, "nll_loss": 1.0540294647216797, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.46547794342041, "rewards/margins": 3.398808717727661, "rewards/rejected": 2.06666898727417, "step": 4650 }, { "epoch": 0.2585262340328151, "grad_norm": 90.04450225830078, "learning_rate": 8.43953342703607e-08, "logits/chosen": -0.04572884738445282, "logits/rejected": -0.17261159420013428, "logps/chosen": -130.46591186523438, "logps/rejected": -192.3133087158203, "loss": 1.2683, "nll_loss": 0.7979989647865295, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.13479471206665, "rewards/margins": 3.162766456604004, "rewards/rejected": 0.9720277786254883, "step": 4660 }, { "epoch": 0.25908101135906575, "grad_norm": 51.916786193847656, "learning_rate": 8.433203198431336e-08, "logits/chosen": -0.10603030771017075, "logits/rejected": -0.25791341066360474, "logps/chosen": -147.1509246826172, "logps/rejected": -206.18197631835938, "loss": 1.4653, "nll_loss": 0.8920075297355652, "rewards/accuracies": 0.875, "rewards/chosen": 4.549837589263916, "rewards/margins": 2.8990442752838135, "rewards/rejected": 1.6507936716079712, "step": 4670 }, { "epoch": 0.25963578868531645, "grad_norm": 55.086280822753906, "learning_rate": 8.426862540684206e-08, "logits/chosen": -0.2217932492494583, "logits/rejected": -0.3490327298641205, "logps/chosen": -168.48159790039062, "logps/rejected": -217.4625244140625, "loss": 1.29, "nll_loss": 1.0062181949615479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.952488422393799, "rewards/margins": 2.8383209705352783, "rewards/rejected": 2.1141676902770996, "step": 4680 }, { "epoch": 0.2601905660115671, "grad_norm": 64.567626953125, "learning_rate": 8.420511473055886e-08, "logits/chosen": -0.15327800810337067, "logits/rejected": -0.2819003164768219, "logps/chosen": -166.82540893554688, "logps/rejected": -217.75234985351562, "loss": 1.2892, "nll_loss": 0.9549553990364075, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.273451328277588, "rewards/margins": 3.76080322265625, "rewards/rejected": 1.5126473903656006, "step": 4690 }, { "epoch": 0.2607453433378178, "grad_norm": 49.292476654052734, "learning_rate": 8.414150014839199e-08, "logits/chosen": -0.2889734208583832, "logits/rejected": -0.4110495448112488, "logps/chosen": -206.3968505859375, "logps/rejected": -283.59552001953125, "loss": 1.3333, "nll_loss": 1.0700557231903076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.977264404296875, "rewards/margins": 3.681428909301758, "rewards/rejected": 2.2958357334136963, "step": 4700 }, { "epoch": 0.26130012066406844, "grad_norm": 83.20051574707031, "learning_rate": 8.407778185358536e-08, "logits/chosen": -0.498958021402359, "logits/rejected": -0.5524585247039795, "logps/chosen": -214.4412078857422, "logps/rejected": -242.309814453125, "loss": 1.3835, "nll_loss": 1.2496415376663208, "rewards/accuracies": 0.75, "rewards/chosen": 5.918186664581299, "rewards/margins": 2.482825517654419, "rewards/rejected": 3.4353606700897217, "step": 4710 }, { "epoch": 0.26185489799031914, "grad_norm": 60.543052673339844, "learning_rate": 8.40139600396979e-08, "logits/chosen": -0.2590022683143616, "logits/rejected": -0.3653254508972168, "logps/chosen": -154.97506713867188, "logps/rejected": -201.74185180664062, "loss": 1.3559, "nll_loss": 0.9882938265800476, "rewards/accuracies": 0.875, "rewards/chosen": 5.30222749710083, "rewards/margins": 3.420788288116455, "rewards/rejected": 1.8814388513565063, "step": 4720 }, { "epoch": 0.26240967531656983, "grad_norm": 37.433589935302734, "learning_rate": 8.3950034900603e-08, "logits/chosen": -0.23119473457336426, "logits/rejected": -0.32892414927482605, "logps/chosen": -158.8278350830078, "logps/rejected": -200.85879516601562, "loss": 1.3303, "nll_loss": 0.9745529890060425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.2692413330078125, "rewards/margins": 3.4871573448181152, "rewards/rejected": 1.7820838689804077, "step": 4730 }, { "epoch": 0.2629644526428205, "grad_norm": 150.04627990722656, "learning_rate": 8.388600663048794e-08, "logits/chosen": -0.1670699119567871, "logits/rejected": -0.3385860025882721, "logps/chosen": -179.1607208251953, "logps/rejected": -281.8492126464844, "loss": 1.4205, "nll_loss": 0.9550544619560242, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.709284782409668, "rewards/margins": 4.198940277099609, "rewards/rejected": 1.510345220565796, "step": 4740 }, { "epoch": 0.2635192299690712, "grad_norm": 52.15652847290039, "learning_rate": 8.382187542385328e-08, "logits/chosen": -0.2738000452518463, "logits/rejected": -0.349651962518692, "logps/chosen": -177.93954467773438, "logps/rejected": -226.75778198242188, "loss": 1.2965, "nll_loss": 1.01934814453125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.248774528503418, "rewards/margins": 3.257829189300537, "rewards/rejected": 1.9909454584121704, "step": 4750 }, { "epoch": 0.2640740072953218, "grad_norm": 49.90021896362305, "learning_rate": 8.37576414755123e-08, "logits/chosen": -0.35484129190444946, "logits/rejected": -0.41778916120529175, "logps/chosen": -189.77159118652344, "logps/rejected": -231.64547729492188, "loss": 1.341, "nll_loss": 1.0807373523712158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.559969425201416, "rewards/margins": 2.28197979927063, "rewards/rejected": 3.277989625930786, "step": 4760 }, { "epoch": 0.2646287846215725, "grad_norm": 48.15873336791992, "learning_rate": 8.369330498059033e-08, "logits/chosen": -0.16155406832695007, "logits/rejected": -0.2664300501346588, "logps/chosen": -164.81192016601562, "logps/rejected": -231.3193817138672, "loss": 1.3654, "nll_loss": 1.0048094987869263, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.006320953369141, "rewards/margins": 3.9034602642059326, "rewards/rejected": 1.1028602123260498, "step": 4770 }, { "epoch": 0.26518356194782317, "grad_norm": 229.7599639892578, "learning_rate": 8.362886613452423e-08, "logits/chosen": -0.15005961060523987, "logits/rejected": -0.23518244922161102, "logps/chosen": -163.5731658935547, "logps/rejected": -202.78944396972656, "loss": 1.3263, "nll_loss": 0.8745111227035522, "rewards/accuracies": 0.75, "rewards/chosen": 4.994362831115723, "rewards/margins": 2.4524505138397217, "rewards/rejected": 2.541912317276001, "step": 4780 }, { "epoch": 0.26573833927407386, "grad_norm": 70.23784637451172, "learning_rate": 8.35643251330618e-08, "logits/chosen": -0.2799859046936035, "logits/rejected": -0.37818005681037903, "logps/chosen": -196.68984985351562, "logps/rejected": -237.1707763671875, "loss": 1.2948, "nll_loss": 1.0815504789352417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.128037929534912, "rewards/margins": 3.218574047088623, "rewards/rejected": 1.909463882446289, "step": 4790 }, { "epoch": 0.26629311660032456, "grad_norm": 57.96487808227539, "learning_rate": 8.349968217226113e-08, "logits/chosen": -0.31223830580711365, "logits/rejected": -0.4266796112060547, "logps/chosen": -175.681640625, "logps/rejected": -212.2870635986328, "loss": 1.2977, "nll_loss": 1.0572354793548584, "rewards/accuracies": 0.875, "rewards/chosen": 5.238940238952637, "rewards/margins": 2.6529510021209717, "rewards/rejected": 2.585989475250244, "step": 4800 }, { "epoch": 0.2668478939265752, "grad_norm": 43.18544387817383, "learning_rate": 8.343493744849001e-08, "logits/chosen": -0.2462203949689865, "logits/rejected": -0.3878275752067566, "logps/chosen": -180.4509735107422, "logps/rejected": -250.7042236328125, "loss": 1.3514, "nll_loss": 1.009977102279663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.467653751373291, "rewards/margins": 3.171955108642578, "rewards/rejected": 2.295698642730713, "step": 4810 }, { "epoch": 0.2674026712528259, "grad_norm": 58.05904006958008, "learning_rate": 8.337009115842545e-08, "logits/chosen": -0.1778537929058075, "logits/rejected": -0.3089439272880554, "logps/chosen": -165.0500946044922, "logps/rejected": -208.13623046875, "loss": 1.3021, "nll_loss": 0.9191769361495972, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.851661682128906, "rewards/margins": 2.92620587348938, "rewards/rejected": 1.9254562854766846, "step": 4820 }, { "epoch": 0.26795744857907655, "grad_norm": 40.6214714050293, "learning_rate": 8.330514349905293e-08, "logits/chosen": -0.29483598470687866, "logits/rejected": -0.43914732336997986, "logps/chosen": -169.65567016601562, "logps/rejected": -205.42495727539062, "loss": 1.3685, "nll_loss": 0.9740726351737976, "rewards/accuracies": 0.875, "rewards/chosen": 5.574697971343994, "rewards/margins": 3.5862374305725098, "rewards/rejected": 1.988459587097168, "step": 4830 }, { "epoch": 0.26851222590532725, "grad_norm": 68.7054443359375, "learning_rate": 8.324009466766581e-08, "logits/chosen": -0.30889657139778137, "logits/rejected": -0.3688036799430847, "logps/chosen": -150.3580780029297, "logps/rejected": -210.03738403320312, "loss": 1.3273, "nll_loss": 0.9760260581970215, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.090402603149414, "rewards/margins": 2.458418846130371, "rewards/rejected": 2.631983757019043, "step": 4840 }, { "epoch": 0.26906700323157795, "grad_norm": 60.75359344482422, "learning_rate": 8.317494486186489e-08, "logits/chosen": -0.32037153840065, "logits/rejected": -0.43114009499549866, "logps/chosen": -177.12033081054688, "logps/rejected": -215.9650421142578, "loss": 1.4469, "nll_loss": 1.0228039026260376, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.64739990234375, "rewards/margins": 2.7197928428649902, "rewards/rejected": 2.9276070594787598, "step": 4850 }, { "epoch": 0.2696217805578286, "grad_norm": 88.06070709228516, "learning_rate": 8.310969427955765e-08, "logits/chosen": -0.2338133305311203, "logits/rejected": -0.3076700270175934, "logps/chosen": -172.38645935058594, "logps/rejected": -207.7739715576172, "loss": 1.4337, "nll_loss": 0.9386089444160461, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.505553722381592, "rewards/margins": 2.553183078765869, "rewards/rejected": 2.9523708820343018, "step": 4860 }, { "epoch": 0.2701765578840793, "grad_norm": 58.89626693725586, "learning_rate": 8.304434311895768e-08, "logits/chosen": -0.05045692250132561, "logits/rejected": -0.20222020149230957, "logps/chosen": -127.2392578125, "logps/rejected": -165.931640625, "loss": 1.3058, "nll_loss": 0.7586954832077026, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.104668617248535, "rewards/margins": 2.188541889190674, "rewards/rejected": 1.9161268472671509, "step": 4870 }, { "epoch": 0.27073133521032994, "grad_norm": 72.38153076171875, "learning_rate": 8.297889157858413e-08, "logits/chosen": -0.3325496315956116, "logits/rejected": -0.424597829580307, "logps/chosen": -190.23590087890625, "logps/rejected": -260.87847900390625, "loss": 1.3395, "nll_loss": 1.0346920490264893, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.8594069480896, "rewards/margins": 4.145157814025879, "rewards/rejected": 1.7142490148544312, "step": 4880 }, { "epoch": 0.27128611253658064, "grad_norm": 128.99923706054688, "learning_rate": 8.291333985726106e-08, "logits/chosen": -0.03347639739513397, "logits/rejected": -0.145706906914711, "logps/chosen": -136.19009399414062, "logps/rejected": -180.11630249023438, "loss": 1.301, "nll_loss": 0.8283224105834961, "rewards/accuracies": 0.75, "rewards/chosen": 4.277609825134277, "rewards/margins": 2.1975202560424805, "rewards/rejected": 2.080089569091797, "step": 4890 }, { "epoch": 0.2718408898628313, "grad_norm": 69.27930450439453, "learning_rate": 8.284768815411691e-08, "logits/chosen": -0.16743507981300354, "logits/rejected": -0.26962658762931824, "logps/chosen": -178.30587768554688, "logps/rejected": -240.0220947265625, "loss": 1.3253, "nll_loss": 0.9475281834602356, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.275069236755371, "rewards/margins": 2.969179391860962, "rewards/rejected": 2.30588960647583, "step": 4900 }, { "epoch": 0.272395667189082, "grad_norm": 62.62528610229492, "learning_rate": 8.278193666858374e-08, "logits/chosen": -0.20674102008342743, "logits/rejected": -0.3136216998100281, "logps/chosen": -142.73001098632812, "logps/rejected": -173.5380096435547, "loss": 1.3661, "nll_loss": 0.9335249066352844, "rewards/accuracies": 0.875, "rewards/chosen": 4.942017555236816, "rewards/margins": 2.9952781200408936, "rewards/rejected": 1.9467391967773438, "step": 4910 }, { "epoch": 0.2729504445153327, "grad_norm": 47.427555084228516, "learning_rate": 8.271608560039681e-08, "logits/chosen": -0.28811416029930115, "logits/rejected": -0.3628733158111572, "logps/chosen": -187.3347930908203, "logps/rejected": -237.3675994873047, "loss": 1.3381, "nll_loss": 1.1174544095993042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.298020362854004, "rewards/margins": 2.991478443145752, "rewards/rejected": 2.3065414428710938, "step": 4920 }, { "epoch": 0.2735052218415833, "grad_norm": 172.0816192626953, "learning_rate": 8.26501351495938e-08, "logits/chosen": -0.23228800296783447, "logits/rejected": -0.30110445618629456, "logps/chosen": -188.532470703125, "logps/rejected": -249.3604736328125, "loss": 1.3669, "nll_loss": 1.0762465000152588, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.646094799041748, "rewards/margins": 3.560511350631714, "rewards/rejected": 2.085583448410034, "step": 4930 }, { "epoch": 0.274059999167834, "grad_norm": 54.714622497558594, "learning_rate": 8.258408551651438e-08, "logits/chosen": -0.3558768928050995, "logits/rejected": -0.42641109228134155, "logps/chosen": -194.4613494873047, "logps/rejected": -249.61874389648438, "loss": 1.3133, "nll_loss": 1.0652390718460083, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.462862491607666, "rewards/margins": 3.3691318035125732, "rewards/rejected": 2.093731164932251, "step": 4940 }, { "epoch": 0.27461477649408467, "grad_norm": 70.50579833984375, "learning_rate": 8.251793690179945e-08, "logits/chosen": -0.171391561627388, "logits/rejected": -0.29056456685066223, "logps/chosen": -167.0868682861328, "logps/rejected": -223.43124389648438, "loss": 1.3015, "nll_loss": 0.9458906054496765, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.318848133087158, "rewards/margins": 3.345581531524658, "rewards/rejected": 1.973266363143921, "step": 4950 }, { "epoch": 0.27516955382033537, "grad_norm": 57.20966339111328, "learning_rate": 8.245168950639061e-08, "logits/chosen": 0.025897592306137085, "logits/rejected": -0.12947197258472443, "logps/chosen": -122.27632904052734, "logps/rejected": -155.8192596435547, "loss": 1.3076, "nll_loss": 0.7416020035743713, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.2231645584106445, "rewards/margins": 2.9529929161071777, "rewards/rejected": 1.270171880722046, "step": 4960 }, { "epoch": 0.27572433114658607, "grad_norm": 38.52761459350586, "learning_rate": 8.23853435315295e-08, "logits/chosen": -0.2517802119255066, "logits/rejected": -0.3721666634082794, "logps/chosen": -145.8294219970703, "logps/rejected": -208.90017700195312, "loss": 1.1446, "nll_loss": 0.9143912196159363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.211873531341553, "rewards/margins": 3.9393725395202637, "rewards/rejected": 1.2725012302398682, "step": 4970 }, { "epoch": 0.2762791084728367, "grad_norm": 43.90656661987305, "learning_rate": 8.231889917875728e-08, "logits/chosen": -0.36203673481941223, "logits/rejected": -0.500227153301239, "logps/chosen": -173.27207946777344, "logps/rejected": -255.75564575195312, "loss": 1.2673, "nll_loss": 1.028189778327942, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.356875419616699, "rewards/margins": 4.0634074211120605, "rewards/rejected": 1.2934677600860596, "step": 4980 }, { "epoch": 0.2768338857990874, "grad_norm": 71.87471008300781, "learning_rate": 8.225235664991386e-08, "logits/chosen": -0.2676452100276947, "logits/rejected": -0.3847208321094513, "logps/chosen": -193.01084899902344, "logps/rejected": -229.0294189453125, "loss": 1.301, "nll_loss": 1.0524907112121582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.651148796081543, "rewards/margins": 3.842705488204956, "rewards/rejected": 1.808443307876587, "step": 4990 }, { "epoch": 0.27738866312533805, "grad_norm": 37.92792892456055, "learning_rate": 8.218571614713749e-08, "logits/chosen": -0.24387606978416443, "logits/rejected": -0.41826462745666504, "logps/chosen": -145.3743133544922, "logps/rejected": -180.5663604736328, "loss": 1.2525, "nll_loss": 0.9066478610038757, "rewards/accuracies": 0.875, "rewards/chosen": 4.747946262359619, "rewards/margins": 3.3187623023986816, "rewards/rejected": 1.4291837215423584, "step": 5000 }, { "epoch": 0.27738866312533805, "eval_logits/chosen": -0.3629521131515503, "eval_logits/rejected": -0.4396223723888397, "eval_logps/chosen": -198.44891357421875, "eval_logps/rejected": -261.6014404296875, "eval_loss": 1.2831330299377441, "eval_nll_loss": 1.0325713157653809, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 5.951231479644775, "eval_rewards/margins": 4.203526020050049, "eval_rewards/rejected": 1.7477052211761475, "eval_runtime": 17.0807, "eval_samples_per_second": 14.988, "eval_steps_per_second": 1.873, "step": 5000 }, { "epoch": 0.27794344045158875, "grad_norm": 56.25941467285156, "learning_rate": 8.211897787286396e-08, "logits/chosen": -0.1583927869796753, "logits/rejected": -0.3076084554195404, "logps/chosen": -151.69210815429688, "logps/rejected": -194.51449584960938, "loss": 1.2783, "nll_loss": 0.8849924206733704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.681919574737549, "rewards/margins": 2.732123851776123, "rewards/rejected": 1.9497959613800049, "step": 5010 }, { "epoch": 0.2784982177778394, "grad_norm": 55.836021423339844, "learning_rate": 8.205214202982609e-08, "logits/chosen": -0.25684595108032227, "logits/rejected": -0.35995739698410034, "logps/chosen": -166.514404296875, "logps/rejected": -221.72021484375, "loss": 1.3165, "nll_loss": 0.9287391901016235, "rewards/accuracies": 0.875, "rewards/chosen": 5.375771522521973, "rewards/margins": 3.230828046798706, "rewards/rejected": 2.1449429988861084, "step": 5020 }, { "epoch": 0.2790529951040901, "grad_norm": 79.99015045166016, "learning_rate": 8.198520882105311e-08, "logits/chosen": -0.38538604974746704, "logits/rejected": -0.4918065071105957, "logps/chosen": -172.71304321289062, "logps/rejected": -217.1675567626953, "loss": 1.2925, "nll_loss": 0.9629594087600708, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.571718692779541, "rewards/margins": 3.5471160411834717, "rewards/rejected": 2.024602174758911, "step": 5030 }, { "epoch": 0.2796077724303408, "grad_norm": 62.02948760986328, "learning_rate": 8.191817844986996e-08, "logits/chosen": -0.3658435046672821, "logits/rejected": -0.5107392072677612, "logps/chosen": -177.09124755859375, "logps/rejected": -239.96658325195312, "loss": 1.2561, "nll_loss": 0.9874979257583618, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.4753217697143555, "rewards/margins": 3.384706974029541, "rewards/rejected": 2.0906152725219727, "step": 5040 }, { "epoch": 0.28016254975659144, "grad_norm": 58.71518325805664, "learning_rate": 8.185105111989682e-08, "logits/chosen": -0.20356634259223938, "logits/rejected": -0.2974574863910675, "logps/chosen": -140.64859008789062, "logps/rejected": -194.08328247070312, "loss": 1.3252, "nll_loss": 0.8530987501144409, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.67659854888916, "rewards/margins": 3.028804302215576, "rewards/rejected": 1.6477943658828735, "step": 5050 }, { "epoch": 0.28071732708284214, "grad_norm": 84.55443572998047, "learning_rate": 8.178382703504831e-08, "logits/chosen": -0.5204389095306396, "logits/rejected": -0.5782557725906372, "logps/chosen": -219.82516479492188, "logps/rejected": -287.87445068359375, "loss": 1.4168, "nll_loss": 1.1804500818252563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.438179016113281, "rewards/margins": 3.6439692974090576, "rewards/rejected": 2.794210195541382, "step": 5060 }, { "epoch": 0.2812721044090928, "grad_norm": 55.49232864379883, "learning_rate": 8.171650639953305e-08, "logits/chosen": -0.3548034131526947, "logits/rejected": -0.49223846197128296, "logps/chosen": -180.63319396972656, "logps/rejected": -264.86114501953125, "loss": 1.3345, "nll_loss": 0.9848377108573914, "rewards/accuracies": 0.875, "rewards/chosen": 5.830276966094971, "rewards/margins": 4.735908508300781, "rewards/rejected": 1.0943679809570312, "step": 5070 }, { "epoch": 0.2818268817353435, "grad_norm": 59.67625427246094, "learning_rate": 8.164908941785286e-08, "logits/chosen": -0.19392921030521393, "logits/rejected": -0.337412029504776, "logps/chosen": -152.64126586914062, "logps/rejected": -227.7115936279297, "loss": 1.2606, "nll_loss": 0.9056864976882935, "rewards/accuracies": 0.875, "rewards/chosen": 5.199661731719971, "rewards/margins": 3.7226932048797607, "rewards/rejected": 1.476968765258789, "step": 5080 }, { "epoch": 0.2823816590615941, "grad_norm": 48.38815689086914, "learning_rate": 8.158157629480236e-08, "logits/chosen": -0.2246595323085785, "logits/rejected": -0.43237733840942383, "logps/chosen": -147.0095672607422, "logps/rejected": -226.11062622070312, "loss": 1.2936, "nll_loss": 0.8035075068473816, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.727481842041016, "rewards/margins": 3.7552146911621094, "rewards/rejected": 0.9722667932510376, "step": 5090 }, { "epoch": 0.2829364363878448, "grad_norm": 79.66407012939453, "learning_rate": 8.151396723546809e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -133.1691131591797, "logps/rejected": -176.14329528808594, "loss": 1.247, "nll_loss": NaN, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.611157417297363, "rewards/margins": 2.2311840057373047, "rewards/rejected": 2.3799734115600586, "step": 5100 }, { "epoch": 0.2834912137140955, "grad_norm": 38.1684455871582, "learning_rate": 8.144626244522812e-08, "logits/chosen": -0.2781577706336975, "logits/rejected": -0.35782188177108765, "logps/chosen": -173.4530792236328, "logps/rejected": -204.86978149414062, "loss": 1.4075, "nll_loss": 0.9582212567329407, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.3481879234313965, "rewards/margins": 2.9867639541625977, "rewards/rejected": 2.361424446105957, "step": 5110 }, { "epoch": 0.28404599104034617, "grad_norm": 38.051395416259766, "learning_rate": 8.137846212975126e-08, "logits/chosen": -0.39350074529647827, "logits/rejected": -0.45697134733200073, "logps/chosen": -186.7138214111328, "logps/rejected": -244.53775024414062, "loss": 1.2772, "nll_loss": 1.0653064250946045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.167884349822998, "rewards/margins": 3.2606379985809326, "rewards/rejected": 2.9072463512420654, "step": 5120 }, { "epoch": 0.28460076836659687, "grad_norm": 73.94496154785156, "learning_rate": 8.131056649499653e-08, "logits/chosen": -0.25749891996383667, "logits/rejected": -0.38721853494644165, "logps/chosen": -185.4833221435547, "logps/rejected": -205.6341094970703, "loss": 1.2753, "nll_loss": 0.8648307919502258, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.020583152770996, "rewards/margins": 2.860358238220215, "rewards/rejected": 2.1602249145507812, "step": 5130 }, { "epoch": 0.2851555456928475, "grad_norm": 103.15850830078125, "learning_rate": 8.12425757472125e-08, "logits/chosen": -0.34344482421875, "logits/rejected": -0.43374189734458923, "logps/chosen": -163.52737426757812, "logps/rejected": -212.4480743408203, "loss": 1.2976, "nll_loss": 1.0665273666381836, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.501097679138184, "rewards/margins": 3.606482744216919, "rewards/rejected": 1.8946149349212646, "step": 5140 }, { "epoch": 0.2857103230190982, "grad_norm": 64.51885223388672, "learning_rate": 8.117449009293668e-08, "logits/chosen": -0.31838101148605347, "logits/rejected": -0.4258570671081543, "logps/chosen": -150.1742706298828, "logps/rejected": -219.8562469482422, "loss": 1.3037, "nll_loss": 0.9224601984024048, "rewards/accuracies": 0.75, "rewards/chosen": 5.4263763427734375, "rewards/margins": 2.586714267730713, "rewards/rejected": 2.8396620750427246, "step": 5150 }, { "epoch": 0.2862651003453489, "grad_norm": 85.3049545288086, "learning_rate": 8.110630973899484e-08, "logits/chosen": -0.39840513467788696, "logits/rejected": -0.5297509431838989, "logps/chosen": -180.4063720703125, "logps/rejected": -234.35665893554688, "loss": 1.2411, "nll_loss": 1.0500776767730713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.590461254119873, "rewards/margins": 3.995450258255005, "rewards/rejected": 1.5950109958648682, "step": 5160 }, { "epoch": 0.28681987767159955, "grad_norm": 65.11550903320312, "learning_rate": 8.103803489250045e-08, "logits/chosen": -0.23196351528167725, "logits/rejected": -0.39111918210983276, "logps/chosen": -154.30833435058594, "logps/rejected": -203.83786010742188, "loss": 1.3235, "nll_loss": 0.9584856033325195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.514666557312012, "rewards/margins": 3.025212287902832, "rewards/rejected": 1.489454746246338, "step": 5170 }, { "epoch": 0.28737465499785025, "grad_norm": 67.98075866699219, "learning_rate": 8.096966576085405e-08, "logits/chosen": -0.47045016288757324, "logits/rejected": -0.5754284262657166, "logps/chosen": -190.63046264648438, "logps/rejected": -263.2176513671875, "loss": 1.3354, "nll_loss": 1.1000330448150635, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.919167518615723, "rewards/margins": 3.543025493621826, "rewards/rejected": 2.3761425018310547, "step": 5180 }, { "epoch": 0.2879294323241009, "grad_norm": 83.0150375366211, "learning_rate": 8.090120255174253e-08, "logits/chosen": -0.24219787120819092, "logits/rejected": -0.3379160463809967, "logps/chosen": -164.17941284179688, "logps/rejected": -207.69076538085938, "loss": 1.3324, "nll_loss": 0.9007360339164734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.087338447570801, "rewards/margins": 3.1830947399139404, "rewards/rejected": 1.9042432308197021, "step": 5190 }, { "epoch": 0.2884842096503516, "grad_norm": 51.171478271484375, "learning_rate": 8.083264547313862e-08, "logits/chosen": -0.21256570518016815, "logits/rejected": -0.3457311689853668, "logps/chosen": -143.1696319580078, "logps/rejected": -185.2307586669922, "loss": 1.2872, "nll_loss": 0.8276432156562805, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.949501991271973, "rewards/margins": 3.206437587738037, "rewards/rejected": 1.7430641651153564, "step": 5200 }, { "epoch": 0.28903898697660224, "grad_norm": 55.23284912109375, "learning_rate": 8.076399473330014e-08, "logits/chosen": -0.4154040813446045, "logits/rejected": -0.5505325198173523, "logps/chosen": -184.9228057861328, "logps/rejected": -258.3560485839844, "loss": 1.431, "nll_loss": 1.021315097808838, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.933206558227539, "rewards/margins": 4.160671234130859, "rewards/rejected": 1.772534966468811, "step": 5210 }, { "epoch": 0.28959376430285294, "grad_norm": 81.54315185546875, "learning_rate": 8.06952505407695e-08, "logits/chosen": -0.31606870889663696, "logits/rejected": -0.43339866399765015, "logps/chosen": -158.47915649414062, "logps/rejected": -229.88601684570312, "loss": 1.3092, "nll_loss": 0.9658956527709961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.462421417236328, "rewards/margins": 3.690382480621338, "rewards/rejected": 1.7720390558242798, "step": 5220 }, { "epoch": 0.29014854162910364, "grad_norm": 62.62627029418945, "learning_rate": 8.062641310437293e-08, "logits/chosen": -0.2253562957048416, "logits/rejected": -0.42890849709510803, "logps/chosen": -169.52810668945312, "logps/rejected": -232.82861328125, "loss": 1.3288, "nll_loss": 1.0389689207077026, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.488898754119873, "rewards/margins": 3.7329201698303223, "rewards/rejected": 1.7559791803359985, "step": 5230 }, { "epoch": 0.2907033189553543, "grad_norm": 77.60379791259766, "learning_rate": 8.055748263321998e-08, "logits/chosen": -0.3747365474700928, "logits/rejected": -0.4889054298400879, "logps/chosen": -154.60496520996094, "logps/rejected": -223.3097686767578, "loss": 1.3717, "nll_loss": 0.9673991203308105, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.296849250793457, "rewards/margins": 2.987121105194092, "rewards/rejected": 2.309727668762207, "step": 5240 }, { "epoch": 0.291258096281605, "grad_norm": 45.01763916015625, "learning_rate": 8.048845933670271e-08, "logits/chosen": -0.4019540250301361, "logits/rejected": -0.5339347720146179, "logps/chosen": -198.4036102294922, "logps/rejected": -255.08529663085938, "loss": 1.3284, "nll_loss": 1.1002283096313477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.304580211639404, "rewards/margins": 2.89150071144104, "rewards/rejected": 2.4130795001983643, "step": 5250 }, { "epoch": 0.2918128736078556, "grad_norm": 58.75382995605469, "learning_rate": 8.041934342449526e-08, "logits/chosen": -0.3500133156776428, "logits/rejected": -0.5009504556655884, "logps/chosen": -188.2596893310547, "logps/rejected": -240.044921875, "loss": 1.3187, "nll_loss": 1.071189284324646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.830065727233887, "rewards/margins": 2.8829729557037354, "rewards/rejected": 2.9470925331115723, "step": 5260 }, { "epoch": 0.2923676509341063, "grad_norm": 40.570159912109375, "learning_rate": 8.035013510655307e-08, "logits/chosen": -0.33516281843185425, "logits/rejected": -0.47703152894973755, "logps/chosen": -156.31471252441406, "logps/rejected": -210.299072265625, "loss": 1.263, "nll_loss": 0.8956218957901001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.275986671447754, "rewards/margins": 2.6534581184387207, "rewards/rejected": 2.622528553009033, "step": 5270 }, { "epoch": 0.292922428260357, "grad_norm": 72.06739044189453, "learning_rate": 8.028083459311225e-08, "logits/chosen": -0.431640088558197, "logits/rejected": -0.5278843641281128, "logps/chosen": -206.6097412109375, "logps/rejected": -259.64697265625, "loss": 1.3747, "nll_loss": 1.1126128435134888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.9721574783325195, "rewards/margins": 2.6364171504974365, "rewards/rejected": 3.335740566253662, "step": 5280 }, { "epoch": 0.29347720558660767, "grad_norm": 43.916831970214844, "learning_rate": 8.021144209468904e-08, "logits/chosen": -0.37015438079833984, "logits/rejected": -0.44294947385787964, "logps/chosen": -180.5640106201172, "logps/rejected": -225.8361053466797, "loss": 1.3638, "nll_loss": 1.0486891269683838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.0682053565979, "rewards/margins": 2.2644426822662354, "rewards/rejected": 2.803762912750244, "step": 5290 }, { "epoch": 0.29403198291285837, "grad_norm": 64.19252014160156, "learning_rate": 8.014195782207909e-08, "logits/chosen": -0.20861633121967316, "logits/rejected": -0.38095623254776, "logps/chosen": -168.60348510742188, "logps/rejected": -208.478271484375, "loss": 1.3344, "nll_loss": 0.8773584365844727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.338396072387695, "rewards/margins": 3.4838390350341797, "rewards/rejected": 1.8545570373535156, "step": 5300 }, { "epoch": 0.294586760239109, "grad_norm": 47.61259841918945, "learning_rate": 8.007238198635677e-08, "logits/chosen": -0.39370396733283997, "logits/rejected": -0.4605169892311096, "logps/chosen": -189.17117309570312, "logps/rejected": -248.403564453125, "loss": 1.3185, "nll_loss": 1.0556788444519043, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.049463748931885, "rewards/margins": 3.6159896850585938, "rewards/rejected": 2.43347430229187, "step": 5310 }, { "epoch": 0.2951415375653597, "grad_norm": 47.16741943359375, "learning_rate": 8.000271479887468e-08, "logits/chosen": -0.25164592266082764, "logits/rejected": -0.37962883710861206, "logps/chosen": -146.64503479003906, "logps/rejected": -178.7125701904297, "loss": 1.3137, "nll_loss": 0.9622234106063843, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.87002420425415, "rewards/margins": 2.9429609775543213, "rewards/rejected": 1.927063226699829, "step": 5320 }, { "epoch": 0.29569631489161036, "grad_norm": 50.34662628173828, "learning_rate": 7.993295647126288e-08, "logits/chosen": -0.12188470363616943, "logits/rejected": -0.28352001309394836, "logps/chosen": -144.2179412841797, "logps/rejected": -193.8548583984375, "loss": 1.3383, "nll_loss": 0.823199450969696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.984102249145508, "rewards/margins": 3.198787212371826, "rewards/rejected": 1.7853147983551025, "step": 5330 }, { "epoch": 0.29625109221786106, "grad_norm": 46.27009582519531, "learning_rate": 7.986310721542828e-08, "logits/chosen": -0.23408794403076172, "logits/rejected": -0.35068511962890625, "logps/chosen": -190.06494140625, "logps/rejected": -255.37680053710938, "loss": 1.3908, "nll_loss": 0.9979566335678101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.786375999450684, "rewards/margins": 3.3670754432678223, "rewards/rejected": 2.4193005561828613, "step": 5340 }, { "epoch": 0.29680586954411176, "grad_norm": 56.96269607543945, "learning_rate": 7.979316724355406e-08, "logits/chosen": -0.292167603969574, "logits/rejected": -0.4041009843349457, "logps/chosen": -128.87632751464844, "logps/rejected": -187.7605438232422, "loss": 1.3735, "nll_loss": 0.8670805096626282, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.034547328948975, "rewards/margins": 3.0269715785980225, "rewards/rejected": 2.0075759887695312, "step": 5350 }, { "epoch": 0.2973606468703624, "grad_norm": 54.19409942626953, "learning_rate": 7.972313676809887e-08, "logits/chosen": -0.31982582807540894, "logits/rejected": -0.40687140822410583, "logps/chosen": -194.898193359375, "logps/rejected": -229.9585723876953, "loss": 1.4132, "nll_loss": 1.0126712322235107, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.524321556091309, "rewards/margins": 3.129094123840332, "rewards/rejected": 2.3952271938323975, "step": 5360 }, { "epoch": 0.2979154241966131, "grad_norm": 41.81538391113281, "learning_rate": 7.96530160017964e-08, "logits/chosen": -0.23952436447143555, "logits/rejected": -0.395558625459671, "logps/chosen": -172.92860412597656, "logps/rejected": -210.91049194335938, "loss": 1.2618, "nll_loss": 0.9645511507987976, "rewards/accuracies": 0.875, "rewards/chosen": 5.2552056312561035, "rewards/margins": 2.6192994117736816, "rewards/rejected": 2.635906457901001, "step": 5370 }, { "epoch": 0.29847020152286374, "grad_norm": 43.95526885986328, "learning_rate": 7.958280515765454e-08, "logits/chosen": -0.34653595089912415, "logits/rejected": -0.4484923481941223, "logps/chosen": -188.808349609375, "logps/rejected": -257.86285400390625, "loss": 1.3475, "nll_loss": 1.0703051090240479, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.61892032623291, "rewards/margins": 3.484872817993164, "rewards/rejected": 2.1340479850769043, "step": 5380 }, { "epoch": 0.29902497884911444, "grad_norm": 64.28650665283203, "learning_rate": 7.951250444895484e-08, "logits/chosen": -0.14369474351406097, "logits/rejected": -0.2900177538394928, "logps/chosen": -147.00808715820312, "logps/rejected": -188.2275390625, "loss": 1.341, "nll_loss": 0.9086960554122925, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.827338218688965, "rewards/margins": 2.7463221549987793, "rewards/rejected": 2.0810163021087646, "step": 5390 }, { "epoch": 0.2995797561753651, "grad_norm": 49.227413177490234, "learning_rate": 7.944211408925183e-08, "logits/chosen": -0.43552136421203613, "logits/rejected": -0.5203756093978882, "logps/chosen": -205.6902618408203, "logps/rejected": -263.37103271484375, "loss": 1.2393, "nll_loss": 1.137157678604126, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.868722438812256, "rewards/margins": 3.6083991527557373, "rewards/rejected": 2.2603237628936768, "step": 5400 }, { "epoch": 0.3001345335016158, "grad_norm": 39.7113151550293, "learning_rate": 7.937163429237237e-08, "logits/chosen": -0.2219894379377365, "logits/rejected": -0.3694431781768799, "logps/chosen": -159.9281768798828, "logps/rejected": -204.74143981933594, "loss": 1.3106, "nll_loss": 0.9053483009338379, "rewards/accuracies": 0.75, "rewards/chosen": 4.833624839782715, "rewards/margins": 3.136951208114624, "rewards/rejected": 1.6966737508773804, "step": 5410 }, { "epoch": 0.3006893108278665, "grad_norm": 117.45632934570312, "learning_rate": 7.930106527241505e-08, "logits/chosen": -0.23928344249725342, "logits/rejected": -0.3349587023258209, "logps/chosen": -139.458984375, "logps/rejected": -189.7228546142578, "loss": 1.316, "nll_loss": 0.8406341671943665, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.879055500030518, "rewards/margins": 2.8671770095825195, "rewards/rejected": 2.011878728866577, "step": 5420 }, { "epoch": 0.30124408815411713, "grad_norm": 29.80186653137207, "learning_rate": 7.923040724374941e-08, "logits/chosen": -0.30750900506973267, "logits/rejected": -0.4656451344490051, "logps/chosen": -169.89950561523438, "logps/rejected": -207.82431030273438, "loss": 1.2527, "nll_loss": 0.9901590347290039, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.296086311340332, "rewards/margins": 3.4520249366760254, "rewards/rejected": 1.8440611362457275, "step": 5430 }, { "epoch": 0.30179886548036783, "grad_norm": 61.57847213745117, "learning_rate": 7.915966042101546e-08, "logits/chosen": -0.43216371536254883, "logits/rejected": -0.5349610447883606, "logps/chosen": -205.59500122070312, "logps/rejected": -233.59249877929688, "loss": 1.3843, "nll_loss": 1.1686880588531494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.404385566711426, "rewards/margins": 2.747410297393799, "rewards/rejected": 2.656975507736206, "step": 5440 }, { "epoch": 0.3023536428066185, "grad_norm": 55.9067268371582, "learning_rate": 7.908882501912288e-08, "logits/chosen": -0.35131892561912537, "logits/rejected": -0.5348880290985107, "logps/chosen": -167.07186889648438, "logps/rejected": -208.3030242919922, "loss": 1.3274, "nll_loss": 1.009548306465149, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.066190719604492, "rewards/margins": 2.7632575035095215, "rewards/rejected": 2.3029332160949707, "step": 5450 }, { "epoch": 0.30290842013286917, "grad_norm": 48.4643669128418, "learning_rate": 7.901790125325047e-08, "logits/chosen": -0.2430580109357834, "logits/rejected": -0.38270917534828186, "logps/chosen": -196.25991821289062, "logps/rejected": -240.02505493164062, "loss": 1.341, "nll_loss": 1.0512897968292236, "rewards/accuracies": 0.75, "rewards/chosen": 5.403705596923828, "rewards/margins": 3.0696892738342285, "rewards/rejected": 2.3340163230895996, "step": 5460 }, { "epoch": 0.30346319745911987, "grad_norm": 26.129262924194336, "learning_rate": 7.894688933884545e-08, "logits/chosen": -0.43403467535972595, "logits/rejected": -0.5458344221115112, "logps/chosen": -206.26931762695312, "logps/rejected": -286.28814697265625, "loss": 1.2033, "nll_loss": 1.093693494796753, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.592001438140869, "rewards/margins": 3.990468978881836, "rewards/rejected": 2.601532459259033, "step": 5470 }, { "epoch": 0.3040179747853705, "grad_norm": 55.46004104614258, "learning_rate": 7.887578949162278e-08, "logits/chosen": -0.3617546856403351, "logits/rejected": -0.43136462569236755, "logps/chosen": -183.30076599121094, "logps/rejected": -205.0895233154297, "loss": 1.4718, "nll_loss": 1.0770082473754883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.315445899963379, "rewards/margins": 2.013780117034912, "rewards/rejected": 3.301665782928467, "step": 5480 }, { "epoch": 0.3045727521116212, "grad_norm": 63.63032913208008, "learning_rate": 7.880460192756457e-08, "logits/chosen": -0.45876961946487427, "logits/rejected": -0.5710569620132446, "logps/chosen": -186.36093139648438, "logps/rejected": -239.4054718017578, "loss": 1.3289, "nll_loss": 1.0660300254821777, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.134969234466553, "rewards/margins": 3.396899461746216, "rewards/rejected": 2.738069772720337, "step": 5490 }, { "epoch": 0.30512752943787186, "grad_norm": 83.45735931396484, "learning_rate": 7.873332686291938e-08, "logits/chosen": -0.1018824353814125, "logits/rejected": -0.3132437765598297, "logps/chosen": -124.9186019897461, "logps/rejected": -162.782470703125, "loss": 1.3713, "nll_loss": 0.772121787071228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.188645839691162, "rewards/margins": 2.7185497283935547, "rewards/rejected": 1.4700956344604492, "step": 5500 }, { "epoch": 0.30512752943787186, "eval_logits/chosen": -0.40301600098609924, "eval_logits/rejected": -0.4954206645488739, "eval_logps/chosen": -197.5357208251953, "eval_logps/rejected": -259.53399658203125, "eval_loss": 1.2622016668319702, "eval_nll_loss": 1.0264045000076294, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 6.042550086975098, "eval_rewards/margins": 4.08809757232666, "eval_rewards/rejected": 1.9544516801834106, "eval_runtime": 16.9134, "eval_samples_per_second": 15.136, "eval_steps_per_second": 1.892, "step": 5500 }, { "epoch": 0.30568230676412256, "grad_norm": 72.64631652832031, "learning_rate": 7.866196451420155e-08, "logits/chosen": -0.1749780923128128, "logits/rejected": -0.3413892686367035, "logps/chosen": -123.80096435546875, "logps/rejected": -170.3175048828125, "loss": 1.2754, "nll_loss": 0.8201160430908203, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.313254356384277, "rewards/margins": 2.8019752502441406, "rewards/rejected": 1.511278510093689, "step": 5510 }, { "epoch": 0.3062370840903732, "grad_norm": 69.69657897949219, "learning_rate": 7.859051509819062e-08, "logits/chosen": -0.3276621103286743, "logits/rejected": -0.44854670763015747, "logps/chosen": -178.0767822265625, "logps/rejected": -224.83853149414062, "loss": 1.2687, "nll_loss": 0.9359772801399231, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.529772758483887, "rewards/margins": 3.243734359741211, "rewards/rejected": 2.2860379219055176, "step": 5520 }, { "epoch": 0.3067918614166239, "grad_norm": 35.46376419067383, "learning_rate": 7.851897883193056e-08, "logits/chosen": -0.20039483904838562, "logits/rejected": -0.34632977843284607, "logps/chosen": -147.12570190429688, "logps/rejected": -192.63580322265625, "loss": 1.1467, "nll_loss": 0.8703436851501465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.039349555969238, "rewards/margins": 3.207817792892456, "rewards/rejected": 1.831531286239624, "step": 5530 }, { "epoch": 0.3073466387428746, "grad_norm": 105.12731170654297, "learning_rate": 7.84473559327292e-08, "logits/chosen": -0.22078721225261688, "logits/rejected": -0.41360822319984436, "logps/chosen": -154.78302001953125, "logps/rejected": -225.735107421875, "loss": 1.3309, "nll_loss": 0.8465398550033569, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.447340488433838, "rewards/margins": 3.7463760375976562, "rewards/rejected": 1.7009642124176025, "step": 5540 }, { "epoch": 0.30790141606912524, "grad_norm": 83.25653076171875, "learning_rate": 7.837564661815754e-08, "logits/chosen": -0.36427661776542664, "logits/rejected": -0.4735857844352722, "logps/chosen": -184.8861083984375, "logps/rejected": -257.52734375, "loss": 1.4227, "nll_loss": 1.0542347431182861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.679947376251221, "rewards/margins": 3.7396247386932373, "rewards/rejected": 1.9403222799301147, "step": 5550 }, { "epoch": 0.30845619339537594, "grad_norm": 97.9723892211914, "learning_rate": 7.830385110604904e-08, "logits/chosen": -0.3181317150592804, "logits/rejected": -0.4537307322025299, "logps/chosen": -123.86064147949219, "logps/rejected": -191.56643676757812, "loss": 1.4407, "nll_loss": 0.8734070658683777, "rewards/accuracies": 0.75, "rewards/chosen": 4.979821681976318, "rewards/margins": 2.7552473545074463, "rewards/rejected": 2.224574565887451, "step": 5560 }, { "epoch": 0.3090109707216266, "grad_norm": 95.3475570678711, "learning_rate": 7.82319696144991e-08, "logits/chosen": -0.24439339339733124, "logits/rejected": -0.37864407896995544, "logps/chosen": -133.63058471679688, "logps/rejected": -181.42242431640625, "loss": 1.3581, "nll_loss": 0.8765896558761597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.396186828613281, "rewards/margins": 2.396735429763794, "rewards/rejected": 1.9994512796401978, "step": 5570 }, { "epoch": 0.3095657480478773, "grad_norm": 103.33505249023438, "learning_rate": 7.816000236186418e-08, "logits/chosen": -0.28526854515075684, "logits/rejected": -0.44429856538772583, "logps/chosen": -170.72952270507812, "logps/rejected": -233.9705047607422, "loss": 1.2686, "nll_loss": 0.9443384408950806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.931466102600098, "rewards/margins": 3.274135112762451, "rewards/rejected": 1.6573302745819092, "step": 5580 }, { "epoch": 0.310120525374128, "grad_norm": 51.87260437011719, "learning_rate": 7.808794956676134e-08, "logits/chosen": -0.307265043258667, "logits/rejected": -0.49012789130210876, "logps/chosen": -169.12460327148438, "logps/rejected": -198.01025390625, "loss": 1.2568, "nll_loss": 0.9683774709701538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.167562961578369, "rewards/margins": 3.205660343170166, "rewards/rejected": 1.9619033336639404, "step": 5590 }, { "epoch": 0.31067530270037863, "grad_norm": 50.249149322509766, "learning_rate": 7.80158114480675e-08, "logits/chosen": -0.2516246438026428, "logits/rejected": -0.3341858685016632, "logps/chosen": -165.31979370117188, "logps/rejected": -207.95602416992188, "loss": 1.2628, "nll_loss": 1.0670020580291748, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.323300838470459, "rewards/margins": 2.661064386367798, "rewards/rejected": 2.6622369289398193, "step": 5600 }, { "epoch": 0.31123008002662933, "grad_norm": 54.5699348449707, "learning_rate": 7.794358822491871e-08, "logits/chosen": -0.11392636597156525, "logits/rejected": -0.2459922581911087, "logps/chosen": -128.5890350341797, "logps/rejected": -170.20652770996094, "loss": 1.213, "nll_loss": 0.8366823196411133, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.278365135192871, "rewards/margins": 3.0675265789031982, "rewards/rejected": 1.2108380794525146, "step": 5610 }, { "epoch": 0.31178485735288, "grad_norm": 45.721458435058594, "learning_rate": 7.787128011670963e-08, "logits/chosen": -0.19624245166778564, "logits/rejected": -0.34312087297439575, "logps/chosen": -169.8636932373047, "logps/rejected": -233.64169311523438, "loss": 1.1976, "nll_loss": 0.9160765409469604, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.577919006347656, "rewards/margins": 4.408540725708008, "rewards/rejected": 1.1693775653839111, "step": 5620 }, { "epoch": 0.3123396346791307, "grad_norm": 46.51015853881836, "learning_rate": 7.779888734309266e-08, "logits/chosen": -0.35512202978134155, "logits/rejected": -0.48626452684402466, "logps/chosen": -187.6857147216797, "logps/rejected": -256.5793151855469, "loss": 1.2573, "nll_loss": 1.1082860231399536, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.167753219604492, "rewards/margins": 4.530683517456055, "rewards/rejected": 1.6370693445205688, "step": 5630 }, { "epoch": 0.3128944120053813, "grad_norm": 50.111907958984375, "learning_rate": 7.772641012397753e-08, "logits/chosen": -0.3077355921268463, "logits/rejected": -0.44563180208206177, "logps/chosen": -190.90426635742188, "logps/rejected": -258.58197021484375, "loss": 1.294, "nll_loss": 1.0387169122695923, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.6006927490234375, "rewards/margins": 3.1984353065490723, "rewards/rejected": 2.402257204055786, "step": 5640 }, { "epoch": 0.313449189331632, "grad_norm": 60.936222076416016, "learning_rate": 7.765384867953037e-08, "logits/chosen": -0.30347099900245667, "logits/rejected": -0.45295968651771545, "logps/chosen": -183.19973754882812, "logps/rejected": -263.80865478515625, "loss": 1.2964, "nll_loss": 1.023938775062561, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.2091240882873535, "rewards/margins": 3.616779327392578, "rewards/rejected": 2.5923449993133545, "step": 5650 }, { "epoch": 0.3140039666578827, "grad_norm": 57.22261047363281, "learning_rate": 7.758120323017326e-08, "logits/chosen": -0.12532678246498108, "logits/rejected": -0.24024459719657898, "logps/chosen": -146.21481323242188, "logps/rejected": -204.45602416992188, "loss": 1.3377, "nll_loss": 0.853805661201477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.998294830322266, "rewards/margins": 2.6482481956481934, "rewards/rejected": 2.3500466346740723, "step": 5660 }, { "epoch": 0.31455874398413336, "grad_norm": 42.02796936035156, "learning_rate": 7.750847399658335e-08, "logits/chosen": -0.05001025274395943, "logits/rejected": -0.1884036660194397, "logps/chosen": -116.2176742553711, "logps/rejected": -169.38729858398438, "loss": 1.2722, "nll_loss": 0.7996511459350586, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.614314079284668, "rewards/margins": 3.169663906097412, "rewards/rejected": 1.4446502923965454, "step": 5670 }, { "epoch": 0.31511352131038406, "grad_norm": 65.07894897460938, "learning_rate": 7.743566119969244e-08, "logits/chosen": -0.21025721728801727, "logits/rejected": -0.35305628180503845, "logps/chosen": -165.0023193359375, "logps/rejected": -235.9938507080078, "loss": 1.2099, "nll_loss": 0.9207907915115356, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.444872856140137, "rewards/margins": 3.8752048015594482, "rewards/rejected": 1.5696674585342407, "step": 5680 }, { "epoch": 0.3156682986366347, "grad_norm": 53.95488739013672, "learning_rate": 7.73627650606861e-08, "logits/chosen": -0.3568010628223419, "logits/rejected": -0.44880276918411255, "logps/chosen": -210.515625, "logps/rejected": -255.9861297607422, "loss": 1.3298, "nll_loss": 1.187220811843872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.920581340789795, "rewards/margins": 2.686098098754883, "rewards/rejected": 3.234483003616333, "step": 5690 }, { "epoch": 0.3162230759628854, "grad_norm": 45.283164978027344, "learning_rate": 7.728978580100303e-08, "logits/chosen": -0.29234832525253296, "logits/rejected": -0.41959208250045776, "logps/chosen": -172.02313232421875, "logps/rejected": -223.96923828125, "loss": 1.2833, "nll_loss": 1.0394980907440186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.348699569702148, "rewards/margins": 3.0042288303375244, "rewards/rejected": 2.344470500946045, "step": 5700 }, { "epoch": 0.31677785328913605, "grad_norm": 77.68114471435547, "learning_rate": 7.721672364233453e-08, "logits/chosen": -0.31327614188194275, "logits/rejected": -0.4360221028327942, "logps/chosen": -181.8870391845703, "logps/rejected": -235.1174774169922, "loss": 1.2906, "nll_loss": 0.9546974301338196, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.674635887145996, "rewards/margins": 4.261141300201416, "rewards/rejected": 1.4134950637817383, "step": 5710 }, { "epoch": 0.31733263061538675, "grad_norm": 29.960615158081055, "learning_rate": 7.714357880662364e-08, "logits/chosen": 0.09260416030883789, "logits/rejected": -0.08293385803699493, "logps/chosen": -101.0294418334961, "logps/rejected": -150.16123962402344, "loss": 1.2613, "nll_loss": 0.6419572234153748, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.6931400299072266, "rewards/margins": 2.5476291179656982, "rewards/rejected": 1.1455105543136597, "step": 5720 }, { "epoch": 0.31788740794163745, "grad_norm": 55.809993743896484, "learning_rate": 7.707035151606455e-08, "logits/chosen": -0.2473038136959076, "logits/rejected": -0.39639773964881897, "logps/chosen": -135.81277465820312, "logps/rejected": -187.45169067382812, "loss": 1.2082, "nll_loss": 0.8615878820419312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.824166774749756, "rewards/margins": 3.423949718475342, "rewards/rejected": 1.4002166986465454, "step": 5730 }, { "epoch": 0.3184421852678881, "grad_norm": 70.8182373046875, "learning_rate": 7.699704199310203e-08, "logits/chosen": -0.3030581772327423, "logits/rejected": -0.42742496728897095, "logps/chosen": -160.92726135253906, "logps/rejected": -204.8309326171875, "loss": 1.3201, "nll_loss": 1.1673412322998047, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.198462009429932, "rewards/margins": 3.667888641357422, "rewards/rejected": 1.5305726528167725, "step": 5740 }, { "epoch": 0.3189969625941388, "grad_norm": 75.41050720214844, "learning_rate": 7.692365046043051e-08, "logits/chosen": -0.3121715784072876, "logits/rejected": -0.4054687023162842, "logps/chosen": -174.28924560546875, "logps/rejected": -240.83523559570312, "loss": 1.2978, "nll_loss": 1.0281198024749756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.9810380935668945, "rewards/margins": 4.004823207855225, "rewards/rejected": 1.9762141704559326, "step": 5750 }, { "epoch": 0.31955173992038943, "grad_norm": 56.148040771484375, "learning_rate": 7.685017714099365e-08, "logits/chosen": -0.1841479241847992, "logits/rejected": -0.38559359312057495, "logps/chosen": -153.3293914794922, "logps/rejected": -228.2798309326172, "loss": 1.2664, "nll_loss": 0.9011721611022949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.384884834289551, "rewards/margins": 5.131956100463867, "rewards/rejected": 0.2529294788837433, "step": 5760 }, { "epoch": 0.32010651724664013, "grad_norm": 46.79436492919922, "learning_rate": 7.677662225798349e-08, "logits/chosen": -0.2041047066450119, "logits/rejected": -0.3556447923183441, "logps/chosen": -161.56558227539062, "logps/rejected": -232.87765502929688, "loss": 1.3588, "nll_loss": 0.9337288737297058, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.395378589630127, "rewards/margins": 2.9714081287384033, "rewards/rejected": 2.4239706993103027, "step": 5770 }, { "epoch": 0.32066129457289083, "grad_norm": 63.75236892700195, "learning_rate": 7.670298603483987e-08, "logits/chosen": -0.26754432916641235, "logits/rejected": -0.31302887201309204, "logps/chosen": -171.03587341308594, "logps/rejected": -225.9440460205078, "loss": 1.2878, "nll_loss": 1.025702953338623, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.298377990722656, "rewards/margins": 3.062826633453369, "rewards/rejected": 2.235551595687866, "step": 5780 }, { "epoch": 0.3212160718991415, "grad_norm": 49.006954193115234, "learning_rate": 7.662926869524971e-08, "logits/chosen": -0.3983635902404785, "logits/rejected": -0.5497706532478333, "logps/chosen": -210.3919219970703, "logps/rejected": -255.5663604736328, "loss": 1.2036, "nll_loss": 1.1042499542236328, "rewards/accuracies": 0.875, "rewards/chosen": 5.7940168380737305, "rewards/margins": 3.4843997955322266, "rewards/rejected": 2.3096179962158203, "step": 5790 }, { "epoch": 0.3217708492253922, "grad_norm": 60.399696350097656, "learning_rate": 7.655547046314634e-08, "logits/chosen": -0.3618773818016052, "logits/rejected": -0.4993468225002289, "logps/chosen": -168.93533325195312, "logps/rejected": -230.4624786376953, "loss": 1.2193, "nll_loss": 0.9717389345169067, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.369915008544922, "rewards/margins": 4.094571590423584, "rewards/rejected": 1.2753427028656006, "step": 5800 }, { "epoch": 0.3223256265516428, "grad_norm": 42.58092498779297, "learning_rate": 7.648159156270884e-08, "logits/chosen": -0.23907014727592468, "logits/rejected": -0.4223829209804535, "logps/chosen": -161.8720703125, "logps/rejected": -214.8459014892578, "loss": 1.3719, "nll_loss": 0.9058443903923035, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.540342807769775, "rewards/margins": 3.800814390182495, "rewards/rejected": 1.739527702331543, "step": 5810 }, { "epoch": 0.3228804038778935, "grad_norm": 70.84566497802734, "learning_rate": 7.64076322183613e-08, "logits/chosen": -0.3701106607913971, "logits/rejected": -0.4296882152557373, "logps/chosen": -183.67086791992188, "logps/rejected": -228.63644409179688, "loss": 1.3083, "nll_loss": 1.0598195791244507, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.584314823150635, "rewards/margins": 3.2408530712127686, "rewards/rejected": 2.3434619903564453, "step": 5820 }, { "epoch": 0.32343518120414416, "grad_norm": 37.22153091430664, "learning_rate": 7.633359265477222e-08, "logits/chosen": -0.2679436206817627, "logits/rejected": -0.4335872530937195, "logps/chosen": -164.10592651367188, "logps/rejected": -201.74392700195312, "loss": 1.2779, "nll_loss": 0.9599549174308777, "rewards/accuracies": 0.875, "rewards/chosen": 5.297649383544922, "rewards/margins": 3.396411180496216, "rewards/rejected": 1.9012380838394165, "step": 5830 }, { "epoch": 0.32398995853039486, "grad_norm": 97.0199203491211, "learning_rate": 7.625947309685372e-08, "logits/chosen": -0.2867860794067383, "logits/rejected": -0.3901570439338684, "logps/chosen": -176.51451110839844, "logps/rejected": -227.54580688476562, "loss": 1.1868, "nll_loss": 0.9550184011459351, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.572012424468994, "rewards/margins": 3.2614574432373047, "rewards/rejected": 2.3105552196502686, "step": 5840 }, { "epoch": 0.32454473585664556, "grad_norm": 129.2616424560547, "learning_rate": 7.6185273769761e-08, "logits/chosen": -0.3186994194984436, "logits/rejected": -0.4096761643886566, "logps/chosen": -181.9349822998047, "logps/rejected": -229.8415985107422, "loss": 1.2713, "nll_loss": 1.0548815727233887, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.698060035705566, "rewards/margins": 4.21523380279541, "rewards/rejected": 1.4828267097473145, "step": 5850 }, { "epoch": 0.3250995131828962, "grad_norm": 57.745906829833984, "learning_rate": 7.611099489889152e-08, "logits/chosen": -0.4499587416648865, "logits/rejected": -0.4957137107849121, "logps/chosen": -208.1059112548828, "logps/rejected": -243.59033203125, "loss": 1.4432, "nll_loss": 1.3240139484405518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.761335849761963, "rewards/margins": 2.7761924266815186, "rewards/rejected": 2.9851431846618652, "step": 5860 }, { "epoch": 0.3256542905091469, "grad_norm": 51.654205322265625, "learning_rate": 7.60366367098844e-08, "logits/chosen": -0.1490587294101715, "logits/rejected": -0.34141451120376587, "logps/chosen": -151.16256713867188, "logps/rejected": -187.86569213867188, "loss": 1.2655, "nll_loss": 0.8017619252204895, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.672058582305908, "rewards/margins": 3.017301082611084, "rewards/rejected": 1.6547574996948242, "step": 5870 }, { "epoch": 0.32620906783539755, "grad_norm": 70.94071960449219, "learning_rate": 7.59621994286197e-08, "logits/chosen": -0.25258737802505493, "logits/rejected": -0.3760277330875397, "logps/chosen": -159.57235717773438, "logps/rejected": -209.1165008544922, "loss": 1.2939, "nll_loss": 0.9444242715835571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.442964553833008, "rewards/margins": 2.996890068054199, "rewards/rejected": 2.446074962615967, "step": 5880 }, { "epoch": 0.32676384516164825, "grad_norm": 49.107723236083984, "learning_rate": 7.588768328121776e-08, "logits/chosen": -0.36431506276130676, "logits/rejected": -0.44438228011131287, "logps/chosen": -172.80581665039062, "logps/rejected": -219.6732940673828, "loss": 1.3315, "nll_loss": 1.0387299060821533, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.738280296325684, "rewards/margins": 3.2641844749450684, "rewards/rejected": 2.474095106124878, "step": 5890 }, { "epoch": 0.32731862248789895, "grad_norm": 64.77669525146484, "learning_rate": 7.581308849403842e-08, "logits/chosen": -0.3097997307777405, "logits/rejected": -0.5356290340423584, "logps/chosen": -154.3521270751953, "logps/rejected": -218.3408660888672, "loss": 1.2495, "nll_loss": 0.8772909045219421, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.518646717071533, "rewards/margins": 4.598464012145996, "rewards/rejected": 0.9201822280883789, "step": 5900 }, { "epoch": 0.3278733998141496, "grad_norm": 87.04254150390625, "learning_rate": 7.573841529368051e-08, "logits/chosen": -0.42871952056884766, "logits/rejected": -0.5346238017082214, "logps/chosen": -185.951171875, "logps/rejected": -256.2978820800781, "loss": 1.244, "nll_loss": 1.0659410953521729, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.769937038421631, "rewards/margins": 4.028483867645264, "rewards/rejected": 1.7414535284042358, "step": 5910 }, { "epoch": 0.3284281771404003, "grad_norm": 73.32978820800781, "learning_rate": 7.566366390698098e-08, "logits/chosen": -0.2544618248939514, "logits/rejected": -0.4156056344509125, "logps/chosen": -204.7109375, "logps/rejected": -284.37750244140625, "loss": 1.2187, "nll_loss": 0.9991675615310669, "rewards/accuracies": 0.875, "rewards/chosen": 6.162306785583496, "rewards/margins": 4.747017860412598, "rewards/rejected": 1.4152885675430298, "step": 5920 }, { "epoch": 0.32898295446665093, "grad_norm": 59.712581634521484, "learning_rate": 7.558883456101432e-08, "logits/chosen": -0.2760107219219208, "logits/rejected": -0.48132261633872986, "logps/chosen": -177.5280303955078, "logps/rejected": -234.1001739501953, "loss": 1.2621, "nll_loss": 0.9008985757827759, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.312003135681152, "rewards/margins": 3.9453353881835938, "rewards/rejected": 1.3666683435440063, "step": 5930 }, { "epoch": 0.32953773179290163, "grad_norm": 59.13243103027344, "learning_rate": 7.551392748309187e-08, "logits/chosen": -0.16078224778175354, "logits/rejected": -0.3124062418937683, "logps/chosen": -140.71485900878906, "logps/rejected": -176.59397888183594, "loss": 1.3002, "nll_loss": 0.9051309823989868, "rewards/accuracies": 0.875, "rewards/chosen": 4.805662631988525, "rewards/margins": 2.5614771842956543, "rewards/rejected": 2.244184970855713, "step": 5940 }, { "epoch": 0.3300925091191523, "grad_norm": 87.19740295410156, "learning_rate": 7.543894290076102e-08, "logits/chosen": -0.13854815065860748, "logits/rejected": -0.27854079008102417, "logps/chosen": -123.906005859375, "logps/rejected": -191.5620880126953, "loss": 1.3135, "nll_loss": 0.7680860757827759, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.787467002868652, "rewards/margins": 3.6352336406707764, "rewards/rejected": 1.1522338390350342, "step": 5950 }, { "epoch": 0.330647286445403, "grad_norm": 81.57664489746094, "learning_rate": 7.536388104180467e-08, "logits/chosen": -0.2890421450138092, "logits/rejected": -0.39421191811561584, "logps/chosen": -133.77871704101562, "logps/rejected": -190.2563018798828, "loss": 1.3539, "nll_loss": 0.8960251808166504, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.03255033493042, "rewards/margins": 2.8215224742889404, "rewards/rejected": 2.2110276222229004, "step": 5960 }, { "epoch": 0.3312020637716537, "grad_norm": 43.42544937133789, "learning_rate": 7.528874213424044e-08, "logits/chosen": -0.2997869551181793, "logits/rejected": -0.46176987886428833, "logps/chosen": -162.94911193847656, "logps/rejected": -232.91073608398438, "loss": 1.279, "nll_loss": 0.9380319714546204, "rewards/accuracies": 0.875, "rewards/chosen": 5.328009605407715, "rewards/margins": 3.1782174110412598, "rewards/rejected": 2.149792432785034, "step": 5970 }, { "epoch": 0.3317568410979043, "grad_norm": 56.400909423828125, "learning_rate": 7.521352640631997e-08, "logits/chosen": -0.39626017212867737, "logits/rejected": -0.47180286049842834, "logps/chosen": -176.3207244873047, "logps/rejected": -240.38919067382812, "loss": 1.3374, "nll_loss": 1.0063221454620361, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.927728652954102, "rewards/margins": 2.973665237426758, "rewards/rejected": 2.9540631771087646, "step": 5980 }, { "epoch": 0.332311618424155, "grad_norm": 52.270530700683594, "learning_rate": 7.513823408652833e-08, "logits/chosen": -0.31507277488708496, "logits/rejected": -0.46405959129333496, "logps/chosen": -192.87774658203125, "logps/rejected": -255.1183624267578, "loss": 1.2586, "nll_loss": 0.9924084544181824, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.717407703399658, "rewards/margins": 3.2618069648742676, "rewards/rejected": 2.4556007385253906, "step": 5990 }, { "epoch": 0.33286639575040566, "grad_norm": 41.6818733215332, "learning_rate": 7.506286540358317e-08, "logits/chosen": -0.2737719416618347, "logits/rejected": -0.37862199544906616, "logps/chosen": -185.80844116210938, "logps/rejected": -265.85015869140625, "loss": 1.2765, "nll_loss": 1.0262348651885986, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.482663154602051, "rewards/margins": 3.4432480335235596, "rewards/rejected": 2.039414882659912, "step": 6000 }, { "epoch": 0.33286639575040566, "eval_logits/chosen": -0.3862064480781555, "eval_logits/rejected": -0.4756089448928833, "eval_logps/chosen": -196.4144287109375, "eval_logps/rejected": -260.7208557128906, "eval_loss": 1.2678471803665161, "eval_nll_loss": 1.0233957767486572, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 6.154678821563721, "eval_rewards/margins": 4.318915843963623, "eval_rewards/rejected": 1.8357634544372559, "eval_runtime": 16.8909, "eval_samples_per_second": 15.156, "eval_steps_per_second": 1.895, "step": 6000 }, { "epoch": 0.33342117307665636, "grad_norm": 87.84968566894531, "learning_rate": 7.49874205864342e-08, "logits/chosen": -0.21110494434833527, "logits/rejected": -0.38663357496261597, "logps/chosen": -159.3927764892578, "logps/rejected": -252.47006225585938, "loss": 1.3007, "nll_loss": 0.8897081613540649, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.111419677734375, "rewards/margins": 4.202252388000488, "rewards/rejected": 1.9091672897338867, "step": 6010 }, { "epoch": 0.333975950402907, "grad_norm": 55.64853286743164, "learning_rate": 7.491189986426235e-08, "logits/chosen": -0.45345860719680786, "logits/rejected": -0.519438624382019, "logps/chosen": -216.1964874267578, "logps/rejected": -305.46832275390625, "loss": 1.4051, "nll_loss": 1.1889097690582275, "rewards/accuracies": 0.875, "rewards/chosen": 6.22586727142334, "rewards/margins": 4.845824241638184, "rewards/rejected": 1.3800431489944458, "step": 6020 }, { "epoch": 0.3345307277291577, "grad_norm": 87.58065795898438, "learning_rate": 7.48363034664791e-08, "logits/chosen": -0.22561879456043243, "logits/rejected": -0.41260385513305664, "logps/chosen": -136.2498779296875, "logps/rejected": -181.90325927734375, "loss": 1.2827, "nll_loss": 0.8572355508804321, "rewards/accuracies": 0.75, "rewards/chosen": 4.760739803314209, "rewards/margins": 2.4254050254821777, "rewards/rejected": 2.3353352546691895, "step": 6030 }, { "epoch": 0.3350855050554084, "grad_norm": 84.19505310058594, "learning_rate": 7.476063162272593e-08, "logits/chosen": -0.3665398061275482, "logits/rejected": -0.5166418552398682, "logps/chosen": -181.16824340820312, "logps/rejected": -262.6795349121094, "loss": 1.3055, "nll_loss": 1.013683557510376, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.933252811431885, "rewards/margins": 4.092905521392822, "rewards/rejected": 1.8403470516204834, "step": 6040 }, { "epoch": 0.33564028238165905, "grad_norm": 39.95737838745117, "learning_rate": 7.468488456287336e-08, "logits/chosen": -0.37900015711784363, "logits/rejected": -0.4724903106689453, "logps/chosen": -173.86795043945312, "logps/rejected": -231.43423461914062, "loss": 1.2916, "nll_loss": 0.9488736987113953, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.916155815124512, "rewards/margins": 4.1103668212890625, "rewards/rejected": 1.8057887554168701, "step": 6050 }, { "epoch": 0.33619505970790975, "grad_norm": 63.77442169189453, "learning_rate": 7.460906251702051e-08, "logits/chosen": -0.21798260509967804, "logits/rejected": -0.40063363313674927, "logps/chosen": -165.91812133789062, "logps/rejected": -195.58705139160156, "loss": 1.2618, "nll_loss": 0.9206756353378296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.39694881439209, "rewards/margins": 2.942946195602417, "rewards/rejected": 2.4540023803710938, "step": 6060 }, { "epoch": 0.3367498370341604, "grad_norm": 50.75809860229492, "learning_rate": 7.45331657154942e-08, "logits/chosen": -0.32556071877479553, "logits/rejected": -0.42382025718688965, "logps/chosen": -152.8153839111328, "logps/rejected": -189.4619140625, "loss": 1.2941, "nll_loss": 0.9182936549186707, "rewards/accuracies": 0.875, "rewards/chosen": 5.209329605102539, "rewards/margins": 2.8177552223205566, "rewards/rejected": 2.3915748596191406, "step": 6070 }, { "epoch": 0.3373046143604111, "grad_norm": 94.52537536621094, "learning_rate": 7.445719438884839e-08, "logits/chosen": -0.18494704365730286, "logits/rejected": -0.2840834856033325, "logps/chosen": -163.761962890625, "logps/rejected": -203.1731414794922, "loss": 1.175, "nll_loss": 0.9018437266349792, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.398464679718018, "rewards/margins": 3.006654739379883, "rewards/rejected": 2.3918099403381348, "step": 6080 }, { "epoch": 0.3378593916866618, "grad_norm": 53.11320877075195, "learning_rate": 7.438114876786343e-08, "logits/chosen": -0.18565864861011505, "logits/rejected": -0.31152859330177307, "logps/chosen": -140.01583862304688, "logps/rejected": -183.9373321533203, "loss": 1.2597, "nll_loss": 0.8148931264877319, "rewards/accuracies": 0.75, "rewards/chosen": 4.700402736663818, "rewards/margins": 2.360290288925171, "rewards/rejected": 2.3401126861572266, "step": 6090 }, { "epoch": 0.33841416901291244, "grad_norm": 148.61221313476562, "learning_rate": 7.430502908354531e-08, "logits/chosen": -0.3025711178779602, "logits/rejected": -0.4514090120792389, "logps/chosen": -179.38238525390625, "logps/rejected": -237.6366424560547, "loss": 1.3181, "nll_loss": 1.0456587076187134, "rewards/accuracies": 0.875, "rewards/chosen": 5.581388473510742, "rewards/margins": 4.103987216949463, "rewards/rejected": 1.4774014949798584, "step": 6100 }, { "epoch": 0.33896894633916314, "grad_norm": 74.82305145263672, "learning_rate": 7.422883556712507e-08, "logits/chosen": -0.30521565675735474, "logits/rejected": -0.4374857544898987, "logps/chosen": -171.0484619140625, "logps/rejected": -224.58627319335938, "loss": 1.2929, "nll_loss": 0.9364229440689087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.537566184997559, "rewards/margins": 3.221212387084961, "rewards/rejected": 2.3163530826568604, "step": 6110 }, { "epoch": 0.3395237236654138, "grad_norm": 97.04814147949219, "learning_rate": 7.415256845005797e-08, "logits/chosen": -0.2522074282169342, "logits/rejected": -0.42201119661331177, "logps/chosen": -184.75643920898438, "logps/rejected": -244.8933868408203, "loss": 1.3599, "nll_loss": 0.9097296595573425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.552473068237305, "rewards/margins": 3.796072006225586, "rewards/rejected": 1.7564010620117188, "step": 6120 }, { "epoch": 0.3400785009916645, "grad_norm": 60.06885528564453, "learning_rate": 7.407622796402291e-08, "logits/chosen": -0.3063388764858246, "logits/rejected": -0.4394128918647766, "logps/chosen": -221.8954620361328, "logps/rejected": -280.248291015625, "loss": 1.4306, "nll_loss": 1.0352271795272827, "rewards/accuracies": 0.875, "rewards/chosen": 6.173859596252441, "rewards/margins": 4.336389541625977, "rewards/rejected": 1.837469458580017, "step": 6130 }, { "epoch": 0.3406332783179151, "grad_norm": 54.11576461791992, "learning_rate": 7.399981434092159e-08, "logits/chosen": -0.3295516073703766, "logits/rejected": -0.48471516370773315, "logps/chosen": -211.0443572998047, "logps/rejected": -267.0463562011719, "loss": 1.2962, "nll_loss": 1.0900715589523315, "rewards/accuracies": 0.875, "rewards/chosen": 6.180323600769043, "rewards/margins": 4.42136812210083, "rewards/rejected": 1.7589561939239502, "step": 6140 }, { "epoch": 0.3411880556441658, "grad_norm": 73.50775909423828, "learning_rate": 7.392332781287797e-08, "logits/chosen": -0.18400311470031738, "logits/rejected": -0.301456481218338, "logps/chosen": -131.61074829101562, "logps/rejected": -181.8965301513672, "loss": 1.2867, "nll_loss": 0.8391935229301453, "rewards/accuracies": 0.875, "rewards/chosen": 4.861627101898193, "rewards/margins": 2.8276915550231934, "rewards/rejected": 2.033935546875, "step": 6150 }, { "epoch": 0.3417428329704165, "grad_norm": 55.874847412109375, "learning_rate": 7.384676861223738e-08, "logits/chosen": -0.42141732573509216, "logits/rejected": -0.4614016115665436, "logps/chosen": -205.42623901367188, "logps/rejected": -246.082275390625, "loss": 1.288, "nll_loss": 1.1639835834503174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.899552822113037, "rewards/margins": 2.474811315536499, "rewards/rejected": 3.424741268157959, "step": 6160 }, { "epoch": 0.34229761029666717, "grad_norm": 58.53584671020508, "learning_rate": 7.377013697156595e-08, "logits/chosen": -0.3395301401615143, "logits/rejected": -0.43350130319595337, "logps/chosen": -198.6548614501953, "logps/rejected": -271.24078369140625, "loss": 1.4002, "nll_loss": 1.0972192287445068, "rewards/accuracies": 0.875, "rewards/chosen": 6.351205348968506, "rewards/margins": 3.5542285442352295, "rewards/rejected": 2.7969765663146973, "step": 6170 }, { "epoch": 0.34285238762291786, "grad_norm": 93.97845458984375, "learning_rate": 7.369343312364993e-08, "logits/chosen": -0.11968524754047394, "logits/rejected": -0.27802419662475586, "logps/chosen": -145.43263244628906, "logps/rejected": -194.38095092773438, "loss": 1.3325, "nll_loss": 0.8231142163276672, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.9595794677734375, "rewards/margins": 3.4239859580993652, "rewards/rejected": 1.5355936288833618, "step": 6180 }, { "epoch": 0.3434071649491685, "grad_norm": 41.60972595214844, "learning_rate": 7.361665730149482e-08, "logits/chosen": -0.27187636494636536, "logits/rejected": -0.3486558794975281, "logps/chosen": -160.63827514648438, "logps/rejected": -214.30783081054688, "loss": 1.2974, "nll_loss": 0.9356172680854797, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.591213703155518, "rewards/margins": 2.9364173412323, "rewards/rejected": 2.6547961235046387, "step": 6190 }, { "epoch": 0.3439619422754192, "grad_norm": 108.99187469482422, "learning_rate": 7.353980973832478e-08, "logits/chosen": -0.39349788427352905, "logits/rejected": -0.4973227381706238, "logps/chosen": -183.8045196533203, "logps/rejected": -229.7374725341797, "loss": 1.3744, "nll_loss": 1.0677894353866577, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.8697943687438965, "rewards/margins": 3.1557323932647705, "rewards/rejected": 2.714062213897705, "step": 6200 }, { "epoch": 0.3445167196016699, "grad_norm": 43.160152435302734, "learning_rate": 7.346289066758194e-08, "logits/chosen": -0.31956276297569275, "logits/rejected": -0.4108821749687195, "logps/chosen": -156.1692352294922, "logps/rejected": -214.0746612548828, "loss": 1.3614, "nll_loss": 1.099229097366333, "rewards/accuracies": 0.875, "rewards/chosen": 5.477211952209473, "rewards/margins": 3.1682682037353516, "rewards/rejected": 2.308943510055542, "step": 6210 }, { "epoch": 0.34507149692792055, "grad_norm": 114.94517517089844, "learning_rate": 7.338590032292561e-08, "logits/chosen": -0.4515753388404846, "logits/rejected": -0.5310046672821045, "logps/chosen": -192.64334106445312, "logps/rejected": -232.1820068359375, "loss": 1.3761, "nll_loss": 1.1607623100280762, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.136222839355469, "rewards/margins": 3.0856223106384277, "rewards/rejected": 3.05060076713562, "step": 6220 }, { "epoch": 0.34562627425417125, "grad_norm": 55.0934944152832, "learning_rate": 7.330883893823163e-08, "logits/chosen": -0.14323115348815918, "logits/rejected": -0.328891783952713, "logps/chosen": -136.67495727539062, "logps/rejected": -173.80918884277344, "loss": 1.3251, "nll_loss": 0.8461788892745972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.520416736602783, "rewards/margins": 2.688857316970825, "rewards/rejected": 1.8315595388412476, "step": 6230 }, { "epoch": 0.3461810515804219, "grad_norm": 93.47306823730469, "learning_rate": 7.323170674759163e-08, "logits/chosen": -0.304673433303833, "logits/rejected": -0.43938857316970825, "logps/chosen": -200.05377197265625, "logps/rejected": -229.02261352539062, "loss": 1.3026, "nll_loss": 0.9279875755310059, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.0606160163879395, "rewards/margins": 4.205328464508057, "rewards/rejected": 1.8552885055541992, "step": 6240 }, { "epoch": 0.3467358289066726, "grad_norm": 74.74398040771484, "learning_rate": 7.315450398531235e-08, "logits/chosen": -0.10720958560705185, "logits/rejected": -0.25590941309928894, "logps/chosen": -126.95005798339844, "logps/rejected": -181.28817749023438, "loss": 1.2535, "nll_loss": 0.7301589846611023, "rewards/accuracies": 0.875, "rewards/chosen": 4.625161170959473, "rewards/margins": 3.0095062255859375, "rewards/rejected": 1.615654706954956, "step": 6250 }, { "epoch": 0.34729060623292324, "grad_norm": 63.662574768066406, "learning_rate": 7.307723088591488e-08, "logits/chosen": -0.17334458231925964, "logits/rejected": -0.30409249663352966, "logps/chosen": -142.11819458007812, "logps/rejected": -182.3699493408203, "loss": 1.2557, "nll_loss": 0.8528593182563782, "rewards/accuracies": 0.75, "rewards/chosen": 4.944436073303223, "rewards/margins": 2.245910167694092, "rewards/rejected": 2.69852614402771, "step": 6260 }, { "epoch": 0.34784538355917394, "grad_norm": 82.51528930664062, "learning_rate": 7.299988768413401e-08, "logits/chosen": -0.3210276961326599, "logits/rejected": -0.43619388341903687, "logps/chosen": -189.78506469726562, "logps/rejected": -248.33517456054688, "loss": 1.2835, "nll_loss": 1.0168853998184204, "rewards/accuracies": 0.75, "rewards/chosen": 5.723790645599365, "rewards/margins": 3.3456077575683594, "rewards/rejected": 2.378182888031006, "step": 6270 }, { "epoch": 0.34840016088542464, "grad_norm": 71.50834655761719, "learning_rate": 7.292247461491743e-08, "logits/chosen": -0.3468714952468872, "logits/rejected": -0.46626749634742737, "logps/chosen": -194.5014190673828, "logps/rejected": -250.09317016601562, "loss": 1.2907, "nll_loss": 1.0468021631240845, "rewards/accuracies": 0.875, "rewards/chosen": 6.295615196228027, "rewards/margins": 4.136518955230713, "rewards/rejected": 2.1590962409973145, "step": 6280 }, { "epoch": 0.3489549382116753, "grad_norm": 54.759300231933594, "learning_rate": 7.284499191342512e-08, "logits/chosen": -0.03164532408118248, "logits/rejected": -0.23100057244300842, "logps/chosen": -116.4410400390625, "logps/rejected": -183.38046264648438, "loss": 1.239, "nll_loss": 0.7256309390068054, "rewards/accuracies": 0.875, "rewards/chosen": 4.489354610443115, "rewards/margins": 3.0173919200897217, "rewards/rejected": 1.471962571144104, "step": 6290 }, { "epoch": 0.349509715537926, "grad_norm": 70.90135192871094, "learning_rate": 7.276743981502856e-08, "logits/chosen": -0.4476935863494873, "logits/rejected": -0.49387326836586, "logps/chosen": -207.6614990234375, "logps/rejected": -271.92535400390625, "loss": 1.346, "nll_loss": 1.2119020223617554, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.015636444091797, "rewards/margins": 2.942776918411255, "rewards/rejected": 3.072859525680542, "step": 6300 }, { "epoch": 0.3500644928641766, "grad_norm": 43.998878479003906, "learning_rate": 7.268981855531002e-08, "logits/chosen": -0.2172735184431076, "logits/rejected": -0.3375437557697296, "logps/chosen": -164.04248046875, "logps/rejected": -231.4910430908203, "loss": 1.2797, "nll_loss": 0.9179700613021851, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.578200817108154, "rewards/margins": 3.939047336578369, "rewards/rejected": 1.6391534805297852, "step": 6310 }, { "epoch": 0.3506192701904273, "grad_norm": 50.79728317260742, "learning_rate": 7.261212837006191e-08, "logits/chosen": -0.2464495599269867, "logits/rejected": -0.3708285391330719, "logps/chosen": -154.8589630126953, "logps/rejected": -199.90542602539062, "loss": 1.2937, "nll_loss": 0.9330763816833496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.324396133422852, "rewards/margins": 3.398838758468628, "rewards/rejected": 1.9255568981170654, "step": 6320 }, { "epoch": 0.35117404751667797, "grad_norm": 47.69200897216797, "learning_rate": 7.253436949528598e-08, "logits/chosen": -0.2670142650604248, "logits/rejected": -0.42380857467651367, "logps/chosen": -136.66729736328125, "logps/rejected": -179.9617156982422, "loss": 1.3436, "nll_loss": 0.8983263969421387, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.134589195251465, "rewards/margins": 2.9455533027648926, "rewards/rejected": 2.1890358924865723, "step": 6330 }, { "epoch": 0.35172882484292867, "grad_norm": 35.19856643676758, "learning_rate": 7.245654216719267e-08, "logits/chosen": -0.32339194416999817, "logits/rejected": -0.4838895797729492, "logps/chosen": -182.80630493164062, "logps/rejected": -240.4121551513672, "loss": 1.3196, "nll_loss": 1.1221892833709717, "rewards/accuracies": 0.875, "rewards/chosen": 5.559865951538086, "rewards/margins": 3.9628207683563232, "rewards/rejected": 1.5970450639724731, "step": 6340 }, { "epoch": 0.35228360216917937, "grad_norm": 39.05183792114258, "learning_rate": 7.237864662220031e-08, "logits/chosen": -0.34584400057792664, "logits/rejected": -0.4345241189002991, "logps/chosen": -180.0929718017578, "logps/rejected": -244.9677276611328, "loss": 1.2868, "nll_loss": 1.0596837997436523, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.154149055480957, "rewards/margins": 4.189995288848877, "rewards/rejected": 1.9641540050506592, "step": 6350 }, { "epoch": 0.35283837949543, "grad_norm": 61.75810241699219, "learning_rate": 7.230068309693454e-08, "logits/chosen": -0.1510430872440338, "logits/rejected": -0.26399290561676025, "logps/chosen": -153.47390747070312, "logps/rejected": -201.64395141601562, "loss": 1.3012, "nll_loss": 0.9015968441963196, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.250309944152832, "rewards/margins": 2.9471797943115234, "rewards/rejected": 2.303130626678467, "step": 6360 }, { "epoch": 0.3533931568216807, "grad_norm": 62.2823600769043, "learning_rate": 7.222265182822739e-08, "logits/chosen": -0.292441189289093, "logits/rejected": -0.4160211682319641, "logps/chosen": -167.91571044921875, "logps/rejected": -215.4639129638672, "loss": 1.2773, "nll_loss": 0.963361382484436, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.473585605621338, "rewards/margins": 3.273808240890503, "rewards/rejected": 2.199777603149414, "step": 6370 }, { "epoch": 0.35394793414793135, "grad_norm": 67.76326751708984, "learning_rate": 7.21445530531168e-08, "logits/chosen": -0.22603091597557068, "logits/rejected": -0.35425618290901184, "logps/chosen": -149.12693786621094, "logps/rejected": -204.117431640625, "loss": 1.3084, "nll_loss": 1.0864441394805908, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.302445888519287, "rewards/margins": 3.3712353706359863, "rewards/rejected": 1.9312105178833008, "step": 6380 }, { "epoch": 0.35450271147418205, "grad_norm": 98.6044692993164, "learning_rate": 7.206638700884569e-08, "logits/chosen": -0.33501431345939636, "logits/rejected": -0.49409008026123047, "logps/chosen": -177.7139129638672, "logps/rejected": -237.3137969970703, "loss": 1.3065, "nll_loss": 1.0060980319976807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.7570481300354, "rewards/margins": 4.340456485748291, "rewards/rejected": 1.4165910482406616, "step": 6390 }, { "epoch": 0.35505748880043275, "grad_norm": 44.0657958984375, "learning_rate": 7.198815393286135e-08, "logits/chosen": -0.2693483233451843, "logits/rejected": -0.40953415632247925, "logps/chosen": -174.3963623046875, "logps/rejected": -241.756591796875, "loss": 1.249, "nll_loss": 0.9351913332939148, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.488059043884277, "rewards/margins": 3.997734785079956, "rewards/rejected": 1.4903242588043213, "step": 6400 }, { "epoch": 0.3556122661266834, "grad_norm": 45.25605392456055, "learning_rate": 7.190985406281472e-08, "logits/chosen": -0.08713646233081818, "logits/rejected": -0.25797972083091736, "logps/chosen": -117.9123764038086, "logps/rejected": -182.8262481689453, "loss": 1.3499, "nll_loss": 0.7855955958366394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2343363761901855, "rewards/margins": 3.5206990242004395, "rewards/rejected": 0.7136377096176147, "step": 6410 }, { "epoch": 0.3561670434529341, "grad_norm": 49.59719467163086, "learning_rate": 7.183148763655959e-08, "logits/chosen": -0.27555233240127563, "logits/rejected": -0.36578166484832764, "logps/chosen": -191.7743682861328, "logps/rejected": -243.2725372314453, "loss": 1.3099, "nll_loss": 0.9913687705993652, "rewards/accuracies": 0.875, "rewards/chosen": 5.697419166564941, "rewards/margins": 3.9420788288116455, "rewards/rejected": 1.7553411722183228, "step": 6420 }, { "epoch": 0.35672182077918474, "grad_norm": 57.11975860595703, "learning_rate": 7.175305489215199e-08, "logits/chosen": -0.08695421367883682, "logits/rejected": -0.2602617144584656, "logps/chosen": -121.39566802978516, "logps/rejected": -164.87734985351562, "loss": 1.2562, "nll_loss": 0.7267943620681763, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.647629261016846, "rewards/margins": 3.4465274810791016, "rewards/rejected": 1.2011014223098755, "step": 6430 }, { "epoch": 0.35727659810543544, "grad_norm": 72.39891052246094, "learning_rate": 7.167455606784934e-08, "logits/chosen": -0.32719069719314575, "logits/rejected": -0.43272677063941956, "logps/chosen": -189.12789916992188, "logps/rejected": -252.99569702148438, "loss": 1.4202, "nll_loss": 1.029658555984497, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.1457319259643555, "rewards/margins": 3.5280590057373047, "rewards/rejected": 2.6176724433898926, "step": 6440 }, { "epoch": 0.3578313754316861, "grad_norm": 29.843097686767578, "learning_rate": 7.159599140210986e-08, "logits/chosen": -0.2822156548500061, "logits/rejected": -0.42548808455467224, "logps/chosen": -179.24459838867188, "logps/rejected": -261.34515380859375, "loss": 1.2763, "nll_loss": 1.0095112323760986, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.766944885253906, "rewards/margins": 3.254338502883911, "rewards/rejected": 2.512606143951416, "step": 6450 }, { "epoch": 0.3583861527579368, "grad_norm": 137.72471618652344, "learning_rate": 7.151736113359174e-08, "logits/chosen": -0.2707064151763916, "logits/rejected": -0.36198943853378296, "logps/chosen": -164.41000366210938, "logps/rejected": -227.1502227783203, "loss": 1.3286, "nll_loss": 1.1287130117416382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.302213191986084, "rewards/margins": 2.804896593093872, "rewards/rejected": 2.497316360473633, "step": 6460 }, { "epoch": 0.3589409300841875, "grad_norm": 31.62238311767578, "learning_rate": 7.143866550115245e-08, "logits/chosen": 0.03674770146608353, "logits/rejected": -0.16360250115394592, "logps/chosen": -127.9668960571289, "logps/rejected": -163.06024169921875, "loss": 1.2334, "nll_loss": 0.7001178860664368, "rewards/accuracies": 0.875, "rewards/chosen": 4.386387825012207, "rewards/margins": 2.4408748149871826, "rewards/rejected": 1.945513367652893, "step": 6470 }, { "epoch": 0.3594957074104381, "grad_norm": 56.53056335449219, "learning_rate": 7.135990474384804e-08, "logits/chosen": -0.3590020537376404, "logits/rejected": -0.4130149781703949, "logps/chosen": -167.0102996826172, "logps/rejected": -226.590087890625, "loss": 1.3655, "nll_loss": 1.0488640069961548, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.666329860687256, "rewards/margins": 3.3042640686035156, "rewards/rejected": 2.3620657920837402, "step": 6480 }, { "epoch": 0.3600504847366888, "grad_norm": 56.09820556640625, "learning_rate": 7.128107910093238e-08, "logits/chosen": -0.21703293919563293, "logits/rejected": -0.3460865616798401, "logps/chosen": -160.18495178222656, "logps/rejected": -196.6850128173828, "loss": 1.3055, "nll_loss": 0.9067890048027039, "rewards/accuracies": 0.75, "rewards/chosen": 5.332886695861816, "rewards/margins": 2.839141607284546, "rewards/rejected": 2.493744373321533, "step": 6490 }, { "epoch": 0.36060526206293947, "grad_norm": 77.31085968017578, "learning_rate": 7.120218881185642e-08, "logits/chosen": -0.28630420565605164, "logits/rejected": -0.40152207016944885, "logps/chosen": -167.91921997070312, "logps/rejected": -241.7128448486328, "loss": 1.2308, "nll_loss": 0.9291079640388489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.025139331817627, "rewards/margins": 2.8772501945495605, "rewards/rejected": 3.147888660430908, "step": 6500 }, { "epoch": 0.36060526206293947, "eval_logits/chosen": -0.35594862699508667, "eval_logits/rejected": -0.4358241856098175, "eval_logps/chosen": -194.94239807128906, "eval_logps/rejected": -258.62860107421875, "eval_loss": 1.255896806716919, "eval_nll_loss": 1.0143333673477173, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 6.301882743835449, "eval_rewards/margins": 4.256891250610352, "eval_rewards/rejected": 2.0449914932250977, "eval_runtime": 16.8633, "eval_samples_per_second": 15.181, "eval_steps_per_second": 1.898, "step": 6500 }, { "epoch": 0.36116003938919017, "grad_norm": 55.182987213134766, "learning_rate": 7.112323411626755e-08, "logits/chosen": -0.3085078299045563, "logits/rejected": -0.4444068372249603, "logps/chosen": -163.02085876464844, "logps/rejected": -214.41885375976562, "loss": 1.326, "nll_loss": 1.0153495073318481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.5913190841674805, "rewards/margins": 2.8857014179229736, "rewards/rejected": 2.705617904663086, "step": 6510 }, { "epoch": 0.3617148167154408, "grad_norm": 91.24496459960938, "learning_rate": 7.104421525400874e-08, "logits/chosen": -0.23191985487937927, "logits/rejected": -0.371439129114151, "logps/chosen": -157.10472106933594, "logps/rejected": -209.3927764892578, "loss": 1.3215, "nll_loss": 1.0198981761932373, "rewards/accuracies": 0.875, "rewards/chosen": 5.36658239364624, "rewards/margins": 3.5386276245117188, "rewards/rejected": 1.827954649925232, "step": 6520 }, { "epoch": 0.3622695940416915, "grad_norm": 49.67591094970703, "learning_rate": 7.096513246511794e-08, "logits/chosen": -0.20982857048511505, "logits/rejected": -0.3763899505138397, "logps/chosen": -142.84642028808594, "logps/rejected": -192.2998504638672, "loss": 1.4071, "nll_loss": 0.9238711595535278, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.907094478607178, "rewards/margins": 3.333144426345825, "rewards/rejected": 1.573950171470642, "step": 6530 }, { "epoch": 0.3628243713679422, "grad_norm": 62.50767135620117, "learning_rate": 7.088598598982727e-08, "logits/chosen": -0.22332048416137695, "logits/rejected": -0.349341481924057, "logps/chosen": -163.889404296875, "logps/rejected": -205.19857788085938, "loss": 1.2796, "nll_loss": 0.9458600878715515, "rewards/accuracies": 0.75, "rewards/chosen": 5.2342610359191895, "rewards/margins": 3.4151597023010254, "rewards/rejected": 1.819101333618164, "step": 6540 }, { "epoch": 0.36337914869419286, "grad_norm": 88.51134490966797, "learning_rate": 7.080677606856229e-08, "logits/chosen": -0.1471829116344452, "logits/rejected": -0.2903767228126526, "logps/chosen": -142.53680419921875, "logps/rejected": -210.1653594970703, "loss": 1.2684, "nll_loss": 0.8725835084915161, "rewards/accuracies": 0.875, "rewards/chosen": 5.088635444641113, "rewards/margins": 3.848362445831299, "rewards/rejected": 1.2402734756469727, "step": 6550 }, { "epoch": 0.36393392602044355, "grad_norm": 63.4984245300293, "learning_rate": 7.07275029419413e-08, "logits/chosen": -0.2597203254699707, "logits/rejected": -0.34430503845214844, "logps/chosen": -186.06716918945312, "logps/rejected": -232.9912109375, "loss": 1.3937, "nll_loss": 1.1478632688522339, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.421383857727051, "rewards/margins": 3.0764412879943848, "rewards/rejected": 2.344942331314087, "step": 6560 }, { "epoch": 0.3644887033466942, "grad_norm": 62.26931381225586, "learning_rate": 7.064816685077461e-08, "logits/chosen": -0.2954166829586029, "logits/rejected": -0.39529484510421753, "logps/chosen": -208.4893035888672, "logps/rejected": -244.24404907226562, "loss": 1.4052, "nll_loss": 1.03569495677948, "rewards/accuracies": 0.75, "rewards/chosen": 6.040772438049316, "rewards/margins": 3.601053237915039, "rewards/rejected": 2.4397192001342773, "step": 6570 }, { "epoch": 0.3650434806729449, "grad_norm": 34.1624641418457, "learning_rate": 7.056876803606382e-08, "logits/chosen": -0.2604549825191498, "logits/rejected": -0.3620299696922302, "logps/chosen": -169.99151611328125, "logps/rejected": -234.2142791748047, "loss": 1.2377, "nll_loss": 0.9837854504585266, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.080147743225098, "rewards/margins": 3.937419891357422, "rewards/rejected": 2.142728090286255, "step": 6580 }, { "epoch": 0.3655982579991956, "grad_norm": 57.836952209472656, "learning_rate": 7.048930673900104e-08, "logits/chosen": -0.09565924108028412, "logits/rejected": -0.18147985637187958, "logps/chosen": -140.58352661132812, "logps/rejected": -192.8702850341797, "loss": 1.2783, "nll_loss": 0.8632059097290039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.876048564910889, "rewards/margins": 2.7102205753326416, "rewards/rejected": 2.165827989578247, "step": 6590 }, { "epoch": 0.36615303532544624, "grad_norm": 118.21241760253906, "learning_rate": 7.040978320096819e-08, "logits/chosen": -0.2354263812303543, "logits/rejected": -0.3318116068840027, "logps/chosen": -145.45526123046875, "logps/rejected": -204.20809936523438, "loss": 1.3199, "nll_loss": 0.897225558757782, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.396674633026123, "rewards/margins": 3.439645290374756, "rewards/rejected": 1.9570293426513672, "step": 6600 }, { "epoch": 0.36670781265169694, "grad_norm": 68.2356185913086, "learning_rate": 7.033019766353625e-08, "logits/chosen": -0.17760439217090607, "logits/rejected": -0.2736101448535919, "logps/chosen": -198.8605499267578, "logps/rejected": -246.8716583251953, "loss": 1.3336, "nll_loss": 1.0561082363128662, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.714756965637207, "rewards/margins": 2.732408046722412, "rewards/rejected": 2.982348918914795, "step": 6610 }, { "epoch": 0.3672625899779476, "grad_norm": 87.32295989990234, "learning_rate": 7.025055036846454e-08, "logits/chosen": -0.21067604422569275, "logits/rejected": -0.32407501339912415, "logps/chosen": -202.57394409179688, "logps/rejected": -250.7236785888672, "loss": 1.3081, "nll_loss": 1.0166271924972534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.010329246520996, "rewards/margins": 3.8012759685516357, "rewards/rejected": 2.2090530395507812, "step": 6620 }, { "epoch": 0.3678173673041983, "grad_norm": 56.69834899902344, "learning_rate": 7.017084155770005e-08, "logits/chosen": -0.2860775887966156, "logits/rejected": -0.3612442910671234, "logps/chosen": -203.62258911132812, "logps/rejected": -248.18661499023438, "loss": 1.2513, "nll_loss": 1.1245375871658325, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.074135780334473, "rewards/margins": 3.8243346214294434, "rewards/rejected": 2.249800682067871, "step": 6630 }, { "epoch": 0.36837214463044893, "grad_norm": 85.24093627929688, "learning_rate": 7.009107147337652e-08, "logits/chosen": -0.03159003332257271, "logits/rejected": -0.18280380964279175, "logps/chosen": -153.42019653320312, "logps/rejected": -214.30972290039062, "loss": 1.27, "nll_loss": 0.8551816940307617, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.650689125061035, "rewards/margins": 3.2693932056427, "rewards/rejected": 1.3812963962554932, "step": 6640 }, { "epoch": 0.3689269219566996, "grad_norm": 57.682281494140625, "learning_rate": 7.001124035781389e-08, "logits/chosen": 0.09463175386190414, "logits/rejected": -0.0054311128333210945, "logps/chosen": -131.00662231445312, "logps/rejected": -162.55885314941406, "loss": 1.3126, "nll_loss": 0.7956680059432983, "rewards/accuracies": 0.75, "rewards/chosen": 4.460573196411133, "rewards/margins": 2.7722792625427246, "rewards/rejected": 1.6882938146591187, "step": 6650 }, { "epoch": 0.3694816992829503, "grad_norm": 45.95630645751953, "learning_rate": 6.993134845351752e-08, "logits/chosen": -0.2924136221408844, "logits/rejected": -0.3941074013710022, "logps/chosen": -157.63040161132812, "logps/rejected": -208.11837768554688, "loss": 1.2819, "nll_loss": 1.0189913511276245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.672625541687012, "rewards/margins": 2.997842788696289, "rewards/rejected": 2.6747829914093018, "step": 6660 }, { "epoch": 0.37003647660920097, "grad_norm": 115.9908218383789, "learning_rate": 6.985139600317737e-08, "logits/chosen": 0.06917702406644821, "logits/rejected": -0.09741564840078354, "logps/chosen": -116.82330322265625, "logps/rejected": -147.33279418945312, "loss": 1.3045, "nll_loss": 0.704028308391571, "rewards/accuracies": 0.75, "rewards/chosen": 3.9377639293670654, "rewards/margins": 2.025374412536621, "rewards/rejected": 1.9123893976211548, "step": 6670 }, { "epoch": 0.37059125393545167, "grad_norm": 60.326393127441406, "learning_rate": 6.977138324966736e-08, "logits/chosen": -0.12928955256938934, "logits/rejected": -0.2916646897792816, "logps/chosen": -146.9344482421875, "logps/rejected": -200.44168090820312, "loss": 1.2645, "nll_loss": 0.8929560780525208, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.845291614532471, "rewards/margins": 3.032985210418701, "rewards/rejected": 1.8123064041137695, "step": 6680 }, { "epoch": 0.3711460312617023, "grad_norm": 48.884490966796875, "learning_rate": 6.969131043604459e-08, "logits/chosen": -0.17368502914905548, "logits/rejected": -0.3455658555030823, "logps/chosen": -155.38258361816406, "logps/rejected": -208.5419158935547, "loss": 1.2361, "nll_loss": 0.8771988153457642, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.127696514129639, "rewards/margins": 3.4352517127990723, "rewards/rejected": 1.692445158958435, "step": 6690 }, { "epoch": 0.371700808587953, "grad_norm": 57.625953674316406, "learning_rate": 6.961117780554862e-08, "logits/chosen": -0.11713214218616486, "logits/rejected": -0.2630918622016907, "logps/chosen": -159.48318481445312, "logps/rejected": -208.9144287109375, "loss": 1.3387, "nll_loss": 0.9308179616928101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.009263515472412, "rewards/margins": 3.5683231353759766, "rewards/rejected": 1.4409408569335938, "step": 6700 }, { "epoch": 0.3722555859142037, "grad_norm": 43.676998138427734, "learning_rate": 6.953098560160065e-08, "logits/chosen": -0.18010783195495605, "logits/rejected": -0.27125436067581177, "logps/chosen": -168.66122436523438, "logps/rejected": -218.2757568359375, "loss": 1.2475, "nll_loss": 0.9375994801521301, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.377259254455566, "rewards/margins": 2.9341747760772705, "rewards/rejected": 2.443084478378296, "step": 6710 }, { "epoch": 0.37281036324045436, "grad_norm": 113.7532958984375, "learning_rate": 6.945073406780295e-08, "logits/chosen": -0.1638331115245819, "logits/rejected": -0.30707019567489624, "logps/chosen": -158.09225463867188, "logps/rejected": -197.9178009033203, "loss": 1.3798, "nll_loss": 0.962394118309021, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 4.8423566818237305, "rewards/margins": 2.6290435791015625, "rewards/rejected": 2.213313341140747, "step": 6720 }, { "epoch": 0.37336514056670506, "grad_norm": 86.59712219238281, "learning_rate": 6.937042344793795e-08, "logits/chosen": -0.22376978397369385, "logits/rejected": -0.3277062177658081, "logps/chosen": -177.826416015625, "logps/rejected": -240.59805297851562, "loss": 1.3694, "nll_loss": 1.1009342670440674, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.5532402992248535, "rewards/margins": 3.9395835399627686, "rewards/rejected": 1.6136566400527954, "step": 6730 }, { "epoch": 0.3739199178929557, "grad_norm": 84.39332580566406, "learning_rate": 6.929005398596754e-08, "logits/chosen": -0.11341975629329681, "logits/rejected": -0.29075413942337036, "logps/chosen": -174.98912048339844, "logps/rejected": -228.1398468017578, "loss": 1.2888, "nll_loss": 0.8445985913276672, "rewards/accuracies": 0.875, "rewards/chosen": 5.468979835510254, "rewards/margins": 3.9075286388397217, "rewards/rejected": 1.5614511966705322, "step": 6740 }, { "epoch": 0.3744746952192064, "grad_norm": 46.01534652709961, "learning_rate": 6.920962592603248e-08, "logits/chosen": -0.37522125244140625, "logits/rejected": -0.4878465533256531, "logps/chosen": -181.1302490234375, "logps/rejected": -237.25146484375, "loss": 1.3075, "nll_loss": 1.0549627542495728, "rewards/accuracies": 0.875, "rewards/chosen": 5.972900867462158, "rewards/margins": 3.768439769744873, "rewards/rejected": 2.204460859298706, "step": 6750 }, { "epoch": 0.37502947254545704, "grad_norm": 68.10105895996094, "learning_rate": 6.91291395124514e-08, "logits/chosen": -0.23928236961364746, "logits/rejected": -0.36333370208740234, "logps/chosen": -166.55661010742188, "logps/rejected": -216.7520751953125, "loss": 1.283, "nll_loss": 1.0072357654571533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.385659694671631, "rewards/margins": 3.2875118255615234, "rewards/rejected": 2.0981478691101074, "step": 6760 }, { "epoch": 0.37558424987170774, "grad_norm": 126.58575439453125, "learning_rate": 6.904859498972025e-08, "logits/chosen": -0.1630457043647766, "logits/rejected": -0.3187350332736969, "logps/chosen": -127.83060455322266, "logps/rejected": -168.25228881835938, "loss": 1.2153, "nll_loss": 0.8524566888809204, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.729722499847412, "rewards/margins": 2.9829154014587402, "rewards/rejected": 1.7468068599700928, "step": 6770 }, { "epoch": 0.37613902719795844, "grad_norm": 45.973533630371094, "learning_rate": 6.89679926025115e-08, "logits/chosen": -0.2023843228816986, "logits/rejected": -0.3685084283351898, "logps/chosen": -160.76187133789062, "logps/rejected": -228.87161254882812, "loss": 1.2253, "nll_loss": 0.9022408723831177, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.037689685821533, "rewards/margins": 4.123410224914551, "rewards/rejected": 0.9142792820930481, "step": 6780 }, { "epoch": 0.3766938045242091, "grad_norm": 44.571598052978516, "learning_rate": 6.888733259567342e-08, "logits/chosen": -0.3374364674091339, "logits/rejected": -0.5107888579368591, "logps/chosen": -184.53736877441406, "logps/rejected": -266.7218322753906, "loss": 1.2618, "nll_loss": 0.9846833944320679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.135041236877441, "rewards/margins": 4.377993106842041, "rewards/rejected": 1.7570486068725586, "step": 6790 }, { "epoch": 0.3772485818504598, "grad_norm": 103.22528839111328, "learning_rate": 6.880661521422927e-08, "logits/chosen": -0.21125967800617218, "logits/rejected": -0.28824880719184875, "logps/chosen": -154.03274536132812, "logps/rejected": -200.83773803710938, "loss": 1.3361, "nll_loss": 0.9575176239013672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.292486667633057, "rewards/margins": 2.889380931854248, "rewards/rejected": 2.4031059741973877, "step": 6800 }, { "epoch": 0.37780335917671043, "grad_norm": 61.16334915161133, "learning_rate": 6.87258407033766e-08, "logits/chosen": -0.16521167755126953, "logits/rejected": -0.2563553750514984, "logps/chosen": -161.29776000976562, "logps/rejected": -237.66824340820312, "loss": 1.2426, "nll_loss": 0.944124698638916, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.560735702514648, "rewards/margins": 3.922863721847534, "rewards/rejected": 1.6378719806671143, "step": 6810 }, { "epoch": 0.37835813650296113, "grad_norm": 140.47105407714844, "learning_rate": 6.864500930848652e-08, "logits/chosen": -0.31639528274536133, "logits/rejected": -0.4148218035697937, "logps/chosen": -182.83103942871094, "logps/rejected": -220.06185913085938, "loss": 1.2797, "nll_loss": 0.9983618855476379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.573078632354736, "rewards/margins": 3.072183132171631, "rewards/rejected": 2.5008950233459473, "step": 6820 }, { "epoch": 0.3789129138292118, "grad_norm": 55.61164093017578, "learning_rate": 6.856412127510297e-08, "logits/chosen": -0.20365670323371887, "logits/rejected": -0.36336570978164673, "logps/chosen": -181.97103881835938, "logps/rejected": -239.91830444335938, "loss": 1.3521, "nll_loss": 0.9995267987251282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.553870677947998, "rewards/margins": 4.438251972198486, "rewards/rejected": 1.1156189441680908, "step": 6830 }, { "epoch": 0.37946769115546247, "grad_norm": 30.3891544342041, "learning_rate": 6.848317684894188e-08, "logits/chosen": -0.2444746196269989, "logits/rejected": -0.33190420269966125, "logps/chosen": -174.93524169921875, "logps/rejected": -221.8345947265625, "loss": 1.2883, "nll_loss": 1.066310167312622, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.140133857727051, "rewards/margins": 3.0378527641296387, "rewards/rejected": 2.102281093597412, "step": 6840 }, { "epoch": 0.38002246848171317, "grad_norm": 106.51737213134766, "learning_rate": 6.840217627589051e-08, "logits/chosen": -0.26744550466537476, "logits/rejected": -0.354861319065094, "logps/chosen": -171.31802368164062, "logps/rejected": -202.1862030029297, "loss": 1.2587, "nll_loss": 1.028227686882019, "rewards/accuracies": 0.875, "rewards/chosen": 5.459754943847656, "rewards/margins": 2.871218204498291, "rewards/rejected": 2.5885369777679443, "step": 6850 }, { "epoch": 0.3805772458079638, "grad_norm": 50.665340423583984, "learning_rate": 6.832111980200672e-08, "logits/chosen": -0.18003907799720764, "logits/rejected": -0.296830415725708, "logps/chosen": -170.67274475097656, "logps/rejected": -216.8302764892578, "loss": 1.2318, "nll_loss": 0.9463116526603699, "rewards/accuracies": 0.75, "rewards/chosen": 4.955447196960449, "rewards/margins": 2.8384132385253906, "rewards/rejected": 2.1170341968536377, "step": 6860 }, { "epoch": 0.3811320231342145, "grad_norm": 83.80812072753906, "learning_rate": 6.82400076735181e-08, "logits/chosen": -0.33803707361221313, "logits/rejected": -0.462869793176651, "logps/chosen": -175.27798461914062, "logps/rejected": -253.77969360351562, "loss": 1.275, "nll_loss": 1.0101854801177979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.634285926818848, "rewards/margins": 3.6914801597595215, "rewards/rejected": 1.9428050518035889, "step": 6870 }, { "epoch": 0.38168680046046516, "grad_norm": 60.65707778930664, "learning_rate": 6.815884013682139e-08, "logits/chosen": -0.14707748591899872, "logits/rejected": -0.31664079427719116, "logps/chosen": -151.28451538085938, "logps/rejected": -203.17376708984375, "loss": 1.3218, "nll_loss": 0.8372349739074707, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.27748441696167, "rewards/margins": 4.052838325500488, "rewards/rejected": 1.2246456146240234, "step": 6880 }, { "epoch": 0.38224157778671586, "grad_norm": 78.37731170654297, "learning_rate": 6.807761743848158e-08, "logits/chosen": -0.1943022906780243, "logits/rejected": -0.36641693115234375, "logps/chosen": -173.49557495117188, "logps/rejected": -239.4379425048828, "loss": 1.2835, "nll_loss": 0.8731621503829956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.856075763702393, "rewards/margins": 3.8867263793945312, "rewards/rejected": 1.9693495035171509, "step": 6890 }, { "epoch": 0.38279635511296656, "grad_norm": 48.03612518310547, "learning_rate": 6.799633982523128e-08, "logits/chosen": -0.19575706124305725, "logits/rejected": -0.35647979378700256, "logps/chosen": -148.83901977539062, "logps/rejected": -195.1193084716797, "loss": 1.2447, "nll_loss": 0.8911144137382507, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.207388877868652, "rewards/margins": 3.3151965141296387, "rewards/rejected": 1.8921921253204346, "step": 6900 }, { "epoch": 0.3833511324392172, "grad_norm": 57.33967971801758, "learning_rate": 6.791500754396985e-08, "logits/chosen": -0.3484516739845276, "logits/rejected": -0.49889832735061646, "logps/chosen": -184.61288452148438, "logps/rejected": -240.37765502929688, "loss": 1.36, "nll_loss": 1.0312130451202393, "rewards/accuracies": 0.875, "rewards/chosen": 5.726377010345459, "rewards/margins": 3.6866767406463623, "rewards/rejected": 2.039700746536255, "step": 6910 }, { "epoch": 0.3839059097654679, "grad_norm": 52.44933319091797, "learning_rate": 6.783362084176276e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -145.9274444580078, "logps/rejected": -202.3636932373047, "loss": 1.2302, "nll_loss": NaN, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.59585428237915, "rewards/margins": 3.771801710128784, "rewards/rejected": 1.824052095413208, "step": 6920 }, { "epoch": 0.38446068709171854, "grad_norm": 75.28496551513672, "learning_rate": 6.775217996584082e-08, "logits/chosen": -0.2931756377220154, "logits/rejected": -0.3632332682609558, "logps/chosen": -188.1034698486328, "logps/rejected": -240.4395294189453, "loss": 1.3405, "nll_loss": 1.1550260782241821, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.495295524597168, "rewards/margins": 2.5328545570373535, "rewards/rejected": 2.9624409675598145, "step": 6930 }, { "epoch": 0.38501546441796924, "grad_norm": 82.79104614257812, "learning_rate": 6.767068516359935e-08, "logits/chosen": -0.3296845555305481, "logits/rejected": -0.39916354417800903, "logps/chosen": -171.7633819580078, "logps/rejected": -192.93478393554688, "loss": 1.3266, "nll_loss": 1.0685354471206665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.36434268951416, "rewards/margins": 2.498317003250122, "rewards/rejected": 2.866025447845459, "step": 6940 }, { "epoch": 0.3855702417442199, "grad_norm": 53.79294204711914, "learning_rate": 6.758913668259752e-08, "logits/chosen": -0.28452345728874207, "logits/rejected": -0.4068564474582672, "logps/chosen": -166.2971954345703, "logps/rejected": -252.1527862548828, "loss": 1.239, "nll_loss": 0.9916399717330933, "rewards/accuracies": 0.875, "rewards/chosen": 6.018822193145752, "rewards/margins": 4.4249725341796875, "rewards/rejected": 1.5938496589660645, "step": 6950 }, { "epoch": 0.3861250190704706, "grad_norm": 66.47594451904297, "learning_rate": 6.750753477055755e-08, "logits/chosen": -0.31195324659347534, "logits/rejected": -0.3991023898124695, "logps/chosen": -172.429931640625, "logps/rejected": -227.45504760742188, "loss": 1.3355, "nll_loss": 1.016535758972168, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.862557411193848, "rewards/margins": 2.859070301055908, "rewards/rejected": 3.0034873485565186, "step": 6960 }, { "epoch": 0.3866797963967213, "grad_norm": 118.2043685913086, "learning_rate": 6.742587967536397e-08, "logits/chosen": -0.25057241320610046, "logits/rejected": -0.4117020070552826, "logps/chosen": -170.09341430664062, "logps/rejected": -239.7622528076172, "loss": 1.2453, "nll_loss": 0.9312776327133179, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.926008701324463, "rewards/margins": 4.62038516998291, "rewards/rejected": 1.3056236505508423, "step": 6970 }, { "epoch": 0.38723457372297193, "grad_norm": 85.15673828125, "learning_rate": 6.734417164506285e-08, "logits/chosen": -0.015497421845793724, "logits/rejected": -0.21804824471473694, "logps/chosen": -149.40304565429688, "logps/rejected": -213.0756378173828, "loss": 1.2931, "nll_loss": 0.7935541272163391, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.081489562988281, "rewards/margins": 3.855226516723633, "rewards/rejected": 1.2262629270553589, "step": 6980 }, { "epoch": 0.38778935104922263, "grad_norm": 41.28948211669922, "learning_rate": 6.726241092786111e-08, "logits/chosen": -0.2652779221534729, "logits/rejected": -0.3552890121936798, "logps/chosen": -182.173095703125, "logps/rejected": -232.24490356445312, "loss": 1.2805, "nll_loss": 1.117540955543518, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.992371082305908, "rewards/margins": 4.001796722412109, "rewards/rejected": 1.9905742406845093, "step": 6990 }, { "epoch": 0.3883441283754733, "grad_norm": 61.72700119018555, "learning_rate": 6.718059777212566e-08, "logits/chosen": -0.27194350957870483, "logits/rejected": -0.35817548632621765, "logps/chosen": -174.56044006347656, "logps/rejected": -219.2909698486328, "loss": 1.1839, "nll_loss": 0.9531155824661255, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.9334845542907715, "rewards/margins": 3.5387001037597656, "rewards/rejected": 2.394784927368164, "step": 7000 }, { "epoch": 0.3883441283754733, "eval_logits/chosen": -0.353268563747406, "eval_logits/rejected": -0.4408227503299713, "eval_logps/chosen": -194.59059143066406, "eval_logps/rejected": -265.7848815917969, "eval_loss": 1.2542152404785156, "eval_nll_loss": 1.0102587938308716, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.337061882019043, "eval_rewards/margins": 5.007699966430664, "eval_rewards/rejected": 1.3293613195419312, "eval_runtime": 16.7119, "eval_samples_per_second": 15.318, "eval_steps_per_second": 1.915, "step": 7000 }, { "epoch": 0.388898905701724, "grad_norm": 96.24934387207031, "learning_rate": 6.709873242638272e-08, "logits/chosen": -0.34589165449142456, "logits/rejected": -0.4874038100242615, "logps/chosen": -172.8664093017578, "logps/rejected": -225.47140502929688, "loss": 1.2438, "nll_loss": 1.0253851413726807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.837738037109375, "rewards/margins": 3.9034225940704346, "rewards/rejected": 1.9343160390853882, "step": 7010 }, { "epoch": 0.3894536830279747, "grad_norm": 55.82901382446289, "learning_rate": 6.70168151393171e-08, "logits/chosen": -0.21283188462257385, "logits/rejected": -0.3394726514816284, "logps/chosen": -164.8054962158203, "logps/rejected": -225.3345947265625, "loss": 1.4238, "nll_loss": 0.9743822813034058, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.448554039001465, "rewards/margins": 3.3105034828186035, "rewards/rejected": 2.1380505561828613, "step": 7020 }, { "epoch": 0.3900084603542253, "grad_norm": 63.66157913208008, "learning_rate": 6.693484615977133e-08, "logits/chosen": -0.3501175045967102, "logits/rejected": -0.4347058832645416, "logps/chosen": -203.2848358154297, "logps/rejected": -250.0037078857422, "loss": 1.3144, "nll_loss": 1.1313148736953735, "rewards/accuracies": 0.875, "rewards/chosen": 6.2256083488464355, "rewards/margins": 3.5999526977539062, "rewards/rejected": 2.625655174255371, "step": 7030 }, { "epoch": 0.390563237680476, "grad_norm": 45.28962326049805, "learning_rate": 6.6852825736745e-08, "logits/chosen": -0.21284322440624237, "logits/rejected": -0.3513473868370056, "logps/chosen": -151.68064880371094, "logps/rejected": -213.38381958007812, "loss": 1.2049, "nll_loss": 0.8993641138076782, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.502038955688477, "rewards/margins": 3.628171920776367, "rewards/rejected": 1.8738670349121094, "step": 7040 }, { "epoch": 0.39111801500672666, "grad_norm": 30.618072509765625, "learning_rate": 6.677075411939394e-08, "logits/chosen": -0.08350099623203278, "logits/rejected": -0.31030920147895813, "logps/chosen": -151.19979858398438, "logps/rejected": -221.28964233398438, "loss": 1.2577, "nll_loss": 0.8976804614067078, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.1023359298706055, "rewards/margins": 4.105113506317139, "rewards/rejected": 0.9972225427627563, "step": 7050 }, { "epoch": 0.39167279233297736, "grad_norm": 43.067962646484375, "learning_rate": 6.668863155702955e-08, "logits/chosen": -0.22113993763923645, "logits/rejected": -0.336910605430603, "logps/chosen": -177.03387451171875, "logps/rejected": -243.845458984375, "loss": 1.185, "nll_loss": 0.9910527467727661, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.8803253173828125, "rewards/margins": 4.664963722229004, "rewards/rejected": 1.2153613567352295, "step": 7060 }, { "epoch": 0.392227569659228, "grad_norm": 40.56425476074219, "learning_rate": 6.660645829911793e-08, "logits/chosen": -0.08752115815877914, "logits/rejected": -0.25853779911994934, "logps/chosen": -148.84317016601562, "logps/rejected": -215.58499145507812, "loss": 1.3292, "nll_loss": 0.9114246368408203, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.128087997436523, "rewards/margins": 3.934530258178711, "rewards/rejected": 1.1935579776763916, "step": 7070 }, { "epoch": 0.3927823469854787, "grad_norm": 66.71498107910156, "learning_rate": 6.652423459527923e-08, "logits/chosen": -0.21504049003124237, "logits/rejected": -0.3345591723918915, "logps/chosen": -161.94309997558594, "logps/rejected": -222.70516967773438, "loss": 1.2319, "nll_loss": 0.9897100329399109, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.446173191070557, "rewards/margins": 4.251335620880127, "rewards/rejected": 1.1948375701904297, "step": 7080 }, { "epoch": 0.3933371243117294, "grad_norm": 60.648468017578125, "learning_rate": 6.644196069528676e-08, "logits/chosen": -0.3080524802207947, "logits/rejected": -0.46810024976730347, "logps/chosen": -156.82632446289062, "logps/rejected": -215.0540771484375, "loss": 1.2722, "nll_loss": 1.0197008848190308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.787126541137695, "rewards/margins": 3.726269483566284, "rewards/rejected": 2.0608572959899902, "step": 7090 }, { "epoch": 0.39389190163798005, "grad_norm": 29.4898624420166, "learning_rate": 6.635963684906644e-08, "logits/chosen": -0.2957158088684082, "logits/rejected": -0.4307606816291809, "logps/chosen": -181.65115356445312, "logps/rejected": -263.91302490234375, "loss": 1.2249, "nll_loss": 1.0066194534301758, "rewards/accuracies": 0.875, "rewards/chosen": 6.1734938621521, "rewards/margins": 4.535821914672852, "rewards/rejected": 1.6376720666885376, "step": 7100 }, { "epoch": 0.39444667896423075, "grad_norm": 86.3984603881836, "learning_rate": 6.62772633066958e-08, "logits/chosen": -0.3266414403915405, "logits/rejected": -0.47777050733566284, "logps/chosen": -178.56004333496094, "logps/rejected": -253.69216918945312, "loss": 1.3158, "nll_loss": 1.0822752714157104, "rewards/accuracies": 0.875, "rewards/chosen": 5.967124938964844, "rewards/margins": 4.293940544128418, "rewards/rejected": 1.6731847524642944, "step": 7110 }, { "epoch": 0.3950014562904814, "grad_norm": 45.96630096435547, "learning_rate": 6.619484031840338e-08, "logits/chosen": -0.31532272696495056, "logits/rejected": -0.3624531924724579, "logps/chosen": -205.3643035888672, "logps/rejected": -238.4527130126953, "loss": 1.2695, "nll_loss": 1.106184720993042, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.021925926208496, "rewards/margins": 3.5777347087860107, "rewards/rejected": 2.4441912174224854, "step": 7120 }, { "epoch": 0.3955562336167321, "grad_norm": 159.8441925048828, "learning_rate": 6.611236813456791e-08, "logits/chosen": -0.20600607991218567, "logits/rejected": -0.3467629849910736, "logps/chosen": -138.72528076171875, "logps/rejected": -222.2270965576172, "loss": 1.3248, "nll_loss": 0.9142557382583618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.90877628326416, "rewards/margins": 2.3921499252319336, "rewards/rejected": 2.5166258811950684, "step": 7130 }, { "epoch": 0.39611101094298273, "grad_norm": 19.28346824645996, "learning_rate": 6.602984700571758e-08, "logits/chosen": -0.4229269027709961, "logits/rejected": -0.5715084671974182, "logps/chosen": -167.72642517089844, "logps/rejected": -256.966796875, "loss": 1.2952, "nll_loss": 1.0315793752670288, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.049108505249023, "rewards/margins": 3.8893866539001465, "rewards/rejected": 2.1597213745117188, "step": 7140 }, { "epoch": 0.39666578826923343, "grad_norm": 52.33828353881836, "learning_rate": 6.594727718252925e-08, "logits/chosen": -0.34071478247642517, "logits/rejected": -0.4567103981971741, "logps/chosen": -173.2262420654297, "logps/rejected": -233.28775024414062, "loss": 1.2967, "nll_loss": 0.9634785652160645, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.447007179260254, "rewards/margins": 3.2220001220703125, "rewards/rejected": 2.2250072956085205, "step": 7150 }, { "epoch": 0.39722056559548413, "grad_norm": 48.68062210083008, "learning_rate": 6.586465891582768e-08, "logits/chosen": -0.34071049094200134, "logits/rejected": -0.45068711042404175, "logps/chosen": -160.2462921142578, "logps/rejected": -223.63253784179688, "loss": 1.3201, "nll_loss": 1.000896692276001, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.550699710845947, "rewards/margins": 3.277545213699341, "rewards/rejected": 2.2731540203094482, "step": 7160 }, { "epoch": 0.3977753429217348, "grad_norm": 49.71293258666992, "learning_rate": 6.578199245658486e-08, "logits/chosen": -0.23354479670524597, "logits/rejected": -0.3580577075481415, "logps/chosen": -170.16897583007812, "logps/rejected": -197.76730346679688, "loss": 1.3182, "nll_loss": 0.9331790804862976, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.230744361877441, "rewards/margins": 2.730969190597534, "rewards/rejected": 2.49977445602417, "step": 7170 }, { "epoch": 0.3983301202479855, "grad_norm": 53.93007278442383, "learning_rate": 6.569927805591908e-08, "logits/chosen": -0.24669504165649414, "logits/rejected": -0.37552201747894287, "logps/chosen": -188.89300537109375, "logps/rejected": -267.8440856933594, "loss": 1.1627, "nll_loss": 0.9904123544692993, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.9379072189331055, "rewards/margins": 4.1582183837890625, "rewards/rejected": 1.779689073562622, "step": 7180 }, { "epoch": 0.3988848975742361, "grad_norm": 209.7054443359375, "learning_rate": 6.561651596509432e-08, "logits/chosen": -0.2623883783817291, "logits/rejected": -0.4429514408111572, "logps/chosen": -176.68624877929688, "logps/rejected": -223.68154907226562, "loss": 1.2484, "nll_loss": 0.9244028925895691, "rewards/accuracies": 0.875, "rewards/chosen": 5.663640022277832, "rewards/margins": 3.5748519897460938, "rewards/rejected": 2.08878755569458, "step": 7190 }, { "epoch": 0.3994396749004868, "grad_norm": 38.98826217651367, "learning_rate": 6.553370643551945e-08, "logits/chosen": -0.2990649342536926, "logits/rejected": -0.4146324694156647, "logps/chosen": -183.01441955566406, "logps/rejected": -265.43853759765625, "loss": 1.3292, "nll_loss": 0.9667603373527527, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.6139750480651855, "rewards/margins": 3.454200029373169, "rewards/rejected": 2.1597747802734375, "step": 7200 }, { "epoch": 0.3999944522267375, "grad_norm": 54.74693298339844, "learning_rate": 6.545084971874738e-08, "logits/chosen": -0.25584933161735535, "logits/rejected": -0.3960145115852356, "logps/chosen": -148.88522338867188, "logps/rejected": -198.43917846679688, "loss": 1.2716, "nll_loss": 0.9009321331977844, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.381265163421631, "rewards/margins": 3.4116127490997314, "rewards/rejected": 1.9696524143218994, "step": 7210 }, { "epoch": 0.40054922955298816, "grad_norm": 58.379478454589844, "learning_rate": 6.53679460664744e-08, "logits/chosen": -0.34522515535354614, "logits/rejected": -0.45888328552246094, "logps/chosen": -163.30357360839844, "logps/rejected": -226.4535369873047, "loss": 1.2642, "nll_loss": 1.0160595178604126, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.892518043518066, "rewards/margins": 3.9560694694519043, "rewards/rejected": 1.9364478588104248, "step": 7220 }, { "epoch": 0.40110400687923886, "grad_norm": 89.80985260009766, "learning_rate": 6.528499573053938e-08, "logits/chosen": -0.30434325337409973, "logits/rejected": -0.48230838775634766, "logps/chosen": -167.24014282226562, "logps/rejected": -245.2574005126953, "loss": 1.308, "nll_loss": 0.9712599515914917, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.582326412200928, "rewards/margins": 4.144760608673096, "rewards/rejected": 1.4375665187835693, "step": 7230 }, { "epoch": 0.4016587842054895, "grad_norm": 55.74021911621094, "learning_rate": 6.520199896292299e-08, "logits/chosen": -0.32192081212997437, "logits/rejected": -0.48280245065689087, "logps/chosen": -189.1600341796875, "logps/rejected": -241.54629516601562, "loss": 1.3141, "nll_loss": 0.9787147641181946, "rewards/accuracies": 0.875, "rewards/chosen": 5.8414106369018555, "rewards/margins": 4.506801128387451, "rewards/rejected": 1.3346093893051147, "step": 7240 }, { "epoch": 0.4022135615317402, "grad_norm": 57.21150207519531, "learning_rate": 6.511895601574698e-08, "logits/chosen": -0.3998798727989197, "logits/rejected": -0.5131269693374634, "logps/chosen": -195.90878295898438, "logps/rejected": -243.5328369140625, "loss": 1.2844, "nll_loss": 1.0310008525848389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.184939384460449, "rewards/margins": 3.873645305633545, "rewards/rejected": 2.3112943172454834, "step": 7250 }, { "epoch": 0.40276833885799085, "grad_norm": 56.23503875732422, "learning_rate": 6.503586714127331e-08, "logits/chosen": -0.039901845157146454, "logits/rejected": -0.25853031873703003, "logps/chosen": -115.4613037109375, "logps/rejected": -147.33761596679688, "loss": 1.3694, "nll_loss": 0.7300417423248291, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.834766387939453, "rewards/margins": 2.127188205718994, "rewards/rejected": 1.7075786590576172, "step": 7260 }, { "epoch": 0.40332311618424155, "grad_norm": 109.07487487792969, "learning_rate": 6.495273259190355e-08, "logits/chosen": -0.3013862371444702, "logits/rejected": -0.4309801161289215, "logps/chosen": -156.0321502685547, "logps/rejected": -198.90753173828125, "loss": 1.4112, "nll_loss": 1.0063666105270386, "rewards/accuracies": 0.75, "rewards/chosen": 4.712536811828613, "rewards/margins": 2.2817347049713135, "rewards/rejected": 2.4308016300201416, "step": 7270 }, { "epoch": 0.40387789351049225, "grad_norm": 54.24409103393555, "learning_rate": 6.486955262017794e-08, "logits/chosen": -0.346465528011322, "logits/rejected": -0.39551910758018494, "logps/chosen": -209.0965118408203, "logps/rejected": -232.0498809814453, "loss": 1.317, "nll_loss": 1.1299588680267334, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.459007263183594, "rewards/margins": 3.1379857063293457, "rewards/rejected": 3.321021556854248, "step": 7280 }, { "epoch": 0.4044326708367429, "grad_norm": 87.00990295410156, "learning_rate": 6.478632747877472e-08, "logits/chosen": -0.29052549600601196, "logits/rejected": -0.43641576170921326, "logps/chosen": -188.79385375976562, "logps/rejected": -254.0786590576172, "loss": 1.2827, "nll_loss": 0.95270836353302, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.114146709442139, "rewards/margins": 3.6468193531036377, "rewards/rejected": 2.467327356338501, "step": 7290 }, { "epoch": 0.4049874481629936, "grad_norm": 89.29615020751953, "learning_rate": 6.470305742050936e-08, "logits/chosen": -0.2050275355577469, "logits/rejected": -0.3741667866706848, "logps/chosen": -141.3899688720703, "logps/rejected": -197.33541870117188, "loss": 1.2951, "nll_loss": 0.8298746943473816, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.978948593139648, "rewards/margins": 2.2912750244140625, "rewards/rejected": 2.687673568725586, "step": 7300 }, { "epoch": 0.40554222548924423, "grad_norm": 63.30764389038086, "learning_rate": 6.461974269833378e-08, "logits/chosen": -0.22644257545471191, "logits/rejected": -0.36334604024887085, "logps/chosen": -170.58517456054688, "logps/rejected": -225.7571258544922, "loss": 1.3345, "nll_loss": 0.973869800567627, "rewards/accuracies": 0.875, "rewards/chosen": 5.511404991149902, "rewards/margins": 3.0940215587615967, "rewards/rejected": 2.417384386062622, "step": 7310 }, { "epoch": 0.40609700281549493, "grad_norm": 74.49880981445312, "learning_rate": 6.453638356533555e-08, "logits/chosen": -0.32636964321136475, "logits/rejected": -0.4954894185066223, "logps/chosen": -161.60977172851562, "logps/rejected": -225.15234375, "loss": 1.295, "nll_loss": 0.973361611366272, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.697045803070068, "rewards/margins": 3.5218639373779297, "rewards/rejected": 2.1751818656921387, "step": 7320 }, { "epoch": 0.40665178014174563, "grad_norm": 62.1312370300293, "learning_rate": 6.445298027473716e-08, "logits/chosen": -0.3880705237388611, "logits/rejected": -0.5019701719284058, "logps/chosen": -182.28811645507812, "logps/rejected": -240.3295440673828, "loss": 1.249, "nll_loss": 1.040808081626892, "rewards/accuracies": 0.875, "rewards/chosen": 6.146364688873291, "rewards/margins": 3.690261125564575, "rewards/rejected": 2.456103563308716, "step": 7330 }, { "epoch": 0.4072065574679963, "grad_norm": 42.314449310302734, "learning_rate": 6.436953307989523e-08, "logits/chosen": -0.28808385133743286, "logits/rejected": -0.4283718466758728, "logps/chosen": -176.02244567871094, "logps/rejected": -221.84640502929688, "loss": 1.266, "nll_loss": 0.9531732797622681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.6160783767700195, "rewards/margins": 3.110626697540283, "rewards/rejected": 2.5054516792297363, "step": 7340 }, { "epoch": 0.407761334794247, "grad_norm": 77.77169036865234, "learning_rate": 6.428604223429979e-08, "logits/chosen": -0.3338751196861267, "logits/rejected": -0.520000696182251, "logps/chosen": -147.56625366210938, "logps/rejected": -203.83847045898438, "loss": 1.2764, "nll_loss": 0.9144219160079956, "rewards/accuracies": 0.875, "rewards/chosen": 5.272387981414795, "rewards/margins": 3.1382250785827637, "rewards/rejected": 2.134162664413452, "step": 7350 }, { "epoch": 0.4083161121204976, "grad_norm": 51.88246536254883, "learning_rate": 6.420250799157342e-08, "logits/chosen": -0.3343006670475006, "logits/rejected": -0.4886436462402344, "logps/chosen": -129.22486877441406, "logps/rejected": -158.54843139648438, "loss": 1.3176, "nll_loss": 0.842542290687561, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.768006801605225, "rewards/margins": 2.8421809673309326, "rewards/rejected": 1.9258254766464233, "step": 7360 }, { "epoch": 0.4088708894467483, "grad_norm": 65.615966796875, "learning_rate": 6.411893060547055e-08, "logits/chosen": -0.10722409188747406, "logits/rejected": -0.30478712916374207, "logps/chosen": -141.62173461914062, "logps/rejected": -174.9485321044922, "loss": 1.2427, "nll_loss": 0.8185256719589233, "rewards/accuracies": 0.875, "rewards/chosen": 4.766512870788574, "rewards/margins": 2.7977631092071533, "rewards/rejected": 1.968750238418579, "step": 7370 }, { "epoch": 0.40942566677299896, "grad_norm": 45.00594711303711, "learning_rate": 6.403531032987667e-08, "logits/chosen": -0.38103166222572327, "logits/rejected": -0.5468229055404663, "logps/chosen": -189.6179656982422, "logps/rejected": -287.31390380859375, "loss": 1.3187, "nll_loss": 0.9494185447692871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.6198248863220215, "rewards/margins": 5.335195541381836, "rewards/rejected": 1.2846286296844482, "step": 7380 }, { "epoch": 0.40998044409924966, "grad_norm": 54.82295227050781, "learning_rate": 6.395164741880753e-08, "logits/chosen": -0.3098643720149994, "logits/rejected": -0.4438135027885437, "logps/chosen": -164.07351684570312, "logps/rejected": -229.0188446044922, "loss": 1.3101, "nll_loss": 0.9140411615371704, "rewards/accuracies": 0.75, "rewards/chosen": 5.3129096031188965, "rewards/margins": 3.516838788986206, "rewards/rejected": 1.7960714101791382, "step": 7390 }, { "epoch": 0.41053522142550036, "grad_norm": 40.2969856262207, "learning_rate": 6.386794212640845e-08, "logits/chosen": -0.2711586356163025, "logits/rejected": -0.4007849097251892, "logps/chosen": -150.17539978027344, "logps/rejected": -182.2269744873047, "loss": 1.1839, "nll_loss": 0.8594070672988892, "rewards/accuracies": 0.875, "rewards/chosen": 5.0169172286987305, "rewards/margins": 3.0875864028930664, "rewards/rejected": 1.929330587387085, "step": 7400 }, { "epoch": 0.411089998751751, "grad_norm": 54.541927337646484, "learning_rate": 6.378419470695342e-08, "logits/chosen": -0.43582743406295776, "logits/rejected": -0.6034985184669495, "logps/chosen": -183.70852661132812, "logps/rejected": -258.99169921875, "loss": 1.2531, "nll_loss": 1.0488944053649902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.260799407958984, "rewards/margins": 4.284533500671387, "rewards/rejected": 1.9762649536132812, "step": 7410 }, { "epoch": 0.4116447760780017, "grad_norm": 51.705360412597656, "learning_rate": 6.370040541484449e-08, "logits/chosen": -0.24617047607898712, "logits/rejected": -0.4318224787712097, "logps/chosen": -178.9082489013672, "logps/rejected": -240.6511993408203, "loss": 1.2556, "nll_loss": 0.9786630868911743, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.324519157409668, "rewards/margins": 3.5243849754333496, "rewards/rejected": 1.8001340627670288, "step": 7420 }, { "epoch": 0.41219955340425235, "grad_norm": 40.38996505737305, "learning_rate": 6.361657450461084e-08, "logits/chosen": -0.2616347670555115, "logits/rejected": -0.4238702654838562, "logps/chosen": -124.56805419921875, "logps/rejected": -181.3067169189453, "loss": 1.2497, "nll_loss": 0.8138860464096069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.362429141998291, "rewards/margins": 3.0529863834381104, "rewards/rejected": 2.3094425201416016, "step": 7430 }, { "epoch": 0.41275433073050305, "grad_norm": 44.85422134399414, "learning_rate": 6.353270223090806e-08, "logits/chosen": -0.32470908761024475, "logits/rejected": -0.41462892293930054, "logps/chosen": -198.8808135986328, "logps/rejected": -252.1754608154297, "loss": 1.294, "nll_loss": 1.0325891971588135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 6.107321262359619, "rewards/margins": 3.250342845916748, "rewards/rejected": 2.856978416442871, "step": 7440 }, { "epoch": 0.4133091080567537, "grad_norm": 33.28203582763672, "learning_rate": 6.344878884851746e-08, "logits/chosen": -0.42162925004959106, "logits/rejected": -0.5931268930435181, "logps/chosen": -183.57373046875, "logps/rejected": -248.9408416748047, "loss": 1.3005, "nll_loss": 1.037781000137329, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.289162635803223, "rewards/margins": 5.122281074523926, "rewards/rejected": 1.1668803691864014, "step": 7450 }, { "epoch": 0.4138638853830044, "grad_norm": 75.61338806152344, "learning_rate": 6.33648346123452e-08, "logits/chosen": -0.3089607357978821, "logits/rejected": -0.4007699489593506, "logps/chosen": -183.18475341796875, "logps/rejected": -222.86441040039062, "loss": 1.2974, "nll_loss": 1.0575367212295532, "rewards/accuracies": 0.875, "rewards/chosen": 5.529407978057861, "rewards/margins": 2.9048867225646973, "rewards/rejected": 2.624520778656006, "step": 7460 }, { "epoch": 0.4144186627092551, "grad_norm": 41.54254913330078, "learning_rate": 6.32808397774215e-08, "logits/chosen": -0.37035509943962097, "logits/rejected": -0.54868483543396, "logps/chosen": -169.4999542236328, "logps/rejected": -246.3911590576172, "loss": 1.2784, "nll_loss": 1.0069705247879028, "rewards/accuracies": 0.875, "rewards/chosen": 5.76138973236084, "rewards/margins": 4.051873207092285, "rewards/rejected": 1.7095155715942383, "step": 7470 }, { "epoch": 0.41497344003550574, "grad_norm": 45.810035705566406, "learning_rate": 6.319680459889995e-08, "logits/chosen": -0.17138861119747162, "logits/rejected": -0.3850787281990051, "logps/chosen": -146.49534606933594, "logps/rejected": -208.6310577392578, "loss": 1.1755, "nll_loss": 0.8154012560844421, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.7701897621154785, "rewards/margins": 4.011888027191162, "rewards/rejected": 0.7583020329475403, "step": 7480 }, { "epoch": 0.41552821736175644, "grad_norm": 76.499267578125, "learning_rate": 6.311272933205672e-08, "logits/chosen": -0.31379497051239014, "logits/rejected": -0.3845873475074768, "logps/chosen": -177.3786163330078, "logps/rejected": -214.22543334960938, "loss": 1.3975, "nll_loss": 1.2314379215240479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.535956859588623, "rewards/margins": 3.367109775543213, "rewards/rejected": 2.1688480377197266, "step": 7490 }, { "epoch": 0.4160829946880071, "grad_norm": 63.57807922363281, "learning_rate": 6.302861423228967e-08, "logits/chosen": -0.4402855932712555, "logits/rejected": -0.5427097678184509, "logps/chosen": -202.45889282226562, "logps/rejected": -282.24188232421875, "loss": 1.2701, "nll_loss": 1.1113221645355225, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.235448360443115, "rewards/margins": 3.985610246658325, "rewards/rejected": 2.24983811378479, "step": 7500 }, { "epoch": 0.4160829946880071, "eval_logits/chosen": -0.398401141166687, "eval_logits/rejected": -0.49995607137680054, "eval_logps/chosen": -193.20004272460938, "eval_logps/rejected": -259.1756591796875, "eval_loss": 1.2661241292953491, "eval_nll_loss": 1.002261757850647, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": 6.4761176109313965, "eval_rewards/margins": 4.485833644866943, "eval_rewards/rejected": 1.9902844429016113, "eval_runtime": 17.0997, "eval_samples_per_second": 14.971, "eval_steps_per_second": 1.871, "step": 7500 }, { "epoch": 0.4166377720142578, "grad_norm": 34.99973678588867, "learning_rate": 6.294445955511774e-08, "logits/chosen": -0.29553765058517456, "logits/rejected": -0.4380635619163513, "logps/chosen": -183.63485717773438, "logps/rejected": -227.50021362304688, "loss": 1.3205, "nll_loss": 1.03634512424469, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.569860458374023, "rewards/margins": 3.396176815032959, "rewards/rejected": 2.173682689666748, "step": 7510 }, { "epoch": 0.4171925493405085, "grad_norm": 48.45505905151367, "learning_rate": 6.286026555618009e-08, "logits/chosen": -0.3055883049964905, "logits/rejected": -0.469032347202301, "logps/chosen": -161.53659057617188, "logps/rejected": -226.0988311767578, "loss": 1.2877, "nll_loss": 0.8650112152099609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.755152225494385, "rewards/margins": 4.042037010192871, "rewards/rejected": 1.7131156921386719, "step": 7520 }, { "epoch": 0.4177473266667591, "grad_norm": 55.78778076171875, "learning_rate": 6.277603249123526e-08, "logits/chosen": -0.36147961020469666, "logits/rejected": -0.4718469977378845, "logps/chosen": -143.3127899169922, "logps/rejected": -181.94027709960938, "loss": 1.2009, "nll_loss": 0.9830204248428345, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.296572685241699, "rewards/margins": 2.2858357429504395, "rewards/rejected": 3.0107367038726807, "step": 7530 }, { "epoch": 0.4183021039930098, "grad_norm": 80.88349151611328, "learning_rate": 6.269176061616056e-08, "logits/chosen": -0.14564813673496246, "logits/rejected": -0.3274744749069214, "logps/chosen": -134.18099975585938, "logps/rejected": -197.35910034179688, "loss": 1.3682, "nll_loss": 0.7621486186981201, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.868192195892334, "rewards/margins": 3.5857315063476562, "rewards/rejected": 1.2824609279632568, "step": 7540 }, { "epoch": 0.41885688131926047, "grad_norm": 51.41616439819336, "learning_rate": 6.260745018695112e-08, "logits/chosen": -0.36704492568969727, "logits/rejected": -0.47699612379074097, "logps/chosen": -164.365478515625, "logps/rejected": -218.2649688720703, "loss": 1.2076, "nll_loss": 0.9271243810653687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.707385540008545, "rewards/margins": 3.966140031814575, "rewards/rejected": 1.7412458658218384, "step": 7550 }, { "epoch": 0.41941165864551117, "grad_norm": 49.04743576049805, "learning_rate": 6.25231014597192e-08, "logits/chosen": -0.3328538239002228, "logits/rejected": -0.46134382486343384, "logps/chosen": -156.789306640625, "logps/rejected": -211.8652801513672, "loss": 1.2794, "nll_loss": 0.9207913279533386, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.7461256980896, "rewards/margins": 3.3402061462402344, "rewards/rejected": 2.4059205055236816, "step": 7560 }, { "epoch": 0.4199664359717618, "grad_norm": 83.76142883300781, "learning_rate": 6.243871469069344e-08, "logits/chosen": -0.3034781813621521, "logits/rejected": -0.49480685591697693, "logps/chosen": -167.407470703125, "logps/rejected": -225.42703247070312, "loss": 1.2352, "nll_loss": 0.9763292074203491, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.7099928855896, "rewards/margins": 3.6460483074188232, "rewards/rejected": 2.063944101333618, "step": 7570 }, { "epoch": 0.4205212132980125, "grad_norm": 78.60682678222656, "learning_rate": 6.235429013621798e-08, "logits/chosen": -0.23759326338768005, "logits/rejected": -0.4189354479312897, "logps/chosen": -144.0743408203125, "logps/rejected": -215.2904815673828, "loss": 1.2802, "nll_loss": 0.8422958254814148, "rewards/accuracies": 0.875, "rewards/chosen": 5.4411234855651855, "rewards/margins": 3.513167142868042, "rewards/rejected": 1.927955985069275, "step": 7580 }, { "epoch": 0.4210759906242632, "grad_norm": 56.77098083496094, "learning_rate": 6.226982805275181e-08, "logits/chosen": -0.32095006108283997, "logits/rejected": -0.548249363899231, "logps/chosen": -151.5083465576172, "logps/rejected": -217.2533416748047, "loss": 1.3749, "nll_loss": 0.8653049468994141, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.12197732925415, "rewards/margins": 3.2202446460723877, "rewards/rejected": 1.9017328023910522, "step": 7590 }, { "epoch": 0.42163076795051385, "grad_norm": 49.3482780456543, "learning_rate": 6.218532869686786e-08, "logits/chosen": -0.22720670700073242, "logits/rejected": -0.4425369203090668, "logps/chosen": -140.2606964111328, "logps/rejected": -214.157470703125, "loss": 1.2169, "nll_loss": 0.7681783437728882, "rewards/accuracies": 0.875, "rewards/chosen": 5.133579730987549, "rewards/margins": 4.256910800933838, "rewards/rejected": 0.8766688108444214, "step": 7600 }, { "epoch": 0.42218554527676455, "grad_norm": 82.47681427001953, "learning_rate": 6.210079232525232e-08, "logits/chosen": -0.346763551235199, "logits/rejected": -0.4864253103733063, "logps/chosen": -176.0386962890625, "logps/rejected": -258.1570129394531, "loss": 1.2363, "nll_loss": 0.9986612200737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.702577590942383, "rewards/margins": 3.368680477142334, "rewards/rejected": 2.333897352218628, "step": 7610 }, { "epoch": 0.4227403226030152, "grad_norm": 63.12968826293945, "learning_rate": 6.201621919470382e-08, "logits/chosen": -0.2454340159893036, "logits/rejected": -0.38405701518058777, "logps/chosen": -192.60916137695312, "logps/rejected": -259.3959045410156, "loss": 1.3147, "nll_loss": 0.9730021357536316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.158176422119141, "rewards/margins": 4.365778923034668, "rewards/rejected": 1.792396903038025, "step": 7620 }, { "epoch": 0.4232950999292659, "grad_norm": 90.59529876708984, "learning_rate": 6.193160956213261e-08, "logits/chosen": -0.16950708627700806, "logits/rejected": -0.4081265926361084, "logps/chosen": -146.45358276367188, "logps/rejected": -197.4040069580078, "loss": 1.2613, "nll_loss": 0.8664730787277222, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.412256240844727, "rewards/margins": 3.448835849761963, "rewards/rejected": 1.9634202718734741, "step": 7630 }, { "epoch": 0.4238498772555166, "grad_norm": 41.387332916259766, "learning_rate": 6.184696368455991e-08, "logits/chosen": -0.3791458308696747, "logits/rejected": -0.5487962961196899, "logps/chosen": -178.0819549560547, "logps/rejected": -246.73733520507812, "loss": 1.2968, "nll_loss": 0.9921107292175293, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.117079734802246, "rewards/margins": 4.516340255737305, "rewards/rejected": 1.6007391214370728, "step": 7640 }, { "epoch": 0.42440465458176724, "grad_norm": 67.04618835449219, "learning_rate": 6.176228181911699e-08, "logits/chosen": -0.37947243452072144, "logits/rejected": -0.49038830399513245, "logps/chosen": -164.1470947265625, "logps/rejected": -222.6766357421875, "loss": 1.3031, "nll_loss": 1.011852502822876, "rewards/accuracies": 0.875, "rewards/chosen": 5.655206680297852, "rewards/margins": 3.4010486602783203, "rewards/rejected": 2.2541584968566895, "step": 7650 }, { "epoch": 0.42495943190801794, "grad_norm": 80.08260345458984, "learning_rate": 6.167756422304439e-08, "logits/chosen": -0.21255847811698914, "logits/rejected": -0.47834545373916626, "logps/chosen": -147.35255432128906, "logps/rejected": -204.34854125976562, "loss": 1.2285, "nll_loss": 0.8217668533325195, "rewards/accuracies": 0.875, "rewards/chosen": 5.1367998123168945, "rewards/margins": 3.8483917713165283, "rewards/rejected": 1.288407564163208, "step": 7660 }, { "epoch": 0.4255142092342686, "grad_norm": 72.43340301513672, "learning_rate": 6.159281115369131e-08, "logits/chosen": -0.34400323033332825, "logits/rejected": -0.49123507738113403, "logps/chosen": -186.2705078125, "logps/rejected": -230.4435577392578, "loss": 1.3026, "nll_loss": 1.2007331848144531, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.783455848693848, "rewards/margins": 3.2348504066467285, "rewards/rejected": 2.5486056804656982, "step": 7670 }, { "epoch": 0.4260689865605193, "grad_norm": 63.78506088256836, "learning_rate": 6.150802286851461e-08, "logits/chosen": -0.40512070059776306, "logits/rejected": -0.5488404631614685, "logps/chosen": -159.0139923095703, "logps/rejected": -209.39480590820312, "loss": 1.3335, "nll_loss": 0.9720331430435181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.443997383117676, "rewards/margins": 3.3262696266174316, "rewards/rejected": 2.117727279663086, "step": 7680 }, { "epoch": 0.4266237638867699, "grad_norm": 102.8187484741211, "learning_rate": 6.142319962507817e-08, "logits/chosen": -0.3015509247779846, "logits/rejected": -0.4275835156440735, "logps/chosen": -171.422607421875, "logps/rejected": -211.2342529296875, "loss": 1.3355, "nll_loss": 0.940872848033905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.379124641418457, "rewards/margins": 3.149885416030884, "rewards/rejected": 2.2292397022247314, "step": 7690 }, { "epoch": 0.4271785412130206, "grad_norm": 106.95166778564453, "learning_rate": 6.133834168105205e-08, "logits/chosen": -0.29108524322509766, "logits/rejected": -0.5013249516487122, "logps/chosen": -143.29544067382812, "logps/rejected": -204.5666961669922, "loss": 1.3016, "nll_loss": 0.8779792785644531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.290759086608887, "rewards/margins": 3.5017638206481934, "rewards/rejected": 1.7889950275421143, "step": 7700 }, { "epoch": 0.4277333185392713, "grad_norm": 68.40714263916016, "learning_rate": 6.125344929421172e-08, "logits/chosen": -0.305257648229599, "logits/rejected": -0.3654315769672394, "logps/chosen": -197.51206970214844, "logps/rejected": -251.21578979492188, "loss": 1.2782, "nll_loss": 1.050001859664917, "rewards/accuracies": 0.875, "rewards/chosen": 6.111239910125732, "rewards/margins": 3.144008159637451, "rewards/rejected": 2.967231512069702, "step": 7710 }, { "epoch": 0.42828809586552197, "grad_norm": 87.34870910644531, "learning_rate": 6.116852272243728e-08, "logits/chosen": -0.242166668176651, "logits/rejected": -0.48238492012023926, "logps/chosen": -162.7891082763672, "logps/rejected": -229.2385711669922, "loss": 1.3349, "nll_loss": 0.8462405204772949, "rewards/accuracies": 0.875, "rewards/chosen": 5.6684393882751465, "rewards/margins": 3.6242847442626953, "rewards/rejected": 2.044154405593872, "step": 7720 }, { "epoch": 0.42884287319177267, "grad_norm": 76.2271499633789, "learning_rate": 6.108356222371268e-08, "logits/chosen": -0.12172901630401611, "logits/rejected": -0.3345574736595154, "logps/chosen": -122.95631408691406, "logps/rejected": -185.2032470703125, "loss": 1.3135, "nll_loss": 0.8006790280342102, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.69210147857666, "rewards/margins": 2.9726243019104004, "rewards/rejected": 1.7194769382476807, "step": 7730 }, { "epoch": 0.4293976505180233, "grad_norm": 69.55945587158203, "learning_rate": 6.099856805612493e-08, "logits/chosen": -0.25045618414878845, "logits/rejected": -0.4043782651424408, "logps/chosen": -142.79592895507812, "logps/rejected": -229.7549285888672, "loss": 1.2622, "nll_loss": 0.8604512214660645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.3889946937561035, "rewards/margins": 4.070669651031494, "rewards/rejected": 1.3183256387710571, "step": 7740 }, { "epoch": 0.429952427844274, "grad_norm": 77.58515930175781, "learning_rate": 6.091354047786332e-08, "logits/chosen": -0.31509530544281006, "logits/rejected": -0.42753204703330994, "logps/chosen": -145.33651733398438, "logps/rejected": -180.94540405273438, "loss": 1.2589, "nll_loss": 0.9200709462165833, "rewards/accuracies": 0.875, "rewards/chosen": 5.249917030334473, "rewards/margins": 3.6292223930358887, "rewards/rejected": 1.6206945180892944, "step": 7750 }, { "epoch": 0.43050720517052465, "grad_norm": 100.63086700439453, "learning_rate": 6.082847974721861e-08, "logits/chosen": -0.24173316359519958, "logits/rejected": -0.40897685289382935, "logps/chosen": -144.01553344726562, "logps/rejected": -218.42141723632812, "loss": 1.2609, "nll_loss": 0.9144455790519714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.449666500091553, "rewards/margins": 3.5731265544891357, "rewards/rejected": 1.876539945602417, "step": 7760 }, { "epoch": 0.43106198249677535, "grad_norm": 48.98227310180664, "learning_rate": 6.074338612258229e-08, "logits/chosen": -0.4577174186706543, "logits/rejected": -0.5763063430786133, "logps/chosen": -181.20730590820312, "logps/rejected": -251.55136108398438, "loss": 1.3029, "nll_loss": 1.092089056968689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.994641304016113, "rewards/margins": 3.4837639331817627, "rewards/rejected": 2.5108776092529297, "step": 7770 }, { "epoch": 0.43161675982302605, "grad_norm": 114.86874389648438, "learning_rate": 6.065825986244578e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -161.11328125, "logps/rejected": -193.9613037109375, "loss": 1.272, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 5.616585731506348, "rewards/margins": 3.6385693550109863, "rewards/rejected": 1.9780166149139404, "step": 7780 }, { "epoch": 0.4321715371492767, "grad_norm": 58.877315521240234, "learning_rate": 6.057310122539963e-08, "logits/chosen": -0.2762307822704315, "logits/rejected": -0.420022577047348, "logps/chosen": -127.3053207397461, "logps/rejected": -196.6819305419922, "loss": 1.3587, "nll_loss": 0.8521549105644226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.0783371925354, "rewards/margins": 3.1709678173065186, "rewards/rejected": 1.9073699712753296, "step": 7790 }, { "epoch": 0.4327263144755274, "grad_norm": 61.59865188598633, "learning_rate": 6.04879104701327e-08, "logits/chosen": -0.3291458487510681, "logits/rejected": -0.44707727432250977, "logps/chosen": -156.1598358154297, "logps/rejected": -205.094970703125, "loss": 1.2417, "nll_loss": 0.9146450161933899, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.510323524475098, "rewards/margins": 3.3687164783477783, "rewards/rejected": 2.1416075229644775, "step": 7800 }, { "epoch": 0.43328109180177804, "grad_norm": 89.74889373779297, "learning_rate": 6.04026878554315e-08, "logits/chosen": -0.3558953106403351, "logits/rejected": -0.4734135568141937, "logps/chosen": -167.73020935058594, "logps/rejected": -225.3911895751953, "loss": 1.3202, "nll_loss": 1.0521650314331055, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.655695915222168, "rewards/margins": 3.121359348297119, "rewards/rejected": 2.5343360900878906, "step": 7810 }, { "epoch": 0.43383586912802874, "grad_norm": 48.8852653503418, "learning_rate": 6.031743364017922e-08, "logits/chosen": -0.31145888566970825, "logits/rejected": -0.3819272518157959, "logps/chosen": -169.56024169921875, "logps/rejected": -249.2076416015625, "loss": 1.2963, "nll_loss": 0.9819513559341431, "rewards/accuracies": 0.75, "rewards/chosen": 5.6514410972595215, "rewards/margins": 3.171518087387085, "rewards/rejected": 2.479923725128174, "step": 7820 }, { "epoch": 0.43439064645427944, "grad_norm": 40.11418151855469, "learning_rate": 6.023214808335516e-08, "logits/chosen": -0.1905912607908249, "logits/rejected": -0.36486178636550903, "logps/chosen": -163.63925170898438, "logps/rejected": -221.80239868164062, "loss": 1.2658, "nll_loss": 0.9231510162353516, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.2673845291137695, "rewards/margins": 3.396528720855713, "rewards/rejected": 1.8708559274673462, "step": 7830 }, { "epoch": 0.4349454237805301, "grad_norm": 81.7651138305664, "learning_rate": 6.014683144403374e-08, "logits/chosen": -0.008128717541694641, "logits/rejected": -0.19771425426006317, "logps/chosen": -112.61724853515625, "logps/rejected": -176.70272827148438, "loss": 1.2552, "nll_loss": 0.7131599187850952, "rewards/accuracies": 0.875, "rewards/chosen": 4.644561767578125, "rewards/margins": 3.75565767288208, "rewards/rejected": 0.8889042735099792, "step": 7840 }, { "epoch": 0.4355002011067808, "grad_norm": 65.13880920410156, "learning_rate": 6.006148398138382e-08, "logits/chosen": -0.3456365466117859, "logits/rejected": -0.48094815015792847, "logps/chosen": -161.97412109375, "logps/rejected": -196.0426025390625, "loss": 1.3188, "nll_loss": 0.9527362585067749, "rewards/accuracies": 0.875, "rewards/chosen": 5.310242652893066, "rewards/margins": 3.3656864166259766, "rewards/rejected": 1.944556474685669, "step": 7850 }, { "epoch": 0.4360549784330314, "grad_norm": 49.75828170776367, "learning_rate": 5.997610595466792e-08, "logits/chosen": -0.2645300030708313, "logits/rejected": -0.40959396958351135, "logps/chosen": -147.76052856445312, "logps/rejected": -211.4885711669922, "loss": 1.308, "nll_loss": 0.8695026636123657, "rewards/accuracies": 0.875, "rewards/chosen": 5.175740718841553, "rewards/margins": 4.212712287902832, "rewards/rejected": 0.9630285501480103, "step": 7860 }, { "epoch": 0.4366097557592821, "grad_norm": 45.04252243041992, "learning_rate": 5.989069762324135e-08, "logits/chosen": -0.1510230004787445, "logits/rejected": -0.3848511874675751, "logps/chosen": -152.15814208984375, "logps/rejected": -230.59036254882812, "loss": 1.3166, "nll_loss": 0.8440617322921753, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.116968154907227, "rewards/margins": 4.361637115478516, "rewards/rejected": 0.7553306818008423, "step": 7870 }, { "epoch": 0.43716453308553277, "grad_norm": 59.32204818725586, "learning_rate": 5.980525924655152e-08, "logits/chosen": -0.3439778685569763, "logits/rejected": -0.4573608338832855, "logps/chosen": -202.69509887695312, "logps/rejected": -277.2159423828125, "loss": 1.269, "nll_loss": 1.026094675064087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.560296058654785, "rewards/margins": 4.462855339050293, "rewards/rejected": 2.097440481185913, "step": 7880 }, { "epoch": 0.43771931041178347, "grad_norm": 68.54647064208984, "learning_rate": 5.971979108413714e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -156.3693389892578, "logps/rejected": -215.7193603515625, "loss": 1.3733, "nll_loss": NaN, "rewards/accuracies": 0.75, "rewards/chosen": 5.428963661193848, "rewards/margins": 3.6827340126037598, "rewards/rejected": 1.7462295293807983, "step": 7890 }, { "epoch": 0.43827408773803417, "grad_norm": 77.02647399902344, "learning_rate": 5.96342933956273e-08, "logits/chosen": -0.24855390191078186, "logits/rejected": -0.3666438162326813, "logps/chosen": -145.66111755371094, "logps/rejected": -181.38365173339844, "loss": 1.2417, "nll_loss": 1.033527135848999, "rewards/accuracies": 0.75, "rewards/chosen": 4.633823871612549, "rewards/margins": 2.5259013175964355, "rewards/rejected": 2.1079225540161133, "step": 7900 }, { "epoch": 0.4388288650642848, "grad_norm": 145.5952606201172, "learning_rate": 5.9548766440740906e-08, "logits/chosen": -0.31071895360946655, "logits/rejected": -0.4172093868255615, "logps/chosen": -172.99037170410156, "logps/rejected": -224.00509643554688, "loss": 1.3486, "nll_loss": 0.9945386648178101, "rewards/accuracies": 0.875, "rewards/chosen": 6.091534614562988, "rewards/margins": 4.539047718048096, "rewards/rejected": 1.5524866580963135, "step": 7910 }, { "epoch": 0.4393836423905355, "grad_norm": 45.83077621459961, "learning_rate": 5.9463210479285674e-08, "logits/chosen": -0.20147080719470978, "logits/rejected": -0.36271151900291443, "logps/chosen": -136.16787719726562, "logps/rejected": -197.44253540039062, "loss": 1.2506, "nll_loss": 0.8147318959236145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.147400856018066, "rewards/margins": 3.7860970497131348, "rewards/rejected": 1.3613044023513794, "step": 7920 }, { "epoch": 0.43993841971678616, "grad_norm": 51.59779739379883, "learning_rate": 5.937762577115747e-08, "logits/chosen": -0.37672844529151917, "logits/rejected": -0.4909901022911072, "logps/chosen": -157.14938354492188, "logps/rejected": -196.3175048828125, "loss": 1.2589, "nll_loss": 0.977447509765625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.745203971862793, "rewards/margins": 2.8865718841552734, "rewards/rejected": 2.8586316108703613, "step": 7930 }, { "epoch": 0.44049319704303685, "grad_norm": 83.54605102539062, "learning_rate": 5.929201257633948e-08, "logits/chosen": -0.4630278944969177, "logits/rejected": -0.5463398694992065, "logps/chosen": -196.7548065185547, "logps/rejected": -225.6660614013672, "loss": 1.2924, "nll_loss": 1.0636637210845947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.932701587677002, "rewards/margins": 2.5180087089538574, "rewards/rejected": 3.4146926403045654, "step": 7940 }, { "epoch": 0.44104797436928755, "grad_norm": 47.81173324584961, "learning_rate": 5.920637115490141e-08, "logits/chosen": -0.39089900255203247, "logits/rejected": -0.4534800052642822, "logps/chosen": -153.1407012939453, "logps/rejected": -196.2056884765625, "loss": 1.2349, "nll_loss": 0.9944251775741577, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.820315361022949, "rewards/margins": 3.712320327758789, "rewards/rejected": 2.10799503326416, "step": 7950 }, { "epoch": 0.4416027516955382, "grad_norm": 67.46906280517578, "learning_rate": 5.9120701766998774e-08, "logits/chosen": -0.21322064101696014, "logits/rejected": -0.343554824590683, "logps/chosen": -157.1032257080078, "logps/rejected": -220.17630004882812, "loss": 1.1961, "nll_loss": 0.8462135195732117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.530757904052734, "rewards/margins": 3.692348003387451, "rewards/rejected": 1.8384101390838623, "step": 7960 }, { "epoch": 0.4421575290217889, "grad_norm": 33.015899658203125, "learning_rate": 5.9035004672871936e-08, "logits/chosen": -0.17605528235435486, "logits/rejected": -0.31549564003944397, "logps/chosen": -157.47113037109375, "logps/rejected": -201.6073455810547, "loss": 1.2521, "nll_loss": 0.9050248861312866, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.284222602844238, "rewards/margins": 2.9695611000061035, "rewards/rejected": 2.3146615028381348, "step": 7970 }, { "epoch": 0.44271230634803954, "grad_norm": 78.65603637695312, "learning_rate": 5.89492801328455e-08, "logits/chosen": -0.3612144887447357, "logits/rejected": -0.4951377511024475, "logps/chosen": -181.75048828125, "logps/rejected": -274.47943115234375, "loss": 1.2657, "nll_loss": 0.9882364273071289, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.6054205894470215, "rewards/margins": 4.520341873168945, "rewards/rejected": 2.0850789546966553, "step": 7980 }, { "epoch": 0.44326708367429024, "grad_norm": 58.317527770996094, "learning_rate": 5.8863528407327456e-08, "logits/chosen": -0.17070798575878143, "logits/rejected": -0.3363649845123291, "logps/chosen": -142.54373168945312, "logps/rejected": -174.45375061035156, "loss": 1.2523, "nll_loss": 0.8363102078437805, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.006918907165527, "rewards/margins": 3.044194221496582, "rewards/rejected": 1.9627254009246826, "step": 7990 }, { "epoch": 0.4438218610005409, "grad_norm": 45.0575065612793, "learning_rate": 5.87777497568083e-08, "logits/chosen": -0.21342894434928894, "logits/rejected": -0.418354332447052, "logps/chosen": -115.73429107666016, "logps/rejected": -180.33444213867188, "loss": 1.213, "nll_loss": 0.7383561134338379, "rewards/accuracies": 0.875, "rewards/chosen": 4.680453300476074, "rewards/margins": 3.650794267654419, "rewards/rejected": 1.0296586751937866, "step": 8000 }, { "epoch": 0.4438218610005409, "eval_logits/chosen": -0.38671383261680603, "eval_logits/rejected": -0.48004651069641113, "eval_logps/chosen": -192.80557250976562, "eval_logps/rejected": -259.6051940917969, "eval_loss": 1.2556079626083374, "eval_nll_loss": 1.0014536380767822, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 6.515565395355225, "eval_rewards/margins": 4.568235397338867, "eval_rewards/rejected": 1.9473298788070679, "eval_runtime": 16.7445, "eval_samples_per_second": 15.289, "eval_steps_per_second": 1.911, "step": 8000 }, { "epoch": 0.4443766383267916, "grad_norm": 47.568199157714844, "learning_rate": 5.86919444418604e-08, "logits/chosen": -0.2681363523006439, "logits/rejected": -0.4715178608894348, "logps/chosen": -137.9403533935547, "logps/rejected": -166.81297302246094, "loss": 1.2904, "nll_loss": 0.7847102284431458, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.970905303955078, "rewards/margins": 2.872488498687744, "rewards/rejected": 2.098417282104492, "step": 8010 }, { "epoch": 0.4449314156530423, "grad_norm": 55.3165283203125, "learning_rate": 5.860611272313706e-08, "logits/chosen": -0.3643186390399933, "logits/rejected": -0.48495370149612427, "logps/chosen": -189.58767700195312, "logps/rejected": -261.2303161621094, "loss": 1.2308, "nll_loss": 1.0931975841522217, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.005456447601318, "rewards/margins": 4.076797962188721, "rewards/rejected": 1.928658127784729, "step": 8020 }, { "epoch": 0.4454861929792929, "grad_norm": 93.05540466308594, "learning_rate": 5.852025486137182e-08, "logits/chosen": -0.20182017982006073, "logits/rejected": -0.3323633670806885, "logps/chosen": -143.80166625976562, "logps/rejected": -195.39883422851562, "loss": 1.3046, "nll_loss": 0.864376425743103, "rewards/accuracies": 0.75, "rewards/chosen": 5.173520565032959, "rewards/margins": 3.3077869415283203, "rewards/rejected": 1.8657335042953491, "step": 8030 }, { "epoch": 0.4460409703055436, "grad_norm": 80.69502258300781, "learning_rate": 5.8434371117377645e-08, "logits/chosen": -0.2694427967071533, "logits/rejected": -0.40171557664871216, "logps/chosen": -152.2576446533203, "logps/rejected": -233.55606079101562, "loss": 1.2215, "nll_loss": 0.8913043141365051, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.948529243469238, "rewards/margins": 4.456477165222168, "rewards/rejected": 1.4920519590377808, "step": 8040 }, { "epoch": 0.44659574763179427, "grad_norm": 38.91741943359375, "learning_rate": 5.834846175204611e-08, "logits/chosen": -0.1876542568206787, "logits/rejected": -0.3668864965438843, "logps/chosen": -171.0609588623047, "logps/rejected": -219.9167022705078, "loss": 1.2786, "nll_loss": 0.9668686985969543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.387633323669434, "rewards/margins": 3.384929656982422, "rewards/rejected": 2.00270414352417, "step": 8050 }, { "epoch": 0.44715052495804497, "grad_norm": 54.68962860107422, "learning_rate": 5.826252702634661e-08, "logits/chosen": -0.30333903431892395, "logits/rejected": -0.4032462239265442, "logps/chosen": -168.95272827148438, "logps/rejected": -223.0485076904297, "loss": 1.2644, "nll_loss": 0.9916449785232544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.742356300354004, "rewards/margins": 3.2595245838165283, "rewards/rejected": 2.4828314781188965, "step": 8060 }, { "epoch": 0.4477053022842956, "grad_norm": 72.57445526123047, "learning_rate": 5.81765672013256e-08, "logits/chosen": -0.25930994749069214, "logits/rejected": -0.4766874313354492, "logps/chosen": -138.98260498046875, "logps/rejected": -199.86651611328125, "loss": 1.2728, "nll_loss": 0.9009540677070618, "rewards/accuracies": 0.875, "rewards/chosen": 5.168518543243408, "rewards/margins": 4.168160438537598, "rewards/rejected": 1.0003578662872314, "step": 8070 }, { "epoch": 0.4482600796105463, "grad_norm": 82.89904022216797, "learning_rate": 5.809058253810577e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -138.91119384765625, "logps/rejected": -177.7933349609375, "loss": 1.2864, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 5.166953086853027, "rewards/margins": 3.048910617828369, "rewards/rejected": 2.118042230606079, "step": 8080 }, { "epoch": 0.448814856936797, "grad_norm": 45.50212860107422, "learning_rate": 5.8004573297885263e-08, "logits/chosen": -0.1541905403137207, "logits/rejected": -0.3506318926811218, "logps/chosen": -135.6262969970703, "logps/rejected": -191.90237426757812, "loss": 1.2552, "nll_loss": 0.8606699705123901, "rewards/accuracies": 0.875, "rewards/chosen": 4.86007022857666, "rewards/margins": 3.3907406330108643, "rewards/rejected": 1.469329595565796, "step": 8090 }, { "epoch": 0.44936963426304766, "grad_norm": 64.80812072753906, "learning_rate": 5.791853974193688e-08, "logits/chosen": -0.32614752650260925, "logits/rejected": -0.46904468536376953, "logps/chosen": -197.24119567871094, "logps/rejected": -231.13327026367188, "loss": 1.1762, "nll_loss": 1.0258362293243408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.1628522872924805, "rewards/margins": 4.323519706726074, "rewards/rejected": 1.8393325805664062, "step": 8100 }, { "epoch": 0.44992441158929836, "grad_norm": 66.36274719238281, "learning_rate": 5.783248213160729e-08, "logits/chosen": -0.3098219633102417, "logits/rejected": -0.4934779703617096, "logps/chosen": -178.09774780273438, "logps/rejected": -239.51025390625, "loss": 1.2542, "nll_loss": 0.9184685945510864, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.972233772277832, "rewards/margins": 4.269294261932373, "rewards/rejected": 1.7029390335083008, "step": 8110 }, { "epoch": 0.450479188915549, "grad_norm": 68.26948547363281, "learning_rate": 5.774640072831621e-08, "logits/chosen": -0.3017955422401428, "logits/rejected": -0.46287283301353455, "logps/chosen": -183.70526123046875, "logps/rejected": -258.19964599609375, "loss": 1.2733, "nll_loss": 0.9797613024711609, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.402983665466309, "rewards/margins": 4.828103065490723, "rewards/rejected": 1.5748809576034546, "step": 8120 }, { "epoch": 0.4510339662417997, "grad_norm": 42.09645080566406, "learning_rate": 5.766029579355567e-08, "logits/chosen": -0.34000691771507263, "logits/rejected": -0.5368366241455078, "logps/chosen": -171.09927368164062, "logps/rejected": -246.84521484375, "loss": 1.2752, "nll_loss": 0.9592534899711609, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.863207817077637, "rewards/margins": 4.2700419425964355, "rewards/rejected": 1.5931650400161743, "step": 8130 }, { "epoch": 0.4515887435680504, "grad_norm": 61.56931686401367, "learning_rate": 5.7574167588889155e-08, "logits/chosen": -0.4223068654537201, "logits/rejected": -0.5126476883888245, "logps/chosen": -190.4967803955078, "logps/rejected": -259.795654296875, "loss": 1.2975, "nll_loss": 1.0304621458053589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.281733512878418, "rewards/margins": 4.427463531494141, "rewards/rejected": 1.8542697429656982, "step": 8140 }, { "epoch": 0.45214352089430104, "grad_norm": 36.00699996948242, "learning_rate": 5.7488016375950846e-08, "logits/chosen": -0.2626270651817322, "logits/rejected": -0.43936362862586975, "logps/chosen": -160.6858673095703, "logps/rejected": -210.3122100830078, "loss": 1.2147, "nll_loss": 0.9187489748001099, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.460170269012451, "rewards/margins": 3.7872111797332764, "rewards/rejected": 1.6729589700698853, "step": 8150 }, { "epoch": 0.45269829822055174, "grad_norm": 56.432472229003906, "learning_rate": 5.740184241644482e-08, "logits/chosen": -0.3512900471687317, "logits/rejected": -0.5015803575515747, "logps/chosen": -176.67373657226562, "logps/rejected": -235.1858673095703, "loss": 1.2242, "nll_loss": 0.967685341835022, "rewards/accuracies": 0.875, "rewards/chosen": 5.801245212554932, "rewards/margins": 4.152280330657959, "rewards/rejected": 1.6489654779434204, "step": 8160 }, { "epoch": 0.4532530755468024, "grad_norm": 42.60057830810547, "learning_rate": 5.7315645972144264e-08, "logits/chosen": -0.42258042097091675, "logits/rejected": -0.561353862285614, "logps/chosen": -172.57858276367188, "logps/rejected": -234.5561065673828, "loss": 1.3028, "nll_loss": 0.972865104675293, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.304925441741943, "rewards/margins": 4.14249324798584, "rewards/rejected": 2.1624317169189453, "step": 8170 }, { "epoch": 0.4538078528730531, "grad_norm": 75.68475341796875, "learning_rate": 5.7229427304890644e-08, "logits/chosen": -0.35305720567703247, "logits/rejected": -0.5380151271820068, "logps/chosen": -183.86520385742188, "logps/rejected": -225.5127716064453, "loss": 1.2598, "nll_loss": 0.9746831059455872, "rewards/accuracies": 0.875, "rewards/chosen": 5.931631088256836, "rewards/margins": 4.358283042907715, "rewards/rejected": 1.5733486413955688, "step": 8180 }, { "epoch": 0.45436263019930373, "grad_norm": 70.02229309082031, "learning_rate": 5.7143186676592935e-08, "logits/chosen": -0.3164612650871277, "logits/rejected": -0.41461247205734253, "logps/chosen": -180.4053497314453, "logps/rejected": -213.3650360107422, "loss": 1.2599, "nll_loss": 0.9668909907341003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.490021705627441, "rewards/margins": 2.3829715251922607, "rewards/rejected": 3.1070501804351807, "step": 8190 }, { "epoch": 0.45491740752555443, "grad_norm": 64.77501678466797, "learning_rate": 5.705692434922683e-08, "logits/chosen": -0.3636978268623352, "logits/rejected": -0.5518943071365356, "logps/chosen": -172.8309783935547, "logps/rejected": -235.52029418945312, "loss": 1.2163, "nll_loss": 0.9208908081054688, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.8655290603637695, "rewards/margins": 3.3627452850341797, "rewards/rejected": 2.5027828216552734, "step": 8200 }, { "epoch": 0.45547218485180513, "grad_norm": 63.131656646728516, "learning_rate": 5.697064058483395e-08, "logits/chosen": -0.4812677800655365, "logits/rejected": -0.6031264066696167, "logps/chosen": -204.84005737304688, "logps/rejected": -278.93475341796875, "loss": 1.3567, "nll_loss": 1.2066683769226074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.644658088684082, "rewards/margins": 3.36299467086792, "rewards/rejected": 3.281662702560425, "step": 8210 }, { "epoch": 0.4560269621780558, "grad_norm": 80.57061004638672, "learning_rate": 5.688433564552103e-08, "logits/chosen": -0.4052867889404297, "logits/rejected": -0.5403534770011902, "logps/chosen": -149.51043701171875, "logps/rejected": -223.8475799560547, "loss": 1.3456, "nll_loss": 0.9441035389900208, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.9508514404296875, "rewards/margins": 4.125848293304443, "rewards/rejected": 1.825002670288086, "step": 8220 }, { "epoch": 0.45658173950430647, "grad_norm": 62.646263122558594, "learning_rate": 5.67980097934591e-08, "logits/chosen": -0.3800668716430664, "logits/rejected": -0.4854021668434143, "logps/chosen": -169.1044921875, "logps/rejected": -245.3473663330078, "loss": 1.2799, "nll_loss": 1.067249059677124, "rewards/accuracies": 0.875, "rewards/chosen": 5.665637016296387, "rewards/margins": 3.1536102294921875, "rewards/rejected": 2.512026309967041, "step": 8230 }, { "epoch": 0.4571365168305571, "grad_norm": 59.984928131103516, "learning_rate": 5.6711663290882774e-08, "logits/chosen": -0.42158278822898865, "logits/rejected": -0.5549692511558533, "logps/chosen": -177.200927734375, "logps/rejected": -229.2981414794922, "loss": 1.3148, "nll_loss": 1.011946439743042, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.219405651092529, "rewards/margins": 3.8054656982421875, "rewards/rejected": 2.413939952850342, "step": 8240 }, { "epoch": 0.4576912941568078, "grad_norm": 125.51600646972656, "learning_rate": 5.662529640008933e-08, "logits/chosen": -0.3310701549053192, "logits/rejected": -0.4897652566432953, "logps/chosen": -168.09390258789062, "logps/rejected": -217.44931030273438, "loss": 1.3698, "nll_loss": 0.9788816571235657, "rewards/accuracies": 0.875, "rewards/chosen": 5.848081111907959, "rewards/margins": 4.070723533630371, "rewards/rejected": 1.777358055114746, "step": 8250 }, { "epoch": 0.4582460714830585, "grad_norm": 57.40364456176758, "learning_rate": 5.6538909383438046e-08, "logits/chosen": -0.408261239528656, "logits/rejected": -0.5265794396400452, "logps/chosen": -196.43104553222656, "logps/rejected": -253.6921844482422, "loss": 1.3006, "nll_loss": 1.097115159034729, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.171659469604492, "rewards/margins": 3.760265827178955, "rewards/rejected": 2.411393880844116, "step": 8260 }, { "epoch": 0.45880084880930916, "grad_norm": 59.3066520690918, "learning_rate": 5.645250250334931e-08, "logits/chosen": -0.382033109664917, "logits/rejected": -0.4673156142234802, "logps/chosen": -200.22938537597656, "logps/rejected": -272.4017333984375, "loss": 1.2982, "nll_loss": 1.0755808353424072, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.385659217834473, "rewards/margins": 3.7684860229492188, "rewards/rejected": 2.617173671722412, "step": 8270 }, { "epoch": 0.45935562613555986, "grad_norm": 84.24140930175781, "learning_rate": 5.636607602230379e-08, "logits/chosen": -0.33731868863105774, "logits/rejected": -0.5461875200271606, "logps/chosen": -181.59646606445312, "logps/rejected": -280.0567626953125, "loss": 1.2524, "nll_loss": 0.9656723737716675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.201190948486328, "rewards/margins": 5.510003089904785, "rewards/rejected": 0.6911883354187012, "step": 8280 }, { "epoch": 0.4599104034618105, "grad_norm": 57.184749603271484, "learning_rate": 5.62796302028418e-08, "logits/chosen": -0.26092246174812317, "logits/rejected": -0.3896161615848541, "logps/chosen": -164.76968383789062, "logps/rejected": -215.32467651367188, "loss": 1.2664, "nll_loss": 0.9338585138320923, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.4399824142456055, "rewards/margins": 4.290564060211182, "rewards/rejected": 1.149418592453003, "step": 8290 }, { "epoch": 0.4604651807880612, "grad_norm": 59.31813049316406, "learning_rate": 5.619316530756233e-08, "logits/chosen": -0.28790172934532166, "logits/rejected": -0.45928388833999634, "logps/chosen": -162.6776885986328, "logps/rejected": -223.90475463867188, "loss": 1.2686, "nll_loss": 0.9045939445495605, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.451429843902588, "rewards/margins": 3.5017216205596924, "rewards/rejected": 1.9497079849243164, "step": 8300 }, { "epoch": 0.46101995811431185, "grad_norm": 101.83157348632812, "learning_rate": 5.610668159912235e-08, "logits/chosen": -0.3260810375213623, "logits/rejected": -0.4430045485496521, "logps/chosen": -186.3931121826172, "logps/rejected": -235.7809295654297, "loss": 1.3681, "nll_loss": 0.9977655410766602, "rewards/accuracies": 0.75, "rewards/chosen": 5.799564361572266, "rewards/margins": 3.3150525093078613, "rewards/rejected": 2.484511613845825, "step": 8310 }, { "epoch": 0.46157473544056254, "grad_norm": 101.32489776611328, "learning_rate": 5.602017934023595e-08, "logits/chosen": -0.3339731693267822, "logits/rejected": -0.5301405191421509, "logps/chosen": -139.90518188476562, "logps/rejected": -179.57363891601562, "loss": 1.3411, "nll_loss": 0.942695140838623, "rewards/accuracies": 0.875, "rewards/chosen": 5.0137505531311035, "rewards/margins": 3.0543651580810547, "rewards/rejected": 1.9593846797943115, "step": 8320 }, { "epoch": 0.46212951276681324, "grad_norm": 84.96895599365234, "learning_rate": 5.59336587936736e-08, "logits/chosen": -0.46184906363487244, "logits/rejected": -0.5722322463989258, "logps/chosen": -195.6354217529297, "logps/rejected": -262.1464538574219, "loss": 1.2554, "nll_loss": 1.0777390003204346, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.625528812408447, "rewards/margins": 3.791055202484131, "rewards/rejected": 2.834473133087158, "step": 8330 }, { "epoch": 0.4626842900930639, "grad_norm": 91.9871826171875, "learning_rate": 5.5847120222261315e-08, "logits/chosen": -0.3586110770702362, "logits/rejected": -0.5264121890068054, "logps/chosen": -139.6001739501953, "logps/rejected": -184.0530242919922, "loss": 1.3581, "nll_loss": 0.9735239148139954, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.088287353515625, "rewards/margins": 3.1971933841705322, "rewards/rejected": 1.8910939693450928, "step": 8340 }, { "epoch": 0.4632390674193146, "grad_norm": 129.99005126953125, "learning_rate": 5.5760563888879844e-08, "logits/chosen": -0.3932887613773346, "logits/rejected": -0.5743144750595093, "logps/chosen": -142.7582550048828, "logps/rejected": -206.0090789794922, "loss": 1.2786, "nll_loss": 0.8861738443374634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.229488849639893, "rewards/margins": 3.4306912422180176, "rewards/rejected": 1.7987968921661377, "step": 8350 }, { "epoch": 0.46379384474556523, "grad_norm": 50.705623626708984, "learning_rate": 5.567399005646393e-08, "logits/chosen": -0.372164249420166, "logits/rejected": -0.518144428730011, "logps/chosen": -165.54945373535156, "logps/rejected": -218.5325469970703, "loss": 1.2101, "nll_loss": 0.8808409571647644, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.740856647491455, "rewards/margins": 3.079467296600342, "rewards/rejected": 2.661389112472534, "step": 8360 }, { "epoch": 0.46434862207181593, "grad_norm": 68.00957489013672, "learning_rate": 5.558739898800141e-08, "logits/chosen": -0.4178202748298645, "logits/rejected": -0.5774034857749939, "logps/chosen": -168.80972290039062, "logps/rejected": -226.04690551757812, "loss": 1.2316, "nll_loss": 1.0256147384643555, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.702794075012207, "rewards/margins": 3.232090473175049, "rewards/rejected": 2.470703601837158, "step": 8370 }, { "epoch": 0.4649033993980666, "grad_norm": 56.85757827758789, "learning_rate": 5.550079094653257e-08, "logits/chosen": -0.28982049226760864, "logits/rejected": -0.4554738998413086, "logps/chosen": -168.7762451171875, "logps/rejected": -216.87002563476562, "loss": 1.364, "nll_loss": 0.9375017285346985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.515100479125977, "rewards/margins": 2.744253635406494, "rewards/rejected": 2.7708468437194824, "step": 8380 }, { "epoch": 0.4654581767243173, "grad_norm": 62.08861541748047, "learning_rate": 5.5414166195149194e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -122.28581237792969, "logps/rejected": -182.76296997070312, "loss": 1.2023, "nll_loss": NaN, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.9859161376953125, "rewards/margins": 3.501011371612549, "rewards/rejected": 1.4849050045013428, "step": 8390 }, { "epoch": 0.466012954050568, "grad_norm": 56.96398162841797, "learning_rate": 5.53275249969938e-08, "logits/chosen": -0.29233837127685547, "logits/rejected": -0.42494410276412964, "logps/chosen": -175.22488403320312, "logps/rejected": -226.5401153564453, "loss": 1.221, "nll_loss": 0.9822790026664734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.395953178405762, "rewards/margins": 3.6122539043426514, "rewards/rejected": 1.783699631690979, "step": 8400 }, { "epoch": 0.4665677313768186, "grad_norm": 69.49365997314453, "learning_rate": 5.524086761525896e-08, "logits/chosen": -0.3085178732872009, "logits/rejected": -0.5056566596031189, "logps/chosen": -170.92762756347656, "logps/rejected": -225.01986694335938, "loss": 1.2412, "nll_loss": 0.9332050085067749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.219595909118652, "rewards/margins": 3.47466778755188, "rewards/rejected": 1.744927167892456, "step": 8410 }, { "epoch": 0.4671225087030693, "grad_norm": 50.1507453918457, "learning_rate": 5.515419431318632e-08, "logits/chosen": -0.39892831444740295, "logits/rejected": -0.49943628907203674, "logps/chosen": -162.7307586669922, "logps/rejected": -202.21437072753906, "loss": 1.225, "nll_loss": 1.015000343322754, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.446582317352295, "rewards/margins": 3.4593796730041504, "rewards/rejected": 1.9872024059295654, "step": 8420 }, { "epoch": 0.46767728602931996, "grad_norm": 57.40410614013672, "learning_rate": 5.506750535406594e-08, "logits/chosen": -0.24424946308135986, "logits/rejected": -0.47506189346313477, "logps/chosen": -140.15203857421875, "logps/rejected": -225.4398651123047, "loss": 1.1725, "nll_loss": 0.8635500073432922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.439966678619385, "rewards/margins": 4.405338287353516, "rewards/rejected": 1.03462815284729, "step": 8430 }, { "epoch": 0.46823206335557066, "grad_norm": 69.4198226928711, "learning_rate": 5.49808010012354e-08, "logits/chosen": -0.23678168654441833, "logits/rejected": -0.3856371343135834, "logps/chosen": -145.4683074951172, "logps/rejected": -215.7239532470703, "loss": 1.1816, "nll_loss": 0.8480218052864075, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.327239036560059, "rewards/margins": 3.5169034004211426, "rewards/rejected": 1.8103358745574951, "step": 8440 }, { "epoch": 0.46878684068182136, "grad_norm": 121.41065979003906, "learning_rate": 5.489408151807907e-08, "logits/chosen": -0.20533113181591034, "logits/rejected": -0.386726438999176, "logps/chosen": -164.6047821044922, "logps/rejected": -240.8743133544922, "loss": 1.2385, "nll_loss": 0.946201503276825, "rewards/accuracies": 0.875, "rewards/chosen": 5.50950288772583, "rewards/margins": 3.9801108837127686, "rewards/rejected": 1.5293917655944824, "step": 8450 }, { "epoch": 0.469341618008072, "grad_norm": 67.52189636230469, "learning_rate": 5.480734716802729e-08, "logits/chosen": -0.047091174870729446, "logits/rejected": -0.30713146924972534, "logps/chosen": -107.66644287109375, "logps/rejected": -164.18824768066406, "loss": 1.219, "nll_loss": 0.7501475214958191, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.337796688079834, "rewards/margins": 4.029504299163818, "rewards/rejected": 0.30829155445098877, "step": 8460 }, { "epoch": 0.4698963953343227, "grad_norm": 45.683902740478516, "learning_rate": 5.472059821455554e-08, "logits/chosen": -0.3871431350708008, "logits/rejected": -0.514873206615448, "logps/chosen": -185.6728057861328, "logps/rejected": -251.6055145263672, "loss": 1.2143, "nll_loss": 1.1259502172470093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.96354341506958, "rewards/margins": 4.187412261962891, "rewards/rejected": 1.7761310338974, "step": 8470 }, { "epoch": 0.47045117266057335, "grad_norm": 101.97664642333984, "learning_rate": 5.4633834921183665e-08, "logits/chosen": -0.2548361122608185, "logits/rejected": -0.5008405447006226, "logps/chosen": -151.93946838378906, "logps/rejected": -212.5448760986328, "loss": 1.314, "nll_loss": 0.8620659112930298, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.392413139343262, "rewards/margins": 3.4350333213806152, "rewards/rejected": 1.9573793411254883, "step": 8480 }, { "epoch": 0.47100594998682405, "grad_norm": 46.88422775268555, "learning_rate": 5.454705755147508e-08, "logits/chosen": -0.18084892630577087, "logits/rejected": -0.3403630256652832, "logps/chosen": -153.07003784179688, "logps/rejected": -207.1527557373047, "loss": 1.2825, "nll_loss": 0.9178160429000854, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.4134650230407715, "rewards/margins": 3.658824920654297, "rewards/rejected": 1.7546402215957642, "step": 8490 }, { "epoch": 0.4715607273130747, "grad_norm": 81.96524810791016, "learning_rate": 5.4460266369035954e-08, "logits/chosen": -0.154692143201828, "logits/rejected": -0.39466938376426697, "logps/chosen": -108.70941162109375, "logps/rejected": -186.53506469726562, "loss": 1.2631, "nll_loss": 0.7526475787162781, "rewards/accuracies": 0.875, "rewards/chosen": 4.2512054443359375, "rewards/margins": 3.975069046020508, "rewards/rejected": 0.27613669633865356, "step": 8500 }, { "epoch": 0.4715607273130747, "eval_logits/chosen": -0.38347405195236206, "eval_logits/rejected": -0.49657073616981506, "eval_logps/chosen": -193.00897216796875, "eval_logps/rejected": -264.6562194824219, "eval_loss": 1.2516002655029297, "eval_nll_loss": 1.0022295713424683, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.495223522186279, "eval_rewards/margins": 5.052995681762695, "eval_rewards/rejected": 1.442228078842163, "eval_runtime": 17.0205, "eval_samples_per_second": 15.041, "eval_steps_per_second": 1.88, "step": 8500 }, { "epoch": 0.4721155046393254, "grad_norm": 64.24961853027344, "learning_rate": 5.4373461637514416e-08, "logits/chosen": -0.15924356877803802, "logits/rejected": -0.3522131145000458, "logps/chosen": -130.67691040039062, "logps/rejected": -195.55931091308594, "loss": 1.1696, "nll_loss": 0.8569602966308594, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.041111946105957, "rewards/margins": 3.7395081520080566, "rewards/rejected": 1.3016037940979004, "step": 8510 }, { "epoch": 0.4726702819655761, "grad_norm": 83.25701904296875, "learning_rate": 5.428664362059975e-08, "logits/chosen": -0.3635895848274231, "logits/rejected": -0.5300859212875366, "logps/chosen": -167.30325317382812, "logps/rejected": -218.53466796875, "loss": 1.3063, "nll_loss": 0.9899178743362427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.74692440032959, "rewards/margins": 3.647434711456299, "rewards/rejected": 2.099489688873291, "step": 8520 }, { "epoch": 0.47322505929182673, "grad_norm": 83.2901840209961, "learning_rate": 5.41998125820216e-08, "logits/chosen": -0.2491048276424408, "logits/rejected": -0.39172905683517456, "logps/chosen": -159.7559814453125, "logps/rejected": -203.1042938232422, "loss": 1.2351, "nll_loss": 0.899651050567627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.467600345611572, "rewards/margins": 2.6668825149536133, "rewards/rejected": 2.80071759223938, "step": 8530 }, { "epoch": 0.47377983661807743, "grad_norm": 56.23780059814453, "learning_rate": 5.4112968785549174e-08, "logits/chosen": -0.15644797682762146, "logits/rejected": -0.34491434693336487, "logps/chosen": -113.5403823852539, "logps/rejected": -181.77078247070312, "loss": 1.2335, "nll_loss": 0.8350857496261597, "rewards/accuracies": 0.875, "rewards/chosen": 4.577191352844238, "rewards/margins": 2.9863953590393066, "rewards/rejected": 1.5907953977584839, "step": 8540 }, { "epoch": 0.4743346139443281, "grad_norm": 90.52593994140625, "learning_rate": 5.402611249499042e-08, "logits/chosen": -0.2765730023384094, "logits/rejected": -0.48341649770736694, "logps/chosen": -156.0672607421875, "logps/rejected": -248.24545288085938, "loss": 1.2362, "nll_loss": 0.9277788996696472, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.159286022186279, "rewards/margins": 5.113110542297363, "rewards/rejected": 1.0461763143539429, "step": 8550 }, { "epoch": 0.4748893912705788, "grad_norm": 82.83740997314453, "learning_rate": 5.393924397419126e-08, "logits/chosen": -0.11408629268407822, "logits/rejected": -0.34016355872154236, "logps/chosen": -131.93338012695312, "logps/rejected": -187.15115356445312, "loss": 1.2929, "nll_loss": 0.835770308971405, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.571127891540527, "rewards/margins": 2.900984287261963, "rewards/rejected": 1.6701438426971436, "step": 8560 }, { "epoch": 0.4754441685968295, "grad_norm": 59.472686767578125, "learning_rate": 5.385236348703474e-08, "logits/chosen": -0.21524448692798615, "logits/rejected": -0.46747082471847534, "logps/chosen": -117.48472595214844, "logps/rejected": -176.40567016601562, "loss": 1.1203, "nll_loss": 0.7838854789733887, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.015366554260254, "rewards/margins": 4.129748344421387, "rewards/rejected": 0.8856188058853149, "step": 8570 }, { "epoch": 0.4759989459230801, "grad_norm": 52.82234573364258, "learning_rate": 5.376547129744029e-08, "logits/chosen": -0.3290513753890991, "logits/rejected": -0.48952069878578186, "logps/chosen": -157.22695922851562, "logps/rejected": -216.16525268554688, "loss": 1.2843, "nll_loss": 0.9082091450691223, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.499343395233154, "rewards/margins": 3.3456497192382812, "rewards/rejected": 2.153693199157715, "step": 8580 }, { "epoch": 0.4765537232493308, "grad_norm": 70.94815826416016, "learning_rate": 5.367856766936286e-08, "logits/chosen": -0.27662572264671326, "logits/rejected": -0.4078160226345062, "logps/chosen": -172.33746337890625, "logps/rejected": -196.3927459716797, "loss": 1.3422, "nll_loss": 1.0085538625717163, "rewards/accuracies": 0.75, "rewards/chosen": 5.572179317474365, "rewards/margins": 2.264984607696533, "rewards/rejected": 3.3071951866149902, "step": 8590 }, { "epoch": 0.47710850057558146, "grad_norm": 103.72970581054688, "learning_rate": 5.359165286679217e-08, "logits/chosen": -0.36042019724845886, "logits/rejected": -0.5228351354598999, "logps/chosen": -179.4456024169922, "logps/rejected": -268.6436462402344, "loss": 1.2705, "nll_loss": 1.020342469215393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.373924732208252, "rewards/margins": 4.730695724487305, "rewards/rejected": 1.643228530883789, "step": 8600 }, { "epoch": 0.47766327790183216, "grad_norm": 78.71900177001953, "learning_rate": 5.350472715375186e-08, "logits/chosen": -0.2985449433326721, "logits/rejected": -0.4532528817653656, "logps/chosen": -162.94070434570312, "logps/rejected": -212.1439208984375, "loss": 1.2438, "nll_loss": 0.8956171274185181, "rewards/accuracies": 0.875, "rewards/chosen": 5.750922203063965, "rewards/margins": 3.4489173889160156, "rewards/rejected": 2.30200457572937, "step": 8610 }, { "epoch": 0.4782180552280828, "grad_norm": 58.050025939941406, "learning_rate": 5.341779079429872e-08, "logits/chosen": -0.3309435546398163, "logits/rejected": -0.4891184866428375, "logps/chosen": -140.02577209472656, "logps/rejected": -206.16079711914062, "loss": 1.3336, "nll_loss": 0.8849604725837708, "rewards/accuracies": 0.875, "rewards/chosen": 5.365493297576904, "rewards/margins": 3.672743320465088, "rewards/rejected": 1.6927497386932373, "step": 8620 }, { "epoch": 0.4787728325543335, "grad_norm": 79.02501678466797, "learning_rate": 5.333084405252192e-08, "logits/chosen": -0.2821223735809326, "logits/rejected": -0.45447272062301636, "logps/chosen": -153.81051635742188, "logps/rejected": -217.93423461914062, "loss": 1.1836, "nll_loss": 0.8838475346565247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.901255130767822, "rewards/margins": 4.203151226043701, "rewards/rejected": 1.698103666305542, "step": 8630 }, { "epoch": 0.4793276098805842, "grad_norm": 61.627197265625, "learning_rate": 5.32438871925421e-08, "logits/chosen": -0.40773114562034607, "logits/rejected": -0.5189584493637085, "logps/chosen": -181.827392578125, "logps/rejected": -259.85040283203125, "loss": 1.2252, "nll_loss": 1.0588048696517944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.054421901702881, "rewards/margins": 3.4445533752441406, "rewards/rejected": 2.609868288040161, "step": 8640 }, { "epoch": 0.47988238720683485, "grad_norm": 69.41519165039062, "learning_rate": 5.3156920478510695e-08, "logits/chosen": -0.22698119282722473, "logits/rejected": -0.4370526671409607, "logps/chosen": -191.02120971679688, "logps/rejected": -230.5593719482422, "loss": 1.1659, "nll_loss": 0.9413886070251465, "rewards/accuracies": 0.875, "rewards/chosen": 5.745708465576172, "rewards/margins": 4.41402006149292, "rewards/rejected": 1.3316879272460938, "step": 8650 }, { "epoch": 0.48043716453308555, "grad_norm": 77.55827331542969, "learning_rate": 5.3069944174609046e-08, "logits/chosen": -0.347109854221344, "logits/rejected": -0.5010181665420532, "logps/chosen": -180.91476440429688, "logps/rejected": -262.96807861328125, "loss": 1.3218, "nll_loss": 0.9891365170478821, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.197776794433594, "rewards/margins": 4.570927619934082, "rewards/rejected": 1.6268491744995117, "step": 8660 }, { "epoch": 0.4809919418593362, "grad_norm": 70.76117706298828, "learning_rate": 5.298295854504764e-08, "logits/chosen": -0.3689562678337097, "logits/rejected": -0.4636387825012207, "logps/chosen": -180.4010772705078, "logps/rejected": -229.11538696289062, "loss": 1.2623, "nll_loss": 1.1043275594711304, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.257417678833008, "rewards/margins": 3.5675837993621826, "rewards/rejected": 2.6898341178894043, "step": 8670 }, { "epoch": 0.4815467191855869, "grad_norm": 59.07448196411133, "learning_rate": 5.2895963854065264e-08, "logits/chosen": -0.42123499512672424, "logits/rejected": -0.45919767022132874, "logps/chosen": -197.53662109375, "logps/rejected": -278.5355529785156, "loss": 1.2533, "nll_loss": 1.2173737287521362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.918522834777832, "rewards/margins": 4.231985569000244, "rewards/rejected": 2.6865363121032715, "step": 8680 }, { "epoch": 0.48210149651183754, "grad_norm": 122.18305969238281, "learning_rate": 5.2808960365928266e-08, "logits/chosen": -0.24855947494506836, "logits/rejected": -0.35904207825660706, "logps/chosen": -185.3103485107422, "logps/rejected": -249.7644500732422, "loss": 1.3725, "nll_loss": 0.9928406476974487, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.964116096496582, "rewards/margins": 3.4931252002716064, "rewards/rejected": 2.4709906578063965, "step": 8690 }, { "epoch": 0.48265627383808823, "grad_norm": 109.85166931152344, "learning_rate": 5.272194834492969e-08, "logits/chosen": -0.1494508683681488, "logits/rejected": -0.33946704864501953, "logps/chosen": -112.33955383300781, "logps/rejected": -179.93511962890625, "loss": 1.2902, "nll_loss": 0.7621714472770691, "rewards/accuracies": 0.875, "rewards/chosen": 4.732860088348389, "rewards/margins": 3.625771999359131, "rewards/rejected": 1.107088327407837, "step": 8700 }, { "epoch": 0.48321105116433893, "grad_norm": 64.41929626464844, "learning_rate": 5.263492805538853e-08, "logits/chosen": -0.1143936887383461, "logits/rejected": -0.3094675838947296, "logps/chosen": -109.27293395996094, "logps/rejected": -171.84115600585938, "loss": 1.2469, "nll_loss": 0.7721977233886719, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.127641677856445, "rewards/margins": 3.933940887451172, "rewards/rejected": 1.1937006711959839, "step": 8710 }, { "epoch": 0.4837658284905896, "grad_norm": 58.07503128051758, "learning_rate": 5.254789976164885e-08, "logits/chosen": -0.17686712741851807, "logits/rejected": -0.36666935682296753, "logps/chosen": -161.06690979003906, "logps/rejected": -220.7855987548828, "loss": 1.2123, "nll_loss": 1.0213537216186523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.2469587326049805, "rewards/margins": 3.10516357421875, "rewards/rejected": 2.1417949199676514, "step": 8720 }, { "epoch": 0.4843206058168403, "grad_norm": 45.921226501464844, "learning_rate": 5.246086372807911e-08, "logits/chosen": -0.1610218733549118, "logits/rejected": -0.4254869520664215, "logps/chosen": -122.7448959350586, "logps/rejected": -185.78421020507812, "loss": 1.3354, "nll_loss": 0.7519733309745789, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.872366905212402, "rewards/margins": 3.6194496154785156, "rewards/rejected": 1.2529175281524658, "step": 8730 }, { "epoch": 0.4848753831430909, "grad_norm": 51.10985565185547, "learning_rate": 5.237382021907119e-08, "logits/chosen": -0.3228822350502014, "logits/rejected": -0.40563878417015076, "logps/chosen": -178.90646362304688, "logps/rejected": -218.18753051757812, "loss": 1.3275, "nll_loss": 1.1199922561645508, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.172387599945068, "rewards/margins": 3.0818145275115967, "rewards/rejected": 3.090573310852051, "step": 8740 }, { "epoch": 0.4854301604693416, "grad_norm": 46.320003509521484, "learning_rate": 5.228676949903973e-08, "logits/chosen": -0.30700522661209106, "logits/rejected": -0.4341781735420227, "logps/chosen": -164.91897583007812, "logps/rejected": -210.10507202148438, "loss": 1.1947, "nll_loss": 0.9750370979309082, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.663297653198242, "rewards/margins": 3.347306728363037, "rewards/rejected": 2.315990924835205, "step": 8750 }, { "epoch": 0.4859849377955923, "grad_norm": 40.002342224121094, "learning_rate": 5.219971183242125e-08, "logits/chosen": -0.044834405183792114, "logits/rejected": -0.23564691841602325, "logps/chosen": -93.63855743408203, "logps/rejected": -129.38055419921875, "loss": 1.1847, "nll_loss": 0.6099318265914917, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.2069268226623535, "rewards/margins": 2.6457130908966064, "rewards/rejected": 1.5612133741378784, "step": 8760 }, { "epoch": 0.48653971512184296, "grad_norm": 79.6065902709961, "learning_rate": 5.211264748367341e-08, "logits/chosen": -0.3970792889595032, "logits/rejected": -0.44048887491226196, "logps/chosen": -162.2034149169922, "logps/rejected": -220.4867401123047, "loss": 1.2505, "nll_loss": 0.9981945157051086, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.638679027557373, "rewards/margins": 3.384411334991455, "rewards/rejected": 2.2542672157287598, "step": 8770 }, { "epoch": 0.48709449244809366, "grad_norm": 65.16060638427734, "learning_rate": 5.2025576717274146e-08, "logits/chosen": -0.1481800228357315, "logits/rejected": -0.3821583390235901, "logps/chosen": -129.69326782226562, "logps/rejected": -197.94424438476562, "loss": 1.3856, "nll_loss": 0.7421929240226746, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.613026142120361, "rewards/margins": 2.946195125579834, "rewards/rejected": 1.6668307781219482, "step": 8780 }, { "epoch": 0.4876492697743443, "grad_norm": 76.15536499023438, "learning_rate": 5.193849979772086e-08, "logits/chosen": -0.21147122979164124, "logits/rejected": -0.413780152797699, "logps/chosen": -92.40141296386719, "logps/rejected": -118.81148529052734, "loss": 1.363, "nll_loss": 0.6374253630638123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.012933731079102, "rewards/margins": 2.4644181728363037, "rewards/rejected": 1.548515796661377, "step": 8790 }, { "epoch": 0.488204047100595, "grad_norm": 60.92570114135742, "learning_rate": 5.1851416989529696e-08, "logits/chosen": -0.3093903660774231, "logits/rejected": -0.44280901551246643, "logps/chosen": -137.2354736328125, "logps/rejected": -195.80068969726562, "loss": 1.2765, "nll_loss": 0.866968035697937, "rewards/accuracies": 0.875, "rewards/chosen": 5.153212070465088, "rewards/margins": 2.812201976776123, "rewards/rejected": 2.341010093688965, "step": 8800 }, { "epoch": 0.48875882442684565, "grad_norm": 73.39197540283203, "learning_rate": 5.176432855723466e-08, "logits/chosen": -0.3896110951900482, "logits/rejected": -0.510871946811676, "logps/chosen": -172.58920288085938, "logps/rejected": -250.2101593017578, "loss": 1.4006, "nll_loss": 0.951266884803772, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.155871391296387, "rewards/margins": 3.3034446239471436, "rewards/rejected": 2.8524270057678223, "step": 8810 }, { "epoch": 0.48931360175309635, "grad_norm": 49.965972900390625, "learning_rate": 5.167723476538682e-08, "logits/chosen": -0.2232964038848877, "logits/rejected": -0.39524856209754944, "logps/chosen": -132.72500610351562, "logps/rejected": -195.49729919433594, "loss": 1.1924, "nll_loss": 0.7789738774299622, "rewards/accuracies": 0.875, "rewards/chosen": 5.062056064605713, "rewards/margins": 3.040559768676758, "rewards/rejected": 2.0214955806732178, "step": 8820 }, { "epoch": 0.48986837907934705, "grad_norm": 73.50363159179688, "learning_rate": 5.15901358785536e-08, "logits/chosen": -0.3110652565956116, "logits/rejected": -0.41810736060142517, "logps/chosen": -155.49932861328125, "logps/rejected": -210.6866455078125, "loss": 1.2802, "nll_loss": 0.9495540857315063, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.892726421356201, "rewards/margins": 4.21453857421875, "rewards/rejected": 1.6781879663467407, "step": 8830 }, { "epoch": 0.4904231564055977, "grad_norm": 81.90071868896484, "learning_rate": 5.1503032161317814e-08, "logits/chosen": -0.3098284900188446, "logits/rejected": -0.43176165223121643, "logps/chosen": -173.8741455078125, "logps/rejected": -229.93814086914062, "loss": 1.3226, "nll_loss": 0.9427685737609863, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.143521308898926, "rewards/margins": 4.145721435546875, "rewards/rejected": 1.9978001117706299, "step": 8840 }, { "epoch": 0.4909779337318484, "grad_norm": 66.26280212402344, "learning_rate": 5.1415923878277e-08, "logits/chosen": -0.2731201648712158, "logits/rejected": -0.4314287602901459, "logps/chosen": -152.42535400390625, "logps/rejected": -195.3851318359375, "loss": 1.2677, "nll_loss": 0.9041376113891602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.4018168449401855, "rewards/margins": 2.7450244426727295, "rewards/rejected": 2.656792640686035, "step": 8850 }, { "epoch": 0.49153271105809904, "grad_norm": 60.99270248413086, "learning_rate": 5.132881129404256e-08, "logits/chosen": -0.2745968997478485, "logits/rejected": -0.4192644953727722, "logps/chosen": -192.58935546875, "logps/rejected": -237.8802490234375, "loss": 1.2896, "nll_loss": 1.0134674310684204, "rewards/accuracies": 0.875, "rewards/chosen": 5.789472579956055, "rewards/margins": 3.4546120166778564, "rewards/rejected": 2.3348610401153564, "step": 8860 }, { "epoch": 0.49208748838434974, "grad_norm": 58.93909454345703, "learning_rate": 5.1241694673238924e-08, "logits/chosen": -0.22993163764476776, "logits/rejected": -0.3750077188014984, "logps/chosen": -160.7397003173828, "logps/rejected": -207.7425537109375, "loss": 1.2323, "nll_loss": 0.9182407259941101, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.209530830383301, "rewards/margins": 3.3358845710754395, "rewards/rejected": 1.8736464977264404, "step": 8870 }, { "epoch": 0.49264226571060044, "grad_norm": 61.5848503112793, "learning_rate": 5.115457428050285e-08, "logits/chosen": -0.4301369786262512, "logits/rejected": -0.5818208456039429, "logps/chosen": -185.5221710205078, "logps/rejected": -256.7213439941406, "loss": 1.2948, "nll_loss": 1.0040353536605835, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.889816761016846, "rewards/margins": 3.5707008838653564, "rewards/rejected": 2.31911563873291, "step": 8880 }, { "epoch": 0.4931970430368511, "grad_norm": 52.07457733154297, "learning_rate": 5.10674503804825e-08, "logits/chosen": -0.16518327593803406, "logits/rejected": -0.39885979890823364, "logps/chosen": -133.54811096191406, "logps/rejected": -192.93878173828125, "loss": 1.1787, "nll_loss": 0.7971738576889038, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.232979774475098, "rewards/margins": 3.939906358718872, "rewards/rejected": 1.2930728197097778, "step": 8890 }, { "epoch": 0.4937518203631018, "grad_norm": 76.2165756225586, "learning_rate": 5.098032323783672e-08, "logits/chosen": -0.22320708632469177, "logits/rejected": -0.36877983808517456, "logps/chosen": -153.4722137451172, "logps/rejected": -209.455078125, "loss": 1.3544, "nll_loss": 0.9227968454360962, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.538509845733643, "rewards/margins": 3.450572967529297, "rewards/rejected": 2.0879368782043457, "step": 8900 }, { "epoch": 0.4943065976893524, "grad_norm": 54.87151336669922, "learning_rate": 5.089319311723419e-08, "logits/chosen": -0.3627270758152008, "logits/rejected": -0.5207785964012146, "logps/chosen": -149.60006713867188, "logps/rejected": -223.32992553710938, "loss": 1.2309, "nll_loss": 0.9427006840705872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.925708293914795, "rewards/margins": 4.0009918212890625, "rewards/rejected": 1.9247167110443115, "step": 8910 }, { "epoch": 0.4948613750156031, "grad_norm": 47.99515151977539, "learning_rate": 5.0806060283352636e-08, "logits/chosen": -0.40103524923324585, "logits/rejected": -0.508873462677002, "logps/chosen": -192.48849487304688, "logps/rejected": -243.2057342529297, "loss": 1.2448, "nll_loss": 1.0411288738250732, "rewards/accuracies": 0.875, "rewards/chosen": 5.980090141296387, "rewards/margins": 3.6297004222869873, "rewards/rejected": 2.350389242172241, "step": 8920 }, { "epoch": 0.49541615234185377, "grad_norm": 65.2103042602539, "learning_rate": 5.0718925000878054e-08, "logits/chosen": -0.4266236424446106, "logits/rejected": -0.5535237193107605, "logps/chosen": -175.2716064453125, "logps/rejected": -242.2535400390625, "loss": 1.1515, "nll_loss": 0.9992551803588867, "rewards/accuracies": 0.875, "rewards/chosen": 6.330218315124512, "rewards/margins": 3.893296480178833, "rewards/rejected": 2.436922550201416, "step": 8930 }, { "epoch": 0.49597092966810447, "grad_norm": 101.24113464355469, "learning_rate": 5.063178753450381e-08, "logits/chosen": -0.13781292736530304, "logits/rejected": -0.29342782497406006, "logps/chosen": -114.6077880859375, "logps/rejected": -174.44699096679688, "loss": 1.2208, "nll_loss": 0.7622730731964111, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.34787130355835, "rewards/margins": 2.809565305709839, "rewards/rejected": 1.5383061170578003, "step": 8940 }, { "epoch": 0.49652570699435516, "grad_norm": 75.74856567382812, "learning_rate": 5.0544648148930005e-08, "logits/chosen": -0.2966635823249817, "logits/rejected": -0.437977135181427, "logps/chosen": -201.9712677001953, "logps/rejected": -264.7134094238281, "loss": 1.2483, "nll_loss": 1.0638071298599243, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.1540117263793945, "rewards/margins": 4.22914981842041, "rewards/rejected": 1.9248619079589844, "step": 8950 }, { "epoch": 0.4970804843206058, "grad_norm": 53.319393157958984, "learning_rate": 5.0457507108862474e-08, "logits/chosen": -0.20097801089286804, "logits/rejected": -0.37530016899108887, "logps/chosen": -157.8563690185547, "logps/rejected": -205.6762237548828, "loss": 1.3172, "nll_loss": 0.9467114210128784, "rewards/accuracies": 0.75, "rewards/chosen": 5.260502815246582, "rewards/margins": 2.7744460105895996, "rewards/rejected": 2.4860565662384033, "step": 8960 }, { "epoch": 0.4976352616468565, "grad_norm": 103.03054809570312, "learning_rate": 5.0370364679012134e-08, "logits/chosen": -0.2561754286289215, "logits/rejected": -0.4238462448120117, "logps/chosen": -157.69876098632812, "logps/rejected": -251.7804412841797, "loss": 1.3476, "nll_loss": 0.8771616220474243, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.624480247497559, "rewards/margins": 4.1874284744262695, "rewards/rejected": 1.4370521306991577, "step": 8970 }, { "epoch": 0.49819003897310715, "grad_norm": 60.20049285888672, "learning_rate": 5.028322112409412e-08, "logits/chosen": -0.1408107429742813, "logits/rejected": -0.3607892692089081, "logps/chosen": -142.41387939453125, "logps/rejected": -230.49667358398438, "loss": 1.266, "nll_loss": 0.7529163956642151, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.326937675476074, "rewards/margins": 4.1454057693481445, "rewards/rejected": 1.181531310081482, "step": 8980 }, { "epoch": 0.49874481629935785, "grad_norm": 68.71036529541016, "learning_rate": 5.019607670882696e-08, "logits/chosen": -0.40607452392578125, "logits/rejected": -0.4826090335845947, "logps/chosen": -166.85350036621094, "logps/rejected": -235.6752471923828, "loss": 1.3243, "nll_loss": 1.003832459449768, "rewards/accuracies": 0.75, "rewards/chosen": 6.251927852630615, "rewards/margins": 3.4062037467956543, "rewards/rejected": 2.845724105834961, "step": 8990 }, { "epoch": 0.4992995936256085, "grad_norm": 53.574527740478516, "learning_rate": 5.010893169793181e-08, "logits/chosen": -0.19645099341869354, "logits/rejected": -0.3450215458869934, "logps/chosen": -160.8070068359375, "logps/rejected": -215.6662139892578, "loss": 1.2199, "nll_loss": 0.9896215200424194, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.662204742431641, "rewards/margins": 4.123475074768066, "rewards/rejected": 1.5387299060821533, "step": 9000 }, { "epoch": 0.4992995936256085, "eval_logits/chosen": -0.4024575352668762, "eval_logits/rejected": -0.5044897198677063, "eval_logps/chosen": -192.44297790527344, "eval_logps/rejected": -259.1492919921875, "eval_loss": 1.2387713193893433, "eval_nll_loss": 0.9994342923164368, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.55182409286499, "eval_rewards/margins": 4.558903217315674, "eval_rewards/rejected": 1.9929208755493164, "eval_runtime": 17.1585, "eval_samples_per_second": 14.92, "eval_steps_per_second": 1.865, "step": 9000 }, { "epoch": 0.4998543709518592, "grad_norm": 54.899871826171875, "learning_rate": 5.0021786356131635e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -167.99615478515625, "logps/rejected": -205.74496459960938, "loss": 1.3748, "nll_loss": NaN, "rewards/accuracies": 0.75, "rewards/chosen": 5.293550491333008, "rewards/margins": 2.5636558532714844, "rewards/rejected": 2.7298946380615234, "step": 9010 }, { "epoch": 0.5004091482781099, "grad_norm": 58.67256546020508, "learning_rate": 4.9934640948150405e-08, "logits/chosen": -0.3223643898963928, "logits/rejected": -0.47426262497901917, "logps/chosen": -164.1899871826172, "logps/rejected": -210.4023895263672, "loss": 1.1899, "nll_loss": 0.9342159032821655, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.415058612823486, "rewards/margins": 3.5778567790985107, "rewards/rejected": 1.837201476097107, "step": 9020 }, { "epoch": 0.5009639256043605, "grad_norm": 49.9996223449707, "learning_rate": 4.984749573871227e-08, "logits/chosen": -0.3344659209251404, "logits/rejected": -0.4435255527496338, "logps/chosen": -145.72366333007812, "logps/rejected": -205.8213348388672, "loss": 1.2959, "nll_loss": 0.9082239866256714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.198209285736084, "rewards/margins": 3.0010673999786377, "rewards/rejected": 2.1971421241760254, "step": 9030 }, { "epoch": 0.5015187029306112, "grad_norm": 47.84605026245117, "learning_rate": 4.9760350992540836e-08, "logits/chosen": -0.3379828631877899, "logits/rejected": -0.48572176694869995, "logps/chosen": -164.86590576171875, "logps/rejected": -215.35763549804688, "loss": 1.2641, "nll_loss": 1.043215274810791, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.340142250061035, "rewards/margins": 3.15108585357666, "rewards/rejected": 2.189056396484375, "step": 9040 }, { "epoch": 0.5020734802568619, "grad_norm": 69.17318725585938, "learning_rate": 4.967320697435824e-08, "logits/chosen": -0.43921709060668945, "logits/rejected": -0.4727630615234375, "logps/chosen": -218.1172637939453, "logps/rejected": -279.47357177734375, "loss": 1.3764, "nll_loss": 1.1762627363204956, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.6129326820373535, "rewards/margins": 3.7094714641571045, "rewards/rejected": 2.90346097946167, "step": 9050 }, { "epoch": 0.5026282575831126, "grad_norm": 45.45820999145508, "learning_rate": 4.958606394888445e-08, "logits/chosen": -0.23763099312782288, "logits/rejected": -0.3468344211578369, "logps/chosen": -150.2931671142578, "logps/rejected": -200.9547576904297, "loss": 1.2636, "nll_loss": 0.8626793026924133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.093936443328857, "rewards/margins": 3.113210678100586, "rewards/rejected": 1.9807260036468506, "step": 9060 }, { "epoch": 0.5031830349093632, "grad_norm": 41.96353530883789, "learning_rate": 4.949892218083638e-08, "logits/chosen": -0.30053022503852844, "logits/rejected": -0.4640938341617584, "logps/chosen": -155.72662353515625, "logps/rejected": -216.5644989013672, "loss": 1.2078, "nll_loss": 0.9101463556289673, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.764248847961426, "rewards/margins": 3.896909236907959, "rewards/rejected": 1.8673397302627563, "step": 9070 }, { "epoch": 0.503737812235614, "grad_norm": 95.07010650634766, "learning_rate": 4.941178193492713e-08, "logits/chosen": -0.26831507682800293, "logits/rejected": -0.3470011353492737, "logps/chosen": -150.46742248535156, "logps/rejected": -210.14584350585938, "loss": 1.2531, "nll_loss": 0.9086271524429321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.583650588989258, "rewards/margins": 2.7660763263702393, "rewards/rejected": 2.8175745010375977, "step": 9080 }, { "epoch": 0.5042925895618646, "grad_norm": 55.530311584472656, "learning_rate": 4.932464347586522e-08, "logits/chosen": -0.36913302540779114, "logits/rejected": -0.43520718812942505, "logps/chosen": -192.30624389648438, "logps/rejected": -214.4044647216797, "loss": 1.394, "nll_loss": 1.1062183380126953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.194088459014893, "rewards/margins": 3.0849320888519287, "rewards/rejected": 3.1091561317443848, "step": 9090 }, { "epoch": 0.5048473668881153, "grad_norm": 45.58949661254883, "learning_rate": 4.9237507068353705e-08, "logits/chosen": -0.31153604388237, "logits/rejected": -0.45507732033729553, "logps/chosen": -167.9213104248047, "logps/rejected": -238.15774536132812, "loss": 1.2972, "nll_loss": 0.9619966745376587, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.149403095245361, "rewards/margins": 4.006768226623535, "rewards/rejected": 2.142634630203247, "step": 9100 }, { "epoch": 0.5054021442143659, "grad_norm": 43.78194808959961, "learning_rate": 4.91503729770894e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -141.9030303955078, "logps/rejected": -233.8895721435547, "loss": 1.2054, "nll_loss": NaN, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.323995113372803, "rewards/margins": 3.1030120849609375, "rewards/rejected": 2.2209832668304443, "step": 9110 }, { "epoch": 0.5059569215406167, "grad_norm": 62.937625885009766, "learning_rate": 4.906324146676212e-08, "logits/chosen": -0.12656566500663757, "logits/rejected": -0.3648197650909424, "logps/chosen": -121.91495513916016, "logps/rejected": -181.48928833007812, "loss": 1.2619, "nll_loss": 0.7313886880874634, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.360525608062744, "rewards/margins": 3.2105624675750732, "rewards/rejected": 1.1499627828598022, "step": 9120 }, { "epoch": 0.5065116988668673, "grad_norm": 73.8137435913086, "learning_rate": 4.897611280205377e-08, "logits/chosen": -0.3255676031112671, "logits/rejected": -0.4526425302028656, "logps/chosen": -153.31971740722656, "logps/rejected": -201.62258911132812, "loss": 1.2252, "nll_loss": 0.9077402353286743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.789377212524414, "rewards/margins": 3.29107928276062, "rewards/rejected": 2.498297929763794, "step": 9130 }, { "epoch": 0.507066476193118, "grad_norm": 136.17686462402344, "learning_rate": 4.888898724763772e-08, "logits/chosen": -0.37891626358032227, "logits/rejected": -0.5509136319160461, "logps/chosen": -166.04354858398438, "logps/rejected": -227.5803680419922, "loss": 1.334, "nll_loss": 0.9960344433784485, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.237698078155518, "rewards/margins": 3.9502997398376465, "rewards/rejected": 2.2873973846435547, "step": 9140 }, { "epoch": 0.5076212535193687, "grad_norm": 90.369384765625, "learning_rate": 4.8801865068177804e-08, "logits/chosen": -0.37518784403800964, "logits/rejected": -0.48688000440597534, "logps/chosen": -155.38690185546875, "logps/rejected": -219.45492553710938, "loss": 1.4013, "nll_loss": 1.0259861946105957, "rewards/accuracies": 0.75, "rewards/chosen": 5.525850296020508, "rewards/margins": 3.486804962158203, "rewards/rejected": 2.0390450954437256, "step": 9150 }, { "epoch": 0.5081760308456194, "grad_norm": 62.08201599121094, "learning_rate": 4.871474652832763e-08, "logits/chosen": -0.5720881223678589, "logits/rejected": -0.6331910490989685, "logps/chosen": -214.214111328125, "logps/rejected": -301.6823425292969, "loss": 1.3024, "nll_loss": 1.1901862621307373, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 7.2337236404418945, "rewards/margins": 3.9172492027282715, "rewards/rejected": 3.316474199295044, "step": 9160 }, { "epoch": 0.50873080817187, "grad_norm": 104.32915496826172, "learning_rate": 4.8627631892729755e-08, "logits/chosen": -0.3542863726615906, "logits/rejected": -0.47028714418411255, "logps/chosen": -151.53012084960938, "logps/rejected": -191.0808563232422, "loss": 1.3353, "nll_loss": 0.8963730931282043, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.543292999267578, "rewards/margins": 3.2578887939453125, "rewards/rejected": 2.2854042053222656, "step": 9170 }, { "epoch": 0.5092855854981206, "grad_norm": 84.5309829711914, "learning_rate": 4.854052142601485e-08, "logits/chosen": -0.42346876859664917, "logits/rejected": -0.45761674642562866, "logps/chosen": -210.19479370117188, "logps/rejected": -253.4579620361328, "loss": 1.2789, "nll_loss": 1.1564407348632812, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.436079978942871, "rewards/margins": 3.428272247314453, "rewards/rejected": 3.007807970046997, "step": 9180 }, { "epoch": 0.5098403628243714, "grad_norm": 56.795047760009766, "learning_rate": 4.8453415392800975e-08, "logits/chosen": -0.34913143515586853, "logits/rejected": -0.5700281262397766, "logps/chosen": -198.15798950195312, "logps/rejected": -287.1739196777344, "loss": 1.3482, "nll_loss": 0.9863710403442383, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.719827175140381, "rewards/margins": 5.301693916320801, "rewards/rejected": 1.4181333780288696, "step": 9190 }, { "epoch": 0.510395140150622, "grad_norm": 66.92633056640625, "learning_rate": 4.836631405769268e-08, "logits/chosen": -0.33419111371040344, "logits/rejected": -0.46793827414512634, "logps/chosen": -167.11634826660156, "logps/rejected": -215.9599151611328, "loss": 1.2058, "nll_loss": 0.9543848037719727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.533473014831543, "rewards/margins": 3.174680233001709, "rewards/rejected": 2.358793020248413, "step": 9200 }, { "epoch": 0.5109499174768727, "grad_norm": 61.43981170654297, "learning_rate": 4.827921768528025e-08, "logits/chosen": -0.25474053621292114, "logits/rejected": -0.3410353660583496, "logps/chosen": -163.89346313476562, "logps/rejected": -216.3996124267578, "loss": 1.3041, "nll_loss": 0.9180054664611816, "rewards/accuracies": 0.75, "rewards/chosen": 5.6906418800354, "rewards/margins": 2.7940871715545654, "rewards/rejected": 2.8965542316436768, "step": 9210 }, { "epoch": 0.5115046948031234, "grad_norm": 94.89171600341797, "learning_rate": 4.81921265401389e-08, "logits/chosen": -0.30657464265823364, "logits/rejected": -0.4406011998653412, "logps/chosen": -154.62619018554688, "logps/rejected": -199.08432006835938, "loss": 1.2845, "nll_loss": 0.9502051472663879, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.735767364501953, "rewards/margins": 3.5599846839904785, "rewards/rejected": 2.175783157348633, "step": 9220 }, { "epoch": 0.5120594721293741, "grad_norm": 71.24378204345703, "learning_rate": 4.810504088682795e-08, "logits/chosen": -0.4792702794075012, "logits/rejected": -0.5561822652816772, "logps/chosen": -193.745849609375, "logps/rejected": -241.57131958007812, "loss": 1.2862, "nll_loss": 1.1115574836730957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 6.186019420623779, "rewards/margins": 2.6864452362060547, "rewards/rejected": 3.499574661254883, "step": 9230 }, { "epoch": 0.5126142494556247, "grad_norm": 50.33415603637695, "learning_rate": 4.8017960989890084e-08, "logits/chosen": -0.275177538394928, "logits/rejected": -0.39323943853378296, "logps/chosen": -178.32994079589844, "logps/rejected": -220.71353149414062, "loss": 1.3117, "nll_loss": 1.0821926593780518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.963408946990967, "rewards/margins": 2.5242600440979004, "rewards/rejected": 2.4391491413116455, "step": 9240 }, { "epoch": 0.5131690267818754, "grad_norm": 53.26866912841797, "learning_rate": 4.793088711385044e-08, "logits/chosen": -0.26483121514320374, "logits/rejected": -0.4290854036808014, "logps/chosen": -141.33038330078125, "logps/rejected": -187.74212646484375, "loss": 1.2095, "nll_loss": 0.8839927911758423, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.510378360748291, "rewards/margins": 3.708258867263794, "rewards/rejected": 1.8021198511123657, "step": 9250 }, { "epoch": 0.5137238041081261, "grad_norm": 58.61595153808594, "learning_rate": 4.7843819523215904e-08, "logits/chosen": -0.23977124691009521, "logits/rejected": -0.3805224895477295, "logps/chosen": -177.07260131835938, "logps/rejected": -231.0491180419922, "loss": 1.2659, "nll_loss": 0.9728581309318542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.701986789703369, "rewards/margins": 3.4946236610412598, "rewards/rejected": 2.2073636054992676, "step": 9260 }, { "epoch": 0.5142785814343768, "grad_norm": 70.80389404296875, "learning_rate": 4.7756758482474266e-08, "logits/chosen": -0.33648785948753357, "logits/rejected": -0.5013399720191956, "logps/chosen": -190.47677612304688, "logps/rejected": -251.017578125, "loss": 1.2799, "nll_loss": 1.0157908201217651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.240078926086426, "rewards/margins": 4.198624610900879, "rewards/rejected": 2.0414538383483887, "step": 9270 }, { "epoch": 0.5148333587606274, "grad_norm": 40.859596252441406, "learning_rate": 4.766970425609338e-08, "logits/chosen": -0.22810812294483185, "logits/rejected": -0.43857163190841675, "logps/chosen": -168.486083984375, "logps/rejected": -222.28384399414062, "loss": 1.1961, "nll_loss": 0.9034191370010376, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.565457344055176, "rewards/margins": 3.08699369430542, "rewards/rejected": 2.478463649749756, "step": 9280 }, { "epoch": 0.5153881360868782, "grad_norm": 60.25469970703125, "learning_rate": 4.758265710852047e-08, "logits/chosen": -0.3398720622062683, "logits/rejected": -0.4708589017391205, "logps/chosen": -140.80482482910156, "logps/rejected": -200.8313751220703, "loss": 1.2658, "nll_loss": 1.0187203884124756, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.520662307739258, "rewards/margins": 3.972774028778076, "rewards/rejected": 1.5478891134262085, "step": 9290 }, { "epoch": 0.5159429134131288, "grad_norm": 71.22685241699219, "learning_rate": 4.749561730418121e-08, "logits/chosen": -0.25368431210517883, "logits/rejected": -0.3661794662475586, "logps/chosen": -163.49826049804688, "logps/rejected": -211.91513061523438, "loss": 1.1583, "nll_loss": 1.0610846281051636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.6848297119140625, "rewards/margins": 3.0135960578918457, "rewards/rejected": 2.6712327003479004, "step": 9300 }, { "epoch": 0.5164976907393795, "grad_norm": 40.56968688964844, "learning_rate": 4.7408585107478966e-08, "logits/chosen": -0.30613285303115845, "logits/rejected": -0.463456928730011, "logps/chosen": -157.46041870117188, "logps/rejected": -230.9835968017578, "loss": 1.1434, "nll_loss": 0.8922632932662964, "rewards/accuracies": 0.875, "rewards/chosen": 5.7883405685424805, "rewards/margins": 4.518655776977539, "rewards/rejected": 1.269684910774231, "step": 9310 }, { "epoch": 0.5170524680656302, "grad_norm": 105.7554702758789, "learning_rate": 4.7321560782794e-08, "logits/chosen": -0.2926510274410248, "logits/rejected": -0.4659983515739441, "logps/chosen": -149.47518920898438, "logps/rejected": -204.29615783691406, "loss": 1.2726, "nll_loss": 1.0316669940948486, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.572632789611816, "rewards/margins": 3.1992945671081543, "rewards/rejected": 2.373338460922241, "step": 9320 }, { "epoch": 0.5176072453918809, "grad_norm": 45.35145950317383, "learning_rate": 4.723454459448267e-08, "logits/chosen": -0.39577716588974, "logits/rejected": -0.5708610415458679, "logps/chosen": -166.49581909179688, "logps/rejected": -234.752197265625, "loss": 1.2841, "nll_loss": 1.0080711841583252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.923181056976318, "rewards/margins": 4.514263153076172, "rewards/rejected": 1.408917784690857, "step": 9330 }, { "epoch": 0.5181620227181315, "grad_norm": 49.20162582397461, "learning_rate": 4.714753680687661e-08, "logits/chosen": -0.29385411739349365, "logits/rejected": -0.49700015783309937, "logps/chosen": -186.43679809570312, "logps/rejected": -231.0037841796875, "loss": 1.2745, "nll_loss": 0.9985346794128418, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.996167182922363, "rewards/margins": 4.20475435256958, "rewards/rejected": 1.7914127111434937, "step": 9340 }, { "epoch": 0.5187168000443821, "grad_norm": 62.807979583740234, "learning_rate": 4.706053768428194e-08, "logits/chosen": -0.3283035159111023, "logits/rejected": -0.45695775747299194, "logps/chosen": -152.83058166503906, "logps/rejected": -229.2628173828125, "loss": 1.2467, "nll_loss": 0.9692705869674683, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.828050136566162, "rewards/margins": 3.535831928253174, "rewards/rejected": 2.2922184467315674, "step": 9350 }, { "epoch": 0.5192715773706329, "grad_norm": 50.36726379394531, "learning_rate": 4.6973547490978464e-08, "logits/chosen": -0.2728636860847473, "logits/rejected": -0.3863973617553711, "logps/chosen": -159.2672882080078, "logps/rejected": -209.1708526611328, "loss": 1.268, "nll_loss": 0.9585272073745728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.8927998542785645, "rewards/margins": 4.422527313232422, "rewards/rejected": 1.4702714681625366, "step": 9360 }, { "epoch": 0.5198263546968835, "grad_norm": 40.91887283325195, "learning_rate": 4.688656649121884e-08, "logits/chosen": -0.2662855088710785, "logits/rejected": -0.46767717599868774, "logps/chosen": -149.1509246826172, "logps/rejected": -214.30929565429688, "loss": 1.1965, "nll_loss": 0.9241575002670288, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.647675037384033, "rewards/margins": 3.9500439167022705, "rewards/rejected": 1.6976312398910522, "step": 9370 }, { "epoch": 0.5203811320231342, "grad_norm": 42.508792877197266, "learning_rate": 4.679959494922778e-08, "logits/chosen": -0.3795866370201111, "logits/rejected": -0.4913211464881897, "logps/chosen": -180.6576385498047, "logps/rejected": -218.34091186523438, "loss": 1.3003, "nll_loss": 1.0153512954711914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.015936374664307, "rewards/margins": 3.199903964996338, "rewards/rejected": 2.8160321712493896, "step": 9380 }, { "epoch": 0.5209359093493849, "grad_norm": 62.78444290161133, "learning_rate": 4.6712633129201365e-08, "logits/chosen": -0.34073182940483093, "logits/rejected": -0.5074422955513, "logps/chosen": -158.64015197753906, "logps/rejected": -231.17691040039062, "loss": 1.2725, "nll_loss": 0.9371173977851868, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.835041046142578, "rewards/margins": 4.626204490661621, "rewards/rejected": 1.2088369131088257, "step": 9390 }, { "epoch": 0.5214906866756356, "grad_norm": 58.16633987426758, "learning_rate": 4.662568129530603e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -156.18252563476562, "logps/rejected": -216.96456909179688, "loss": 1.2165, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 5.5341057777404785, "rewards/margins": 4.276349067687988, "rewards/rejected": 1.2577569484710693, "step": 9400 }, { "epoch": 0.5220454640018862, "grad_norm": 66.12977600097656, "learning_rate": 4.6538739711677946e-08, "logits/chosen": -0.2907131314277649, "logits/rejected": -0.4284954071044922, "logps/chosen": -148.6352996826172, "logps/rejected": -196.28317260742188, "loss": 1.2471, "nll_loss": 0.9851943850517273, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.316567420959473, "rewards/margins": 3.416973829269409, "rewards/rejected": 1.899593710899353, "step": 9410 }, { "epoch": 0.5226002413281369, "grad_norm": 104.19569396972656, "learning_rate": 4.645180864242208e-08, "logits/chosen": -0.2709147334098816, "logits/rejected": -0.348580002784729, "logps/chosen": -168.90243530273438, "logps/rejected": -231.1278533935547, "loss": 1.2662, "nll_loss": 0.9691041111946106, "rewards/accuracies": 0.875, "rewards/chosen": 6.149167060852051, "rewards/margins": 3.7617859840393066, "rewards/rejected": 2.3873813152313232, "step": 9420 }, { "epoch": 0.5231550186543876, "grad_norm": 25.868131637573242, "learning_rate": 4.636488835161151e-08, "logits/chosen": -0.35118424892425537, "logits/rejected": -0.5087558031082153, "logps/chosen": -182.45474243164062, "logps/rejected": -249.97134399414062, "loss": 1.2443, "nll_loss": 1.010013222694397, "rewards/accuracies": 0.875, "rewards/chosen": 6.450409889221191, "rewards/margins": 4.733773231506348, "rewards/rejected": 1.7166366577148438, "step": 9430 }, { "epoch": 0.5237097959806383, "grad_norm": 50.46853256225586, "learning_rate": 4.6277979103286604e-08, "logits/chosen": -0.2707952857017517, "logits/rejected": -0.4313136637210846, "logps/chosen": -158.99472045898438, "logps/rejected": -214.5619354248047, "loss": 1.3258, "nll_loss": 0.9249902963638306, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.994052410125732, "rewards/margins": 3.4994099140167236, "rewards/rejected": 2.494642734527588, "step": 9440 }, { "epoch": 0.5242645733068889, "grad_norm": 123.99307250976562, "learning_rate": 4.6191081161454104e-08, "logits/chosen": -0.3308565020561218, "logits/rejected": -0.4992128312587738, "logps/chosen": -195.032470703125, "logps/rejected": -230.4774932861328, "loss": 1.2792, "nll_loss": 0.9932400584220886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.721206188201904, "rewards/margins": 3.06563138961792, "rewards/rejected": 2.6555750370025635, "step": 9450 }, { "epoch": 0.5248193506331397, "grad_norm": 40.34004592895508, "learning_rate": 4.610419479008646e-08, "logits/chosen": -0.2730061411857605, "logits/rejected": -0.4122668206691742, "logps/chosen": -177.38949584960938, "logps/rejected": -219.531005859375, "loss": 1.2804, "nll_loss": 1.013285517692566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.72647762298584, "rewards/margins": 3.58758544921875, "rewards/rejected": 2.1388919353485107, "step": 9460 }, { "epoch": 0.5253741279593903, "grad_norm": 49.95905685424805, "learning_rate": 4.601732025312094e-08, "logits/chosen": -0.07873831689357758, "logits/rejected": -0.35095566511154175, "logps/chosen": -119.7325439453125, "logps/rejected": -169.24378967285156, "loss": 1.2533, "nll_loss": 0.7773770093917847, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.6324920654296875, "rewards/margins": 3.6657466888427734, "rewards/rejected": 0.9667451977729797, "step": 9470 }, { "epoch": 0.525928905285641, "grad_norm": 76.98511505126953, "learning_rate": 4.5930457814458904e-08, "logits/chosen": -0.2371709793806076, "logits/rejected": -0.3776131272315979, "logps/chosen": -136.942626953125, "logps/rejected": -186.45135498046875, "loss": 1.3325, "nll_loss": 0.9275113940238953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.964024543762207, "rewards/margins": 4.096395015716553, "rewards/rejected": 0.8676289319992065, "step": 9480 }, { "epoch": 0.5264836826118916, "grad_norm": 31.29893684387207, "learning_rate": 4.5843607737964936e-08, "logits/chosen": -0.26233673095703125, "logits/rejected": -0.3851124048233032, "logps/chosen": -166.62661743164062, "logps/rejected": -233.799072265625, "loss": 1.2413, "nll_loss": 1.006217122077942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.642024040222168, "rewards/margins": 4.011407852172852, "rewards/rejected": 1.6306159496307373, "step": 9490 }, { "epoch": 0.5270384599381424, "grad_norm": 68.17864227294922, "learning_rate": 4.575677028746606e-08, "logits/chosen": -0.31898969411849976, "logits/rejected": -0.44344624876976013, "logps/chosen": -159.30613708496094, "logps/rejected": -202.1865234375, "loss": 1.2221, "nll_loss": 0.952996551990509, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.8765974044799805, "rewards/margins": 4.293417930603027, "rewards/rejected": 1.5831793546676636, "step": 9500 }, { "epoch": 0.5270384599381424, "eval_logits/chosen": -0.3847997784614563, "eval_logits/rejected": -0.48578494787216187, "eval_logps/chosen": -191.77676391601562, "eval_logps/rejected": -261.9651184082031, "eval_loss": 1.2324037551879883, "eval_nll_loss": 0.9943583011627197, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.618445873260498, "eval_rewards/margins": 4.907104969024658, "eval_rewards/rejected": 1.7113406658172607, "eval_runtime": 16.8711, "eval_samples_per_second": 15.174, "eval_steps_per_second": 1.897, "step": 9500 }, { "epoch": 0.527593237264393, "grad_norm": 63.11122131347656, "learning_rate": 4.566994572675096e-08, "logits/chosen": -0.33774352073669434, "logits/rejected": -0.47510844469070435, "logps/chosen": -159.69754028320312, "logps/rejected": -214.65634155273438, "loss": 1.3338, "nll_loss": 0.9783695340156555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.225392818450928, "rewards/margins": 3.6365272998809814, "rewards/rejected": 2.5888662338256836, "step": 9510 }, { "epoch": 0.5281480145906436, "grad_norm": 27.57427215576172, "learning_rate": 4.5583134319569135e-08, "logits/chosen": -0.209748774766922, "logits/rejected": -0.33275189995765686, "logps/chosen": -148.1979217529297, "logps/rejected": -178.30247497558594, "loss": 1.2496, "nll_loss": 0.9017229080200195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.141815185546875, "rewards/margins": 2.856659412384033, "rewards/rejected": 2.285155773162842, "step": 9520 }, { "epoch": 0.5287027919168944, "grad_norm": 61.95778274536133, "learning_rate": 4.549633632963019e-08, "logits/chosen": -0.2876083254814148, "logits/rejected": -0.42835497856140137, "logps/chosen": -196.5648956298828, "logps/rejected": -240.12460327148438, "loss": 1.2341, "nll_loss": 0.9770992398262024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.147914886474609, "rewards/margins": 4.078275203704834, "rewards/rejected": 2.069639205932617, "step": 9530 }, { "epoch": 0.529257569243145, "grad_norm": 61.39414596557617, "learning_rate": 4.540955202060293e-08, "logits/chosen": -0.3222961723804474, "logits/rejected": -0.5214440226554871, "logps/chosen": -192.00732421875, "logps/rejected": -298.51129150390625, "loss": 1.2755, "nll_loss": 0.9837193489074707, "rewards/accuracies": 0.875, "rewards/chosen": 6.313649654388428, "rewards/margins": 5.230317115783691, "rewards/rejected": 1.0833323001861572, "step": 9540 }, { "epoch": 0.5298123465693957, "grad_norm": 78.86589813232422, "learning_rate": 4.532278165611458e-08, "logits/chosen": -0.23326829075813293, "logits/rejected": -0.39289242029190063, "logps/chosen": -139.88034057617188, "logps/rejected": -177.99623107910156, "loss": 1.2034, "nll_loss": 0.8656681180000305, "rewards/accuracies": 0.875, "rewards/chosen": 5.314993858337402, "rewards/margins": 3.008488178253174, "rewards/rejected": 2.3065056800842285, "step": 9550 }, { "epoch": 0.5303671238956463, "grad_norm": 163.3700714111328, "learning_rate": 4.5236025499750055e-08, "logits/chosen": -0.42433637380599976, "logits/rejected": -0.5292702913284302, "logps/chosen": -195.80093383789062, "logps/rejected": -266.8990173339844, "loss": 1.3414, "nll_loss": 1.0810550451278687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.120522499084473, "rewards/margins": 3.1135170459747314, "rewards/rejected": 3.007004976272583, "step": 9560 }, { "epoch": 0.5309219012218971, "grad_norm": 39.24164962768555, "learning_rate": 4.5149283815051045e-08, "logits/chosen": -0.16320346295833588, "logits/rejected": -0.2912190854549408, "logps/chosen": -142.26515197753906, "logps/rejected": -218.2663116455078, "loss": 1.2095, "nll_loss": 0.839637279510498, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.451222896575928, "rewards/margins": 4.63043737411499, "rewards/rejected": 0.8207852244377136, "step": 9570 }, { "epoch": 0.5314766785481477, "grad_norm": 57.75300598144531, "learning_rate": 4.506255686551537e-08, "logits/chosen": -0.37713342905044556, "logits/rejected": -0.5360641479492188, "logps/chosen": -187.2853546142578, "logps/rejected": -267.45574951171875, "loss": 1.3412, "nll_loss": 1.0303289890289307, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.236812114715576, "rewards/margins": 4.56418514251709, "rewards/rejected": 1.672626256942749, "step": 9580 }, { "epoch": 0.5320314558743984, "grad_norm": 67.17675018310547, "learning_rate": 4.497584491459601e-08, "logits/chosen": -0.23767873644828796, "logits/rejected": -0.42438793182373047, "logps/chosen": -161.55792236328125, "logps/rejected": -221.64810180664062, "loss": 1.2575, "nll_loss": 0.8866890072822571, "rewards/accuracies": 0.875, "rewards/chosen": 5.824664115905762, "rewards/margins": 4.561118125915527, "rewards/rejected": 1.2635459899902344, "step": 9590 }, { "epoch": 0.5325862332006491, "grad_norm": 58.9091796875, "learning_rate": 4.48891482257004e-08, "logits/chosen": -0.26320698857307434, "logits/rejected": -0.3689182698726654, "logps/chosen": -163.16287231445312, "logps/rejected": -196.4301300048828, "loss": 1.3376, "nll_loss": 1.03379225730896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.770537853240967, "rewards/margins": 3.680947780609131, "rewards/rejected": 2.089590072631836, "step": 9600 }, { "epoch": 0.5331410105268998, "grad_norm": 61.47688293457031, "learning_rate": 4.480246706218964e-08, "logits/chosen": -0.2204124480485916, "logits/rejected": -0.31043797731399536, "logps/chosen": -161.21546936035156, "logps/rejected": -218.0150909423828, "loss": 1.306, "nll_loss": 0.8733453750610352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.2873029708862305, "rewards/margins": 3.6474196910858154, "rewards/rejected": 1.6398833990097046, "step": 9610 }, { "epoch": 0.5336957878531504, "grad_norm": 48.654083251953125, "learning_rate": 4.471580168737763e-08, "logits/chosen": -0.20148694515228271, "logits/rejected": -0.3757801055908203, "logps/chosen": -156.82908630371094, "logps/rejected": -218.42886352539062, "loss": 1.3186, "nll_loss": 0.9077130556106567, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.419076919555664, "rewards/margins": 3.657808780670166, "rewards/rejected": 1.7612682580947876, "step": 9620 }, { "epoch": 0.5342505651794012, "grad_norm": 93.73970031738281, "learning_rate": 4.462915236453037e-08, "logits/chosen": -0.137271448969841, "logits/rejected": -0.24905912578105927, "logps/chosen": -151.14271545410156, "logps/rejected": -197.56529235839844, "loss": 1.2949, "nll_loss": 0.8466536402702332, "rewards/accuracies": 0.875, "rewards/chosen": 5.110020637512207, "rewards/margins": 3.130384922027588, "rewards/rejected": 1.9796355962753296, "step": 9630 }, { "epoch": 0.5348053425056518, "grad_norm": 65.98847198486328, "learning_rate": 4.4542519356865025e-08, "logits/chosen": -0.17641180753707886, "logits/rejected": -0.40553078055381775, "logps/chosen": -115.76924133300781, "logps/rejected": -193.11819458007812, "loss": 1.176, "nll_loss": 0.7737227082252502, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.877331733703613, "rewards/margins": 3.8386740684509277, "rewards/rejected": 1.0386579036712646, "step": 9640 }, { "epoch": 0.5353601198319025, "grad_norm": 54.22218704223633, "learning_rate": 4.445590292754926e-08, "logits/chosen": -0.27017146348953247, "logits/rejected": -0.44109922647476196, "logps/chosen": -162.62646484375, "logps/rejected": -268.144287109375, "loss": 1.2267, "nll_loss": 0.8913278579711914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.323023319244385, "rewards/margins": 4.736567497253418, "rewards/rejected": 1.5864553451538086, "step": 9650 }, { "epoch": 0.5359148971581531, "grad_norm": 54.476959228515625, "learning_rate": 4.436930333970032e-08, "logits/chosen": -0.38154515624046326, "logits/rejected": -0.5023744702339172, "logps/chosen": -179.91366577148438, "logps/rejected": -209.097412109375, "loss": 1.2911, "nll_loss": 1.0336401462554932, "rewards/accuracies": 0.875, "rewards/chosen": 6.029297828674316, "rewards/margins": 3.258763551712036, "rewards/rejected": 2.7705347537994385, "step": 9660 }, { "epoch": 0.5364696744844039, "grad_norm": 41.79189682006836, "learning_rate": 4.428272085638431e-08, "logits/chosen": -0.23520343005657196, "logits/rejected": -0.42612147331237793, "logps/chosen": -166.47738647460938, "logps/rejected": -210.00961303710938, "loss": 1.181, "nll_loss": 0.9167992472648621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.926709175109863, "rewards/margins": 4.105425834655762, "rewards/rejected": 1.821282982826233, "step": 9670 }, { "epoch": 0.5370244518106545, "grad_norm": 58.55911636352539, "learning_rate": 4.4196155740615434e-08, "logits/chosen": -0.3951946496963501, "logits/rejected": -0.46657371520996094, "logps/chosen": -187.66921997070312, "logps/rejected": -247.8221893310547, "loss": 1.2993, "nll_loss": 1.1953412294387817, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.88215970993042, "rewards/margins": 3.2391600608825684, "rewards/rejected": 2.6429996490478516, "step": 9680 }, { "epoch": 0.5375792291369051, "grad_norm": 67.51808166503906, "learning_rate": 4.4109608255355066e-08, "logits/chosen": -0.4071227014064789, "logits/rejected": -0.4993719160556793, "logps/chosen": -195.88389587402344, "logps/rejected": -245.6428680419922, "loss": 1.3397, "nll_loss": 1.1394661664962769, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.183575630187988, "rewards/margins": 3.207012891769409, "rewards/rejected": 2.976562976837158, "step": 9690 }, { "epoch": 0.5381340064631559, "grad_norm": 51.367610931396484, "learning_rate": 4.4023078663511065e-08, "logits/chosen": -0.32586947083473206, "logits/rejected": -0.48920202255249023, "logps/chosen": -170.09494018554688, "logps/rejected": -206.68032836914062, "loss": 1.2709, "nll_loss": 0.9710670709609985, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.917175769805908, "rewards/margins": 3.1624655723571777, "rewards/rejected": 2.754709482192993, "step": 9700 }, { "epoch": 0.5386887837894065, "grad_norm": 52.32279586791992, "learning_rate": 4.393656722793689e-08, "logits/chosen": -0.25959348678588867, "logits/rejected": -0.41398563981056213, "logps/chosen": -149.76123046875, "logps/rejected": -193.00668334960938, "loss": 1.3175, "nll_loss": 0.936292827129364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.5646162033081055, "rewards/margins": 3.4865188598632812, "rewards/rejected": 2.078097105026245, "step": 9710 }, { "epoch": 0.5392435611156572, "grad_norm": 86.81396484375, "learning_rate": 4.38500742114309e-08, "logits/chosen": -0.2789912819862366, "logits/rejected": -0.3998999297618866, "logps/chosen": -200.81280517578125, "logps/rejected": -261.4305114746094, "loss": 1.2604, "nll_loss": 1.061387300491333, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.401424407958984, "rewards/margins": 3.613424777984619, "rewards/rejected": 2.7879996299743652, "step": 9720 }, { "epoch": 0.5397983384419078, "grad_norm": 75.32423400878906, "learning_rate": 4.376359987673546e-08, "logits/chosen": -0.26495617628097534, "logits/rejected": -0.3843373954296112, "logps/chosen": -143.46360778808594, "logps/rejected": -198.763916015625, "loss": 1.3028, "nll_loss": 0.8967748880386353, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.674996852874756, "rewards/margins": 3.3870913982391357, "rewards/rejected": 2.287905216217041, "step": 9730 }, { "epoch": 0.5403531157681586, "grad_norm": 37.746395111083984, "learning_rate": 4.367714448653622e-08, "logits/chosen": -0.24729077517986298, "logits/rejected": -0.42625313997268677, "logps/chosen": -166.9033966064453, "logps/rejected": -199.24484252929688, "loss": 1.3007, "nll_loss": 1.0432840585708618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.175162315368652, "rewards/margins": 3.7920310497283936, "rewards/rejected": 1.383131742477417, "step": 9740 }, { "epoch": 0.5409078930944092, "grad_norm": 39.954654693603516, "learning_rate": 4.3590708303461256e-08, "logits/chosen": -0.24685220420360565, "logits/rejected": -0.37900620698928833, "logps/chosen": -172.75445556640625, "logps/rejected": -252.6306610107422, "loss": 1.2514, "nll_loss": 0.9287413358688354, "rewards/accuracies": 0.875, "rewards/chosen": 6.060909271240234, "rewards/margins": 3.7459397315979004, "rewards/rejected": 2.314969778060913, "step": 9750 }, { "epoch": 0.5414626704206599, "grad_norm": 104.29252624511719, "learning_rate": 4.350429159008029e-08, "logits/chosen": 0.005691577680408955, "logits/rejected": -0.12545771896839142, "logps/chosen": -160.78424072265625, "logps/rejected": -223.47946166992188, "loss": 1.3204, "nll_loss": 0.8255676031112671, "rewards/accuracies": 0.75, "rewards/chosen": 5.460650444030762, "rewards/margins": 3.252558469772339, "rewards/rejected": 2.208092212677002, "step": 9760 }, { "epoch": 0.5420174477469106, "grad_norm": 46.125762939453125, "learning_rate": 4.341789460890391e-08, "logits/chosen": -0.1777302771806717, "logits/rejected": -0.33578577637672424, "logps/chosen": -142.5837860107422, "logps/rejected": -191.37918090820312, "loss": 1.2636, "nll_loss": 0.8883357048034668, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.09456729888916, "rewards/margins": 2.4341490268707275, "rewards/rejected": 2.6604175567626953, "step": 9770 }, { "epoch": 0.5425722250731613, "grad_norm": 39.68583297729492, "learning_rate": 4.3331517622382805e-08, "logits/chosen": -0.22501273453235626, "logits/rejected": -0.4059979319572449, "logps/chosen": -159.4439239501953, "logps/rejected": -229.55899047851562, "loss": 1.2376, "nll_loss": 0.900013267993927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.4433722496032715, "rewards/margins": 4.18411111831665, "rewards/rejected": 1.2592613697052002, "step": 9780 }, { "epoch": 0.5431270023994119, "grad_norm": 80.15180206298828, "learning_rate": 4.324516089290688e-08, "logits/chosen": -0.30077359080314636, "logits/rejected": -0.41646233201026917, "logps/chosen": -170.2849578857422, "logps/rejected": -233.38253784179688, "loss": 1.392, "nll_loss": 0.9865023493766785, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.135927677154541, "rewards/margins": 3.195493459701538, "rewards/rejected": 2.940434217453003, "step": 9790 }, { "epoch": 0.5436817797256626, "grad_norm": 62.00218200683594, "learning_rate": 4.3158824682804495e-08, "logits/chosen": -0.14447996020317078, "logits/rejected": -0.2670975923538208, "logps/chosen": -144.17227172851562, "logps/rejected": -204.5887908935547, "loss": 1.2697, "nll_loss": 1.0111229419708252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.344749450683594, "rewards/margins": 3.5098907947540283, "rewards/rejected": 1.8348582983016968, "step": 9800 }, { "epoch": 0.5442365570519133, "grad_norm": 78.89068603515625, "learning_rate": 4.3072509254341703e-08, "logits/chosen": -0.29741746187210083, "logits/rejected": -0.44821491837501526, "logps/chosen": -179.11917114257812, "logps/rejected": -236.12158203125, "loss": 1.3057, "nll_loss": 1.0103237628936768, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.0418572425842285, "rewards/margins": 3.351210832595825, "rewards/rejected": 2.6906466484069824, "step": 9810 }, { "epoch": 0.544791334378164, "grad_norm": 53.10793685913086, "learning_rate": 4.2986214869721414e-08, "logits/chosen": -0.3061564564704895, "logits/rejected": -0.4098960757255554, "logps/chosen": -186.2776336669922, "logps/rejected": -243.23342895507812, "loss": 1.2418, "nll_loss": 0.9879401922225952, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.188605308532715, "rewards/margins": 4.205409049987793, "rewards/rejected": 1.9831968545913696, "step": 9820 }, { "epoch": 0.5453461117044146, "grad_norm": 57.80470275878906, "learning_rate": 4.289994179108264e-08, "logits/chosen": -0.16093483567237854, "logits/rejected": -0.3503780961036682, "logps/chosen": -138.66676330566406, "logps/rejected": -183.24270629882812, "loss": 1.2144, "nll_loss": 0.8346014022827148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.369721412658691, "rewards/margins": 4.062706470489502, "rewards/rejected": 1.307015299797058, "step": 9830 }, { "epoch": 0.5459008890306654, "grad_norm": 95.7310562133789, "learning_rate": 4.2813690280499635e-08, "logits/chosen": -0.29143795371055603, "logits/rejected": -0.3641031086444855, "logps/chosen": -150.7353515625, "logps/rejected": -200.9915008544922, "loss": 1.3845, "nll_loss": 1.0360045433044434, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.378697872161865, "rewards/margins": 3.316908359527588, "rewards/rejected": 2.0617895126342773, "step": 9840 }, { "epoch": 0.546455666356916, "grad_norm": 49.37306213378906, "learning_rate": 4.272746059998116e-08, "logits/chosen": -0.37530088424682617, "logits/rejected": -0.4528474807739258, "logps/chosen": -190.97067260742188, "logps/rejected": -257.160400390625, "loss": 1.2963, "nll_loss": 1.115810751914978, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.650545597076416, "rewards/margins": 3.7628426551818848, "rewards/rejected": 2.8877029418945312, "step": 9850 }, { "epoch": 0.5470104436831666, "grad_norm": 60.86616134643555, "learning_rate": 4.264125301146965e-08, "logits/chosen": -0.3090120255947113, "logits/rejected": -0.4056679308414459, "logps/chosen": -162.8961944580078, "logps/rejected": -217.0439453125, "loss": 1.1644, "nll_loss": 1.0370877981185913, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.953455924987793, "rewards/margins": 3.510234832763672, "rewards/rejected": 2.443220853805542, "step": 9860 }, { "epoch": 0.5475652210094173, "grad_norm": 69.87907409667969, "learning_rate": 4.2555067776840403e-08, "logits/chosen": -0.1707712858915329, "logits/rejected": -0.3207014799118042, "logps/chosen": -162.30130004882812, "logps/rejected": -231.8777313232422, "loss": 1.2908, "nll_loss": 0.9085710644721985, "rewards/accuracies": 0.875, "rewards/chosen": 5.92129373550415, "rewards/margins": 4.297608852386475, "rewards/rejected": 1.6236846446990967, "step": 9870 }, { "epoch": 0.548119998335668, "grad_norm": 76.71466064453125, "learning_rate": 4.24689051579009e-08, "logits/chosen": -0.21910777688026428, "logits/rejected": -0.34946125745773315, "logps/chosen": -162.58026123046875, "logps/rejected": -215.5464324951172, "loss": 1.2457, "nll_loss": 0.9269440770149231, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.775429725646973, "rewards/margins": 4.113760471343994, "rewards/rejected": 1.6616685390472412, "step": 9880 }, { "epoch": 0.5486747756619187, "grad_norm": 48.44591522216797, "learning_rate": 4.238276541638984e-08, "logits/chosen": -0.20550286769866943, "logits/rejected": -0.38304099440574646, "logps/chosen": -153.71751403808594, "logps/rejected": -231.9537353515625, "loss": 1.2689, "nll_loss": 0.8824397325515747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.776326656341553, "rewards/margins": 4.239264011383057, "rewards/rejected": 1.5370631217956543, "step": 9890 }, { "epoch": 0.5492295529881693, "grad_norm": 54.35772705078125, "learning_rate": 4.2296648813976446e-08, "logits/chosen": -0.24491152167320251, "logits/rejected": -0.4041506350040436, "logps/chosen": -169.57485961914062, "logps/rejected": -233.47744750976562, "loss": 1.2242, "nll_loss": 1.0028597116470337, "rewards/accuracies": 0.875, "rewards/chosen": 5.852323055267334, "rewards/margins": 3.8250155448913574, "rewards/rejected": 2.0273072719573975, "step": 9900 }, { "epoch": 0.5497843303144201, "grad_norm": 54.7266731262207, "learning_rate": 4.221055561225965e-08, "logits/chosen": -0.2844238579273224, "logits/rejected": -0.43756571412086487, "logps/chosen": -162.778076171875, "logps/rejected": -210.55270385742188, "loss": 1.2546, "nll_loss": 0.8987213373184204, "rewards/accuracies": 0.875, "rewards/chosen": 5.959184646606445, "rewards/margins": 4.301226615905762, "rewards/rejected": 1.6579582691192627, "step": 9910 }, { "epoch": 0.5503391076406707, "grad_norm": 67.63874816894531, "learning_rate": 4.212448607276729e-08, "logits/chosen": -0.24086585640907288, "logits/rejected": -0.4369584918022156, "logps/chosen": -155.12686157226562, "logps/rejected": -214.6593017578125, "loss": 1.3173, "nll_loss": 0.8765009045600891, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.212924003601074, "rewards/margins": 2.9610352516174316, "rewards/rejected": 2.2518887519836426, "step": 9920 }, { "epoch": 0.5508938849669214, "grad_norm": 58.325653076171875, "learning_rate": 4.203844045695538e-08, "logits/chosen": -0.19549937546253204, "logits/rejected": -0.32256126403808594, "logps/chosen": -148.02536010742188, "logps/rejected": -201.8097381591797, "loss": 1.2249, "nll_loss": 0.8750473260879517, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.65772008895874, "rewards/margins": 3.355874538421631, "rewards/rejected": 2.301845073699951, "step": 9930 }, { "epoch": 0.5514486622931721, "grad_norm": 61.73947525024414, "learning_rate": 4.19524190262072e-08, "logits/chosen": -0.3503522574901581, "logits/rejected": -0.4940328598022461, "logps/chosen": -201.44822692871094, "logps/rejected": -299.6861877441406, "loss": 1.2018, "nll_loss": 1.0702818632125854, "rewards/accuracies": 0.875, "rewards/chosen": 6.758185386657715, "rewards/margins": 5.5747480392456055, "rewards/rejected": 1.1834368705749512, "step": 9940 }, { "epoch": 0.5520034396194228, "grad_norm": 86.86346435546875, "learning_rate": 4.186642204183258e-08, "logits/chosen": -0.20920626819133759, "logits/rejected": -0.3422732651233673, "logps/chosen": -149.57943725585938, "logps/rejected": -201.55697631835938, "loss": 1.35, "nll_loss": 0.9252266883850098, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.3443379402160645, "rewards/margins": 3.2025299072265625, "rewards/rejected": 2.141807794570923, "step": 9950 }, { "epoch": 0.5525582169456734, "grad_norm": 41.97013854980469, "learning_rate": 4.17804497650671e-08, "logits/chosen": -0.40197426080703735, "logits/rejected": -0.4947798252105713, "logps/chosen": -180.4291534423828, "logps/rejected": -249.99404907226562, "loss": 1.2468, "nll_loss": 1.0687518119812012, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.4140801429748535, "rewards/margins": 4.156526565551758, "rewards/rejected": 2.2575535774230957, "step": 9960 }, { "epoch": 0.5531129942719241, "grad_norm": 40.65499496459961, "learning_rate": 4.169450245707125e-08, "logits/chosen": -0.25668230652809143, "logits/rejected": -0.43165111541748047, "logps/chosen": -163.89199829101562, "logps/rejected": -226.5345001220703, "loss": 1.2065, "nll_loss": 0.9410833120346069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.106402397155762, "rewards/margins": 3.919654130935669, "rewards/rejected": 2.1867482662200928, "step": 9970 }, { "epoch": 0.5536677715981748, "grad_norm": 38.79108810424805, "learning_rate": 4.160858037892973e-08, "logits/chosen": -0.176131471991539, "logits/rejected": -0.38565102219581604, "logps/chosen": -157.2784881591797, "logps/rejected": -213.25076293945312, "loss": 1.1965, "nll_loss": 0.8632275462150574, "rewards/accuracies": 0.875, "rewards/chosen": 5.779348373413086, "rewards/margins": 4.213487148284912, "rewards/rejected": 1.5658613443374634, "step": 9980 }, { "epoch": 0.5542225489244255, "grad_norm": 92.07147979736328, "learning_rate": 4.152268379165054e-08, "logits/chosen": -0.2727094292640686, "logits/rejected": -0.42925962805747986, "logps/chosen": -166.07444763183594, "logps/rejected": -226.11392211914062, "loss": 1.3282, "nll_loss": 0.9418858289718628, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.982013702392578, "rewards/margins": 3.9414381980895996, "rewards/rejected": 2.0405755043029785, "step": 9990 }, { "epoch": 0.5547773262506761, "grad_norm": 66.82201385498047, "learning_rate": 4.143681295616429e-08, "logits/chosen": -0.30561619997024536, "logits/rejected": -0.4442395269870758, "logps/chosen": -172.85299682617188, "logps/rejected": -208.4752655029297, "loss": 1.2903, "nll_loss": 0.9878283739089966, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.75870418548584, "rewards/margins": 3.4746947288513184, "rewards/rejected": 2.2840092182159424, "step": 10000 }, { "epoch": 0.5547773262506761, "eval_logits/chosen": -0.37051522731781006, "eval_logits/rejected": -0.4826039671897888, "eval_logps/chosen": -191.36302185058594, "eval_logps/rejected": -262.9516296386719, "eval_loss": 1.2380090951919556, "eval_nll_loss": 0.9910838603973389, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": 6.659819602966309, "eval_rewards/margins": 5.047131538391113, "eval_rewards/rejected": 1.6126880645751953, "eval_runtime": 17.0446, "eval_samples_per_second": 15.019, "eval_steps_per_second": 1.877, "step": 10000 }, { "epoch": 0.5553321035769269, "grad_norm": 58.526893615722656, "learning_rate": 4.135096813332333e-08, "logits/chosen": -0.30008548498153687, "logits/rejected": -0.3977207839488983, "logps/chosen": -166.9575958251953, "logps/rejected": -219.8026885986328, "loss": 1.2512, "nll_loss": 1.047019600868225, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.596224784851074, "rewards/margins": 3.707197904586792, "rewards/rejected": 1.8890268802642822, "step": 10010 }, { "epoch": 0.5558868809031775, "grad_norm": 66.96456909179688, "learning_rate": 4.126514958390099e-08, "logits/chosen": -0.2781417965888977, "logits/rejected": -0.4583090841770172, "logps/chosen": -158.1600799560547, "logps/rejected": -217.20059204101562, "loss": 1.2551, "nll_loss": 0.959384560585022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.230644702911377, "rewards/margins": 3.299389362335205, "rewards/rejected": 1.9312553405761719, "step": 10020 }, { "epoch": 0.5564416582294281, "grad_norm": 41.96564865112305, "learning_rate": 4.1179357568590836e-08, "logits/chosen": -0.16881588101387024, "logits/rejected": -0.3409457802772522, "logps/chosen": -135.94915771484375, "logps/rejected": -201.0962677001953, "loss": 1.202, "nll_loss": 0.8159046173095703, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.185823917388916, "rewards/margins": 4.460925579071045, "rewards/rejected": 0.7248983383178711, "step": 10030 }, { "epoch": 0.5569964355556788, "grad_norm": 120.59881591796875, "learning_rate": 4.109359234800579e-08, "logits/chosen": -0.1846725046634674, "logits/rejected": -0.36268824338912964, "logps/chosen": -168.23143005371094, "logps/rejected": -227.4480743408203, "loss": 1.2565, "nll_loss": 0.8777807354927063, "rewards/accuracies": 0.875, "rewards/chosen": 5.901741027832031, "rewards/margins": 3.807669162750244, "rewards/rejected": 2.09407114982605, "step": 10040 }, { "epoch": 0.5575512128819295, "grad_norm": 51.784149169921875, "learning_rate": 4.1007854182677384e-08, "logits/chosen": -0.08302908390760422, "logits/rejected": -0.302766889333725, "logps/chosen": -132.2078857421875, "logps/rejected": -199.78074645996094, "loss": 1.2573, "nll_loss": 0.7507218718528748, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.031310081481934, "rewards/margins": 3.765636920928955, "rewards/rejected": 1.2656733989715576, "step": 10050 }, { "epoch": 0.5581059902081802, "grad_norm": 44.574737548828125, "learning_rate": 4.092214333305496e-08, "logits/chosen": -0.17102904617786407, "logits/rejected": -0.32034677267074585, "logps/chosen": -147.87527465820312, "logps/rejected": -222.16213989257812, "loss": 1.3022, "nll_loss": 0.8825603723526001, "rewards/accuracies": 0.875, "rewards/chosen": 5.455257892608643, "rewards/margins": 3.493680953979492, "rewards/rejected": 1.9615771770477295, "step": 10060 }, { "epoch": 0.5586607675344308, "grad_norm": 93.73436737060547, "learning_rate": 4.0836460059504875e-08, "logits/chosen": -0.16540075838565826, "logits/rejected": -0.34344834089279175, "logps/chosen": -149.68399047851562, "logps/rejected": -181.32241821289062, "loss": 1.2071, "nll_loss": 0.8493715524673462, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.781062126159668, "rewards/margins": 3.1879143714904785, "rewards/rejected": 1.5931479930877686, "step": 10070 }, { "epoch": 0.5592155448606816, "grad_norm": 73.01387786865234, "learning_rate": 4.075080462230976e-08, "logits/chosen": -0.2254190742969513, "logits/rejected": -0.3859403431415558, "logps/chosen": -163.48228454589844, "logps/rejected": -199.42469787597656, "loss": 1.2813, "nll_loss": 0.9542709589004517, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.255620002746582, "rewards/margins": 3.1761951446533203, "rewards/rejected": 2.0794243812561035, "step": 10080 }, { "epoch": 0.5597703221869322, "grad_norm": 60.718994140625, "learning_rate": 4.066517728166765e-08, "logits/chosen": -0.3345080316066742, "logits/rejected": -0.43725553154945374, "logps/chosen": -169.29473876953125, "logps/rejected": -217.14334106445312, "loss": 1.2574, "nll_loss": 1.0213162899017334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.477988243103027, "rewards/margins": 3.135385751724243, "rewards/rejected": 2.342602252960205, "step": 10090 }, { "epoch": 0.5603250995131829, "grad_norm": 154.75498962402344, "learning_rate": 4.0579578297691226e-08, "logits/chosen": -0.20659950375556946, "logits/rejected": -0.3297533392906189, "logps/chosen": -164.67564392089844, "logps/rejected": -219.97640991210938, "loss": 1.3394, "nll_loss": 0.9876292943954468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.401577949523926, "rewards/margins": 3.6367709636688232, "rewards/rejected": 1.7648069858551025, "step": 10100 }, { "epoch": 0.5608798768394335, "grad_norm": 52.578983306884766, "learning_rate": 4.0494007930407046e-08, "logits/chosen": -0.31942233443260193, "logits/rejected": -0.4834202826023102, "logps/chosen": -178.60403442382812, "logps/rejected": -239.25717163085938, "loss": 1.2078, "nll_loss": 0.9782170057296753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.414088249206543, "rewards/margins": 4.0510382652282715, "rewards/rejected": 2.3630499839782715, "step": 10110 }, { "epoch": 0.5614346541656843, "grad_norm": 45.17326736450195, "learning_rate": 4.040846643975473e-08, "logits/chosen": -0.48407474160194397, "logits/rejected": -0.5528632402420044, "logps/chosen": -185.46778869628906, "logps/rejected": -264.75201416015625, "loss": 1.1985, "nll_loss": 1.0744699239730835, "rewards/accuracies": 0.875, "rewards/chosen": 6.133168697357178, "rewards/margins": 3.387519121170044, "rewards/rejected": 2.745649576187134, "step": 10120 }, { "epoch": 0.5619894314919349, "grad_norm": 77.66555786132812, "learning_rate": 4.032295408558619e-08, "logits/chosen": -0.341526597738266, "logits/rejected": -0.5378842949867249, "logps/chosen": -184.19808959960938, "logps/rejected": -249.416015625, "loss": 1.1846, "nll_loss": 0.9458611607551575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.309676170349121, "rewards/margins": 4.639254570007324, "rewards/rejected": 1.6704213619232178, "step": 10130 }, { "epoch": 0.5625442088181856, "grad_norm": 51.0809326171875, "learning_rate": 4.023747112766482e-08, "logits/chosen": -0.24474194645881653, "logits/rejected": -0.4141760468482971, "logps/chosen": -166.1620635986328, "logps/rejected": -223.5570831298828, "loss": 1.2322, "nll_loss": 0.9518367648124695, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.8418779373168945, "rewards/margins": 3.5767345428466797, "rewards/rejected": 2.2651429176330566, "step": 10140 }, { "epoch": 0.5630989861444363, "grad_norm": 52.535606384277344, "learning_rate": 4.0152017825664705e-08, "logits/chosen": -0.15986858308315277, "logits/rejected": -0.4056181013584137, "logps/chosen": -183.78421020507812, "logps/rejected": -270.3041076660156, "loss": 1.1969, "nll_loss": 0.8582059144973755, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.990294933319092, "rewards/margins": 5.1435980796813965, "rewards/rejected": 0.8466971516609192, "step": 10150 }, { "epoch": 0.563653763470687, "grad_norm": 32.545166015625, "learning_rate": 4.006659443916987e-08, "logits/chosen": -0.23594574630260468, "logits/rejected": -0.3522702157497406, "logps/chosen": -153.6573944091797, "logps/rejected": -226.1883544921875, "loss": 1.1999, "nll_loss": 0.935680091381073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.601001262664795, "rewards/margins": 3.8492636680603027, "rewards/rejected": 1.7517372369766235, "step": 10160 }, { "epoch": 0.5642085407969376, "grad_norm": 122.06332397460938, "learning_rate": 3.9981201227673424e-08, "logits/chosen": -0.2976406514644623, "logits/rejected": -0.43046554923057556, "logps/chosen": -148.02928161621094, "logps/rejected": -189.18588256835938, "loss": 1.3542, "nll_loss": 0.9401167631149292, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.531824111938477, "rewards/margins": 3.1392407417297363, "rewards/rejected": 2.3925833702087402, "step": 10170 }, { "epoch": 0.5647633181231883, "grad_norm": 74.4186782836914, "learning_rate": 3.989583845057688e-08, "logits/chosen": -0.293215811252594, "logits/rejected": -0.4330647587776184, "logps/chosen": -166.47183227539062, "logps/rejected": -227.81460571289062, "loss": 1.2225, "nll_loss": 0.9029404520988464, "rewards/accuracies": 0.875, "rewards/chosen": 5.619418144226074, "rewards/margins": 3.53371000289917, "rewards/rejected": 2.085707902908325, "step": 10180 }, { "epoch": 0.565318095449439, "grad_norm": 50.121543884277344, "learning_rate": 3.9810506367189226e-08, "logits/chosen": -0.23663277924060822, "logits/rejected": -0.4094238877296448, "logps/chosen": -141.23477172851562, "logps/rejected": -214.24105834960938, "loss": 1.217, "nll_loss": 0.841219425201416, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.489045143127441, "rewards/margins": 3.193114995956421, "rewards/rejected": 2.295930862426758, "step": 10190 }, { "epoch": 0.5658728727756897, "grad_norm": 29.837879180908203, "learning_rate": 3.972520523672626e-08, "logits/chosen": -0.21812088787555695, "logits/rejected": -0.41568484902381897, "logps/chosen": -146.0269317626953, "logps/rejected": -192.945068359375, "loss": 1.2653, "nll_loss": 0.8905706405639648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.3971052169799805, "rewards/margins": 3.420079469680786, "rewards/rejected": 1.9770259857177734, "step": 10200 }, { "epoch": 0.5664276501019403, "grad_norm": 34.43855667114258, "learning_rate": 3.963993531830973e-08, "logits/chosen": -0.27292150259017944, "logits/rejected": -0.45809444785118103, "logps/chosen": -137.93099975585938, "logps/rejected": -193.16009521484375, "loss": 1.1547, "nll_loss": 0.792144775390625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.233412265777588, "rewards/margins": 3.822312593460083, "rewards/rejected": 1.4110995531082153, "step": 10210 }, { "epoch": 0.566982427428191, "grad_norm": 38.51008224487305, "learning_rate": 3.9554696870966566e-08, "logits/chosen": -0.19013547897338867, "logits/rejected": -0.38051319122314453, "logps/chosen": -131.56764221191406, "logps/rejected": -188.43069458007812, "loss": 1.3362, "nll_loss": 0.8542930483818054, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.759130954742432, "rewards/margins": 3.1565465927124023, "rewards/rejected": 1.6025844812393188, "step": 10220 }, { "epoch": 0.5675372047544417, "grad_norm": 118.84576416015625, "learning_rate": 3.9469490153628124e-08, "logits/chosen": -0.21577802300453186, "logits/rejected": -0.44243812561035156, "logps/chosen": -150.70347595214844, "logps/rejected": -203.06527709960938, "loss": 1.23, "nll_loss": 0.8202966451644897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.528652191162109, "rewards/margins": 4.659850120544434, "rewards/rejected": 0.8688012957572937, "step": 10230 }, { "epoch": 0.5680919820806923, "grad_norm": 41.868560791015625, "learning_rate": 3.938431542512936e-08, "logits/chosen": -0.24971739947795868, "logits/rejected": -0.44349947571754456, "logps/chosen": -148.19473266601562, "logps/rejected": -202.19549560546875, "loss": 1.2403, "nll_loss": 0.8570820689201355, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.79079532623291, "rewards/margins": 3.260967969894409, "rewards/rejected": 2.5298266410827637, "step": 10240 }, { "epoch": 0.5686467594069431, "grad_norm": 69.5272445678711, "learning_rate": 3.9299172944208036e-08, "logits/chosen": -0.47588104009628296, "logits/rejected": -0.6104357838630676, "logps/chosen": -205.761962890625, "logps/rejected": -272.7579650878906, "loss": 1.3107, "nll_loss": 1.0669199228286743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.797511100769043, "rewards/margins": 3.980442762374878, "rewards/rejected": 2.8170692920684814, "step": 10250 }, { "epoch": 0.5692015367331937, "grad_norm": 47.80853271484375, "learning_rate": 3.9214062969503995e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -149.69068908691406, "logps/rejected": -196.16390991210938, "loss": 1.1962, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 5.366769313812256, "rewards/margins": 3.5515894889831543, "rewards/rejected": 1.815179467201233, "step": 10260 }, { "epoch": 0.5697563140594444, "grad_norm": 59.806941986083984, "learning_rate": 3.912898575955826e-08, "logits/chosen": -0.24707865715026855, "logits/rejected": -0.4839915633201599, "logps/chosen": -131.5078582763672, "logps/rejected": -189.6587677001953, "loss": 1.2316, "nll_loss": 0.7790622711181641, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.850626468658447, "rewards/margins": 3.8297977447509766, "rewards/rejected": 1.0208286046981812, "step": 10270 }, { "epoch": 0.570311091385695, "grad_norm": 55.08845520019531, "learning_rate": 3.9043941572812436e-08, "logits/chosen": -0.40836301445961, "logits/rejected": -0.5811692476272583, "logps/chosen": -166.8919219970703, "logps/rejected": -252.9898681640625, "loss": 1.2585, "nll_loss": 0.9906598329544067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.0761308670043945, "rewards/margins": 4.707409858703613, "rewards/rejected": 1.36872136592865, "step": 10280 }, { "epoch": 0.5708658687119458, "grad_norm": 46.97995376586914, "learning_rate": 3.8958930667607734e-08, "logits/chosen": -0.21153624355793, "logits/rejected": -0.43253570795059204, "logps/chosen": -117.1732177734375, "logps/rejected": -182.49441528320312, "loss": 1.1881, "nll_loss": 0.7652009725570679, "rewards/accuracies": 0.875, "rewards/chosen": 4.728806018829346, "rewards/margins": 3.9170024394989014, "rewards/rejected": 0.8118033409118652, "step": 10290 }, { "epoch": 0.5714206460381964, "grad_norm": 57.82447052001953, "learning_rate": 3.887395330218428e-08, "logits/chosen": -0.2495851218700409, "logits/rejected": -0.39650827646255493, "logps/chosen": -152.25137329101562, "logps/rejected": -205.52682495117188, "loss": 1.2071, "nll_loss": 0.8481658101081848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.563853740692139, "rewards/margins": 4.029789924621582, "rewards/rejected": 1.534063458442688, "step": 10300 }, { "epoch": 0.5719754233644471, "grad_norm": 56.055057525634766, "learning_rate": 3.878900973468031e-08, "logits/chosen": -0.4390603005886078, "logits/rejected": -0.5407704710960388, "logps/chosen": -184.212890625, "logps/rejected": -237.96286010742188, "loss": 1.3313, "nll_loss": 1.051011323928833, "rewards/accuracies": 0.875, "rewards/chosen": 6.2777605056762695, "rewards/margins": 4.191825866699219, "rewards/rejected": 2.08593487739563, "step": 10310 }, { "epoch": 0.5725302006906978, "grad_norm": 42.29498291015625, "learning_rate": 3.87041002231314e-08, "logits/chosen": -0.34510624408721924, "logits/rejected": -0.5676770210266113, "logps/chosen": -180.16378784179688, "logps/rejected": -238.06961059570312, "loss": 1.2451, "nll_loss": 0.9839975237846375, "rewards/accuracies": 0.875, "rewards/chosen": 5.807919979095459, "rewards/margins": 4.66348934173584, "rewards/rejected": 1.1444305181503296, "step": 10320 }, { "epoch": 0.5730849780169485, "grad_norm": 43.10969543457031, "learning_rate": 3.8619225025469684e-08, "logits/chosen": -0.3275555670261383, "logits/rejected": -0.47744303941726685, "logps/chosen": -163.6125030517578, "logps/rejected": -227.3212890625, "loss": 1.2735, "nll_loss": 0.9478472471237183, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.324474334716797, "rewards/margins": 4.421574592590332, "rewards/rejected": 1.902899980545044, "step": 10330 }, { "epoch": 0.5736397553431991, "grad_norm": 47.634117126464844, "learning_rate": 3.853438439952304e-08, "logits/chosen": -0.3265872001647949, "logits/rejected": -0.4961087703704834, "logps/chosen": -192.4685821533203, "logps/rejected": -227.6434783935547, "loss": 1.2575, "nll_loss": 0.9580524563789368, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.888747215270996, "rewards/margins": 3.3545432090759277, "rewards/rejected": 2.53420352935791, "step": 10340 }, { "epoch": 0.5741945326694498, "grad_norm": 84.28260040283203, "learning_rate": 3.844957860301433e-08, "logits/chosen": -0.15577220916748047, "logits/rejected": -0.37774962186813354, "logps/chosen": -134.08277893066406, "logps/rejected": -179.55654907226562, "loss": 1.2275, "nll_loss": 0.8151863813400269, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.769875526428223, "rewards/margins": 3.951662540435791, "rewards/rejected": 0.8182132840156555, "step": 10350 }, { "epoch": 0.5747493099957005, "grad_norm": 65.98307037353516, "learning_rate": 3.836480789356063e-08, "logits/chosen": -0.3224945366382599, "logits/rejected": -0.5012255311012268, "logps/chosen": -182.64144897460938, "logps/rejected": -218.14675903320312, "loss": 1.2064, "nll_loss": 1.0050209760665894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.930918216705322, "rewards/margins": 3.0051004886627197, "rewards/rejected": 2.9258179664611816, "step": 10360 }, { "epoch": 0.5753040873219512, "grad_norm": 68.94466400146484, "learning_rate": 3.828007252867239e-08, "logits/chosen": -0.1657598316669464, "logits/rejected": -0.330579936504364, "logps/chosen": -140.05340576171875, "logps/rejected": -205.56869506835938, "loss": 1.2834, "nll_loss": 0.8427098393440247, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.017988681793213, "rewards/margins": 3.076979398727417, "rewards/rejected": 1.9410085678100586, "step": 10370 }, { "epoch": 0.5758588646482018, "grad_norm": 83.23578643798828, "learning_rate": 3.819537276575276e-08, "logits/chosen": -0.38658449053764343, "logits/rejected": -0.5110979080200195, "logps/chosen": -163.3243865966797, "logps/rejected": -236.9144744873047, "loss": 1.2682, "nll_loss": 1.0007117986679077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.975901126861572, "rewards/margins": 3.9243903160095215, "rewards/rejected": 2.0515103340148926, "step": 10380 }, { "epoch": 0.5764136419744526, "grad_norm": 69.59004211425781, "learning_rate": 3.811070886209668e-08, "logits/chosen": -0.14034242928028107, "logits/rejected": -0.3311167359352112, "logps/chosen": -171.51422119140625, "logps/rejected": -240.3944854736328, "loss": 1.253, "nll_loss": 0.900901198387146, "rewards/accuracies": 0.875, "rewards/chosen": 5.613317012786865, "rewards/margins": 4.6014533042907715, "rewards/rejected": 1.011863350868225, "step": 10390 }, { "epoch": 0.5769684193007032, "grad_norm": 81.56526184082031, "learning_rate": 3.80260810748902e-08, "logits/chosen": -0.3824862837791443, "logits/rejected": -0.4957052767276764, "logps/chosen": -162.27566528320312, "logps/rejected": -243.3124542236328, "loss": 1.2452, "nll_loss": 0.9620378613471985, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.729463577270508, "rewards/margins": 3.5673599243164062, "rewards/rejected": 2.1621041297912598, "step": 10400 }, { "epoch": 0.5775231966269538, "grad_norm": 46.983177185058594, "learning_rate": 3.79414896612096e-08, "logits/chosen": -0.4101601541042328, "logits/rejected": -0.4935119152069092, "logps/chosen": -196.01905822753906, "logps/rejected": -271.23883056640625, "loss": 1.2677, "nll_loss": 1.0919231176376343, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.388064384460449, "rewards/margins": 3.650418519973755, "rewards/rejected": 2.7376456260681152, "step": 10410 }, { "epoch": 0.5780779739532045, "grad_norm": 51.8054313659668, "learning_rate": 3.7856934878020746e-08, "logits/chosen": -0.3905481696128845, "logits/rejected": -0.5235660672187805, "logps/chosen": -174.46282958984375, "logps/rejected": -224.73825073242188, "loss": 1.2783, "nll_loss": 1.0395283699035645, "rewards/accuracies": 0.875, "rewards/chosen": 6.0996575355529785, "rewards/margins": 3.955806255340576, "rewards/rejected": 2.143850803375244, "step": 10420 }, { "epoch": 0.5786327512794552, "grad_norm": 61.640480041503906, "learning_rate": 3.777241698217818e-08, "logits/chosen": -0.23529402911663055, "logits/rejected": -0.41466569900512695, "logps/chosen": -154.8517608642578, "logps/rejected": -242.61160278320312, "loss": 1.2596, "nll_loss": 0.8378097414970398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.4309611320495605, "rewards/margins": 3.666550397872925, "rewards/rejected": 1.764411211013794, "step": 10430 }, { "epoch": 0.5791875286057059, "grad_norm": 62.208614349365234, "learning_rate": 3.7687936230424414e-08, "logits/chosen": -0.26653987169265747, "logits/rejected": -0.41189995408058167, "logps/chosen": -182.07679748535156, "logps/rejected": -250.5347137451172, "loss": 1.2692, "nll_loss": 0.9324018359184265, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.15363073348999, "rewards/margins": 3.7218990325927734, "rewards/rejected": 2.431732177734375, "step": 10440 }, { "epoch": 0.5797423059319565, "grad_norm": 95.203125, "learning_rate": 3.760349287938909e-08, "logits/chosen": -0.20371286571025848, "logits/rejected": -0.402310848236084, "logps/chosen": -162.33380126953125, "logps/rejected": -244.218505859375, "loss": 1.2327, "nll_loss": 0.8449538350105286, "rewards/accuracies": 0.875, "rewards/chosen": 5.365697860717773, "rewards/margins": 4.171332359313965, "rewards/rejected": 1.1943647861480713, "step": 10450 }, { "epoch": 0.5802970832582073, "grad_norm": 41.2097282409668, "learning_rate": 3.751908718558826e-08, "logits/chosen": -0.36653703451156616, "logits/rejected": -0.5174434185028076, "logps/chosen": -170.66348266601562, "logps/rejected": -238.06167602539062, "loss": 1.2665, "nll_loss": 0.9725948572158813, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.139052391052246, "rewards/margins": 4.696810722351074, "rewards/rejected": 1.4422420263290405, "step": 10460 }, { "epoch": 0.5808518605844579, "grad_norm": 130.22348022460938, "learning_rate": 3.743471940542361e-08, "logits/chosen": -0.3808217942714691, "logits/rejected": -0.5171536803245544, "logps/chosen": -178.21490478515625, "logps/rejected": -234.63394165039062, "loss": 1.2486, "nll_loss": 1.1083260774612427, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.08544397354126, "rewards/margins": 3.4438986778259277, "rewards/rejected": 2.641545534133911, "step": 10470 }, { "epoch": 0.5814066379107086, "grad_norm": 113.35897064208984, "learning_rate": 3.735038979518161e-08, "logits/chosen": -0.355629563331604, "logits/rejected": -0.4770967364311218, "logps/chosen": -167.3038787841797, "logps/rejected": -203.6851348876953, "loss": 1.1651, "nll_loss": 0.9599370956420898, "rewards/accuracies": 0.875, "rewards/chosen": 5.775060653686523, "rewards/margins": 3.4364638328552246, "rewards/rejected": 2.338596820831299, "step": 10480 }, { "epoch": 0.5819614152369592, "grad_norm": 51.10672378540039, "learning_rate": 3.72660986110328e-08, "logits/chosen": -0.2594406008720398, "logits/rejected": -0.40329861640930176, "logps/chosen": -165.8750457763672, "logps/rejected": -217.21444702148438, "loss": 1.2455, "nll_loss": 0.9032737612724304, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.570055961608887, "rewards/margins": 3.448636293411255, "rewards/rejected": 2.121419668197632, "step": 10490 }, { "epoch": 0.58251619256321, "grad_norm": 61.78841781616211, "learning_rate": 3.7181846109031e-08, "logits/chosen": -0.255531907081604, "logits/rejected": -0.4038110673427582, "logps/chosen": -175.2942352294922, "logps/rejected": -242.3545684814453, "loss": 1.2611, "nll_loss": 0.9725499153137207, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.6681036949157715, "rewards/margins": 3.321598529815674, "rewards/rejected": 2.3465051651000977, "step": 10500 }, { "epoch": 0.58251619256321, "eval_logits/chosen": -0.4062573313713074, "eval_logits/rejected": -0.5181547403335571, "eval_logps/chosen": -191.35580444335938, "eval_logps/rejected": -264.09014892578125, "eval_loss": 1.240870475769043, "eval_nll_loss": 0.9917998313903809, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.660539627075195, "eval_rewards/margins": 5.161705017089844, "eval_rewards/rejected": 1.4988348484039307, "eval_runtime": 17.2273, "eval_samples_per_second": 14.86, "eval_steps_per_second": 1.858, "step": 10500 }, { "epoch": 0.5830709698894606, "grad_norm": 94.24137878417969, "learning_rate": 3.709763254511248e-08, "logits/chosen": -0.2867467403411865, "logits/rejected": -0.4277495741844177, "logps/chosen": -157.75086975097656, "logps/rejected": -211.0211944580078, "loss": 1.2738, "nll_loss": 0.9054722785949707, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.910742282867432, "rewards/margins": 4.365904331207275, "rewards/rejected": 1.544838309288025, "step": 10510 }, { "epoch": 0.5836257472157113, "grad_norm": 52.160987854003906, "learning_rate": 3.701345817509531e-08, "logits/chosen": -0.3296714425086975, "logits/rejected": -0.4535750448703766, "logps/chosen": -185.8523406982422, "logps/rejected": -232.6554718017578, "loss": 1.2354, "nll_loss": 1.038352370262146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.936407566070557, "rewards/margins": 3.9164319038391113, "rewards/rejected": 2.0199761390686035, "step": 10520 }, { "epoch": 0.584180524541962, "grad_norm": 123.69461059570312, "learning_rate": 3.6929323254678435e-08, "logits/chosen": -0.31326359510421753, "logits/rejected": -0.46600404381752014, "logps/chosen": -174.47178649902344, "logps/rejected": -232.8724822998047, "loss": 1.3301, "nll_loss": 1.0197381973266602, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.738070487976074, "rewards/margins": 3.5029759407043457, "rewards/rejected": 2.235095500946045, "step": 10530 }, { "epoch": 0.5847353018682127, "grad_norm": 66.86030578613281, "learning_rate": 3.684522803944098e-08, "logits/chosen": -0.4315427243709564, "logits/rejected": -0.5430246591567993, "logps/chosen": -192.14537048339844, "logps/rejected": -238.3332977294922, "loss": 1.2138, "nll_loss": 1.0603669881820679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.344313144683838, "rewards/margins": 3.1130709648132324, "rewards/rejected": 3.2312424182891846, "step": 10540 }, { "epoch": 0.5852900791944633, "grad_norm": 45.89120101928711, "learning_rate": 3.676117278484144e-08, "logits/chosen": -0.2674533724784851, "logits/rejected": -0.4509350657463074, "logps/chosen": -159.71278381347656, "logps/rejected": -221.8773956298828, "loss": 1.2318, "nll_loss": 0.8398923873901367, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.681628704071045, "rewards/margins": 4.198636054992676, "rewards/rejected": 1.4829930067062378, "step": 10550 }, { "epoch": 0.585844856520714, "grad_norm": 67.97987365722656, "learning_rate": 3.6677157746216934e-08, "logits/chosen": -0.21639053523540497, "logits/rejected": -0.4206268787384033, "logps/chosen": -151.79837036132812, "logps/rejected": -231.48812866210938, "loss": 1.2648, "nll_loss": 0.8437652587890625, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.830565929412842, "rewards/margins": 4.830432891845703, "rewards/rejected": 1.0001335144042969, "step": 10560 }, { "epoch": 0.5863996338469647, "grad_norm": 24.88826560974121, "learning_rate": 3.659318317878245e-08, "logits/chosen": -0.3304033577442169, "logits/rejected": -0.4739084839820862, "logps/chosen": -188.98007202148438, "logps/rejected": -256.1590881347656, "loss": 1.2539, "nll_loss": 1.040067195892334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.439640998840332, "rewards/margins": 4.042901992797852, "rewards/rejected": 2.3967385292053223, "step": 10570 }, { "epoch": 0.5869544111732153, "grad_norm": 76.99559783935547, "learning_rate": 3.650924933762997e-08, "logits/chosen": -0.32587510347366333, "logits/rejected": -0.5330041646957397, "logps/chosen": -151.07997131347656, "logps/rejected": -200.09375, "loss": 1.2803, "nll_loss": 0.8896474838256836, "rewards/accuracies": 0.875, "rewards/chosen": 5.428950786590576, "rewards/margins": 3.8680737018585205, "rewards/rejected": 1.5608775615692139, "step": 10580 }, { "epoch": 0.587509188499466, "grad_norm": 46.51936340332031, "learning_rate": 3.642535647772781e-08, "logits/chosen": -0.5787987112998962, "logits/rejected": -0.620763897895813, "logps/chosen": -220.16983032226562, "logps/rejected": -271.59197998046875, "loss": 1.2583, "nll_loss": 1.241948127746582, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 7.415195465087891, "rewards/margins": 4.35185432434082, "rewards/rejected": 3.0633416175842285, "step": 10590 }, { "epoch": 0.5880639658257167, "grad_norm": 79.09966278076172, "learning_rate": 3.634150485391977e-08, "logits/chosen": -0.4392542839050293, "logits/rejected": -0.5849173665046692, "logps/chosen": -204.16653442382812, "logps/rejected": -281.7826232910156, "loss": 1.2942, "nll_loss": 1.087228536605835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.519995212554932, "rewards/margins": 4.00660514831543, "rewards/rejected": 2.5133910179138184, "step": 10600 }, { "epoch": 0.5886187431519674, "grad_norm": 130.8496551513672, "learning_rate": 3.62576947209244e-08, "logits/chosen": -0.25583991408348083, "logits/rejected": -0.40053972601890564, "logps/chosen": -142.29135131835938, "logps/rejected": -186.323486328125, "loss": 1.3215, "nll_loss": 0.8704953193664551, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.435754776000977, "rewards/margins": 3.020576000213623, "rewards/rejected": 2.4151787757873535, "step": 10610 }, { "epoch": 0.589173520478218, "grad_norm": 43.7702751159668, "learning_rate": 3.617392633333421e-08, "logits/chosen": -0.2983597218990326, "logits/rejected": -0.3602357804775238, "logps/chosen": -185.18138122558594, "logps/rejected": -221.4364471435547, "loss": 1.2755, "nll_loss": 1.1458022594451904, "rewards/accuracies": 0.75, "rewards/chosen": 5.94400691986084, "rewards/margins": 2.6994998455047607, "rewards/rejected": 3.2445075511932373, "step": 10620 }, { "epoch": 0.5897282978044688, "grad_norm": 63.04467010498047, "learning_rate": 3.60901999456149e-08, "logits/chosen": -0.22892241179943085, "logits/rejected": -0.4272095561027527, "logps/chosen": -138.57318115234375, "logps/rejected": -188.59591674804688, "loss": 1.2542, "nll_loss": 0.8126301765441895, "rewards/accuracies": 0.875, "rewards/chosen": 5.200963497161865, "rewards/margins": 3.0934946537017822, "rewards/rejected": 2.107468605041504, "step": 10630 }, { "epoch": 0.5902830751307194, "grad_norm": 105.11898040771484, "learning_rate": 3.6006515812104565e-08, "logits/chosen": -0.39995405077934265, "logits/rejected": -0.5333541035652161, "logps/chosen": -170.89114379882812, "logps/rejected": -247.3040313720703, "loss": 1.1993, "nll_loss": 0.9506009221076965, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.481733798980713, "rewards/margins": 5.089728832244873, "rewards/rejected": 1.392005443572998, "step": 10640 }, { "epoch": 0.5908378524569701, "grad_norm": 59.90850067138672, "learning_rate": 3.592287418701297e-08, "logits/chosen": -0.32686835527420044, "logits/rejected": -0.4767589569091797, "logps/chosen": -154.9896240234375, "logps/rejected": -234.1117706298828, "loss": 1.1636, "nll_loss": 0.8605798482894897, "rewards/accuracies": 0.875, "rewards/chosen": 5.890946388244629, "rewards/margins": 4.759671688079834, "rewards/rejected": 1.1312743425369263, "step": 10650 }, { "epoch": 0.5913926297832207, "grad_norm": 112.5676498413086, "learning_rate": 3.5839275324420725e-08, "logits/chosen": -0.31205517053604126, "logits/rejected": -0.5330938100814819, "logps/chosen": -156.6017303466797, "logps/rejected": -212.99722290039062, "loss": 1.264, "nll_loss": 0.8573010563850403, "rewards/accuracies": 0.875, "rewards/chosen": 5.145112037658691, "rewards/margins": 3.8558971881866455, "rewards/rejected": 1.2892147302627563, "step": 10660 }, { "epoch": 0.5919474071094715, "grad_norm": 54.38540267944336, "learning_rate": 3.5755719478278595e-08, "logits/chosen": -0.31130069494247437, "logits/rejected": -0.4759851098060608, "logps/chosen": -157.94412231445312, "logps/rejected": -222.71640014648438, "loss": 1.3018, "nll_loss": 0.908219039440155, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.573339939117432, "rewards/margins": 3.681096315383911, "rewards/rejected": 1.8922431468963623, "step": 10670 }, { "epoch": 0.5925021844357221, "grad_norm": 104.39202880859375, "learning_rate": 3.567220690240661e-08, "logits/chosen": -0.3926613926887512, "logits/rejected": -0.5438095331192017, "logps/chosen": -174.09701538085938, "logps/rejected": -228.9287872314453, "loss": 1.2854, "nll_loss": 0.9536849856376648, "rewards/accuracies": 0.875, "rewards/chosen": 6.012936115264893, "rewards/margins": 3.408534288406372, "rewards/rejected": 2.6044020652770996, "step": 10680 }, { "epoch": 0.5930569617619728, "grad_norm": 66.91699981689453, "learning_rate": 3.5588737850493375e-08, "logits/chosen": -0.31840792298316956, "logits/rejected": -0.4478437900543213, "logps/chosen": -159.5063018798828, "logps/rejected": -211.3325653076172, "loss": 1.2698, "nll_loss": 0.9285990595817566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.953314304351807, "rewards/margins": 3.9582533836364746, "rewards/rejected": 1.9950603246688843, "step": 10690 }, { "epoch": 0.5936117390882235, "grad_norm": 85.52528381347656, "learning_rate": 3.550531257609529e-08, "logits/chosen": -0.3695484697818756, "logits/rejected": -0.48085230588912964, "logps/chosen": -152.49127197265625, "logps/rejected": -218.1444091796875, "loss": 1.3511, "nll_loss": 0.9453736543655396, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.081177711486816, "rewards/margins": 3.2808938026428223, "rewards/rejected": 2.800283908843994, "step": 10700 }, { "epoch": 0.5941665164144742, "grad_norm": 73.19735717773438, "learning_rate": 3.542193133263576e-08, "logits/chosen": -0.4986700117588043, "logits/rejected": -0.5374937057495117, "logps/chosen": -237.1245574951172, "logps/rejected": -281.1835632324219, "loss": 1.2915, "nll_loss": 1.1597903966903687, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 7.327892303466797, "rewards/margins": 4.328261852264404, "rewards/rejected": 2.999631404876709, "step": 10710 }, { "epoch": 0.5947212937407248, "grad_norm": 72.92422485351562, "learning_rate": 3.533859437340445e-08, "logits/chosen": -0.13615483045578003, "logits/rejected": -0.30375248193740845, "logps/chosen": -130.32920837402344, "logps/rejected": -205.73660278320312, "loss": 1.2372, "nll_loss": 0.8853503465652466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.057281494140625, "rewards/margins": 3.348877429962158, "rewards/rejected": 1.708404302597046, "step": 10720 }, { "epoch": 0.5952760710669754, "grad_norm": 77.15365600585938, "learning_rate": 3.5255301951556496e-08, "logits/chosen": -0.4535873830318451, "logits/rejected": -0.574942946434021, "logps/chosen": -211.65103149414062, "logps/rejected": -256.065673828125, "loss": 1.2748, "nll_loss": 1.1198481321334839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 7.009836673736572, "rewards/margins": 4.057226181030273, "rewards/rejected": 2.952610492706299, "step": 10730 }, { "epoch": 0.5958308483932262, "grad_norm": 54.13565444946289, "learning_rate": 3.517205432011174e-08, "logits/chosen": -0.42517557740211487, "logits/rejected": -0.5419595241546631, "logps/chosen": -200.6864471435547, "logps/rejected": -258.7049255371094, "loss": 1.3266, "nll_loss": 1.0525949001312256, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.4116668701171875, "rewards/margins": 3.5005390644073486, "rewards/rejected": 2.911127805709839, "step": 10740 }, { "epoch": 0.5963856257194768, "grad_norm": 59.02808380126953, "learning_rate": 3.508885173195395e-08, "logits/chosen": -0.45422667264938354, "logits/rejected": -0.5683622360229492, "logps/chosen": -187.98797607421875, "logps/rejected": -259.0843505859375, "loss": 1.2412, "nll_loss": 1.0791963338851929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.193970680236816, "rewards/margins": 3.6251118183135986, "rewards/rejected": 2.5688586235046387, "step": 10750 }, { "epoch": 0.5969404030457275, "grad_norm": 65.45647430419922, "learning_rate": 3.500569443983006e-08, "logits/chosen": -0.21939226984977722, "logits/rejected": -0.4209931492805481, "logps/chosen": -130.20509338378906, "logps/rejected": -179.78700256347656, "loss": 1.2037, "nll_loss": 0.7984130382537842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.359994888305664, "rewards/margins": 3.702338457107544, "rewards/rejected": 1.6576560735702515, "step": 10760 }, { "epoch": 0.5974951803719782, "grad_norm": 58.52310562133789, "learning_rate": 3.492258269634948e-08, "logits/chosen": -0.3713361322879791, "logits/rejected": -0.5267468690872192, "logps/chosen": -166.95346069335938, "logps/rejected": -259.38800048828125, "loss": 1.2701, "nll_loss": 0.9824539422988892, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.330275058746338, "rewards/margins": 4.58265495300293, "rewards/rejected": 1.7476199865341187, "step": 10770 }, { "epoch": 0.5980499576982289, "grad_norm": 67.48011779785156, "learning_rate": 3.483951675398315e-08, "logits/chosen": -0.4535156786441803, "logits/rejected": -0.5738258957862854, "logps/chosen": -194.94332885742188, "logps/rejected": -265.9612731933594, "loss": 1.2541, "nll_loss": 1.1000378131866455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.979112148284912, "rewards/margins": 4.8542375564575195, "rewards/rejected": 2.1248748302459717, "step": 10780 }, { "epoch": 0.5986047350244795, "grad_norm": 79.3395767211914, "learning_rate": 3.4756496865062966e-08, "logits/chosen": -0.38299891352653503, "logits/rejected": -0.5297697186470032, "logps/chosen": -201.93260192871094, "logps/rejected": -254.21224975585938, "loss": 1.2738, "nll_loss": 1.0478332042694092, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.4227614402771, "rewards/margins": 4.124935150146484, "rewards/rejected": 2.2978267669677734, "step": 10790 }, { "epoch": 0.5991595123507302, "grad_norm": 45.63748550415039, "learning_rate": 3.4673523281780856e-08, "logits/chosen": -0.292010098695755, "logits/rejected": -0.46596068143844604, "logps/chosen": -162.84024047851562, "logps/rejected": -217.80667114257812, "loss": 1.2855, "nll_loss": 0.8932517766952515, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.736856460571289, "rewards/margins": 4.134681224822998, "rewards/rejected": 1.6021745204925537, "step": 10800 }, { "epoch": 0.5997142896769809, "grad_norm": 66.92276763916016, "learning_rate": 3.45905962561881e-08, "logits/chosen": -0.39471474289894104, "logits/rejected": -0.5333040952682495, "logps/chosen": -181.5998992919922, "logps/rejected": -255.9614715576172, "loss": 1.202, "nll_loss": 1.0461175441741943, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.988242149353027, "rewards/margins": 4.026206016540527, "rewards/rejected": 1.9620361328125, "step": 10810 }, { "epoch": 0.6002690670032316, "grad_norm": 76.19792175292969, "learning_rate": 3.450771604019461e-08, "logits/chosen": -0.3107849359512329, "logits/rejected": -0.42468562722206116, "logps/chosen": -168.02243041992188, "logps/rejected": -228.12759399414062, "loss": 1.2738, "nll_loss": 0.9977830052375793, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.253368854522705, "rewards/margins": 4.136763572692871, "rewards/rejected": 2.116605758666992, "step": 10820 }, { "epoch": 0.6008238443294822, "grad_norm": 63.823062896728516, "learning_rate": 3.442488288556804e-08, "logits/chosen": -0.5227320790290833, "logits/rejected": -0.5916525721549988, "logps/chosen": -183.53652954101562, "logps/rejected": -252.58041381835938, "loss": 1.2982, "nll_loss": 1.063537359237671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.722087860107422, "rewards/margins": 3.78173565864563, "rewards/rejected": 2.940351963043213, "step": 10830 }, { "epoch": 0.601378621655733, "grad_norm": 69.8787841796875, "learning_rate": 3.4342097043933096e-08, "logits/chosen": -0.3921436369419098, "logits/rejected": -0.4659816324710846, "logps/chosen": -190.76670837402344, "logps/rejected": -255.2912139892578, "loss": 1.3615, "nll_loss": 1.0732951164245605, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.0947136878967285, "rewards/margins": 4.343235969543457, "rewards/rejected": 1.7514781951904297, "step": 10840 }, { "epoch": 0.6019333989819836, "grad_norm": 31.128889083862305, "learning_rate": 3.425935876677076e-08, "logits/chosen": -0.43652114272117615, "logits/rejected": -0.5631170868873596, "logps/chosen": -177.95590209960938, "logps/rejected": -214.3686065673828, "loss": 1.2299, "nll_loss": 1.0268595218658447, "rewards/accuracies": 0.875, "rewards/chosen": 5.997559547424316, "rewards/margins": 3.688072681427002, "rewards/rejected": 2.3094871044158936, "step": 10850 }, { "epoch": 0.6024881763082343, "grad_norm": 77.23370361328125, "learning_rate": 3.417666830541754e-08, "logits/chosen": -0.45197612047195435, "logits/rejected": -0.5712178945541382, "logps/chosen": -177.9655303955078, "logps/rejected": -214.2071533203125, "loss": 1.289, "nll_loss": 1.0503698587417603, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.04358434677124, "rewards/margins": 3.6864120960235596, "rewards/rejected": 2.3571720123291016, "step": 10860 }, { "epoch": 0.603042953634485, "grad_norm": 74.53681182861328, "learning_rate": 3.4094025911064686e-08, "logits/chosen": -0.3006956875324249, "logits/rejected": -0.46492305397987366, "logps/chosen": -181.09133911132812, "logps/rejected": -242.59927368164062, "loss": 1.3742, "nll_loss": 0.8726722002029419, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.730262279510498, "rewards/margins": 3.801975727081299, "rewards/rejected": 1.9282863140106201, "step": 10870 }, { "epoch": 0.6035977309607357, "grad_norm": 42.141292572021484, "learning_rate": 3.401143183475743e-08, "logits/chosen": -0.30769291520118713, "logits/rejected": -0.4697556495666504, "logps/chosen": -160.7049102783203, "logps/rejected": -226.4869842529297, "loss": 1.1419, "nll_loss": 0.8911483883857727, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.688979625701904, "rewards/margins": 3.701953411102295, "rewards/rejected": 1.9870258569717407, "step": 10880 }, { "epoch": 0.6041525082869863, "grad_norm": 78.36772918701172, "learning_rate": 3.392888632739424e-08, "logits/chosen": -0.2836330235004425, "logits/rejected": -0.412407249212265, "logps/chosen": -148.725830078125, "logps/rejected": -197.4859161376953, "loss": 1.3034, "nll_loss": 0.9351836442947388, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.380554676055908, "rewards/margins": 2.8393402099609375, "rewards/rejected": 2.5412139892578125, "step": 10890 }, { "epoch": 0.604707285613237, "grad_norm": 65.53260803222656, "learning_rate": 3.3846389639726e-08, "logits/chosen": -0.31793665885925293, "logits/rejected": -0.4620705246925354, "logps/chosen": -169.74215698242188, "logps/rejected": -242.336181640625, "loss": 1.1877, "nll_loss": 0.9373221397399902, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.978935241699219, "rewards/margins": 3.973456621170044, "rewards/rejected": 2.005479097366333, "step": 10900 }, { "epoch": 0.6052620629394877, "grad_norm": 26.164304733276367, "learning_rate": 3.376394202235534e-08, "logits/chosen": -0.2988826632499695, "logits/rejected": -0.3849804401397705, "logps/chosen": -186.63185119628906, "logps/rejected": -238.55679321289062, "loss": 1.2071, "nll_loss": 0.9764013290405273, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.3566179275512695, "rewards/margins": 3.4110398292541504, "rewards/rejected": 2.945578098297119, "step": 10910 }, { "epoch": 0.6058168402657383, "grad_norm": 54.674285888671875, "learning_rate": 3.368154372573584e-08, "logits/chosen": -0.41310158371925354, "logits/rejected": -0.4816606044769287, "logps/chosen": -172.04104614257812, "logps/rejected": -199.1383056640625, "loss": 1.2596, "nll_loss": 1.0648918151855469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.158577919006348, "rewards/margins": 3.6144866943359375, "rewards/rejected": 2.544090986251831, "step": 10920 }, { "epoch": 0.606371617591989, "grad_norm": 120.4090576171875, "learning_rate": 3.35991950001712e-08, "logits/chosen": -0.36028024554252625, "logits/rejected": -0.5143309831619263, "logps/chosen": -177.98716735839844, "logps/rejected": -237.66494750976562, "loss": 1.2803, "nll_loss": 0.9472376108169556, "rewards/accuracies": 0.875, "rewards/chosen": 6.03228759765625, "rewards/margins": 4.048565864562988, "rewards/rejected": 1.983722448348999, "step": 10930 }, { "epoch": 0.6069263949182397, "grad_norm": 77.94869995117188, "learning_rate": 3.351689609581458e-08, "logits/chosen": -0.298319011926651, "logits/rejected": -0.46092239022254944, "logps/chosen": -141.98526000976562, "logps/rejected": -201.03860473632812, "loss": 1.1757, "nll_loss": 0.8618471026420593, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.297152519226074, "rewards/margins": 3.743290424346924, "rewards/rejected": 1.5538616180419922, "step": 10940 }, { "epoch": 0.6074811722444904, "grad_norm": 67.52654266357422, "learning_rate": 3.3434647262667793e-08, "logits/chosen": -0.3805920481681824, "logits/rejected": -0.5205205678939819, "logps/chosen": -196.33363342285156, "logps/rejected": -256.85052490234375, "loss": 1.2294, "nll_loss": 1.0538861751556396, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.10671854019165, "rewards/margins": 4.37234354019165, "rewards/rejected": 1.734375238418579, "step": 10950 }, { "epoch": 0.608035949570741, "grad_norm": 98.63069152832031, "learning_rate": 3.335244875058051e-08, "logits/chosen": -0.23003113269805908, "logits/rejected": -0.4464952051639557, "logps/chosen": -185.92897033691406, "logps/rejected": -234.14205932617188, "loss": 1.2685, "nll_loss": 0.9236680269241333, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.777362823486328, "rewards/margins": 4.378003120422363, "rewards/rejected": 1.3993602991104126, "step": 10960 }, { "epoch": 0.6085907268969917, "grad_norm": 60.71028137207031, "learning_rate": 3.3270300809249596e-08, "logits/chosen": -0.34472283720970154, "logits/rejected": -0.5263240337371826, "logps/chosen": -154.20912170410156, "logps/rejected": -196.12025451660156, "loss": 1.2432, "nll_loss": 1.014953851699829, "rewards/accuracies": 0.875, "rewards/chosen": 5.40725564956665, "rewards/margins": 3.625709056854248, "rewards/rejected": 1.7815459966659546, "step": 10970 }, { "epoch": 0.6091455042232424, "grad_norm": 105.30965423583984, "learning_rate": 3.318820368821826e-08, "logits/chosen": -0.2851284444332123, "logits/rejected": -0.45964550971984863, "logps/chosen": -151.84864807128906, "logps/rejected": -204.26199340820312, "loss": 1.2432, "nll_loss": 1.0300170183181763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.299997806549072, "rewards/margins": 3.8525798320770264, "rewards/rejected": 1.4474176168441772, "step": 10980 }, { "epoch": 0.6097002815494931, "grad_norm": 66.04049682617188, "learning_rate": 3.310615763687535e-08, "logits/chosen": -0.32158032059669495, "logits/rejected": -0.42908206582069397, "logps/chosen": -160.4062957763672, "logps/rejected": -207.47634887695312, "loss": 1.2513, "nll_loss": 0.8961458206176758, "rewards/accuracies": 0.875, "rewards/chosen": 5.963690757751465, "rewards/margins": 4.239657402038574, "rewards/rejected": 1.7240327596664429, "step": 10990 }, { "epoch": 0.6102550588757437, "grad_norm": 105.71235656738281, "learning_rate": 3.302416290445458e-08, "logits/chosen": -0.3293169438838959, "logits/rejected": -0.40171074867248535, "logps/chosen": -168.55116271972656, "logps/rejected": -217.5762176513672, "loss": 1.3974, "nll_loss": 0.9453521966934204, "rewards/accuracies": 0.75, "rewards/chosen": 6.0554094314575195, "rewards/margins": 3.4899826049804688, "rewards/rejected": 2.56542706489563, "step": 11000 }, { "epoch": 0.6102550588757437, "eval_logits/chosen": -0.45493215322494507, "eval_logits/rejected": -0.5725303292274475, "eval_logps/chosen": -190.5392303466797, "eval_logps/rejected": -259.3156433105469, "eval_loss": 1.225435495376587, "eval_nll_loss": 0.9884793758392334, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.742199897766113, "eval_rewards/margins": 4.765914440155029, "eval_rewards/rejected": 1.9762849807739258, "eval_runtime": 17.2793, "eval_samples_per_second": 14.815, "eval_steps_per_second": 1.852, "step": 11000 }, { "epoch": 0.6108098362019945, "grad_norm": 69.06470489501953, "learning_rate": 3.2942219740033706e-08, "logits/chosen": -0.33280569314956665, "logits/rejected": -0.508167564868927, "logps/chosen": -133.94076538085938, "logps/rejected": -198.4490966796875, "loss": 1.2233, "nll_loss": 0.8154407739639282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.323696136474609, "rewards/margins": 2.990116596221924, "rewards/rejected": 2.3335793018341064, "step": 11010 }, { "epoch": 0.6113646135282451, "grad_norm": 67.94520568847656, "learning_rate": 3.2860328392533964e-08, "logits/chosen": -0.31693345308303833, "logits/rejected": -0.4917060434818268, "logps/chosen": -155.8323211669922, "logps/rejected": -205.52706909179688, "loss": 1.2193, "nll_loss": 0.8596027493476868, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.675806999206543, "rewards/margins": 3.0525705814361572, "rewards/rejected": 2.6232364177703857, "step": 11020 }, { "epoch": 0.6119193908544958, "grad_norm": 50.12065887451172, "learning_rate": 3.277848911071908e-08, "logits/chosen": -0.3317955732345581, "logits/rejected": -0.5724862813949585, "logps/chosen": -136.84103393554688, "logps/rejected": -195.4505615234375, "loss": 1.1545, "nll_loss": 0.8111773729324341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.188634395599365, "rewards/margins": 3.3500258922576904, "rewards/rejected": 1.8386093378067017, "step": 11030 }, { "epoch": 0.6124741681807464, "grad_norm": 42.531307220458984, "learning_rate": 3.269670214319464e-08, "logits/chosen": -0.38377127051353455, "logits/rejected": -0.486454576253891, "logps/chosen": -164.94155883789062, "logps/rejected": -203.86782836914062, "loss": 1.3069, "nll_loss": 0.9467847943305969, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.810660362243652, "rewards/margins": 3.384552478790283, "rewards/rejected": 2.4261085987091064, "step": 11040 }, { "epoch": 0.6130289455069972, "grad_norm": 78.62700653076172, "learning_rate": 3.261496773840733e-08, "logits/chosen": -0.414069265127182, "logits/rejected": -0.5053438544273376, "logps/chosen": -187.6324462890625, "logps/rejected": -246.0666046142578, "loss": 1.3059, "nll_loss": 1.0669176578521729, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.589938163757324, "rewards/margins": 3.714946746826172, "rewards/rejected": 2.8749921321868896, "step": 11050 }, { "epoch": 0.6135837228332478, "grad_norm": 50.88478088378906, "learning_rate": 3.253328614464413e-08, "logits/chosen": -0.424401193857193, "logits/rejected": -0.5055257081985474, "logps/chosen": -191.259521484375, "logps/rejected": -248.0755157470703, "loss": 1.2733, "nll_loss": 1.0479724407196045, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.432443141937256, "rewards/margins": 3.278245449066162, "rewards/rejected": 3.1541976928710938, "step": 11060 }, { "epoch": 0.6141385001594984, "grad_norm": 49.54227828979492, "learning_rate": 3.245165761003168e-08, "logits/chosen": -0.38219529390335083, "logits/rejected": -0.5165926218032837, "logps/chosen": -180.51742553710938, "logps/rejected": -230.5033721923828, "loss": 1.2638, "nll_loss": 0.976039707660675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.015735149383545, "rewards/margins": 3.3528194427490234, "rewards/rejected": 2.662916421890259, "step": 11070 }, { "epoch": 0.6146932774857492, "grad_norm": 70.92606353759766, "learning_rate": 3.237008238253534e-08, "logits/chosen": -0.3433719277381897, "logits/rejected": -0.5679585337638855, "logps/chosen": -178.82461547851562, "logps/rejected": -279.38922119140625, "loss": 1.3671, "nll_loss": 0.9539782404899597, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.885937213897705, "rewards/margins": 4.357199668884277, "rewards/rejected": 1.5287374258041382, "step": 11080 }, { "epoch": 0.6152480548119998, "grad_norm": 37.418861389160156, "learning_rate": 3.2288560709958596e-08, "logits/chosen": -0.3987385630607605, "logits/rejected": -0.5709208846092224, "logps/chosen": -168.68743896484375, "logps/rejected": -234.5459442138672, "loss": 1.2156, "nll_loss": 1.0553100109100342, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.184418201446533, "rewards/margins": 3.9626851081848145, "rewards/rejected": 2.2217330932617188, "step": 11090 }, { "epoch": 0.6158028321382505, "grad_norm": 36.667327880859375, "learning_rate": 3.220709283994222e-08, "logits/chosen": -0.20910899341106415, "logits/rejected": -0.42044633626937866, "logps/chosen": -140.78128051757812, "logps/rejected": -202.6675567626953, "loss": 1.1547, "nll_loss": 0.7739453315734863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.707763671875, "rewards/margins": 3.4391884803771973, "rewards/rejected": 2.268575668334961, "step": 11100 }, { "epoch": 0.6163576094645011, "grad_norm": 46.022804260253906, "learning_rate": 3.212567901996355e-08, "logits/chosen": -0.2734110355377197, "logits/rejected": -0.4417598843574524, "logps/chosen": -156.12782287597656, "logps/rejected": -210.34262084960938, "loss": 1.1952, "nll_loss": 0.9398097991943359, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.590392589569092, "rewards/margins": 4.1208720207214355, "rewards/rejected": 1.4695208072662354, "step": 11110 }, { "epoch": 0.6169123867907519, "grad_norm": 56.89548110961914, "learning_rate": 3.204431949733577e-08, "logits/chosen": -0.33289042115211487, "logits/rejected": -0.5147531032562256, "logps/chosen": -167.79898071289062, "logps/rejected": -241.2939910888672, "loss": 1.179, "nll_loss": 0.8868080377578735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.794384479522705, "rewards/margins": 4.220944404602051, "rewards/rejected": 1.5734400749206543, "step": 11120 }, { "epoch": 0.6174671641170025, "grad_norm": 62.29393005371094, "learning_rate": 3.1963014519207074e-08, "logits/chosen": -0.39875271916389465, "logits/rejected": -0.5109494924545288, "logps/chosen": -205.0402069091797, "logps/rejected": -248.0024871826172, "loss": 1.2758, "nll_loss": 1.076166033744812, "rewards/accuracies": 0.875, "rewards/chosen": 6.737574100494385, "rewards/margins": 4.10688591003418, "rewards/rejected": 2.630688428878784, "step": 11130 }, { "epoch": 0.6180219414432532, "grad_norm": 130.27410888671875, "learning_rate": 3.188176433256e-08, "logits/chosen": -0.2652292251586914, "logits/rejected": -0.42938145995140076, "logps/chosen": -148.16464233398438, "logps/rejected": -182.64065551757812, "loss": 1.2944, "nll_loss": 0.8596665263175964, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.408052444458008, "rewards/margins": 3.4761035442352295, "rewards/rejected": 1.9319489002227783, "step": 11140 }, { "epoch": 0.6185767187695039, "grad_norm": 62.68735885620117, "learning_rate": 3.180056918421062e-08, "logits/chosen": -0.3436311185359955, "logits/rejected": -0.5326007604598999, "logps/chosen": -145.62283325195312, "logps/rejected": -201.58200073242188, "loss": 1.2259, "nll_loss": 0.8367489576339722, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.320952892303467, "rewards/margins": 3.4347481727600098, "rewards/rejected": 1.8862043619155884, "step": 11150 }, { "epoch": 0.6191314960957546, "grad_norm": 67.24079132080078, "learning_rate": 3.171942932080782e-08, "logits/chosen": -0.3893461525440216, "logits/rejected": -0.5333869457244873, "logps/chosen": -173.96942138671875, "logps/rejected": -225.2637176513672, "loss": 1.2849, "nll_loss": 0.9754926562309265, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.036957740783691, "rewards/margins": 3.80385160446167, "rewards/rejected": 2.2331058979034424, "step": 11160 }, { "epoch": 0.6196862734220052, "grad_norm": 152.81712341308594, "learning_rate": 3.163834498883258e-08, "logits/chosen": -0.28430917859077454, "logits/rejected": -0.48452943563461304, "logps/chosen": -150.76608276367188, "logps/rejected": -196.15130615234375, "loss": 1.2478, "nll_loss": 0.8585501909255981, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.390089988708496, "rewards/margins": 2.8869361877441406, "rewards/rejected": 2.5031542778015137, "step": 11170 }, { "epoch": 0.620241050748256, "grad_norm": 23.189393997192383, "learning_rate": 3.155731643459715e-08, "logits/chosen": -0.3876117467880249, "logits/rejected": -0.47813859581947327, "logps/chosen": -183.8386993408203, "logps/rejected": -246.20559692382812, "loss": 1.2264, "nll_loss": 0.988499641418457, "rewards/accuracies": 0.875, "rewards/chosen": 6.404829502105713, "rewards/margins": 3.8341288566589355, "rewards/rejected": 2.5707004070281982, "step": 11180 }, { "epoch": 0.6207958280745066, "grad_norm": 71.80838775634766, "learning_rate": 3.147634390424434e-08, "logits/chosen": -0.3379099369049072, "logits/rejected": -0.5105775594711304, "logps/chosen": -168.1144256591797, "logps/rejected": -225.281005859375, "loss": 1.2116, "nll_loss": 0.8586258888244629, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.672602653503418, "rewards/margins": 3.8174891471862793, "rewards/rejected": 1.8551137447357178, "step": 11190 }, { "epoch": 0.6213506054007573, "grad_norm": 45.49497604370117, "learning_rate": 3.13954276437468e-08, "logits/chosen": -0.24583525955677032, "logits/rejected": -0.3375547528266907, "logps/chosen": -149.8053436279297, "logps/rejected": -203.1839599609375, "loss": 1.3485, "nll_loss": 0.9066442251205444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.1894211769104, "rewards/margins": 3.2792041301727295, "rewards/rejected": 1.910217046737671, "step": 11200 }, { "epoch": 0.6219053827270079, "grad_norm": 78.665771484375, "learning_rate": 3.131456789890622e-08, "logits/chosen": -0.24614350497722626, "logits/rejected": -0.40910449624061584, "logps/chosen": -141.6895294189453, "logps/rejected": -182.7397918701172, "loss": 1.2449, "nll_loss": 0.826746940612793, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.132467269897461, "rewards/margins": 2.5911591053009033, "rewards/rejected": 2.5413081645965576, "step": 11210 }, { "epoch": 0.6224601600532587, "grad_norm": 75.47736358642578, "learning_rate": 3.1233764915352644e-08, "logits/chosen": -0.31453007459640503, "logits/rejected": -0.39097392559051514, "logps/chosen": -152.90516662597656, "logps/rejected": -211.50540161132812, "loss": 1.2088, "nll_loss": 0.9485653638839722, "rewards/accuracies": 0.75, "rewards/chosen": 5.528751373291016, "rewards/margins": 2.273203134536743, "rewards/rejected": 3.2555489540100098, "step": 11220 }, { "epoch": 0.6230149373795093, "grad_norm": 62.74408721923828, "learning_rate": 3.1153018938543674e-08, "logits/chosen": -0.3634536862373352, "logits/rejected": -0.48061853647232056, "logps/chosen": -157.69241333007812, "logps/rejected": -209.5538330078125, "loss": 1.2621, "nll_loss": 0.9580618143081665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.60360050201416, "rewards/margins": 2.582735538482666, "rewards/rejected": 3.020864963531494, "step": 11230 }, { "epoch": 0.62356971470576, "grad_norm": 65.83464813232422, "learning_rate": 3.1072330213763734e-08, "logits/chosen": -0.2914223074913025, "logits/rejected": -0.44978776574134827, "logps/chosen": -169.06375122070312, "logps/rejected": -215.7394256591797, "loss": 1.2399, "nll_loss": 1.0182244777679443, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.937629222869873, "rewards/margins": 4.014005184173584, "rewards/rejected": 1.9236243963241577, "step": 11240 }, { "epoch": 0.6241244920320107, "grad_norm": 81.04288482666016, "learning_rate": 3.099169898612334e-08, "logits/chosen": -0.42526236176490784, "logits/rejected": -0.5600941181182861, "logps/chosen": -196.39120483398438, "logps/rejected": -255.0546417236328, "loss": 1.2505, "nll_loss": 1.025933861732483, "rewards/accuracies": 0.875, "rewards/chosen": 6.386963844299316, "rewards/margins": 3.5854058265686035, "rewards/rejected": 2.8015573024749756, "step": 11250 }, { "epoch": 0.6246792693582613, "grad_norm": 77.561767578125, "learning_rate": 3.091112550055832e-08, "logits/chosen": -0.263704389333725, "logits/rejected": -0.3660004734992981, "logps/chosen": -153.57029724121094, "logps/rejected": -196.36021423339844, "loss": 1.1754, "nll_loss": 0.8896039128303528, "rewards/accuracies": 0.875, "rewards/chosen": 5.714977741241455, "rewards/margins": 3.1963276863098145, "rewards/rejected": 2.5186495780944824, "step": 11260 }, { "epoch": 0.625234046684512, "grad_norm": 64.04689025878906, "learning_rate": 3.083061000182917e-08, "logits/chosen": -0.2049088180065155, "logits/rejected": -0.44929853081703186, "logps/chosen": -120.06229400634766, "logps/rejected": -159.4713592529297, "loss": 1.289, "nll_loss": 0.7990074753761292, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.157519340515137, "rewards/margins": 3.633838653564453, "rewards/rejected": 1.5236806869506836, "step": 11270 }, { "epoch": 0.6257888240107626, "grad_norm": 75.26434326171875, "learning_rate": 3.075015273452016e-08, "logits/chosen": -0.1911907196044922, "logits/rejected": -0.41610366106033325, "logps/chosen": -136.66331481933594, "logps/rejected": -191.37852478027344, "loss": 1.1743, "nll_loss": 0.7738946676254272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.248966217041016, "rewards/margins": 4.313348293304443, "rewards/rejected": 0.9356174468994141, "step": 11280 }, { "epoch": 0.6263436013370134, "grad_norm": 48.91354751586914, "learning_rate": 3.0669753943038706e-08, "logits/chosen": -0.3538931906223297, "logits/rejected": -0.49591121077537537, "logps/chosen": -180.0372772216797, "logps/rejected": -241.81753540039062, "loss": 1.202, "nll_loss": 1.0452322959899902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.185958385467529, "rewards/margins": 4.097734451293945, "rewards/rejected": 2.088223695755005, "step": 11290 }, { "epoch": 0.626898378663264, "grad_norm": 67.3182144165039, "learning_rate": 3.058941387161456e-08, "logits/chosen": -0.38178586959838867, "logits/rejected": -0.5090783834457397, "logps/chosen": -180.46243286132812, "logps/rejected": -235.2760772705078, "loss": 1.2854, "nll_loss": 1.0128010511398315, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.531017303466797, "rewards/margins": 3.047769546508789, "rewards/rejected": 3.483248233795166, "step": 11300 }, { "epoch": 0.6274531559895147, "grad_norm": 45.30299377441406, "learning_rate": 3.0509132764299164e-08, "logits/chosen": -0.4374946057796478, "logits/rejected": -0.5233668684959412, "logps/chosen": -180.13943481445312, "logps/rejected": -221.87490844726562, "loss": 1.2035, "nll_loss": 1.003361463546753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.882946968078613, "rewards/margins": 2.9304733276367188, "rewards/rejected": 2.9524731636047363, "step": 11310 }, { "epoch": 0.6280079333157654, "grad_norm": 85.49980926513672, "learning_rate": 3.042891086496477e-08, "logits/chosen": -0.2379666566848755, "logits/rejected": -0.42325901985168457, "logps/chosen": -152.56871032714844, "logps/rejected": -216.0143585205078, "loss": 1.1971, "nll_loss": 0.8649711608886719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.755773067474365, "rewards/margins": 4.935097694396973, "rewards/rejected": 0.820675253868103, "step": 11320 }, { "epoch": 0.6285627106420161, "grad_norm": 59.26189422607422, "learning_rate": 3.034874841730382e-08, "logits/chosen": -0.3192819058895111, "logits/rejected": -0.47028088569641113, "logps/chosen": -156.40786743164062, "logps/rejected": -207.82699584960938, "loss": 1.2448, "nll_loss": 0.9039347767829895, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.8771162033081055, "rewards/margins": 3.4613919258117676, "rewards/rejected": 2.415724277496338, "step": 11330 }, { "epoch": 0.6291174879682667, "grad_norm": 30.611183166503906, "learning_rate": 3.026864566482813e-08, "logits/chosen": -0.16953524947166443, "logits/rejected": -0.40352344512939453, "logps/chosen": -134.0901641845703, "logps/rejected": -189.51467895507812, "loss": 1.2499, "nll_loss": 0.774274468421936, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.538664817810059, "rewards/margins": 4.124792098999023, "rewards/rejected": 1.4138729572296143, "step": 11340 }, { "epoch": 0.6296722652945174, "grad_norm": 81.96833801269531, "learning_rate": 3.0188602850868185e-08, "logits/chosen": -0.3460689187049866, "logits/rejected": -0.5106383562088013, "logps/chosen": -172.60360717773438, "logps/rejected": -248.35702514648438, "loss": 1.2077, "nll_loss": 0.9729253053665161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.984989643096924, "rewards/margins": 3.9146087169647217, "rewards/rejected": 2.070380687713623, "step": 11350 }, { "epoch": 0.6302270426207681, "grad_norm": 99.990966796875, "learning_rate": 3.01086202185724e-08, "logits/chosen": -0.2937987446784973, "logits/rejected": -0.44375085830688477, "logps/chosen": -135.4306182861328, "logps/rejected": -175.08709716796875, "loss": 1.232, "nll_loss": 0.8019644618034363, "rewards/accuracies": 0.875, "rewards/chosen": 5.3385844230651855, "rewards/margins": 3.8010196685791016, "rewards/rejected": 1.5375645160675049, "step": 11360 }, { "epoch": 0.6307818199470188, "grad_norm": 63.24259567260742, "learning_rate": 3.002869801090638e-08, "logits/chosen": -0.2819620370864868, "logits/rejected": -0.43697643280029297, "logps/chosen": -128.35104370117188, "logps/rejected": -210.3355255126953, "loss": 1.2474, "nll_loss": 0.7971504926681519, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.393406867980957, "rewards/margins": 4.027044773101807, "rewards/rejected": 1.3663625717163086, "step": 11370 }, { "epoch": 0.6313365972732694, "grad_norm": 74.21778869628906, "learning_rate": 2.994883647065216e-08, "logits/chosen": -0.4357389807701111, "logits/rejected": -0.5347990989685059, "logps/chosen": -178.0835723876953, "logps/rejected": -229.2137908935547, "loss": 1.2147, "nll_loss": 0.9867460131645203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.3596696853637695, "rewards/margins": 3.9182181358337402, "rewards/rejected": 2.441451072692871, "step": 11380 }, { "epoch": 0.6318913745995202, "grad_norm": 59.573184967041016, "learning_rate": 2.98690358404075e-08, "logits/chosen": -0.3150174021720886, "logits/rejected": -0.44475072622299194, "logps/chosen": -152.2097625732422, "logps/rejected": -202.0469512939453, "loss": 1.3485, "nll_loss": 0.9981307983398438, "rewards/accuracies": 0.75, "rewards/chosen": 5.193087100982666, "rewards/margins": 2.5515267848968506, "rewards/rejected": 2.64155912399292, "step": 11390 }, { "epoch": 0.6324461519257708, "grad_norm": 62.636783599853516, "learning_rate": 2.978929636258508e-08, "logits/chosen": -0.49559253454208374, "logits/rejected": -0.6017329096794128, "logps/chosen": -169.13633728027344, "logps/rejected": -217.6695556640625, "loss": 1.318, "nll_loss": 1.0103613138198853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.36987829208374, "rewards/margins": 3.7874996662139893, "rewards/rejected": 2.582379102706909, "step": 11400 }, { "epoch": 0.6330009292520214, "grad_norm": 92.36743927001953, "learning_rate": 2.970961827941192e-08, "logits/chosen": -0.31166213750839233, "logits/rejected": -0.43062323331832886, "logps/chosen": -156.07284545898438, "logps/rejected": -200.2255401611328, "loss": 1.2467, "nll_loss": 0.8944070935249329, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.562036991119385, "rewards/margins": 3.1733527183532715, "rewards/rejected": 2.3886845111846924, "step": 11410 }, { "epoch": 0.6335557065782721, "grad_norm": 63.728736877441406, "learning_rate": 2.9630001832928447e-08, "logits/chosen": -0.3078330457210541, "logits/rejected": -0.3683183193206787, "logps/chosen": -191.08816528320312, "logps/rejected": -222.91458129882812, "loss": 1.3035, "nll_loss": 1.1246745586395264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.97721004486084, "rewards/margins": 2.9635345935821533, "rewards/rejected": 3.013674736022949, "step": 11420 }, { "epoch": 0.6341104839045228, "grad_norm": 68.14006805419922, "learning_rate": 2.955044726498789e-08, "logits/chosen": -0.3305138051509857, "logits/rejected": -0.43370646238327026, "logps/chosen": -148.52413940429688, "logps/rejected": -200.3426513671875, "loss": 1.2064, "nll_loss": 0.9519137144088745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.201475620269775, "rewards/margins": 3.2795891761779785, "rewards/rejected": 1.9218858480453491, "step": 11430 }, { "epoch": 0.6346652612307735, "grad_norm": 66.96622467041016, "learning_rate": 2.94709548172555e-08, "logits/chosen": -0.44262975454330444, "logits/rejected": -0.5093342065811157, "logps/chosen": -173.77906799316406, "logps/rejected": -233.55960083007812, "loss": 1.2975, "nll_loss": 1.1695655584335327, "rewards/accuracies": 0.875, "rewards/chosen": 6.275940418243408, "rewards/margins": 3.912945508956909, "rewards/rejected": 2.36299467086792, "step": 11440 }, { "epoch": 0.6352200385570241, "grad_norm": 63.15080642700195, "learning_rate": 2.9391524731207806e-08, "logits/chosen": -0.2056857794523239, "logits/rejected": -0.3412301540374756, "logps/chosen": -136.90403747558594, "logps/rejected": -157.2415313720703, "loss": 1.2194, "nll_loss": 0.8480414152145386, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.352352142333984, "rewards/margins": 3.119447946548462, "rewards/rejected": 2.2329041957855225, "step": 11450 }, { "epoch": 0.6357748158832749, "grad_norm": 76.51830291748047, "learning_rate": 2.931215724813195e-08, "logits/chosen": -0.30280178785324097, "logits/rejected": -0.44086456298828125, "logps/chosen": -156.64993286132812, "logps/rejected": -218.8702850341797, "loss": 1.3205, "nll_loss": 0.922582745552063, "rewards/accuracies": 0.875, "rewards/chosen": 5.7424116134643555, "rewards/margins": 3.8438477516174316, "rewards/rejected": 1.898564100265503, "step": 11460 }, { "epoch": 0.6363295932095255, "grad_norm": 77.34232330322266, "learning_rate": 2.9232852609124865e-08, "logits/chosen": -0.3481571674346924, "logits/rejected": -0.41753000020980835, "logps/chosen": -163.78369140625, "logps/rejected": -222.6044921875, "loss": 1.2826, "nll_loss": 1.0413355827331543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.836427688598633, "rewards/margins": 3.0139377117156982, "rewards/rejected": 2.822490692138672, "step": 11470 }, { "epoch": 0.6368843705357762, "grad_norm": 39.352542877197266, "learning_rate": 2.915361105509258e-08, "logits/chosen": -0.2512122094631195, "logits/rejected": -0.351836621761322, "logps/chosen": -166.06838989257812, "logps/rejected": -207.2430877685547, "loss": 1.2009, "nll_loss": 0.9286238551139832, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.441195487976074, "rewards/margins": 2.800069808959961, "rewards/rejected": 2.641125440597534, "step": 11480 }, { "epoch": 0.6374391478620269, "grad_norm": 33.86089324951172, "learning_rate": 2.9074432826749478e-08, "logits/chosen": -0.18522383272647858, "logits/rejected": -0.4233093857765198, "logps/chosen": -163.3899383544922, "logps/rejected": -212.404296875, "loss": 1.175, "nll_loss": 0.8490327000617981, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.2754621505737305, "rewards/margins": 4.018345832824707, "rewards/rejected": 1.2571160793304443, "step": 11490 }, { "epoch": 0.6379939251882776, "grad_norm": 58.185585021972656, "learning_rate": 2.899531816461761e-08, "logits/chosen": -0.48071545362472534, "logits/rejected": -0.6082251667976379, "logps/chosen": -180.03013610839844, "logps/rejected": -242.6453399658203, "loss": 1.3563, "nll_loss": 1.085688591003418, "rewards/accuracies": 0.875, "rewards/chosen": 6.111035346984863, "rewards/margins": 3.670149326324463, "rewards/rejected": 2.4408864974975586, "step": 11500 }, { "epoch": 0.6379939251882776, "eval_logits/chosen": -0.4249529242515564, "eval_logits/rejected": -0.5514447689056396, "eval_logps/chosen": -191.21336364746094, "eval_logps/rejected": -262.6428527832031, "eval_loss": 1.225059986114502, "eval_nll_loss": 0.9916093349456787, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.674785614013672, "eval_rewards/margins": 5.031222820281982, "eval_rewards/rejected": 1.643563151359558, "eval_runtime": 17.3547, "eval_samples_per_second": 14.751, "eval_steps_per_second": 1.844, "step": 11500 }, { "epoch": 0.6385487025145282, "grad_norm": 34.527950286865234, "learning_rate": 2.891626730902591e-08, "logits/chosen": -0.3377942144870758, "logits/rejected": -0.516598105430603, "logps/chosen": -141.05381774902344, "logps/rejected": -215.08511352539062, "loss": 1.2723, "nll_loss": 0.8781675100326538, "rewards/accuracies": 0.875, "rewards/chosen": 5.726903915405273, "rewards/margins": 4.327226161956787, "rewards/rejected": 1.3996771574020386, "step": 11510 }, { "epoch": 0.6391034798407789, "grad_norm": 65.67606353759766, "learning_rate": 2.8837280500109513e-08, "logits/chosen": -0.15242630243301392, "logits/rejected": -0.3232906460762024, "logps/chosen": -99.00645446777344, "logps/rejected": -157.29441833496094, "loss": 1.2496, "nll_loss": 0.7182794809341431, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.500516891479492, "rewards/margins": 3.2365946769714355, "rewards/rejected": 1.2639222145080566, "step": 11520 }, { "epoch": 0.6396582571670296, "grad_norm": 53.56328582763672, "learning_rate": 2.8758357977808935e-08, "logits/chosen": -0.25661394000053406, "logits/rejected": -0.4976634085178375, "logps/chosen": -157.01611328125, "logps/rejected": -216.496826171875, "loss": 1.2074, "nll_loss": 0.8713720440864563, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.420714378356934, "rewards/margins": 4.268970489501953, "rewards/rejected": 1.1517441272735596, "step": 11530 }, { "epoch": 0.6402130344932803, "grad_norm": 80.25347137451172, "learning_rate": 2.8679499981869477e-08, "logits/chosen": -0.4546354413032532, "logits/rejected": -0.56840980052948, "logps/chosen": -214.19345092773438, "logps/rejected": -275.9617919921875, "loss": 1.2605, "nll_loss": 1.1772795915603638, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.892640590667725, "rewards/margins": 3.761946201324463, "rewards/rejected": 3.1306941509246826, "step": 11540 }, { "epoch": 0.6407678118195309, "grad_norm": 43.13340377807617, "learning_rate": 2.860070675184036e-08, "logits/chosen": -0.3888665437698364, "logits/rejected": -0.48605984449386597, "logps/chosen": -158.18948364257812, "logps/rejected": -224.8271026611328, "loss": 1.2816, "nll_loss": 0.9999931454658508, "rewards/accuracies": 0.875, "rewards/chosen": 5.979371547698975, "rewards/margins": 4.293971061706543, "rewards/rejected": 1.6854002475738525, "step": 11550 }, { "epoch": 0.6413225891457817, "grad_norm": 45.231292724609375, "learning_rate": 2.8521978527074115e-08, "logits/chosen": -0.3281251788139343, "logits/rejected": -0.4574473798274994, "logps/chosen": -141.13717651367188, "logps/rejected": -199.07858276367188, "loss": 1.2125, "nll_loss": 0.8954921960830688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.681342124938965, "rewards/margins": 3.7514350414276123, "rewards/rejected": 1.929907202720642, "step": 11560 }, { "epoch": 0.6418773664720323, "grad_norm": 67.2186050415039, "learning_rate": 2.844331554672581e-08, "logits/chosen": -0.3187335431575775, "logits/rejected": -0.42251071333885193, "logps/chosen": -172.05410766601562, "logps/rejected": -208.20034790039062, "loss": 1.2633, "nll_loss": 1.0226013660430908, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.629790782928467, "rewards/margins": 2.8976128101348877, "rewards/rejected": 2.732178211212158, "step": 11570 }, { "epoch": 0.642432143798283, "grad_norm": 66.80694580078125, "learning_rate": 2.836471804975225e-08, "logits/chosen": -0.15100358426570892, "logits/rejected": -0.3213760554790497, "logps/chosen": -132.5699920654297, "logps/rejected": -180.60330200195312, "loss": 1.223, "nll_loss": 0.7963281869888306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.969555854797363, "rewards/margins": 2.9499268531799316, "rewards/rejected": 2.0196290016174316, "step": 11580 }, { "epoch": 0.6429869211245336, "grad_norm": 70.9541244506836, "learning_rate": 2.828618627491141e-08, "logits/chosen": -0.39181265234947205, "logits/rejected": -0.576582670211792, "logps/chosen": -174.08493041992188, "logps/rejected": -240.56185913085938, "loss": 1.2268, "nll_loss": 0.9803875684738159, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.388065814971924, "rewards/margins": 4.782280921936035, "rewards/rejected": 1.6057850122451782, "step": 11590 }, { "epoch": 0.6435416984507843, "grad_norm": 43.877098083496094, "learning_rate": 2.820772046076152e-08, "logits/chosen": -0.2085866928100586, "logits/rejected": -0.47025489807128906, "logps/chosen": -110.3382568359375, "logps/rejected": -189.8135528564453, "loss": 1.189, "nll_loss": 0.7289911508560181, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.165595054626465, "rewards/margins": 4.362338066101074, "rewards/rejected": 0.8032568097114563, "step": 11600 }, { "epoch": 0.644096475777035, "grad_norm": 32.87038803100586, "learning_rate": 2.8129320845660555e-08, "logits/chosen": -0.1900126188993454, "logits/rejected": -0.40282735228538513, "logps/chosen": -125.1517333984375, "logps/rejected": -196.83175659179688, "loss": 1.2788, "nll_loss": 0.7826611399650574, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.2875776290893555, "rewards/margins": 4.222311019897461, "rewards/rejected": 1.0652662515640259, "step": 11610 }, { "epoch": 0.6446512531032856, "grad_norm": 28.977495193481445, "learning_rate": 2.8050987667765286e-08, "logits/chosen": -0.46628230810165405, "logits/rejected": -0.5823394060134888, "logps/chosen": -183.65231323242188, "logps/rejected": -216.43801879882812, "loss": 1.2014, "nll_loss": 1.0308088064193726, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.078799247741699, "rewards/margins": 3.7072880268096924, "rewards/rejected": 2.3715109825134277, "step": 11620 }, { "epoch": 0.6452060304295364, "grad_norm": 50.18671798706055, "learning_rate": 2.797272116503075e-08, "logits/chosen": -0.39025577902793884, "logits/rejected": -0.4783563017845154, "logps/chosen": -159.52896118164062, "logps/rejected": -198.98770141601562, "loss": 1.274, "nll_loss": 1.0031954050064087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.547030925750732, "rewards/margins": 2.9004714488983154, "rewards/rejected": 2.646559476852417, "step": 11630 }, { "epoch": 0.645760807755787, "grad_norm": 89.77849578857422, "learning_rate": 2.7894521575209363e-08, "logits/chosen": -0.2552987337112427, "logits/rejected": -0.44860172271728516, "logps/chosen": -123.27348327636719, "logps/rejected": -187.30453491210938, "loss": 1.1879, "nll_loss": 0.8547815084457397, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.886176109313965, "rewards/margins": 3.252734422683716, "rewards/rejected": 1.6334421634674072, "step": 11640 }, { "epoch": 0.6463155850820377, "grad_norm": 75.75540161132812, "learning_rate": 2.7816389135850348e-08, "logits/chosen": -0.3667968213558197, "logits/rejected": -0.49988412857055664, "logps/chosen": -178.33102416992188, "logps/rejected": -265.7437438964844, "loss": 1.2416, "nll_loss": 0.9843828082084656, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.9211835861206055, "rewards/margins": 3.6456680297851562, "rewards/rejected": 2.275515079498291, "step": 11650 }, { "epoch": 0.6468703624082883, "grad_norm": 133.79086303710938, "learning_rate": 2.7738324084298927e-08, "logits/chosen": -0.46171092987060547, "logits/rejected": -0.5715997219085693, "logps/chosen": -183.53390502929688, "logps/rejected": -259.72418212890625, "loss": 1.2338, "nll_loss": 1.0586216449737549, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.598612308502197, "rewards/margins": 4.113223552703857, "rewards/rejected": 2.4853885173797607, "step": 11660 }, { "epoch": 0.6474251397345391, "grad_norm": 96.0303726196289, "learning_rate": 2.7660326657695572e-08, "logits/chosen": -0.3845437169075012, "logits/rejected": -0.502238392829895, "logps/chosen": -165.1953887939453, "logps/rejected": -206.5979766845703, "loss": 1.2835, "nll_loss": 1.031736135482788, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.993288993835449, "rewards/margins": 3.625528335571289, "rewards/rejected": 2.367759943008423, "step": 11670 }, { "epoch": 0.6479799170607897, "grad_norm": 50.27488327026367, "learning_rate": 2.7582397092975395e-08, "logits/chosen": -0.33000046014785767, "logits/rejected": -0.5031772255897522, "logps/chosen": -139.52232360839844, "logps/rejected": -210.4247589111328, "loss": 1.2628, "nll_loss": 0.9663726687431335, "rewards/accuracies": 0.875, "rewards/chosen": 4.999209403991699, "rewards/margins": 3.4824111461639404, "rewards/rejected": 1.5167982578277588, "step": 11680 }, { "epoch": 0.6485346943870404, "grad_norm": 63.37350082397461, "learning_rate": 2.7504535626867288e-08, "logits/chosen": -0.3095516860485077, "logits/rejected": -0.45925372838974, "logps/chosen": -192.0953826904297, "logps/rejected": -253.19265747070312, "loss": 1.1787, "nll_loss": 1.0016443729400635, "rewards/accuracies": 0.875, "rewards/chosen": 6.674686431884766, "rewards/margins": 4.775921821594238, "rewards/rejected": 1.8987648487091064, "step": 11690 }, { "epoch": 0.6490894717132911, "grad_norm": 80.29125213623047, "learning_rate": 2.742674249589334e-08, "logits/chosen": -0.3295501470565796, "logits/rejected": -0.5117398500442505, "logps/chosen": -196.1520538330078, "logps/rejected": -272.27020263671875, "loss": 1.2747, "nll_loss": 1.0100562572479248, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.9651079177856445, "rewards/margins": 4.8849310874938965, "rewards/rejected": 1.0801770687103271, "step": 11700 }, { "epoch": 0.6496442490395418, "grad_norm": 60.28606414794922, "learning_rate": 2.7349017936368034e-08, "logits/chosen": -0.26497378945350647, "logits/rejected": -0.4506555199623108, "logps/chosen": -155.94403076171875, "logps/rejected": -207.8164520263672, "loss": 1.433, "nll_loss": 0.8810884356498718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.338873863220215, "rewards/margins": 3.31561279296875, "rewards/rejected": 2.023261547088623, "step": 11710 }, { "epoch": 0.6501990263657924, "grad_norm": 36.00334930419922, "learning_rate": 2.7271362184397573e-08, "logits/chosen": -0.5077487230300903, "logits/rejected": -0.5495038628578186, "logps/chosen": -198.93734741210938, "logps/rejected": -220.6064453125, "loss": 1.3674, "nll_loss": 1.247666597366333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 6.006911277770996, "rewards/margins": 1.937212347984314, "rewards/rejected": 4.069699287414551, "step": 11720 }, { "epoch": 0.650753803692043, "grad_norm": 57.20671463012695, "learning_rate": 2.7193775475879104e-08, "logits/chosen": -0.25387778878211975, "logits/rejected": -0.4570907652378082, "logps/chosen": -136.553466796875, "logps/rejected": -196.80422973632812, "loss": 1.2568, "nll_loss": 0.8721553683280945, "rewards/accuracies": 0.75, "rewards/chosen": 5.297316551208496, "rewards/margins": 2.8612141609191895, "rewards/rejected": 2.4361026287078857, "step": 11730 }, { "epoch": 0.6513085810182938, "grad_norm": 52.974388122558594, "learning_rate": 2.711625804650003e-08, "logits/chosen": -0.33820220828056335, "logits/rejected": -0.42576026916503906, "logps/chosen": -161.24368286132812, "logps/rejected": -227.69442749023438, "loss": 1.2178, "nll_loss": 0.9846166372299194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.212119102478027, "rewards/margins": 3.714268207550049, "rewards/rejected": 2.4978511333465576, "step": 11740 }, { "epoch": 0.6518633583445445, "grad_norm": 57.019203186035156, "learning_rate": 2.7038810131737344e-08, "logits/chosen": -0.3755797743797302, "logits/rejected": -0.5482282638549805, "logps/chosen": -146.35415649414062, "logps/rejected": -189.00033569335938, "loss": 1.2881, "nll_loss": 0.9598537683486938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.358166694641113, "rewards/margins": 3.091294050216675, "rewards/rejected": 2.2668726444244385, "step": 11750 }, { "epoch": 0.6524181356707951, "grad_norm": 74.11688232421875, "learning_rate": 2.6961431966856862e-08, "logits/chosen": -0.4043899476528168, "logits/rejected": -0.47904711961746216, "logps/chosen": -173.64895629882812, "logps/rejected": -223.4357452392578, "loss": 1.2868, "nll_loss": 0.9924699068069458, "rewards/accuracies": 0.875, "rewards/chosen": 6.230164527893066, "rewards/margins": 3.5057075023651123, "rewards/rejected": 2.724457263946533, "step": 11760 }, { "epoch": 0.6529729129970459, "grad_norm": 48.800636291503906, "learning_rate": 2.688412378691253e-08, "logits/chosen": -0.30842381715774536, "logits/rejected": -0.4647197723388672, "logps/chosen": -157.63990783691406, "logps/rejected": -211.4970245361328, "loss": 1.2867, "nll_loss": 0.9230083227157593, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.783360481262207, "rewards/margins": 3.545384168624878, "rewards/rejected": 2.237976312637329, "step": 11770 }, { "epoch": 0.6535276903232965, "grad_norm": 68.97724914550781, "learning_rate": 2.6806885826745644e-08, "logits/chosen": -0.4042227268218994, "logits/rejected": -0.5165958404541016, "logps/chosen": -204.16995239257812, "logps/rejected": -252.27609252929688, "loss": 1.2793, "nll_loss": 1.0783510208129883, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.4781599044799805, "rewards/margins": 3.8852744102478027, "rewards/rejected": 2.5928854942321777, "step": 11780 }, { "epoch": 0.6540824676495471, "grad_norm": 59.931514739990234, "learning_rate": 2.672971832098426e-08, "logits/chosen": -0.2738388180732727, "logits/rejected": -0.4349437654018402, "logps/chosen": -182.16796875, "logps/rejected": -263.86260986328125, "loss": 1.1825, "nll_loss": 0.9843899011611938, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.737701892852783, "rewards/margins": 3.4806103706359863, "rewards/rejected": 2.257091522216797, "step": 11790 }, { "epoch": 0.6546372449757979, "grad_norm": 34.759620666503906, "learning_rate": 2.665262150404236e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -164.31822204589844, "logps/rejected": -224.2421417236328, "loss": 1.1675, "nll_loss": NaN, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.8863844871521, "rewards/margins": 3.647648572921753, "rewards/rejected": 2.238736152648926, "step": 11800 }, { "epoch": 0.6551920223020485, "grad_norm": 34.575355529785156, "learning_rate": 2.6575595610119217e-08, "logits/chosen": -0.17474150657653809, "logits/rejected": -0.4074668288230896, "logps/chosen": -120.413330078125, "logps/rejected": -199.8701629638672, "loss": 1.2722, "nll_loss": 0.734146773815155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.292850017547607, "rewards/margins": 4.236934661865234, "rewards/rejected": 1.0559158325195312, "step": 11810 }, { "epoch": 0.6557467996282992, "grad_norm": 77.0059814453125, "learning_rate": 2.6498640873198674e-08, "logits/chosen": -0.18045705556869507, "logits/rejected": -0.3690612316131592, "logps/chosen": -133.09133911132812, "logps/rejected": -167.37332153320312, "loss": 1.2525, "nll_loss": 0.8449319005012512, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.069517612457275, "rewards/margins": 3.9457669258117676, "rewards/rejected": 1.1237508058547974, "step": 11820 }, { "epoch": 0.6563015769545498, "grad_norm": 43.84608459472656, "learning_rate": 2.6421757527048373e-08, "logits/chosen": -0.24172338843345642, "logits/rejected": -0.30262669920921326, "logps/chosen": -159.74351501464844, "logps/rejected": -218.16470336914062, "loss": 1.2134, "nll_loss": 0.9308856129646301, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.76108455657959, "rewards/margins": 3.665342330932617, "rewards/rejected": 2.0957419872283936, "step": 11830 }, { "epoch": 0.6568563542808006, "grad_norm": 54.546878814697266, "learning_rate": 2.6344945805219154e-08, "logits/chosen": -0.3484647870063782, "logits/rejected": -0.4647384583950043, "logps/chosen": -181.24209594726562, "logps/rejected": -244.99813842773438, "loss": 1.188, "nll_loss": 0.9683634042739868, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.014298439025879, "rewards/margins": 3.6213595867156982, "rewards/rejected": 2.3929383754730225, "step": 11840 }, { "epoch": 0.6574111316070512, "grad_norm": 75.54428100585938, "learning_rate": 2.6268205941044174e-08, "logits/chosen": -0.31292271614074707, "logits/rejected": -0.46505337953567505, "logps/chosen": -183.8721466064453, "logps/rejected": -241.2721710205078, "loss": 1.1855, "nll_loss": 0.9584083557128906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.108242034912109, "rewards/margins": 5.03185510635376, "rewards/rejected": 1.076386570930481, "step": 11850 }, { "epoch": 0.6579659089333019, "grad_norm": 86.36827087402344, "learning_rate": 2.6191538167638473e-08, "logits/chosen": -0.1850053369998932, "logits/rejected": -0.3884859085083008, "logps/chosen": -158.62387084960938, "logps/rejected": -202.6744842529297, "loss": 1.34, "nll_loss": 0.9155745506286621, "rewards/accuracies": 0.75, "rewards/chosen": 5.202109336853027, "rewards/margins": 2.996246814727783, "rewards/rejected": 2.205862522125244, "step": 11860 }, { "epoch": 0.6585206862595526, "grad_norm": 34.229007720947266, "learning_rate": 2.6114942717897924e-08, "logits/chosen": -0.3144778907299042, "logits/rejected": -0.47114571928977966, "logps/chosen": -147.51065063476562, "logps/rejected": -222.63412475585938, "loss": 1.1944, "nll_loss": 0.9164835810661316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.97763729095459, "rewards/margins": 3.2676658630371094, "rewards/rejected": 2.7099711894989014, "step": 11870 }, { "epoch": 0.6590754635858033, "grad_norm": 77.99613952636719, "learning_rate": 2.6038419824498836e-08, "logits/chosen": -0.45269614458084106, "logits/rejected": -0.608244776725769, "logps/chosen": -184.53318786621094, "logps/rejected": -231.4016876220703, "loss": 1.2404, "nll_loss": 1.023178219795227, "rewards/accuracies": 0.875, "rewards/chosen": 6.559817314147949, "rewards/margins": 3.81182861328125, "rewards/rejected": 2.7479889392852783, "step": 11880 }, { "epoch": 0.6596302409120539, "grad_norm": 107.27423858642578, "learning_rate": 2.5961969719897002e-08, "logits/chosen": -0.4463115632534027, "logits/rejected": -0.4912208616733551, "logps/chosen": -204.35488891601562, "logps/rejected": -226.47634887695312, "loss": 1.3068, "nll_loss": 1.0930724143981934, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.605230808258057, "rewards/margins": 3.248439311981201, "rewards/rejected": 3.3567910194396973, "step": 11890 }, { "epoch": 0.6601850182383046, "grad_norm": 58.7518196105957, "learning_rate": 2.5885592636327185e-08, "logits/chosen": -0.308633416891098, "logits/rejected": -0.47308340668678284, "logps/chosen": -154.84756469726562, "logps/rejected": -221.26101684570312, "loss": 1.2456, "nll_loss": 0.8628350496292114, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.491322994232178, "rewards/margins": 3.4300694465637207, "rewards/rejected": 2.0612540245056152, "step": 11900 }, { "epoch": 0.6607397955645553, "grad_norm": 47.63860321044922, "learning_rate": 2.5809288805802314e-08, "logits/chosen": -0.2752883732318878, "logits/rejected": -0.3745267987251282, "logps/chosen": -156.73974609375, "logps/rejected": -191.37518310546875, "loss": 1.1428, "nll_loss": 0.9624244570732117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.198734283447266, "rewards/margins": 3.503079891204834, "rewards/rejected": 2.6956539154052734, "step": 11910 }, { "epoch": 0.661294572890806, "grad_norm": 53.0847053527832, "learning_rate": 2.5733058460112745e-08, "logits/chosen": -0.28915005922317505, "logits/rejected": -0.4263629913330078, "logps/chosen": -154.884765625, "logps/rejected": -195.12173461914062, "loss": 1.2892, "nll_loss": 0.9579075574874878, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.69394063949585, "rewards/margins": 3.602841854095459, "rewards/rejected": 2.091099262237549, "step": 11920 }, { "epoch": 0.6618493502170566, "grad_norm": 52.77870178222656, "learning_rate": 2.565690183082567e-08, "logits/chosen": 0.02300594374537468, "logits/rejected": -0.18990465998649597, "logps/chosen": -117.44551849365234, "logps/rejected": -164.27191162109375, "loss": 1.2247, "nll_loss": 0.6829160451889038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.389244556427002, "rewards/margins": 3.0034196376800537, "rewards/rejected": 1.3858246803283691, "step": 11930 }, { "epoch": 0.6624041275433074, "grad_norm": 64.58387756347656, "learning_rate": 2.5580819149284294e-08, "logits/chosen": -0.3967145085334778, "logits/rejected": -0.5213366746902466, "logps/chosen": -188.41629028320312, "logps/rejected": -241.98794555664062, "loss": 1.2769, "nll_loss": 1.058941125869751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.2895283699035645, "rewards/margins": 3.641815185546875, "rewards/rejected": 2.6477131843566895, "step": 11940 }, { "epoch": 0.662958904869558, "grad_norm": 68.38165283203125, "learning_rate": 2.550481064660724e-08, "logits/chosen": -0.4660201966762543, "logits/rejected": -0.5716916918754578, "logps/chosen": -185.03787231445312, "logps/rejected": -262.6652526855469, "loss": 1.2245, "nll_loss": 1.1056232452392578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.7243852615356445, "rewards/margins": 4.855156421661377, "rewards/rejected": 1.869228720664978, "step": 11950 }, { "epoch": 0.6635136821958086, "grad_norm": 75.75467681884766, "learning_rate": 2.5428876553687785e-08, "logits/chosen": -0.27877897024154663, "logits/rejected": -0.44745931029319763, "logps/chosen": -153.05589294433594, "logps/rejected": -215.16299438476562, "loss": 1.2257, "nll_loss": 0.940390944480896, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.786623001098633, "rewards/margins": 4.343803882598877, "rewards/rejected": 1.442819356918335, "step": 11960 }, { "epoch": 0.6640684595220593, "grad_norm": 132.7093963623047, "learning_rate": 2.5353017101193118e-08, "logits/chosen": -0.19808785617351532, "logits/rejected": -0.30676984786987305, "logps/chosen": -155.75851440429688, "logps/rejected": -188.7913360595703, "loss": 1.299, "nll_loss": 1.008725643157959, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.505126953125, "rewards/margins": 3.052074909210205, "rewards/rejected": 2.453052043914795, "step": 11970 }, { "epoch": 0.66462323684831, "grad_norm": 55.54659652709961, "learning_rate": 2.5277232519563786e-08, "logits/chosen": -0.37800878286361694, "logits/rejected": -0.5081731081008911, "logps/chosen": -205.9271240234375, "logps/rejected": -258.99505615234375, "loss": 1.2852, "nll_loss": 1.035773754119873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.5168352127075195, "rewards/margins": 3.7608962059020996, "rewards/rejected": 2.7559380531311035, "step": 11980 }, { "epoch": 0.6651780141745607, "grad_norm": 58.10818099975586, "learning_rate": 2.5201523039012786e-08, "logits/chosen": -0.3581780791282654, "logits/rejected": -0.45601707696914673, "logps/chosen": -149.41586303710938, "logps/rejected": -208.29531860351562, "loss": 1.1857, "nll_loss": 1.0122449398040771, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.706791877746582, "rewards/margins": 3.4607417583465576, "rewards/rejected": 2.2460505962371826, "step": 11990 }, { "epoch": 0.6657327915008113, "grad_norm": 48.28881072998047, "learning_rate": 2.5125888889525053e-08, "logits/chosen": -0.43195661902427673, "logits/rejected": -0.5321120023727417, "logps/chosen": -217.3926544189453, "logps/rejected": -306.47174072265625, "loss": 1.2693, "nll_loss": 1.1178908348083496, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 7.168890476226807, "rewards/margins": 5.198139190673828, "rewards/rejected": 1.9707515239715576, "step": 12000 }, { "epoch": 0.6657327915008113, "eval_logits/chosen": -0.4029172360897064, "eval_logits/rejected": -0.5176486372947693, "eval_logps/chosen": -190.4573516845703, "eval_logps/rejected": -261.09027099609375, "eval_loss": 1.216133713722229, "eval_nll_loss": 0.9862600564956665, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.750385761260986, "eval_rewards/margins": 4.95156192779541, "eval_rewards/rejected": 1.7988238334655762, "eval_runtime": 16.8792, "eval_samples_per_second": 15.167, "eval_steps_per_second": 1.896, "step": 12000 }, { "epoch": 0.6662875688270621, "grad_norm": 63.904869079589844, "learning_rate": 2.505033030085668e-08, "logits/chosen": -0.2707623839378357, "logits/rejected": -0.41992464661598206, "logps/chosen": -170.99859619140625, "logps/rejected": -202.93099975585938, "loss": 1.1845, "nll_loss": 0.8949660062789917, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.781935691833496, "rewards/margins": 4.267865180969238, "rewards/rejected": 1.514070987701416, "step": 12010 }, { "epoch": 0.6668423461533127, "grad_norm": 48.43641662597656, "learning_rate": 2.4974847502534236e-08, "logits/chosen": -0.21682846546173096, "logits/rejected": -0.332996666431427, "logps/chosen": -133.58831787109375, "logps/rejected": -176.17198181152344, "loss": 1.3191, "nll_loss": 0.8816951513290405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.321299076080322, "rewards/margins": 2.6478521823883057, "rewards/rejected": 2.6734461784362793, "step": 12020 }, { "epoch": 0.6673971234795634, "grad_norm": 47.73326110839844, "learning_rate": 2.4899440723853993e-08, "logits/chosen": -0.32461345195770264, "logits/rejected": -0.508495569229126, "logps/chosen": -155.72108459472656, "logps/rejected": -234.18374633789062, "loss": 1.1253, "nll_loss": 0.8847814798355103, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.887340545654297, "rewards/margins": 4.291019439697266, "rewards/rejected": 1.5963211059570312, "step": 12030 }, { "epoch": 0.667951900805814, "grad_norm": 45.27253723144531, "learning_rate": 2.482411019388138e-08, "logits/chosen": -0.3711601495742798, "logits/rejected": -0.46444177627563477, "logps/chosen": -165.60052490234375, "logps/rejected": -220.0888214111328, "loss": 1.2507, "nll_loss": 0.9290505647659302, "rewards/accuracies": 0.875, "rewards/chosen": 5.9535369873046875, "rewards/margins": 3.6149909496307373, "rewards/rejected": 2.338545799255371, "step": 12040 }, { "epoch": 0.6685066781320648, "grad_norm": 38.41141891479492, "learning_rate": 2.474885614145013e-08, "logits/chosen": -0.2623186707496643, "logits/rejected": -0.40427589416503906, "logps/chosen": -200.8873291015625, "logps/rejected": -252.84957885742188, "loss": 1.2134, "nll_loss": 1.0127166509628296, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.942787170410156, "rewards/margins": 3.569545269012451, "rewards/rejected": 2.373241901397705, "step": 12050 }, { "epoch": 0.6690614554583154, "grad_norm": 54.231590270996094, "learning_rate": 2.467367879516171e-08, "logits/chosen": -0.19070479273796082, "logits/rejected": -0.3572729527950287, "logps/chosen": -141.1498260498047, "logps/rejected": -185.4639129638672, "loss": 1.1299, "nll_loss": 0.8368833661079407, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.0277299880981445, "rewards/margins": 3.8939177989959717, "rewards/rejected": 1.133811354637146, "step": 12060 }, { "epoch": 0.6696162327845661, "grad_norm": 107.39879608154297, "learning_rate": 2.4598578383384577e-08, "logits/chosen": -0.19599628448486328, "logits/rejected": -0.3971938490867615, "logps/chosen": -150.45681762695312, "logps/rejected": -206.2320098876953, "loss": 1.198, "nll_loss": 0.8617413640022278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.54864501953125, "rewards/margins": 4.213971138000488, "rewards/rejected": 1.3346747159957886, "step": 12070 }, { "epoch": 0.6701710101108168, "grad_norm": 69.48493957519531, "learning_rate": 2.4523555134253427e-08, "logits/chosen": -0.1980140507221222, "logits/rejected": -0.3805929720401764, "logps/chosen": -156.84254455566406, "logps/rejected": -232.4204559326172, "loss": 1.2391, "nll_loss": 0.8746329545974731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.7141547203063965, "rewards/margins": 4.587075233459473, "rewards/rejected": 1.1270796060562134, "step": 12080 }, { "epoch": 0.6707257874370675, "grad_norm": 33.65756607055664, "learning_rate": 2.4448609275668624e-08, "logits/chosen": -0.325199693441391, "logits/rejected": -0.4760337471961975, "logps/chosen": -140.7031707763672, "logps/rejected": -217.14047241210938, "loss": 1.196, "nll_loss": 0.9230989217758179, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.959973335266113, "rewards/margins": 4.0895891189575195, "rewards/rejected": 1.870383858680725, "step": 12090 }, { "epoch": 0.6712805647633181, "grad_norm": 54.62273406982422, "learning_rate": 2.4373741035295354e-08, "logits/chosen": -0.385964959859848, "logits/rejected": -0.5205613374710083, "logps/chosen": -181.7679901123047, "logps/rejected": -245.03591918945312, "loss": 1.2331, "nll_loss": 1.009312391281128, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.756820201873779, "rewards/margins": 3.7107512950897217, "rewards/rejected": 2.0460691452026367, "step": 12100 }, { "epoch": 0.6718353420895689, "grad_norm": 42.30917739868164, "learning_rate": 2.4298950640563153e-08, "logits/chosen": -0.2956189215183258, "logits/rejected": -0.4312848448753357, "logps/chosen": -148.74661254882812, "logps/rejected": -206.2273712158203, "loss": 1.302, "nll_loss": 0.9868310689926147, "rewards/accuracies": 0.75, "rewards/chosen": 5.419343948364258, "rewards/margins": 3.17564058303833, "rewards/rejected": 2.2437033653259277, "step": 12110 }, { "epoch": 0.6723901194158195, "grad_norm": 64.55408477783203, "learning_rate": 2.422423831866494e-08, "logits/chosen": -0.37799012660980225, "logits/rejected": -0.494337797164917, "logps/chosen": -189.30006408691406, "logps/rejected": -283.4101867675781, "loss": 1.3102, "nll_loss": 1.084997296333313, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.286576271057129, "rewards/margins": 4.620743274688721, "rewards/rejected": 1.665832281112671, "step": 12120 }, { "epoch": 0.6729448967420701, "grad_norm": 68.30670928955078, "learning_rate": 2.4149604296556582e-08, "logits/chosen": -0.17044074833393097, "logits/rejected": -0.3540880084037781, "logps/chosen": -150.18106079101562, "logps/rejected": -228.80203247070312, "loss": 1.1923, "nll_loss": 0.8533521890640259, "rewards/accuracies": 0.875, "rewards/chosen": 5.593564033508301, "rewards/margins": 4.3736572265625, "rewards/rejected": 1.2199056148529053, "step": 12130 }, { "epoch": 0.6734996740683208, "grad_norm": 42.05036163330078, "learning_rate": 2.4075048800955994e-08, "logits/chosen": -0.2699395716190338, "logits/rejected": -0.4457497000694275, "logps/chosen": -174.43775939941406, "logps/rejected": -227.38082885742188, "loss": 1.1834, "nll_loss": 0.9571785926818848, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.090694904327393, "rewards/margins": 4.016547679901123, "rewards/rejected": 2.0741469860076904, "step": 12140 }, { "epoch": 0.6740544513945715, "grad_norm": 33.59762954711914, "learning_rate": 2.4000572058342634e-08, "logits/chosen": -0.2383767068386078, "logits/rejected": -0.3165499269962311, "logps/chosen": -158.05007934570312, "logps/rejected": -225.517822265625, "loss": 1.2319, "nll_loss": 0.9493352770805359, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.893635272979736, "rewards/margins": 4.028491497039795, "rewards/rejected": 1.8651440143585205, "step": 12150 }, { "epoch": 0.6746092287208222, "grad_norm": 77.18594360351562, "learning_rate": 2.3926174294956696e-08, "logits/chosen": -0.2202729731798172, "logits/rejected": -0.40815192461013794, "logps/chosen": -154.131591796875, "logps/rejected": -213.88162231445312, "loss": 1.3252, "nll_loss": 0.9396843910217285, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.392186164855957, "rewards/margins": 4.005577087402344, "rewards/rejected": 1.386609435081482, "step": 12160 }, { "epoch": 0.6751640060470728, "grad_norm": 56.84512710571289, "learning_rate": 2.3851855736798433e-08, "logits/chosen": -0.40499407052993774, "logits/rejected": -0.46631139516830444, "logps/chosen": -187.3240966796875, "logps/rejected": -227.35302734375, "loss": 1.2733, "nll_loss": 1.0406402349472046, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.996212005615234, "rewards/margins": 2.9431254863739014, "rewards/rejected": 3.053086280822754, "step": 12170 }, { "epoch": 0.6757187833733236, "grad_norm": 62.96339797973633, "learning_rate": 2.377761660962754e-08, "logits/chosen": -0.19384492933750153, "logits/rejected": -0.4142443537712097, "logps/chosen": -120.52742004394531, "logps/rejected": -154.69898986816406, "loss": 1.2966, "nll_loss": 0.6951400637626648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.903653144836426, "rewards/margins": 3.6983203887939453, "rewards/rejected": 1.2053325176239014, "step": 12180 }, { "epoch": 0.6762735606995742, "grad_norm": 41.99937438964844, "learning_rate": 2.3703457138962373e-08, "logits/chosen": -0.3248792290687561, "logits/rejected": -0.4245285987854004, "logps/chosen": -170.15814208984375, "logps/rejected": -214.8865509033203, "loss": 1.2082, "nll_loss": 1.0116769075393677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.878182411193848, "rewards/margins": 3.5163872241973877, "rewards/rejected": 2.361795425415039, "step": 12190 }, { "epoch": 0.6768283380258249, "grad_norm": 54.36767578125, "learning_rate": 2.362937755007935e-08, "logits/chosen": -0.25324520468711853, "logits/rejected": -0.3982846140861511, "logps/chosen": -148.38656616210938, "logps/rejected": -229.2409210205078, "loss": 1.2597, "nll_loss": 0.8752374649047852, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.8527655601501465, "rewards/margins": 3.713012218475342, "rewards/rejected": 2.139753818511963, "step": 12200 }, { "epoch": 0.6773831153520755, "grad_norm": 73.44265747070312, "learning_rate": 2.355537806801224e-08, "logits/chosen": -0.3576010763645172, "logits/rejected": -0.48581376671791077, "logps/chosen": -172.70126342773438, "logps/rejected": -222.27035522460938, "loss": 1.2391, "nll_loss": 0.9975396990776062, "rewards/accuracies": 0.875, "rewards/chosen": 6.035371780395508, "rewards/margins": 3.65791392326355, "rewards/rejected": 2.377457618713379, "step": 12210 }, { "epoch": 0.6779378926783263, "grad_norm": 135.85301208496094, "learning_rate": 2.3481458917551412e-08, "logits/chosen": -0.2711308002471924, "logits/rejected": -0.4493893086910248, "logps/chosen": -121.0526351928711, "logps/rejected": -175.5157012939453, "loss": 1.2526, "nll_loss": 0.8514910936355591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 4.768159866333008, "rewards/margins": 2.7525486946105957, "rewards/rejected": 2.015611171722412, "step": 12220 }, { "epoch": 0.6784926700045769, "grad_norm": 52.4161491394043, "learning_rate": 2.3407620323243276e-08, "logits/chosen": -0.29157713055610657, "logits/rejected": -0.4096639156341553, "logps/chosen": -169.6468048095703, "logps/rejected": -217.779296875, "loss": 1.2374, "nll_loss": 0.9138143658638, "rewards/accuracies": 0.875, "rewards/chosen": 6.091879367828369, "rewards/margins": 3.7146849632263184, "rewards/rejected": 2.3771939277648926, "step": 12230 }, { "epoch": 0.6790474473308276, "grad_norm": 58.094173431396484, "learning_rate": 2.3333862509389453e-08, "logits/chosen": -0.2922077775001526, "logits/rejected": -0.43591269850730896, "logps/chosen": -153.40057373046875, "logps/rejected": -206.0133056640625, "loss": 1.2634, "nll_loss": 0.921014666557312, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.6673455238342285, "rewards/margins": 3.7713565826416016, "rewards/rejected": 1.8959894180297852, "step": 12240 }, { "epoch": 0.6796022246570783, "grad_norm": 44.03510665893555, "learning_rate": 2.326018570004629e-08, "logits/chosen": -0.36777248978614807, "logits/rejected": -0.4938937723636627, "logps/chosen": -175.78761291503906, "logps/rejected": -244.18557739257812, "loss": 1.2377, "nll_loss": 0.9222499132156372, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.203314781188965, "rewards/margins": 4.028185844421387, "rewards/rejected": 2.175128698348999, "step": 12250 }, { "epoch": 0.680157001983329, "grad_norm": 55.41645812988281, "learning_rate": 2.3186590119023957e-08, "logits/chosen": -0.3484051823616028, "logits/rejected": -0.4971703886985779, "logps/chosen": -181.65892028808594, "logps/rejected": -213.5335235595703, "loss": 1.2372, "nll_loss": 0.9844461679458618, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.447892665863037, "rewards/margins": 3.9748847484588623, "rewards/rejected": 2.4730076789855957, "step": 12260 }, { "epoch": 0.6807117793095796, "grad_norm": 84.85285186767578, "learning_rate": 2.311307598988595e-08, "logits/chosen": -0.21466335654258728, "logits/rejected": -0.4452625811100006, "logps/chosen": -107.22621154785156, "logps/rejected": -147.81466674804688, "loss": 1.1798, "nll_loss": 0.6963762044906616, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.621092796325684, "rewards/margins": 3.5071613788604736, "rewards/rejected": 1.1139312982559204, "step": 12270 }, { "epoch": 0.6812665566358302, "grad_norm": 35.29098892211914, "learning_rate": 2.3039643535948254e-08, "logits/chosen": -0.2706051468849182, "logits/rejected": -0.4521883428096771, "logps/chosen": -117.00657653808594, "logps/rejected": -175.45608520507812, "loss": 1.2768, "nll_loss": 0.8625116348266602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.758450508117676, "rewards/margins": 3.15194034576416, "rewards/rejected": 1.6065105199813843, "step": 12280 }, { "epoch": 0.681821333962081, "grad_norm": 56.05775451660156, "learning_rate": 2.2966292980278822e-08, "logits/chosen": -0.3769080936908722, "logits/rejected": -0.4741719365119934, "logps/chosen": -185.7469940185547, "logps/rejected": -212.88265991210938, "loss": 1.2799, "nll_loss": 1.052215576171875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.125279426574707, "rewards/margins": 3.444147825241089, "rewards/rejected": 2.681131601333618, "step": 12290 }, { "epoch": 0.6823761112883316, "grad_norm": 58.406681060791016, "learning_rate": 2.289302454569682e-08, "logits/chosen": -0.22093644738197327, "logits/rejected": -0.4181094169616699, "logps/chosen": -139.93206787109375, "logps/rejected": -175.80638122558594, "loss": 1.2401, "nll_loss": 0.8091662526130676, "rewards/accuracies": 0.875, "rewards/chosen": 4.987369537353516, "rewards/margins": 3.4839394092559814, "rewards/rejected": 1.5034297704696655, "step": 12300 }, { "epoch": 0.6829308886145823, "grad_norm": 76.34058380126953, "learning_rate": 2.2819838454771883e-08, "logits/chosen": -0.21736867725849152, "logits/rejected": -0.42537808418273926, "logps/chosen": -135.9351348876953, "logps/rejected": -181.4181365966797, "loss": 1.2302, "nll_loss": 0.936695396900177, "rewards/accuracies": 0.75, "rewards/chosen": 4.897767066955566, "rewards/margins": 2.7751545906066895, "rewards/rejected": 2.1226117610931396, "step": 12310 }, { "epoch": 0.683485665940833, "grad_norm": 33.38760757446289, "learning_rate": 2.2746734929823592e-08, "logits/chosen": -0.3998182713985443, "logits/rejected": -0.49476099014282227, "logps/chosen": -178.2798309326172, "logps/rejected": -253.4151611328125, "loss": 1.2437, "nll_loss": 1.123988389968872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.30886173248291, "rewards/margins": 3.9014792442321777, "rewards/rejected": 2.4073822498321533, "step": 12320 }, { "epoch": 0.6840404432670837, "grad_norm": 60.61362075805664, "learning_rate": 2.267371419292064e-08, "logits/chosen": -0.2885650396347046, "logits/rejected": -0.447486013174057, "logps/chosen": -176.38241577148438, "logps/rejected": -235.9733428955078, "loss": 1.2304, "nll_loss": 0.8946449160575867, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.297074317932129, "rewards/margins": 4.915509223937988, "rewards/rejected": 1.3815653324127197, "step": 12330 }, { "epoch": 0.6845952205933343, "grad_norm": 41.32902908325195, "learning_rate": 2.2600776465880284e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -137.27139282226562, "logps/rejected": -175.2365264892578, "loss": 1.2429, "nll_loss": NaN, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.061639308929443, "rewards/margins": 3.0307247638702393, "rewards/rejected": 2.0309150218963623, "step": 12340 }, { "epoch": 0.685149997919585, "grad_norm": 46.262428283691406, "learning_rate": 2.252792197026761e-08, "logits/chosen": -0.2002539336681366, "logits/rejected": -0.460933119058609, "logps/chosen": -125.36739349365234, "logps/rejected": -214.4919891357422, "loss": 1.1804, "nll_loss": 0.7614500522613525, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.050937652587891, "rewards/margins": 3.953423023223877, "rewards/rejected": 1.0975148677825928, "step": 12350 }, { "epoch": 0.6857047752458357, "grad_norm": 86.92090606689453, "learning_rate": 2.2455150927394877e-08, "logits/chosen": -0.11846522241830826, "logits/rejected": -0.2859603464603424, "logps/chosen": -144.65676879882812, "logps/rejected": -190.19805908203125, "loss": 1.248, "nll_loss": 0.8645520210266113, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.232255458831787, "rewards/margins": 3.135033130645752, "rewards/rejected": 2.0972225666046143, "step": 12360 }, { "epoch": 0.6862595525720864, "grad_norm": 71.14341735839844, "learning_rate": 2.2382463558320785e-08, "logits/chosen": -0.4598962664604187, "logits/rejected": -0.5810025930404663, "logps/chosen": -209.81283569335938, "logps/rejected": -272.2491760253906, "loss": 1.2829, "nll_loss": 1.1209025382995605, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.895010471343994, "rewards/margins": 4.252721309661865, "rewards/rejected": 2.642289161682129, "step": 12370 }, { "epoch": 0.686814329898337, "grad_norm": 116.52542877197266, "learning_rate": 2.230986008384994e-08, "logits/chosen": -0.18051250278949738, "logits/rejected": -0.3351711332798004, "logps/chosen": -145.1734161376953, "logps/rejected": -210.15237426757812, "loss": 1.2633, "nll_loss": 0.8040771484375, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.451181411743164, "rewards/margins": 3.6372859477996826, "rewards/rejected": 1.8138954639434814, "step": 12380 }, { "epoch": 0.6873691072245878, "grad_norm": 65.41537475585938, "learning_rate": 2.2237340724532007e-08, "logits/chosen": -0.3516872525215149, "logits/rejected": -0.5156417489051819, "logps/chosen": -184.73089599609375, "logps/rejected": -243.6831512451172, "loss": 1.1755, "nll_loss": 0.9614565968513489, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.4388322830200195, "rewards/margins": 4.41310977935791, "rewards/rejected": 2.0257222652435303, "step": 12390 }, { "epoch": 0.6879238845508384, "grad_norm": 44.749759674072266, "learning_rate": 2.2164905700661197e-08, "logits/chosen": -0.2982178330421448, "logits/rejected": -0.4795723855495453, "logps/chosen": -167.46710205078125, "logps/rejected": -209.0430145263672, "loss": 1.2054, "nll_loss": 0.9289814233779907, "rewards/accuracies": 0.875, "rewards/chosen": 5.628087043762207, "rewards/margins": 3.88411283493042, "rewards/rejected": 1.7439743280410767, "step": 12400 }, { "epoch": 0.6884786618770891, "grad_norm": 52.118690490722656, "learning_rate": 2.209255523227554e-08, "logits/chosen": -0.35844165086746216, "logits/rejected": -0.5388228893280029, "logps/chosen": -179.90591430664062, "logps/rejected": -232.36801147460938, "loss": 1.1813, "nll_loss": 0.9430680274963379, "rewards/accuracies": 0.875, "rewards/chosen": 6.065617084503174, "rewards/margins": 4.8700337409973145, "rewards/rejected": 1.1955829858779907, "step": 12410 }, { "epoch": 0.6890334392033398, "grad_norm": 90.59578704833984, "learning_rate": 2.202028953915614e-08, "logits/chosen": -0.2993447482585907, "logits/rejected": -0.4592467248439789, "logps/chosen": -156.58291625976562, "logps/rejected": -223.3266143798828, "loss": 1.2281, "nll_loss": 0.9128421545028687, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.2060675621032715, "rewards/margins": 4.218282222747803, "rewards/rejected": 1.9877853393554688, "step": 12420 }, { "epoch": 0.6895882165295905, "grad_norm": 158.9627685546875, "learning_rate": 2.194810884082665e-08, "logits/chosen": -0.37616056203842163, "logits/rejected": -0.45298343896865845, "logps/chosen": -163.0350799560547, "logps/rejected": -223.75033569335938, "loss": 1.3006, "nll_loss": 0.9525953531265259, "rewards/accuracies": 0.875, "rewards/chosen": 5.8995537757873535, "rewards/margins": 3.986276626586914, "rewards/rejected": 1.9132773876190186, "step": 12430 }, { "epoch": 0.6901429938558411, "grad_norm": 56.218318939208984, "learning_rate": 2.1876013356552482e-08, "logits/chosen": -0.21467992663383484, "logits/rejected": -0.4005191922187805, "logps/chosen": -158.6463623046875, "logps/rejected": -223.93447875976562, "loss": 1.208, "nll_loss": 0.8587056398391724, "rewards/accuracies": 0.875, "rewards/chosen": 5.5468034744262695, "rewards/margins": 4.3985066413879395, "rewards/rejected": 1.1482973098754883, "step": 12440 }, { "epoch": 0.6906977711820917, "grad_norm": 59.432518005371094, "learning_rate": 2.1804003305340212e-08, "logits/chosen": -0.3025331199169159, "logits/rejected": -0.4626920223236084, "logps/chosen": -211.9149932861328, "logps/rejected": -250.5555877685547, "loss": 1.3116, "nll_loss": 1.023355484008789, "rewards/accuracies": 0.875, "rewards/chosen": 6.563776969909668, "rewards/margins": 4.3968400955200195, "rewards/rejected": 2.166937828063965, "step": 12450 }, { "epoch": 0.6912525485083425, "grad_norm": 84.86593627929688, "learning_rate": 2.1732078905936923e-08, "logits/chosen": -0.2796228229999542, "logits/rejected": -0.38698163628578186, "logps/chosen": -167.61642456054688, "logps/rejected": -239.12905883789062, "loss": 1.2759, "nll_loss": 1.0216405391693115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.806312561035156, "rewards/margins": 2.9036295413970947, "rewards/rejected": 2.9026830196380615, "step": 12460 }, { "epoch": 0.6918073258345931, "grad_norm": 45.21405029296875, "learning_rate": 2.1660240376829437e-08, "logits/chosen": -0.2210051268339157, "logits/rejected": -0.40046876668930054, "logps/chosen": -171.15985107421875, "logps/rejected": -228.51998901367188, "loss": 1.2174, "nll_loss": 0.8844130635261536, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.903317928314209, "rewards/margins": 4.436379909515381, "rewards/rejected": 1.4669368267059326, "step": 12470 }, { "epoch": 0.6923621031608438, "grad_norm": 75.68427276611328, "learning_rate": 2.1588487936243805e-08, "logits/chosen": -0.29069143533706665, "logits/rejected": -0.4780551493167877, "logps/chosen": -165.8023681640625, "logps/rejected": -217.4993438720703, "loss": 1.1407, "nll_loss": 0.9273978471755981, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.701485633850098, "rewards/margins": 3.7327961921691895, "rewards/rejected": 1.9686896800994873, "step": 12480 }, { "epoch": 0.6929168804870945, "grad_norm": 131.2355194091797, "learning_rate": 2.151682180214447e-08, "logits/chosen": -0.2729392647743225, "logits/rejected": -0.41850152611732483, "logps/chosen": -146.65773010253906, "logps/rejected": -186.9139404296875, "loss": 1.1893, "nll_loss": 0.9260753393173218, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.724074363708496, "rewards/margins": 3.4622206687927246, "rewards/rejected": 2.2618539333343506, "step": 12490 }, { "epoch": 0.6934716578133452, "grad_norm": 199.65155029296875, "learning_rate": 2.1445242192233832e-08, "logits/chosen": -0.31922265887260437, "logits/rejected": -0.4389330744743347, "logps/chosen": -195.14793395996094, "logps/rejected": -262.5490417480469, "loss": 1.2636, "nll_loss": 1.0710331201553345, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.984295845031738, "rewards/margins": 3.9620003700256348, "rewards/rejected": 2.0222959518432617, "step": 12500 }, { "epoch": 0.6934716578133452, "eval_logits/chosen": -0.4087273180484772, "eval_logits/rejected": -0.5315932631492615, "eval_logps/chosen": -190.18540954589844, "eval_logps/rejected": -263.9999694824219, "eval_loss": 1.2172236442565918, "eval_nll_loss": 0.986419141292572, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.777579307556152, "eval_rewards/margins": 5.269726753234863, "eval_rewards/rejected": 1.5078527927398682, "eval_runtime": 16.8595, "eval_samples_per_second": 15.184, "eval_steps_per_second": 1.898, "step": 12500 }, { "epoch": 0.6940264351395958, "grad_norm": 28.3609619140625, "learning_rate": 2.137374932395133e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -116.59901428222656, "logps/rejected": -184.24917602539062, "loss": 1.2296, "nll_loss": NaN, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.856743335723877, "rewards/margins": 4.400498390197754, "rewards/rejected": 0.4562453627586365, "step": 12510 }, { "epoch": 0.6945812124658465, "grad_norm": 66.7529525756836, "learning_rate": 2.130234341447298e-08, "logits/chosen": -0.26258862018585205, "logits/rejected": -0.44541144371032715, "logps/chosen": -151.9936981201172, "logps/rejected": -226.0979766845703, "loss": 1.3037, "nll_loss": 0.9197772741317749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.724644660949707, "rewards/margins": 3.637474775314331, "rewards/rejected": 2.087170124053955, "step": 12520 }, { "epoch": 0.6951359897920972, "grad_norm": 81.84940338134766, "learning_rate": 2.123102468071058e-08, "logits/chosen": -0.3097625970840454, "logits/rejected": -0.5346147418022156, "logps/chosen": -158.1197967529297, "logps/rejected": -219.06069946289062, "loss": 1.2766, "nll_loss": 0.959620475769043, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.531791687011719, "rewards/margins": 4.247246742248535, "rewards/rejected": 1.284545660018921, "step": 12530 }, { "epoch": 0.6956907671183479, "grad_norm": 87.16346740722656, "learning_rate": 2.115979333931117e-08, "logits/chosen": -0.34987300634384155, "logits/rejected": -0.5129455924034119, "logps/chosen": -187.62423706054688, "logps/rejected": -233.568115234375, "loss": 1.4026, "nll_loss": 0.9986729621887207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.0716729164123535, "rewards/margins": 3.5790810585021973, "rewards/rejected": 2.4925918579101562, "step": 12540 }, { "epoch": 0.6962455444445985, "grad_norm": 66.4018325805664, "learning_rate": 2.108864960665631e-08, "logits/chosen": -0.3691956698894501, "logits/rejected": -0.48615536093711853, "logps/chosen": -150.43435668945312, "logps/rejected": -215.7544708251953, "loss": 1.2317, "nll_loss": 0.9678544998168945, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.999693870544434, "rewards/margins": 2.967386484146118, "rewards/rejected": 3.0323076248168945, "step": 12550 }, { "epoch": 0.6968003217708493, "grad_norm": 65.25221252441406, "learning_rate": 2.101759369886137e-08, "logits/chosen": -0.3581236004829407, "logits/rejected": -0.4774579107761383, "logps/chosen": -196.92544555664062, "logps/rejected": -245.4114990234375, "loss": 1.3096, "nll_loss": 1.1450408697128296, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.050604343414307, "rewards/margins": 3.169355869293213, "rewards/rejected": 2.881248950958252, "step": 12560 }, { "epoch": 0.6973550990970999, "grad_norm": 99.85542297363281, "learning_rate": 2.094662583177501e-08, "logits/chosen": -0.4287230372428894, "logits/rejected": -0.5906526446342468, "logps/chosen": -203.54275512695312, "logps/rejected": -253.23587036132812, "loss": 1.3315, "nll_loss": 1.0714521408081055, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.651005744934082, "rewards/margins": 3.783782482147217, "rewards/rejected": 2.867222309112549, "step": 12570 }, { "epoch": 0.6979098764233506, "grad_norm": 88.76417541503906, "learning_rate": 2.0875746220978375e-08, "logits/chosen": -0.25520652532577515, "logits/rejected": -0.42219337821006775, "logps/chosen": -159.36973571777344, "logps/rejected": -202.49530029296875, "loss": 1.3895, "nll_loss": 0.9047862887382507, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.3983473777771, "rewards/margins": 3.299961566925049, "rewards/rejected": 2.0983855724334717, "step": 12580 }, { "epoch": 0.6984646537496012, "grad_norm": 121.02467346191406, "learning_rate": 2.0804955081784557e-08, "logits/chosen": -0.3737090528011322, "logits/rejected": -0.4600960314273834, "logps/chosen": -152.117919921875, "logps/rejected": -201.46641540527344, "loss": 1.3058, "nll_loss": 0.9616680145263672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.8062543869018555, "rewards/margins": 3.2586758136749268, "rewards/rejected": 2.547579050064087, "step": 12590 }, { "epoch": 0.699019431075852, "grad_norm": 64.31996154785156, "learning_rate": 2.0734252629237893e-08, "logits/chosen": -0.16399219632148743, "logits/rejected": -0.4218166470527649, "logps/chosen": -125.17207336425781, "logps/rejected": -188.16537475585938, "loss": 1.1864, "nll_loss": 0.6983264684677124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.9564127922058105, "rewards/margins": 3.527151107788086, "rewards/rejected": 1.4292614459991455, "step": 12600 }, { "epoch": 0.6995742084021026, "grad_norm": 122.69866180419922, "learning_rate": 2.0663639078113305e-08, "logits/chosen": -0.19395720958709717, "logits/rejected": -0.3099953532218933, "logps/chosen": -157.4480743408203, "logps/rejected": -206.3944091796875, "loss": 1.2991, "nll_loss": 0.8979552388191223, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.614788055419922, "rewards/margins": 3.3870463371276855, "rewards/rejected": 2.2277417182922363, "step": 12610 }, { "epoch": 0.7001289857283532, "grad_norm": 38.911495208740234, "learning_rate": 2.0593114642915637e-08, "logits/chosen": -0.2975391447544098, "logits/rejected": -0.46102046966552734, "logps/chosen": -164.1873779296875, "logps/rejected": -248.583740234375, "loss": 1.2425, "nll_loss": 0.9558102488517761, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.593968868255615, "rewards/margins": 4.030261993408203, "rewards/rejected": 1.563706398010254, "step": 12620 }, { "epoch": 0.700683763054604, "grad_norm": 49.26844787597656, "learning_rate": 2.052267953787907e-08, "logits/chosen": -0.35543131828308105, "logits/rejected": -0.5265085101127625, "logps/chosen": -166.32180786132812, "logps/rejected": -224.92971801757812, "loss": 1.3341, "nll_loss": 0.964970588684082, "rewards/accuracies": 0.875, "rewards/chosen": 5.636528491973877, "rewards/margins": 3.437178134918213, "rewards/rejected": 2.199349880218506, "step": 12630 }, { "epoch": 0.7012385403808546, "grad_norm": 120.4012680053711, "learning_rate": 2.0452333976966353e-08, "logits/chosen": -0.1976948380470276, "logits/rejected": -0.40840989351272583, "logps/chosen": -144.37417602539062, "logps/rejected": -214.06057739257812, "loss": 1.1906, "nll_loss": 0.7425335645675659, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.459261894226074, "rewards/margins": 4.588191032409668, "rewards/rejected": 0.8710712194442749, "step": 12640 }, { "epoch": 0.7017933177071053, "grad_norm": 46.45096206665039, "learning_rate": 2.0382078173868294e-08, "logits/chosen": -0.3347860276699066, "logits/rejected": -0.4544796049594879, "logps/chosen": -184.65481567382812, "logps/rejected": -199.2404327392578, "loss": 1.305, "nll_loss": 0.9165772199630737, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.64373254776001, "rewards/margins": 3.0978517532348633, "rewards/rejected": 2.5458810329437256, "step": 12650 }, { "epoch": 0.7023480950333559, "grad_norm": 56.71844482421875, "learning_rate": 2.031191234200303e-08, "logits/chosen": -0.349911630153656, "logits/rejected": -0.5095638036727905, "logps/chosen": -175.8561553955078, "logps/rejected": -242.212646484375, "loss": 1.248, "nll_loss": 0.9571784734725952, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.067541122436523, "rewards/margins": 3.7752602100372314, "rewards/rejected": 2.292280673980713, "step": 12660 }, { "epoch": 0.7029028723596067, "grad_norm": 61.5022087097168, "learning_rate": 2.0241836694515335e-08, "logits/chosen": -0.28449350595474243, "logits/rejected": -0.4594908654689789, "logps/chosen": -179.852783203125, "logps/rejected": -265.66815185546875, "loss": 1.2322, "nll_loss": 0.9275538325309753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.3422346115112305, "rewards/margins": 4.554647922515869, "rewards/rejected": 1.7875865697860718, "step": 12670 }, { "epoch": 0.7034576496858573, "grad_norm": 123.11775970458984, "learning_rate": 2.01718514442761e-08, "logits/chosen": -0.22520117461681366, "logits/rejected": -0.4005351960659027, "logps/chosen": -170.02728271484375, "logps/rejected": -232.5746612548828, "loss": 1.3099, "nll_loss": 0.8446242213249207, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.153702735900879, "rewards/margins": 4.833800315856934, "rewards/rejected": 1.319903016090393, "step": 12680 }, { "epoch": 0.704012427012108, "grad_norm": 70.78916931152344, "learning_rate": 2.0101956803881555e-08, "logits/chosen": -0.17443840205669403, "logits/rejected": -0.3497583866119385, "logps/chosen": -157.9462890625, "logps/rejected": -216.73391723632812, "loss": 1.252, "nll_loss": 0.8217185139656067, "rewards/accuracies": 0.875, "rewards/chosen": 5.5470476150512695, "rewards/margins": 3.8540797233581543, "rewards/rejected": 1.692967176437378, "step": 12690 }, { "epoch": 0.7045672043383587, "grad_norm": 46.204471588134766, "learning_rate": 2.0032152985652707e-08, "logits/chosen": -0.2721540629863739, "logits/rejected": -0.4170974791049957, "logps/chosen": -184.45590209960938, "logps/rejected": -244.99560546875, "loss": 1.2509, "nll_loss": 0.9397276043891907, "rewards/accuracies": 0.75, "rewards/chosen": 6.371033668518066, "rewards/margins": 4.111761569976807, "rewards/rejected": 2.2592720985412598, "step": 12700 }, { "epoch": 0.7051219816646094, "grad_norm": 34.63050842285156, "learning_rate": 1.9962440201634696e-08, "logits/chosen": -0.30988579988479614, "logits/rejected": -0.45380598306655884, "logps/chosen": -152.26637268066406, "logps/rejected": -203.39842224121094, "loss": 1.2288, "nll_loss": 0.8955994844436646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.418598651885986, "rewards/margins": 3.852252960205078, "rewards/rejected": 1.566345453262329, "step": 12710 }, { "epoch": 0.70567675899086, "grad_norm": 108.41631317138672, "learning_rate": 1.989281866359606e-08, "logits/chosen": -0.2549108862876892, "logits/rejected": -0.3928259015083313, "logps/chosen": -146.1342010498047, "logps/rejected": -203.2252197265625, "loss": 1.2525, "nll_loss": 0.8680016398429871, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.755482196807861, "rewards/margins": 3.5762786865234375, "rewards/rejected": 2.179203510284424, "step": 12720 }, { "epoch": 0.7062315363171108, "grad_norm": 30.510683059692383, "learning_rate": 1.982328858302823e-08, "logits/chosen": -0.24541839957237244, "logits/rejected": -0.45684748888015747, "logps/chosen": -145.65972900390625, "logps/rejected": -205.34811401367188, "loss": 1.2206, "nll_loss": 0.8304737210273743, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.457831382751465, "rewards/margins": 3.5541675090789795, "rewards/rejected": 1.903663992881775, "step": 12730 }, { "epoch": 0.7067863136433614, "grad_norm": 41.52299118041992, "learning_rate": 1.9753850171144725e-08, "logits/chosen": -0.1176132932305336, "logits/rejected": -0.28097209334373474, "logps/chosen": -115.817138671875, "logps/rejected": -158.190185546875, "loss": 1.2493, "nll_loss": 0.7526054382324219, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.40674352645874, "rewards/margins": 2.9769506454467773, "rewards/rejected": 1.429793119430542, "step": 12740 }, { "epoch": 0.7073410909696121, "grad_norm": 52.08366012573242, "learning_rate": 1.968450363888073e-08, "logits/chosen": -0.36436715722084045, "logits/rejected": -0.42022258043289185, "logps/chosen": -163.3020782470703, "logps/rejected": -211.4189453125, "loss": 1.2257, "nll_loss": 0.983964741230011, "rewards/accuracies": 0.75, "rewards/chosen": 6.1039533615112305, "rewards/margins": 3.2575669288635254, "rewards/rejected": 2.846386432647705, "step": 12750 }, { "epoch": 0.7078958682958627, "grad_norm": 33.801876068115234, "learning_rate": 1.961524919689218e-08, "logits/chosen": -0.135419100522995, "logits/rejected": -0.32483750581741333, "logps/chosen": -149.5926971435547, "logps/rejected": -221.0042266845703, "loss": 1.2294, "nll_loss": 0.790702223777771, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.895544528961182, "rewards/margins": 4.437676429748535, "rewards/rejected": 1.4578684568405151, "step": 12760 }, { "epoch": 0.7084506456221135, "grad_norm": 70.3134765625, "learning_rate": 1.9546087055555375e-08, "logits/chosen": -0.40542134642601013, "logits/rejected": -0.5466644167900085, "logps/chosen": -181.45196533203125, "logps/rejected": -260.3875732421875, "loss": 1.3914, "nll_loss": 0.9717741012573242, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.495024681091309, "rewards/margins": 4.145397186279297, "rewards/rejected": 2.3496272563934326, "step": 12770 }, { "epoch": 0.7090054229483641, "grad_norm": 76.55762481689453, "learning_rate": 1.9477017424966152e-08, "logits/chosen": -0.2543894648551941, "logits/rejected": -0.42837873101234436, "logps/chosen": -144.65731811523438, "logps/rejected": -203.85897827148438, "loss": 1.2742, "nll_loss": 0.8862441182136536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.567269325256348, "rewards/margins": 3.940202236175537, "rewards/rejected": 1.6270668506622314, "step": 12780 }, { "epoch": 0.7095602002746147, "grad_norm": 75.84803771972656, "learning_rate": 1.9408040514939377e-08, "logits/chosen": -0.3257814049720764, "logits/rejected": -0.44889751076698303, "logps/chosen": -152.11138916015625, "logps/rejected": -206.82565307617188, "loss": 1.2851, "nll_loss": 0.8820212483406067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.982528209686279, "rewards/margins": 4.001893043518066, "rewards/rejected": 1.9806352853775024, "step": 12790 }, { "epoch": 0.7101149776008655, "grad_norm": 40.27317810058594, "learning_rate": 1.933915653500826e-08, "logits/chosen": -0.2914651930332184, "logits/rejected": -0.46833962202072144, "logps/chosen": -149.4254913330078, "logps/rejected": -205.02490234375, "loss": 1.214, "nll_loss": 0.8474915623664856, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.0263285636901855, "rewards/margins": 3.7715892791748047, "rewards/rejected": 2.254739761352539, "step": 12800 }, { "epoch": 0.7106697549271161, "grad_norm": 74.58905792236328, "learning_rate": 1.927036569442365e-08, "logits/chosen": -0.27327996492385864, "logits/rejected": -0.4386266767978668, "logps/chosen": -203.98887634277344, "logps/rejected": -243.96932983398438, "loss": 1.2942, "nll_loss": 0.9820057153701782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.953102111816406, "rewards/margins": 3.2255806922912598, "rewards/rejected": 2.7275218963623047, "step": 12810 }, { "epoch": 0.7112245322533668, "grad_norm": 42.248138427734375, "learning_rate": 1.9201668202153554e-08, "logits/chosen": -0.2995825409889221, "logits/rejected": -0.47395235300064087, "logps/chosen": -150.34645080566406, "logps/rejected": -229.35995483398438, "loss": 1.2174, "nll_loss": 0.9468528032302856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.46729850769043, "rewards/margins": 3.862170457839966, "rewards/rejected": 1.6051280498504639, "step": 12820 }, { "epoch": 0.7117793095796174, "grad_norm": 57.27659606933594, "learning_rate": 1.9133064266882328e-08, "logits/chosen": -0.23794226348400116, "logits/rejected": -0.39174434542655945, "logps/chosen": -119.78440856933594, "logps/rejected": -176.40623474121094, "loss": 1.2189, "nll_loss": 0.7929913401603699, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.895203590393066, "rewards/margins": 2.8227221965789795, "rewards/rejected": 2.0724809169769287, "step": 12830 }, { "epoch": 0.7123340869058682, "grad_norm": 23.0706844329834, "learning_rate": 1.9064554097010176e-08, "logits/chosen": -0.37945157289505005, "logits/rejected": -0.5358896255493164, "logps/chosen": -153.80322265625, "logps/rejected": -219.34814453125, "loss": 1.2837, "nll_loss": 0.977512001991272, "rewards/accuracies": 0.875, "rewards/chosen": 6.117849349975586, "rewards/margins": 3.8214268684387207, "rewards/rejected": 2.296422004699707, "step": 12840 }, { "epoch": 0.7128888642321188, "grad_norm": 102.89293670654297, "learning_rate": 1.8996137900652466e-08, "logits/chosen": -0.27641671895980835, "logits/rejected": -0.40077847242355347, "logps/chosen": -167.06674194335938, "logps/rejected": -219.5724639892578, "loss": 1.2671, "nll_loss": 0.9891948699951172, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.899290561676025, "rewards/margins": 3.168273687362671, "rewards/rejected": 2.7310166358947754, "step": 12850 }, { "epoch": 0.7134436415583695, "grad_norm": 57.34820556640625, "learning_rate": 1.8927815885639097e-08, "logits/chosen": -0.32612407207489014, "logits/rejected": -0.4492993950843811, "logps/chosen": -146.11221313476562, "logps/rejected": -177.49276733398438, "loss": 1.2811, "nll_loss": 0.9233980178833008, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.71115779876709, "rewards/margins": 3.345907688140869, "rewards/rejected": 2.3652498722076416, "step": 12860 }, { "epoch": 0.7139984188846202, "grad_norm": 54.08071517944336, "learning_rate": 1.8859588259513864e-08, "logits/chosen": -0.3142291009426117, "logits/rejected": -0.49519652128219604, "logps/chosen": -166.45726013183594, "logps/rejected": -248.1087188720703, "loss": 1.2014, "nll_loss": 0.91033536195755, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.033982276916504, "rewards/margins": 4.530450820922852, "rewards/rejected": 1.5035309791564941, "step": 12870 }, { "epoch": 0.7145531962108709, "grad_norm": 56.40658950805664, "learning_rate": 1.8791455229533804e-08, "logits/chosen": -0.322214275598526, "logits/rejected": -0.44496506452560425, "logps/chosen": -172.56436157226562, "logps/rejected": -215.24002075195312, "loss": 1.1968, "nll_loss": 0.9485975503921509, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.363347053527832, "rewards/margins": 3.617643356323242, "rewards/rejected": 2.745704174041748, "step": 12880 }, { "epoch": 0.7151079735371215, "grad_norm": 43.77219772338867, "learning_rate": 1.8723417002668652e-08, "logits/chosen": -0.26582399010658264, "logits/rejected": -0.43083304166793823, "logps/chosen": -143.25259399414062, "logps/rejected": -204.05868530273438, "loss": 1.29, "nll_loss": 0.8451235890388489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.363948822021484, "rewards/margins": 3.3561675548553467, "rewards/rejected": 2.0077812671661377, "step": 12890 }, { "epoch": 0.7156627508633722, "grad_norm": 53.658660888671875, "learning_rate": 1.8655473785600122e-08, "logits/chosen": -0.2778196632862091, "logits/rejected": -0.3918890953063965, "logps/chosen": -161.07229614257812, "logps/rejected": -195.16146850585938, "loss": 1.2338, "nll_loss": 0.9494892358779907, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.543831825256348, "rewards/margins": 3.4626071453094482, "rewards/rejected": 2.0812244415283203, "step": 12900 }, { "epoch": 0.7162175281896229, "grad_norm": 40.95535659790039, "learning_rate": 1.8587625784721356e-08, "logits/chosen": -0.2518269717693329, "logits/rejected": -0.42011457681655884, "logps/chosen": -142.2730712890625, "logps/rejected": -199.7687530517578, "loss": 1.2362, "nll_loss": 0.8660953640937805, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.482463836669922, "rewards/margins": 4.113658428192139, "rewards/rejected": 1.3688055276870728, "step": 12910 }, { "epoch": 0.7167723055158736, "grad_norm": 42.07647705078125, "learning_rate": 1.8519873206136177e-08, "logits/chosen": -0.33359354734420776, "logits/rejected": -0.458385705947876, "logps/chosen": -149.71578979492188, "logps/rejected": -203.4271240234375, "loss": 1.3138, "nll_loss": 1.0241000652313232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.560154914855957, "rewards/margins": 2.454641580581665, "rewards/rejected": 3.105512857437134, "step": 12920 }, { "epoch": 0.7173270828421242, "grad_norm": 39.50285720825195, "learning_rate": 1.8452216255658626e-08, "logits/chosen": -0.4769509434700012, "logits/rejected": -0.5878731608390808, "logps/chosen": -214.20791625976562, "logps/rejected": -306.9803771972656, "loss": 1.1628, "nll_loss": 1.1445338726043701, "rewards/accuracies": 0.875, "rewards/chosen": 6.931177616119385, "rewards/margins": 4.8411078453063965, "rewards/rejected": 2.090069055557251, "step": 12930 }, { "epoch": 0.717881860168375, "grad_norm": 41.24628448486328, "learning_rate": 1.8384655138812178e-08, "logits/chosen": -0.349549800157547, "logits/rejected": -0.5109783411026001, "logps/chosen": -139.32525634765625, "logps/rejected": -206.759033203125, "loss": 1.1471, "nll_loss": 0.9063628911972046, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.7531890869140625, "rewards/margins": 3.5763237476348877, "rewards/rejected": 2.176865339279175, "step": 12940 }, { "epoch": 0.7184366374946256, "grad_norm": 66.80060577392578, "learning_rate": 1.831719006082924e-08, "logits/chosen": -0.31192249059677124, "logits/rejected": -0.5079909563064575, "logps/chosen": -163.4386444091797, "logps/rejected": -230.68496704101562, "loss": 1.1593, "nll_loss": 0.9192889928817749, "rewards/accuracies": 0.875, "rewards/chosen": 6.040367126464844, "rewards/margins": 4.6038713455200195, "rewards/rejected": 1.4364957809448242, "step": 12950 }, { "epoch": 0.7189914148208763, "grad_norm": 45.7085075378418, "learning_rate": 1.8249821226650486e-08, "logits/chosen": -0.38364288210868835, "logits/rejected": -0.49500417709350586, "logps/chosen": -182.65310668945312, "logps/rejected": -227.1117706298828, "loss": 1.1732, "nll_loss": 0.9986478090286255, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.40786600112915, "rewards/margins": 3.7794127464294434, "rewards/rejected": 2.628453254699707, "step": 12960 }, { "epoch": 0.7195461921471269, "grad_norm": 58.808650970458984, "learning_rate": 1.8182548840924172e-08, "logits/chosen": -0.2475881278514862, "logits/rejected": -0.3897285759449005, "logps/chosen": -145.7071990966797, "logps/rejected": -188.16705322265625, "loss": 1.1754, "nll_loss": 0.9295892715454102, "rewards/accuracies": 0.875, "rewards/chosen": 5.440359115600586, "rewards/margins": 3.4720706939697266, "rewards/rejected": 1.9682880640029907, "step": 12970 }, { "epoch": 0.7201009694733777, "grad_norm": 39.131309509277344, "learning_rate": 1.8115373108005638e-08, "logits/chosen": -0.37623220682144165, "logits/rejected": -0.5077120661735535, "logps/chosen": -178.80911254882812, "logps/rejected": -245.22372436523438, "loss": 1.2162, "nll_loss": 0.9660285711288452, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.500500679016113, "rewards/margins": 4.713377952575684, "rewards/rejected": 1.787122130393982, "step": 12980 }, { "epoch": 0.7206557467996283, "grad_norm": 42.33364486694336, "learning_rate": 1.804829423195653e-08, "logits/chosen": -0.26987963914871216, "logits/rejected": -0.41641178727149963, "logps/chosen": -166.83279418945312, "logps/rejected": -231.6823272705078, "loss": 1.2484, "nll_loss": 0.9490247964859009, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.052439212799072, "rewards/margins": 4.058259010314941, "rewards/rejected": 1.994180679321289, "step": 12990 }, { "epoch": 0.7212105241258789, "grad_norm": 45.6847038269043, "learning_rate": 1.798131241654439e-08, "logits/chosen": -0.4033544063568115, "logits/rejected": -0.5462941527366638, "logps/chosen": -166.350341796875, "logps/rejected": -248.04190063476562, "loss": 1.343, "nll_loss": 1.0180920362472534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.182610988616943, "rewards/margins": 4.22403621673584, "rewards/rejected": 1.9585742950439453, "step": 13000 }, { "epoch": 0.7212105241258789, "eval_logits/chosen": -0.4150004982948303, "eval_logits/rejected": -0.5308792591094971, "eval_logps/chosen": -190.27316284179688, "eval_logps/rejected": -260.8743591308594, "eval_loss": 1.2157081365585327, "eval_nll_loss": 0.987297773361206, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.768805503845215, "eval_rewards/margins": 4.948392868041992, "eval_rewards/rejected": 1.8204128742218018, "eval_runtime": 17.1826, "eval_samples_per_second": 14.899, "eval_steps_per_second": 1.862, "step": 13000 }, { "epoch": 0.7217653014521297, "grad_norm": 35.98946762084961, "learning_rate": 1.791442786524181e-08, "logits/chosen": -0.19918230175971985, "logits/rejected": -0.3975090980529785, "logps/chosen": -166.979248046875, "logps/rejected": -215.09579467773438, "loss": 1.0687, "nll_loss": 0.8822317123413086, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.629497528076172, "rewards/margins": 4.045687675476074, "rewards/rejected": 1.58380925655365, "step": 13010 }, { "epoch": 0.7223200787783803, "grad_norm": 94.03396606445312, "learning_rate": 1.784764078122598e-08, "logits/chosen": -0.2538452744483948, "logits/rejected": -0.42120417952537537, "logps/chosen": -149.91822814941406, "logps/rejected": -221.2766876220703, "loss": 1.3123, "nll_loss": 0.8378020524978638, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.508445739746094, "rewards/margins": 3.8266403675079346, "rewards/rejected": 1.6818052530288696, "step": 13020 }, { "epoch": 0.722874856104631, "grad_norm": 62.408546447753906, "learning_rate": 1.7780951367377972e-08, "logits/chosen": -0.33915066719055176, "logits/rejected": -0.4873233437538147, "logps/chosen": -145.5653839111328, "logps/rejected": -212.60238647460938, "loss": 1.2537, "nll_loss": 0.9451497793197632, "rewards/accuracies": 0.875, "rewards/chosen": 5.885664939880371, "rewards/margins": 3.323911190032959, "rewards/rejected": 2.561753749847412, "step": 13030 }, { "epoch": 0.7234296334308816, "grad_norm": 90.34113311767578, "learning_rate": 1.771435982628219e-08, "logits/chosen": -0.25658300518989563, "logits/rejected": -0.391094833612442, "logps/chosen": -154.4763641357422, "logps/rejected": -213.06753540039062, "loss": 1.2512, "nll_loss": 0.9946179389953613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.304181098937988, "rewards/margins": 3.5390563011169434, "rewards/rejected": 1.7651245594024658, "step": 13040 }, { "epoch": 0.7239844107571324, "grad_norm": 76.4636001586914, "learning_rate": 1.7647866360225726e-08, "logits/chosen": -0.31367915868759155, "logits/rejected": -0.4467683732509613, "logps/chosen": -163.2528076171875, "logps/rejected": -188.37747192382812, "loss": 1.1973, "nll_loss": 0.9559744000434875, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.3748369216918945, "rewards/margins": 3.4286091327667236, "rewards/rejected": 1.9462273120880127, "step": 13050 }, { "epoch": 0.724539188083383, "grad_norm": 106.77983856201172, "learning_rate": 1.7581471171197722e-08, "logits/chosen": -0.16705089807510376, "logits/rejected": -0.4071389138698578, "logps/chosen": -132.18258666992188, "logps/rejected": -180.0357208251953, "loss": 1.2273, "nll_loss": 0.7544218897819519, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.956098556518555, "rewards/margins": 3.1437020301818848, "rewards/rejected": 1.8123964071273804, "step": 13060 }, { "epoch": 0.7250939654096337, "grad_norm": 69.83487701416016, "learning_rate": 1.7515174460888816e-08, "logits/chosen": -0.29171133041381836, "logits/rejected": -0.4464386999607086, "logps/chosen": -165.49127197265625, "logps/rejected": -199.56341552734375, "loss": 1.3437, "nll_loss": 0.8982936143875122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.108651161193848, "rewards/margins": 2.5784244537353516, "rewards/rejected": 2.530226230621338, "step": 13070 }, { "epoch": 0.7256487427358844, "grad_norm": 53.608699798583984, "learning_rate": 1.7448976430690438e-08, "logits/chosen": -0.2807365357875824, "logits/rejected": -0.4650154709815979, "logps/chosen": -143.1229705810547, "logps/rejected": -221.6487274169922, "loss": 1.1904, "nll_loss": 0.898482620716095, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.50841760635376, "rewards/margins": 4.433836460113525, "rewards/rejected": 1.0745811462402344, "step": 13080 }, { "epoch": 0.7262035200621351, "grad_norm": 42.33846664428711, "learning_rate": 1.7382877281694354e-08, "logits/chosen": -0.42869797348976135, "logits/rejected": -0.5276592373847961, "logps/chosen": -212.95223999023438, "logps/rejected": -263.3831481933594, "loss": 1.3195, "nll_loss": 1.1187633275985718, "rewards/accuracies": 0.875, "rewards/chosen": 6.4709601402282715, "rewards/margins": 4.3836140632629395, "rewards/rejected": 2.087346076965332, "step": 13090 }, { "epoch": 0.7267582973883857, "grad_norm": 44.90936279296875, "learning_rate": 1.7316877214691862e-08, "logits/chosen": -0.2935159206390381, "logits/rejected": -0.4234120845794678, "logps/chosen": -152.82589721679688, "logps/rejected": -200.16238403320312, "loss": 1.2297, "nll_loss": 0.9580494165420532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.570136070251465, "rewards/margins": 3.5980517864227295, "rewards/rejected": 1.9720847606658936, "step": 13100 }, { "epoch": 0.7273130747146365, "grad_norm": 72.38946533203125, "learning_rate": 1.7250976430173285e-08, "logits/chosen": -0.35018840432167053, "logits/rejected": -0.46431058645248413, "logps/chosen": -165.9536590576172, "logps/rejected": -224.0995330810547, "loss": 1.2554, "nll_loss": 1.0881538391113281, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.744930267333984, "rewards/margins": 3.581627607345581, "rewards/rejected": 2.1633026599884033, "step": 13110 }, { "epoch": 0.7278678520408871, "grad_norm": 39.84886169433594, "learning_rate": 1.7185175128327418e-08, "logits/chosen": -0.20200283825397491, "logits/rejected": -0.35957229137420654, "logps/chosen": -162.9403076171875, "logps/rejected": -203.55410766601562, "loss": 1.2343, "nll_loss": 0.8529999852180481, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.79220724105835, "rewards/margins": 3.306241989135742, "rewards/rejected": 2.4859659671783447, "step": 13120 }, { "epoch": 0.7284226293671378, "grad_norm": 35.085819244384766, "learning_rate": 1.7119473509040756e-08, "logits/chosen": -0.21651801466941833, "logits/rejected": -0.364332914352417, "logps/chosen": -163.5776824951172, "logps/rejected": -206.1197509765625, "loss": 1.1998, "nll_loss": 0.9077421426773071, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.570407867431641, "rewards/margins": 2.973717212677002, "rewards/rejected": 2.5966904163360596, "step": 13130 }, { "epoch": 0.7289774066933884, "grad_norm": 84.82121276855469, "learning_rate": 1.7053871771897115e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -160.59307861328125, "logps/rejected": -225.750244140625, "loss": 1.3315, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 6.173449516296387, "rewards/margins": 4.374788284301758, "rewards/rejected": 1.7986600399017334, "step": 13140 }, { "epoch": 0.7295321840196392, "grad_norm": 61.24282455444336, "learning_rate": 1.6988370116176764e-08, "logits/chosen": -0.34403157234191895, "logits/rejected": -0.5174924731254578, "logps/chosen": -186.6648712158203, "logps/rejected": -223.6381072998047, "loss": 1.3006, "nll_loss": 0.9689914584159851, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.245591163635254, "rewards/margins": 4.800109386444092, "rewards/rejected": 1.4454818964004517, "step": 13150 }, { "epoch": 0.7300869613458898, "grad_norm": 79.50798797607422, "learning_rate": 1.692296874085605e-08, "logits/chosen": -0.34989652037620544, "logits/rejected": -0.4624873101711273, "logps/chosen": -162.43450927734375, "logps/rejected": -224.67764282226562, "loss": 1.2184, "nll_loss": 1.0137053728103638, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.597512722015381, "rewards/margins": 3.3539185523986816, "rewards/rejected": 2.2435946464538574, "step": 13160 }, { "epoch": 0.7306417386721404, "grad_norm": 59.73739242553711, "learning_rate": 1.6857667844606616e-08, "logits/chosen": -0.41657987236976624, "logits/rejected": -0.566390335559845, "logps/chosen": -184.5636749267578, "logps/rejected": -233.48483276367188, "loss": 1.2721, "nll_loss": 1.0097143650054932, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.789811134338379, "rewards/margins": 3.3764655590057373, "rewards/rejected": 2.4133458137512207, "step": 13170 }, { "epoch": 0.7311965159983912, "grad_norm": 61.35866165161133, "learning_rate": 1.6792467625794942e-08, "logits/chosen": -0.2814289629459381, "logits/rejected": -0.4504537582397461, "logps/chosen": -174.3564910888672, "logps/rejected": -246.03244018554688, "loss": 1.2339, "nll_loss": 0.8952564001083374, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.157135009765625, "rewards/margins": 4.755683422088623, "rewards/rejected": 1.4014512300491333, "step": 13180 }, { "epoch": 0.7317512933246418, "grad_norm": 49.41172409057617, "learning_rate": 1.6727368282481656e-08, "logits/chosen": -0.30663132667541504, "logits/rejected": -0.42407432198524475, "logps/chosen": -167.57131958007812, "logps/rejected": -204.41546630859375, "loss": 1.193, "nll_loss": 0.895298957824707, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.07155704498291, "rewards/margins": 3.683244228363037, "rewards/rejected": 2.388312816619873, "step": 13190 }, { "epoch": 0.7323060706508925, "grad_norm": 40.43097686767578, "learning_rate": 1.666237001242093e-08, "logits/chosen": -0.4561356008052826, "logits/rejected": -0.5546432137489319, "logps/chosen": -188.98721313476562, "logps/rejected": -239.19589233398438, "loss": 1.2181, "nll_loss": 1.0610122680664062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.779797554016113, "rewards/margins": 3.9123573303222656, "rewards/rejected": 2.8674397468566895, "step": 13200 }, { "epoch": 0.7328608479771431, "grad_norm": 32.458961486816406, "learning_rate": 1.6597473013059943e-08, "logits/chosen": -0.4126416742801666, "logits/rejected": -0.5425761342048645, "logps/chosen": -212.3006591796875, "logps/rejected": -268.7002868652344, "loss": 1.2087, "nll_loss": 1.0621838569641113, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.994882106781006, "rewards/margins": 4.066960334777832, "rewards/rejected": 2.927921772003174, "step": 13210 }, { "epoch": 0.7334156253033939, "grad_norm": 79.28108978271484, "learning_rate": 1.6532677481538194e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -134.73980712890625, "logps/rejected": -203.62240600585938, "loss": 1.1323, "nll_loss": NaN, "rewards/accuracies": 0.875, "rewards/chosen": 5.300518035888672, "rewards/margins": 3.7317371368408203, "rewards/rejected": 1.5687808990478516, "step": 13220 }, { "epoch": 0.7339704026296445, "grad_norm": 120.80438995361328, "learning_rate": 1.6467983614686992e-08, "logits/chosen": -0.3328816592693329, "logits/rejected": -0.46285098791122437, "logps/chosen": -168.47970581054688, "logps/rejected": -232.7246551513672, "loss": 1.2559, "nll_loss": 0.9353010058403015, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.086447715759277, "rewards/margins": 3.551722288131714, "rewards/rejected": 2.5347256660461426, "step": 13230 }, { "epoch": 0.7345251799558952, "grad_norm": 45.92528533935547, "learning_rate": 1.6403391609028793e-08, "logits/chosen": -0.32340607047080994, "logits/rejected": -0.5008508563041687, "logps/chosen": -192.72293090820312, "logps/rejected": -246.63119506835938, "loss": 1.2068, "nll_loss": 0.9869135022163391, "rewards/accuracies": 0.875, "rewards/chosen": 6.534943580627441, "rewards/margins": 4.894669055938721, "rewards/rejected": 1.6402740478515625, "step": 13240 }, { "epoch": 0.7350799572821459, "grad_norm": 48.16605758666992, "learning_rate": 1.6338901660776662e-08, "logits/chosen": -0.2729222774505615, "logits/rejected": -0.3413732945919037, "logps/chosen": -201.21612548828125, "logps/rejected": -254.92153930664062, "loss": 1.2211, "nll_loss": 1.0235908031463623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.458531379699707, "rewards/margins": 3.4676146507263184, "rewards/rejected": 2.9909164905548096, "step": 13250 }, { "epoch": 0.7356347346083966, "grad_norm": 59.24027633666992, "learning_rate": 1.6274513965833565e-08, "logits/chosen": -0.2952631413936615, "logits/rejected": -0.4671157896518707, "logps/chosen": -158.8353729248047, "logps/rejected": -238.5457000732422, "loss": 1.1925, "nll_loss": 0.910784900188446, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.756814479827881, "rewards/margins": 4.049107551574707, "rewards/rejected": 1.707707166671753, "step": 13260 }, { "epoch": 0.7361895119346472, "grad_norm": 36.049800872802734, "learning_rate": 1.6210228719791947e-08, "logits/chosen": -0.2845991849899292, "logits/rejected": -0.45980915427207947, "logps/chosen": -157.11216735839844, "logps/rejected": -205.4685516357422, "loss": 1.2241, "nll_loss": 0.8929013013839722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.706349849700928, "rewards/margins": 3.1936049461364746, "rewards/rejected": 2.512744426727295, "step": 13270 }, { "epoch": 0.7367442892608979, "grad_norm": 135.76869201660156, "learning_rate": 1.6146046117932942e-08, "logits/chosen": -0.3357810378074646, "logits/rejected": -0.4589572548866272, "logps/chosen": -173.140869140625, "logps/rejected": -245.12350463867188, "loss": 1.2488, "nll_loss": 1.0283997058868408, "rewards/accuracies": 0.875, "rewards/chosen": 6.2359771728515625, "rewards/margins": 4.565829753875732, "rewards/rejected": 1.6701467037200928, "step": 13280 }, { "epoch": 0.7372990665871486, "grad_norm": 71.30635070800781, "learning_rate": 1.608196635522596e-08, "logits/chosen": -0.24882233142852783, "logits/rejected": -0.4175376296043396, "logps/chosen": -144.53494262695312, "logps/rejected": -198.82943725585938, "loss": 1.2672, "nll_loss": 0.9700925946235657, "rewards/accuracies": 0.75, "rewards/chosen": 5.016253471374512, "rewards/margins": 3.1141557693481445, "rewards/rejected": 1.9020977020263672, "step": 13290 }, { "epoch": 0.7378538439133993, "grad_norm": 31.621423721313477, "learning_rate": 1.601798962632799e-08, "logits/chosen": -0.35084596276283264, "logits/rejected": -0.47968751192092896, "logps/chosen": -169.7008514404297, "logps/rejected": -235.3212127685547, "loss": 1.1461, "nll_loss": 0.9612873792648315, "rewards/accuracies": 0.875, "rewards/chosen": 5.660435676574707, "rewards/margins": 3.5851454734802246, "rewards/rejected": 2.0752902030944824, "step": 13300 }, { "epoch": 0.7384086212396499, "grad_norm": 51.74677276611328, "learning_rate": 1.5954116125582996e-08, "logits/chosen": -0.2961025834083557, "logits/rejected": -0.41584667563438416, "logps/chosen": -168.86094665527344, "logps/rejected": -218.5751190185547, "loss": 1.206, "nll_loss": 1.0752700567245483, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.798641204833984, "rewards/margins": 4.416439533233643, "rewards/rejected": 1.3822017908096313, "step": 13310 }, { "epoch": 0.7389633985659007, "grad_norm": 50.81084442138672, "learning_rate": 1.589034604702142e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -135.4510955810547, "logps/rejected": -203.21034240722656, "loss": 1.1936, "nll_loss": NaN, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.300278663635254, "rewards/margins": 4.057191848754883, "rewards/rejected": 1.243086576461792, "step": 13320 }, { "epoch": 0.7395181758921513, "grad_norm": 74.6451416015625, "learning_rate": 1.5826679584359454e-08, "logits/chosen": -0.15505166351795197, "logits/rejected": -0.3529577851295471, "logps/chosen": -140.4691619873047, "logps/rejected": -204.4829559326172, "loss": 1.1831, "nll_loss": 0.7784417271614075, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.117010116577148, "rewards/margins": 3.7283904552459717, "rewards/rejected": 1.3886195421218872, "step": 13330 }, { "epoch": 0.7400729532184019, "grad_norm": 141.3043670654297, "learning_rate": 1.576311693099866e-08, "logits/chosen": -0.22378918528556824, "logits/rejected": -0.3400081694126129, "logps/chosen": -156.72413635253906, "logps/rejected": -186.8373565673828, "loss": 1.3799, "nll_loss": 0.9340991973876953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.177123546600342, "rewards/margins": 2.554708242416382, "rewards/rejected": 2.622415065765381, "step": 13340 }, { "epoch": 0.7406277305446526, "grad_norm": 70.24127197265625, "learning_rate": 1.569965828002514e-08, "logits/chosen": -0.27799028158187866, "logits/rejected": -0.4888533055782318, "logps/chosen": -137.8538818359375, "logps/rejected": -201.4366912841797, "loss": 1.2301, "nll_loss": 0.869592547416687, "rewards/accuracies": 0.875, "rewards/chosen": 5.519021034240723, "rewards/margins": 4.259803771972656, "rewards/rejected": 1.2592167854309082, "step": 13350 }, { "epoch": 0.7411825078709033, "grad_norm": 57.17934036254883, "learning_rate": 1.5636303824209098e-08, "logits/chosen": -0.1766783595085144, "logits/rejected": -0.40922850370407104, "logps/chosen": -135.87318420410156, "logps/rejected": -201.8750762939453, "loss": 1.2799, "nll_loss": 0.7687122821807861, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.150801181793213, "rewards/margins": 3.9855704307556152, "rewards/rejected": 1.1652311086654663, "step": 13360 }, { "epoch": 0.741737285197154, "grad_norm": 62.709877014160156, "learning_rate": 1.5573053756004252e-08, "logits/chosen": -0.13567259907722473, "logits/rejected": -0.27794989943504333, "logps/chosen": -152.32171630859375, "logps/rejected": -213.1107177734375, "loss": 1.2109, "nll_loss": 0.8560276031494141, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.995692253112793, "rewards/margins": 3.5224056243896484, "rewards/rejected": 1.473286747932434, "step": 13370 }, { "epoch": 0.7422920625234046, "grad_norm": 46.401878356933594, "learning_rate": 1.550990826754715e-08, "logits/chosen": -0.3182279169559479, "logits/rejected": -0.4443788528442383, "logps/chosen": -173.28939819335938, "logps/rejected": -241.54397583007812, "loss": 1.2117, "nll_loss": 0.919338047504425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.1595540046691895, "rewards/margins": 3.7008984088897705, "rewards/rejected": 2.458655595779419, "step": 13380 }, { "epoch": 0.7428468398496554, "grad_norm": 62.002140045166016, "learning_rate": 1.5446867550656767e-08, "logits/chosen": -0.32122185826301575, "logits/rejected": -0.4888898730278015, "logps/chosen": -134.274658203125, "logps/rejected": -183.2400360107422, "loss": 1.168, "nll_loss": 0.8901262283325195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.329556465148926, "rewards/margins": 3.9345555305480957, "rewards/rejected": 1.3950015306472778, "step": 13390 }, { "epoch": 0.743401617175906, "grad_norm": 31.993404388427734, "learning_rate": 1.53839317968337e-08, "logits/chosen": -0.13947448134422302, "logits/rejected": -0.36338135600090027, "logps/chosen": -126.97871398925781, "logps/rejected": -188.83485412597656, "loss": 1.2051, "nll_loss": 0.7395111918449402, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.282337188720703, "rewards/margins": 4.470915794372559, "rewards/rejected": 0.8114216923713684, "step": 13400 }, { "epoch": 0.7439563945021567, "grad_norm": 33.4692497253418, "learning_rate": 1.532110119725976e-08, "logits/chosen": -0.41327300667762756, "logits/rejected": -0.5486120581626892, "logps/chosen": -183.75914001464844, "logps/rejected": -263.6907043457031, "loss": 1.2326, "nll_loss": 1.069012999534607, "rewards/accuracies": 0.875, "rewards/chosen": 6.653592109680176, "rewards/margins": 4.541326522827148, "rewards/rejected": 2.1122653484344482, "step": 13410 }, { "epoch": 0.7445111718284074, "grad_norm": 108.77604675292969, "learning_rate": 1.5258375942797292e-08, "logits/chosen": -0.21581730246543884, "logits/rejected": -0.4722031056880951, "logps/chosen": -146.3475799560547, "logps/rejected": -222.5902862548828, "loss": 1.2368, "nll_loss": 0.7659646272659302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.876851558685303, "rewards/margins": 4.014065742492676, "rewards/rejected": 1.8627859354019165, "step": 13420 }, { "epoch": 0.7450659491546581, "grad_norm": 85.56887817382812, "learning_rate": 1.519575622398865e-08, "logits/chosen": -0.3675265610218048, "logits/rejected": -0.5397453308105469, "logps/chosen": -163.51470947265625, "logps/rejected": -223.89694213867188, "loss": 1.2145, "nll_loss": 0.9677974581718445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.7679643630981445, "rewards/margins": 3.14967942237854, "rewards/rejected": 2.6182851791381836, "step": 13430 }, { "epoch": 0.7456207264809087, "grad_norm": 52.506587982177734, "learning_rate": 1.513324223105562e-08, "logits/chosen": -0.3206063210964203, "logits/rejected": -0.4452175199985504, "logps/chosen": -146.18118286132812, "logps/rejected": -199.52804565429688, "loss": 1.2252, "nll_loss": 0.9559661149978638, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.652331352233887, "rewards/margins": 3.6066582202911377, "rewards/rejected": 2.04567289352417, "step": 13440 }, { "epoch": 0.7461755038071594, "grad_norm": 65.94963836669922, "learning_rate": 1.5070834153898766e-08, "logits/chosen": -0.2580520510673523, "logits/rejected": -0.3323266804218292, "logps/chosen": -116.36967468261719, "logps/rejected": -184.68650817871094, "loss": 1.3069, "nll_loss": 0.8281615972518921, "rewards/accuracies": 0.875, "rewards/chosen": 5.108590602874756, "rewards/margins": 3.4536919593811035, "rewards/rejected": 1.6548986434936523, "step": 13450 }, { "epoch": 0.7467302811334101, "grad_norm": 88.79898834228516, "learning_rate": 1.5008532182096968e-08, "logits/chosen": -0.291323721408844, "logits/rejected": -0.46528196334838867, "logps/chosen": -156.42576599121094, "logps/rejected": -208.47030639648438, "loss": 1.3222, "nll_loss": 0.8707913160324097, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.347197532653809, "rewards/margins": 3.129718065261841, "rewards/rejected": 2.2174792289733887, "step": 13460 }, { "epoch": 0.7472850584596608, "grad_norm": 79.50917053222656, "learning_rate": 1.4946336504906733e-08, "logits/chosen": -0.24390359222888947, "logits/rejected": -0.4485185146331787, "logps/chosen": -141.30801391601562, "logps/rejected": -194.8183135986328, "loss": 1.2848, "nll_loss": 0.8318487405776978, "rewards/accuracies": 0.875, "rewards/chosen": 5.60951566696167, "rewards/margins": 3.9858298301696777, "rewards/rejected": 1.6236860752105713, "step": 13470 }, { "epoch": 0.7478398357859114, "grad_norm": 62.15235137939453, "learning_rate": 1.4884247311261706e-08, "logits/chosen": -0.34686800837516785, "logits/rejected": -0.4664524495601654, "logps/chosen": -171.86383056640625, "logps/rejected": -223.7300262451172, "loss": 1.2128, "nll_loss": 1.0052675008773804, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.725604057312012, "rewards/margins": 3.4819304943084717, "rewards/rejected": 2.2436728477478027, "step": 13480 }, { "epoch": 0.7483946131121622, "grad_norm": 73.1287841796875, "learning_rate": 1.4822264789772071e-08, "logits/chosen": -0.23515813052654266, "logits/rejected": -0.3965316414833069, "logps/chosen": -149.0940399169922, "logps/rejected": -209.34872436523438, "loss": 1.1761, "nll_loss": 0.8091143369674683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.670095443725586, "rewards/margins": 4.315380573272705, "rewards/rejected": 1.3547146320343018, "step": 13490 }, { "epoch": 0.7489493904384128, "grad_norm": 29.182003021240234, "learning_rate": 1.4760389128723965e-08, "logits/chosen": -0.4131618142127991, "logits/rejected": -0.5330209732055664, "logps/chosen": -212.5506134033203, "logps/rejected": -287.2911071777344, "loss": 1.2237, "nll_loss": 1.0589938163757324, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.932608604431152, "rewards/margins": 5.323004722595215, "rewards/rejected": 1.6096042394638062, "step": 13500 }, { "epoch": 0.7489493904384128, "eval_logits/chosen": -0.416965126991272, "eval_logits/rejected": -0.5322977900505066, "eval_logps/chosen": -189.99722290039062, "eval_logps/rejected": -260.26898193359375, "eval_loss": 1.2166118621826172, "eval_nll_loss": 0.9845010042190552, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.796399116516113, "eval_rewards/margins": 4.915448188781738, "eval_rewards/rejected": 1.8809503316879272, "eval_runtime": 16.693, "eval_samples_per_second": 15.336, "eval_steps_per_second": 1.917, "step": 13500 }, { "epoch": 0.7495041677646634, "grad_norm": 39.3568000793457, "learning_rate": 1.469862051607888e-08, "logits/chosen": -0.36322319507598877, "logits/rejected": -0.5250617265701294, "logps/chosen": -160.3135986328125, "logps/rejected": -216.87191772460938, "loss": 1.2106, "nll_loss": 0.9671661257743835, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.912907123565674, "rewards/margins": 3.5368614196777344, "rewards/rejected": 2.3760459423065186, "step": 13510 }, { "epoch": 0.7500589450909141, "grad_norm": 85.17237854003906, "learning_rate": 1.463695913947317e-08, "logits/chosen": -0.19930145144462585, "logits/rejected": -0.39999374747276306, "logps/chosen": -144.89492797851562, "logps/rejected": -218.9955291748047, "loss": 1.2911, "nll_loss": 0.8356701135635376, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.16513204574585, "rewards/margins": 3.6399447917938232, "rewards/rejected": 1.5251868963241577, "step": 13520 }, { "epoch": 0.7506137224171648, "grad_norm": 57.78459930419922, "learning_rate": 1.4575405186217392e-08, "logits/chosen": -0.30035391449928284, "logits/rejected": -0.46901971101760864, "logps/chosen": -185.02407836914062, "logps/rejected": -220.53857421875, "loss": 1.1251, "nll_loss": 1.061841607093811, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.5068511962890625, "rewards/margins": 4.961834907531738, "rewards/rejected": 1.5450154542922974, "step": 13530 }, { "epoch": 0.7511684997434155, "grad_norm": 74.79740905761719, "learning_rate": 1.451395884329581e-08, "logits/chosen": -0.37808674573898315, "logits/rejected": -0.44419270753860474, "logps/chosen": -172.4527587890625, "logps/rejected": -230.8903045654297, "loss": 1.3499, "nll_loss": 1.083181619644165, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.872241020202637, "rewards/margins": 3.1443417072296143, "rewards/rejected": 2.7278990745544434, "step": 13540 }, { "epoch": 0.7517232770696661, "grad_norm": 43.86543655395508, "learning_rate": 1.4452620297365802e-08, "logits/chosen": -0.3932144045829773, "logits/rejected": -0.505740761756897, "logps/chosen": -179.28599548339844, "logps/rejected": -243.1675567626953, "loss": 1.2663, "nll_loss": 1.0719324350357056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.5402960777282715, "rewards/margins": 4.369576930999756, "rewards/rejected": 2.1707186698913574, "step": 13550 }, { "epoch": 0.7522780543959169, "grad_norm": 49.34416198730469, "learning_rate": 1.4391389734757254e-08, "logits/chosen": -0.319457083940506, "logits/rejected": -0.47524309158325195, "logps/chosen": -163.37777709960938, "logps/rejected": -251.42660522460938, "loss": 1.1682, "nll_loss": 0.9476040601730347, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.27634334564209, "rewards/margins": 4.794894218444824, "rewards/rejected": 1.4814488887786865, "step": 13560 }, { "epoch": 0.7528328317221675, "grad_norm": 70.49656677246094, "learning_rate": 1.4330267341472069e-08, "logits/chosen": -0.3233944773674011, "logits/rejected": -0.44154053926467896, "logps/chosen": -179.5468292236328, "logps/rejected": -223.44540405273438, "loss": 1.2648, "nll_loss": 1.0197670459747314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.219475746154785, "rewards/margins": 3.2971725463867188, "rewards/rejected": 2.9223031997680664, "step": 13570 }, { "epoch": 0.7533876090484182, "grad_norm": 50.081268310546875, "learning_rate": 1.4269253303183515e-08, "logits/chosen": -0.2911186218261719, "logits/rejected": -0.4075024724006653, "logps/chosen": -156.9920196533203, "logps/rejected": -201.3787384033203, "loss": 1.1953, "nll_loss": 0.966240406036377, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.275843620300293, "rewards/margins": 3.2944388389587402, "rewards/rejected": 1.9814043045043945, "step": 13580 }, { "epoch": 0.7539423863746688, "grad_norm": 53.76469039916992, "learning_rate": 1.4208347805235743e-08, "logits/chosen": -0.3274695873260498, "logits/rejected": -0.5015454888343811, "logps/chosen": -149.1398468017578, "logps/rejected": -196.37017822265625, "loss": 1.2699, "nll_loss": 0.9796813726425171, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.354022026062012, "rewards/margins": 3.6555702686309814, "rewards/rejected": 1.6984519958496094, "step": 13590 }, { "epoch": 0.7544971637009196, "grad_norm": 72.05597686767578, "learning_rate": 1.414755103264319e-08, "logits/chosen": -0.3498299717903137, "logits/rejected": -0.4781871736049652, "logps/chosen": -172.31446838378906, "logps/rejected": -226.637939453125, "loss": 1.3502, "nll_loss": 0.9635981321334839, "rewards/accuracies": 0.75, "rewards/chosen": 5.778754234313965, "rewards/margins": 3.1914174556732178, "rewards/rejected": 2.587336540222168, "step": 13600 }, { "epoch": 0.7550519410271702, "grad_norm": 56.078861236572266, "learning_rate": 1.4086863170089975e-08, "logits/chosen": -0.371195524930954, "logits/rejected": -0.4768117368221283, "logps/chosen": -174.6502685546875, "logps/rejected": -226.41098022460938, "loss": 1.2606, "nll_loss": 1.113525629043579, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.1385674476623535, "rewards/margins": 3.322308301925659, "rewards/rejected": 2.8162596225738525, "step": 13610 }, { "epoch": 0.7556067183534209, "grad_norm": 78.32002258300781, "learning_rate": 1.4026284401929439e-08, "logits/chosen": -0.27597662806510925, "logits/rejected": -0.41690319776535034, "logps/chosen": -175.29898071289062, "logps/rejected": -219.8903350830078, "loss": 1.2891, "nll_loss": 0.9898480176925659, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.125199794769287, "rewards/margins": 4.231071472167969, "rewards/rejected": 1.8941287994384766, "step": 13620 }, { "epoch": 0.7561614956796716, "grad_norm": 73.02739715576172, "learning_rate": 1.3965814912183432e-08, "logits/chosen": -0.34426796436309814, "logits/rejected": -0.46403923630714417, "logps/chosen": -191.7817840576172, "logps/rejected": -241.34689331054688, "loss": 1.2583, "nll_loss": 1.01153564453125, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.102602481842041, "rewards/margins": 3.521191120147705, "rewards/rejected": 2.581411123275757, "step": 13630 }, { "epoch": 0.7567162730059223, "grad_norm": 83.91927337646484, "learning_rate": 1.3905454884541967e-08, "logits/chosen": -0.3065240681171417, "logits/rejected": -0.4858540892601013, "logps/chosen": -160.86172485351562, "logps/rejected": -191.63796997070312, "loss": 1.3385, "nll_loss": 0.8707631230354309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.422104835510254, "rewards/margins": 3.1213173866271973, "rewards/rejected": 2.3007874488830566, "step": 13640 }, { "epoch": 0.7572710503321729, "grad_norm": 70.45728302001953, "learning_rate": 1.384520450236244e-08, "logits/chosen": -0.19798685610294342, "logits/rejected": -0.3198954164981842, "logps/chosen": -133.4322967529297, "logps/rejected": -211.990478515625, "loss": 1.1453, "nll_loss": 0.8883682489395142, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.187527179718018, "rewards/margins": 3.546415328979492, "rewards/rejected": 1.6411120891571045, "step": 13650 }, { "epoch": 0.7578258276584235, "grad_norm": 117.31558990478516, "learning_rate": 1.3785063948669229e-08, "logits/chosen": -0.2564094066619873, "logits/rejected": -0.45179280638694763, "logps/chosen": -143.52139282226562, "logps/rejected": -181.74449157714844, "loss": 1.2219, "nll_loss": 0.8907234072685242, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.215444087982178, "rewards/margins": 3.032702684402466, "rewards/rejected": 2.1827406883239746, "step": 13660 }, { "epoch": 0.7583806049846743, "grad_norm": 92.70428466796875, "learning_rate": 1.3725033406153042e-08, "logits/chosen": -0.27764803171157837, "logits/rejected": -0.39444833993911743, "logps/chosen": -157.85360717773438, "logps/rejected": -223.7536163330078, "loss": 1.25, "nll_loss": 0.9519385099411011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.345429420471191, "rewards/margins": 4.418973445892334, "rewards/rejected": 1.9264558553695679, "step": 13670 }, { "epoch": 0.7589353823109249, "grad_norm": 52.74095153808594, "learning_rate": 1.3665113057170429e-08, "logits/chosen": -0.29531174898147583, "logits/rejected": -0.46530881524086, "logps/chosen": -165.24893188476562, "logps/rejected": -206.25045776367188, "loss": 1.3079, "nll_loss": 0.8828876614570618, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 5.421899318695068, "rewards/margins": 2.826404094696045, "rewards/rejected": 2.5954947471618652, "step": 13680 }, { "epoch": 0.7594901596371756, "grad_norm": 54.86754608154297, "learning_rate": 1.3605303083743225e-08, "logits/chosen": -0.10553546994924545, "logits/rejected": -0.33541515469551086, "logps/chosen": -115.08282470703125, "logps/rejected": -159.0226593017578, "loss": 1.3038, "nll_loss": 0.7253124117851257, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.589519500732422, "rewards/margins": 3.5651519298553467, "rewards/rejected": 1.0243679285049438, "step": 13690 }, { "epoch": 0.7600449369634263, "grad_norm": 56.478858947753906, "learning_rate": 1.3545603667557909e-08, "logits/chosen": -0.15537983179092407, "logits/rejected": -0.28376904129981995, "logps/chosen": -156.2434539794922, "logps/rejected": -185.52598571777344, "loss": 1.2042, "nll_loss": 0.8980692625045776, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.184126853942871, "rewards/margins": 3.451261043548584, "rewards/rejected": 1.732865333557129, "step": 13700 }, { "epoch": 0.760599714289677, "grad_norm": 53.57682800292969, "learning_rate": 1.3486014989965183e-08, "logits/chosen": -0.30913281440734863, "logits/rejected": -0.4532663822174072, "logps/chosen": -135.60108947753906, "logps/rejected": -189.81686401367188, "loss": 1.1902, "nll_loss": 0.8723493814468384, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.170324802398682, "rewards/margins": 3.189711570739746, "rewards/rejected": 1.980613112449646, "step": 13710 }, { "epoch": 0.7611544916159276, "grad_norm": 58.94258499145508, "learning_rate": 1.3426537231979307e-08, "logits/chosen": -0.21132151782512665, "logits/rejected": -0.37680166959762573, "logps/chosen": -159.8973846435547, "logps/rejected": -198.5689697265625, "loss": 1.1732, "nll_loss": 0.8618567585945129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.449318885803223, "rewards/margins": 3.848485231399536, "rewards/rejected": 1.6008336544036865, "step": 13720 }, { "epoch": 0.7617092689421784, "grad_norm": 72.0971450805664, "learning_rate": 1.3367170574277619e-08, "logits/chosen": -0.2789410948753357, "logits/rejected": -0.4663736820220947, "logps/chosen": -159.549072265625, "logps/rejected": -221.7187042236328, "loss": 1.2737, "nll_loss": 0.8865306973457336, "rewards/accuracies": 0.875, "rewards/chosen": 5.672693252563477, "rewards/margins": 4.128529071807861, "rewards/rejected": 1.5441645383834839, "step": 13730 }, { "epoch": 0.762264046268429, "grad_norm": 143.35287475585938, "learning_rate": 1.330791519719997e-08, "logits/chosen": -0.2800445854663849, "logits/rejected": -0.37771207094192505, "logps/chosen": -151.23098754882812, "logps/rejected": -208.29928588867188, "loss": 1.2792, "nll_loss": 0.9862383008003235, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.22971248626709, "rewards/margins": 3.126978874206543, "rewards/rejected": 2.1027328968048096, "step": 13740 }, { "epoch": 0.7628188235946797, "grad_norm": 118.14602661132812, "learning_rate": 1.3248771280748172e-08, "logits/chosen": -0.1906663477420807, "logits/rejected": -0.39341455698013306, "logps/chosen": -156.43212890625, "logps/rejected": -188.81900024414062, "loss": 1.2115, "nll_loss": 0.8189682960510254, "rewards/accuracies": 0.875, "rewards/chosen": 5.237671852111816, "rewards/margins": 3.4505152702331543, "rewards/rejected": 1.7871557474136353, "step": 13750 }, { "epoch": 0.7633736009209303, "grad_norm": 45.56608200073242, "learning_rate": 1.318973900458542e-08, "logits/chosen": -0.3441595733165741, "logits/rejected": -0.47740721702575684, "logps/chosen": -151.74215698242188, "logps/rejected": -225.3459014892578, "loss": 1.1899, "nll_loss": 0.9371752738952637, "rewards/accuracies": 0.875, "rewards/chosen": 5.4622697830200195, "rewards/margins": 3.729787826538086, "rewards/rejected": 1.7324821949005127, "step": 13760 }, { "epoch": 0.7639283782471811, "grad_norm": 83.97145080566406, "learning_rate": 1.3130818548035816e-08, "logits/chosen": -0.27460265159606934, "logits/rejected": -0.4410218596458435, "logps/chosen": -145.2313995361328, "logps/rejected": -205.31820678710938, "loss": 1.2731, "nll_loss": 0.883343517780304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.396486759185791, "rewards/margins": 3.477076292037964, "rewards/rejected": 1.9194103479385376, "step": 13770 }, { "epoch": 0.7644831555734317, "grad_norm": 69.6474380493164, "learning_rate": 1.3072010090083747e-08, "logits/chosen": -0.47135257720947266, "logits/rejected": -0.5717315673828125, "logps/chosen": -197.0193634033203, "logps/rejected": -259.94525146484375, "loss": 1.3495, "nll_loss": 1.0664008855819702, "rewards/accuracies": 0.875, "rewards/chosen": 6.381795406341553, "rewards/margins": 3.5582733154296875, "rewards/rejected": 2.8235225677490234, "step": 13780 }, { "epoch": 0.7650379328996824, "grad_norm": 59.73896789550781, "learning_rate": 1.3013313809373394e-08, "logits/chosen": -0.3769679665565491, "logits/rejected": -0.526505708694458, "logps/chosen": -156.14852905273438, "logps/rejected": -213.15328979492188, "loss": 1.2536, "nll_loss": 0.8887110948562622, "rewards/accuracies": 0.875, "rewards/chosen": 5.734314918518066, "rewards/margins": 3.6810462474823, "rewards/rejected": 2.0532686710357666, "step": 13790 }, { "epoch": 0.7655927102259331, "grad_norm": 76.70157623291016, "learning_rate": 1.295472988420821e-08, "logits/chosen": -0.3603596091270447, "logits/rejected": -0.4884079098701477, "logps/chosen": -172.51190185546875, "logps/rejected": -238.10107421875, "loss": 1.347, "nll_loss": 0.9501543045043945, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.018237590789795, "rewards/margins": 4.109791278839111, "rewards/rejected": 1.9084469079971313, "step": 13800 }, { "epoch": 0.7661474875521838, "grad_norm": 80.48224639892578, "learning_rate": 1.2896258492550266e-08, "logits/chosen": -0.3924103081226349, "logits/rejected": -0.5263667702674866, "logps/chosen": -150.50424194335938, "logps/rejected": -214.1674346923828, "loss": 1.3143, "nll_loss": 0.9524089694023132, "rewards/accuracies": 0.875, "rewards/chosen": 6.103146076202393, "rewards/margins": 3.896202802658081, "rewards/rejected": 2.2069430351257324, "step": 13810 }, { "epoch": 0.7667022648784344, "grad_norm": 39.00547409057617, "learning_rate": 1.2837899812019864e-08, "logits/chosen": -0.37665650248527527, "logits/rejected": -0.5423328876495361, "logps/chosen": -169.2489013671875, "logps/rejected": -217.527587890625, "loss": 1.2466, "nll_loss": 0.8893179893493652, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.886086940765381, "rewards/margins": 3.2424449920654297, "rewards/rejected": 2.643641948699951, "step": 13820 }, { "epoch": 0.767257042204685, "grad_norm": 43.614009857177734, "learning_rate": 1.2779654019894853e-08, "logits/chosen": -0.23461337387561798, "logits/rejected": -0.4687643051147461, "logps/chosen": -152.0112762451172, "logps/rejected": -216.02536010742188, "loss": 1.2577, "nll_loss": 0.8669017553329468, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.299442768096924, "rewards/margins": 3.8248748779296875, "rewards/rejected": 1.4745676517486572, "step": 13830 }, { "epoch": 0.7678118195309358, "grad_norm": 53.92696762084961, "learning_rate": 1.272152129311021e-08, "logits/chosen": -0.4088827967643738, "logits/rejected": -0.4966079294681549, "logps/chosen": -179.58731079101562, "logps/rejected": -244.9889373779297, "loss": 1.3564, "nll_loss": 1.0453494787216187, "rewards/accuracies": 0.875, "rewards/chosen": 6.4819841384887695, "rewards/margins": 3.3247177600860596, "rewards/rejected": 3.157266139984131, "step": 13840 }, { "epoch": 0.7683665968571864, "grad_norm": 63.71945571899414, "learning_rate": 1.2663501808257443e-08, "logits/chosen": -0.20103967189788818, "logits/rejected": -0.3945234417915344, "logps/chosen": -152.06423950195312, "logps/rejected": -193.32913208007812, "loss": 1.2031, "nll_loss": 0.8020838499069214, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.715639591217041, "rewards/margins": 4.018339157104492, "rewards/rejected": 1.697300672531128, "step": 13850 }, { "epoch": 0.7689213741834371, "grad_norm": 83.6656265258789, "learning_rate": 1.2605595741584013e-08, "logits/chosen": -0.5328843593597412, "logits/rejected": -0.6139092445373535, "logps/chosen": -225.92556762695312, "logps/rejected": -270.70611572265625, "loss": 1.2327, "nll_loss": 1.1312992572784424, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.906135559082031, "rewards/margins": 3.619173765182495, "rewards/rejected": 3.286961793899536, "step": 13860 }, { "epoch": 0.7694761515096878, "grad_norm": 50.409156799316406, "learning_rate": 1.2547803268992917e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -155.8525848388672, "logps/rejected": -206.296875, "loss": 1.2289, "nll_loss": NaN, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.228343963623047, "rewards/margins": 4.358792304992676, "rewards/rejected": 1.869551658630371, "step": 13870 }, { "epoch": 0.7700309288359385, "grad_norm": 41.457496643066406, "learning_rate": 1.2490124566042004e-08, "logits/chosen": -0.3529255986213684, "logits/rejected": -0.47922688722610474, "logps/chosen": -173.4693603515625, "logps/rejected": -212.62060546875, "loss": 1.264, "nll_loss": 0.972199559211731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.152778148651123, "rewards/margins": 4.0120134353637695, "rewards/rejected": 2.1407644748687744, "step": 13880 }, { "epoch": 0.7705857061621891, "grad_norm": 78.218994140625, "learning_rate": 1.2432559807943632e-08, "logits/chosen": -0.44029346108436584, "logits/rejected": -0.4879940450191498, "logps/chosen": -194.955078125, "logps/rejected": -263.9924621582031, "loss": 1.2328, "nll_loss": 1.0761487483978271, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.286138534545898, "rewards/margins": 4.106390476226807, "rewards/rejected": 2.179748296737671, "step": 13890 }, { "epoch": 0.7711404834884398, "grad_norm": 48.475379943847656, "learning_rate": 1.2375109169563913e-08, "logits/chosen": -0.295167475938797, "logits/rejected": -0.44352278113365173, "logps/chosen": -157.22340393066406, "logps/rejected": -215.6311798095703, "loss": 1.2638, "nll_loss": 0.8964277505874634, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.457936763763428, "rewards/margins": 2.9228222370147705, "rewards/rejected": 2.535114288330078, "step": 13900 }, { "epoch": 0.7716952608146905, "grad_norm": 63.257591247558594, "learning_rate": 1.2317772825422367e-08, "logits/chosen": -0.34150010347366333, "logits/rejected": -0.43900713324546814, "logps/chosen": -138.32891845703125, "logps/rejected": -192.40408325195312, "loss": 1.2421, "nll_loss": 1.113840103149414, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.1876373291015625, "rewards/margins": 3.5217552185058594, "rewards/rejected": 1.6658827066421509, "step": 13910 }, { "epoch": 0.7722500381409412, "grad_norm": 60.22822189331055, "learning_rate": 1.2260550949691268e-08, "logits/chosen": -0.3614691197872162, "logits/rejected": -0.449531227350235, "logps/chosen": -152.67042541503906, "logps/rejected": -222.02505493164062, "loss": 1.3089, "nll_loss": 0.8986543416976929, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.665240287780762, "rewards/margins": 3.4257895946502686, "rewards/rejected": 2.2394509315490723, "step": 13920 }, { "epoch": 0.7728048154671918, "grad_norm": 38.36786651611328, "learning_rate": 1.2203443716195211e-08, "logits/chosen": -0.35673871636390686, "logits/rejected": -0.5140501260757446, "logps/chosen": -162.87362670898438, "logps/rejected": -248.33712768554688, "loss": 1.2613, "nll_loss": 0.9390610456466675, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.909486293792725, "rewards/margins": 3.5899605751037598, "rewards/rejected": 2.319525718688965, "step": 13930 }, { "epoch": 0.7733595927934426, "grad_norm": 42.55559158325195, "learning_rate": 1.2146451298410526e-08, "logits/chosen": -0.43257418274879456, "logits/rejected": -0.5539957284927368, "logps/chosen": -188.33963012695312, "logps/rejected": -237.80563354492188, "loss": 1.3806, "nll_loss": 1.0820258855819702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.638883113861084, "rewards/margins": 3.8393142223358154, "rewards/rejected": 2.7995693683624268, "step": 13940 }, { "epoch": 0.7739143701196932, "grad_norm": 81.97818756103516, "learning_rate": 1.2089573869464736e-08, "logits/chosen": -0.44080132246017456, "logits/rejected": -0.5881060361862183, "logps/chosen": -170.52700805664062, "logps/rejected": -240.5904083251953, "loss": 1.1844, "nll_loss": 0.9856440424919128, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.976235866546631, "rewards/margins": 4.229222297668457, "rewards/rejected": 1.7470133304595947, "step": 13950 }, { "epoch": 0.7744691474459439, "grad_norm": 58.12779235839844, "learning_rate": 1.2032811602136107e-08, "logits/chosen": -0.3213277757167816, "logits/rejected": -0.5130875706672668, "logps/chosen": -161.09698486328125, "logps/rejected": -242.45492553710938, "loss": 1.2264, "nll_loss": 0.9088759422302246, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.524542808532715, "rewards/margins": 4.230862617492676, "rewards/rejected": 1.2936804294586182, "step": 13960 }, { "epoch": 0.7750239247721945, "grad_norm": 70.44070434570312, "learning_rate": 1.1976164668853e-08, "logits/chosen": -0.38771852850914, "logits/rejected": -0.5312173366546631, "logps/chosen": -164.395751953125, "logps/rejected": -250.2509765625, "loss": 1.292, "nll_loss": 0.9301018714904785, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.082016468048096, "rewards/margins": 4.434224605560303, "rewards/rejected": 1.6477924585342407, "step": 13970 }, { "epoch": 0.7755787020984453, "grad_norm": 134.88778686523438, "learning_rate": 1.1919633241693538e-08, "logits/chosen": -0.2789040207862854, "logits/rejected": -0.4455975890159607, "logps/chosen": -153.7880401611328, "logps/rejected": -211.2582550048828, "loss": 1.2049, "nll_loss": 0.8475750088691711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.729306697845459, "rewards/margins": 4.2116804122924805, "rewards/rejected": 1.5176265239715576, "step": 13980 }, { "epoch": 0.7761334794246959, "grad_norm": 70.38463592529297, "learning_rate": 1.1863217492384853e-08, "logits/chosen": -0.34892693161964417, "logits/rejected": -0.5211832523345947, "logps/chosen": -158.76480102539062, "logps/rejected": -215.03958129882812, "loss": 1.249, "nll_loss": 0.9328628778457642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.606719017028809, "rewards/margins": 3.7106430530548096, "rewards/rejected": 1.8960764408111572, "step": 13990 }, { "epoch": 0.7766882567509465, "grad_norm": 76.29339599609375, "learning_rate": 1.1806917592302761e-08, "logits/chosen": -0.31255191564559937, "logits/rejected": -0.48688554763793945, "logps/chosen": -150.01837158203125, "logps/rejected": -215.4327392578125, "loss": 1.1815, "nll_loss": 0.880654513835907, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.435760498046875, "rewards/margins": 3.266252040863037, "rewards/rejected": 2.1695079803466797, "step": 14000 }, { "epoch": 0.7766882567509465, "eval_logits/chosen": -0.4256032109260559, "eval_logits/rejected": -0.5457690358161926, "eval_logps/chosen": -189.90115356445312, "eval_logps/rejected": -260.0768737792969, "eval_loss": 1.2177233695983887, "eval_nll_loss": 0.9840515851974487, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.806005477905273, "eval_rewards/margins": 4.90584135055542, "eval_rewards/rejected": 1.9001634120941162, "eval_runtime": 17.2756, "eval_samples_per_second": 14.819, "eval_steps_per_second": 1.852, "step": 14000 }, { "epoch": 0.7772430340771973, "grad_norm": 65.32649230957031, "learning_rate": 1.1750733712471106e-08, "logits/chosen": -0.21402129530906677, "logits/rejected": -0.30491748452186584, "logps/chosen": -181.6761016845703, "logps/rejected": -221.7805938720703, "loss": 1.2736, "nll_loss": 0.9396146535873413, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.532603740692139, "rewards/margins": 2.3283419609069824, "rewards/rejected": 3.2042622566223145, "step": 14010 }, { "epoch": 0.777797811403448, "grad_norm": 73.90982055664062, "learning_rate": 1.1694666023561284e-08, "logits/chosen": -0.16523988544940948, "logits/rejected": -0.3659622073173523, "logps/chosen": -144.54388427734375, "logps/rejected": -205.26248168945312, "loss": 1.2495, "nll_loss": 0.8604100346565247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.111564636230469, "rewards/margins": 3.390612840652466, "rewards/rejected": 1.7209514379501343, "step": 14020 }, { "epoch": 0.7783525887296986, "grad_norm": 52.22819900512695, "learning_rate": 1.1638714695891822e-08, "logits/chosen": -0.33601441979408264, "logits/rejected": -0.47834569215774536, "logps/chosen": -168.5770263671875, "logps/rejected": -232.70596313476562, "loss": 1.2159, "nll_loss": 0.8969131708145142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.402365207672119, "rewards/margins": 4.196316719055176, "rewards/rejected": 2.2060484886169434, "step": 14030 }, { "epoch": 0.7789073660559493, "grad_norm": 59.168701171875, "learning_rate": 1.1582879899427672e-08, "logits/chosen": -0.1585932970046997, "logits/rejected": -0.34851229190826416, "logps/chosen": -131.6605682373047, "logps/rejected": -196.3135528564453, "loss": 1.2516, "nll_loss": 0.7813040018081665, "rewards/accuracies": 0.875, "rewards/chosen": 5.3250579833984375, "rewards/margins": 3.896998882293701, "rewards/rejected": 1.4280592203140259, "step": 14040 }, { "epoch": 0.7794621433822, "grad_norm": 72.20557403564453, "learning_rate": 1.1527161803779866e-08, "logits/chosen": -0.25169992446899414, "logits/rejected": -0.37294498085975647, "logps/chosen": -162.71420288085938, "logps/rejected": -191.31991577148438, "loss": 1.2654, "nll_loss": 0.9724845886230469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.3179030418396, "rewards/margins": 2.5933871269226074, "rewards/rejected": 2.7245163917541504, "step": 14050 }, { "epoch": 0.7800169207084506, "grad_norm": 34.26319885253906, "learning_rate": 1.1471560578204875e-08, "logits/chosen": -0.30690228939056396, "logits/rejected": -0.45574530959129333, "logps/chosen": -146.76254272460938, "logps/rejected": -204.77993774414062, "loss": 1.2631, "nll_loss": 0.9296310544013977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.883360862731934, "rewards/margins": 4.266750335693359, "rewards/rejected": 1.6166105270385742, "step": 14060 }, { "epoch": 0.7805716980347013, "grad_norm": 74.3058853149414, "learning_rate": 1.1416076391604195e-08, "logits/chosen": -0.32706719636917114, "logits/rejected": -0.4510704576969147, "logps/chosen": -145.9856719970703, "logps/rejected": -195.59005737304688, "loss": 1.2075, "nll_loss": 0.8912612795829773, "rewards/accuracies": 0.875, "rewards/chosen": 5.421122074127197, "rewards/margins": 3.1206881999969482, "rewards/rejected": 2.300433397293091, "step": 14070 }, { "epoch": 0.781126475360952, "grad_norm": 58.19743728637695, "learning_rate": 1.1360709412523789e-08, "logits/chosen": -0.34424278140068054, "logits/rejected": -0.46891456842422485, "logps/chosen": -163.01699829101562, "logps/rejected": -211.4050750732422, "loss": 1.2561, "nll_loss": 0.9696332812309265, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.362013816833496, "rewards/margins": 2.8251922130584717, "rewards/rejected": 2.536821126937866, "step": 14080 }, { "epoch": 0.7816812526872027, "grad_norm": 79.0674819946289, "learning_rate": 1.1305459809153523e-08, "logits/chosen": -0.3033314645290375, "logits/rejected": -0.4799756407737732, "logps/chosen": -140.81182861328125, "logps/rejected": -201.52999877929688, "loss": 1.3443, "nll_loss": 0.837742030620575, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.111077308654785, "rewards/margins": 3.7379837036132812, "rewards/rejected": 1.373093843460083, "step": 14090 }, { "epoch": 0.7822360300134533, "grad_norm": 64.90271759033203, "learning_rate": 1.1250327749326772e-08, "logits/chosen": -0.4450332224369049, "logits/rejected": -0.5516294836997986, "logps/chosen": -165.32357788085938, "logps/rejected": -222.63339233398438, "loss": 1.2532, "nll_loss": 1.0632097721099854, "rewards/accuracies": 0.875, "rewards/chosen": 6.119533538818359, "rewards/margins": 3.5711288452148438, "rewards/rejected": 2.5484046936035156, "step": 14100 }, { "epoch": 0.7827908073397041, "grad_norm": 61.68737030029297, "learning_rate": 1.119531340051979e-08, "logits/chosen": -0.18699191510677338, "logits/rejected": -0.29720041155815125, "logps/chosen": -156.30979919433594, "logps/rejected": -236.480224609375, "loss": 1.2732, "nll_loss": 0.9045829772949219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.382053375244141, "rewards/margins": 3.5788066387176514, "rewards/rejected": 1.8032464981079102, "step": 14110 }, { "epoch": 0.7833455846659547, "grad_norm": 77.37608337402344, "learning_rate": 1.1140416929851304e-08, "logits/chosen": -0.29755669832229614, "logits/rejected": -0.47017520666122437, "logps/chosen": -151.44454956054688, "logps/rejected": -226.88577270507812, "loss": 1.2929, "nll_loss": 0.8588264584541321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.481460094451904, "rewards/margins": 3.87780499458313, "rewards/rejected": 1.603655219078064, "step": 14120 }, { "epoch": 0.7839003619922054, "grad_norm": 52.44789123535156, "learning_rate": 1.108563850408193e-08, "logits/chosen": -0.4048032760620117, "logits/rejected": -0.4983956813812256, "logps/chosen": -188.40444946289062, "logps/rejected": -270.88568115234375, "loss": 1.2048, "nll_loss": 1.058279275894165, "rewards/accuracies": 0.75, "rewards/chosen": 6.549439907073975, "rewards/margins": 3.9809250831604004, "rewards/rejected": 2.5685155391693115, "step": 14130 }, { "epoch": 0.784455139318456, "grad_norm": 157.4579620361328, "learning_rate": 1.1030978289613724e-08, "logits/chosen": -0.3345903158187866, "logits/rejected": -0.4269779324531555, "logps/chosen": -167.9392547607422, "logps/rejected": -245.6435089111328, "loss": 1.2268, "nll_loss": 1.0074113607406616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.121491432189941, "rewards/margins": 3.8833796977996826, "rewards/rejected": 2.238111972808838, "step": 14140 }, { "epoch": 0.7850099166447068, "grad_norm": 42.360111236572266, "learning_rate": 1.097643645248959e-08, "logits/chosen": -0.32162588834762573, "logits/rejected": -0.47896361351013184, "logps/chosen": -156.4175262451172, "logps/rejected": -198.45626831054688, "loss": 1.1511, "nll_loss": 0.9309374094009399, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.966883182525635, "rewards/margins": 4.057782173156738, "rewards/rejected": 1.9091007709503174, "step": 14150 }, { "epoch": 0.7855646939709574, "grad_norm": 75.29857635498047, "learning_rate": 1.0922013158392912e-08, "logits/chosen": -0.20973214507102966, "logits/rejected": -0.3338400721549988, "logps/chosen": -159.48536682128906, "logps/rejected": -205.24050903320312, "loss": 1.1851, "nll_loss": 0.8414397239685059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.741863250732422, "rewards/margins": 3.673130512237549, "rewards/rejected": 2.0687320232391357, "step": 14160 }, { "epoch": 0.786119471297208, "grad_norm": 56.388431549072266, "learning_rate": 1.08677085726469e-08, "logits/chosen": -0.16435568034648895, "logits/rejected": -0.378712922334671, "logps/chosen": -152.93850708007812, "logps/rejected": -208.55862426757812, "loss": 1.2624, "nll_loss": 0.8205984830856323, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.232509136199951, "rewards/margins": 4.504544734954834, "rewards/rejected": 0.7279645204544067, "step": 14170 }, { "epoch": 0.7866742486234588, "grad_norm": 60.83591079711914, "learning_rate": 1.0813522860214208e-08, "logits/chosen": -0.41574448347091675, "logits/rejected": -0.5650998950004578, "logps/chosen": -190.96762084960938, "logps/rejected": -260.410400390625, "loss": 1.2285, "nll_loss": 1.052651047706604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.427077293395996, "rewards/margins": 4.325234413146973, "rewards/rejected": 2.1018431186676025, "step": 14180 }, { "epoch": 0.7872290259497094, "grad_norm": 93.42770385742188, "learning_rate": 1.0759456185696375e-08, "logits/chosen": -0.429645836353302, "logits/rejected": -0.5378237962722778, "logps/chosen": -188.11366271972656, "logps/rejected": -282.5850524902344, "loss": 1.2489, "nll_loss": 1.0643198490142822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 7.1027092933654785, "rewards/margins": 4.801459312438965, "rewards/rejected": 2.3012492656707764, "step": 14190 }, { "epoch": 0.7877838032759601, "grad_norm": 150.52871704101562, "learning_rate": 1.0705508713333312e-08, "logits/chosen": -0.24871881306171417, "logits/rejected": -0.44136008620262146, "logps/chosen": -174.60910034179688, "logps/rejected": -233.86111450195312, "loss": 1.2012, "nll_loss": 0.9185620546340942, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.834150791168213, "rewards/margins": 4.138575077056885, "rewards/rejected": 1.695575475692749, "step": 14200 }, { "epoch": 0.7883385806022107, "grad_norm": 80.00650024414062, "learning_rate": 1.065168060700286e-08, "logits/chosen": -0.2197830229997635, "logits/rejected": -0.39777567982673645, "logps/chosen": -131.45639038085938, "logps/rejected": -192.24990844726562, "loss": 1.1549, "nll_loss": 0.7730545997619629, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.495772361755371, "rewards/margins": 4.274158954620361, "rewards/rejected": 1.2216134071350098, "step": 14210 }, { "epoch": 0.7888933579284615, "grad_norm": 78.72735595703125, "learning_rate": 1.0597972030220214e-08, "logits/chosen": -0.5054045915603638, "logits/rejected": -0.6108459234237671, "logps/chosen": -198.7064971923828, "logps/rejected": -261.1624450683594, "loss": 1.2565, "nll_loss": 1.0970631837844849, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.776255130767822, "rewards/margins": 3.514409303665161, "rewards/rejected": 3.261845827102661, "step": 14220 }, { "epoch": 0.7894481352547121, "grad_norm": 89.0277099609375, "learning_rate": 1.0544383146137542e-08, "logits/chosen": -0.13528604805469513, "logits/rejected": -0.30049964785575867, "logps/chosen": -124.52632141113281, "logps/rejected": -187.10525512695312, "loss": 1.2313, "nll_loss": 0.905049204826355, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.986789226531982, "rewards/margins": 3.6463348865509033, "rewards/rejected": 1.3404542207717896, "step": 14230 }, { "epoch": 0.7900029125809628, "grad_norm": 61.415306091308594, "learning_rate": 1.0490914117543353e-08, "logits/chosen": -0.306907057762146, "logits/rejected": -0.504341721534729, "logps/chosen": -150.9152374267578, "logps/rejected": -225.4998016357422, "loss": 1.1882, "nll_loss": 0.8629624247550964, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 5.909262657165527, "rewards/margins": 4.346271514892578, "rewards/rejected": 1.562990665435791, "step": 14240 }, { "epoch": 0.7905576899072135, "grad_norm": 54.4163932800293, "learning_rate": 1.0437565106862073e-08, "logits/chosen": -0.2065925896167755, "logits/rejected": -0.37647438049316406, "logps/chosen": -154.891845703125, "logps/rejected": -226.71542358398438, "loss": 1.216, "nll_loss": 0.9100456237792969, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.1754865646362305, "rewards/margins": 4.063882827758789, "rewards/rejected": 2.1116039752960205, "step": 14250 }, { "epoch": 0.7911124672334642, "grad_norm": 81.15811920166016, "learning_rate": 1.0384336276153588e-08, "logits/chosen": -0.3366141617298126, "logits/rejected": -0.4981306493282318, "logps/chosen": -150.25709533691406, "logps/rejected": -204.87472534179688, "loss": 1.3039, "nll_loss": 0.9259662628173828, "rewards/accuracies": 0.875, "rewards/chosen": 5.649740219116211, "rewards/margins": 3.9433753490448, "rewards/rejected": 1.7063640356063843, "step": 14260 }, { "epoch": 0.7916672445597148, "grad_norm": 87.83256530761719, "learning_rate": 1.0331227787112645e-08, "logits/chosen": -0.2888008952140808, "logits/rejected": -0.42626166343688965, "logps/chosen": -166.24411010742188, "logps/rejected": -204.98977661132812, "loss": 1.2784, "nll_loss": 0.8952625393867493, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.862424850463867, "rewards/margins": 3.5198276042938232, "rewards/rejected": 2.342597007751465, "step": 14270 }, { "epoch": 0.7922220218859655, "grad_norm": 63.025001525878906, "learning_rate": 1.0278239801068517e-08, "logits/chosen": -0.2715142071247101, "logits/rejected": -0.4095768928527832, "logps/chosen": -135.20944213867188, "logps/rejected": -169.39744567871094, "loss": 1.2523, "nll_loss": 0.7992717623710632, "rewards/accuracies": 0.75, "rewards/chosen": 5.300876617431641, "rewards/margins": 3.151837110519409, "rewards/rejected": 2.1490390300750732, "step": 14280 }, { "epoch": 0.7927767992122162, "grad_norm": 90.99301147460938, "learning_rate": 1.0225372478984324e-08, "logits/chosen": -0.19432711601257324, "logits/rejected": -0.4075043797492981, "logps/chosen": -129.4255828857422, "logps/rejected": -188.38177490234375, "loss": 1.2615, "nll_loss": 0.8069963455200195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.241572380065918, "rewards/margins": 3.424506664276123, "rewards/rejected": 1.8170654773712158, "step": 14290 }, { "epoch": 0.7933315765384669, "grad_norm": 54.6978645324707, "learning_rate": 1.0172625981456723e-08, "logits/chosen": -0.34659120440483093, "logits/rejected": -0.4400635361671448, "logps/chosen": -189.80599975585938, "logps/rejected": -243.38217163085938, "loss": 1.2849, "nll_loss": 1.0428221225738525, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.767068386077881, "rewards/margins": 3.213662624359131, "rewards/rejected": 3.553405284881592, "step": 14300 }, { "epoch": 0.7938863538647175, "grad_norm": 77.70732116699219, "learning_rate": 1.0120000468715267e-08, "logits/chosen": -0.351653516292572, "logits/rejected": -0.5094562768936157, "logps/chosen": -189.60104370117188, "logps/rejected": -238.1971893310547, "loss": 1.2378, "nll_loss": 0.9134159088134766, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.133057594299316, "rewards/margins": 3.4282565116882324, "rewards/rejected": 2.704801559448242, "step": 14310 }, { "epoch": 0.7944411311909683, "grad_norm": 63.889896392822266, "learning_rate": 1.0067496100622041e-08, "logits/chosen": -0.23658093810081482, "logits/rejected": -0.33566293120384216, "logps/chosen": -132.89378356933594, "logps/rejected": -160.03530883789062, "loss": 1.323, "nll_loss": 0.9023619890213013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8944854736328125, "rewards/margins": 2.4125888347625732, "rewards/rejected": 2.48189640045166, "step": 14320 }, { "epoch": 0.7949959085172189, "grad_norm": 64.3019790649414, "learning_rate": 1.0015113036671119e-08, "logits/chosen": -0.263753741979599, "logits/rejected": -0.37547627091407776, "logps/chosen": -170.27474975585938, "logps/rejected": -205.6151123046875, "loss": 1.2795, "nll_loss": 0.945264458656311, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.726595878601074, "rewards/margins": 3.4203312397003174, "rewards/rejected": 2.306265115737915, "step": 14330 }, { "epoch": 0.7955506858434696, "grad_norm": 67.1368179321289, "learning_rate": 9.962851435988056e-09, "logits/chosen": -0.3096924126148224, "logits/rejected": -0.4432447850704193, "logps/chosen": -152.05868530273438, "logps/rejected": -206.04177856445312, "loss": 1.2941, "nll_loss": 0.9117245674133301, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.722899913787842, "rewards/margins": 3.8970985412597656, "rewards/rejected": 1.8258016109466553, "step": 14340 }, { "epoch": 0.7961054631697203, "grad_norm": 51.44058609008789, "learning_rate": 9.910711457329479e-09, "logits/chosen": -0.35930752754211426, "logits/rejected": -0.5132125020027161, "logps/chosen": -147.76963806152344, "logps/rejected": -210.6107635498047, "loss": 1.256, "nll_loss": 0.9329828023910522, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.823217868804932, "rewards/margins": 3.7123653888702393, "rewards/rejected": 2.1108522415161133, "step": 14350 }, { "epoch": 0.796660240495971, "grad_norm": 75.3576431274414, "learning_rate": 9.8586932590825e-09, "logits/chosen": -0.3371516764163971, "logits/rejected": -0.4289192259311676, "logps/chosen": -177.5013885498047, "logps/rejected": -230.17385864257812, "loss": 1.2697, "nll_loss": 0.9882882833480835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.371665000915527, "rewards/margins": 3.8917853832244873, "rewards/rejected": 2.4798789024353027, "step": 14360 }, { "epoch": 0.7972150178222216, "grad_norm": 82.46305847167969, "learning_rate": 9.806796999264361e-09, "logits/chosen": -0.2205454409122467, "logits/rejected": -0.3871150612831116, "logps/chosen": -127.50831604003906, "logps/rejected": -173.83023071289062, "loss": 1.1455, "nll_loss": 0.7921987175941467, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.2621378898620605, "rewards/margins": 3.2860114574432373, "rewards/rejected": 1.9761260747909546, "step": 14370 }, { "epoch": 0.7977697951484722, "grad_norm": 68.38803100585938, "learning_rate": 9.755022835521843e-09, "logits/chosen": -0.2221817523241043, "logits/rejected": -0.4071694016456604, "logps/chosen": -130.88552856445312, "logps/rejected": -202.6993408203125, "loss": 1.1171, "nll_loss": 0.8255695104598999, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.904606819152832, "rewards/margins": 3.8406734466552734, "rewards/rejected": 2.0639336109161377, "step": 14380 }, { "epoch": 0.798324572474723, "grad_norm": 55.854034423828125, "learning_rate": 9.703370925130865e-09, "logits/chosen": -0.4654787480831146, "logits/rejected": -0.5669001340866089, "logps/chosen": -206.08377075195312, "logps/rejected": -264.7427673339844, "loss": 1.3264, "nll_loss": 1.115120530128479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 6.440249443054199, "rewards/margins": 2.270873546600342, "rewards/rejected": 4.169375896453857, "step": 14390 }, { "epoch": 0.7988793498009736, "grad_norm": 55.6180419921875, "learning_rate": 9.651841424995932e-09, "logits/chosen": -0.3281271755695343, "logits/rejected": -0.3929949104785919, "logps/chosen": -184.00228881835938, "logps/rejected": -233.5999755859375, "loss": 1.323, "nll_loss": 1.0436643362045288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.61121129989624, "rewards/margins": 4.3487677574157715, "rewards/rejected": 2.262443780899048, "step": 14400 }, { "epoch": 0.7994341271272243, "grad_norm": 74.48028564453125, "learning_rate": 9.600434491649745e-09, "logits/chosen": -0.31366071105003357, "logits/rejected": -0.4073302149772644, "logps/chosen": -182.0845184326172, "logps/rejected": -251.82730102539062, "loss": 1.2855, "nll_loss": 0.9698610305786133, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.493363857269287, "rewards/margins": 3.790088176727295, "rewards/rejected": 2.703275442123413, "step": 14410 }, { "epoch": 0.799988904453475, "grad_norm": 39.76643753051758, "learning_rate": 9.549150281252633e-09, "logits/chosen": -0.3544641137123108, "logits/rejected": -0.4706074595451355, "logps/chosen": -193.2956085205078, "logps/rejected": -253.80166625976562, "loss": 1.1707, "nll_loss": 0.987841010093689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 6.459817409515381, "rewards/margins": 4.018962860107422, "rewards/rejected": 2.440854549407959, "step": 14420 }, { "epoch": 0.8005436817797257, "grad_norm": 68.87491607666016, "learning_rate": 9.497988949592161e-09, "logits/chosen": -0.2607325613498688, "logits/rejected": -0.3864908218383789, "logps/chosen": -145.94537353515625, "logps/rejected": -185.57022094726562, "loss": 1.3224, "nll_loss": 0.9280544519424438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.35838508605957, "rewards/margins": 3.4789459705352783, "rewards/rejected": 1.879439115524292, "step": 14430 }, { "epoch": 0.8010984591059763, "grad_norm": 35.218536376953125, "learning_rate": 9.446950652082636e-09, "logits/chosen": -0.2851913571357727, "logits/rejected": -0.43879151344299316, "logps/chosen": -151.75906372070312, "logps/rejected": -198.59219360351562, "loss": 1.2342, "nll_loss": 0.8864370584487915, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.473379135131836, "rewards/margins": 4.034270286560059, "rewards/rejected": 1.4391090869903564, "step": 14440 }, { "epoch": 0.801653236432227, "grad_norm": 59.406585693359375, "learning_rate": 9.396035543764558e-09, "logits/chosen": -0.1484692394733429, "logits/rejected": -0.40019527077674866, "logps/chosen": -110.6164321899414, "logps/rejected": -167.54690551757812, "loss": 1.2621, "nll_loss": 0.7206388711929321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.720696449279785, "rewards/margins": 3.0079617500305176, "rewards/rejected": 1.7127354145050049, "step": 14450 }, { "epoch": 0.8022080137584777, "grad_norm": 98.18295288085938, "learning_rate": 9.345243779304285e-09, "logits/chosen": -0.39211538434028625, "logits/rejected": -0.5174371600151062, "logps/chosen": -179.41665649414062, "logps/rejected": -245.5238494873047, "loss": 1.2218, "nll_loss": 0.9766333699226379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.362366676330566, "rewards/margins": 3.984196186065674, "rewards/rejected": 2.3781704902648926, "step": 14460 }, { "epoch": 0.8027627910847284, "grad_norm": 43.433929443359375, "learning_rate": 9.294575512993408e-09, "logits/chosen": -0.3674396574497223, "logits/rejected": -0.47984933853149414, "logps/chosen": -171.6109619140625, "logps/rejected": -235.5369110107422, "loss": 1.2055, "nll_loss": 0.9847660064697266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.441687107086182, "rewards/margins": 4.69619083404541, "rewards/rejected": 1.7454957962036133, "step": 14470 }, { "epoch": 0.803317568410979, "grad_norm": 39.43125915527344, "learning_rate": 9.244030898748472e-09, "logits/chosen": -0.28768596053123474, "logits/rejected": -0.44666361808776855, "logps/chosen": -157.49334716796875, "logps/rejected": -208.1215362548828, "loss": 1.1838, "nll_loss": 0.869096577167511, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.3726806640625, "rewards/margins": 3.0028040409088135, "rewards/rejected": 2.3698766231536865, "step": 14480 }, { "epoch": 0.8038723457372298, "grad_norm": 49.01121520996094, "learning_rate": 9.193610090110304e-09, "logits/chosen": -0.2439526617527008, "logits/rejected": -0.43513980507850647, "logps/chosen": -172.5922393798828, "logps/rejected": -231.7100067138672, "loss": 1.1373, "nll_loss": 0.9176927804946899, "rewards/accuracies": 0.875, "rewards/chosen": 6.124392509460449, "rewards/margins": 4.6425018310546875, "rewards/rejected": 1.4818907976150513, "step": 14490 }, { "epoch": 0.8044271230634804, "grad_norm": 32.72774887084961, "learning_rate": 9.143313240243667e-09, "logits/chosen": -0.2891438901424408, "logits/rejected": -0.45568108558654785, "logps/chosen": -154.89642333984375, "logps/rejected": -236.8765411376953, "loss": 1.3321, "nll_loss": 0.9265453219413757, "rewards/accuracies": 0.75, "rewards/chosen": 5.802674293518066, "rewards/margins": 3.1932332515716553, "rewards/rejected": 2.609440565109253, "step": 14500 }, { "epoch": 0.8044271230634804, "eval_logits/chosen": -0.40663790702819824, "eval_logits/rejected": -0.520878255367279, "eval_logps/chosen": -190.1840057373047, "eval_logps/rejected": -261.1012268066406, "eval_loss": 1.2122180461883545, "eval_nll_loss": 0.9856801629066467, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.777721881866455, "eval_rewards/margins": 4.979991912841797, "eval_rewards/rejected": 1.7977294921875, "eval_runtime": 17.1334, "eval_samples_per_second": 14.942, "eval_steps_per_second": 1.868, "step": 14500 }, { "epoch": 0.804981900389731, "grad_norm": 49.28852844238281, "learning_rate": 9.093140501936813e-09, "logits/chosen": -0.30533546209335327, "logits/rejected": -0.4386633336544037, "logps/chosen": -161.4071502685547, "logps/rejected": -197.94219970703125, "loss": 1.2158, "nll_loss": 0.9138886332511902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.987029075622559, "rewards/margins": 3.8081512451171875, "rewards/rejected": 2.178877353668213, "step": 14510 }, { "epoch": 0.8055366777159817, "grad_norm": 71.35733032226562, "learning_rate": 9.043092027600901e-09, "logits/chosen": -0.35957610607147217, "logits/rejected": -0.5047857165336609, "logps/chosen": -147.0637969970703, "logps/rejected": -201.2028350830078, "loss": 1.2649, "nll_loss": 0.9333122372627258, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.66337776184082, "rewards/margins": 4.2670488357543945, "rewards/rejected": 1.3963292837142944, "step": 14520 }, { "epoch": 0.8060914550422325, "grad_norm": 73.6759033203125, "learning_rate": 8.993167969269716e-09, "logits/chosen": -0.43993091583251953, "logits/rejected": -0.536629319190979, "logps/chosen": -206.188720703125, "logps/rejected": -273.0359802246094, "loss": 1.2285, "nll_loss": 1.0613583326339722, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 7.028354644775391, "rewards/margins": 4.819065093994141, "rewards/rejected": 2.209289789199829, "step": 14530 }, { "epoch": 0.8066462323684831, "grad_norm": 78.81867980957031, "learning_rate": 8.943368478598989e-09, "logits/chosen": -0.46705374121665955, "logits/rejected": -0.5537182092666626, "logps/chosen": -191.93948364257812, "logps/rejected": -271.65008544921875, "loss": 1.2468, "nll_loss": 1.0935325622558594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 7.112006187438965, "rewards/margins": 4.076033115386963, "rewards/rejected": 3.035973072052002, "step": 14540 }, { "epoch": 0.8072010096947337, "grad_norm": 54.17938995361328, "learning_rate": 8.893693706866124e-09, "logits/chosen": -0.20283639430999756, "logits/rejected": -0.45410043001174927, "logps/chosen": -108.3819580078125, "logps/rejected": -176.9251251220703, "loss": 1.2593, "nll_loss": 0.7126230001449585, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.9742279052734375, "rewards/margins": 4.209775924682617, "rewards/rejected": 0.7644524574279785, "step": 14550 }, { "epoch": 0.8077557870209845, "grad_norm": 32.046836853027344, "learning_rate": 8.844143804969623e-09, "logits/chosen": -0.24128413200378418, "logits/rejected": -0.4615742564201355, "logps/chosen": -135.6195831298828, "logps/rejected": -219.5218505859375, "loss": 1.2179, "nll_loss": 0.837755560874939, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.290262699127197, "rewards/margins": 3.544344425201416, "rewards/rejected": 1.7459179162979126, "step": 14560 }, { "epoch": 0.8083105643472351, "grad_norm": 45.38715744018555, "learning_rate": 8.794718923428685e-09, "logits/chosen": -0.23258860409259796, "logits/rejected": -0.39432069659233093, "logps/chosen": -156.38214111328125, "logps/rejected": -210.4923553466797, "loss": 1.1779, "nll_loss": 0.9198177456855774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.690469264984131, "rewards/margins": 3.7175514698028564, "rewards/rejected": 1.9729175567626953, "step": 14570 }, { "epoch": 0.8088653416734858, "grad_norm": 62.882816314697266, "learning_rate": 8.745419212382738e-09, "logits/chosen": -0.1419857293367386, "logits/rejected": -0.3641354739665985, "logps/chosen": -128.32472229003906, "logps/rejected": -188.57131958007812, "loss": 1.213, "nll_loss": 0.7510377764701843, "rewards/accuracies": 0.875, "rewards/chosen": 5.057511806488037, "rewards/margins": 3.2589499950408936, "rewards/rejected": 1.7985626459121704, "step": 14580 }, { "epoch": 0.8094201189997364, "grad_norm": 100.9625015258789, "learning_rate": 8.696244821590948e-09, "logits/chosen": -0.31309443712234497, "logits/rejected": -0.4752843379974365, "logps/chosen": -187.08206176757812, "logps/rejected": -265.50408935546875, "loss": 1.3192, "nll_loss": 0.9962458610534668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.294480323791504, "rewards/margins": 5.051595211029053, "rewards/rejected": 1.2428849935531616, "step": 14590 }, { "epoch": 0.8099748963259872, "grad_norm": 94.37689208984375, "learning_rate": 8.64719590043183e-09, "logits/chosen": -0.19047455489635468, "logits/rejected": -0.37319186329841614, "logps/chosen": -131.11935424804688, "logps/rejected": -172.666259765625, "loss": 1.2171, "nll_loss": 0.8773058652877808, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.016050338745117, "rewards/margins": 2.4018607139587402, "rewards/rejected": 2.614189624786377, "step": 14600 }, { "epoch": 0.8105296736522378, "grad_norm": 24.281864166259766, "learning_rate": 8.598272597902706e-09, "logits/chosen": -0.18936872482299805, "logits/rejected": -0.3464515805244446, "logps/chosen": -151.7962646484375, "logps/rejected": -206.4762725830078, "loss": 1.25, "nll_loss": 0.8511675000190735, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.5301618576049805, "rewards/margins": 3.685591220855713, "rewards/rejected": 1.8445703983306885, "step": 14610 }, { "epoch": 0.8110844509784885, "grad_norm": 49.428993225097656, "learning_rate": 8.549475062619354e-09, "logits/chosen": -0.2897084355354309, "logits/rejected": -0.48308873176574707, "logps/chosen": -149.5684051513672, "logps/rejected": -236.5645751953125, "loss": 1.2051, "nll_loss": 0.8345033526420593, "rewards/accuracies": 0.875, "rewards/chosen": 5.602771282196045, "rewards/margins": 4.437640190124512, "rewards/rejected": 1.1651312112808228, "step": 14620 }, { "epoch": 0.8116392283047392, "grad_norm": 56.484107971191406, "learning_rate": 8.500803442815474e-09, "logits/chosen": -0.23302340507507324, "logits/rejected": -0.39650648832321167, "logps/chosen": -153.33529663085938, "logps/rejected": -210.3851776123047, "loss": 1.2246, "nll_loss": 0.9016008377075195, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.680493354797363, "rewards/margins": 4.056046962738037, "rewards/rejected": 1.624446153640747, "step": 14630 }, { "epoch": 0.8121940056309899, "grad_norm": 111.01287841796875, "learning_rate": 8.452257886342295e-09, "logits/chosen": -0.1292901188135147, "logits/rejected": -0.30237165093421936, "logps/chosen": -119.23050689697266, "logps/rejected": -178.7151336669922, "loss": 1.3316, "nll_loss": 1.0425434112548828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.750171661376953, "rewards/margins": 3.0412964820861816, "rewards/rejected": 1.708875298500061, "step": 14640 }, { "epoch": 0.8127487829572405, "grad_norm": 59.906715393066406, "learning_rate": 8.403838540668057e-09, "logits/chosen": -0.3006291687488556, "logits/rejected": -0.42718249559402466, "logps/chosen": -134.45458984375, "logps/rejected": -182.62513732910156, "loss": 1.2352, "nll_loss": 0.8543822169303894, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.532834529876709, "rewards/margins": 3.3276023864746094, "rewards/rejected": 2.2052321434020996, "step": 14650 }, { "epoch": 0.8133035602834913, "grad_norm": 71.21964263916016, "learning_rate": 8.355545552877658e-09, "logits/chosen": -0.45056334137916565, "logits/rejected": -0.5881573557853699, "logps/chosen": -176.95181274414062, "logps/rejected": -240.5482940673828, "loss": 1.2533, "nll_loss": 1.017821192741394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.567188262939453, "rewards/margins": 4.7127227783203125, "rewards/rejected": 1.8544645309448242, "step": 14660 }, { "epoch": 0.8138583376097419, "grad_norm": 52.82645797729492, "learning_rate": 8.307379069672099e-09, "logits/chosen": -0.37268659472465515, "logits/rejected": -0.5259225964546204, "logps/chosen": -173.6767578125, "logps/rejected": -238.0189666748047, "loss": 1.2661, "nll_loss": 0.9858657717704773, "rewards/accuracies": 0.875, "rewards/chosen": 6.159924507141113, "rewards/margins": 3.906090497970581, "rewards/rejected": 2.253833293914795, "step": 14670 }, { "epoch": 0.8144131149359926, "grad_norm": 73.35997772216797, "learning_rate": 8.259339237368134e-09, "logits/chosen": -0.3108692467212677, "logits/rejected": -0.4505864679813385, "logps/chosen": -141.55963134765625, "logps/rejected": -208.3193817138672, "loss": 1.2022, "nll_loss": 0.915246307849884, "rewards/accuracies": 0.875, "rewards/chosen": 5.435148239135742, "rewards/margins": 3.543468952178955, "rewards/rejected": 1.8916794061660767, "step": 14680 }, { "epoch": 0.8149678922622432, "grad_norm": 57.747432708740234, "learning_rate": 8.211426201897797e-09, "logits/chosen": -0.25365111231803894, "logits/rejected": -0.395337849855423, "logps/chosen": -165.84991455078125, "logps/rejected": -255.8370819091797, "loss": 1.1484, "nll_loss": 0.9351493120193481, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.533167839050293, "rewards/margins": 4.6734185218811035, "rewards/rejected": 1.8597490787506104, "step": 14690 }, { "epoch": 0.815522669588494, "grad_norm": 49.915531158447266, "learning_rate": 8.163640108807896e-09, "logits/chosen": -0.2948180139064789, "logits/rejected": -0.4152253568172455, "logps/chosen": -157.45738220214844, "logps/rejected": -204.9689178466797, "loss": 1.1782, "nll_loss": 0.904376208782196, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.626477241516113, "rewards/margins": 3.7865302562713623, "rewards/rejected": 1.8399465084075928, "step": 14700 }, { "epoch": 0.8160774469147446, "grad_norm": 61.01435089111328, "learning_rate": 8.115981103259678e-09, "logits/chosen": -0.20418615639209747, "logits/rejected": -0.3977503180503845, "logps/chosen": -155.60549926757812, "logps/rejected": -204.55967712402344, "loss": 1.2328, "nll_loss": 0.8879886865615845, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.589022636413574, "rewards/margins": 3.396660566329956, "rewards/rejected": 2.19236159324646, "step": 14710 }, { "epoch": 0.8166322242409952, "grad_norm": 32.84076690673828, "learning_rate": 8.068449330028282e-09, "logits/chosen": -0.3054724931716919, "logits/rejected": -0.45483383536338806, "logps/chosen": -165.53244018554688, "logps/rejected": -215.5263671875, "loss": 1.2858, "nll_loss": 1.0018367767333984, "rewards/accuracies": 0.875, "rewards/chosen": 5.649083614349365, "rewards/margins": 4.386407852172852, "rewards/rejected": 1.2626762390136719, "step": 14720 }, { "epoch": 0.817187001567246, "grad_norm": 52.607566833496094, "learning_rate": 8.02104493350238e-09, "logits/chosen": -0.2687918543815613, "logits/rejected": -0.38726913928985596, "logps/chosen": -161.62387084960938, "logps/rejected": -228.53164672851562, "loss": 1.2302, "nll_loss": 1.0006484985351562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.204294204711914, "rewards/margins": 4.660628318786621, "rewards/rejected": 1.5436656475067139, "step": 14730 }, { "epoch": 0.8177417788934966, "grad_norm": 153.74290466308594, "learning_rate": 7.973768057683728e-09, "logits/chosen": -0.2631340026855469, "logits/rejected": -0.48361843824386597, "logps/chosen": -153.59364318847656, "logps/rejected": -218.3295135498047, "loss": 1.331, "nll_loss": 0.9026532173156738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.447627067565918, "rewards/margins": 4.096835136413574, "rewards/rejected": 1.3507912158966064, "step": 14740 }, { "epoch": 0.8182965562197473, "grad_norm": 29.862611770629883, "learning_rate": 7.926618846186645e-09, "logits/chosen": -0.44401612877845764, "logits/rejected": -0.5922880172729492, "logps/chosen": -186.78976440429688, "logps/rejected": -259.10577392578125, "loss": 1.2862, "nll_loss": 1.0834470987319946, "rewards/accuracies": 0.875, "rewards/chosen": 6.416709899902344, "rewards/margins": 4.0009002685546875, "rewards/rejected": 2.415809154510498, "step": 14750 }, { "epoch": 0.8188513335459979, "grad_norm": 49.80220031738281, "learning_rate": 7.879597442237712e-09, "logits/chosen": -0.22588615119457245, "logits/rejected": -0.37203675508499146, "logps/chosen": -145.67984008789062, "logps/rejected": -199.66519165039062, "loss": 1.244, "nll_loss": 0.867226243019104, "rewards/accuracies": 0.875, "rewards/chosen": 5.510145664215088, "rewards/margins": 3.3055522441864014, "rewards/rejected": 2.204594135284424, "step": 14760 }, { "epoch": 0.8194061108722487, "grad_norm": 45.6759033203125, "learning_rate": 7.832703988675194e-09, "logits/chosen": -0.2180647850036621, "logits/rejected": -0.4713711738586426, "logps/chosen": -137.48190307617188, "logps/rejected": -186.2786865234375, "loss": 1.1119, "nll_loss": 0.7633231282234192, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.3695783615112305, "rewards/margins": 3.9644503593444824, "rewards/rejected": 1.4051278829574585, "step": 14770 }, { "epoch": 0.8199608881984993, "grad_norm": 70.81529235839844, "learning_rate": 7.785938627948757e-09, "logits/chosen": -0.29169461131095886, "logits/rejected": -0.4414834976196289, "logps/chosen": -140.72491455078125, "logps/rejected": -196.05258178710938, "loss": 1.1999, "nll_loss": 0.9778728485107422, "rewards/accuracies": 0.875, "rewards/chosen": 5.375916957855225, "rewards/margins": 3.624210834503174, "rewards/rejected": 1.7517064809799194, "step": 14780 }, { "epoch": 0.82051566552475, "grad_norm": 72.59806060791016, "learning_rate": 7.739301502118884e-09, "logits/chosen": -0.2551764249801636, "logits/rejected": -0.41377201676368713, "logps/chosen": -139.05166625976562, "logps/rejected": -197.09945678710938, "loss": 1.2325, "nll_loss": 0.8698671460151672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.665285110473633, "rewards/margins": 3.6509602069854736, "rewards/rejected": 2.014324903488159, "step": 14790 }, { "epoch": 0.8210704428510007, "grad_norm": 68.07139587402344, "learning_rate": 7.692792752856563e-09, "logits/chosen": -0.2664044499397278, "logits/rejected": -0.403535932302475, "logps/chosen": -150.14035034179688, "logps/rejected": -195.5835418701172, "loss": 1.19, "nll_loss": 0.85888671875, "rewards/accuracies": 0.875, "rewards/chosen": 5.65519905090332, "rewards/margins": 3.977385997772217, "rewards/rejected": 1.6778132915496826, "step": 14800 }, { "epoch": 0.8216252201772514, "grad_norm": 66.05326843261719, "learning_rate": 7.646412521442775e-09, "logits/chosen": -0.275273859500885, "logits/rejected": -0.41081708669662476, "logps/chosen": -160.2446746826172, "logps/rejected": -218.191162109375, "loss": 1.2886, "nll_loss": 0.8886381983757019, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.96896505355835, "rewards/margins": 4.09000301361084, "rewards/rejected": 1.878962755203247, "step": 14810 }, { "epoch": 0.822179997503502, "grad_norm": 56.362491607666016, "learning_rate": 7.600160948768119e-09, "logits/chosen": -0.42797571420669556, "logits/rejected": -0.5637251138687134, "logps/chosen": -196.26580810546875, "logps/rejected": -264.8769226074219, "loss": 1.2708, "nll_loss": 1.0535155534744263, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.409218788146973, "rewards/margins": 4.376527309417725, "rewards/rejected": 2.032691240310669, "step": 14820 }, { "epoch": 0.8227347748297527, "grad_norm": 78.95390319824219, "learning_rate": 7.554038175332372e-09, "logits/chosen": -0.19425630569458008, "logits/rejected": -0.4434455931186676, "logps/chosen": -122.21415710449219, "logps/rejected": -181.78126525878906, "loss": 1.1581, "nll_loss": 0.7200466990470886, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.086915493011475, "rewards/margins": 4.251137733459473, "rewards/rejected": 0.8357783555984497, "step": 14830 }, { "epoch": 0.8232895521560034, "grad_norm": 90.60800170898438, "learning_rate": 7.508044341244014e-09, "logits/chosen": -0.35558730363845825, "logits/rejected": -0.4984716773033142, "logps/chosen": -177.00253295898438, "logps/rejected": -234.00106811523438, "loss": 1.2573, "nll_loss": 0.8866392374038696, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 6.090183258056641, "rewards/margins": 4.173531532287598, "rewards/rejected": 1.916651725769043, "step": 14840 }, { "epoch": 0.8238443294822541, "grad_norm": 92.40534973144531, "learning_rate": 7.462179586219896e-09, "logits/chosen": -0.26048916578292847, "logits/rejected": -0.44145363569259644, "logps/chosen": -142.84750366210938, "logps/rejected": -187.72445678710938, "loss": 1.298, "nll_loss": 0.8607121706008911, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.610686302185059, "rewards/margins": 3.019132375717163, "rewards/rejected": 2.5915539264678955, "step": 14850 }, { "epoch": 0.8243991068085047, "grad_norm": 40.92693328857422, "learning_rate": 7.416444049584713e-09, "logits/chosen": -0.37518757581710815, "logits/rejected": -0.5347386598587036, "logps/chosen": -136.802978515625, "logps/rejected": -192.44764709472656, "loss": 1.1889, "nll_loss": 0.9797961115837097, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 5.848901271820068, "rewards/margins": 3.5814437866210938, "rewards/rejected": 2.2674574851989746, "step": 14860 }, { "epoch": 0.8249538841347555, "grad_norm": 43.93336868286133, "learning_rate": 7.370837870270657e-09, "logits/chosen": -0.19765231013298035, "logits/rejected": -0.31762415170669556, "logps/chosen": -167.2465057373047, "logps/rejected": -211.02896118164062, "loss": 1.2521, "nll_loss": 0.9774270057678223, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.4940571784973145, "rewards/margins": 2.8652596473693848, "rewards/rejected": 2.6287970542907715, "step": 14870 }, { "epoch": 0.8255086614610061, "grad_norm": 77.28921508789062, "learning_rate": 7.325361186816958e-09, "logits/chosen": -0.3661887049674988, "logits/rejected": -0.47705668210983276, "logps/chosen": -183.70973205566406, "logps/rejected": -248.5480499267578, "loss": 1.3064, "nll_loss": 0.9975088238716125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 6.510737419128418, "rewards/margins": 3.8452117443084717, "rewards/rejected": 2.665524959564209, "step": 14880 }, { "epoch": 0.8260634387872567, "grad_norm": 53.095848083496094, "learning_rate": 7.2800141373695e-09, "logits/chosen": -0.4615301489830017, "logits/rejected": -0.5704389810562134, "logps/chosen": -202.3080596923828, "logps/rejected": -265.6336975097656, "loss": 1.2323, "nll_loss": 1.1122227907180786, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 6.346624374389648, "rewards/margins": 3.7509427070617676, "rewards/rejected": 2.595681667327881, "step": 14890 }, { "epoch": 0.8266182161135074, "grad_norm": 59.551517486572266, "learning_rate": 7.234796859680309e-09, "logits/chosen": -0.18715207278728485, "logits/rejected": -0.43487709760665894, "logps/chosen": -134.09478759765625, "logps/rejected": -210.234130859375, "loss": 1.2, "nll_loss": 0.8074381947517395, "rewards/accuracies": 0.875, "rewards/chosen": 4.857832431793213, "rewards/margins": 4.174450874328613, "rewards/rejected": 0.6833813786506653, "step": 14900 }, { "epoch": 0.8271729934397581, "grad_norm": 53.82889175415039, "learning_rate": 7.189709491107271e-09, "logits/chosen": -0.18827755749225616, "logits/rejected": -0.3777596056461334, "logps/chosen": -146.95791625976562, "logps/rejected": -196.8974609375, "loss": 1.2265, "nll_loss": 0.8681344985961914, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.362422943115234, "rewards/margins": 3.6606945991516113, "rewards/rejected": 1.7017284631729126, "step": 14910 }, { "epoch": 0.8277277707660088, "grad_norm": 59.87836837768555, "learning_rate": 7.1447521686136045e-09, "logits/chosen": -0.22889253497123718, "logits/rejected": -0.4294372498989105, "logps/chosen": -171.40213012695312, "logps/rejected": -246.50717163085938, "loss": 1.1909, "nll_loss": 0.8252578973770142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.834525108337402, "rewards/margins": 3.9994735717773438, "rewards/rejected": 1.8350521326065063, "step": 14920 }, { "epoch": 0.8282825480922594, "grad_norm": 49.997886657714844, "learning_rate": 7.099925028767484e-09, "logits/chosen": -0.20189666748046875, "logits/rejected": -0.34705278277397156, "logps/chosen": -142.7606658935547, "logps/rejected": -208.4573211669922, "loss": 1.1941, "nll_loss": 0.8327042460441589, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.526029109954834, "rewards/margins": 3.9469552040100098, "rewards/rejected": 1.5790737867355347, "step": 14930 }, { "epoch": 0.8288373254185102, "grad_norm": 36.117759704589844, "learning_rate": 7.055228207741648e-09, "logits/chosen": -0.24866139888763428, "logits/rejected": -0.41052132844924927, "logps/chosen": -156.0734405517578, "logps/rejected": -208.0565643310547, "loss": 1.142, "nll_loss": 0.8504625558853149, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.5310258865356445, "rewards/margins": 3.632554531097412, "rewards/rejected": 1.8984712362289429, "step": 14940 }, { "epoch": 0.8293921027447608, "grad_norm": 78.30645751953125, "learning_rate": 7.010661841312921e-09, "logits/chosen": -0.3344518840312958, "logits/rejected": -0.42739883065223694, "logps/chosen": -174.8288116455078, "logps/rejected": -229.3480224609375, "loss": 1.2936, "nll_loss": 1.0034263134002686, "rewards/accuracies": 0.875, "rewards/chosen": 6.538097381591797, "rewards/margins": 4.158072471618652, "rewards/rejected": 2.3800246715545654, "step": 14950 }, { "epoch": 0.8299468800710115, "grad_norm": 78.57463073730469, "learning_rate": 6.96622606486188e-09, "logits/chosen": -0.22606225311756134, "logits/rejected": -0.4104226529598236, "logps/chosen": -128.64364624023438, "logps/rejected": -194.7679901123047, "loss": 1.179, "nll_loss": 0.7998219728469849, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.099642276763916, "rewards/margins": 2.9102585315704346, "rewards/rejected": 2.189384698867798, "step": 14960 }, { "epoch": 0.8305016573972622, "grad_norm": 42.729522705078125, "learning_rate": 6.921921013372401e-09, "logits/chosen": -0.09530682861804962, "logits/rejected": -0.24463346600532532, "logps/chosen": -139.29718017578125, "logps/rejected": -214.3907470703125, "loss": 1.2336, "nll_loss": 0.835990309715271, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.2614030838012695, "rewards/margins": 3.730445384979248, "rewards/rejected": 1.5309584140777588, "step": 14970 }, { "epoch": 0.8310564347235129, "grad_norm": 80.23871612548828, "learning_rate": 6.877746821431218e-09, "logits/chosen": -0.33773642778396606, "logits/rejected": -0.47674092650413513, "logps/chosen": -158.9118194580078, "logps/rejected": -242.2969207763672, "loss": 1.2778, "nll_loss": 0.994554877281189, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 6.165432453155518, "rewards/margins": 4.255608081817627, "rewards/rejected": 1.909824013710022, "step": 14980 }, { "epoch": 0.8316112120497635, "grad_norm": 67.4317398071289, "learning_rate": 6.833703623227599e-09, "logits/chosen": -0.39678817987442017, "logits/rejected": -0.5164974927902222, "logps/chosen": -187.6385955810547, "logps/rejected": -282.72918701171875, "loss": 1.2539, "nll_loss": 1.0270280838012695, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 6.897919654846191, "rewards/margins": 5.087853908538818, "rewards/rejected": 1.810065507888794, "step": 14990 }, { "epoch": 0.8321659893760142, "grad_norm": 78.31485748291016, "learning_rate": 6.789791552552837e-09, "logits/chosen": -0.3030152916908264, "logits/rejected": -0.42236360907554626, "logps/chosen": -158.3644561767578, "logps/rejected": -209.4776153564453, "loss": 1.2471, "nll_loss": 0.9546947479248047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 5.328968524932861, "rewards/margins": 3.3644192218780518, "rewards/rejected": 1.9645494222640991, "step": 15000 }, { "epoch": 0.8321659893760142, "eval_logits/chosen": -0.40984421968460083, "eval_logits/rejected": -0.5251399874687195, "eval_logps/chosen": -190.08694458007812, "eval_logps/rejected": -261.3680419921875, "eval_loss": 1.215467929840088, "eval_nll_loss": 0.9851780533790588, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 6.787428379058838, "eval_rewards/margins": 5.016385078430176, "eval_rewards/rejected": 1.7710434198379517, "eval_runtime": 17.1295, "eval_samples_per_second": 14.945, "eval_steps_per_second": 1.868, "step": 15000 } ], "logging_steps": 10, "max_steps": 18025, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }