{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6155917425310937, "eval_steps": 20, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004615976407231696, "grad_norm": 92.4314081607968, "learning_rate": 1.1494252873563218e-08, "logits/chosen": -1.3403388261795044, "logits/rejected": -1.3443610668182373, "logps/chosen": -48.98606872558594, "logps/rejected": -52.890384674072266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.009231952814463392, "grad_norm": 114.27394093227154, "learning_rate": 2.2988505747126436e-08, "logits/chosen": -1.3453574180603027, "logits/rejected": -1.3622318506240845, "logps/chosen": -39.51582336425781, "logps/rejected": -55.267478942871094, "loss": 0.6984, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.0065580871887505054, "rewards/margins": -0.004221578594297171, "rewards/rejected": -0.002336508594453335, "step": 4 }, { "epoch": 0.01384792922169509, "grad_norm": 150.47548740563838, "learning_rate": 3.448275862068965e-08, "logits/chosen": -1.3104500770568848, "logits/rejected": -1.3256760835647583, "logps/chosen": -46.711997985839844, "logps/rejected": -61.08738327026367, "loss": 0.7077, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.015531142242252827, "rewards/margins": -0.022088024765253067, "rewards/rejected": 0.006556881591677666, "step": 6 }, { "epoch": 0.018463905628926785, "grad_norm": 127.64708744370179, "learning_rate": 4.597701149425287e-08, "logits/chosen": -1.3497395515441895, "logits/rejected": -1.3723570108413696, "logps/chosen": -50.4114875793457, "logps/rejected": -67.92998504638672, "loss": 0.7139, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.04585569351911545, "rewards/margins": -0.033931903541088104, "rewards/rejected": 0.07978759706020355, "step": 8 }, { "epoch": 0.023079882036158482, "grad_norm": 104.28603257129862, "learning_rate": 5.747126436781609e-08, "logits/chosen": -1.3193544149398804, "logits/rejected": -1.3253015279769897, "logps/chosen": -48.21293258666992, "logps/rejected": -55.63939666748047, "loss": 0.6759, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.09531965851783752, "rewards/margins": 0.03943055123090744, "rewards/rejected": 0.055889103561639786, "step": 10 }, { "epoch": 0.02769585844339018, "grad_norm": 122.92866276312793, "learning_rate": 6.89655172413793e-08, "logits/chosen": -1.356438159942627, "logits/rejected": -1.3778866529464722, "logps/chosen": -47.38197326660156, "logps/rejected": -62.85205841064453, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.021549424156546593, "rewards/margins": -0.00793980248272419, "rewards/rejected": 0.029489226639270782, "step": 12 }, { "epoch": 0.032311834850621876, "grad_norm": 141.4940238366865, "learning_rate": 8.045977011494252e-08, "logits/chosen": -1.2811020612716675, "logits/rejected": -1.3018286228179932, "logps/chosen": -53.21059799194336, "logps/rejected": -68.97090148925781, "loss": 0.6877, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.0111711286008358, "rewards/margins": 0.018631484359502792, "rewards/rejected": -0.029802612960338593, "step": 14 }, { "epoch": 0.03692781125785357, "grad_norm": 108.13024807755075, "learning_rate": 9.195402298850574e-08, "logits/chosen": -1.3367334604263306, "logits/rejected": -1.3522446155548096, "logps/chosen": -40.02373504638672, "logps/rejected": -54.60912322998047, "loss": 0.702, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.010513358749449253, "rewards/margins": -0.013667477294802666, "rewards/rejected": 0.024180836975574493, "step": 16 }, { "epoch": 0.04154378766508527, "grad_norm": 90.94991906906556, "learning_rate": 1.0344827586206897e-07, "logits/chosen": -1.2363929748535156, "logits/rejected": -1.2396348714828491, "logps/chosen": -56.863731384277344, "logps/rejected": -53.349342346191406, "loss": 0.6938, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.07457832247018814, "rewards/margins": 0.004799458663910627, "rewards/rejected": 0.06977886706590652, "step": 18 }, { "epoch": 0.046159764072316964, "grad_norm": 166.09887872402663, "learning_rate": 1.1494252873563217e-07, "logits/chosen": -1.2872830629348755, "logits/rejected": -1.317036747932434, "logps/chosen": -49.23244857788086, "logps/rejected": -71.62715911865234, "loss": 0.7087, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.02608451619744301, "rewards/margins": -0.02054634317755699, "rewards/rejected": 0.046630859375, "step": 20 }, { "epoch": 0.046159764072316964, "eval_logits/chosen": -1.262330412864685, "eval_logits/rejected": -1.273974895477295, "eval_logps/chosen": -48.664798736572266, "eval_logps/rejected": -56.1088752746582, "eval_loss": 0.6998714804649353, "eval_rewards/accuracies": 0.4228110611438751, "eval_rewards/chosen": -0.01568525843322277, "eval_rewards/margins": -0.007799813989549875, "eval_rewards/rejected": -0.007885444909334183, "eval_runtime": 231.8501, "eval_samples_per_second": 7.479, "eval_steps_per_second": 1.872, "step": 20 }, { "epoch": 0.05077574047954866, "grad_norm": 141.4312370714434, "learning_rate": 1.2643678160919542e-07, "logits/chosen": -1.328560709953308, "logits/rejected": -1.3631365299224854, "logps/chosen": -50.79507827758789, "logps/rejected": -79.58642578125, "loss": 0.6838, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.04927082732319832, "rewards/margins": 0.02934443950653076, "rewards/rejected": 0.019926389679312706, "step": 22 }, { "epoch": 0.05539171688678036, "grad_norm": 106.70546600458962, "learning_rate": 1.379310344827586e-07, "logits/chosen": -1.335903286933899, "logits/rejected": -1.3434231281280518, "logps/chosen": -59.29114532470703, "logps/rejected": -62.07218933105469, "loss": 0.6961, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.00791182741522789, "rewards/margins": -0.0008803076343610883, "rewards/rejected": 0.008792135864496231, "step": 24 }, { "epoch": 0.06000769329401205, "grad_norm": 84.5108710571048, "learning_rate": 1.4942528735632184e-07, "logits/chosen": -1.315495491027832, "logits/rejected": -1.314201831817627, "logps/chosen": -52.26453399658203, "logps/rejected": -46.654151916503906, "loss": 0.6879, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.10577751696109772, "rewards/margins": 0.01635124906897545, "rewards/rejected": 0.08942626416683197, "step": 26 }, { "epoch": 0.06462366970124375, "grad_norm": 142.3451166395631, "learning_rate": 1.6091954022988505e-07, "logits/chosen": -1.3120254278182983, "logits/rejected": -1.3418428897857666, "logps/chosen": -54.30976486206055, "logps/rejected": -76.96250915527344, "loss": 0.6796, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.14193940162658691, "rewards/margins": 0.03795723244547844, "rewards/rejected": 0.10398217290639877, "step": 28 }, { "epoch": 0.06923964610847544, "grad_norm": 85.45105537711353, "learning_rate": 1.7241379310344828e-07, "logits/chosen": -1.339949131011963, "logits/rejected": -1.35366952419281, "logps/chosen": -47.45890808105469, "logps/rejected": -56.32393264770508, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": 0.12981660664081573, "rewards/margins": 0.022233910858631134, "rewards/rejected": 0.1075827032327652, "step": 30 }, { "epoch": 0.07385562251570714, "grad_norm": 87.30098240848938, "learning_rate": 1.839080459770115e-07, "logits/chosen": -1.3168249130249023, "logits/rejected": -1.3266100883483887, "logps/chosen": -48.27603530883789, "logps/rejected": -54.10696792602539, "loss": 0.6693, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.2039007544517517, "rewards/margins": 0.05477040261030197, "rewards/rejected": 0.14913035929203033, "step": 32 }, { "epoch": 0.07847159892293884, "grad_norm": 81.90822350206311, "learning_rate": 1.9540229885057472e-07, "logits/chosen": -1.3052334785461426, "logits/rejected": -1.3256360292434692, "logps/chosen": -44.5953483581543, "logps/rejected": -61.29960250854492, "loss": 0.674, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.29883071780204773, "rewards/margins": 0.04845905303955078, "rewards/rejected": 0.25037166476249695, "step": 34 }, { "epoch": 0.08308757533017054, "grad_norm": 97.62737031971352, "learning_rate": 2.0689655172413793e-07, "logits/chosen": -1.3107632398605347, "logits/rejected": -1.3140422105789185, "logps/chosen": -51.13896179199219, "logps/rejected": -48.718727111816406, "loss": 0.6724, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.2992264926433563, "rewards/margins": 0.05020047724246979, "rewards/rejected": 0.24902603030204773, "step": 36 }, { "epoch": 0.08770355173740223, "grad_norm": 91.70503719282425, "learning_rate": 2.1839080459770114e-07, "logits/chosen": -1.258926510810852, "logits/rejected": -1.2699991464614868, "logps/chosen": -50.59396743774414, "logps/rejected": -56.2684326171875, "loss": 0.6819, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.33299052715301514, "rewards/margins": 0.038206882774829865, "rewards/rejected": 0.29478365182876587, "step": 38 }, { "epoch": 0.09231952814463393, "grad_norm": 92.44581774007628, "learning_rate": 2.2988505747126435e-07, "logits/chosen": -1.3053722381591797, "logits/rejected": -1.316298007965088, "logps/chosen": -52.49648666381836, "logps/rejected": -56.09816360473633, "loss": 0.6665, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.38449087738990784, "rewards/margins": 0.07676863670349121, "rewards/rejected": 0.30772221088409424, "step": 40 }, { "epoch": 0.09231952814463393, "eval_logits/chosen": -1.2573766708374023, "eval_logits/rejected": -1.269149899482727, "eval_logps/chosen": -47.605804443359375, "eval_logps/rejected": -55.2476806640625, "eval_loss": 0.6606337428092957, "eval_rewards/accuracies": 0.5725806355476379, "eval_rewards/chosen": 0.5138096213340759, "eval_rewards/margins": 0.09109989553689957, "eval_rewards/rejected": 0.42270979285240173, "eval_runtime": 227.103, "eval_samples_per_second": 7.635, "eval_steps_per_second": 1.911, "step": 40 }, { "epoch": 0.09693550455186563, "grad_norm": 83.51334071319128, "learning_rate": 2.413793103448276e-07, "logits/chosen": -1.3424174785614014, "logits/rejected": -1.3526452779769897, "logps/chosen": -44.44143295288086, "logps/rejected": -49.79873275756836, "loss": 0.6451, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.5364899039268494, "rewards/margins": 0.11971965432167053, "rewards/rejected": 0.4167703092098236, "step": 42 }, { "epoch": 0.10155148095909731, "grad_norm": 97.54181471785779, "learning_rate": 2.5287356321839084e-07, "logits/chosen": -1.307891845703125, "logits/rejected": -1.3404256105422974, "logps/chosen": -50.57395935058594, "logps/rejected": -79.52447509765625, "loss": 0.6618, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.5593165159225464, "rewards/margins": 0.12356305122375488, "rewards/rejected": 0.4357534646987915, "step": 44 }, { "epoch": 0.10616745736632902, "grad_norm": 87.61515534490086, "learning_rate": 2.64367816091954e-07, "logits/chosen": -1.2781007289886475, "logits/rejected": -1.2948905229568481, "logps/chosen": -55.20244216918945, "logps/rejected": -57.26641845703125, "loss": 0.656, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.5990750193595886, "rewards/margins": 0.11623137444257736, "rewards/rejected": 0.4828437268733978, "step": 46 }, { "epoch": 0.11078343377356072, "grad_norm": 89.38653936134894, "learning_rate": 2.758620689655172e-07, "logits/chosen": -1.2956469058990479, "logits/rejected": -1.3047106266021729, "logps/chosen": -48.0570068359375, "logps/rejected": -54.5909423828125, "loss": 0.6482, "rewards/accuracies": 0.625, "rewards/chosen": 0.6316156983375549, "rewards/margins": 0.15523308515548706, "rewards/rejected": 0.47638261318206787, "step": 48 }, { "epoch": 0.1153994101807924, "grad_norm": 80.50515017031447, "learning_rate": 2.873563218390804e-07, "logits/chosen": -1.3206638097763062, "logits/rejected": -1.328073501586914, "logps/chosen": -51.81482696533203, "logps/rejected": -51.81681442260742, "loss": 0.6387, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.7185304164886475, "rewards/margins": 0.1627696007490158, "rewards/rejected": 0.5557608008384705, "step": 50 }, { "epoch": 0.1200153865880241, "grad_norm": 81.34880289712139, "learning_rate": 2.988505747126437e-07, "logits/chosen": -1.3584266901016235, "logits/rejected": -1.3828377723693848, "logps/chosen": -44.66862869262695, "logps/rejected": -66.46587371826172, "loss": 0.6441, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.8013312816619873, "rewards/margins": 0.21720875799655914, "rewards/rejected": 0.5841224789619446, "step": 52 }, { "epoch": 0.1246313629952558, "grad_norm": 77.01376791928342, "learning_rate": 3.103448275862069e-07, "logits/chosen": -1.4568628072738647, "logits/rejected": -1.4819716215133667, "logps/chosen": -47.80024719238281, "logps/rejected": -64.31399536132812, "loss": 0.6113, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.7414547204971313, "rewards/margins": 0.2719371020793915, "rewards/rejected": 0.4695175588130951, "step": 54 }, { "epoch": 0.1292473394024875, "grad_norm": 72.36047181825025, "learning_rate": 3.218390804597701e-07, "logits/chosen": -1.2487589120864868, "logits/rejected": -1.250648021697998, "logps/chosen": -44.55437469482422, "logps/rejected": -49.30759811401367, "loss": 0.6265, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.8203690052032471, "rewards/margins": 0.1877826303243637, "rewards/rejected": 0.6325862407684326, "step": 56 }, { "epoch": 0.1338633158097192, "grad_norm": 82.28322419724, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.3345168828964233, "logits/rejected": -1.3384166955947876, "logps/chosen": -46.937095642089844, "logps/rejected": -48.4022216796875, "loss": 0.5824, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.8726701736450195, "rewards/margins": 0.2923206090927124, "rewards/rejected": 0.5803494453430176, "step": 58 }, { "epoch": 0.13847929221695088, "grad_norm": 71.91030915826812, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -1.326303243637085, "logits/rejected": -1.3416494131088257, "logps/chosen": -45.597774505615234, "logps/rejected": -55.49925994873047, "loss": 0.5933, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.8803545236587524, "rewards/margins": 0.32467857003211975, "rewards/rejected": 0.5556759834289551, "step": 60 }, { "epoch": 0.13847929221695088, "eval_logits/chosen": -1.2449160814285278, "eval_logits/rejected": -1.2571001052856445, "eval_logps/chosen": -46.695552825927734, "eval_logps/rejected": -54.76215362548828, "eval_loss": 0.6056556105613708, "eval_rewards/accuracies": 0.6440092325210571, "eval_rewards/chosen": 0.9689397215843201, "eval_rewards/margins": 0.30346596240997314, "eval_rewards/rejected": 0.6654736995697021, "eval_runtime": 226.9584, "eval_samples_per_second": 7.64, "eval_steps_per_second": 1.912, "step": 60 }, { "epoch": 0.1430952686241826, "grad_norm": 73.53474176254309, "learning_rate": 3.5632183908045977e-07, "logits/chosen": -1.302392840385437, "logits/rejected": -1.3110175132751465, "logps/chosen": -47.1639289855957, "logps/rejected": -52.83234786987305, "loss": 0.6038, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.9415748119354248, "rewards/margins": 0.2688363194465637, "rewards/rejected": 0.6727384924888611, "step": 62 }, { "epoch": 0.14771124503141428, "grad_norm": 72.82764174123524, "learning_rate": 3.67816091954023e-07, "logits/chosen": -1.3600918054580688, "logits/rejected": -1.3759901523590088, "logps/chosen": -48.68782043457031, "logps/rejected": -59.76481246948242, "loss": 0.5893, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.036276936531067, "rewards/margins": 0.393592894077301, "rewards/rejected": 0.6426840424537659, "step": 64 }, { "epoch": 0.152327221438646, "grad_norm": 72.4502448635078, "learning_rate": 3.793103448275862e-07, "logits/chosen": -1.254841923713684, "logits/rejected": -1.2786000967025757, "logps/chosen": -45.10692596435547, "logps/rejected": -67.49703216552734, "loss": 0.5441, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 1.195844292640686, "rewards/margins": 0.5360373854637146, "rewards/rejected": 0.6598069667816162, "step": 66 }, { "epoch": 0.15694319784587768, "grad_norm": 137.30984216926402, "learning_rate": 3.9080459770114945e-07, "logits/chosen": -1.3896088600158691, "logits/rejected": -1.4264814853668213, "logps/chosen": -41.05742645263672, "logps/rejected": -65.78131103515625, "loss": 0.6531, "rewards/accuracies": 0.625, "rewards/chosen": 1.2528166770935059, "rewards/margins": 0.4981537461280823, "rewards/rejected": 0.7546629905700684, "step": 68 }, { "epoch": 0.16155917425310937, "grad_norm": 72.56751313040455, "learning_rate": 4.0229885057471266e-07, "logits/chosen": -1.3160932064056396, "logits/rejected": -1.3216156959533691, "logps/chosen": -42.512996673583984, "logps/rejected": -46.83217239379883, "loss": 0.5056, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 1.5907378196716309, "rewards/margins": 0.5790587067604065, "rewards/rejected": 1.0116791725158691, "step": 70 }, { "epoch": 0.16617515066034108, "grad_norm": 78.13005318225228, "learning_rate": 4.1379310344827586e-07, "logits/chosen": -1.2581322193145752, "logits/rejected": -1.2822985649108887, "logps/chosen": -44.267303466796875, "logps/rejected": -71.07489013671875, "loss": 0.5435, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 1.4014304876327515, "rewards/margins": 0.723700761795044, "rewards/rejected": 0.6777297258377075, "step": 72 }, { "epoch": 0.17079112706757277, "grad_norm": 56.428824716251846, "learning_rate": 4.25287356321839e-07, "logits/chosen": -1.2850017547607422, "logits/rejected": -1.2932665348052979, "logps/chosen": -52.70983123779297, "logps/rejected": -53.61183166503906, "loss": 0.6011, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 1.2770977020263672, "rewards/margins": 0.5247661471366882, "rewards/rejected": 0.7523314952850342, "step": 74 }, { "epoch": 0.17540710347480445, "grad_norm": 80.88401808498448, "learning_rate": 4.367816091954023e-07, "logits/chosen": -1.2536025047302246, "logits/rejected": -1.2705798149108887, "logps/chosen": -47.25906753540039, "logps/rejected": -60.58863067626953, "loss": 0.5302, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 1.4430739879608154, "rewards/margins": 0.5440190434455872, "rewards/rejected": 0.8990550637245178, "step": 76 }, { "epoch": 0.18002307988203617, "grad_norm": 68.76597330947016, "learning_rate": 4.482758620689655e-07, "logits/chosen": -1.2450529336929321, "logits/rejected": -1.2557368278503418, "logps/chosen": -42.16405487060547, "logps/rejected": -52.293785095214844, "loss": 0.5623, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 1.489140510559082, "rewards/margins": 0.561518669128418, "rewards/rejected": 0.9276217222213745, "step": 78 }, { "epoch": 0.18463905628926786, "grad_norm": 74.75348151064274, "learning_rate": 4.597701149425287e-07, "logits/chosen": -1.3210065364837646, "logits/rejected": -1.3294503688812256, "logps/chosen": -42.845787048339844, "logps/rejected": -46.24819564819336, "loss": 0.5108, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 1.1906384229660034, "rewards/margins": 0.6210441589355469, "rewards/rejected": 0.5695942640304565, "step": 80 }, { "epoch": 0.18463905628926786, "eval_logits/chosen": -1.226657509803772, "eval_logits/rejected": -1.2391773462295532, "eval_logps/chosen": -46.65249252319336, "eval_logps/rejected": -55.351402282714844, "eval_loss": 0.5300613045692444, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.9904682636260986, "eval_rewards/margins": 0.6196123957633972, "eval_rewards/rejected": 0.3708558976650238, "eval_runtime": 227.2212, "eval_samples_per_second": 7.631, "eval_steps_per_second": 1.91, "step": 80 }, { "epoch": 0.18925503269649954, "grad_norm": 60.96010208828231, "learning_rate": 4.712643678160919e-07, "logits/chosen": -1.2890104055404663, "logits/rejected": -1.3069978952407837, "logps/chosen": -50.1616096496582, "logps/rejected": -61.80992126464844, "loss": 0.485, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.9283716678619385, "rewards/margins": 0.7194375991821289, "rewards/rejected": 0.20893406867980957, "step": 82 }, { "epoch": 0.19387100910373126, "grad_norm": 58.75128856791501, "learning_rate": 4.827586206896552e-07, "logits/chosen": -1.282674789428711, "logits/rejected": -1.3157954216003418, "logps/chosen": -42.50400161743164, "logps/rejected": -75.54652404785156, "loss": 0.4729, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.7927578091621399, "rewards/margins": 1.0220049619674683, "rewards/rejected": -0.22924719750881195, "step": 84 }, { "epoch": 0.19848698551096294, "grad_norm": 69.45296124181107, "learning_rate": 4.942528735632184e-07, "logits/chosen": -1.2970733642578125, "logits/rejected": -1.3046212196350098, "logps/chosen": -48.21321487426758, "logps/rejected": -59.63574981689453, "loss": 0.4753, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.7275898456573486, "rewards/margins": 0.7579395771026611, "rewards/rejected": -0.030349718406796455, "step": 86 }, { "epoch": 0.20310296191819463, "grad_norm": 57.934663164568036, "learning_rate": 4.999979670146248e-07, "logits/chosen": -1.322100043296814, "logits/rejected": -1.333228588104248, "logps/chosen": -51.94272232055664, "logps/rejected": -62.809814453125, "loss": 0.4698, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.5794834494590759, "rewards/margins": 0.9340064525604248, "rewards/rejected": -0.3545229136943817, "step": 88 }, { "epoch": 0.20771893832542634, "grad_norm": 67.05839544195749, "learning_rate": 4.99981703330008e-07, "logits/chosen": -1.2547653913497925, "logits/rejected": -1.2667738199234009, "logps/chosen": -44.88441467285156, "logps/rejected": -55.610042572021484, "loss": 0.4928, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.672675371170044, "rewards/margins": 0.72029709815979, "rewards/rejected": -0.04762159287929535, "step": 90 }, { "epoch": 0.21233491473265803, "grad_norm": 51.75653250091916, "learning_rate": 4.99949177018813e-07, "logits/chosen": -1.3547184467315674, "logits/rejected": -1.3632615804672241, "logps/chosen": -41.517826080322266, "logps/rejected": -52.83651351928711, "loss": 0.3985, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.8576219081878662, "rewards/margins": 1.0123066902160645, "rewards/rejected": -0.15468484163284302, "step": 92 }, { "epoch": 0.21695089113988972, "grad_norm": 81.64528533284856, "learning_rate": 4.999003901970474e-07, "logits/chosen": -1.3031115531921387, "logits/rejected": -1.3127225637435913, "logps/chosen": -54.79065704345703, "logps/rejected": -54.33130645751953, "loss": 0.5699, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.6079578995704651, "rewards/margins": 0.6609423160552979, "rewards/rejected": -0.05298437178134918, "step": 94 }, { "epoch": 0.22156686754712143, "grad_norm": 50.92778720915042, "learning_rate": 4.998353460385512e-07, "logits/chosen": -1.2541792392730713, "logits/rejected": -1.2738375663757324, "logps/chosen": -45.45362091064453, "logps/rejected": -64.12794494628906, "loss": 0.4024, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.9434598088264465, "rewards/margins": 1.2422370910644531, "rewards/rejected": -0.29877743124961853, "step": 96 }, { "epoch": 0.22618284395435312, "grad_norm": 56.275721433856205, "learning_rate": 4.997540487747892e-07, "logits/chosen": -1.2653698921203613, "logits/rejected": -1.2802023887634277, "logps/chosen": -44.68564987182617, "logps/rejected": -66.80543518066406, "loss": 0.4761, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.8570264577865601, "rewards/margins": 1.2518484592437744, "rewards/rejected": -0.39482197165489197, "step": 98 }, { "epoch": 0.2307988203615848, "grad_norm": 52.70179467657195, "learning_rate": 4.996565036945769e-07, "logits/chosen": -1.2993725538253784, "logits/rejected": -1.3049986362457275, "logps/chosen": -50.757686614990234, "logps/rejected": -53.39494323730469, "loss": 0.474, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.6508947014808655, "rewards/margins": 0.9926181435585022, "rewards/rejected": -0.3417234420776367, "step": 100 }, { "epoch": 0.2307988203615848, "eval_logits/chosen": -1.2166502475738525, "eval_logits/rejected": -1.2277939319610596, "eval_logps/chosen": -46.99800109863281, "eval_logps/rejected": -56.576629638671875, "eval_loss": 0.45405343174934387, "eval_rewards/accuracies": 0.7511520981788635, "eval_rewards/chosen": 0.8177129626274109, "eval_rewards/margins": 1.0594747066497803, "eval_rewards/rejected": -0.24176181852817535, "eval_runtime": 227.4246, "eval_samples_per_second": 7.625, "eval_steps_per_second": 1.908, "step": 100 }, { "epoch": 0.23541479676881652, "grad_norm": 44.630871285034665, "learning_rate": 4.995427171437356e-07, "logits/chosen": -1.2710050344467163, "logits/rejected": -1.2925546169281006, "logps/chosen": -44.29911422729492, "logps/rejected": -63.83744430541992, "loss": 0.3944, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.7817994356155396, "rewards/margins": 1.3069223165512085, "rewards/rejected": -0.5251227617263794, "step": 102 }, { "epoch": 0.2400307731760482, "grad_norm": 63.36947457687719, "learning_rate": 4.994126965246796e-07, "logits/chosen": -1.281785488128662, "logits/rejected": -1.2921262979507446, "logps/chosen": -45.61968994140625, "logps/rejected": -57.57981872558594, "loss": 0.4314, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.837566077709198, "rewards/margins": 1.084651231765747, "rewards/rejected": -0.247085303068161, "step": 104 }, { "epoch": 0.24464674958327992, "grad_norm": 64.3165584011508, "learning_rate": 4.992664502959351e-07, "logits/chosen": -1.2655751705169678, "logits/rejected": -1.3007822036743164, "logps/chosen": -42.23821258544922, "logps/rejected": -85.14391326904297, "loss": 0.3478, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.9540318250656128, "rewards/margins": 1.9801336526870728, "rewards/rejected": -1.0261015892028809, "step": 106 }, { "epoch": 0.2492627259905116, "grad_norm": 68.33245668142573, "learning_rate": 4.991039879715898e-07, "logits/chosen": -1.2239530086517334, "logits/rejected": -1.2506436109542847, "logps/chosen": -48.302852630615234, "logps/rejected": -70.37635803222656, "loss": 0.4095, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 1.1005271673202515, "rewards/margins": 1.5407756567001343, "rewards/rejected": -0.4402484893798828, "step": 108 }, { "epoch": 0.2538787023977433, "grad_norm": 41.58256253343475, "learning_rate": 4.989253201206736e-07, "logits/chosen": -1.3317803144454956, "logits/rejected": -1.3334723711013794, "logps/chosen": -47.94260787963867, "logps/rejected": -48.03237533569336, "loss": 0.4524, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.9174912571907043, "rewards/margins": 1.0279741287231445, "rewards/rejected": -0.11048289388418198, "step": 110 }, { "epoch": 0.258494678804975, "grad_norm": 66.23818262108296, "learning_rate": 4.987304583664712e-07, "logits/chosen": -1.2193766832351685, "logits/rejected": -1.2288120985031128, "logps/chosen": -55.089717864990234, "logps/rejected": -61.21225357055664, "loss": 0.4449, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.7610146999359131, "rewards/margins": 1.0799916982650757, "rewards/rejected": -0.3189769983291626, "step": 112 }, { "epoch": 0.26311065521220667, "grad_norm": 58.36439834992655, "learning_rate": 4.985194153857662e-07, "logits/chosen": -1.3395094871520996, "logits/rejected": -1.3416942358016968, "logps/chosen": -43.976890563964844, "logps/rejected": -45.82760238647461, "loss": 0.4929, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.4750349223613739, "rewards/margins": 0.913371741771698, "rewards/rejected": -0.4383367896080017, "step": 114 }, { "epoch": 0.2677266316194384, "grad_norm": 41.441657797600875, "learning_rate": 4.982922049080163e-07, "logits/chosen": -1.3572431802749634, "logits/rejected": -1.3625115156173706, "logps/chosen": -42.45515441894531, "logps/rejected": -49.73173522949219, "loss": 0.3691, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.41610708832740784, "rewards/margins": 1.2800379991531372, "rewards/rejected": -0.8639309406280518, "step": 116 }, { "epoch": 0.2723426080266701, "grad_norm": 60.30435878433939, "learning_rate": 4.980488417144599e-07, "logits/chosen": -1.2863659858703613, "logits/rejected": -1.3199315071105957, "logps/chosen": -48.62416076660156, "logps/rejected": -85.73699188232422, "loss": 0.4597, "rewards/accuracies": 0.75, "rewards/chosen": 0.11831729859113693, "rewards/margins": 1.6947745084762573, "rewards/rejected": -1.5764573812484741, "step": 118 }, { "epoch": 0.27695858443390176, "grad_norm": 41.355652091388535, "learning_rate": 4.977893416371544e-07, "logits/chosen": -1.2884269952774048, "logits/rejected": -1.2976269721984863, "logps/chosen": -41.79518127441406, "logps/rejected": -54.925662994384766, "loss": 0.3826, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.0362311452627182, "rewards/margins": 1.4945807456970215, "rewards/rejected": -1.4583497047424316, "step": 120 }, { "epoch": 0.27695858443390176, "eval_logits/chosen": -1.207366704940796, "eval_logits/rejected": -1.2173371315002441, "eval_logps/chosen": -48.50833511352539, "eval_logps/rejected": -58.64011764526367, "eval_loss": 0.4093886911869049, "eval_rewards/accuracies": 0.7724654674530029, "eval_rewards/chosen": 0.06255079805850983, "eval_rewards/margins": 1.336057424545288, "eval_rewards/rejected": -1.27350652217865, "eval_runtime": 227.1307, "eval_samples_per_second": 7.634, "eval_steps_per_second": 1.911, "step": 120 }, { "epoch": 0.28157456084113347, "grad_norm": 57.25843545889309, "learning_rate": 4.975137215579469e-07, "logits/chosen": -1.1866450309753418, "logits/rejected": -1.186094045639038, "logps/chosen": -55.403018951416016, "logps/rejected": -51.23255920410156, "loss": 0.4266, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.05289599671959877, "rewards/margins": 1.1414172649383545, "rewards/rejected": -1.0885213613510132, "step": 122 }, { "epoch": 0.2861905372483652, "grad_norm": 39.62322228531182, "learning_rate": 4.972219994073755e-07, "logits/chosen": -1.18235182762146, "logits/rejected": -1.2118382453918457, "logps/chosen": -48.611637115478516, "logps/rejected": -79.41681671142578, "loss": 0.3892, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": -0.14360421895980835, "rewards/margins": 1.8774958848953247, "rewards/rejected": -2.0211000442504883, "step": 124 }, { "epoch": 0.2908065136555969, "grad_norm": 52.17921097346343, "learning_rate": 4.969141941635025e-07, "logits/chosen": -1.2253869771957397, "logits/rejected": -1.2435510158538818, "logps/chosen": -50.45375061035156, "logps/rejected": -69.55599212646484, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": -0.31797370314598083, "rewards/margins": 1.7364860773086548, "rewards/rejected": -2.054459571838379, "step": 126 }, { "epoch": 0.29542249006282856, "grad_norm": 70.38261738253976, "learning_rate": 4.965903258506806e-07, "logits/chosen": -1.1796499490737915, "logits/rejected": -1.1957367658615112, "logps/chosen": -49.15879440307617, "logps/rejected": -73.3575210571289, "loss": 0.3346, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.04160241410136223, "rewards/margins": 1.7600233554840088, "rewards/rejected": -1.7184207439422607, "step": 128 }, { "epoch": 0.30003846647006027, "grad_norm": 57.934452383223125, "learning_rate": 4.962504155382493e-07, "logits/chosen": -1.3256597518920898, "logits/rejected": -1.3279542922973633, "logps/chosen": -44.50282287597656, "logps/rejected": -48.98247146606445, "loss": 0.382, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.2423497587442398, "rewards/margins": 1.2484761476516724, "rewards/rejected": -1.0061264038085938, "step": 130 }, { "epoch": 0.304654442877292, "grad_norm": 58.71983276565311, "learning_rate": 4.958944853391652e-07, "logits/chosen": -1.1831316947937012, "logits/rejected": -1.1885439157485962, "logps/chosen": -45.726505279541016, "logps/rejected": -54.00067138671875, "loss": 0.4078, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.6056569814682007, "rewards/margins": 1.2579783201217651, "rewards/rejected": -0.6523212790489197, "step": 132 }, { "epoch": 0.30927041928452365, "grad_norm": 50.9411946225504, "learning_rate": 4.955225584085624e-07, "logits/chosen": -1.3628097772598267, "logits/rejected": -1.3736504316329956, "logps/chosen": -44.589229583740234, "logps/rejected": -60.06134796142578, "loss": 0.4245, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.8993210196495056, "rewards/margins": 1.6077146530151367, "rewards/rejected": -0.7083935737609863, "step": 134 }, { "epoch": 0.31388639569175536, "grad_norm": 53.494824083866554, "learning_rate": 4.951346589422467e-07, "logits/chosen": -1.2143707275390625, "logits/rejected": -1.2256660461425781, "logps/chosen": -44.73698425292969, "logps/rejected": -63.74873733520508, "loss": 0.4379, "rewards/accuracies": 0.75, "rewards/chosen": 1.0820810794830322, "rewards/margins": 1.6512118577957153, "rewards/rejected": -0.5691307187080383, "step": 136 }, { "epoch": 0.3185023720989871, "grad_norm": 87.56798544187119, "learning_rate": 4.94730812175122e-07, "logits/chosen": -1.338615894317627, "logits/rejected": -1.344118595123291, "logps/chosen": -45.98448181152344, "logps/rejected": -49.50959396362305, "loss": 0.4706, "rewards/accuracies": 0.6805555820465088, "rewards/chosen": 1.05098295211792, "rewards/margins": 1.33613121509552, "rewards/rejected": -0.2851482033729553, "step": 138 }, { "epoch": 0.32311834850621873, "grad_norm": 68.21005063884613, "learning_rate": 4.943110443795476e-07, "logits/chosen": -1.258334755897522, "logits/rejected": -1.2637797594070435, "logps/chosen": -50.91215515136719, "logps/rejected": -52.46604537963867, "loss": 0.4759, "rewards/accuracies": 0.75, "rewards/chosen": 0.7942720055580139, "rewards/margins": 1.164951205253601, "rewards/rejected": -0.37067916989326477, "step": 140 }, { "epoch": 0.32311834850621873, "eval_logits/chosen": -1.1890074014663696, "eval_logits/rejected": -1.1990076303482056, "eval_logps/chosen": -47.355777740478516, "eval_logps/rejected": -58.06831359863281, "eval_loss": 0.38173291087150574, "eval_rewards/accuracies": 0.7718893885612488, "eval_rewards/chosen": 0.6388264298439026, "eval_rewards/margins": 1.6264294385910034, "eval_rewards/rejected": -0.987602949142456, "eval_runtime": 227.2808, "eval_samples_per_second": 7.629, "eval_steps_per_second": 1.91, "step": 140 }, { "epoch": 0.32773432491345045, "grad_norm": 50.18122842606284, "learning_rate": 4.938753828636297e-07, "logits/chosen": -1.2129461765289307, "logits/rejected": -1.21940279006958, "logps/chosen": -53.97553253173828, "logps/rejected": -53.66658020019531, "loss": 0.4623, "rewards/accuracies": 0.6805555820465088, "rewards/chosen": 0.7083945274353027, "rewards/margins": 1.2968156337738037, "rewards/rejected": -0.5884211659431458, "step": 142 }, { "epoch": 0.33235030132068216, "grad_norm": 50.59660648914643, "learning_rate": 4.934238559694447e-07, "logits/chosen": -1.1950477361679077, "logits/rejected": -1.2141423225402832, "logps/chosen": -47.11637878417969, "logps/rejected": -65.34754943847656, "loss": 0.3506, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.5280824899673462, "rewards/margins": 1.7753053903579712, "rewards/rejected": -1.2472230195999146, "step": 144 }, { "epoch": 0.3369662777279138, "grad_norm": 48.072274874250304, "learning_rate": 4.929564930711957e-07, "logits/chosen": -1.281104564666748, "logits/rejected": -1.2873430252075195, "logps/chosen": -46.95094299316406, "logps/rejected": -52.25088882446289, "loss": 0.3785, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.3459606468677521, "rewards/margins": 1.2943564653396606, "rewards/rejected": -0.9483956098556519, "step": 146 }, { "epoch": 0.34158225413514554, "grad_norm": 45.84138607827678, "learning_rate": 4.924733245733008e-07, "logits/chosen": -1.168983817100525, "logits/rejected": -1.1675101518630981, "logps/chosen": -53.398841857910156, "logps/rejected": -48.5836181640625, "loss": 0.372, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.4610249102115631, "rewards/margins": 1.1595698595046997, "rewards/rejected": -0.6985449194908142, "step": 148 }, { "epoch": 0.34619823054237725, "grad_norm": 41.47855724733255, "learning_rate": 4.91974381908416e-07, "logits/chosen": -1.3004454374313354, "logits/rejected": -1.3239775896072388, "logps/chosen": -44.39426803588867, "logps/rejected": -69.07200622558594, "loss": 0.3055, "rewards/accuracies": 0.875, "rewards/chosen": 0.01821739226579666, "rewards/margins": 2.1426663398742676, "rewards/rejected": -2.124448776245117, "step": 150 }, { "epoch": 0.3508142069496089, "grad_norm": 38.94525175640219, "learning_rate": 4.914596975353898e-07, "logits/chosen": -1.263897180557251, "logits/rejected": -1.2764997482299805, "logps/chosen": -47.19563674926758, "logps/rejected": -57.78141403198242, "loss": 0.386, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.055071063339710236, "rewards/margins": 1.5383036136627197, "rewards/rejected": -1.4832323789596558, "step": 152 }, { "epoch": 0.3554301833568406, "grad_norm": 66.86847446450086, "learning_rate": 4.909293049371519e-07, "logits/chosen": -1.2072829008102417, "logits/rejected": -1.2103402614593506, "logps/chosen": -55.665191650390625, "logps/rejected": -53.18820571899414, "loss": 0.4106, "rewards/accuracies": 0.75, "rewards/chosen": 0.15528617799282074, "rewards/margins": 1.4208853244781494, "rewards/rejected": -1.2655991315841675, "step": 154 }, { "epoch": 0.36004615976407234, "grad_norm": 58.39718753271016, "learning_rate": 4.903832386185343e-07, "logits/chosen": -1.246012568473816, "logits/rejected": -1.2516732215881348, "logps/chosen": -52.729427337646484, "logps/rejected": -53.41749572753906, "loss": 0.4389, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.07690320909023285, "rewards/margins": 1.1573313474655151, "rewards/rejected": -1.080428123474121, "step": 156 }, { "epoch": 0.364662136171304, "grad_norm": 48.60983015034901, "learning_rate": 4.89821534104028e-07, "logits/chosen": -1.2771211862564087, "logits/rejected": -1.2919840812683105, "logps/chosen": -50.660606384277344, "logps/rejected": -68.34771728515625, "loss": 0.3141, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.26177337765693665, "rewards/margins": 2.4097442626953125, "rewards/rejected": -2.1479711532592773, "step": 158 }, { "epoch": 0.3692781125785357, "grad_norm": 53.41098297744381, "learning_rate": 4.892442279354698e-07, "logits/chosen": -1.2370442152023315, "logits/rejected": -1.2537869215011597, "logps/chosen": -52.96739959716797, "logps/rejected": -71.32413482666016, "loss": 0.3591, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.09003262966871262, "rewards/margins": 1.8999587297439575, "rewards/rejected": -1.989991545677185, "step": 160 }, { "epoch": 0.3692781125785357, "eval_logits/chosen": -1.1776655912399292, "eval_logits/rejected": -1.1867269277572632, "eval_logps/chosen": -48.62409973144531, "eval_logps/rejected": -59.5378303527832, "eval_loss": 0.3585492968559265, "eval_rewards/accuracies": 0.7908986210823059, "eval_rewards/chosen": 0.0046647959388792515, "eval_rewards/margins": 1.7270255088806152, "eval_rewards/rejected": -1.7223609685897827, "eval_runtime": 226.9778, "eval_samples_per_second": 7.64, "eval_steps_per_second": 1.912, "step": 160 }, { "epoch": 0.3738940889857674, "grad_norm": 48.30891742529218, "learning_rate": 4.886513576696673e-07, "logits/chosen": -1.2570397853851318, "logits/rejected": -1.2753015756607056, "logps/chosen": -50.679901123046875, "logps/rejected": -69.58779907226562, "loss": 0.3731, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.10189250111579895, "rewards/margins": 1.991517424583435, "rewards/rejected": -2.093410015106201, "step": 162 }, { "epoch": 0.3785100653929991, "grad_norm": 38.94107066608042, "learning_rate": 4.880429618759543e-07, "logits/chosen": -1.300181269645691, "logits/rejected": -1.309295892715454, "logps/chosen": -54.4401969909668, "logps/rejected": -57.44432830810547, "loss": 0.4199, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.0659228190779686, "rewards/margins": 1.5560578107833862, "rewards/rejected": -1.6219807863235474, "step": 164 }, { "epoch": 0.3831260418002308, "grad_norm": 42.007218912580726, "learning_rate": 4.874190801336817e-07, "logits/chosen": -1.2063199281692505, "logits/rejected": -1.2187817096710205, "logps/chosen": -52.59967041015625, "logps/rejected": -62.30234909057617, "loss": 0.3373, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.0022201803512871265, "rewards/margins": 1.9247175455093384, "rewards/rejected": -1.9269376993179321, "step": 166 }, { "epoch": 0.3877420182074625, "grad_norm": 41.8692578527763, "learning_rate": 4.867797530296431e-07, "logits/chosen": -1.2532522678375244, "logits/rejected": -1.2671799659729004, "logps/chosen": -54.77018737792969, "logps/rejected": -65.7540512084961, "loss": 0.2993, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.03232141211628914, "rewards/margins": 2.12778377532959, "rewards/rejected": -2.1601054668426514, "step": 168 }, { "epoch": 0.39235799461469417, "grad_norm": 34.82958012225073, "learning_rate": 4.861250221554343e-07, "logits/chosen": -1.208885669708252, "logits/rejected": -1.2256441116333008, "logps/chosen": -43.97615051269531, "logps/rejected": -71.41213989257812, "loss": 0.3062, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.07487620413303375, "rewards/margins": 2.4894187450408936, "rewards/rejected": -2.4145421981811523, "step": 170 }, { "epoch": 0.3969739710219259, "grad_norm": 31.38457100645333, "learning_rate": 4.854549301047476e-07, "logits/chosen": -1.232684850692749, "logits/rejected": -1.2298487424850464, "logps/chosen": -51.118751525878906, "logps/rejected": -52.634708404541016, "loss": 0.3429, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.21625421941280365, "rewards/margins": 1.5239174365997314, "rewards/rejected": -1.3076633214950562, "step": 172 }, { "epoch": 0.4015899474291576, "grad_norm": 51.50376706566093, "learning_rate": 4.847695204706005e-07, "logits/chosen": -1.26514732837677, "logits/rejected": -1.266494631767273, "logps/chosen": -46.0894889831543, "logps/rejected": -49.9418830871582, "loss": 0.3849, "rewards/accuracies": 0.75, "rewards/chosen": 0.47246673703193665, "rewards/margins": 1.3556917905807495, "rewards/rejected": -0.88322514295578, "step": 174 }, { "epoch": 0.40620592383638926, "grad_norm": 39.155946696884016, "learning_rate": 4.840688378425e-07, "logits/chosen": -1.1615080833435059, "logits/rejected": -1.1706922054290771, "logps/chosen": -54.5849609375, "logps/rejected": -67.2327651977539, "loss": 0.2596, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.5416942238807678, "rewards/margins": 2.2614989280700684, "rewards/rejected": -1.7198045253753662, "step": 176 }, { "epoch": 0.410821900243621, "grad_norm": 41.671444907690415, "learning_rate": 4.833529278035422e-07, "logits/chosen": -1.30013108253479, "logits/rejected": -1.3356281518936157, "logps/chosen": -45.55073547363281, "logps/rejected": -83.93589782714844, "loss": 0.3114, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.6670578718185425, "rewards/margins": 3.266463279724121, "rewards/rejected": -2.599405527114868, "step": 178 }, { "epoch": 0.4154378766508527, "grad_norm": 39.182516678430176, "learning_rate": 4.826218369274459e-07, "logits/chosen": -1.1979715824127197, "logits/rejected": -1.2180922031402588, "logps/chosen": -46.9954948425293, "logps/rejected": -76.03369140625, "loss": 0.3212, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.5715588927268982, "rewards/margins": 2.937516212463379, "rewards/rejected": -2.3659567832946777, "step": 180 }, { "epoch": 0.4154378766508527, "eval_logits/chosen": -1.169126272201538, "eval_logits/rejected": -1.1776955127716064, "eval_logps/chosen": -46.824092864990234, "eval_logps/rejected": -58.37122344970703, "eval_loss": 0.33698517084121704, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": 0.9046696424484253, "eval_rewards/margins": 2.043729782104492, "eval_rewards/rejected": -1.139060139656067, "eval_runtime": 227.2193, "eval_samples_per_second": 7.631, "eval_steps_per_second": 1.91, "step": 180 }, { "epoch": 0.42005385305808435, "grad_norm": 34.66268965474155, "learning_rate": 4.818756127755237e-07, "logits/chosen": -1.245609164237976, "logits/rejected": -1.2478203773498535, "logps/chosen": -44.18710708618164, "logps/rejected": -48.15581130981445, "loss": 0.2919, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 1.065934658050537, "rewards/margins": 1.712632656097412, "rewards/rejected": -0.6466982364654541, "step": 182 }, { "epoch": 0.42466982946531606, "grad_norm": 48.14030342071319, "learning_rate": 4.811143038935873e-07, "logits/chosen": -1.1818798780441284, "logits/rejected": -1.189144253730774, "logps/chosen": -51.0610237121582, "logps/rejected": -54.243797302246094, "loss": 0.3902, "rewards/accuracies": 0.75, "rewards/chosen": 1.165444254875183, "rewards/margins": 1.9199776649475098, "rewards/rejected": -0.7545332908630371, "step": 184 }, { "epoch": 0.4292858058725478, "grad_norm": 45.03104875996758, "learning_rate": 4.803379598087899e-07, "logits/chosen": -1.2626315355300903, "logits/rejected": -1.2647953033447266, "logps/chosen": -47.61143493652344, "logps/rejected": -46.91539764404297, "loss": 0.3381, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.8671760559082031, "rewards/margins": 1.6762652397155762, "rewards/rejected": -0.8090891242027283, "step": 186 }, { "epoch": 0.43390178227977944, "grad_norm": 63.031567731654626, "learning_rate": 4.795466310264034e-07, "logits/chosen": -1.18837308883667, "logits/rejected": -1.2106761932373047, "logps/chosen": -45.96682357788086, "logps/rejected": -75.37088012695312, "loss": 0.4522, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.41675278544425964, "rewards/margins": 2.4167346954345703, "rewards/rejected": -1.9999819993972778, "step": 188 }, { "epoch": 0.43851775868701115, "grad_norm": 30.590120071737818, "learning_rate": 4.787403690265335e-07, "logits/chosen": -1.2696727514266968, "logits/rejected": -1.2787107229232788, "logps/chosen": -46.919532775878906, "logps/rejected": -57.34629440307617, "loss": 0.3259, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6651458144187927, "rewards/margins": 2.023615837097168, "rewards/rejected": -1.358469843864441, "step": 190 }, { "epoch": 0.44313373509424286, "grad_norm": 65.00881113396572, "learning_rate": 4.779192262607702e-07, "logits/chosen": -1.1799297332763672, "logits/rejected": -1.1909632682800293, "logps/chosen": -51.90432357788086, "logps/rejected": -67.85346221923828, "loss": 0.3613, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.8239190578460693, "rewards/margins": 2.4376637935638428, "rewards/rejected": -1.6137449741363525, "step": 192 }, { "epoch": 0.4477497115014745, "grad_norm": 45.51586779707086, "learning_rate": 4.770832561487758e-07, "logits/chosen": -1.273619294166565, "logits/rejected": -1.2734426259994507, "logps/chosen": -51.00930404663086, "logps/rejected": -50.82846450805664, "loss": 0.2704, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.6313294172286987, "rewards/margins": 1.9725594520568848, "rewards/rejected": -1.3412299156188965, "step": 194 }, { "epoch": 0.45236568790870624, "grad_norm": 59.66597715979004, "learning_rate": 4.762325130748097e-07, "logits/chosen": -1.1289526224136353, "logits/rejected": -1.1318069696426392, "logps/chosen": -56.16923522949219, "logps/rejected": -53.92496109008789, "loss": 0.3301, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.5848253965377808, "rewards/margins": 1.7668564319610596, "rewards/rejected": -1.1820309162139893, "step": 196 }, { "epoch": 0.45698166431593795, "grad_norm": 27.39596009585691, "learning_rate": 4.7536705238418995e-07, "logits/chosen": -1.2408547401428223, "logits/rejected": -1.2465531826019287, "logps/chosen": -51.9420280456543, "logps/rejected": -59.4866943359375, "loss": 0.2815, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.4031212031841278, "rewards/margins": 2.25166654586792, "rewards/rejected": -1.8485453128814697, "step": 198 }, { "epoch": 0.4615976407231696, "grad_norm": 55.89572206023644, "learning_rate": 4.7448693037969336e-07, "logits/chosen": -1.221846103668213, "logits/rejected": -1.229733943939209, "logps/chosen": -51.18299102783203, "logps/rejected": -57.01997375488281, "loss": 0.3319, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.33577215671539307, "rewards/margins": 1.9459080696105957, "rewards/rejected": -1.6101359128952026, "step": 200 }, { "epoch": 0.4615976407231696, "eval_logits/chosen": -1.1536659002304077, "eval_logits/rejected": -1.1624053716659546, "eval_logps/chosen": -47.9106559753418, "eval_logps/rejected": -59.707550048828125, "eval_loss": 0.3186802566051483, "eval_rewards/accuracies": 0.8012672662734985, "eval_rewards/chosen": 0.3613872528076172, "eval_rewards/margins": 2.168609857559204, "eval_rewards/rejected": -1.807222604751587, "eval_runtime": 227.3404, "eval_samples_per_second": 7.627, "eval_steps_per_second": 1.909, "step": 200 }, { "epoch": 0.4662136171304013, "grad_norm": 47.53414859111433, "learning_rate": 4.735922043178923e-07, "logits/chosen": -1.130042552947998, "logits/rejected": -1.1414666175842285, "logps/chosen": -50.747615814208984, "logps/rejected": -67.29204559326172, "loss": 0.2861, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.3648765981197357, "rewards/margins": 2.223259449005127, "rewards/rejected": -1.8583827018737793, "step": 202 }, { "epoch": 0.47082959353763304, "grad_norm": 45.15212642774012, "learning_rate": 4.7268293240543017e-07, "logits/chosen": -1.2075278759002686, "logits/rejected": -1.2145761251449585, "logps/chosen": -50.80036163330078, "logps/rejected": -64.42232513427734, "loss": 0.3686, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.3706679344177246, "rewards/margins": 2.0169572830200195, "rewards/rejected": -1.646289348602295, "step": 204 }, { "epoch": 0.4754455699448647, "grad_norm": 51.8551798480754, "learning_rate": 4.717591737952344e-07, "logits/chosen": -1.225889801979065, "logits/rejected": -1.2406290769577026, "logps/chosen": -42.99085235595703, "logps/rejected": -63.13933563232422, "loss": 0.31, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.2732833921909332, "rewards/margins": 2.319049835205078, "rewards/rejected": -2.045766592025757, "step": 206 }, { "epoch": 0.4800615463520964, "grad_norm": 45.48745944142938, "learning_rate": 4.7082098858266837e-07, "logits/chosen": -1.2216678857803345, "logits/rejected": -1.2374674081802368, "logps/chosen": -39.35933303833008, "logps/rejected": -69.94754791259766, "loss": 0.3897, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.15072119235992432, "rewards/margins": 2.5561957359313965, "rewards/rejected": -2.4054746627807617, "step": 208 }, { "epoch": 0.4846775227593281, "grad_norm": 23.330292154413335, "learning_rate": 4.698684378016222e-07, "logits/chosen": -1.235012412071228, "logits/rejected": -1.2410335540771484, "logps/chosen": -51.16822052001953, "logps/rejected": -67.03107452392578, "loss": 0.2774, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5210573673248291, "rewards/margins": 2.396049976348877, "rewards/rejected": -1.8749926090240479, "step": 210 }, { "epoch": 0.48929349916655984, "grad_norm": 44.38100018119066, "learning_rate": 4.6890158342054174e-07, "logits/chosen": -1.2579890489578247, "logits/rejected": -1.2686585187911987, "logps/chosen": -43.69232177734375, "logps/rejected": -59.38545608520508, "loss": 0.3209, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.6999004483222961, "rewards/margins": 2.572566032409668, "rewards/rejected": -1.8726658821105957, "step": 212 }, { "epoch": 0.4939094755737915, "grad_norm": 48.05758145861009, "learning_rate": 4.679204883383973e-07, "logits/chosen": -1.291759729385376, "logits/rejected": -1.311813235282898, "logps/chosen": -42.26872253417969, "logps/rejected": -75.81320190429688, "loss": 0.2963, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.7384893894195557, "rewards/margins": 3.459284782409668, "rewards/rejected": -2.7207956314086914, "step": 214 }, { "epoch": 0.4985254519810232, "grad_norm": 49.231820908366814, "learning_rate": 4.669252163805919e-07, "logits/chosen": -1.2944141626358032, "logits/rejected": -1.3142738342285156, "logps/chosen": -45.93452453613281, "logps/rejected": -61.74488830566406, "loss": 0.3873, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.7554634809494019, "rewards/margins": 2.4260568618774414, "rewards/rejected": -1.6705933809280396, "step": 216 }, { "epoch": 0.5031414283882549, "grad_norm": 39.371230612811594, "learning_rate": 4.65915832294809e-07, "logits/chosen": -1.2367289066314697, "logits/rejected": -1.2460105419158936, "logps/chosen": -43.46434783935547, "logps/rejected": -65.94113159179688, "loss": 0.3123, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.8203165531158447, "rewards/margins": 2.6564347743988037, "rewards/rejected": -1.836118221282959, "step": 218 }, { "epoch": 0.5077574047954866, "grad_norm": 36.504230004493856, "learning_rate": 4.6489240174680026e-07, "logits/chosen": -1.274338722229004, "logits/rejected": -1.274038314819336, "logps/chosen": -46.72291564941406, "logps/rejected": -46.98860549926758, "loss": 0.3653, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.7758186459541321, "rewards/margins": 1.619003415107727, "rewards/rejected": -0.8431846499443054, "step": 220 }, { "epoch": 0.5077574047954866, "eval_logits/chosen": -1.1508095264434814, "eval_logits/rejected": -1.158846139907837, "eval_logps/chosen": -47.08699035644531, "eval_logps/rejected": -59.3014030456543, "eval_loss": 0.30850934982299805, "eval_rewards/accuracies": 0.8029953837394714, "eval_rewards/chosen": 0.7732176184654236, "eval_rewards/margins": 2.377366065979004, "eval_rewards/rejected": -1.6041483879089355, "eval_runtime": 227.3591, "eval_samples_per_second": 7.627, "eval_steps_per_second": 1.909, "step": 220 }, { "epoch": 0.5123733812027182, "grad_norm": 58.78605593626179, "learning_rate": 4.638549913161138e-07, "logits/chosen": -1.163451910018921, "logits/rejected": -1.1646764278411865, "logps/chosen": -52.2976188659668, "logps/rejected": -53.40987777709961, "loss": 0.2728, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.7431490421295166, "rewards/margins": 2.2729909420013428, "rewards/rejected": -1.5298418998718262, "step": 222 }, { "epoch": 0.51698935760995, "grad_norm": 47.481692611666446, "learning_rate": 4.6280366849176267e-07, "logits/chosen": -1.1689667701721191, "logits/rejected": -1.176836371421814, "logps/chosen": -48.97719192504883, "logps/rejected": -53.70207977294922, "loss": 0.3088, "rewards/accuracies": 0.875, "rewards/chosen": 0.4173290729522705, "rewards/margins": 1.9205392599105835, "rewards/rejected": -1.503210186958313, "step": 224 }, { "epoch": 0.5216053340171817, "grad_norm": 26.23879949976686, "learning_rate": 4.6173850166783446e-07, "logits/chosen": -1.1005306243896484, "logits/rejected": -1.1028097867965698, "logps/chosen": -48.85009002685547, "logps/rejected": -60.73183822631836, "loss": 0.2688, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.3439117670059204, "rewards/margins": 2.138643741607666, "rewards/rejected": -1.7947319746017456, "step": 226 }, { "epoch": 0.5262213104244133, "grad_norm": 42.68538113603284, "learning_rate": 4.606595601390417e-07, "logits/chosen": -1.1647553443908691, "logits/rejected": -1.1835236549377441, "logps/chosen": -47.58196258544922, "logps/rejected": -73.24917602539062, "loss": 0.2677, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.032037004828453064, "rewards/margins": 2.9576616287231445, "rewards/rejected": -2.9256248474121094, "step": 228 }, { "epoch": 0.5308372868316451, "grad_norm": 47.39085343776366, "learning_rate": 4.595669140962143e-07, "logits/chosen": -1.2832393646240234, "logits/rejected": -1.3168139457702637, "logps/chosen": -42.34735870361328, "logps/rejected": -91.76000213623047, "loss": 0.3269, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.22074511647224426, "rewards/margins": 3.9149208068847656, "rewards/rejected": -4.1356658935546875, "step": 230 }, { "epoch": 0.5354532632388768, "grad_norm": 34.48196631675489, "learning_rate": 4.5846063462173284e-07, "logits/chosen": -1.2154712677001953, "logits/rejected": -1.2207145690917969, "logps/chosen": -46.80333709716797, "logps/rejected": -61.56758499145508, "loss": 0.299, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.07934803515672684, "rewards/margins": 2.2728195190429688, "rewards/rejected": -2.1934714317321777, "step": 232 }, { "epoch": 0.5400692396461084, "grad_norm": 53.84175279608908, "learning_rate": 4.573407936849044e-07, "logits/chosen": -1.235826015472412, "logits/rejected": -1.2379093170166016, "logps/chosen": -53.666229248046875, "logps/rejected": -56.39564895629883, "loss": 0.34, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.15499740839004517, "rewards/margins": 2.093020439147949, "rewards/rejected": -1.9380230903625488, "step": 234 }, { "epoch": 0.5446852160533402, "grad_norm": 55.22566434624621, "learning_rate": 4.5620746413728063e-07, "logits/chosen": -1.168860912322998, "logits/rejected": -1.167116641998291, "logps/chosen": -59.95442199707031, "logps/rejected": -55.52897644042969, "loss": 0.2556, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.048602912575006485, "rewards/margins": 2.1055731773376465, "rewards/rejected": -2.0569701194763184, "step": 236 }, { "epoch": 0.5493011924605719, "grad_norm": 40.26544195874405, "learning_rate": 4.550607197079185e-07, "logits/chosen": -1.1609958410263062, "logits/rejected": -1.158744215965271, "logps/chosen": -46.247127532958984, "logps/rejected": -53.768096923828125, "loss": 0.2776, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5583436489105225, "rewards/margins": 1.9103565216064453, "rewards/rejected": -1.3520128726959229, "step": 238 }, { "epoch": 0.5539171688678035, "grad_norm": 22.296733757945297, "learning_rate": 4.5390063499858353e-07, "logits/chosen": -1.143466591835022, "logits/rejected": -1.1510720252990723, "logps/chosen": -56.236507415771484, "logps/rejected": -72.3839111328125, "loss": 0.2216, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.30002227425575256, "rewards/margins": 2.8752281665802, "rewards/rejected": -2.5752060413360596, "step": 240 }, { "epoch": 0.5539171688678035, "eval_logits/chosen": -1.1655231714248657, "eval_logits/rejected": -1.1714258193969727, "eval_logps/chosen": -47.62031173706055, "eval_logps/rejected": -60.05898666381836, "eval_loss": 0.2956756353378296, "eval_rewards/accuracies": 0.8093317747116089, "eval_rewards/chosen": 0.5065575242042542, "eval_rewards/margins": 2.4894962310791016, "eval_rewards/rejected": -1.9829388856887817, "eval_runtime": 227.4458, "eval_samples_per_second": 7.624, "eval_steps_per_second": 1.908, "step": 240 }, { "epoch": 0.5585331452750353, "grad_norm": 25.00341489978113, "learning_rate": 4.5272728547889687e-07, "logits/chosen": -1.2413771152496338, "logits/rejected": -1.2534655332565308, "logps/chosen": -51.89939880371094, "logps/rejected": -61.60024642944336, "loss": 0.2157, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.444425106048584, "rewards/margins": 3.039713144302368, "rewards/rejected": -2.595287799835205, "step": 242 }, { "epoch": 0.5631491216822669, "grad_norm": 59.288877946216886, "learning_rate": 4.5154074748142535e-07, "logits/chosen": -1.1979435682296753, "logits/rejected": -1.200371265411377, "logps/chosen": -51.88737106323242, "logps/rejected": -62.96666717529297, "loss": 0.3125, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.536859929561615, "rewards/margins": 2.3762218952178955, "rewards/rejected": -1.8393617868423462, "step": 244 }, { "epoch": 0.5677650980894986, "grad_norm": 49.81197105903562, "learning_rate": 4.503410981967158e-07, "logits/chosen": -1.1927361488342285, "logits/rejected": -1.2023940086364746, "logps/chosen": -43.52791213989258, "logps/rejected": -68.50804138183594, "loss": 0.3784, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.9149570465087891, "rewards/margins": 2.78161883354187, "rewards/rejected": -1.8666616678237915, "step": 246 }, { "epoch": 0.5723810744967304, "grad_norm": 36.729769896842384, "learning_rate": 4.4912841566827333e-07, "logits/chosen": -1.2108688354492188, "logits/rejected": -1.2207088470458984, "logps/chosen": -47.57191848754883, "logps/rejected": -65.24411010742188, "loss": 0.2484, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.3121000528335571, "rewards/margins": 2.9495744705200195, "rewards/rejected": -1.6374740600585938, "step": 248 }, { "epoch": 0.576997050903962, "grad_norm": 52.94413715920982, "learning_rate": 4.4790277878748415e-07, "logits/chosen": -1.2431470155715942, "logits/rejected": -1.2514811754226685, "logps/chosen": -41.430206298828125, "logps/rejected": -57.99823760986328, "loss": 0.2988, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.8287627696990967, "rewards/margins": 2.483140707015991, "rewards/rejected": -1.6543784141540527, "step": 250 }, { "epoch": 0.5816130273111938, "grad_norm": 35.54104658985613, "learning_rate": 4.466642672884835e-07, "logits/chosen": -1.1631712913513184, "logits/rejected": -1.168829321861267, "logps/chosen": -44.09839630126953, "logps/rejected": -58.30562973022461, "loss": 0.2578, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6901536583900452, "rewards/margins": 2.6523261070251465, "rewards/rejected": -1.9621726274490356, "step": 252 }, { "epoch": 0.5862290037184255, "grad_norm": 37.73217497274028, "learning_rate": 4.454129617429682e-07, "logits/chosen": -1.2659639120101929, "logits/rejected": -1.26548433303833, "logps/chosen": -48.81840515136719, "logps/rejected": -50.18553161621094, "loss": 0.303, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6821924448013306, "rewards/margins": 2.1100261211395264, "rewards/rejected": -1.4278337955474854, "step": 254 }, { "epoch": 0.5908449801256571, "grad_norm": 19.374735161669783, "learning_rate": 4.441489435549551e-07, "logits/chosen": -1.1399126052856445, "logits/rejected": -1.1459710597991943, "logps/chosen": -51.82578659057617, "logps/rejected": -68.86441040039062, "loss": 0.2382, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.7167325615882874, "rewards/margins": 3.1428277492523193, "rewards/rejected": -2.426095485687256, "step": 256 }, { "epoch": 0.5954609565328889, "grad_norm": 41.763567555359415, "learning_rate": 4.4287229495548573e-07, "logits/chosen": -1.183684229850769, "logits/rejected": -1.190173625946045, "logps/chosen": -53.23274230957031, "logps/rejected": -66.342041015625, "loss": 0.2552, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.7042552828788757, "rewards/margins": 3.324800729751587, "rewards/rejected": -2.6205453872680664, "step": 258 }, { "epoch": 0.6000769329401205, "grad_norm": 53.08605142258815, "learning_rate": 4.415830989972761e-07, "logits/chosen": -1.1716980934143066, "logits/rejected": -1.1756532192230225, "logps/chosen": -48.24959182739258, "logps/rejected": -57.04353713989258, "loss": 0.2986, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6846582889556885, "rewards/margins": 2.4726505279541016, "rewards/rejected": -1.7879924774169922, "step": 260 }, { "epoch": 0.6000769329401205, "eval_logits/chosen": -1.1839433908462524, "eval_logits/rejected": -1.1875815391540527, "eval_logps/chosen": -47.51332092285156, "eval_logps/rejected": -60.25726318359375, "eval_loss": 0.2889851927757263, "eval_rewards/accuracies": 0.8139401078224182, "eval_rewards/chosen": 0.5600550174713135, "eval_rewards/margins": 2.6421334743499756, "eval_rewards/rejected": -2.082078218460083, "eval_runtime": 227.3437, "eval_samples_per_second": 7.627, "eval_steps_per_second": 1.909, "step": 260 }, { "epoch": 0.6046929093473522, "grad_norm": 40.94616402060229, "learning_rate": 4.402814395493142e-07, "logits/chosen": -1.2763242721557617, "logits/rejected": -1.2830497026443481, "logps/chosen": -46.679168701171875, "logps/rejected": -46.02785873413086, "loss": 0.3389, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.5309950113296509, "rewards/margins": 1.931261658668518, "rewards/rejected": -1.4002668857574463, "step": 262 }, { "epoch": 0.609308885754584, "grad_norm": 37.45563925423882, "learning_rate": 4.3896740129140354e-07, "logits/chosen": -1.2274925708770752, "logits/rejected": -1.2251402139663696, "logps/chosen": -49.84899139404297, "logps/rejected": -48.70707702636719, "loss": 0.2565, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.42680761218070984, "rewards/margins": 2.4617788791656494, "rewards/rejected": -2.0349714756011963, "step": 264 }, { "epoch": 0.6139248621618156, "grad_norm": 39.43164361196429, "learning_rate": 4.3764106970865456e-07, "logits/chosen": -1.2649712562561035, "logits/rejected": -1.2683297395706177, "logps/chosen": -42.68367004394531, "logps/rejected": -59.22146224975586, "loss": 0.3223, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.032459404319524765, "rewards/margins": 2.20072603225708, "rewards/rejected": -2.168266534805298, "step": 266 }, { "epoch": 0.6185408385690473, "grad_norm": 38.60665005525895, "learning_rate": 4.3630253108592305e-07, "logits/chosen": -1.1689542531967163, "logits/rejected": -1.1707943677902222, "logps/chosen": -55.54972839355469, "logps/rejected": -63.90589141845703, "loss": 0.2454, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.198521688580513, "rewards/margins": 2.9068400859832764, "rewards/rejected": -3.1053614616394043, "step": 268 }, { "epoch": 0.6231568149762791, "grad_norm": 52.06719541190528, "learning_rate": 4.3495187250219723e-07, "logits/chosen": -1.2435555458068848, "logits/rejected": -1.247399926185608, "logps/chosen": -44.81391906738281, "logps/rejected": -66.15491485595703, "loss": 0.3261, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.1677350103855133, "rewards/margins": 2.8860220909118652, "rewards/rejected": -3.053757429122925, "step": 270 }, { "epoch": 0.6277727913835107, "grad_norm": 34.05944822234142, "learning_rate": 4.3358918182493253e-07, "logits/chosen": -1.1470799446105957, "logits/rejected": -1.146039366722107, "logps/chosen": -48.27168273925781, "logps/rejected": -55.25007629394531, "loss": 0.2124, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0416293665766716, "rewards/margins": 2.1943235397338867, "rewards/rejected": -2.1526942253112793, "step": 272 }, { "epoch": 0.6323887677907424, "grad_norm": 32.55880144659725, "learning_rate": 4.3221454770433554e-07, "logits/chosen": -1.2215373516082764, "logits/rejected": -1.223750352859497, "logps/chosen": -53.73216247558594, "logps/rejected": -58.15590286254883, "loss": 0.217, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.4631071090698242, "rewards/margins": 2.812567949295044, "rewards/rejected": -2.3494608402252197, "step": 274 }, { "epoch": 0.6370047441979741, "grad_norm": 37.29286642220413, "learning_rate": 4.308280595675966e-07, "logits/chosen": -1.2593313455581665, "logits/rejected": -1.2601191997528076, "logps/chosen": -51.24105453491211, "logps/rejected": -58.06007766723633, "loss": 0.305, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.41045814752578735, "rewards/margins": 2.3772544860839844, "rewards/rejected": -1.9667962789535522, "step": 276 }, { "epoch": 0.6416207206052058, "grad_norm": 27.73832644868097, "learning_rate": 4.2942980761307227e-07, "logits/chosen": -1.2309229373931885, "logits/rejected": -1.2335686683654785, "logps/chosen": -48.84478759765625, "logps/rejected": -58.93284225463867, "loss": 0.2202, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.5729148387908936, "rewards/margins": 2.5622036457061768, "rewards/rejected": -1.9892889261245728, "step": 278 }, { "epoch": 0.6462366970124375, "grad_norm": 49.846247584121116, "learning_rate": 4.2801988280441765e-07, "logits/chosen": -1.2041369676589966, "logits/rejected": -1.2069140672683716, "logps/chosen": -50.05432891845703, "logps/rejected": -59.74306106567383, "loss": 0.2356, "rewards/accuracies": 0.875, "rewards/chosen": 1.4567064046859741, "rewards/margins": 2.9142255783081055, "rewards/rejected": -1.4575190544128418, "step": 280 }, { "epoch": 0.6462366970124375, "eval_logits/chosen": -1.1684162616729736, "eval_logits/rejected": -1.1727546453475952, "eval_logps/chosen": -45.66713333129883, "eval_logps/rejected": -58.96503829956055, "eval_loss": 0.28808632493019104, "eval_rewards/accuracies": 0.807603657245636, "eval_rewards/chosen": 1.483147144317627, "eval_rewards/margins": 2.9191133975982666, "eval_rewards/rejected": -1.4359666109085083, "eval_runtime": 227.2327, "eval_samples_per_second": 7.631, "eval_steps_per_second": 1.91, "step": 280 }, { "epoch": 0.6508526734196692, "grad_norm": 37.200050161533866, "learning_rate": 4.2659837686466813e-07, "logits/chosen": -1.2258718013763428, "logits/rejected": -1.226365327835083, "logps/chosen": -45.65945816040039, "logps/rejected": -56.360023498535156, "loss": 0.2663, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 1.7170668840408325, "rewards/margins": 2.946417808532715, "rewards/rejected": -1.2293510437011719, "step": 282 }, { "epoch": 0.6554686498269009, "grad_norm": 46.91080670819706, "learning_rate": 4.25165382270273e-07, "logits/chosen": -1.194913387298584, "logits/rejected": -1.1949591636657715, "logps/chosen": -41.45118713378906, "logps/rejected": -49.03273391723633, "loss": 0.2576, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 1.5383753776550293, "rewards/margins": 2.4985907077789307, "rewards/rejected": -0.9602153301239014, "step": 284 }, { "epoch": 0.6600846262341326, "grad_norm": 38.28106252628991, "learning_rate": 4.2372099224507875e-07, "logits/chosen": -1.262522578239441, "logits/rejected": -1.2750235795974731, "logps/chosen": -38.96574401855469, "logps/rejected": -67.78875732421875, "loss": 0.2815, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 1.0857527256011963, "rewards/margins": 3.374871253967285, "rewards/rejected": -2.289118766784668, "step": 286 }, { "epoch": 0.6647006026413643, "grad_norm": 48.61343601009988, "learning_rate": 4.2226530075426503e-07, "logits/chosen": -1.1528538465499878, "logits/rejected": -1.1555874347686768, "logps/chosen": -56.5745849609375, "logps/rejected": -58.78268051147461, "loss": 0.2806, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.9941625595092773, "rewards/margins": 2.6789402961730957, "rewards/rejected": -1.6847774982452393, "step": 288 }, { "epoch": 0.669316579048596, "grad_norm": 41.895687140764245, "learning_rate": 4.2079840249823106e-07, "logits/chosen": -1.1860905885696411, "logits/rejected": -1.1894373893737793, "logps/chosen": -50.26545715332031, "logps/rejected": -72.03532409667969, "loss": 0.289, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.2916366159915924, "rewards/margins": 3.1359870433807373, "rewards/rejected": -2.8443503379821777, "step": 290 }, { "epoch": 0.6739325554558276, "grad_norm": 41.288114692753076, "learning_rate": 4.193203929064353e-07, "logits/chosen": -1.2005698680877686, "logits/rejected": -1.2083783149719238, "logps/chosen": -51.12635803222656, "logps/rejected": -71.51712036132812, "loss": 0.345, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.0697176456451416, "rewards/margins": 2.9638874530792236, "rewards/rejected": -2.894169807434082, "step": 292 }, { "epoch": 0.6785485318630594, "grad_norm": 42.3623221323418, "learning_rate": 4.1783136813118705e-07, "logits/chosen": -1.222592830657959, "logits/rejected": -1.225614309310913, "logps/chosen": -50.67860794067383, "logps/rejected": -61.38881301879883, "loss": 0.2915, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.12122215330600739, "rewards/margins": 2.681208610534668, "rewards/rejected": -2.8024303913116455, "step": 294 }, { "epoch": 0.6831645082702911, "grad_norm": 22.982390981962695, "learning_rate": 4.163314250413913e-07, "logits/chosen": -1.1687074899673462, "logits/rejected": -1.165205955505371, "logps/chosen": -46.08445739746094, "logps/rejected": -56.52994918823242, "loss": 0.1833, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.2055283784866333, "rewards/margins": 2.5608911514282227, "rewards/rejected": -2.355362892150879, "step": 296 }, { "epoch": 0.6877804846775227, "grad_norm": 37.84043310920863, "learning_rate": 4.1482066121624716e-07, "logits/chosen": -1.207397222518921, "logits/rejected": -1.208192229270935, "logps/chosen": -49.74457550048828, "logps/rejected": -50.62785339355469, "loss": 0.3247, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.17621606588363647, "rewards/margins": 2.258789539337158, "rewards/rejected": -2.082573413848877, "step": 298 }, { "epoch": 0.6923964610847545, "grad_norm": 41.026889158159065, "learning_rate": 4.1329917493889933e-07, "logits/chosen": -1.3157416582107544, "logits/rejected": -1.3190845251083374, "logps/chosen": -46.27326965332031, "logps/rejected": -61.60360336303711, "loss": 0.2407, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.021905170753598213, "rewards/margins": 2.830016613006592, "rewards/rejected": -2.8081114292144775, "step": 300 }, { "epoch": 0.6923964610847545, "eval_logits/chosen": -1.1676603555679321, "eval_logits/rejected": -1.1713868379592896, "eval_logps/chosen": -47.389183044433594, "eval_logps/rejected": -60.791683197021484, "eval_loss": 0.2743883430957794, "eval_rewards/accuracies": 0.8133640289306641, "eval_rewards/chosen": 0.6221204400062561, "eval_rewards/margins": 2.9714088439941406, "eval_rewards/rejected": -2.349287986755371, "eval_runtime": 227.521, "eval_samples_per_second": 7.621, "eval_steps_per_second": 1.908, "step": 300 }, { "epoch": 0.6970124374919862, "grad_norm": 42.14326526501706, "learning_rate": 4.117670651900446e-07, "logits/chosen": -1.2038425207138062, "logits/rejected": -1.2065904140472412, "logps/chosen": -50.9475212097168, "logps/rejected": -56.376590728759766, "loss": 0.3052, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.660882830619812, "rewards/margins": 2.4505326747894287, "rewards/rejected": -1.789649486541748, "step": 302 }, { "epoch": 0.7016284138992178, "grad_norm": 59.036735448430065, "learning_rate": 4.1022443164149237e-07, "logits/chosen": -1.1808403730392456, "logits/rejected": -1.1900469064712524, "logps/chosen": -51.46991729736328, "logps/rejected": -69.61980438232422, "loss": 0.2946, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.5994511842727661, "rewards/margins": 3.409363269805908, "rewards/rejected": -2.8099122047424316, "step": 304 }, { "epoch": 0.7062443903064496, "grad_norm": 29.330872231920097, "learning_rate": 4.086713746496808e-07, "logits/chosen": -1.2124900817871094, "logits/rejected": -1.2128535509109497, "logps/chosen": -45.35523223876953, "logps/rejected": -56.160545349121094, "loss": 0.3006, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.7718818187713623, "rewards/margins": 2.726104259490967, "rewards/rejected": -1.954222559928894, "step": 306 }, { "epoch": 0.7108603667136812, "grad_norm": 34.74292770737631, "learning_rate": 4.0710799524914805e-07, "logits/chosen": -1.1383283138275146, "logits/rejected": -1.1413955688476562, "logps/chosen": -55.99458312988281, "logps/rejected": -62.90273666381836, "loss": 0.2295, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.6053013205528259, "rewards/margins": 2.9940264225006104, "rewards/rejected": -2.3887250423431396, "step": 308 }, { "epoch": 0.7154763431209129, "grad_norm": 30.476243441323902, "learning_rate": 4.055343951459592e-07, "logits/chosen": -1.191731572151184, "logits/rejected": -1.1988056898117065, "logps/chosen": -44.046875, "logps/rejected": -64.41764068603516, "loss": 0.2429, "rewards/accuracies": 0.875, "rewards/chosen": 0.2551959455013275, "rewards/margins": 3.2936155796051025, "rewards/rejected": -3.038419485092163, "step": 310 }, { "epoch": 0.7200923195281447, "grad_norm": 30.700879501780552, "learning_rate": 4.0395067671108985e-07, "logits/chosen": -1.2448992729187012, "logits/rejected": -1.2440135478973389, "logps/chosen": -41.98812484741211, "logps/rejected": -49.34878921508789, "loss": 0.2697, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.3203623592853546, "rewards/margins": 2.6056876182556152, "rewards/rejected": -2.285325050354004, "step": 312 }, { "epoch": 0.7247082959353763, "grad_norm": 42.617870752566596, "learning_rate": 4.0235694297376637e-07, "logits/chosen": -1.139160394668579, "logits/rejected": -1.1415181159973145, "logps/chosen": -58.74495315551758, "logps/rejected": -63.99246597290039, "loss": 0.2614, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.32423093914985657, "rewards/margins": 2.9529130458831787, "rewards/rejected": -2.6286821365356445, "step": 314 }, { "epoch": 0.729324272342608, "grad_norm": 40.78600953952273, "learning_rate": 4.0075329761476347e-07, "logits/chosen": -1.216194748878479, "logits/rejected": -1.2174675464630127, "logps/chosen": -50.62156677246094, "logps/rejected": -53.62016296386719, "loss": 0.2407, "rewards/accuracies": 0.875, "rewards/chosen": -0.07387811690568924, "rewards/margins": 2.2588860988616943, "rewards/rejected": -2.332764148712158, "step": 316 }, { "epoch": 0.7339402487498398, "grad_norm": 30.002748027873594, "learning_rate": 3.991398449596588e-07, "logits/chosen": -1.2065101861953735, "logits/rejected": -1.211814045906067, "logps/chosen": -53.182777404785156, "logps/rejected": -66.31592559814453, "loss": 0.209, "rewards/accuracies": 0.875, "rewards/chosen": 0.0831962302327156, "rewards/margins": 3.359541177749634, "rewards/rejected": -3.2763442993164062, "step": 318 }, { "epoch": 0.7385562251570714, "grad_norm": 23.665196371870486, "learning_rate": 3.9751668997204647e-07, "logits/chosen": -1.1500531435012817, "logits/rejected": -1.154016375541687, "logps/chosen": -52.478214263916016, "logps/rejected": -61.09043884277344, "loss": 0.199, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.16100816428661346, "rewards/margins": 2.913641929626465, "rewards/rejected": -2.752634048461914, "step": 320 }, { "epoch": 0.7385562251570714, "eval_logits/chosen": -1.152096152305603, "eval_logits/rejected": -1.1563034057617188, "eval_logps/chosen": -48.12975311279297, "eval_logps/rejected": -61.73065948486328, "eval_loss": 0.2689039707183838, "eval_rewards/accuracies": 0.8185483813285828, "eval_rewards/chosen": 0.2518383860588074, "eval_rewards/margins": 3.0706160068511963, "eval_rewards/rejected": -2.818777322769165, "eval_runtime": 227.7908, "eval_samples_per_second": 7.612, "eval_steps_per_second": 1.905, "step": 320 }, { "epoch": 0.7431722015643031, "grad_norm": 48.30684834494479, "learning_rate": 3.958839382467084e-07, "logits/chosen": -1.222053050994873, "logits/rejected": -1.2289328575134277, "logps/chosen": -43.89524459838867, "logps/rejected": -56.49114990234375, "loss": 0.3092, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.43327876925468445, "rewards/margins": 2.904792308807373, "rewards/rejected": -2.471513032913208, "step": 322 }, { "epoch": 0.7477881779715349, "grad_norm": 42.66239436350315, "learning_rate": 3.9424169600274494e-07, "logits/chosen": -1.2450088262557983, "logits/rejected": -1.24375581741333, "logps/chosen": -50.182308197021484, "logps/rejected": -56.25563049316406, "loss": 0.2956, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.13703547418117523, "rewards/margins": 2.3541619777679443, "rewards/rejected": -2.4911975860595703, "step": 324 }, { "epoch": 0.7524041543787665, "grad_norm": 27.173798633882665, "learning_rate": 3.9259007007666436e-07, "logits/chosen": -1.212989330291748, "logits/rejected": -1.2185275554656982, "logps/chosen": -51.97760009765625, "logps/rejected": -63.76836395263672, "loss": 0.282, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.34300458431243896, "rewards/margins": 2.9810492992401123, "rewards/rejected": -2.638044595718384, "step": 326 }, { "epoch": 0.7570201307859982, "grad_norm": 41.510696079488966, "learning_rate": 3.909291679154332e-07, "logits/chosen": -1.2237902879714966, "logits/rejected": -1.232872486114502, "logps/chosen": -48.094669342041016, "logps/rejected": -73.24827575683594, "loss": 0.3194, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.10229718685150146, "rewards/margins": 3.730062246322632, "rewards/rejected": -3.832359790802002, "step": 328 }, { "epoch": 0.7616361071932299, "grad_norm": 36.28616141840599, "learning_rate": 3.892590975694858e-07, "logits/chosen": -1.2199351787567139, "logits/rejected": -1.2302178144454956, "logps/chosen": -44.97592544555664, "logps/rejected": -71.27304077148438, "loss": 0.23, "rewards/accuracies": 0.875, "rewards/chosen": 0.4918098449707031, "rewards/margins": 4.322786331176758, "rewards/rejected": -3.830976724624634, "step": 330 }, { "epoch": 0.7662520836004616, "grad_norm": 23.858921961912298, "learning_rate": 3.875799676856952e-07, "logits/chosen": -1.1334155797958374, "logits/rejected": -1.1350713968276978, "logps/chosen": -50.02524948120117, "logps/rejected": -62.23411560058594, "loss": 0.2071, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.42125022411346436, "rewards/margins": 3.270005226135254, "rewards/rejected": -2.8487555980682373, "step": 332 }, { "epoch": 0.7708680600076933, "grad_norm": 46.04858312353762, "learning_rate": 3.858918875003053e-07, "logits/chosen": -1.2362879514694214, "logits/rejected": -1.2460957765579224, "logps/chosen": -48.63706970214844, "logps/rejected": -72.94087219238281, "loss": 0.2514, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.5030243396759033, "rewards/margins": 4.145318031311035, "rewards/rejected": -3.64229416847229, "step": 334 }, { "epoch": 0.775484036414925, "grad_norm": 21.37867116532646, "learning_rate": 3.8419496683182396e-07, "logits/chosen": -1.1005958318710327, "logits/rejected": -1.1029105186462402, "logps/chosen": -46.87886047363281, "logps/rejected": -63.63945007324219, "loss": 0.1861, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.5341323018074036, "rewards/margins": 3.1560137271881104, "rewards/rejected": -2.6218814849853516, "step": 336 }, { "epoch": 0.7801000128221567, "grad_norm": 34.41230269099275, "learning_rate": 3.824893160738792e-07, "logits/chosen": -1.1643880605697632, "logits/rejected": -1.1722698211669922, "logps/chosen": -47.30472946166992, "logps/rejected": -66.55563354492188, "loss": 0.2727, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.9107217788696289, "rewards/margins": 3.5939722061157227, "rewards/rejected": -2.683250904083252, "step": 338 }, { "epoch": 0.7847159892293883, "grad_norm": 20.704541622004893, "learning_rate": 3.8077504618803737e-07, "logits/chosen": -1.1662912368774414, "logits/rejected": -1.1619421243667603, "logps/chosen": -56.10293197631836, "logps/rejected": -53.84746170043945, "loss": 0.2129, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.9300886988639832, "rewards/margins": 2.765589475631714, "rewards/rejected": -1.8355004787445068, "step": 340 }, { "epoch": 0.7847159892293883, "eval_logits/chosen": -1.1408087015151978, "eval_logits/rejected": -1.1458781957626343, "eval_logps/chosen": -47.36520004272461, "eval_logps/rejected": -61.242679595947266, "eval_loss": 0.26537051796913147, "eval_rewards/accuracies": 0.8185483813285828, "eval_rewards/chosen": 0.6341149806976318, "eval_rewards/margins": 3.208899736404419, "eval_rewards/rejected": -2.574784755706787, "eval_runtime": 227.4228, "eval_samples_per_second": 7.625, "eval_steps_per_second": 1.908, "step": 340 }, { "epoch": 0.7893319656366201, "grad_norm": 29.91569671776041, "learning_rate": 3.7905226869658446e-07, "logits/chosen": -1.1567282676696777, "logits/rejected": -1.1574435234069824, "logps/chosen": -50.19194793701172, "logps/rejected": -61.67422866821289, "loss": 0.2453, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.956231415271759, "rewards/margins": 3.3507864475250244, "rewards/rejected": -2.394554853439331, "step": 342 }, { "epoch": 0.7939479420438518, "grad_norm": 35.82510237615759, "learning_rate": 3.773210956752709e-07, "logits/chosen": -1.1932220458984375, "logits/rejected": -1.1889605522155762, "logps/chosen": -46.48088073730469, "logps/rejected": -49.8135986328125, "loss": 0.2891, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.5073845982551575, "rewards/margins": 2.379626989364624, "rewards/rejected": -1.8722424507141113, "step": 344 }, { "epoch": 0.7985639184510834, "grad_norm": 28.47228811073422, "learning_rate": 3.7558163974602093e-07, "logits/chosen": -1.1920644044876099, "logits/rejected": -1.201475977897644, "logps/chosen": -43.6226806640625, "logps/rejected": -64.39244842529297, "loss": 0.2762, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.10022352635860443, "rewards/margins": 3.255174398422241, "rewards/rejected": -3.1549510955810547, "step": 346 }, { "epoch": 0.8031798948583152, "grad_norm": 41.618837869458865, "learning_rate": 3.73834014069605e-07, "logits/chosen": -1.1160246133804321, "logits/rejected": -1.1237598657608032, "logps/chosen": -55.9088249206543, "logps/rejected": -71.82222747802734, "loss": 0.236, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.08166421949863434, "rewards/margins": 3.3770830631256104, "rewards/rejected": -3.2954187393188477, "step": 348 }, { "epoch": 0.8077958712655469, "grad_norm": 34.292113974031, "learning_rate": 3.7207833233827914e-07, "logits/chosen": -1.221280813217163, "logits/rejected": -1.228824257850647, "logps/chosen": -51.23641586303711, "logps/rejected": -67.17003631591797, "loss": 0.2952, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.29954856634140015, "rewards/margins": 3.7896947860717773, "rewards/rejected": -4.089242935180664, "step": 350 }, { "epoch": 0.8124118476727785, "grad_norm": 27.12194701535223, "learning_rate": 3.7031470876838786e-07, "logits/chosen": -1.1533750295639038, "logits/rejected": -1.1603755950927734, "logps/chosen": -49.57859802246094, "logps/rejected": -74.35897064208984, "loss": 0.2616, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.34589090943336487, "rewards/margins": 3.77805495262146, "rewards/rejected": -4.123946189880371, "step": 352 }, { "epoch": 0.8170278240800103, "grad_norm": 33.865318639168144, "learning_rate": 3.6854325809293455e-07, "logits/chosen": -1.2225959300994873, "logits/rejected": -1.2331852912902832, "logps/chosen": -42.565975189208984, "logps/rejected": -74.49568939208984, "loss": 0.2349, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.32742711901664734, "rewards/margins": 4.166980743408203, "rewards/rejected": -4.494408130645752, "step": 354 }, { "epoch": 0.821643800487242, "grad_norm": 44.72468218463322, "learning_rate": 3.6676409555411653e-07, "logits/chosen": -1.1373627185821533, "logits/rejected": -1.1474027633666992, "logps/chosen": -51.0811882019043, "logps/rejected": -69.32566833496094, "loss": 0.2441, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.21452119946479797, "rewards/margins": 3.681729555130005, "rewards/rejected": -3.8962512016296387, "step": 356 }, { "epoch": 0.8262597768944736, "grad_norm": 18.3343181780076, "learning_rate": 3.6497733689582866e-07, "logits/chosen": -1.1717758178710938, "logits/rejected": -1.1715316772460938, "logps/chosen": -45.530574798583984, "logps/rejected": -56.84959030151367, "loss": 0.1942, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.07019754499197006, "rewards/margins": 3.02067232131958, "rewards/rejected": -2.950474739074707, "step": 358 }, { "epoch": 0.8308757533017054, "grad_norm": 35.844182387455, "learning_rate": 3.631830983561335e-07, "logits/chosen": -1.1136425733566284, "logits/rejected": -1.1134952306747437, "logps/chosen": -54.07844924926758, "logps/rejected": -60.01144027709961, "loss": 0.2174, "rewards/accuracies": 0.875, "rewards/chosen": 0.009207261726260185, "rewards/margins": 3.035071611404419, "rewards/rejected": -3.025864601135254, "step": 360 }, { "epoch": 0.8308757533017054, "eval_logits/chosen": -1.137829303741455, "eval_logits/rejected": -1.142533779144287, "eval_logps/chosen": -49.13446807861328, "eval_logps/rejected": -63.1205940246582, "eval_loss": 0.2611147463321686, "eval_rewards/accuracies": 0.8231566548347473, "eval_rewards/chosen": -0.25051993131637573, "eval_rewards/margins": 3.2632253170013428, "eval_rewards/rejected": -3.5137455463409424, "eval_runtime": 227.4188, "eval_samples_per_second": 7.625, "eval_steps_per_second": 1.908, "step": 360 }, { "epoch": 0.835491729708937, "grad_norm": 36.48149555986552, "learning_rate": 3.613814966596991e-07, "logits/chosen": -1.1987216472625732, "logits/rejected": -1.2045807838439941, "logps/chosen": -49.24374008178711, "logps/rejected": -66.37528228759766, "loss": 0.2604, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.2955743074417114, "rewards/margins": 3.364227533340454, "rewards/rejected": -3.659801959991455, "step": 362 }, { "epoch": 0.8401077061161687, "grad_norm": 17.123248215692428, "learning_rate": 3.595726490102059e-07, "logits/chosen": -1.1486543416976929, "logits/rejected": -1.155872106552124, "logps/chosen": -47.52320098876953, "logps/rejected": -72.59225463867188, "loss": 0.1309, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.2556496858596802, "rewards/margins": 3.9981601238250732, "rewards/rejected": -4.253809928894043, "step": 364 }, { "epoch": 0.8447236825234005, "grad_norm": 22.608323262045403, "learning_rate": 3.577566730827214e-07, "logits/chosen": -1.1728885173797607, "logits/rejected": -1.1819621324539185, "logps/chosen": -47.8609504699707, "logps/rejected": -66.17147064208984, "loss": 0.2715, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.29853469133377075, "rewards/margins": 3.493640661239624, "rewards/rejected": -3.792175769805908, "step": 366 }, { "epoch": 0.8493396589306321, "grad_norm": 37.233167771755866, "learning_rate": 3.559336870160453e-07, "logits/chosen": -1.1891751289367676, "logits/rejected": -1.193422794342041, "logps/chosen": -43.88676834106445, "logps/rejected": -61.30946350097656, "loss": 0.2259, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.18390725553035736, "rewards/margins": 3.307628631591797, "rewards/rejected": -3.4915361404418945, "step": 368 }, { "epoch": 0.8539556353378638, "grad_norm": 26.22453096528665, "learning_rate": 3.541038094050241e-07, "logits/chosen": -1.155517339706421, "logits/rejected": -1.1603643894195557, "logps/chosen": -52.52191162109375, "logps/rejected": -73.11701202392578, "loss": 0.1787, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.36556607484817505, "rewards/margins": 4.345615863800049, "rewards/rejected": -4.711181163787842, "step": 370 }, { "epoch": 0.8585716117450956, "grad_norm": 39.171509700373235, "learning_rate": 3.52267159292835e-07, "logits/chosen": -1.1714898347854614, "logits/rejected": -1.1753745079040527, "logps/chosen": -52.40201950073242, "logps/rejected": -71.5418701171875, "loss": 0.2399, "rewards/accuracies": 0.875, "rewards/chosen": -0.4184909164905548, "rewards/margins": 3.973634958267212, "rewards/rejected": -4.392125606536865, "step": 372 }, { "epoch": 0.8631875881523272, "grad_norm": 23.054835061356226, "learning_rate": 3.5042385616324236e-07, "logits/chosen": -1.3357490301132202, "logits/rejected": -1.345274806022644, "logps/chosen": -41.36846923828125, "logps/rejected": -68.94818115234375, "loss": 0.2237, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5928651094436646, "rewards/margins": 4.040990352630615, "rewards/rejected": -4.63385534286499, "step": 374 }, { "epoch": 0.8678035645595589, "grad_norm": 23.34114570013692, "learning_rate": 3.485740199328244e-07, "logits/chosen": -1.1034616231918335, "logits/rejected": -1.1045833826065063, "logps/chosen": -55.426727294921875, "logps/rejected": -59.57293701171875, "loss": 0.1942, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.04073745757341385, "rewards/margins": 3.197631597518921, "rewards/rejected": -3.2383692264556885, "step": 376 }, { "epoch": 0.8724195409667906, "grad_norm": 19.26131705604103, "learning_rate": 3.4671777094317196e-07, "logits/chosen": -1.1123476028442383, "logits/rejected": -1.1115697622299194, "logps/chosen": -54.29782485961914, "logps/rejected": -60.28561782836914, "loss": 0.1845, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.4158029556274414, "rewards/margins": 3.2452781200408936, "rewards/rejected": -3.661081075668335, "step": 378 }, { "epoch": 0.8770355173740223, "grad_norm": 40.47170182440331, "learning_rate": 3.448552299530595e-07, "logits/chosen": -1.1856770515441895, "logits/rejected": -1.188659429550171, "logps/chosen": -47.90871047973633, "logps/rejected": -58.21343231201172, "loss": 0.2866, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.3236946761608124, "rewards/margins": 3.1452713012695312, "rewards/rejected": -3.468966245651245, "step": 380 }, { "epoch": 0.8770355173740223, "eval_logits/chosen": -1.1315786838531494, "eval_logits/rejected": -1.1364408731460571, "eval_logps/chosen": -48.8283576965332, "eval_logps/rejected": -63.289493560791016, "eval_loss": 0.2587234079837799, "eval_rewards/accuracies": 0.820852518081665, "eval_rewards/chosen": -0.09746361523866653, "eval_rewards/margins": 3.500731945037842, "eval_rewards/rejected": -3.5981955528259277, "eval_runtime": 227.2598, "eval_samples_per_second": 7.63, "eval_steps_per_second": 1.91, "step": 380 }, { "epoch": 0.881651493781254, "grad_norm": 50.06622297059798, "learning_rate": 3.429865181305894e-07, "logits/chosen": -1.1800260543823242, "logits/rejected": -1.1820400953292847, "logps/chosen": -53.985992431640625, "logps/rejected": -63.98917770385742, "loss": 0.3147, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.11410641670227051, "rewards/margins": 3.317570447921753, "rewards/rejected": -3.4316766262054443, "step": 382 }, { "epoch": 0.8862674701884857, "grad_norm": 50.74505917044877, "learning_rate": 3.411117570453091e-07, "logits/chosen": -1.1595518589019775, "logits/rejected": -1.1614227294921875, "logps/chosen": -48.32596969604492, "logps/rejected": -61.07521057128906, "loss": 0.2287, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.054149970412254333, "rewards/margins": 3.22495174407959, "rewards/rejected": -3.279102087020874, "step": 384 }, { "epoch": 0.8908834465957174, "grad_norm": 33.82726208082807, "learning_rate": 3.392310686603025e-07, "logits/chosen": -1.2322266101837158, "logits/rejected": -1.235073447227478, "logps/chosen": -48.9878044128418, "logps/rejected": -57.65345001220703, "loss": 0.3178, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": -0.5888361930847168, "rewards/margins": 2.6325643062591553, "rewards/rejected": -3.221400260925293, "step": 386 }, { "epoch": 0.895499423002949, "grad_norm": 31.968246601846598, "learning_rate": 3.3734457532425554e-07, "logits/chosen": -1.14549720287323, "logits/rejected": -1.1533808708190918, "logps/chosen": -47.60829544067383, "logps/rejected": -65.90504455566406, "loss": 0.2893, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.4989345967769623, "rewards/margins": 3.784000873565674, "rewards/rejected": -4.28293514251709, "step": 388 }, { "epoch": 0.9001153994101808, "grad_norm": 38.79859580208747, "learning_rate": 3.354523997634969e-07, "logits/chosen": -1.1520222425460815, "logits/rejected": -1.159006953239441, "logps/chosen": -51.533203125, "logps/rejected": -68.82389068603516, "loss": 0.2439, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.5372670292854309, "rewards/margins": 3.8664119243621826, "rewards/rejected": -4.403678894042969, "step": 390 }, { "epoch": 0.9047313758174125, "grad_norm": 48.99149097647652, "learning_rate": 3.3355466507401374e-07, "logits/chosen": -1.1964970827102661, "logits/rejected": -1.195936679840088, "logps/chosen": -49.486724853515625, "logps/rejected": -51.76087951660156, "loss": 0.3225, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.6623955965042114, "rewards/margins": 2.538459539413452, "rewards/rejected": -3.200855255126953, "step": 392 }, { "epoch": 0.9093473522246441, "grad_norm": 34.31601060375262, "learning_rate": 3.3165149471344394e-07, "logits/chosen": -1.1766917705535889, "logits/rejected": -1.1769944429397583, "logps/chosen": -50.33845901489258, "logps/rejected": -60.9412841796875, "loss": 0.2826, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.7761159539222717, "rewards/margins": 2.734633445739746, "rewards/rejected": -3.510749340057373, "step": 394 }, { "epoch": 0.9139633286318759, "grad_norm": 44.03751755180035, "learning_rate": 3.297430124930444e-07, "logits/chosen": -1.0980035066604614, "logits/rejected": -1.098144292831421, "logps/chosen": -56.62092590332031, "logps/rejected": -61.69611358642578, "loss": 0.3528, "rewards/accuracies": 0.75, "rewards/chosen": -0.423416405916214, "rewards/margins": 2.6373236179351807, "rewards/rejected": -3.0607402324676514, "step": 396 }, { "epoch": 0.9185793050391076, "grad_norm": 20.653109576582093, "learning_rate": 3.2782934256963647e-07, "logits/chosen": -1.1482434272766113, "logits/rejected": -1.1585140228271484, "logps/chosen": -52.95622253417969, "logps/rejected": -71.6562728881836, "loss": 0.2816, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.51191645860672, "rewards/margins": 3.753627300262451, "rewards/rejected": -4.265543460845947, "step": 398 }, { "epoch": 0.9231952814463392, "grad_norm": 33.58507227677532, "learning_rate": 3.259106094375289e-07, "logits/chosen": -1.1832419633865356, "logits/rejected": -1.1917306184768677, "logps/chosen": -46.096675872802734, "logps/rejected": -71.52079010009766, "loss": 0.2501, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.2719300389289856, "rewards/margins": 4.045370101928711, "rewards/rejected": -4.317299842834473, "step": 400 }, { "epoch": 0.9231952814463392, "eval_logits/chosen": -1.1317284107208252, "eval_logits/rejected": -1.1366225481033325, "eval_logps/chosen": -49.24271011352539, "eval_logps/rejected": -63.64417266845703, "eval_loss": 0.2558155655860901, "eval_rewards/accuracies": 0.820852518081665, "eval_rewards/chosen": -0.30464252829551697, "eval_rewards/margins": 3.470890522003174, "eval_rewards/rejected": -3.775533437728882, "eval_runtime": 227.3955, "eval_samples_per_second": 7.625, "eval_steps_per_second": 1.909, "step": 400 }, { "epoch": 0.927811257853571, "grad_norm": 40.05600195486227, "learning_rate": 3.239869379204189e-07, "logits/chosen": -1.165150761604309, "logits/rejected": -1.1658389568328857, "logps/chosen": -51.360679626464844, "logps/rejected": -65.45414733886719, "loss": 0.2124, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.36249446868896484, "rewards/margins": 3.780195951461792, "rewards/rejected": -4.142690658569336, "step": 402 }, { "epoch": 0.9324272342608027, "grad_norm": 39.216765683029095, "learning_rate": 3.2205845316327144e-07, "logits/chosen": -1.183584213256836, "logits/rejected": -1.1847604513168335, "logps/chosen": -40.19718933105469, "logps/rejected": -51.98136901855469, "loss": 0.3544, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": -0.23264381289482117, "rewards/margins": 2.18786883354187, "rewards/rejected": -2.4205124378204346, "step": 404 }, { "epoch": 0.9370432106680343, "grad_norm": 27.136867213884347, "learning_rate": 3.2012528062417845e-07, "logits/chosen": -1.1893184185028076, "logits/rejected": -1.186366081237793, "logps/chosen": -48.742347717285156, "logps/rejected": -53.711280822753906, "loss": 0.2583, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.4608592987060547, "rewards/margins": 2.4421236515045166, "rewards/rejected": -2.9029834270477295, "step": 406 }, { "epoch": 0.9416591870752661, "grad_norm": 28.572200442333273, "learning_rate": 3.1818754606619643e-07, "logits/chosen": -1.146033763885498, "logits/rejected": -1.154913306236267, "logps/chosen": -43.58420944213867, "logps/rejected": -65.13819885253906, "loss": 0.3209, "rewards/accuracies": 0.75, "rewards/chosen": 0.25047793984413147, "rewards/margins": 4.219507217407227, "rewards/rejected": -3.969028949737549, "step": 408 }, { "epoch": 0.9462751634824977, "grad_norm": 22.48708730265553, "learning_rate": 3.162453755491655e-07, "logits/chosen": -1.2108149528503418, "logits/rejected": -1.2209829092025757, "logps/chosen": -45.49837112426758, "logps/rejected": -68.48489379882812, "loss": 0.1921, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.18967992067337036, "rewards/margins": 3.9147284030914307, "rewards/rejected": -3.725048542022705, "step": 410 }, { "epoch": 0.9508911398897294, "grad_norm": 37.73589723314555, "learning_rate": 3.142988954215079e-07, "logits/chosen": -1.1515933275222778, "logits/rejected": -1.1659475564956665, "logps/chosen": -48.16081619262695, "logps/rejected": -75.52259063720703, "loss": 0.2767, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.4500226378440857, "rewards/margins": 3.918086051940918, "rewards/rejected": -3.4680633544921875, "step": 412 }, { "epoch": 0.9555071162969612, "grad_norm": 50.447595273319266, "learning_rate": 3.1234823231200925e-07, "logits/chosen": -1.1608054637908936, "logits/rejected": -1.1741983890533447, "logps/chosen": -46.12180709838867, "logps/rejected": -76.99250030517578, "loss": 0.2659, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.0659295991063118, "rewards/margins": 4.484518527984619, "rewards/rejected": -4.4185895919799805, "step": 414 }, { "epoch": 0.9601230927041928, "grad_norm": 34.31172314274907, "learning_rate": 3.1039351312157993e-07, "logits/chosen": -1.1714129447937012, "logits/rejected": -1.1802603006362915, "logps/chosen": -47.3844108581543, "logps/rejected": -66.79096221923828, "loss": 0.2247, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.06639357656240463, "rewards/margins": 3.8744897842407227, "rewards/rejected": -3.8080966472625732, "step": 416 }, { "epoch": 0.9647390691114246, "grad_norm": 59.97310545853326, "learning_rate": 3.0843486501499967e-07, "logits/chosen": -1.1815389394760132, "logits/rejected": -1.1873387098312378, "logps/chosen": -49.0379638671875, "logps/rejected": -60.66644287109375, "loss": 0.375, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.37162768840789795, "rewards/margins": 2.9354913234710693, "rewards/rejected": -2.563863515853882, "step": 418 }, { "epoch": 0.9693550455186563, "grad_norm": 28.24836123391339, "learning_rate": 3.064724154126449e-07, "logits/chosen": -1.1865981817245483, "logits/rejected": -1.186213731765747, "logps/chosen": -49.98203659057617, "logps/rejected": -52.87804412841797, "loss": 0.231, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.10280448943376541, "rewards/margins": 2.791215181350708, "rewards/rejected": -2.688410758972168, "step": 420 }, { "epoch": 0.9693550455186563, "eval_logits/chosen": -1.1221855878829956, "eval_logits/rejected": -1.1280066967010498, "eval_logps/chosen": -48.28245544433594, "eval_logps/rejected": -62.80416488647461, "eval_loss": 0.252390593290329, "eval_rewards/accuracies": 0.820852518081665, "eval_rewards/chosen": 0.17548592388629913, "eval_rewards/margins": 3.531018018722534, "eval_rewards/rejected": -3.35553240776062, "eval_runtime": 227.4345, "eval_samples_per_second": 7.624, "eval_steps_per_second": 1.908, "step": 420 }, { "epoch": 0.9739710219258879, "grad_norm": 34.078391098200555, "learning_rate": 3.045062919821995e-07, "logits/chosen": -1.1267274618148804, "logits/rejected": -1.139290452003479, "logps/chosen": -46.573524475097656, "logps/rejected": -72.93567657470703, "loss": 0.2995, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.05330763757228851, "rewards/margins": 4.125481605529785, "rewards/rejected": -4.072174072265625, "step": 422 }, { "epoch": 0.9785869983331197, "grad_norm": 40.16987886487936, "learning_rate": 3.0253662263034925e-07, "logits/chosen": -1.1718653440475464, "logits/rejected": -1.1762607097625732, "logps/chosen": -51.13752746582031, "logps/rejected": -70.25437927246094, "loss": 0.2582, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.11435369402170181, "rewards/margins": 3.699098825454712, "rewards/rejected": -3.8134524822235107, "step": 424 }, { "epoch": 0.9832029747403513, "grad_norm": 32.13301323909956, "learning_rate": 3.005635354944606e-07, "logits/chosen": -1.1121575832366943, "logits/rejected": -1.113258957862854, "logps/chosen": -53.563053131103516, "logps/rejected": -52.64200210571289, "loss": 0.2696, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.11980216950178146, "rewards/margins": 2.639040231704712, "rewards/rejected": -2.7588419914245605, "step": 426 }, { "epoch": 0.987818951147583, "grad_norm": 37.16965622890279, "learning_rate": 2.9858715893424504e-07, "logits/chosen": -1.1091896295547485, "logits/rejected": -1.1275534629821777, "logps/chosen": -45.88606643676758, "logps/rejected": -73.55078125, "loss": 0.1871, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.062331780791282654, "rewards/margins": 4.677850723266602, "rewards/rejected": -4.740182876586914, "step": 428 }, { "epoch": 0.9924349275548148, "grad_norm": 26.22123779547593, "learning_rate": 2.966076215234082e-07, "logits/chosen": -1.066051959991455, "logits/rejected": -1.0764615535736084, "logps/chosen": -54.595703125, "logps/rejected": -72.26995849609375, "loss": 0.1937, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.17351490259170532, "rewards/margins": 4.106486797332764, "rewards/rejected": -3.932971954345703, "step": 430 }, { "epoch": 0.9970509039620464, "grad_norm": 33.565390661724486, "learning_rate": 2.94625052041286e-07, "logits/chosen": -1.1814826726913452, "logits/rejected": -1.1842460632324219, "logps/chosen": -50.375, "logps/rejected": -58.42066955566406, "loss": 0.255, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.07062428444623947, "rewards/margins": 3.117582082748413, "rewards/rejected": -3.188206672668457, "step": 432 }, { "epoch": 1.001666880369278, "grad_norm": 21.605417390859568, "learning_rate": 2.926395794644665e-07, "logits/chosen": -1.1752268075942993, "logits/rejected": -1.1771807670593262, "logps/chosen": -51.052242279052734, "logps/rejected": -61.03368377685547, "loss": 0.1838, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.23978914320468903, "rewards/margins": 3.637639045715332, "rewards/rejected": -3.3978495597839355, "step": 434 }, { "epoch": 1.0062828567765099, "grad_norm": 29.001645648697252, "learning_rate": 2.906513329583991e-07, "logits/chosen": -1.186964511871338, "logits/rejected": -1.1928526163101196, "logps/chosen": -46.22761535644531, "logps/rejected": -62.58315658569336, "loss": 0.2362, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.08661065995693207, "rewards/margins": 3.6797680854797363, "rewards/rejected": -3.76637864112854, "step": 436 }, { "epoch": 1.0108988331837414, "grad_norm": 25.079284366199737, "learning_rate": 2.886604418689921e-07, "logits/chosen": -1.1703391075134277, "logits/rejected": -1.1838041543960571, "logps/chosen": -44.59703063964844, "logps/rejected": -76.01687622070312, "loss": 0.2554, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.19546601176261902, "rewards/margins": 4.618009567260742, "rewards/rejected": -4.813475131988525, "step": 438 }, { "epoch": 1.0155148095909732, "grad_norm": 15.893743443702332, "learning_rate": 2.866670357141979e-07, "logits/chosen": -1.1566696166992188, "logits/rejected": -1.1605850458145142, "logps/chosen": -50.24718475341797, "logps/rejected": -61.23912048339844, "loss": 0.2096, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.3380340337753296, "rewards/margins": 4.049044609069824, "rewards/rejected": -3.711010456085205, "step": 440 }, { "epoch": 1.0155148095909732, "eval_logits/chosen": -1.1188750267028809, "eval_logits/rejected": -1.124423861503601, "eval_logps/chosen": -47.42933654785156, "eval_logps/rejected": -62.06835174560547, "eval_loss": 0.2514457404613495, "eval_rewards/accuracies": 0.8237327337265015, "eval_rewards/chosen": 0.6020476222038269, "eval_rewards/margins": 3.589672088623047, "eval_rewards/rejected": -2.9876248836517334, "eval_runtime": 227.3665, "eval_samples_per_second": 7.626, "eval_steps_per_second": 1.909, "step": 440 }, { "epoch": 1.020130785998205, "grad_norm": 13.432673135223586, "learning_rate": 2.8467124417558737e-07, "logits/chosen": -1.1185688972473145, "logits/rejected": -1.1209557056427002, "logps/chosen": -48.8853759765625, "logps/rejected": -63.88041305541992, "loss": 0.1931, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8135783076286316, "rewards/margins": 4.117891788482666, "rewards/rejected": -3.3043136596679688, "step": 442 }, { "epoch": 1.0247467624054365, "grad_norm": 26.849141741016137, "learning_rate": 2.8267319708991253e-07, "logits/chosen": -1.096121907234192, "logits/rejected": -1.0977866649627686, "logps/chosen": -52.22743225097656, "logps/rejected": -55.36363220214844, "loss": 0.1924, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.6431276202201843, "rewards/margins": 3.097036123275757, "rewards/rejected": -2.453908681869507, "step": 444 }, { "epoch": 1.0293627388126683, "grad_norm": 31.09854358728895, "learning_rate": 2.806730244406612e-07, "logits/chosen": -1.1671628952026367, "logits/rejected": -1.1714211702346802, "logps/chosen": -46.41295623779297, "logps/rejected": -59.33990478515625, "loss": 0.2477, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5287280082702637, "rewards/margins": 3.4002442359924316, "rewards/rejected": -2.871516227722168, "step": 446 }, { "epoch": 1.0339787152199, "grad_norm": 31.751017976380037, "learning_rate": 2.786708563496001e-07, "logits/chosen": -1.2408480644226074, "logits/rejected": -1.2548807859420776, "logps/chosen": -49.92308044433594, "logps/rejected": -67.60689544677734, "loss": 0.179, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.6711897850036621, "rewards/margins": 4.588629722595215, "rewards/rejected": -3.9174396991729736, "step": 448 }, { "epoch": 1.0385946916271316, "grad_norm": 32.2047076869101, "learning_rate": 2.7666682306830994e-07, "logits/chosen": -1.19577157497406, "logits/rejected": -1.194454550743103, "logps/chosen": -46.63425827026367, "logps/rejected": -49.01081848144531, "loss": 0.2547, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5076774954795837, "rewards/margins": 2.832526206970215, "rewards/rejected": -2.3248488903045654, "step": 450 }, { "epoch": 1.0432106680343634, "grad_norm": 28.553486090102076, "learning_rate": 2.746610549697119e-07, "logits/chosen": -1.1639982461929321, "logits/rejected": -1.1696867942810059, "logps/chosen": -49.013301849365234, "logps/rejected": -65.69493865966797, "loss": 0.2036, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5706920623779297, "rewards/margins": 3.6936237812042236, "rewards/rejected": -3.122931957244873, "step": 452 }, { "epoch": 1.0478266444415951, "grad_norm": 13.92439999757166, "learning_rate": 2.7265368253958615e-07, "logits/chosen": -1.2167223691940308, "logits/rejected": -1.2195782661437988, "logps/chosen": -45.21904754638672, "logps/rejected": -53.50370407104492, "loss": 0.1746, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.7654822468757629, "rewards/margins": 3.2732510566711426, "rewards/rejected": -2.5077688694000244, "step": 454 }, { "epoch": 1.0524426208488267, "grad_norm": 36.82786613306219, "learning_rate": 2.706448363680831e-07, "logits/chosen": -1.1744914054870605, "logits/rejected": -1.1839237213134766, "logps/chosen": -47.35738754272461, "logps/rejected": -73.86341094970703, "loss": 0.1569, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.696943998336792, "rewards/margins": 4.708230018615723, "rewards/rejected": -4.011285781860352, "step": 456 }, { "epoch": 1.0570585972560584, "grad_norm": 13.78515040845917, "learning_rate": 2.686346471412277e-07, "logits/chosen": -1.1262328624725342, "logits/rejected": -1.1403940916061401, "logps/chosen": -49.68544387817383, "logps/rejected": -76.74578857421875, "loss": 0.1419, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.3619132339954376, "rewards/margins": 4.7866668701171875, "rewards/rejected": -4.424753665924072, "step": 458 }, { "epoch": 1.0616745736632902, "grad_norm": 26.57858405427076, "learning_rate": 2.6662324563241805e-07, "logits/chosen": -1.2429147958755493, "logits/rejected": -1.2455251216888428, "logps/chosen": -45.16366195678711, "logps/rejected": -57.56427001953125, "loss": 0.2357, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5941984057426453, "rewards/margins": 3.2229466438293457, "rewards/rejected": -2.6287484169006348, "step": 460 }, { "epoch": 1.0616745736632902, "eval_logits/chosen": -1.1103266477584839, "eval_logits/rejected": -1.1161012649536133, "eval_logps/chosen": -47.78204345703125, "eval_logps/rejected": -62.643646240234375, "eval_loss": 0.2479603886604309, "eval_rewards/accuracies": 0.8277649879455566, "eval_rewards/chosen": 0.42569395899772644, "eval_rewards/margins": 3.700965642929077, "eval_rewards/rejected": -3.2752716541290283, "eval_runtime": 227.8171, "eval_samples_per_second": 7.611, "eval_steps_per_second": 1.905, "step": 460 }, { "epoch": 1.0662905500705218, "grad_norm": 16.869585682992792, "learning_rate": 2.6461076269391713e-07, "logits/chosen": -1.0661816596984863, "logits/rejected": -1.0739065408706665, "logps/chosen": -54.75306701660156, "logps/rejected": -72.30966186523438, "loss": 0.1519, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.5898687839508057, "rewards/margins": 4.626287460327148, "rewards/rejected": -4.03641939163208, "step": 462 }, { "epoch": 1.0709065264777535, "grad_norm": 39.49599828169064, "learning_rate": 2.625973292483409e-07, "logits/chosen": -1.1013195514678955, "logits/rejected": -1.1079678535461426, "logps/chosen": -56.018310546875, "logps/rejected": -69.99239349365234, "loss": 0.2341, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.3379661738872528, "rewards/margins": 3.941251516342163, "rewards/rejected": -3.603285551071167, "step": 464 }, { "epoch": 1.0755225028849853, "grad_norm": 19.11015860441972, "learning_rate": 2.6058307628014065e-07, "logits/chosen": -1.110822319984436, "logits/rejected": -1.1167054176330566, "logps/chosen": -53.93489074707031, "logps/rejected": -66.05422973632812, "loss": 0.1708, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.3638114929199219, "rewards/margins": 4.259352207183838, "rewards/rejected": -3.895540952682495, "step": 466 }, { "epoch": 1.0801384792922168, "grad_norm": 29.43582021185457, "learning_rate": 2.5856813482708217e-07, "logits/chosen": -1.184598445892334, "logits/rejected": -1.1916086673736572, "logps/chosen": -49.6500244140625, "logps/rejected": -56.30369567871094, "loss": 0.2254, "rewards/accuracies": 0.875, "rewards/chosen": 0.36434826254844666, "rewards/margins": 3.6883039474487305, "rewards/rejected": -3.32395601272583, "step": 468 }, { "epoch": 1.0847544556994486, "grad_norm": 25.173770275980058, "learning_rate": 2.565526359717206e-07, "logits/chosen": -1.1290383338928223, "logits/rejected": -1.1306836605072021, "logps/chosen": -43.971435546875, "logps/rejected": -53.389156341552734, "loss": 0.289, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.18997710943222046, "rewards/margins": 3.0762457847595215, "rewards/rejected": -2.8862688541412354, "step": 470 }, { "epoch": 1.0893704321066804, "grad_norm": 19.76426072194178, "learning_rate": 2.545367108328731e-07, "logits/chosen": -1.163740873336792, "logits/rejected": -1.1682920455932617, "logps/chosen": -49.15140914916992, "logps/rejected": -59.53387451171875, "loss": 0.187, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.26795661449432373, "rewards/margins": 3.4266703128814697, "rewards/rejected": -3.1587133407592773, "step": 472 }, { "epoch": 1.0939864085139122, "grad_norm": 17.775087193890624, "learning_rate": 2.525204905570889e-07, "logits/chosen": -1.1204829216003418, "logits/rejected": -1.1253838539123535, "logps/chosen": -54.046390533447266, "logps/rejected": -66.79469299316406, "loss": 0.1607, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.39596712589263916, "rewards/margins": 4.161929130554199, "rewards/rejected": -3.7659616470336914, "step": 474 }, { "epoch": 1.0986023849211437, "grad_norm": 24.757995548476888, "learning_rate": 2.505041063101171e-07, "logits/chosen": -1.1805049180984497, "logits/rejected": -1.1901381015777588, "logps/chosen": -53.46992492675781, "logps/rejected": -59.58029556274414, "loss": 0.2762, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.13624940812587738, "rewards/margins": 3.440009832382202, "rewards/rejected": -3.303760290145874, "step": 476 }, { "epoch": 1.1032183613283755, "grad_norm": 19.970980113968885, "learning_rate": 2.4848768926837466e-07, "logits/chosen": -1.0963982343673706, "logits/rejected": -1.112579345703125, "logps/chosen": -47.13343811035156, "logps/rejected": -87.57925415039062, "loss": 0.1717, "rewards/accuracies": 0.875, "rewards/chosen": 0.06597856432199478, "rewards/margins": 5.439146518707275, "rewards/rejected": -5.373167991638184, "step": 478 }, { "epoch": 1.107834337735607, "grad_norm": 16.28084665624267, "learning_rate": 2.464713706104113e-07, "logits/chosen": -1.1157184839248657, "logits/rejected": -1.1205692291259766, "logps/chosen": -50.34828567504883, "logps/rejected": -63.41987609863281, "loss": 0.1608, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.043437667191028595, "rewards/margins": 4.042869567871094, "rewards/rejected": -3.99943208694458, "step": 480 }, { "epoch": 1.107834337735607, "eval_logits/chosen": -1.1090331077575684, "eval_logits/rejected": -1.1147438287734985, "eval_logps/chosen": -48.225521087646484, "eval_logps/rejected": -63.15263748168945, "eval_loss": 0.24383509159088135, "eval_rewards/accuracies": 0.8294931054115295, "eval_rewards/chosen": 0.203952819108963, "eval_rewards/margins": 3.733717441558838, "eval_rewards/rejected": -3.529764175415039, "eval_runtime": 227.3827, "eval_samples_per_second": 7.626, "eval_steps_per_second": 1.909, "step": 480 }, { "epoch": 1.1124503141428388, "grad_norm": 22.059822838666445, "learning_rate": 2.444552815083767e-07, "logits/chosen": -1.1268048286437988, "logits/rejected": -1.1286168098449707, "logps/chosen": -49.519527435302734, "logps/rejected": -52.751312255859375, "loss": 0.2264, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.3959563970565796, "rewards/margins": 3.2894110679626465, "rewards/rejected": -2.8934545516967773, "step": 482 }, { "epoch": 1.1170662905500706, "grad_norm": 19.4616654191151, "learning_rate": 2.4243955311948693e-07, "logits/chosen": -1.1610568761825562, "logits/rejected": -1.1703104972839355, "logps/chosen": -45.62093734741211, "logps/rejected": -71.41989135742188, "loss": 0.2218, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.056832365691661835, "rewards/margins": 4.676138877868652, "rewards/rejected": -4.619307041168213, "step": 484 }, { "epoch": 1.1216822669573023, "grad_norm": 29.74478192767247, "learning_rate": 2.4042431657749115e-07, "logits/chosen": -1.082115650177002, "logits/rejected": -1.097312092781067, "logps/chosen": -47.262996673583984, "logps/rejected": -84.21355438232422, "loss": 0.1955, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.2266267091035843, "rewards/margins": 4.9980058670043945, "rewards/rejected": -4.771378993988037, "step": 486 }, { "epoch": 1.1262982433645339, "grad_norm": 34.83946230592783, "learning_rate": 2.384097029841419e-07, "logits/chosen": -1.1644551753997803, "logits/rejected": -1.1694614887237549, "logps/chosen": -49.46389389038086, "logps/rejected": -59.47831344604492, "loss": 0.2086, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.2441619485616684, "rewards/margins": 3.580085277557373, "rewards/rejected": -3.335923433303833, "step": 488 }, { "epoch": 1.1309142197717656, "grad_norm": 21.45284163289699, "learning_rate": 2.3639584340066544e-07, "logits/chosen": -1.1405658721923828, "logits/rejected": -1.146559238433838, "logps/chosen": -41.80732727050781, "logps/rejected": -62.37071990966797, "loss": 0.2166, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.7920265197753906, "rewards/margins": 4.414507865905762, "rewards/rejected": -3.622481346130371, "step": 490 }, { "epoch": 1.1355301961789972, "grad_norm": 20.900830046410174, "learning_rate": 2.3438286883923539e-07, "logits/chosen": -1.164839267730713, "logits/rejected": -1.1716415882110596, "logps/chosen": -52.77006912231445, "logps/rejected": -60.97998046875, "loss": 0.2024, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.7002817988395691, "rewards/margins": 3.6204490661621094, "rewards/rejected": -2.9201676845550537, "step": 492 }, { "epoch": 1.140146172586229, "grad_norm": 22.811055008597176, "learning_rate": 2.323709102544506e-07, "logits/chosen": -1.1385387182235718, "logits/rejected": -1.1350369453430176, "logps/chosen": -44.97706604003906, "logps/rejected": -47.24876403808594, "loss": 0.267, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8821587562561035, "rewards/margins": 2.6500444412231445, "rewards/rejected": -1.7678859233856201, "step": 494 }, { "epoch": 1.1447621489934607, "grad_norm": 25.7099907801656, "learning_rate": 2.3036009853481474e-07, "logits/chosen": -1.1164131164550781, "logits/rejected": -1.124334454536438, "logps/chosen": -44.46446228027344, "logps/rejected": -66.84891510009766, "loss": 0.252, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.42438387870788574, "rewards/margins": 4.415454387664795, "rewards/rejected": -3.9910707473754883, "step": 496 }, { "epoch": 1.1493781254006925, "grad_norm": 24.703959475838438, "learning_rate": 2.283505644942223e-07, "logits/chosen": -1.1680512428283691, "logits/rejected": -1.172341227531433, "logps/chosen": -39.898555755615234, "logps/rejected": -61.25082778930664, "loss": 0.1888, "rewards/accuracies": 0.875, "rewards/chosen": 0.8999897837638855, "rewards/margins": 4.018680095672607, "rewards/rejected": -3.118690252304077, "step": 498 }, { "epoch": 1.153994101807924, "grad_norm": 22.54617945872361, "learning_rate": 2.2634243886344781e-07, "logits/chosen": -1.1353996992111206, "logits/rejected": -1.1466150283813477, "logps/chosen": -47.095890045166016, "logps/rejected": -63.17418670654297, "loss": 0.1944, "rewards/accuracies": 0.875, "rewards/chosen": 1.0475714206695557, "rewards/margins": 4.362390518188477, "rewards/rejected": -3.314818859100342, "step": 500 }, { "epoch": 1.153994101807924, "eval_logits/chosen": -1.1096464395523071, "eval_logits/rejected": -1.1153897047042847, "eval_logps/chosen": -46.75843811035156, "eval_logps/rejected": -61.96721649169922, "eval_loss": 0.24556967616081238, "eval_rewards/accuracies": 0.8312212228775024, "eval_rewards/chosen": 0.9374985098838806, "eval_rewards/margins": 3.8745529651641846, "eval_rewards/rejected": -2.9370551109313965, "eval_runtime": 227.4325, "eval_samples_per_second": 7.624, "eval_steps_per_second": 1.908, "step": 500 }, { "epoch": 1.1586100782151558, "grad_norm": 27.18085844791722, "learning_rate": 2.2433585228164115e-07, "logits/chosen": -1.1491751670837402, "logits/rejected": -1.1580781936645508, "logps/chosen": -50.64473342895508, "logps/rejected": -75.20685577392578, "loss": 0.2218, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.8257203698158264, "rewards/margins": 5.088500022888184, "rewards/rejected": -4.26278018951416, "step": 502 }, { "epoch": 1.1632260546223874, "grad_norm": 23.89654504779422, "learning_rate": 2.2233093528782938e-07, "logits/chosen": -1.1577401161193848, "logits/rejected": -1.1688594818115234, "logps/chosen": -54.145572662353516, "logps/rejected": -66.3442611694336, "loss": 0.1751, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 1.085695743560791, "rewards/margins": 4.018051624298096, "rewards/rejected": -2.9323554039001465, "step": 504 }, { "epoch": 1.1678420310296191, "grad_norm": 23.769575263027864, "learning_rate": 2.2032781831242367e-07, "logits/chosen": -1.1783199310302734, "logits/rejected": -1.182464838027954, "logps/chosen": -41.669734954833984, "logps/rejected": -51.315006256103516, "loss": 0.2418, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.9586374759674072, "rewards/margins": 3.437027931213379, "rewards/rejected": -2.4783899784088135, "step": 506 }, { "epoch": 1.172458007436851, "grad_norm": 36.28094297883332, "learning_rate": 2.183266316687347e-07, "logits/chosen": -1.1632755994796753, "logits/rejected": -1.1601239442825317, "logps/chosen": -47.738983154296875, "logps/rejected": -49.51511001586914, "loss": 0.2641, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 1.0999305248260498, "rewards/margins": 2.7960052490234375, "rewards/rejected": -1.6960747241973877, "step": 508 }, { "epoch": 1.1770739838440827, "grad_norm": 18.77553292711027, "learning_rate": 2.16327505544495e-07, "logits/chosen": -1.1429252624511719, "logits/rejected": -1.1522622108459473, "logps/chosen": -50.42318344116211, "logps/rejected": -66.82962036132812, "loss": 0.1438, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 1.0773425102233887, "rewards/margins": 4.845379829406738, "rewards/rejected": -3.7680368423461914, "step": 510 }, { "epoch": 1.1816899602513142, "grad_norm": 17.949048045246396, "learning_rate": 2.143305699933892e-07, "logits/chosen": -1.1755365133285522, "logits/rejected": -1.180452823638916, "logps/chosen": -43.959930419921875, "logps/rejected": -64.32354736328125, "loss": 0.2051, "rewards/accuracies": 0.875, "rewards/chosen": 0.7747830748558044, "rewards/margins": 4.00773811340332, "rewards/rejected": -3.23295521736145, "step": 512 }, { "epoch": 1.186305936658546, "grad_norm": 29.609702959008317, "learning_rate": 2.1233595492659382e-07, "logits/chosen": -1.0717233419418335, "logits/rejected": -1.0747785568237305, "logps/chosen": -56.3906135559082, "logps/rejected": -58.85280990600586, "loss": 0.1651, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.6519337892532349, "rewards/margins": 3.8963236808776855, "rewards/rejected": -3.244389772415161, "step": 514 }, { "epoch": 1.1909219130657775, "grad_norm": 20.918336765308602, "learning_rate": 2.1034379010432542e-07, "logits/chosen": -1.1730085611343384, "logits/rejected": -1.1727977991104126, "logps/chosen": -44.0880126953125, "logps/rejected": -56.85806655883789, "loss": 0.1976, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4944622218608856, "rewards/margins": 3.6419970989227295, "rewards/rejected": -3.1475343704223633, "step": 516 }, { "epoch": 1.1955378894730093, "grad_norm": 25.288305102390122, "learning_rate": 2.0835420512739957e-07, "logits/chosen": -1.1438689231872559, "logits/rejected": -1.1519646644592285, "logps/chosen": -47.1004638671875, "logps/rejected": -83.05133819580078, "loss": 0.1839, "rewards/accuracies": 0.875, "rewards/chosen": 0.2153804749250412, "rewards/margins": 5.248907089233398, "rewards/rejected": -5.03352689743042, "step": 518 }, { "epoch": 1.200153865880241, "grad_norm": 19.59880042292574, "learning_rate": 2.0636732942879917e-07, "logits/chosen": -1.1264581680297852, "logits/rejected": -1.1307095289230347, "logps/chosen": -50.47024917602539, "logps/rejected": -64.73220825195312, "loss": 0.1619, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.3765060007572174, "rewards/margins": 4.3557209968566895, "rewards/rejected": -3.979214906692505, "step": 520 }, { "epoch": 1.200153865880241, "eval_logits/chosen": -1.1257847547531128, "eval_logits/rejected": -1.1301593780517578, "eval_logps/chosen": -48.62382507324219, "eval_logps/rejected": -63.695838928222656, "eval_loss": 0.24041977524757385, "eval_rewards/accuracies": 0.8335253596305847, "eval_rewards/chosen": 0.004803389776498079, "eval_rewards/margins": 3.8061721324920654, "eval_rewards/rejected": -3.8013687133789062, "eval_runtime": 227.3215, "eval_samples_per_second": 7.628, "eval_steps_per_second": 1.909, "step": 520 }, { "epoch": 1.2047698422874729, "grad_norm": 17.44277679097044, "learning_rate": 2.0438329226525415e-07, "logits/chosen": -1.144020915031433, "logits/rejected": -1.1453847885131836, "logps/chosen": -49.49299621582031, "logps/rejected": -53.57789611816406, "loss": 0.2025, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.42205584049224854, "rewards/margins": 3.364236831665039, "rewards/rejected": -2.942180871963501, "step": 522 }, { "epoch": 1.2093858186947044, "grad_norm": 37.73486109040497, "learning_rate": 2.0240222270883288e-07, "logits/chosen": -1.1556731462478638, "logits/rejected": -1.1702880859375, "logps/chosen": -50.98395538330078, "logps/rejected": -76.1040267944336, "loss": 0.2225, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.07603338360786438, "rewards/margins": 4.920933246612549, "rewards/rejected": -4.9969658851623535, "step": 524 }, { "epoch": 1.2140017951019362, "grad_norm": 25.377653028594498, "learning_rate": 2.0042424963854542e-07, "logits/chosen": -1.196961760520935, "logits/rejected": -1.2145916223526, "logps/chosen": -47.896732330322266, "logps/rejected": -83.96469116210938, "loss": 0.1463, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.26778745651245117, "rewards/margins": 5.1378092765808105, "rewards/rejected": -5.40559720993042, "step": 526 }, { "epoch": 1.2186177715091677, "grad_norm": 21.037533939572622, "learning_rate": 1.9844950173195883e-07, "logits/chosen": -1.2031718492507935, "logits/rejected": -1.207302212715149, "logps/chosen": -48.16632843017578, "logps/rejected": -63.03504943847656, "loss": 0.1798, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.3718983829021454, "rewards/margins": 3.580104351043701, "rewards/rejected": -3.95200252532959, "step": 528 }, { "epoch": 1.2232337479163995, "grad_norm": 22.283357312346677, "learning_rate": 1.964781074568265e-07, "logits/chosen": -1.2361119985580444, "logits/rejected": -1.2355589866638184, "logps/chosen": -48.786136627197266, "logps/rejected": -53.23166275024414, "loss": 0.1959, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.26044440269470215, "rewards/margins": 3.046844482421875, "rewards/rejected": -3.307288885116577, "step": 530 }, { "epoch": 1.2278497243236313, "grad_norm": 32.696983725912126, "learning_rate": 1.9451019506273018e-07, "logits/chosen": -1.1622250080108643, "logits/rejected": -1.16159987449646, "logps/chosen": -42.64177322387695, "logps/rejected": -55.659339904785156, "loss": 0.2379, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.17224609851837158, "rewards/margins": 3.0260884761810303, "rewards/rejected": -3.1983346939086914, "step": 532 }, { "epoch": 1.232465700730863, "grad_norm": 31.81561570512381, "learning_rate": 1.9254589257273712e-07, "logits/chosen": -1.1568024158477783, "logits/rejected": -1.1644660234451294, "logps/chosen": -43.20817565917969, "logps/rejected": -66.71856689453125, "loss": 0.1648, "rewards/accuracies": 0.875, "rewards/chosen": -0.03500910475850105, "rewards/margins": 4.864539623260498, "rewards/rejected": -4.899548530578613, "step": 534 }, { "epoch": 1.2370816771380946, "grad_norm": 28.042577352949, "learning_rate": 1.9058532777507141e-07, "logits/chosen": -1.1810351610183716, "logits/rejected": -1.186877727508545, "logps/chosen": -46.86735534667969, "logps/rejected": -58.00865173339844, "loss": 0.1946, "rewards/accuracies": 0.875, "rewards/chosen": -0.07706524431705475, "rewards/margins": 3.6953649520874023, "rewards/rejected": -3.772430896759033, "step": 536 }, { "epoch": 1.2416976535453264, "grad_norm": 31.06776524909045, "learning_rate": 1.886286282148002e-07, "logits/chosen": -1.1419155597686768, "logits/rejected": -1.1518969535827637, "logps/chosen": -48.36051559448242, "logps/rejected": -68.71621704101562, "loss": 0.2628, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.3396787643432617, "rewards/margins": 4.181530952453613, "rewards/rejected": -4.521209716796875, "step": 538 }, { "epoch": 1.246313629952558, "grad_norm": 25.267305267108494, "learning_rate": 1.8667592118553693e-07, "logits/chosen": -1.2024831771850586, "logits/rejected": -1.2076871395111084, "logps/chosen": -52.623023986816406, "logps/rejected": -61.89004898071289, "loss": 0.2083, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.21523982286453247, "rewards/margins": 3.845079183578491, "rewards/rejected": -4.060319423675537, "step": 540 }, { "epoch": 1.246313629952558, "eval_logits/chosen": -1.1086678504943848, "eval_logits/rejected": -1.11427903175354, "eval_logps/chosen": -48.652427673339844, "eval_logps/rejected": -63.7818717956543, "eval_loss": 0.2395094931125641, "eval_rewards/accuracies": 0.8323732614517212, "eval_rewards/chosen": -0.009500053711235523, "eval_rewards/margins": 3.8348822593688965, "eval_rewards/rejected": -3.8443822860717773, "eval_runtime": 227.3478, "eval_samples_per_second": 7.627, "eval_steps_per_second": 1.909, "step": 540 }, { "epoch": 1.2509296063597897, "grad_norm": 29.79858133121253, "learning_rate": 1.8472733372115956e-07, "logits/chosen": -1.2199275493621826, "logits/rejected": -1.2279648780822754, "logps/chosen": -50.78293228149414, "logps/rejected": -72.13998413085938, "loss": 0.2006, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.18516860902309418, "rewards/margins": 4.692964553833008, "rewards/rejected": -4.878133773803711, "step": 542 }, { "epoch": 1.2555455827670214, "grad_norm": 18.335692774068683, "learning_rate": 1.8278299258754692e-07, "logits/chosen": -1.1382191181182861, "logits/rejected": -1.1532717943191528, "logps/chosen": -51.21404266357422, "logps/rejected": -84.46864318847656, "loss": 0.2507, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.002586497226729989, "rewards/margins": 6.08302116394043, "rewards/rejected": -6.085607528686523, "step": 544 }, { "epoch": 1.2601615591742532, "grad_norm": 16.812848293352094, "learning_rate": 1.808430242743316e-07, "logits/chosen": -1.1635518074035645, "logits/rejected": -1.1678366661071777, "logps/chosen": -50.37785339355469, "logps/rejected": -63.68332290649414, "loss": 0.207, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.4088381230831146, "rewards/margins": 4.470311641693115, "rewards/rejected": -4.0614728927612305, "step": 546 }, { "epoch": 1.2647775355814848, "grad_norm": 15.038796138409147, "learning_rate": 1.7890755498667104e-07, "logits/chosen": -1.1664639711380005, "logits/rejected": -1.1719496250152588, "logps/chosen": -42.69700622558594, "logps/rejected": -64.51258850097656, "loss": 0.1614, "rewards/accuracies": 0.875, "rewards/chosen": 0.17392092943191528, "rewards/margins": 4.139625549316406, "rewards/rejected": -3.9657046794891357, "step": 548 }, { "epoch": 1.2693935119887165, "grad_norm": 27.84348089734983, "learning_rate": 1.7697671063703756e-07, "logits/chosen": -1.1588454246520996, "logits/rejected": -1.1661314964294434, "logps/chosen": -44.9870719909668, "logps/rejected": -65.78500366210938, "loss": 0.2201, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.46526622772216797, "rewards/margins": 4.418604373931885, "rewards/rejected": -3.953338623046875, "step": 550 }, { "epoch": 1.274009488395948, "grad_norm": 44.28075638457142, "learning_rate": 1.750506168370267e-07, "logits/chosen": -1.1754214763641357, "logits/rejected": -1.175520896911621, "logps/chosen": -46.97793960571289, "logps/rejected": -54.8778076171875, "loss": 0.2392, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.9450171589851379, "rewards/margins": 3.314897298812866, "rewards/rejected": -2.369880199432373, "step": 552 }, { "epoch": 1.2786254648031798, "grad_norm": 17.20774183339189, "learning_rate": 1.7312939888918594e-07, "logits/chosen": -1.1337149143218994, "logits/rejected": -1.1420766115188599, "logps/chosen": -49.699161529541016, "logps/rejected": -73.22956085205078, "loss": 0.1671, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.23295314610004425, "rewards/margins": 4.598634243011475, "rewards/rejected": -4.36568021774292, "step": 554 }, { "epoch": 1.2832414412104116, "grad_norm": 12.879730723232214, "learning_rate": 1.712131817788628e-07, "logits/chosen": -1.1121582984924316, "logits/rejected": -1.1107348203659058, "logps/chosen": -46.21583938598633, "logps/rejected": -56.827064514160156, "loss": 0.2081, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.3201579451560974, "rewards/margins": 3.4782567024230957, "rewards/rejected": -3.1580984592437744, "step": 556 }, { "epoch": 1.2878574176176434, "grad_norm": 18.379299257318767, "learning_rate": 1.693020901660738e-07, "logits/chosen": -1.1363815069198608, "logits/rejected": -1.1399273872375488, "logps/chosen": -53.33127212524414, "logps/rejected": -65.93392181396484, "loss": 0.1484, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.24407842755317688, "rewards/margins": 4.509576320648193, "rewards/rejected": -4.265497207641602, "step": 558 }, { "epoch": 1.292473394024875, "grad_norm": 30.348807794181617, "learning_rate": 1.6739624837739518e-07, "logits/chosen": -1.1836738586425781, "logits/rejected": -1.188474416732788, "logps/chosen": -53.792484283447266, "logps/rejected": -60.70852279663086, "loss": 0.2063, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.07742541283369064, "rewards/margins": 3.257985830307007, "rewards/rejected": -3.335411310195923, "step": 560 }, { "epoch": 1.292473394024875, "eval_logits/chosen": -1.1171412467956543, "eval_logits/rejected": -1.1220649480819702, "eval_logps/chosen": -48.55030059814453, "eval_logps/rejected": -63.84280014038086, "eval_loss": 0.2384917438030243, "eval_rewards/accuracies": 0.8329492807388306, "eval_rewards/chosen": 0.04156512767076492, "eval_rewards/margins": 3.9164135456085205, "eval_rewards/rejected": -3.8748483657836914, "eval_runtime": 227.4385, "eval_samples_per_second": 7.624, "eval_steps_per_second": 1.908, "step": 560 }, { "epoch": 1.2970893704321067, "grad_norm": 16.156045882880065, "learning_rate": 1.6549578039787434e-07, "logits/chosen": -1.174346923828125, "logits/rejected": -1.1770930290222168, "logps/chosen": -50.916481018066406, "logps/rejected": -73.8545913696289, "loss": 0.2379, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.09411442279815674, "rewards/margins": 4.1428680419921875, "rewards/rejected": -4.04875373840332, "step": 562 }, { "epoch": 1.3017053468393383, "grad_norm": 16.73959305434838, "learning_rate": 1.6360080986296384e-07, "logits/chosen": -1.14383065700531, "logits/rejected": -1.1593358516693115, "logps/chosen": -43.3416862487793, "logps/rejected": -74.88224029541016, "loss": 0.1743, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.07543455064296722, "rewards/margins": 5.390232563018799, "rewards/rejected": -5.465667247772217, "step": 564 }, { "epoch": 1.30632132324657, "grad_norm": 19.01188883701719, "learning_rate": 1.6171146005047894e-07, "logits/chosen": -1.1023046970367432, "logits/rejected": -1.1084368228912354, "logps/chosen": -55.277671813964844, "logps/rejected": -72.66205596923828, "loss": 0.1773, "rewards/accuracies": 0.875, "rewards/chosen": 0.12563864886760712, "rewards/margins": 4.530452728271484, "rewards/rejected": -4.404813766479492, "step": 566 }, { "epoch": 1.3109372996538018, "grad_norm": 21.87767395078719, "learning_rate": 1.5982785387257694e-07, "logits/chosen": -1.1128134727478027, "logits/rejected": -1.1113499402999878, "logps/chosen": -49.72315979003906, "logps/rejected": -56.22350311279297, "loss": 0.1993, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.21243299543857574, "rewards/margins": 3.1131999492645264, "rewards/rejected": -3.3256328105926514, "step": 568 }, { "epoch": 1.3155532760610336, "grad_norm": 23.959413620362366, "learning_rate": 1.5795011386776159e-07, "logits/chosen": -1.2445893287658691, "logits/rejected": -1.2455319166183472, "logps/chosen": -49.374000549316406, "logps/rejected": -54.24154281616211, "loss": 0.2022, "rewards/accuracies": 0.875, "rewards/chosen": 0.03143635019659996, "rewards/margins": 3.3321659564971924, "rewards/rejected": -3.300729751586914, "step": 570 }, { "epoch": 1.320169252468265, "grad_norm": 16.120326025420926, "learning_rate": 1.560783621929113e-07, "logits/chosen": -1.216759204864502, "logits/rejected": -1.2203327417373657, "logps/chosen": -57.26462936401367, "logps/rejected": -62.25014877319336, "loss": 0.1895, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.36375537514686584, "rewards/margins": 3.7947163581848145, "rewards/rejected": -3.4309606552124023, "step": 572 }, { "epoch": 1.3247852288754969, "grad_norm": 48.11745636864845, "learning_rate": 1.5421272061533177e-07, "logits/chosen": -1.1405613422393799, "logits/rejected": -1.1527469158172607, "logps/chosen": -43.10445785522461, "logps/rejected": -70.5496826171875, "loss": 0.3131, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.3498223125934601, "rewards/margins": 4.530541896820068, "rewards/rejected": -4.180719375610352, "step": 574 }, { "epoch": 1.3294012052827284, "grad_norm": 36.29330680340726, "learning_rate": 1.5235331050483513e-07, "logits/chosen": -1.110296607017517, "logits/rejected": -1.1132102012634277, "logps/chosen": -50.03364944458008, "logps/rejected": -65.54788208007812, "loss": 0.2241, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.058246809989213943, "rewards/margins": 3.8965775966644287, "rewards/rejected": -3.954824209213257, "step": 576 }, { "epoch": 1.3340171816899602, "grad_norm": 12.515890509319174, "learning_rate": 1.5050025282584327e-07, "logits/chosen": -1.1224780082702637, "logits/rejected": -1.1311124563217163, "logps/chosen": -56.89671325683594, "logps/rejected": -74.29792785644531, "loss": 0.1357, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.14432966709136963, "rewards/margins": 4.722050666809082, "rewards/rejected": -4.577720642089844, "step": 578 }, { "epoch": 1.338633158097192, "grad_norm": 16.31324930298768, "learning_rate": 1.4865366812951921e-07, "logits/chosen": -1.0956053733825684, "logits/rejected": -1.0948615074157715, "logps/chosen": -44.66694641113281, "logps/rejected": -53.35417175292969, "loss": 0.186, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.01459770742803812, "rewards/margins": 3.5722241401672363, "rewards/rejected": -3.586822032928467, "step": 580 }, { "epoch": 1.338633158097192, "eval_logits/chosen": -1.1196925640106201, "eval_logits/rejected": -1.1241984367370605, "eval_logps/chosen": -49.20341873168945, "eval_logps/rejected": -64.5632095336914, "eval_loss": 0.23692870140075684, "eval_rewards/accuracies": 0.8352534770965576, "eval_rewards/chosen": -0.28499341011047363, "eval_rewards/margins": 3.9500606060028076, "eval_rewards/rejected": -4.2350544929504395, "eval_runtime": 227.2284, "eval_samples_per_second": 7.631, "eval_steps_per_second": 1.91, "step": 580 }, { "epoch": 1.3432491345044237, "grad_norm": 20.781517687408783, "learning_rate": 1.4681367654592446e-07, "logits/chosen": -1.1334997415542603, "logits/rejected": -1.1324316263198853, "logps/chosen": -51.77867889404297, "logps/rejected": -59.6033935546875, "loss": 0.1616, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.17863652110099792, "rewards/margins": 3.280036449432373, "rewards/rejected": -3.4586730003356934, "step": 582 }, { "epoch": 1.3478651109116553, "grad_norm": 23.414987494186462, "learning_rate": 1.4498039777620353e-07, "logits/chosen": -1.1315072774887085, "logits/rejected": -1.1402992010116577, "logps/chosen": -56.817928314208984, "logps/rejected": -78.69415283203125, "loss": 0.1822, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.08201665431261063, "rewards/margins": 4.948797702789307, "rewards/rejected": -4.866781234741211, "step": 584 }, { "epoch": 1.352481087318887, "grad_norm": 29.704044318217893, "learning_rate": 1.4315395108479728e-07, "logits/chosen": -1.170173168182373, "logits/rejected": -1.1773362159729004, "logps/chosen": -49.731021881103516, "logps/rejected": -67.60116577148438, "loss": 0.1818, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5396385788917542, "rewards/margins": 3.7138872146606445, "rewards/rejected": -4.253525733947754, "step": 586 }, { "epoch": 1.3570970637261186, "grad_norm": 26.692588714788616, "learning_rate": 1.4133445529168365e-07, "logits/chosen": -1.1388497352600098, "logits/rejected": -1.1425156593322754, "logps/chosen": -54.95615005493164, "logps/rejected": -69.78382873535156, "loss": 0.1626, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.5585896968841553, "rewards/margins": 4.065035820007324, "rewards/rejected": -4.623625755310059, "step": 588 }, { "epoch": 1.3617130401333504, "grad_norm": 17.240579232471294, "learning_rate": 1.395220287646483e-07, "logits/chosen": -1.1489366292953491, "logits/rejected": -1.1541228294372559, "logps/chosen": -52.8204345703125, "logps/rejected": -65.34224700927734, "loss": 0.1726, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.6690115332603455, "rewards/margins": 3.6602673530578613, "rewards/rejected": -4.329278945922852, "step": 590 }, { "epoch": 1.3663290165405821, "grad_norm": 26.542897065871216, "learning_rate": 1.377167894115837e-07, "logits/chosen": -1.092912197113037, "logits/rejected": -1.1030445098876953, "logps/chosen": -45.3072509765625, "logps/rejected": -80.26785278320312, "loss": 0.1874, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.4750543236732483, "rewards/margins": 4.889781475067139, "rewards/rejected": -5.3648362159729, "step": 592 }, { "epoch": 1.370944992947814, "grad_norm": 14.292615359684781, "learning_rate": 1.3591885467281877e-07, "logits/chosen": -1.2319241762161255, "logits/rejected": -1.2370344400405884, "logps/chosen": -47.54234313964844, "logps/rejected": -69.16310119628906, "loss": 0.175, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.6050369739532471, "rewards/margins": 4.743907451629639, "rewards/rejected": -5.348944664001465, "step": 594 }, { "epoch": 1.3755609693550455, "grad_norm": 24.44530683273813, "learning_rate": 1.3412834151347896e-07, "logits/chosen": -1.1558417081832886, "logits/rejected": -1.1581149101257324, "logps/chosen": -51.461524963378906, "logps/rejected": -66.95028686523438, "loss": 0.1852, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5921568274497986, "rewards/margins": 4.088617324829102, "rewards/rejected": -4.680773735046387, "step": 596 }, { "epoch": 1.3801769457622772, "grad_norm": 20.235949045232623, "learning_rate": 1.323453664158769e-07, "logits/chosen": -1.15675687789917, "logits/rejected": -1.1713188886642456, "logps/chosen": -47.177574157714844, "logps/rejected": -76.56927490234375, "loss": 0.2271, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8099576234817505, "rewards/margins": 4.676289081573486, "rewards/rejected": -5.486246585845947, "step": 598 }, { "epoch": 1.3847929221695088, "grad_norm": 17.33444042750883, "learning_rate": 1.3057004537193422e-07, "logits/chosen": -1.178117036819458, "logits/rejected": -1.1799274682998657, "logps/chosen": -53.55020523071289, "logps/rejected": -62.39503860473633, "loss": 0.1845, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.5546784996986389, "rewards/margins": 4.285789966583252, "rewards/rejected": -4.840468883514404, "step": 600 }, { "epoch": 1.3847929221695088, "eval_logits/chosen": -1.1150095462799072, "eval_logits/rejected": -1.1200028657913208, "eval_logps/chosen": -49.0994873046875, "eval_logps/rejected": -64.55924987792969, "eval_loss": 0.23688365519046783, "eval_rewards/accuracies": 0.8312212228775024, "eval_rewards/chosen": -0.2330285757780075, "eval_rewards/margins": 4.000042915344238, "eval_rewards/rejected": -4.2330708503723145, "eval_runtime": 227.4951, "eval_samples_per_second": 7.622, "eval_steps_per_second": 1.908, "step": 600 }, { "epoch": 1.3894088985767405, "grad_norm": 37.7777346930926, "learning_rate": 1.2880249387563662e-07, "logits/chosen": -1.1187705993652344, "logits/rejected": -1.1262003183364868, "logps/chosen": -51.49440002441406, "logps/rejected": -74.16474914550781, "loss": 0.1865, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5179623365402222, "rewards/margins": 4.877338409423828, "rewards/rejected": -5.395299911499023, "step": 602 }, { "epoch": 1.3940248749839723, "grad_norm": 9.235672719669628, "learning_rate": 1.2704282691551938e-07, "logits/chosen": -1.116797924041748, "logits/rejected": -1.1314423084259033, "logps/chosen": -47.851707458496094, "logps/rejected": -79.61589813232422, "loss": 0.1526, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.13351619243621826, "rewards/margins": 5.681245803833008, "rewards/rejected": -5.5477294921875, "step": 604 }, { "epoch": 1.398640851391204, "grad_norm": 22.69345823971204, "learning_rate": 1.2529115896718714e-07, "logits/chosen": -1.1550525426864624, "logits/rejected": -1.158216118812561, "logps/chosen": -52.753231048583984, "logps/rejected": -61.50723648071289, "loss": 0.1924, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.3491640090942383, "rewards/margins": 3.678175926208496, "rewards/rejected": -4.027340412139893, "step": 606 }, { "epoch": 1.4032568277984356, "grad_norm": 18.832219757797976, "learning_rate": 1.2354760398586708e-07, "logits/chosen": -1.0966627597808838, "logits/rejected": -1.108534574508667, "logps/chosen": -55.49364471435547, "logps/rejected": -82.76998901367188, "loss": 0.1459, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.10722073167562485, "rewards/margins": 5.6869659423828125, "rewards/rejected": -5.794186592102051, "step": 608 }, { "epoch": 1.4078728042056674, "grad_norm": 14.9525121487636, "learning_rate": 1.2181227539899468e-07, "logits/chosen": -1.1296883821487427, "logits/rejected": -1.1345244646072388, "logps/chosen": -52.32283020019531, "logps/rejected": -67.40359497070312, "loss": 0.1857, "rewards/accuracies": 0.875, "rewards/chosen": -0.009007109329104424, "rewards/margins": 4.094158172607422, "rewards/rejected": -4.103165626525879, "step": 610 }, { "epoch": 1.412488780612899, "grad_norm": 14.336076086644082, "learning_rate": 1.2008528609883557e-07, "logits/chosen": -1.1148794889450073, "logits/rejected": -1.125455379486084, "logps/chosen": -54.79003143310547, "logps/rejected": -75.44340515136719, "loss": 0.1438, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.29885244369506836, "rewards/margins": 5.630979537963867, "rewards/rejected": -5.332127571105957, "step": 612 }, { "epoch": 1.4171047570201307, "grad_norm": 19.871071765511616, "learning_rate": 1.1836674843514042e-07, "logits/chosen": -1.1470178365707397, "logits/rejected": -1.1546311378479004, "logps/chosen": -43.762474060058594, "logps/rejected": -64.04544830322266, "loss": 0.1737, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.18334037065505981, "rewards/margins": 4.691874027252197, "rewards/rejected": -4.508533477783203, "step": 614 }, { "epoch": 1.4217207334273625, "grad_norm": 12.336389870346276, "learning_rate": 1.1665677420783671e-07, "logits/chosen": -1.1334505081176758, "logits/rejected": -1.1354079246520996, "logps/chosen": -48.85646057128906, "logps/rejected": -58.98961639404297, "loss": 0.1774, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.5762940645217896, "rewards/margins": 4.42154598236084, "rewards/rejected": -3.84525203704834, "step": 616 }, { "epoch": 1.4263367098345943, "grad_norm": 26.99839545918975, "learning_rate": 1.149554746597553e-07, "logits/chosen": -1.153773546218872, "logits/rejected": -1.1617780923843384, "logps/chosen": -51.52302551269531, "logps/rejected": -71.93586730957031, "loss": 0.2209, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.06947656720876694, "rewards/margins": 4.659074783325195, "rewards/rejected": -4.728551387786865, "step": 618 }, { "epoch": 1.4309526862418258, "grad_norm": 23.120670434819147, "learning_rate": 1.1326296046939333e-07, "logits/chosen": -1.2297728061676025, "logits/rejected": -1.2327196598052979, "logps/chosen": -46.15282440185547, "logps/rejected": -57.13591766357422, "loss": 0.2511, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.22867631912231445, "rewards/margins": 3.8101589679718018, "rewards/rejected": -3.5814828872680664, "step": 620 }, { "epoch": 1.4309526862418258, "eval_logits/chosen": -1.1074106693267822, "eval_logits/rejected": -1.1129966974258423, "eval_logps/chosen": -48.4957389831543, "eval_logps/rejected": -64.15252685546875, "eval_loss": 0.23753519356250763, "eval_rewards/accuracies": 0.8341013789176941, "eval_rewards/chosen": 0.06884526461362839, "eval_rewards/margins": 4.098559379577637, "eval_rewards/rejected": -4.029714107513428, "eval_runtime": 227.1817, "eval_samples_per_second": 7.633, "eval_steps_per_second": 1.91, "step": 620 }, { "epoch": 1.4355686626490576, "grad_norm": 19.372274896588006, "learning_rate": 1.1157934174371413e-07, "logits/chosen": -1.1408151388168335, "logits/rejected": -1.15198814868927, "logps/chosen": -50.5245361328125, "logps/rejected": -73.73757934570312, "loss": 0.1935, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.12286674976348877, "rewards/margins": 4.926203727722168, "rewards/rejected": -4.803337574005127, "step": 622 }, { "epoch": 1.4401846390562894, "grad_norm": 17.679599039225316, "learning_rate": 1.0990472801098419e-07, "logits/chosen": -1.1729627847671509, "logits/rejected": -1.1764029264450073, "logps/chosen": -45.59742736816406, "logps/rejected": -67.27173614501953, "loss": 0.1504, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.4040929973125458, "rewards/margins": 4.669921398162842, "rewards/rejected": -4.265828609466553, "step": 624 }, { "epoch": 1.444800615463521, "grad_norm": 21.626789388307476, "learning_rate": 1.0823922821364795e-07, "logits/chosen": -1.0937749147415161, "logits/rejected": -1.0970505475997925, "logps/chosen": -57.45598602294922, "logps/rejected": -64.92636108398438, "loss": 0.1922, "rewards/accuracies": 0.875, "rewards/chosen": 0.20599400997161865, "rewards/margins": 4.274693489074707, "rewards/rejected": -4.068699836730957, "step": 626 }, { "epoch": 1.4494165918707527, "grad_norm": 22.846453164687816, "learning_rate": 1.0658295070124026e-07, "logits/chosen": -1.1624855995178223, "logits/rejected": -1.1643033027648926, "logps/chosen": -54.79329299926758, "logps/rejected": -61.6580810546875, "loss": 0.2111, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.43274542689323425, "rewards/margins": 4.024546146392822, "rewards/rejected": -3.5918006896972656, "step": 628 }, { "epoch": 1.4540325682779844, "grad_norm": 35.15148920852289, "learning_rate": 1.0493600322333762e-07, "logits/chosen": -1.1498773097991943, "logits/rejected": -1.1634445190429688, "logps/chosen": -50.85679626464844, "logps/rejected": -84.39616394042969, "loss": 0.167, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.023858733475208282, "rewards/margins": 6.123683929443359, "rewards/rejected": -6.147542953491211, "step": 630 }, { "epoch": 1.458648544685216, "grad_norm": 30.044775332799237, "learning_rate": 1.0329849292254883e-07, "logits/chosen": -1.0466785430908203, "logits/rejected": -1.0532869100570679, "logps/chosen": -50.80934143066406, "logps/rejected": -70.91539001464844, "loss": 0.2205, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.08874227106571198, "rewards/margins": 4.631314277648926, "rewards/rejected": -4.542571544647217, "step": 632 }, { "epoch": 1.4632645210924478, "grad_norm": 27.395602840016966, "learning_rate": 1.0167052632754458e-07, "logits/chosen": -1.1735262870788574, "logits/rejected": -1.1718860864639282, "logps/chosen": -46.975406646728516, "logps/rejected": -56.96874237060547, "loss": 0.2108, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.012903611175715923, "rewards/margins": 3.0641860961914062, "rewards/rejected": -3.0512828826904297, "step": 634 }, { "epoch": 1.4678804974996795, "grad_norm": 19.703963008444365, "learning_rate": 1.0005220934612713e-07, "logits/chosen": -1.0792059898376465, "logits/rejected": -1.0779961347579956, "logps/chosen": -54.955718994140625, "logps/rejected": -60.88299560546875, "loss": 0.1733, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.10138123482465744, "rewards/margins": 3.779219627380371, "rewards/rejected": -3.6778385639190674, "step": 636 }, { "epoch": 1.472496473906911, "grad_norm": 26.332520125277515, "learning_rate": 9.844364725834056e-08, "logits/chosen": -1.1881197690963745, "logits/rejected": -1.201986312866211, "logps/chosen": -51.59233093261719, "logps/rejected": -88.54263305664062, "loss": 0.1426, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.19230936467647552, "rewards/margins": 6.799825668334961, "rewards/rejected": -6.607515811920166, "step": 638 }, { "epoch": 1.4771124503141428, "grad_norm": 28.45092893361144, "learning_rate": 9.68449447096217e-08, "logits/chosen": -1.2595468759536743, "logits/rejected": -1.2635498046875, "logps/chosen": -45.43921661376953, "logps/rejected": -59.12704849243164, "loss": 0.3573, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.18244637548923492, "rewards/margins": 3.4541211128234863, "rewards/rejected": -3.271674394607544, "step": 640 }, { "epoch": 1.4771124503141428, "eval_logits/chosen": -1.1004152297973633, "eval_logits/rejected": -1.1063543558120728, "eval_logps/chosen": -48.19259262084961, "eval_logps/rejected": -63.969642639160156, "eval_loss": 0.23769782483577728, "eval_rewards/accuracies": 0.8346773982048035, "eval_rewards/chosen": 0.22041727602481842, "eval_rewards/margins": 4.15868616104126, "eval_rewards/rejected": -3.9382688999176025, "eval_runtime": 227.6304, "eval_samples_per_second": 7.618, "eval_steps_per_second": 1.907, "step": 640 }, { "epoch": 1.4817284267213746, "grad_norm": 24.31958981085731, "learning_rate": 9.525620570399259e-08, "logits/chosen": -1.1648564338684082, "logits/rejected": -1.1766667366027832, "logps/chosen": -50.69769287109375, "logps/rejected": -73.31265258789062, "loss": 0.138, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.1818230152130127, "rewards/margins": 4.770647048950195, "rewards/rejected": -4.5888237953186035, "step": 642 }, { "epoch": 1.4863444031286062, "grad_norm": 16.147422378148644, "learning_rate": 9.36775335972943e-08, "logits/chosen": -1.1746997833251953, "logits/rejected": -1.2103776931762695, "logps/chosen": -45.329200744628906, "logps/rejected": -116.44515228271484, "loss": 0.166, "rewards/accuracies": 0.875, "rewards/chosen": 0.2069842666387558, "rewards/margins": 9.259431838989258, "rewards/rejected": -9.052447319030762, "step": 644 }, { "epoch": 1.490960379535838, "grad_norm": 26.623831420383585, "learning_rate": 9.210903109046284e-08, "logits/chosen": -1.186785340309143, "logits/rejected": -1.1978414058685303, "logps/chosen": -49.64828872680664, "logps/rejected": -75.10488891601562, "loss": 0.1953, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.27270856499671936, "rewards/margins": 5.708976745605469, "rewards/rejected": -5.981686115264893, "step": 646 }, { "epoch": 1.4955763559430697, "grad_norm": 20.261642357856545, "learning_rate": 9.05508002228485e-08, "logits/chosen": -1.1485164165496826, "logits/rejected": -1.1560351848602295, "logps/chosen": -43.76812744140625, "logps/rejected": -61.48163604736328, "loss": 0.2165, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.30232900381088257, "rewards/margins": 4.651912689208984, "rewards/rejected": -4.349584579467773, "step": 648 }, { "epoch": 1.5001923323503012, "grad_norm": 16.434029065803497, "learning_rate": 8.900294236557707e-08, "logits/chosen": -1.1660001277923584, "logits/rejected": -1.1689536571502686, "logps/chosen": -43.022518157958984, "logps/rejected": -55.89129638671875, "loss": 0.2104, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.252067506313324, "rewards/margins": 3.607164144515991, "rewards/rejected": -3.3550968170166016, "step": 650 }, { "epoch": 1.504808308757533, "grad_norm": 17.153074627752197, "learning_rate": 8.746555821495561e-08, "logits/chosen": -1.1337089538574219, "logits/rejected": -1.149064540863037, "logps/chosen": -49.65050506591797, "logps/rejected": -73.29889678955078, "loss": 0.168, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.3097386956214905, "rewards/margins": 5.331449031829834, "rewards/rejected": -5.0217108726501465, "step": 652 }, { "epoch": 1.5094242851647648, "grad_norm": 23.56988440345321, "learning_rate": 8.593874778592122e-08, "logits/chosen": -1.1784387826919556, "logits/rejected": -1.1783757209777832, "logps/chosen": -43.02362823486328, "logps/rejected": -56.035400390625, "loss": 0.1716, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.2998148500919342, "rewards/margins": 3.706061840057373, "rewards/rejected": -3.4062466621398926, "step": 654 }, { "epoch": 1.5140402615719966, "grad_norm": 24.44217268761521, "learning_rate": 8.442261040553472e-08, "logits/chosen": -1.1735872030258179, "logits/rejected": -1.1768840551376343, "logps/chosen": -50.59683609008789, "logps/rejected": -56.11674880981445, "loss": 0.1561, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.28288352489471436, "rewards/margins": 3.881596088409424, "rewards/rejected": -3.59871244430542, "step": 656 }, { "epoch": 1.518656237979228, "grad_norm": 32.02127603424348, "learning_rate": 8.291724470651903e-08, "logits/chosen": -1.1492629051208496, "logits/rejected": -1.1569753885269165, "logps/chosen": -51.28215026855469, "logps/rejected": -65.28805541992188, "loss": 0.2696, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.23993118107318878, "rewards/margins": 3.802645444869995, "rewards/rejected": -4.042576313018799, "step": 658 }, { "epoch": 1.5232722143864597, "grad_norm": 16.1030974632108, "learning_rate": 8.14227486208423e-08, "logits/chosen": -1.2377209663391113, "logits/rejected": -1.2406808137893677, "logps/chosen": -45.322593688964844, "logps/rejected": -61.13237762451172, "loss": 0.1727, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.44821304082870483, "rewards/margins": 4.551301956176758, "rewards/rejected": -4.103089332580566, "step": 660 }, { "epoch": 1.5232722143864597, "eval_logits/chosen": -1.106886386871338, "eval_logits/rejected": -1.1123414039611816, "eval_logps/chosen": -48.28040313720703, "eval_logps/rejected": -64.126708984375, "eval_loss": 0.23655745387077332, "eval_rewards/accuracies": 0.8346773982048035, "eval_rewards/chosen": 0.17651285231113434, "eval_rewards/margins": 4.193314552307129, "eval_rewards/rejected": -4.016801834106445, "eval_runtime": 227.3008, "eval_samples_per_second": 7.629, "eval_steps_per_second": 1.909, "step": 660 }, { "epoch": 1.5278881907936914, "grad_norm": 26.89012407667588, "learning_rate": 7.993921937334716e-08, "logits/chosen": -1.1547244787216187, "logits/rejected": -1.1549021005630493, "logps/chosen": -48.07485580444336, "logps/rejected": -55.9273567199707, "loss": 0.2437, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.16911821067333221, "rewards/margins": 3.7287042140960693, "rewards/rejected": -3.5595860481262207, "step": 662 }, { "epoch": 1.5325041672009232, "grad_norm": 9.277145884274647, "learning_rate": 7.846675347542578e-08, "logits/chosen": -1.0987221002578735, "logits/rejected": -1.0982441902160645, "logps/chosen": -44.44068908691406, "logps/rejected": -55.658538818359375, "loss": 0.1239, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.7777649760246277, "rewards/margins": 4.145383834838867, "rewards/rejected": -3.3676185607910156, "step": 664 }, { "epoch": 1.537120143608155, "grad_norm": 23.47472041668218, "learning_rate": 7.700544671874079e-08, "logits/chosen": -1.1225018501281738, "logits/rejected": -1.1206145286560059, "logps/chosen": -55.324520111083984, "logps/rejected": -59.72848892211914, "loss": 0.1971, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.16201752424240112, "rewards/margins": 3.718522071838379, "rewards/rejected": -3.556504487991333, "step": 666 }, { "epoch": 1.5417361200153867, "grad_norm": 36.079809765281745, "learning_rate": 7.555539416899437e-08, "logits/chosen": -1.1977320909500122, "logits/rejected": -1.2057994604110718, "logps/chosen": -42.88581848144531, "logps/rejected": -62.64973831176758, "loss": 0.2275, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.011536385864019394, "rewards/margins": 4.496410369873047, "rewards/rejected": -4.484873294830322, "step": 668 }, { "epoch": 1.5463520964226183, "grad_norm": 24.671849788858164, "learning_rate": 7.41166901597429e-08, "logits/chosen": -1.115236520767212, "logits/rejected": -1.117641806602478, "logps/chosen": -48.77861785888672, "logps/rejected": -63.02436065673828, "loss": 0.1757, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.31542646884918213, "rewards/margins": 4.368900775909424, "rewards/rejected": -4.053474426269531, "step": 670 }, { "epoch": 1.5509680728298498, "grad_norm": 22.124151348706462, "learning_rate": 7.268942828626046e-08, "logits/chosen": -1.201170802116394, "logits/rejected": -1.2061718702316284, "logps/chosen": -46.17606735229492, "logps/rejected": -60.99459457397461, "loss": 0.2017, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.4047514796257019, "rewards/margins": 4.501495361328125, "rewards/rejected": -4.096743583679199, "step": 672 }, { "epoch": 1.5555840492370816, "grad_norm": 24.405076004524442, "learning_rate": 7.127370139945018e-08, "logits/chosen": -1.1625523567199707, "logits/rejected": -1.1654243469238281, "logps/chosen": -48.26844024658203, "logps/rejected": -65.458740234375, "loss": 0.1711, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4592117667198181, "rewards/margins": 4.522068500518799, "rewards/rejected": -4.062856674194336, "step": 674 }, { "epoch": 1.5602000256443134, "grad_norm": 18.612456705571446, "learning_rate": 6.986960159980326e-08, "logits/chosen": -1.1604725122451782, "logits/rejected": -1.16167414188385, "logps/chosen": -50.374324798583984, "logps/rejected": -61.40808868408203, "loss": 0.1842, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.5715973377227783, "rewards/margins": 3.9924135208129883, "rewards/rejected": -3.42081618309021, "step": 676 }, { "epoch": 1.5648160020515451, "grad_norm": 43.48667538563028, "learning_rate": 6.847722023140776e-08, "logits/chosen": -1.2388062477111816, "logits/rejected": -1.234206199645996, "logps/chosen": -44.29112243652344, "logps/rejected": -51.78242492675781, "loss": 0.2001, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.6832877993583679, "rewards/margins": 3.618730306625366, "rewards/rejected": -2.9354422092437744, "step": 678 }, { "epoch": 1.569431978458777, "grad_norm": 28.795198035373833, "learning_rate": 6.709664787600616e-08, "logits/chosen": -1.209147572517395, "logits/rejected": -1.2093759775161743, "logps/chosen": -42.70883560180664, "logps/rejected": -50.379886627197266, "loss": 0.2779, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.21883456408977509, "rewards/margins": 2.795126438140869, "rewards/rejected": -2.576291799545288, "step": 680 }, { "epoch": 1.569431978458777, "eval_logits/chosen": -1.118242621421814, "eval_logits/rejected": -1.1229368448257446, "eval_logps/chosen": -47.97200012207031, "eval_logps/rejected": -63.929046630859375, "eval_loss": 0.2367607206106186, "eval_rewards/accuracies": 0.8312212228775024, "eval_rewards/chosen": 0.33071503043174744, "eval_rewards/margins": 4.248687744140625, "eval_rewards/rejected": -3.9179720878601074, "eval_runtime": 227.4731, "eval_samples_per_second": 7.623, "eval_steps_per_second": 1.908, "step": 680 }, { "epoch": 1.5740479548660085, "grad_norm": 22.618944291226985, "learning_rate": 6.572797434710219e-08, "logits/chosen": -1.2124552726745605, "logits/rejected": -1.2253483533859253, "logps/chosen": -45.591331481933594, "logps/rejected": -77.16690063476562, "loss": 0.1948, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.740224301815033, "rewards/margins": 5.656050682067871, "rewards/rejected": -4.91582727432251, "step": 682 }, { "epoch": 1.57866393127324, "grad_norm": 17.206603905842275, "learning_rate": 6.437128868411856e-08, "logits/chosen": -1.1554473638534546, "logits/rejected": -1.1552696228027344, "logps/chosen": -45.49040222167969, "logps/rejected": -53.40934753417969, "loss": 0.2204, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.516470193862915, "rewards/margins": 3.7008235454559326, "rewards/rejected": -3.1843535900115967, "step": 684 }, { "epoch": 1.5832799076804718, "grad_norm": 14.245504042199357, "learning_rate": 6.302667914660384e-08, "logits/chosen": -1.1610139608383179, "logits/rejected": -1.1691855192184448, "logps/chosen": -41.98570251464844, "logps/rejected": -61.45694351196289, "loss": 0.2166, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.34538733959198, "rewards/margins": 4.076152324676514, "rewards/rejected": -3.7307653427124023, "step": 686 }, { "epoch": 1.5878958840877035, "grad_norm": 25.659568066978697, "learning_rate": 6.169423320849112e-08, "logits/chosen": -1.139572024345398, "logits/rejected": -1.1325372457504272, "logps/chosen": -52.09668731689453, "logps/rejected": -52.88841247558594, "loss": 0.1753, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.5496838092803955, "rewards/margins": 3.8007943630218506, "rewards/rejected": -3.251110553741455, "step": 688 }, { "epoch": 1.5925118604949353, "grad_norm": 23.501661963559606, "learning_rate": 6.037403755240748e-08, "logits/chosen": -1.1798467636108398, "logits/rejected": -1.1862027645111084, "logps/chosen": -51.716487884521484, "logps/rejected": -67.60077667236328, "loss": 0.175, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.23453055322170258, "rewards/margins": 4.55530309677124, "rewards/rejected": -4.320772171020508, "step": 690 }, { "epoch": 1.597127836902167, "grad_norm": 17.275740083868094, "learning_rate": 5.9066178064034326e-08, "logits/chosen": -1.1528249979019165, "logits/rejected": -1.1694761514663696, "logps/chosen": -38.17667770385742, "logps/rejected": -83.85364532470703, "loss": 0.2264, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.28412505984306335, "rewards/margins": 6.359869003295898, "rewards/rejected": -6.075743675231934, "step": 692 }, { "epoch": 1.6017438133093986, "grad_norm": 26.873806928959727, "learning_rate": 5.777073982652064e-08, "logits/chosen": -1.1307650804519653, "logits/rejected": -1.132928490638733, "logps/chosen": -41.0783576965332, "logps/rejected": -60.499267578125, "loss": 0.2291, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.24884945154190063, "rewards/margins": 4.286365509033203, "rewards/rejected": -4.037516117095947, "step": 694 }, { "epoch": 1.6063597897166302, "grad_norm": 32.76563481878735, "learning_rate": 5.6487807114947325e-08, "logits/chosen": -1.133086085319519, "logits/rejected": -1.149233102798462, "logps/chosen": -49.21199035644531, "logps/rejected": -82.91252899169922, "loss": 0.1834, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.2540854215621948, "rewards/margins": 5.426255226135254, "rewards/rejected": -5.172169208526611, "step": 696 }, { "epoch": 1.610975766123862, "grad_norm": 30.041135081079663, "learning_rate": 5.521746339084532e-08, "logits/chosen": -1.1215559244155884, "logits/rejected": -1.130250334739685, "logps/chosen": -54.02631759643555, "logps/rejected": -67.99839782714844, "loss": 0.2581, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.2139756679534912, "rewards/margins": 4.321435928344727, "rewards/rejected": -4.107460021972656, "step": 698 }, { "epoch": 1.6155917425310937, "grad_norm": 21.854488700327504, "learning_rate": 5.39597912967652e-08, "logits/chosen": -1.131272315979004, "logits/rejected": -1.1449000835418701, "logps/chosen": -45.570159912109375, "logps/rejected": -73.73873901367188, "loss": 0.1902, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.329825758934021, "rewards/margins": 5.084650039672852, "rewards/rejected": -4.754825115203857, "step": 700 }, { "epoch": 1.6155917425310937, "eval_logits/chosen": -1.1056500673294067, "eval_logits/rejected": -1.1112511157989502, "eval_logps/chosen": -48.541831970214844, "eval_logps/rejected": -64.42839050292969, "eval_loss": 0.23584917187690735, "eval_rewards/accuracies": 0.8323732614517212, "eval_rewards/chosen": 0.04579799994826317, "eval_rewards/margins": 4.213436603546143, "eval_rewards/rejected": -4.167638778686523, "eval_runtime": 227.3674, "eval_samples_per_second": 7.626, "eval_steps_per_second": 1.909, "step": 700 } ], "logging_steps": 2, "max_steps": 866, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }