{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9993753903810119, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006246096189881324, "grad_norm": 12.530923843383789, "kl": 0.36666667461395264, "learning_rate": 1.5625e-07, "logits/chosen": 765380163.5404255, "logits/rejected": 259708735.73877552, "logps/chosen": -2094.8425531914895, "logps/rejected": -1494.334693877551, "loss": 0.5003, "rewards/chosen": 1.1236422924285239, "rewards/margins": -85949919.52941893, "rewards/rejected": 85949920.65306123, "step": 5 }, { "epoch": 0.012492192379762648, "grad_norm": 11.841174125671387, "kl": 0.5041666626930237, "learning_rate": 3.125e-07, "logits/chosen": 922524724.0677966, "logits/rejected": 222366871.0819672, "logps/chosen": -2251.6610169491523, "logps/rejected": -1449.5737704918033, "loss": 0.4989, "rewards/chosen": 1.0612761933924788, "rewards/margins": -41449781.49610086, "rewards/rejected": 41449782.55737705, "step": 10 }, { "epoch": 0.018738288569643973, "grad_norm": 16.48398780822754, "kl": 1.5750000476837158, "learning_rate": 4.6875000000000006e-07, "logits/chosen": 586256366.4206009, "logits/rejected": 338159392.1295546, "logps/chosen": -2081.922746781116, "logps/rejected": -1634.8502024291497, "loss": 0.4964, "rewards/chosen": 2.080868045659536, "rewards/margins": -31932565.23897001, "rewards/rejected": 31932567.31983806, "step": 15 }, { "epoch": 0.024984384759525295, "grad_norm": 10.12692928314209, "kl": 4.645833492279053, "learning_rate": 6.25e-07, "logits/chosen": 646690373.632, "logits/rejected": 292872975.5826087, "logps/chosen": -1991.936, "logps/rejected": -1609.0434782608695, "loss": 0.4901, "rewards/chosen": 1.5784925537109376, "rewards/margins": -24520791.46498571, "rewards/rejected": 24520793.04347826, "step": 20 }, { "epoch": 0.03123048094940662, "grad_norm": 9.373343467712402, "kl": 10.037500381469727, "learning_rate": 7.8125e-07, "logits/chosen": 722191438.0627803, "logits/rejected": 241486644.7937743, "logps/chosen": -2005.9551569506727, "logps/rejected": -1370.645914396887, "loss": 0.4737, "rewards/chosen": 1.6375982857605802, "rewards/margins": 6768824.921644978, "rewards/rejected": -6768823.284046693, "step": 25 }, { "epoch": 0.037476577139287946, "grad_norm": 9.365606307983398, "kl": 23.879167556762695, "learning_rate": 9.375000000000001e-07, "logits/chosen": 924951578.2564102, "logits/rejected": 539978277.4634147, "logps/chosen": -2154.119658119658, "logps/rejected": -1664.780487804878, "loss": 0.4414, "rewards/chosen": 1.9932785686264691, "rewards/margins": -80350996.90103038, "rewards/rejected": 80350998.89430894, "step": 30 }, { "epoch": 0.04372267332916927, "grad_norm": 8.471269607543945, "kl": 40.59166717529297, "learning_rate": 1.0937500000000001e-06, "logits/chosen": 1037311177.5100402, "logits/rejected": 570116672.2770563, "logps/chosen": -2063.0361445783133, "logps/rejected": -1451.2207792207791, "loss": 0.4117, "rewards/chosen": 2.058617281626506, "rewards/margins": -23764258.51281129, "rewards/rejected": 23764260.57142857, "step": 35 }, { "epoch": 0.04996876951905059, "grad_norm": 7.911782741546631, "kl": 63.125, "learning_rate": 1.25e-06, "logits/chosen": 1458762482.9490197, "logits/rejected": 982212790.0444444, "logps/chosen": -2011.2313725490196, "logps/rejected": -1490.9155555555556, "loss": 0.3641, "rewards/chosen": 2.3586921243106618, "rewards/margins": -33682500.1835301, "rewards/rejected": 33682502.542222224, "step": 40 }, { "epoch": 0.056214865708931916, "grad_norm": 4.834893703460693, "kl": 68.25, "learning_rate": 1.40625e-06, "logits/chosen": 1641465751.8644068, "logits/rejected": 1301059348.9836066, "logps/chosen": -1905.6271186440679, "logps/rejected": -1531.4754098360656, "loss": 0.3385, "rewards/chosen": 2.5306468898967163, "rewards/margins": -92570008.74804163, "rewards/rejected": 92570011.27868852, "step": 45 }, { "epoch": 0.06246096189881324, "grad_norm": 4.706682205200195, "kl": 77.67500305175781, "learning_rate": 1.5625e-06, "logits/chosen": 2110757053.3109243, "logits/rejected": 1733322125.7520661, "logps/chosen": -1932.1008403361345, "logps/rejected": -1390.4132231404958, "loss": 0.305, "rewards/chosen": 2.7748095087644433, "rewards/margins": -111042684.16733925, "rewards/rejected": 111042686.94214876, "step": 50 }, { "epoch": 0.06870705808869457, "grad_norm": 4.392908096313477, "kl": 81.23332977294922, "learning_rate": 1.71875e-06, "logits/chosen": 2346525682.9276595, "logits/rejected": 2144470596.9632654, "logps/chosen": -1960.0340425531915, "logps/rejected": -1477.0938775510203, "loss": 0.2917, "rewards/chosen": 2.3351378095910906, "rewards/margins": -107784122.7913928, "rewards/rejected": 107784125.12653062, "step": 55 }, { "epoch": 0.07495315427857589, "grad_norm": 6.164892196655273, "kl": 86.1624984741211, "learning_rate": 1.8750000000000003e-06, "logits/chosen": 2208228740.413793, "logits/rejected": 2164937364.645161, "logps/chosen": -1969.9310344827586, "logps/rejected": -1540.774193548387, "loss": 0.2827, "rewards/chosen": 2.851356769430226, "rewards/margins": -100023912.50348194, "rewards/rejected": 100023915.35483871, "step": 60 }, { "epoch": 0.08119925046845722, "grad_norm": 3.74001145362854, "kl": 92.03333282470703, "learning_rate": 2.0312500000000002e-06, "logits/chosen": 2025868071.9266055, "logits/rejected": 2308852290.442748, "logps/chosen": -2005.2844036697247, "logps/rejected": -1620.1526717557251, "loss": 0.2535, "rewards/chosen": 3.949506287181049, "rewards/margins": -85202893.14973035, "rewards/rejected": 85202897.09923664, "step": 65 }, { "epoch": 0.08744534665833854, "grad_norm": 3.4245049953460693, "kl": 93.34166717529297, "learning_rate": 2.1875000000000002e-06, "logits/chosen": 1961691515.2592592, "logits/rejected": 2473542115.9156117, "logps/chosen": -1924.2469135802469, "logps/rejected": -1569.0801687763712, "loss": 0.2364, "rewards/chosen": 3.217338405028292, "rewards/margins": -110899300.47886413, "rewards/rejected": 110899303.69620253, "step": 70 }, { "epoch": 0.09369144284821987, "grad_norm": 2.3636491298675537, "kl": 75.18333435058594, "learning_rate": 2.3437500000000002e-06, "logits/chosen": 1799070057.8955824, "logits/rejected": 2992154739.255411, "logps/chosen": -1946.7951807228915, "logps/rejected": -1687.5497835497836, "loss": 0.1854, "rewards/chosen": 3.330800496909513, "rewards/margins": -75541336.65188348, "rewards/rejected": 75541339.98268399, "step": 75 }, { "epoch": 0.09993753903810118, "grad_norm": 5.065158843994141, "kl": 37.454166412353516, "learning_rate": 2.5e-06, "logits/chosen": 1232394418.4564316, "logits/rejected": 3475059836.251046, "logps/chosen": -1765.97510373444, "logps/rejected": -1910.4937238493724, "loss": 0.1416, "rewards/chosen": 3.2204437889004147, "rewards/margins": -76902045.44901228, "rewards/rejected": 76902048.66945606, "step": 80 }, { "epoch": 0.1061836352279825, "grad_norm": 2.6473050117492676, "kl": 10.274999618530273, "learning_rate": 2.65625e-06, "logits/chosen": 616933788.9711934, "logits/rejected": 3842477993.586498, "logps/chosen": -1740.9053497942386, "logps/rejected": -2050.9704641350213, "loss": 0.103, "rewards/chosen": 3.3858763141396606, "rewards/margins": -32738116.985431705, "rewards/rejected": 32738120.371308018, "step": 85 }, { "epoch": 0.11242973141786383, "grad_norm": 1.7162717580795288, "kl": 12.845833778381348, "learning_rate": 2.8125e-06, "logits/chosen": 694821410.1333333, "logits/rejected": 4035479688.5333333, "logps/chosen": -1776.4, "logps/rejected": -2271.4666666666667, "loss": 0.0757, "rewards/chosen": 4.09275156656901, "rewards/margins": -24005026.307248432, "rewards/rejected": 24005030.4, "step": 90 }, { "epoch": 0.11867582760774516, "grad_norm": 1.7540525197982788, "kl": 2.304166555404663, "learning_rate": 2.96875e-06, "logits/chosen": 658665198.9333333, "logits/rejected": 4033801966.9333334, "logps/chosen": -2076.266666666667, "logps/rejected": -2257.866666666667, "loss": 0.0738, "rewards/chosen": 3.6998219807942707, "rewards/margins": -76789833.10017802, "rewards/rejected": 76789836.8, "step": 95 }, { "epoch": 0.12492192379762648, "grad_norm": 1.0445038080215454, "kl": 19.09166717529297, "learning_rate": 3.125e-06, "logits/chosen": 273522077.6156863, "logits/rejected": 3739305902.08, "logps/chosen": -1813.835294117647, "logps/rejected": -2274.56, "loss": 0.0878, "rewards/chosen": 3.6624865962009805, "rewards/margins": -22223153.813068956, "rewards/rejected": 22223157.475555554, "step": 100 }, { "epoch": 0.1311680199875078, "grad_norm": 1.0965297222137451, "kl": 9.591666221618652, "learning_rate": 3.28125e-06, "logits/chosen": 526766851.2133891, "logits/rejected": 3644206237.2116184, "logps/chosen": -1742.8619246861924, "logps/rejected": -2335.8672199170123, "loss": 0.0538, "rewards/chosen": 4.146895532328713, "rewards/margins": -51111124.5501999, "rewards/rejected": 51111128.69709544, "step": 105 }, { "epoch": 0.13741411617738913, "grad_norm": 2.170487880706787, "kl": 6.2791666984558105, "learning_rate": 3.4375e-06, "logits/chosen": 213918950.63063064, "logits/rejected": 3645207829.8294573, "logps/chosen": -1730.162162162162, "logps/rejected": -2354.3565891472867, "loss": 0.0487, "rewards/chosen": 4.383471514727618, "rewards/margins": -8812543.585520733, "rewards/rejected": 8812547.968992248, "step": 110 }, { "epoch": 0.14366021236727045, "grad_norm": 1.0979344844818115, "kl": 34.391666412353516, "learning_rate": 3.59375e-06, "logits/chosen": 134268182.49372384, "logits/rejected": 3293625076.315353, "logps/chosen": -1707.7824267782428, "logps/rejected": -2157.6763485477177, "loss": 0.0577, "rewards/chosen": 4.70439218177955, "rewards/margins": -32989264.27486093, "rewards/rejected": 32989268.979253113, "step": 115 }, { "epoch": 0.14990630855715179, "grad_norm": 0.6703037023544312, "kl": 26.808332443237305, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 169325389.7327935, "logits/rejected": 3147348117.424893, "logps/chosen": -1694.1862348178138, "logps/rejected": -2355.0901287553647, "loss": 0.0532, "rewards/chosen": 4.615021369717865, "rewards/margins": -13508923.110300519, "rewards/rejected": 13508927.725321889, "step": 120 }, { "epoch": 0.1561524047470331, "grad_norm": 0.5291562676429749, "kl": 11.0, "learning_rate": 3.90625e-06, "logits/chosen": -25907824.813559324, "logits/rejected": 3046577403.803279, "logps/chosen": -1716.8813559322034, "logps/rejected": -2457.1803278688526, "loss": 0.0509, "rewards/chosen": 4.240614551608846, "rewards/margins": 4334839.125860454, "rewards/rejected": -4334834.8852459015, "step": 125 }, { "epoch": 0.16239850093691444, "grad_norm": 0.6935663819313049, "kl": 27.100000381469727, "learning_rate": 4.0625000000000005e-06, "logits/chosen": -2747534.582278481, "logits/rejected": 2980406832.4609056, "logps/chosen": -1617.282700421941, "logps/rejected": -2338.5020576131687, "loss": 0.0494, "rewards/chosen": 5.010839019646625, "rewards/margins": -9376828.306033408, "rewards/rejected": 9376833.316872427, "step": 130 }, { "epoch": 0.16864459712679575, "grad_norm": 0.3774864375591278, "kl": 50.74166488647461, "learning_rate": 4.21875e-06, "logits/chosen": -33710690.384313725, "logits/rejected": 2680178897.351111, "logps/chosen": -1679.1843137254903, "logps/rejected": -2161.92, "loss": 0.0471, "rewards/chosen": 4.944170304840687, "rewards/margins": -12017750.078051917, "rewards/rejected": 12017755.022222223, "step": 135 }, { "epoch": 0.1748906933166771, "grad_norm": 0.31545472145080566, "kl": 21.391666412353516, "learning_rate": 4.3750000000000005e-06, "logits/chosen": -161463154.61087865, "logits/rejected": 2881086561.726141, "logps/chosen": -1553.6066945606694, "logps/rejected": -2346.4896265560164, "loss": 0.0364, "rewards/chosen": 5.193703112741893, "rewards/margins": 1904972.62938776, "rewards/rejected": -1904967.4356846474, "step": 140 }, { "epoch": 0.1811367895065584, "grad_norm": 0.24622267484664917, "kl": 20.46666717529297, "learning_rate": 4.53125e-06, "logits/chosen": 30635895.466666665, "logits/rejected": 3146846481.0666666, "logps/chosen": -1706.6666666666667, "logps/rejected": -2571.0666666666666, "loss": 0.0382, "rewards/chosen": 5.129107666015625, "rewards/margins": -16690829.004225668, "rewards/rejected": 16690834.133333333, "step": 145 }, { "epoch": 0.18738288569643974, "grad_norm": 0.484261155128479, "kl": 21.75, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -77826685.90163934, "logits/rejected": 2589804995.254237, "logps/chosen": -1657.5737704918033, "logps/rejected": -2377.491525423729, "loss": 0.0368, "rewards/chosen": 5.271223224577357, "rewards/margins": -5416552.288098809, "rewards/rejected": 5416557.559322034, "step": 150 }, { "epoch": 0.19362898188632105, "grad_norm": 0.40670040249824524, "kl": 30.483333587646484, "learning_rate": 4.84375e-06, "logits/chosen": -26928078.378854625, "logits/rejected": 2463001409.770751, "logps/chosen": -1662.2378854625551, "logps/rejected": -2253.913043478261, "loss": 0.051, "rewards/chosen": 5.290996265831498, "rewards/margins": 5252345.148703775, "rewards/rejected": -5252339.85770751, "step": 155 }, { "epoch": 0.19987507807620236, "grad_norm": 0.18753188848495483, "kl": 24.350000381469727, "learning_rate": 5e-06, "logits/chosen": -89292537.856, "logits/rejected": 2722431545.878261, "logps/chosen": -1611.712, "logps/rejected": -2609.5304347826086, "loss": 0.0265, "rewards/chosen": 5.405384765625, "rewards/margins": 2749760.396689113, "rewards/rejected": -2749754.9913043478, "step": 160 }, { "epoch": 0.2061211742660837, "grad_norm": 0.3736235201358795, "kl": 38.81666564941406, "learning_rate": 4.999851262500375e-06, "logits/chosen": -188500339.57805908, "logits/rejected": 2439669566.1563787, "logps/chosen": -1609.5864978902953, "logps/rejected": -2361.5473251028807, "loss": 0.0298, "rewards/chosen": 5.654172229364452, "rewards/margins": 11241722.065694863, "rewards/rejected": -11241716.411522634, "step": 165 }, { "epoch": 0.212367270455965, "grad_norm": 0.20251910388469696, "kl": 39.24166488647461, "learning_rate": 4.999405067699773e-06, "logits/chosen": -153447545.4915254, "logits/rejected": 2220780829.377049, "logps/chosen": -1598.6440677966102, "logps/rejected": -2233.44262295082, "loss": 0.0324, "rewards/chosen": 5.615495067531779, "rewards/margins": -22238567.499259032, "rewards/rejected": 22238573.1147541, "step": 170 }, { "epoch": 0.21861336664584635, "grad_norm": 0.6630544662475586, "kl": 55.57500076293945, "learning_rate": 4.998661468690914e-06, "logits/chosen": -90310722.06451613, "logits/rejected": 2574579500.137931, "logps/chosen": -1666.1935483870968, "logps/rejected": -2460.8275862068967, "loss": 0.0478, "rewards/chosen": 5.576678860572077, "rewards/margins": -12666122.975045277, "rewards/rejected": 12666128.551724138, "step": 175 }, { "epoch": 0.22485946283572766, "grad_norm": 0.3803296685218811, "kl": 48.650001525878906, "learning_rate": 4.997620553954645e-06, "logits/chosen": -155793365.8600823, "logits/rejected": 2239510571.206751, "logps/chosen": -1490.8312757201645, "logps/rejected": -2156.2869198312237, "loss": 0.036, "rewards/chosen": 5.737128866062243, "rewards/margins": -4821880.102533582, "rewards/rejected": 4821885.839662448, "step": 180 }, { "epoch": 0.231105559025609, "grad_norm": 0.28740158677101135, "kl": 105.14167022705078, "learning_rate": 4.996282447349408e-06, "logits/chosen": -180883622.50406504, "logits/rejected": 2277471223.2478633, "logps/chosen": -1626.1463414634147, "logps/rejected": -2156.4444444444443, "loss": 0.0546, "rewards/chosen": 6.121557716431656, "rewards/margins": -17472389.91263032, "rewards/rejected": 17472396.034188036, "step": 185 }, { "epoch": 0.23735165521549031, "grad_norm": 0.570648193359375, "kl": 74.65833282470703, "learning_rate": 4.994647308096509e-06, "logits/chosen": -326146294.439834, "logits/rejected": 2238810668.9874477, "logps/chosen": -1514.8215767634854, "logps/rejected": -2162.0753138075315, "loss": 0.056, "rewards/chosen": 5.687685384789938, "rewards/margins": 8352651.5788987735, "rewards/rejected": -8352645.891213389, "step": 190 }, { "epoch": 0.24359775140537165, "grad_norm": 0.23554614186286926, "kl": 37.224998474121094, "learning_rate": 4.992715330761167e-06, "logits/chosen": -252608231.93162394, "logits/rejected": 2436106323.2520328, "logps/chosen": -1510.017094017094, "logps/rejected": -2297.756097560976, "loss": 0.0355, "rewards/chosen": 5.856194390190972, "rewards/margins": 12085535.872454552, "rewards/rejected": -12085530.016260162, "step": 195 }, { "epoch": 0.24984384759525297, "grad_norm": 0.2651134729385376, "kl": 53.724998474121094, "learning_rate": 4.990486745229364e-06, "logits/chosen": -353363390.35897434, "logits/rejected": 2253193748.8130083, "logps/chosen": -1603.145299145299, "logps/rejected": -2289.430894308943, "loss": 0.0301, "rewards/chosen": 5.701554257645566, "rewards/margins": 16624133.701554257, "rewards/rejected": -16624128.0, "step": 200 }, { "epoch": 0.2560899437851343, "grad_norm": 0.19966034591197968, "kl": 64.79582977294922, "learning_rate": 4.987961816680493e-06, "logits/chosen": -321853638.1935484, "logits/rejected": 2614515782.62069, "logps/chosen": -1530.0645161290322, "logps/rejected": -2399.448275862069, "loss": 0.0292, "rewards/chosen": 6.3989006780808975, "rewards/margins": -6738289.8769613905, "rewards/rejected": 6738296.275862069, "step": 205 }, { "epoch": 0.2623360399750156, "grad_norm": 0.12018506973981857, "kl": 35.98749923706055, "learning_rate": 4.985140845555799e-06, "logits/chosen": -93972786.7854251, "logits/rejected": 2599676422.5922747, "logps/chosen": -1611.2064777327935, "logps/rejected": -2370.74678111588, "loss": 0.0286, "rewards/chosen": 6.0249038263853745, "rewards/margins": -1163880.9450532552, "rewards/rejected": 1163886.9699570816, "step": 210 }, { "epoch": 0.26858213616489696, "grad_norm": 0.3969196379184723, "kl": 45.61249923706055, "learning_rate": 4.982024167522638e-06, "logits/chosen": -198077725.37704918, "logits/rejected": 2726813001.762712, "logps/chosen": -1471.672131147541, "logps/rejected": -2450.3050847457625, "loss": 0.0393, "rewards/chosen": 5.829876508869108, "rewards/margins": 8360851.389198543, "rewards/rejected": -8360845.559322034, "step": 215 }, { "epoch": 0.27482823235477827, "grad_norm": 0.34571054577827454, "kl": 108.92500305175781, "learning_rate": 4.978612153434527e-06, "logits/chosen": -214673374.4262295, "logits/rejected": 2318099403.9322033, "logps/chosen": -1586.7540983606557, "logps/rejected": -2199.0508474576272, "loss": 0.0471, "rewards/chosen": 6.15484619140625, "rewards/margins": 1069003.239591954, "rewards/rejected": -1068997.0847457626, "step": 220 }, { "epoch": 0.2810743285446596, "grad_norm": 0.30018988251686096, "kl": 31.90833282470703, "learning_rate": 4.97490520928702e-06, "logits/chosen": -101820645.4439834, "logits/rejected": 3018214138.6443515, "logps/chosen": -1520.331950207469, "logps/rejected": -2587.0460251046024, "loss": 0.018, "rewards/chosen": 6.184298313504927, "rewards/margins": 22056342.786808774, "rewards/rejected": -22056336.60251046, "step": 225 }, { "epoch": 0.2873204247345409, "grad_norm": 0.5999372005462646, "kl": 7.224999904632568, "learning_rate": 4.970903776169403e-06, "logits/chosen": -140607768.06837606, "logits/rejected": 2407871496.3252034, "logps/chosen": -1606.4273504273503, "logps/rejected": -2584.9756097560976, "loss": 0.0167, "rewards/chosen": 5.993517753405449, "rewards/margins": 12516249.765875475, "rewards/rejected": -12516243.772357723, "step": 230 }, { "epoch": 0.29356652092442226, "grad_norm": 0.3130801022052765, "kl": 6.9666666984558105, "learning_rate": 4.966608330212198e-06, "logits/chosen": -466865134.6440678, "logits/rejected": 2092888936.918033, "logps/chosen": -1367.1864406779662, "logps/rejected": -2658.8852459016393, "loss": 0.0166, "rewards/chosen": 6.395548675019862, "rewards/margins": 12007318.133253593, "rewards/rejected": -12007311.737704918, "step": 235 }, { "epoch": 0.29981261711430357, "grad_norm": 0.26502835750579834, "kl": 23.16666603088379, "learning_rate": 4.962019382530521e-06, "logits/chosen": -196223522.13333333, "logits/rejected": 2744053486.9333334, "logps/chosen": -1525.4333333333334, "logps/rejected": -2999.4666666666667, "loss": 0.0198, "rewards/chosen": 6.421251424153646, "rewards/margins": -11612441.578748576, "rewards/rejected": 11612448.0, "step": 240 }, { "epoch": 0.3060587133041849, "grad_norm": 0.33838242292404175, "kl": 30.891666412353516, "learning_rate": 4.957137479163253e-06, "logits/chosen": -261833796.26666668, "logits/rejected": 1684432486.4, "logps/chosen": -1455.4666666666667, "logps/rejected": -2374.9333333333334, "loss": 0.0183, "rewards/chosen": 5.891462198893229, "rewards/margins": -4358305.308537802, "rewards/rejected": 4358311.2, "step": 245 }, { "epoch": 0.3123048094940662, "grad_norm": 0.9754793047904968, "kl": 22.049999237060547, "learning_rate": 4.9519632010080765e-06, "logits/chosen": -299084816.78688526, "logits/rejected": 1817093345.6271186, "logps/chosen": -1480.655737704918, "logps/rejected": -2994.4406779661017, "loss": 0.0212, "rewards/chosen": 6.027991122886783, "rewards/margins": 7320732.265279259, "rewards/rejected": -7320726.237288136, "step": 250 }, { "epoch": 0.3185509056839475, "grad_norm": 0.3510875999927521, "kl": 16.53333282470703, "learning_rate": 4.9464971637523465e-06, "logits/chosen": 257342375.9638009, "logits/rejected": 3064983599.4440155, "logps/chosen": -1668.633484162896, "logps/rejected": -3069.034749034749, "loss": 0.0189, "rewards/chosen": 5.769157858455882, "rewards/margins": -4938778.354394265, "rewards/rejected": 4938784.123552124, "step": 255 }, { "epoch": 0.32479700187382887, "grad_norm": 0.16266603767871857, "kl": 0.0, "learning_rate": 4.9407400177998335e-06, "logits/chosen": -265875506.56790122, "logits/rejected": 3530719093.7383966, "logps/chosen": -1570.897119341564, "logps/rejected": -3502.987341772152, "loss": 0.0126, "rewards/chosen": 6.517467106320731, "rewards/margins": -4732542.343292387, "rewards/rejected": 4732548.860759494, "step": 260 }, { "epoch": 0.3310430980637102, "grad_norm": 0.12264657765626907, "kl": 7.008333206176758, "learning_rate": 4.9346924481933345e-06, "logits/chosen": -322727020.4235294, "logits/rejected": 2676152365.5111113, "logps/chosen": -1618.9490196078432, "logps/rejected": -2990.08, "loss": 0.0204, "rewards/chosen": 6.062105066636029, "rewards/margins": 15871837.670993956, "rewards/rejected": -15871831.608888889, "step": 265 }, { "epoch": 0.3372891942535915, "grad_norm": 0.17087456583976746, "kl": 2.3583333492279053, "learning_rate": 4.928355174533153e-06, "logits/chosen": -415921328.2232558, "logits/rejected": 2119342242.2943397, "logps/chosen": -1588.093023255814, "logps/rejected": -2714.0830188679247, "loss": 0.008, "rewards/chosen": 7.158184388626453, "rewards/margins": 7941654.131769294, "rewards/rejected": -7941646.973584905, "step": 270 }, { "epoch": 0.3435352904434728, "grad_norm": 0.26105254888534546, "kl": 13.591666221618652, "learning_rate": 4.9217289508914836e-06, "logits/chosen": -344134401.11790395, "logits/rejected": 2264991001.498008, "logps/chosen": -1525.659388646288, "logps/rejected": -3018.4541832669324, "loss": 0.0132, "rewards/chosen": 6.172569574747544, "rewards/margins": 11028239.351852443, "rewards/rejected": -11028233.179282868, "step": 275 }, { "epoch": 0.3497813866333542, "grad_norm": 0.14341367781162262, "kl": 13.274999618530273, "learning_rate": 4.914814565722671e-06, "logits/chosen": -605742868.3893806, "logits/rejected": 1702325981.7322834, "logps/chosen": -1471.929203539823, "logps/rejected": -2880.503937007874, "loss": 0.0128, "rewards/chosen": 6.208206041724281, "rewards/margins": 17545562.869623367, "rewards/rejected": -17545556.661417324, "step": 280 }, { "epoch": 0.3560274828232355, "grad_norm": 0.11326275765895844, "kl": 12.416666984558105, "learning_rate": 4.907612841769407e-06, "logits/chosen": -296522604.57587546, "logits/rejected": 2360177650.224215, "logps/chosen": -1487.1906614785992, "logps/rejected": -3418.403587443946, "loss": 0.013, "rewards/chosen": 6.169439293531128, "rewards/margins": -2895297.866435146, "rewards/rejected": 2895304.0358744394, "step": 285 }, { "epoch": 0.3622735790131168, "grad_norm": 0.371408611536026, "kl": 22.683332443237305, "learning_rate": 4.900124635964823e-06, "logits/chosen": -535632088.0337553, "logits/rejected": 2330721225.218107, "logps/chosen": -1422.8523206751054, "logps/rejected": -3448.098765432099, "loss": 0.0181, "rewards/chosen": 5.8123933816257916, "rewards/margins": 8163410.190994204, "rewards/rejected": -8163404.378600823, "step": 290 }, { "epoch": 0.3685196752029981, "grad_norm": 0.1974770724773407, "kl": 3.7249999046325684, "learning_rate": 4.8923508393305224e-06, "logits/chosen": -610109912.6153846, "logits/rejected": 1617081524.7058823, "logps/chosen": -1457.1153846153845, "logps/rejected": -2936.9411764705883, "loss": 0.0082, "rewards/chosen": 6.869715177095854, "rewards/margins": 8407971.105009295, "rewards/rejected": -8407964.235294119, "step": 295 }, { "epoch": 0.3747657713928795, "grad_norm": 0.15918996930122375, "kl": 10.741666793823242, "learning_rate": 4.884292376870567e-06, "logits/chosen": -526262306.27615064, "logits/rejected": 1927291389.8755186, "logps/chosen": -1425.4058577405858, "logps/rejected": -3032.697095435685, "loss": 0.0112, "rewards/chosen": 6.35287156364409, "rewards/margins": 14068449.174448326, "rewards/rejected": -14068442.821576763, "step": 300 }, { "epoch": 0.3810118675827608, "grad_norm": 0.27272769808769226, "kl": 5.775000095367432, "learning_rate": 4.875950207461403e-06, "logits/chosen": -388197814.85714287, "logits/rejected": 2318860829.619835, "logps/chosen": -1382.453781512605, "logps/rejected": -3041.8512396694214, "loss": 0.012, "rewards/chosen": 6.2134866153492645, "rewards/margins": 2678316.2961312435, "rewards/rejected": -2678310.082644628, "step": 305 }, { "epoch": 0.3872579637726421, "grad_norm": 0.17649586498737335, "kl": 38.70833206176758, "learning_rate": 4.867325323737765e-06, "logits/chosen": -427835718.374502, "logits/rejected": 1462694835.9825327, "logps/chosen": -1543.2669322709164, "logps/rejected": -2535.9650655021833, "loss": 0.0242, "rewards/chosen": 6.1200719582607075, "rewards/margins": 10201054.993434405, "rewards/rejected": -10201048.873362446, "step": 310 }, { "epoch": 0.3935040599625234, "grad_norm": 0.15216855704784393, "kl": 9.566666603088379, "learning_rate": 4.858418751974564e-06, "logits/chosen": -660057950.7401575, "logits/rejected": 1958461584.9911504, "logps/chosen": -1579.5905511811025, "logps/rejected": -2988.8849557522126, "loss": 0.0123, "rewards/chosen": 6.074395127183809, "rewards/margins": 16072222.99474911, "rewards/rejected": -16072216.920353983, "step": 315 }, { "epoch": 0.3997501561524047, "grad_norm": 0.15513299405574799, "kl": 4.599999904632568, "learning_rate": 4.849231551964771e-06, "logits/chosen": -757533956.338983, "logits/rejected": 1946294574.1639345, "logps/chosen": -1310.7118644067796, "logps/rejected": -3486.1639344262294, "loss": 0.0089, "rewards/chosen": 6.3125450004965575, "rewards/margins": 20475022.705987625, "rewards/rejected": -20475016.393442623, "step": 320 }, { "epoch": 0.4059962523422861, "grad_norm": 0.13751330971717834, "kl": 0.0, "learning_rate": 4.839764816893315e-06, "logits/chosen": -558023220.9655173, "logits/rejected": 1358303364.1290324, "logps/chosen": -1644.8275862068965, "logps/rejected": -3607.2258064516127, "loss": 0.0123, "rewards/chosen": 6.341705848430765, "rewards/margins": 9993958.341705848, "rewards/rejected": -9993952.0, "step": 325 }, { "epoch": 0.4122423485321674, "grad_norm": 0.30045750737190247, "kl": 0.2083333283662796, "learning_rate": 4.830019673206997e-06, "logits/chosen": -620181966.451613, "logits/rejected": 1212370802.7586207, "logps/chosen": -1604.0, "logps/rejected": -3483.0344827586205, "loss": 0.0114, "rewards/chosen": 6.462984146610383, "rewards/margins": 21377946.325053114, "rewards/rejected": -21377939.862068966, "step": 330 }, { "epoch": 0.4184884447220487, "grad_norm": 0.22446036338806152, "kl": 0.0, "learning_rate": 4.8199972804804615e-06, "logits/chosen": -509039072.9068826, "logits/rejected": 1352807050.4377682, "logps/chosen": -1562.5587044534414, "logps/rejected": -3614.214592274678, "loss": 0.009, "rewards/chosen": 6.133410990479504, "rewards/margins": 17571108.897359487, "rewards/rejected": -17571102.763948496, "step": 335 }, { "epoch": 0.42473454091193, "grad_norm": 0.06178925931453705, "kl": 0.0, "learning_rate": 4.809698831278217e-06, "logits/chosen": -515473141.5934959, "logits/rejected": 1385061349.7435896, "logps/chosen": -1490.081300813008, "logps/rejected": -3626.6666666666665, "loss": 0.0101, "rewards/chosen": 6.447595115599594, "rewards/margins": 7472535.95186862, "rewards/rejected": -7472529.504273504, "step": 340 }, { "epoch": 0.4309806371018114, "grad_norm": 0.10884927213191986, "kl": 1.8083332777023315, "learning_rate": 4.799125551012731e-06, "logits/chosen": -577620593.5397489, "logits/rejected": 1478309420.6141078, "logps/chosen": -1485.9246861924687, "logps/rejected": -3730.058091286307, "loss": 0.0079, "rewards/chosen": 6.468292874771182, "rewards/margins": 14943737.721404908, "rewards/rejected": -14943731.253112033, "step": 345 }, { "epoch": 0.4372267332916927, "grad_norm": 0.15412873029708862, "kl": 11.300000190734863, "learning_rate": 4.788278697798619e-06, "logits/chosen": -543128542.967742, "logits/rejected": 1143824666.4827585, "logps/chosen": -1587.6129032258063, "logps/rejected": -3566.6206896551726, "loss": 0.0124, "rewards/chosen": 6.4700597947643645, "rewards/margins": 17442286.194197726, "rewards/rejected": -17442279.724137932, "step": 350 }, { "epoch": 0.443472829481574, "grad_norm": 0.6250858902931213, "kl": 17.0625, "learning_rate": 4.77715956230294e-06, "logits/chosen": -585785205.1525424, "logits/rejected": 1392276866.0983605, "logps/chosen": -1420.6101694915253, "logps/rejected": -3279.7377049180327, "loss": 0.0152, "rewards/chosen": 6.209764383606991, "rewards/margins": 9625444.635993892, "rewards/rejected": -9625438.426229509, "step": 355 }, { "epoch": 0.4497189256714553, "grad_norm": 0.3552960753440857, "kl": 10.883333206176758, "learning_rate": 4.765769467591626e-06, "logits/chosen": -576030856.5333333, "logits/rejected": 1427985749.3333333, "logps/chosen": -1384.0, "logps/rejected": -2608.0, "loss": 0.0139, "rewards/chosen": 6.204444885253906, "rewards/margins": 19064277.13777822, "rewards/rejected": -19064270.933333334, "step": 360 }, { "epoch": 0.45596502186133664, "grad_norm": 0.29342055320739746, "kl": 14.975000381469727, "learning_rate": 4.75410976897204e-06, "logits/chosen": -637534208.0, "logits/rejected": 1524465529.1522634, "logps/chosen": -1628.8945147679324, "logps/rejected": -3114.40329218107, "loss": 0.0092, "rewards/chosen": 6.717748199334125, "rewards/margins": 22818043.129270833, "rewards/rejected": -22818036.411522634, "step": 365 }, { "epoch": 0.462211118051218, "grad_norm": 0.1184186041355133, "kl": 10.0, "learning_rate": 4.742181853831721e-06, "logits/chosen": -462829314.11570245, "logits/rejected": 1503693230.252101, "logps/chosen": -1507.4380165289256, "logps/rejected": -3374.5210084033615, "loss": 0.015, "rewards/chosen": 6.435300212261105, "rewards/margins": 12131771.141182566, "rewards/rejected": -12131764.705882354, "step": 370 }, { "epoch": 0.4684572142410993, "grad_norm": 0.14714303612709045, "kl": 21.516666412353516, "learning_rate": 4.729987141473286e-06, "logits/chosen": -530579456.0, "logits/rejected": 1497512767.720524, "logps/chosen": -1480.5418326693227, "logps/rejected": -3180.995633187773, "loss": 0.0226, "rewards/chosen": 6.02860870969248, "rewards/margins": -5513310.582745003, "rewards/rejected": 5513316.611353712, "step": 375 }, { "epoch": 0.47470331043098063, "grad_norm": 0.1731535792350769, "kl": 28.633333206176758, "learning_rate": 4.717527082945555e-06, "logits/chosen": -814227456.0, "logits/rejected": 1239173412.5714285, "logps/chosen": -1477.25, "logps/rejected": -3086.0, "loss": 0.0179, "rewards/chosen": 6.597807884216309, "rewards/margins": 17609291.169236455, "rewards/rejected": -17609284.57142857, "step": 380 }, { "epoch": 0.48094940662086194, "grad_norm": 0.13685530424118042, "kl": 14.733333587646484, "learning_rate": 4.704803160870888e-06, "logits/chosen": -748893894.9868996, "logits/rejected": 1572780448.12749, "logps/chosen": -1380.122270742358, "logps/rejected": -3014.629482071713, "loss": 0.0118, "rewards/chosen": 6.789244272823417, "rewards/margins": 21147032.63784985, "rewards/rejected": -21147025.848605577, "step": 385 }, { "epoch": 0.4871955028107433, "grad_norm": 0.3021528124809265, "kl": 7.650000095367432, "learning_rate": 4.69181688926877e-06, "logits/chosen": -378228522.1526104, "logits/rejected": 2085631281.8701298, "logps/chosen": -1484.5943775100402, "logps/rejected": -3172.848484848485, "loss": 0.0109, "rewards/chosen": 6.780140581858685, "rewards/margins": 15358581.602651404, "rewards/rejected": -15358574.822510822, "step": 390 }, { "epoch": 0.4934415990006246, "grad_norm": 0.07516395300626755, "kl": 0.0, "learning_rate": 4.678569813375654e-06, "logits/chosen": -424886970.8436019, "logits/rejected": 1915752250.0520446, "logps/chosen": -1499.9810426540284, "logps/rejected": -3160.5055762081784, "loss": 0.0064, "rewards/chosen": 6.621546740780509, "rewards/margins": 6866231.1568627255, "rewards/rejected": -6866224.535315985, "step": 395 }, { "epoch": 0.49968769519050593, "grad_norm": 0.27578118443489075, "kl": 0.0, "learning_rate": 4.665063509461098e-06, "logits/chosen": -471388229.4237288, "logits/rejected": 1642860745.442623, "logps/chosen": -1550.6440677966102, "logps/rejected": -3196.590163934426, "loss": 0.0074, "rewards/chosen": 6.7454585705773304, "rewards/margins": 16931292.778245453, "rewards/rejected": -16931286.032786883, "step": 400 }, { "epoch": 0.5059337913803873, "grad_norm": 0.10973331332206726, "kl": 3.674999952316284, "learning_rate": 4.651299584640198e-06, "logits/chosen": -555291323.3170732, "logits/rejected": 1601991111.1111112, "logps/chosen": -1485.918699186992, "logps/rejected": -3310.4957264957266, "loss": 0.0134, "rewards/chosen": 5.922939393578506, "rewards/margins": 9290643.051144522, "rewards/rejected": -9290637.128205128, "step": 405 }, { "epoch": 0.5121798875702686, "grad_norm": 0.54237300157547, "kl": 4.599999904632568, "learning_rate": 4.637279676682367e-06, "logits/chosen": -570763808.4050633, "logits/rejected": 932611261.6296296, "logps/chosen": -1530.9367088607594, "logps/rejected": -3226.074074074074, "loss": 0.0121, "rewards/chosen": 6.037538938884493, "rewards/margins": 11322472.01284758, "rewards/rejected": -11322465.975308642, "step": 410 }, { "epoch": 0.5184259837601499, "grad_norm": 0.08977213501930237, "kl": 1.5416666269302368, "learning_rate": 4.623005453816447e-06, "logits/chosen": -606227321.370518, "logits/rejected": 1303320441.8515284, "logps/chosen": -1591.3306772908365, "logps/rejected": -3280.069868995633, "loss": 0.0067, "rewards/chosen": 6.4393286230079685, "rewards/margins": 31069579.46989631, "rewards/rejected": -31069573.030567687, "step": 415 }, { "epoch": 0.5246720799500312, "grad_norm": 3.119323492050171, "kl": 13.933333396911621, "learning_rate": 4.608478614532215e-06, "logits/chosen": -561827020.8, "logits/rejected": 1508271718.4, "logps/chosen": -1578.5333333333333, "logps/rejected": -3426.133333333333, "loss": 0.0123, "rewards/chosen": 6.627941385904948, "rewards/margins": 29183765.56127472, "rewards/rejected": -29183758.933333334, "step": 420 }, { "epoch": 0.5309181761399125, "grad_norm": 0.252535343170166, "kl": 10.433333396911621, "learning_rate": 4.59370088737827e-06, "logits/chosen": -592064528.7183673, "logits/rejected": 1528359757.3446808, "logps/chosen": -1488.4571428571428, "logps/rejected": -3226.9617021276595, "loss": 0.0161, "rewards/chosen": 6.000954639668367, "rewards/margins": 24429160.04350783, "rewards/rejected": -24429154.04255319, "step": 425 }, { "epoch": 0.5371642723297939, "grad_norm": 0.19260543584823608, "kl": 0.24166665971279144, "learning_rate": 4.578674030756364e-06, "logits/chosen": -723806702.3448275, "logits/rejected": 1168265876.6451614, "logps/chosen": -1373.103448275862, "logps/rejected": -2701.6774193548385, "loss": 0.0105, "rewards/chosen": 6.012945767106681, "rewards/margins": 25346937.625848994, "rewards/rejected": -25346931.612903226, "step": 430 }, { "epoch": 0.5434103685196752, "grad_norm": 0.08087070286273956, "kl": 16.933332443237305, "learning_rate": 4.5633998327121595e-06, "logits/chosen": -731779713.5421686, "logits/rejected": 1168406448.2077923, "logps/chosen": -1502.008032128514, "logps/rejected": -3129.627705627706, "loss": 0.0166, "rewards/chosen": 6.375834392256526, "rewards/margins": 31221008.349860363, "rewards/rejected": -31221001.974025972, "step": 435 }, { "epoch": 0.5496564647095565, "grad_norm": 0.08805792033672333, "kl": 9.266666412353516, "learning_rate": 4.54788011072248e-06, "logits/chosen": -716458515.6085106, "logits/rejected": 1169859864.032653, "logps/chosen": -1333.1063829787233, "logps/rejected": -3449.208163265306, "loss": 0.012, "rewards/chosen": 7.045041348071808, "rewards/margins": -3183029.5590402847, "rewards/rejected": 3183036.6040816326, "step": 440 }, { "epoch": 0.5559025608994379, "grad_norm": 0.7193201780319214, "kl": 1.4166666269302368, "learning_rate": 4.532116711479039e-06, "logits/chosen": -627270098.2113822, "logits/rejected": 1177344717.6752136, "logps/chosen": -1573.138211382114, "logps/rejected": -3736.8888888888887, "loss": 0.0096, "rewards/chosen": 6.526741834190803, "rewards/margins": 28779710.32161363, "rewards/rejected": -28779703.794871796, "step": 445 }, { "epoch": 0.5621486570893192, "grad_norm": 0.08647624403238297, "kl": 5.0333333015441895, "learning_rate": 4.516111510668707e-06, "logits/chosen": -656890023.3463035, "logits/rejected": 919079215.0672646, "logps/chosen": -1566.1322957198443, "logps/rejected": -3726.6367713004483, "loss": 0.0096, "rewards/chosen": 6.501714209174367, "rewards/margins": 49463311.68557071, "rewards/rejected": -49463305.1838565, "step": 450 }, { "epoch": 0.5683947532792005, "grad_norm": 0.07795864343643188, "kl": 1.7000000476837158, "learning_rate": 4.499866412750324e-06, "logits/chosen": -725505175.373913, "logits/rejected": 908956008.448, "logps/chosen": -1507.3391304347826, "logps/rejected": -3735.04, "loss": 0.0063, "rewards/chosen": 6.4802914826766305, "rewards/margins": 20719481.168291483, "rewards/rejected": -20719474.688, "step": 455 }, { "epoch": 0.5746408494690818, "grad_norm": 0.09131667762994766, "kl": 3.2249999046325684, "learning_rate": 4.4833833507280884e-06, "logits/chosen": -645614869.8884759, "logits/rejected": 997203230.3317536, "logps/chosen": -1540.8773234200744, "logps/rejected": -4016.0758293838862, "loss": 0.0118, "rewards/chosen": 6.323027543418912, "rewards/margins": 38274318.81591854, "rewards/rejected": -38274312.492891, "step": 460 }, { "epoch": 0.5808869456589631, "grad_norm": 0.0715484768152237, "kl": 11.366666793823242, "learning_rate": 4.466664285921543e-06, "logits/chosen": -630874549.740458, "logits/rejected": 962645677.7981651, "logps/chosen": -1566.1679389312976, "logps/rejected": -3975.3394495412845, "loss": 0.0095, "rewards/chosen": 6.996014544072042, "rewards/margins": 56229115.25289527, "rewards/rejected": -56229108.25688073, "step": 465 }, { "epoch": 0.5871330418488445, "grad_norm": 0.07269534468650818, "kl": 0.0, "learning_rate": 4.4497112077322045e-06, "logits/chosen": -580528576.2746781, "logits/rejected": 840915499.5303644, "logps/chosen": -1388.2231759656652, "logps/rejected": -4148.599190283401, "loss": 0.0098, "rewards/chosen": 6.23273621915236, "rewards/margins": 21097550.783343505, "rewards/rejected": -21097544.550607286, "step": 470 }, { "epoch": 0.5933791380387258, "grad_norm": 0.08192071318626404, "kl": 0.0, "learning_rate": 4.432526133406843e-06, "logits/chosen": -690147276.5857741, "logits/rejected": 1121767474.987552, "logps/chosen": -1430.7615062761506, "logps/rejected": -4231.9668049792535, "loss": 0.0066, "rewards/chosen": 6.778336656642259, "rewards/margins": 19634322.99410429, "rewards/rejected": -19634316.215767633, "step": 475 }, { "epoch": 0.5996252342286071, "grad_norm": 0.14791642129421234, "kl": 8.800000190734863, "learning_rate": 4.415111107797445e-06, "logits/chosen": -582916348.9402391, "logits/rejected": 1169253818.6899564, "logps/chosen": -1577.434262948207, "logps/rejected": -3904.5589519650657, "loss": 0.015, "rewards/chosen": 6.03934554749751, "rewards/margins": 19996107.90834118, "rewards/rejected": -19996101.868995633, "step": 480 }, { "epoch": 0.6058713304184884, "grad_norm": 0.0918608009815216, "kl": 2.3333332538604736, "learning_rate": 4.397468203117905e-06, "logits/chosen": -683436670.976, "logits/rejected": 1088444683.1304348, "logps/chosen": -1622.784, "logps/rejected": -3430.4, "loss": 0.0057, "rewards/chosen": 6.6358408203125, "rewards/margins": 13961369.557579951, "rewards/rejected": -13961362.921739131, "step": 485 }, { "epoch": 0.6121174266083698, "grad_norm": 0.1823032945394516, "kl": 25.15833282470703, "learning_rate": 4.379599518697444e-06, "logits/chosen": -739122483.5893536, "logits/rejected": 745465053.7880185, "logps/chosen": -1405.3231939163497, "logps/rejected": -2948.7188940092165, "loss": 0.0152, "rewards/chosen": 6.272283633852187, "rewards/margins": 11475810.290716814, "rewards/rejected": -11475804.01843318, "step": 490 }, { "epoch": 0.6183635227982511, "grad_norm": 0.6821036338806152, "kl": 5.625, "learning_rate": 4.3615071807308165e-06, "logits/chosen": -681329732.2666667, "logits/rejected": 1006300910.9333333, "logps/chosen": -1461.8666666666666, "logps/rejected": -3339.733333333333, "loss": 0.0081, "rewards/chosen": 6.6669565836588545, "rewards/margins": 18135166.13362325, "rewards/rejected": -18135159.466666665, "step": 495 }, { "epoch": 0.6246096189881324, "grad_norm": 0.1369735449552536, "kl": 2.433333396911621, "learning_rate": 4.34319334202531e-06, "logits/chosen": -694496685.1452283, "logits/rejected": 1160242762.9790795, "logps/chosen": -1413.5767634854772, "logps/rejected": -3294.794979079498, "loss": 0.0092, "rewards/chosen": 6.173919804363329, "rewards/margins": 13679084.466806835, "rewards/rejected": -13679078.29288703, "step": 500 }, { "epoch": 0.6246096189881324, "eval_kl": 14.301383018493652, "eval_logits/chosen": -660690092.3172147, "eval_logits/rejected": 961826460.1858586, "eval_logps/chosen": -1467.8413926499034, "eval_logps/rejected": -3063.4989898989897, "eval_loss": 0.009466251358389854, "eval_rewards/chosen": 6.571533203125, "eval_rewards/margins": 21357531.12910896, "eval_rewards/rejected": -21357524.55757576, "eval_runtime": 640.876, "eval_samples_per_second": 6.312, "eval_steps_per_second": 0.395, "step": 500 }, { "epoch": 0.6308557151780138, "grad_norm": 0.0860004872083664, "kl": 11.016666412353516, "learning_rate": 4.324660181744589e-06, "logits/chosen": -672199384.5375494, "logits/rejected": 952610509.2511014, "logps/chosen": -1372.0790513833992, "logps/rejected": -3022.6607929515417, "loss": 0.0113, "rewards/chosen": 6.333535658041008, "rewards/margins": 46791826.17494535, "rewards/rejected": -46791819.84140969, "step": 505 }, { "epoch": 0.637101811367895, "grad_norm": 0.1688835471868515, "kl": 10.633333206176758, "learning_rate": 4.305909905149389e-06, "logits/chosen": -653113051.4285715, "logits/rejected": 957479286.7404256, "logps/chosen": -1485.7142857142858, "logps/rejected": -3097.6, "loss": 0.0079, "rewards/chosen": 6.55218431122449, "rewards/margins": 14125448.186226865, "rewards/rejected": -14125441.634042554, "step": 510 }, { "epoch": 0.6433479075577764, "grad_norm": 0.23886211216449738, "kl": 10.949999809265137, "learning_rate": 4.2869447433351165e-06, "logits/chosen": -685645342.117647, "logits/rejected": 734592482.3801653, "logps/chosen": -1563.6302521008404, "logps/rejected": -3077.0247933884298, "loss": 0.0116, "rewards/chosen": 6.41238018644958, "rewards/margins": 4603817.850396716, "rewards/rejected": -4603811.438016529, "step": 515 }, { "epoch": 0.6495940037476577, "grad_norm": 0.10929810255765915, "kl": 5.0333333015441895, "learning_rate": 4.267766952966369e-06, "logits/chosen": -814962587.8755555, "logits/rejected": 927048097.6313726, "logps/chosen": -1531.591111111111, "logps/rejected": -3431.9058823529413, "loss": 0.0069, "rewards/chosen": 6.492171223958334, "rewards/margins": 4893755.700014361, "rewards/rejected": -4893749.207843137, "step": 520 }, { "epoch": 0.655840099937539, "grad_norm": 0.24036245048046112, "kl": 6.233333110809326, "learning_rate": 4.248378816008418e-06, "logits/chosen": -786074843.1091703, "logits/rejected": 869507626.8366534, "logps/chosen": -1462.5676855895197, "logps/rejected": -3666.868525896414, "loss": 0.0092, "rewards/chosen": 6.3837330913960155, "rewards/margins": 22811593.18851397, "rewards/rejected": -22811586.804780878, "step": 525 }, { "epoch": 0.6620861961274204, "grad_norm": 0.0712604969739914, "kl": 0.0, "learning_rate": 4.228782639455674e-06, "logits/chosen": -546996645.9055794, "logits/rejected": 1379747715.6275303, "logps/chosen": -1523.5021459227469, "logps/rejected": -4356.663967611336, "loss": 0.0098, "rewards/chosen": 6.099889141295601, "rewards/margins": 21226120.69098226, "rewards/rejected": -21226114.59109312, "step": 530 }, { "epoch": 0.6683322923173017, "grad_norm": 0.08271102607250214, "kl": 0.0, "learning_rate": 4.2089807550571786e-06, "logits/chosen": -598833823.1932774, "logits/rejected": 1428741128.4628098, "logps/chosen": -1500.90756302521, "logps/rejected": -3963.2396694214876, "loss": 0.0107, "rewards/chosen": 6.574934246159401, "rewards/margins": 16506663.368322676, "rewards/rejected": -16506656.79338843, "step": 535 }, { "epoch": 0.674578388507183, "grad_norm": 0.09333149343729019, "kl": 1.4166666269302368, "learning_rate": 4.188975519039151e-06, "logits/chosen": -715718131.712, "logits/rejected": 1054329490.9217391, "logps/chosen": -1413.376, "logps/rejected": -3373.913043478261, "loss": 0.0079, "rewards/chosen": 6.30774169921875, "rewards/margins": 4260589.264263438, "rewards/rejected": -4260582.956521739, "step": 540 }, { "epoch": 0.6808244846970644, "grad_norm": 0.0794818177819252, "kl": 0.0, "learning_rate": 4.168769311824619e-06, "logits/chosen": -656695737.8921162, "logits/rejected": 1032728901.623431, "logps/chosen": -1501.7427385892115, "logps/rejected": -3163.313807531381, "loss": 0.0088, "rewards/chosen": 6.385147537927905, "rewards/margins": 29309559.924896494, "rewards/rejected": -29309553.539748956, "step": 545 }, { "epoch": 0.6870705808869456, "grad_norm": 0.07905972003936768, "kl": 13.633333206176758, "learning_rate": 4.1483645377501726e-06, "logits/chosen": -694805668.1526718, "logits/rejected": 1306294816.880734, "logps/chosen": -1517.3129770992366, "logps/rejected": -3286.605504587156, "loss": 0.0127, "rewards/chosen": 5.729227546517175, "rewards/margins": 14022816.738401858, "rewards/rejected": -14022811.009174312, "step": 550 }, { "epoch": 0.693316677076827, "grad_norm": 0.20355987548828125, "kl": 1.100000023841858, "learning_rate": 4.127763624779873e-06, "logits/chosen": -776127811.6017317, "logits/rejected": 950439393.1566265, "logps/chosen": -1429.0562770562772, "logps/rejected": -3153.2208835341366, "loss": 0.0086, "rewards/chosen": 6.4727360448796, "rewards/margins": 17968574.50486456, "rewards/rejected": -17968568.032128513, "step": 555 }, { "epoch": 0.6995627732667083, "grad_norm": 0.25120851397514343, "kl": 29.850000381469727, "learning_rate": 4.106969024216348e-06, "logits/chosen": -833976294.0759493, "logits/rejected": 1107952155.3909464, "logps/chosen": -1552.7426160337552, "logps/rejected": -3334.320987654321, "loss": 0.017, "rewards/chosen": 6.508130294864188, "rewards/margins": -5248354.841663944, "rewards/rejected": 5248361.349794239, "step": 560 }, { "epoch": 0.7058088694565896, "grad_norm": 0.06628026813268661, "kl": 0.0, "learning_rate": 4.085983210409114e-06, "logits/chosen": -842612666.8691983, "logits/rejected": 1499886562.5020576, "logps/chosen": -1503.392405063291, "logps/rejected": -3725.9588477366256, "loss": 0.0065, "rewards/chosen": 6.703082764702004, "rewards/margins": 5656834.489090995, "rewards/rejected": -5656827.78600823, "step": 565 }, { "epoch": 0.712054965646471, "grad_norm": 0.05841664969921112, "kl": 0.0, "learning_rate": 4.064808680460149e-06, "logits/chosen": -759934940.3826087, "logits/rejected": 1470086774.784, "logps/chosen": -1512.0695652173913, "logps/rejected": -3696.64, "loss": 0.0065, "rewards/chosen": 6.662904424252718, "rewards/margins": 9375535.622904425, "rewards/rejected": -9375528.96, "step": 570 }, { "epoch": 0.7183010618363522, "grad_norm": 0.335035502910614, "kl": 2.8333332538604736, "learning_rate": 4.043447953926763e-06, "logits/chosen": -727839396.7304348, "logits/rejected": 1297751212.032, "logps/chosen": -1522.6434782608696, "logps/rejected": -3939.072, "loss": 0.0101, "rewards/chosen": 6.710281504755435, "rewards/margins": 13018670.646281505, "rewards/rejected": -13018663.936, "step": 575 }, { "epoch": 0.7245471580262336, "grad_norm": 0.08834685385227203, "kl": 3.0833332538604736, "learning_rate": 4.021903572521802e-06, "logits/chosen": -775282141.8666667, "logits/rejected": 1288280473.6, "logps/chosen": -1568.4, "logps/rejected": -3556.266666666667, "loss": 0.0058, "rewards/chosen": 6.64389902750651, "rewards/margins": 10371696.243899027, "rewards/rejected": -10371689.6, "step": 580 }, { "epoch": 0.730793254216115, "grad_norm": 0.08919289708137512, "kl": 2.25, "learning_rate": 4.000178099811203e-06, "logits/chosen": -764677672.110599, "logits/rejected": 1221270088.0304182, "logps/chosen": -1534.2304147465438, "logps/rejected": -3374.722433460076, "loss": 0.0068, "rewards/chosen": 6.908961423531106, "rewards/margins": 16852222.148505148, "rewards/rejected": -16852215.239543725, "step": 585 }, { "epoch": 0.7370393504059962, "grad_norm": 0.08590350300073624, "kl": 0.0, "learning_rate": 3.978274120908957e-06, "logits/chosen": -685725950.9012876, "logits/rejected": 1515111660.3076923, "logps/chosen": -1563.4678111587982, "logps/rejected": -3674.1700404858298, "loss": 0.0083, "rewards/chosen": 6.62035294561427, "rewards/margins": 13114593.308612056, "rewards/rejected": -13114586.68825911, "step": 590 }, { "epoch": 0.7432854465958776, "grad_norm": 0.0870102196931839, "kl": 0.0, "learning_rate": 3.956194242169506e-06, "logits/chosen": -676005492.1680672, "logits/rejected": 1381572540.2975206, "logps/chosen": -1465.3445378151262, "logps/rejected": -3624.7272727272725, "loss": 0.0074, "rewards/chosen": 6.180169626444328, "rewards/margins": 18082138.8082688, "rewards/rejected": -18082132.628099173, "step": 595 }, { "epoch": 0.749531542785759, "grad_norm": 0.2675867974758148, "kl": 8.666666984558105, "learning_rate": 3.933941090877615e-06, "logits/chosen": -682359716.493617, "logits/rejected": 1673604334.2367346, "logps/chosen": -1567.659574468085, "logps/rejected": -4016.326530612245, "loss": 0.0115, "rewards/chosen": 6.600190118018617, "rewards/margins": 17001855.808353383, "rewards/rejected": -17001849.208163265, "step": 600 }, { "epoch": 0.7557776389756402, "grad_norm": 0.06292186677455902, "kl": 0.0, "learning_rate": 3.911517314935752e-06, "logits/chosen": -702037249.0893617, "logits/rejected": 1173634737.632653, "logps/chosen": -1671.6255319148936, "logps/rejected": -3740.734693877551, "loss": 0.006, "rewards/chosen": 6.977364527925532, "rewards/margins": 23121256.152874734, "rewards/rejected": -23121249.175510205, "step": 605 }, { "epoch": 0.7620237351655216, "grad_norm": 0.07434140145778656, "kl": 0.0, "learning_rate": 3.888925582549006e-06, "logits/chosen": -725903268.7935222, "logits/rejected": 1318685577.3390558, "logps/chosen": -1664.0, "logps/rejected": -3763.0901287553647, "loss": 0.008, "rewards/chosen": 6.757370674658401, "rewards/margins": 18745667.18655522, "rewards/rejected": -18745660.42918455, "step": 610 }, { "epoch": 0.7682698313554028, "grad_norm": 0.06401233375072479, "kl": 22.33333396911621, "learning_rate": 3.866168581907609e-06, "logits/chosen": -783920391.083004, "logits/rejected": 1419005103.9295154, "logps/chosen": -1494.3873517786562, "logps/rejected": -4285.180616740088, "loss": 0.0153, "rewards/chosen": 6.33299623270751, "rewards/margins": 32352429.24048522, "rewards/rejected": -32352422.907488987, "step": 615 }, { "epoch": 0.7745159275452842, "grad_norm": 0.10351639986038208, "kl": 0.0, "learning_rate": 3.8432490208670605e-06, "logits/chosen": -914429361.898305, "logits/rejected": 1108860525.1147542, "logps/chosen": -1482.5762711864406, "logps/rejected": -4299.803278688524, "loss": 0.006, "rewards/chosen": 6.938127420716366, "rewards/margins": 10782713.298783159, "rewards/rejected": -10782706.360655738, "step": 620 }, { "epoch": 0.7807620237351656, "grad_norm": 0.05740602687001228, "kl": 0.0, "learning_rate": 3.82016962662592e-06, "logits/chosen": -904887262.967742, "logits/rejected": 979027103.634981, "logps/chosen": -1401.3640552995391, "logps/rejected": -4223.2699619771865, "loss": 0.0063, "rewards/chosen": 6.702658095118087, "rewards/margins": 38639299.53916, "rewards/rejected": -38639292.836501904, "step": 625 }, { "epoch": 0.7870081199250468, "grad_norm": 0.07504531741142273, "kl": 0.0, "learning_rate": 3.796933145401304e-06, "logits/chosen": -765740100.2666667, "logits/rejected": 919574937.6, "logps/chosen": -1549.2, "logps/rejected": -4261.333333333333, "loss": 0.0063, "rewards/chosen": 6.80685780843099, "rewards/margins": 22749884.14019114, "rewards/rejected": -22749877.333333332, "step": 630 }, { "epoch": 0.7932542161149282, "grad_norm": 0.04929427057504654, "kl": 3.9833333492279053, "learning_rate": 3.773542342102105e-06, "logits/chosen": -887692046.5691057, "logits/rejected": 754652081.2307693, "logps/chosen": -1502.69918699187, "logps/rejected": -4267.213675213675, "loss": 0.0069, "rewards/chosen": 6.618328311579014, "rewards/margins": 30490055.16533686, "rewards/rejected": -30490048.547008548, "step": 635 }, { "epoch": 0.7995003123048094, "grad_norm": 0.05546702817082405, "kl": 14.333333015441895, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -872644608.0, "logits/rejected": 810034322.2857143, "logps/chosen": -1477.5, "logps/rejected": -4392.285714285715, "loss": 0.0087, "rewards/chosen": 6.3790788650512695, "rewards/margins": 32740708.093364578, "rewards/rejected": -32740701.714285713, "step": 640 }, { "epoch": 0.8057464084946908, "grad_norm": 0.08095496147871017, "kl": 0.0, "learning_rate": 3.7263089203982698e-06, "logits/chosen": -660306022.8675799, "logits/rejected": 944827238.9885057, "logps/chosen": -1674.2283105022832, "logps/rejected": -4282.360153256705, "loss": 0.0062, "rewards/chosen": 6.8761649632562785, "rewards/margins": -5804263.468662622, "rewards/rejected": 5804270.344827586, "step": 645 }, { "epoch": 0.8119925046845722, "grad_norm": 0.3066859245300293, "kl": 0.0, "learning_rate": 3.7024719222984696e-06, "logits/chosen": -730389925.1255411, "logits/rejected": 1048660222.9718876, "logps/chosen": -1364.3636363636363, "logps/rejected": -4099.084337349397, "loss": 0.0097, "rewards/chosen": 6.316058534564394, "rewards/margins": 3112194.7176649603, "rewards/rejected": -3112188.4016064256, "step": 650 }, { "epoch": 0.8182386008744534, "grad_norm": 0.0594465434551239, "kl": 1.0083333253860474, "learning_rate": 3.6784918420649952e-06, "logits/chosen": -717847362.3703704, "logits/rejected": 960088573.8396624, "logps/chosen": -1455.8024691358025, "logps/rejected": -3842.700421940928, "loss": 0.0086, "rewards/chosen": 6.571082597897377, "rewards/margins": 9235133.119605804, "rewards/rejected": -9235126.548523206, "step": 655 }, { "epoch": 0.8244846970643348, "grad_norm": 0.7943433523178101, "kl": 13.666666984558105, "learning_rate": 3.654371533087586e-06, "logits/chosen": -735423581.8326694, "logits/rejected": 1230538278.0087335, "logps/chosen": -1492.9083665338646, "logps/rejected": -3983.3711790393013, "loss": 0.0139, "rewards/chosen": 6.401267002303287, "rewards/margins": 29458865.440568313, "rewards/rejected": -29458859.03930131, "step": 660 }, { "epoch": 0.8307307932542161, "grad_norm": 0.5621448159217834, "kl": 43.11249923706055, "learning_rate": 3.6301138654418e-06, "logits/chosen": -750683250.6563706, "logits/rejected": 951841305.4841629, "logps/chosen": -1559.7837837837837, "logps/rejected": -2898.823529411765, "loss": 0.0147, "rewards/chosen": 6.53166287101834, "rewards/margins": 61762992.81220586, "rewards/rejected": -61762986.280542985, "step": 665 }, { "epoch": 0.8369768894440974, "grad_norm": 0.06558384746313095, "kl": 22.733333587646484, "learning_rate": 3.6057217255475034e-06, "logits/chosen": -798352653.4736842, "logits/rejected": 947858700.0858369, "logps/chosen": -1364.34008097166, "logps/rejected": -3295.038626609442, "loss": 0.0106, "rewards/chosen": 6.661080920261893, "rewards/margins": 42726151.75979336, "rewards/rejected": -42726145.098712444, "step": 670 }, { "epoch": 0.8432229856339788, "grad_norm": 0.34487301111221313, "kl": 2.7333333492279053, "learning_rate": 3.5811980158254156e-06, "logits/chosen": -712224751.144033, "logits/rejected": 919353387.2067511, "logps/chosen": -1495.9012345679012, "logps/rejected": -4504.84388185654, "loss": 0.0084, "rewards/chosen": 6.79199721096965, "rewards/margins": 18640485.84684953, "rewards/rejected": -18640479.05485232, "step": 675 }, { "epoch": 0.84946908182386, "grad_norm": 0.07742549479007721, "kl": 0.0, "learning_rate": 3.556545654351749e-06, "logits/chosen": -634858531.3103448, "logits/rejected": 1233641207.7419355, "logps/chosen": -1602.4137931034484, "logps/rejected": -4847.4838709677415, "loss": 0.0064, "rewards/chosen": 6.5001373291015625, "rewards/margins": 12399482.113040555, "rewards/rejected": -12399475.612903226, "step": 680 }, { "epoch": 0.8557151780137414, "grad_norm": 0.07713647931814194, "kl": 0.0, "learning_rate": 3.531767574510987e-06, "logits/chosen": -767521783.2478633, "logits/rejected": 1424972158.9593496, "logps/chosen": -1447.6581196581196, "logps/rejected": -4773.463414634146, "loss": 0.009, "rewards/chosen": 6.673850035056089, "rewards/margins": 36117947.747020766, "rewards/rejected": -36117941.07317073, "step": 685 }, { "epoch": 0.8619612742036228, "grad_norm": 0.0707743763923645, "kl": 4.150000095367432, "learning_rate": 3.5068667246468437e-06, "logits/chosen": -698709666.3414634, "logits/rejected": 932399156.5128205, "logps/chosen": -1396.5528455284552, "logps/rejected": -4100.102564102564, "loss": 0.0118, "rewards/chosen": 6.197249249714177, "rewards/margins": 10702407.838274892, "rewards/rejected": -10702401.641025642, "step": 690 }, { "epoch": 0.868207370393504, "grad_norm": 0.0866667851805687, "kl": 4.666666507720947, "learning_rate": 3.481846067711436e-06, "logits/chosen": -739102499.053942, "logits/rejected": 999367512.9037657, "logps/chosen": -1414.240663900415, "logps/rejected": -4140.719665271967, "loss": 0.0065, "rewards/chosen": 6.7948727429655085, "rewards/margins": 23043689.338805795, "rewards/rejected": -23043682.543933053, "step": 695 }, { "epoch": 0.8744534665833854, "grad_norm": 0.06301329284906387, "kl": 4.400000095367432, "learning_rate": 3.4567085809127247e-06, "logits/chosen": -748142063.483871, "logits/rejected": 1094568712.8275862, "logps/chosen": -1499.8709677419354, "logps/rejected": -4060.137931034483, "loss": 0.0071, "rewards/chosen": 6.396800379599294, "rewards/margins": 33424125.569214173, "rewards/rejected": -33424119.172413792, "step": 700 }, { "epoch": 0.8806995627732667, "grad_norm": 0.08260803669691086, "kl": 0.0, "learning_rate": 3.4314572553602577e-06, "logits/chosen": -728655004.5065502, "logits/rejected": 914725900.2390438, "logps/chosen": -1468.0873362445416, "logps/rejected": -3849.434262948207, "loss": 0.0058, "rewards/chosen": 6.83486072257096, "rewards/margins": 14487753.6396416, "rewards/rejected": -14487746.804780876, "step": 705 }, { "epoch": 0.886945658963148, "grad_norm": 0.11022159457206726, "kl": 0.0, "learning_rate": 3.406095095709254e-06, "logits/chosen": -760663350.931174, "logits/rejected": 794069053.527897, "logps/chosen": -1469.9271255060728, "logps/rejected": -3839.725321888412, "loss": 0.0064, "rewards/chosen": 6.487148516573886, "rewards/margins": 22605913.98929444, "rewards/rejected": -22605907.502145924, "step": 710 }, { "epoch": 0.8931917551530294, "grad_norm": 0.08453460782766342, "kl": 0.0, "learning_rate": 3.3806251198030843e-06, "logits/chosen": -895154599.1404959, "logits/rejected": 1068807348.7058823, "logps/chosen": -1484.9586776859503, "logps/rejected": -4220.235294117647, "loss": 0.0093, "rewards/chosen": 6.4102722672391526, "rewards/margins": 18819080.56153277, "rewards/rejected": -18819074.151260503, "step": 715 }, { "epoch": 0.8994378513429107, "grad_norm": 0.0879315510392189, "kl": 0.0, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -890640039.6460177, "logits/rejected": 928196172.5984251, "logps/chosen": -1418.7610619469026, "logps/rejected": -4121.1968503937005, "loss": 0.0083, "rewards/chosen": 6.719538055689989, "rewards/margins": 28256163.94788451, "rewards/rejected": -28256157.228346456, "step": 720 }, { "epoch": 0.905683947532792, "grad_norm": 0.08083692938089371, "kl": 0.0, "learning_rate": 3.3293738543833807e-06, "logits/chosen": -763081357.9831933, "logits/rejected": 966262784.0, "logps/chosen": -1469.9831932773109, "logps/rejected": -4322.644628099173, "loss": 0.0063, "rewards/chosen": 6.22869103696166, "rewards/margins": 14399539.716294343, "rewards/rejected": -14399533.487603305, "step": 725 }, { "epoch": 0.9119300437226733, "grad_norm": 0.08498977869749069, "kl": 0.0, "learning_rate": 3.303598663257904e-06, "logits/chosen": -845987405.0265486, "logits/rejected": 976843493.7952756, "logps/chosen": -1476.3893805309735, "logps/rejected": -3976.566929133858, "loss": 0.0082, "rewards/chosen": 6.599641783047566, "rewards/margins": 20458966.221689027, "rewards/rejected": -20458959.622047246, "step": 730 }, { "epoch": 0.9181761399125546, "grad_norm": 0.1156986802816391, "kl": 0.0, "learning_rate": 3.277727851927727e-06, "logits/chosen": -723482487.4666667, "logits/rejected": 912776669.8666667, "logps/chosen": -1497.3333333333333, "logps/rejected": -3891.733333333333, "loss": 0.009, "rewards/chosen": 6.8691650390625, "rewards/margins": 15557920.469165038, "rewards/rejected": -15557913.6, "step": 735 }, { "epoch": 0.924422236102436, "grad_norm": 0.15747644007205963, "kl": 0.0, "learning_rate": 3.2517644987606827e-06, "logits/chosen": -711371434.6666666, "logits/rejected": 1074933387.6363637, "logps/chosen": -1456.5925925925926, "logps/rejected": -4056.7272727272725, "loss": 0.0082, "rewards/chosen": 6.889565927010995, "rewards/margins": 31410347.738050774, "rewards/rejected": -31410340.848484848, "step": 740 }, { "epoch": 0.9306683322923173, "grad_norm": 0.07812928408384323, "kl": 0.0, "learning_rate": 3.225711693136156e-06, "logits/chosen": -697507742.0876493, "logits/rejected": 1089667358.183406, "logps/chosen": -1519.4262948207172, "logps/rejected": -4072.803493449782, "loss": 0.0107, "rewards/chosen": 6.269356169073705, "rewards/margins": 20432616.085950054, "rewards/rejected": -20432609.816593885, "step": 745 }, { "epoch": 0.9369144284821986, "grad_norm": 0.09357881546020508, "kl": 4.599999904632568, "learning_rate": 3.199572535077481e-06, "logits/chosen": -772760009.0643777, "logits/rejected": 1114988643.4979758, "logps/chosen": -1435.5708154506437, "logps/rejected": -4109.7327935222675, "loss": 0.0085, "rewards/chosen": 6.513185689377682, "rewards/margins": 38015799.2257363, "rewards/rejected": -38015792.71255061, "step": 750 }, { "epoch": 0.94316052467208, "grad_norm": 0.08641842007637024, "kl": 0.0, "learning_rate": 3.173350134883066e-06, "logits/chosen": -784815252.8634361, "logits/rejected": 1084269029.6916995, "logps/chosen": -1494.4140969162995, "logps/rejected": -4097.01185770751, "loss": 0.0048, "rewards/chosen": 6.749071297666575, "rewards/margins": 42705485.57911082, "rewards/rejected": -42705478.83003952, "step": 755 }, { "epoch": 0.9494066208619613, "grad_norm": 0.11333774030208588, "kl": 13.466666221618652, "learning_rate": 3.147047612756302e-06, "logits/chosen": -665163950.4932735, "logits/rejected": 1101347525.229572, "logps/chosen": -1584.57399103139, "logps/rejected": -4380.389105058366, "loss": 0.0135, "rewards/chosen": 6.7324350126121075, "rewards/margins": -230439.8356583726, "rewards/rejected": 230446.56809338523, "step": 760 }, { "epoch": 0.9556527170518426, "grad_norm": 0.07624714821577072, "kl": 0.0, "learning_rate": 3.120668098434291e-06, "logits/chosen": -905083063.1544715, "logits/rejected": 1016455518.0854701, "logps/chosen": -1511.479674796748, "logps/rejected": -4418.188034188034, "loss": 0.0074, "rewards/chosen": 6.443224899167937, "rewards/margins": 31545728.973139428, "rewards/rejected": -31545722.529914528, "step": 765 }, { "epoch": 0.9618988132417239, "grad_norm": 0.05935695767402649, "kl": 11.266666412353516, "learning_rate": 3.094214730815433e-06, "logits/chosen": -773629720.6359832, "logits/rejected": 844717162.2240664, "logps/chosen": -1496.3012552301254, "logps/rejected": -4301.809128630705, "loss": 0.014, "rewards/chosen": 6.738523858361663, "rewards/margins": 28066950.20740353, "rewards/rejected": -28066943.46887967, "step": 770 }, { "epoch": 0.9681449094316052, "grad_norm": 0.048142824321985245, "kl": 0.0, "learning_rate": 3.0676906575859335e-06, "logits/chosen": -790502942.117647, "logits/rejected": 963840660.0991735, "logps/chosen": -1486.6554621848738, "logps/rejected": -4551.4049586776855, "loss": 0.0075, "rewards/chosen": 6.345016864167542, "rewards/margins": 13151236.229314385, "rewards/rejected": -13151229.88429752, "step": 775 }, { "epoch": 0.9743910056214866, "grad_norm": 0.06406976282596588, "kl": 0.0, "learning_rate": 3.0410990348452572e-06, "logits/chosen": -673650812.6608696, "logits/rejected": 1048039129.088, "logps/chosen": -1577.5304347826086, "logps/rejected": -4895.744, "loss": 0.0057, "rewards/chosen": 6.700010614809782, "rewards/margins": 10417035.820010614, "rewards/rejected": -10417029.12, "step": 780 }, { "epoch": 0.9806371018113679, "grad_norm": 0.060679152607917786, "kl": 0.0, "learning_rate": 3.0144430267305874e-06, "logits/chosen": -849952403.9111111, "logits/rejected": 943619710.4941176, "logps/chosen": -1398.6844444444444, "logps/rejected": -4804.266666666666, "loss": 0.0062, "rewards/chosen": 6.7072553168402775, "rewards/margins": 11713205.389608258, "rewards/rejected": -11713198.682352941, "step": 785 }, { "epoch": 0.9868831980012492, "grad_norm": 0.08234000205993652, "kl": 0.0, "learning_rate": 2.9877258050403214e-06, "logits/chosen": -818881825.221374, "logits/rejected": 1011058143.119266, "logps/chosen": -1413.0687022900763, "logps/rejected": -4671.412844036698, "loss": 0.0087, "rewards/chosen": 6.607599855379294, "rewards/margins": 45059064.515856735, "rewards/rejected": -45059057.90825688, "step": 790 }, { "epoch": 0.9931292941911305, "grad_norm": 0.07609214633703232, "kl": 0.0, "learning_rate": 2.9609505488566585e-06, "logits/chosen": -804330948.4651163, "logits/rejected": 1076613599.7117116, "logps/chosen": -1639.8139534883721, "logps/rejected": -4862.846846846847, "loss": 0.0127, "rewards/chosen": 6.3508963178294575, "rewards/margins": 54194814.27882425, "rewards/rejected": -54194807.927927926, "step": 795 }, { "epoch": 0.9993753903810119, "grad_norm": 0.0829169750213623, "kl": 0.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -796991022.9519651, "logits/rejected": 1063515074.8047808, "logps/chosen": -1517.9737991266375, "logps/rejected": -5103.681274900398, "loss": 0.0058, "rewards/chosen": 6.788350334334061, "rewards/margins": 13866081.561258702, "rewards/rejected": -13866074.772908367, "step": 800 }, { "epoch": 1.0062460961898814, "grad_norm": 0.06701330095529556, "kl": 0.5483871102333069, "learning_rate": 2.9072386834864723e-06, "logits/chosen": -763707123.409836, "logits/rejected": 865724318.4761904, "logps/chosen": -1432.9180327868853, "logps/rejected": -4660.825396825397, "loss": 0.0085, "rewards/chosen": 6.726138255635246, "rewards/margins": 11638771.424550954, "rewards/rejected": -11638764.698412698, "step": 805 }, { "epoch": 1.0124921923797627, "grad_norm": 1.086881160736084, "kl": 0.0, "learning_rate": 2.880308465474792e-06, "logits/chosen": -682072029.2881356, "logits/rejected": 712537474.0983607, "logps/chosen": -1559.1864406779662, "logps/rejected": -4389.770491803279, "loss": 0.009, "rewards/chosen": 7.010221836930614, "rewards/margins": 21856382.616779212, "rewards/rejected": -21856375.606557377, "step": 810 }, { "epoch": 1.0187382885696439, "grad_norm": 0.08652088791131973, "kl": 0.0, "learning_rate": 2.8533329945589192e-06, "logits/chosen": -876447524.2575108, "logits/rejected": 1136792231.902834, "logps/chosen": -1390.8326180257511, "logps/rejected": -5287.384615384615, "loss": 0.0064, "rewards/chosen": 7.198371592509388, "rewards/margins": 17011080.753027465, "rewards/rejected": -17011073.554655872, "step": 815 }, { "epoch": 1.0249843847595252, "grad_norm": 0.10212092101573944, "kl": 0.0, "learning_rate": 2.82631548055013e-06, "logits/chosen": -851879919.616, "logits/rejected": 1202871678.8869565, "logps/chosen": -1350.912, "logps/rejected": -5666.504347826087, "loss": 0.0088, "rewards/chosen": 6.3730400390625, "rewards/margins": 17158576.668692213, "rewards/rejected": -17158570.295652173, "step": 820 }, { "epoch": 1.0312304809494066, "grad_norm": 0.05050951614975929, "kl": 0.0, "learning_rate": 2.7992591382624064e-06, "logits/chosen": -876872855.5336323, "logits/rejected": 692957773.696498, "logps/chosen": -1370.1165919282512, "logps/rejected": -4394.334630350195, "loss": 0.0074, "rewards/chosen": 6.687953247617713, "rewards/margins": 15439475.263828734, "rewards/rejected": -15439468.575875487, "step": 825 }, { "epoch": 1.037476577139288, "grad_norm": 0.17391963303089142, "kl": 0.0, "learning_rate": 2.7721671871299115e-06, "logits/chosen": -870837886.9059829, "logits/rejected": 948289935.6097561, "logps/chosen": -1535.2478632478633, "logps/rejected": -4426.926829268293, "loss": 0.0094, "rewards/chosen": 7.150073972522703, "rewards/margins": 36318818.72730975, "rewards/rejected": -36318811.57723577, "step": 830 }, { "epoch": 1.0437226733291693, "grad_norm": 0.05891693755984306, "kl": 1.2000000476837158, "learning_rate": 2.7450428508239024e-06, "logits/chosen": -946800960.7710843, "logits/rejected": 793304485.1255411, "logps/chosen": -1461.0763052208836, "logps/rejected": -4203.774891774891, "loss": 0.005, "rewards/chosen": 6.78319430064006, "rewards/margins": 9203793.588389104, "rewards/rejected": -9203786.805194804, "step": 835 }, { "epoch": 1.0499687695190505, "grad_norm": 0.0753137543797493, "kl": 0.0, "learning_rate": 2.717889356869146e-06, "logits/chosen": -808641250.8862745, "logits/rejected": 812026575.0755556, "logps/chosen": -1453.678431372549, "logps/rejected": -4184.746666666667, "loss": 0.0095, "rewards/chosen": 6.665716911764706, "rewards/margins": 23116390.239050247, "rewards/rejected": -23116383.573333334, "step": 840 }, { "epoch": 1.0562148657089319, "grad_norm": 0.049415141344070435, "kl": 6.5333333015441895, "learning_rate": 2.6907099362598815e-06, "logits/chosen": -968955313.898305, "logits/rejected": 909888931.6721312, "logps/chosen": -1353.7627118644068, "logps/rejected": -4428.459016393443, "loss": 0.0098, "rewards/chosen": 6.966336007845604, "rewards/margins": 45752528.40895896, "rewards/rejected": -45752521.44262295, "step": 845 }, { "epoch": 1.0624609618988132, "grad_norm": 0.08883284777402878, "kl": 2.566666603088379, "learning_rate": 2.663507823075358e-06, "logits/chosen": -916332062.117647, "logits/rejected": 652192607.2066115, "logps/chosen": -1383.5966386554621, "logps/rejected": -3607.5371900826444, "loss": 0.0059, "rewards/chosen": 7.278132590926996, "rewards/margins": 19222385.063256558, "rewards/rejected": -19222377.785123967, "step": 850 }, { "epoch": 1.0687070580886946, "grad_norm": 0.18526360392570496, "kl": 0.0, "learning_rate": 2.6362862540950163e-06, "logits/chosen": -882909916.0510638, "logits/rejected": 661963888.8489796, "logps/chosen": -1447.2170212765957, "logps/rejected": -3268.9632653061226, "loss": 0.0099, "rewards/chosen": 6.842736037234043, "rewards/margins": 42400891.87130746, "rewards/rejected": -42400885.02857143, "step": 855 }, { "epoch": 1.074953154278576, "grad_norm": 0.14632245898246765, "kl": 6.016666889190674, "learning_rate": 2.6090484684133406e-06, "logits/chosen": -773478470.6206896, "logits/rejected": 809466846.967742, "logps/chosen": -1492.2758620689656, "logps/rejected": -3227.0967741935483, "loss": 0.0072, "rewards/chosen": 6.721473430765086, "rewards/margins": 33828763.36663473, "rewards/rejected": -33828756.64516129, "step": 860 }, { "epoch": 1.0811992504684573, "grad_norm": 0.134897381067276, "kl": 0.0, "learning_rate": 2.5817977070544408e-06, "logits/chosen": -743969481.9816514, "logits/rejected": 971032904.3053435, "logps/chosen": -1543.8165137614678, "logps/rejected": -3347.053435114504, "loss": 0.0055, "rewards/chosen": 7.166585659762041, "rewards/margins": 12854784.326891003, "rewards/rejected": -12854777.160305344, "step": 865 }, { "epoch": 1.0874453466583385, "grad_norm": 0.06570931524038315, "kl": 28.450000762939453, "learning_rate": 2.554537212586403e-06, "logits/chosen": -829488342.9135803, "logits/rejected": 857283882.1265823, "logps/chosen": -1473.0864197530864, "logps/rejected": -3268.590717299578, "loss": 0.0132, "rewards/chosen": 6.962601775495113, "rewards/margins": 34971415.16513342, "rewards/rejected": -34971408.20253164, "step": 870 }, { "epoch": 1.0936914428482198, "grad_norm": 0.05726313218474388, "kl": 7.766666889190674, "learning_rate": 2.527270228735456e-06, "logits/chosen": -825991529.8955823, "logits/rejected": 898852057.2121212, "logps/chosen": -1504.7710843373493, "logps/rejected": -3464.034632034632, "loss": 0.006, "rewards/chosen": 6.7191294474774095, "rewards/margins": 26011091.740774468, "rewards/rejected": -26011085.02164502, "step": 875 }, { "epoch": 1.0999375390381012, "grad_norm": 0.2399851530790329, "kl": 1.2166666984558105, "learning_rate": 2.5e-06, "logits/chosen": -906456969.0290457, "logits/rejected": 896194654.2594142, "logps/chosen": -1351.5684647302905, "logps/rejected": -3565.255230125523, "loss": 0.0053, "rewards/chosen": 6.544242637286048, "rewards/margins": 50190004.067255184, "rewards/rejected": -50189997.52301255, "step": 880 }, { "epoch": 1.1061836352279826, "grad_norm": 0.0749124139547348, "kl": 7.150000095367432, "learning_rate": 2.4727297712645446e-06, "logits/chosen": -879664646.3209877, "logits/rejected": 918287113.721519, "logps/chosen": -1344.2633744855966, "logps/rejected": -3626.9367088607596, "loss": 0.0092, "rewards/chosen": 6.501157407407407, "rewards/margins": 46364501.35347809, "rewards/rejected": -46364494.85232068, "step": 885 }, { "epoch": 1.1124297314178637, "grad_norm": 0.0572606585919857, "kl": 10.399999618530273, "learning_rate": 2.4454627874135976e-06, "logits/chosen": -1075104972.8, "logits/rejected": 832289723.7333333, "logps/chosen": -1370.6666666666667, "logps/rejected": -3668.0, "loss": 0.0071, "rewards/chosen": 7.1034596761067705, "rewards/margins": 29097848.17012634, "rewards/rejected": -29097841.066666666, "step": 890 }, { "epoch": 1.118675827607745, "grad_norm": 0.08893398195505142, "kl": 0.2666666805744171, "learning_rate": 2.41820229294556e-06, "logits/chosen": -897616008.5333333, "logits/rejected": 795406062.9333333, "logps/chosen": -1664.1333333333334, "logps/rejected": -3720.5333333333333, "loss": 0.0087, "rewards/chosen": 7.050230916341146, "rewards/margins": 46222744.11689758, "rewards/rejected": -46222737.06666667, "step": 895 }, { "epoch": 1.1249219237976265, "grad_norm": 0.04612141475081444, "kl": 3.1500000953674316, "learning_rate": 2.3909515315866606e-06, "logits/chosen": -1041832217.0980393, "logits/rejected": 625883363.5555556, "logps/chosen": -1448.9098039215687, "logps/rejected": -3645.7244444444445, "loss": 0.0051, "rewards/chosen": 6.208157169117647, "rewards/margins": 32499763.71926828, "rewards/rejected": -32499757.51111111, "step": 900 }, { "epoch": 1.1311680199875078, "grad_norm": 0.09885445982217789, "kl": 13.033333778381348, "learning_rate": 2.363713745904984e-06, "logits/chosen": -1081814542.995816, "logits/rejected": 461145015.76763487, "logps/chosen": -1408.8702928870293, "logps/rejected": -3603.3858921161827, "loss": 0.0046, "rewards/chosen": 6.505180071587343, "rewards/margins": 36068290.52177758, "rewards/rejected": -36068284.01659751, "step": 905 }, { "epoch": 1.1374141161773892, "grad_norm": 0.06535188853740692, "kl": 0.0, "learning_rate": 2.3364921769246423e-06, "logits/chosen": -1133595675.6756756, "logits/rejected": 612681331.1007752, "logps/chosen": -1402.8108108108108, "logps/rejected": -3662.8837209302324, "loss": 0.0049, "rewards/chosen": 6.57831250439893, "rewards/margins": 14068877.52404894, "rewards/rejected": -14068870.945736434, "step": 910 }, { "epoch": 1.1436602123672706, "grad_norm": 0.031041713431477547, "kl": 5.800000190734863, "learning_rate": 2.3092900637401193e-06, "logits/chosen": -1192516089.5732217, "logits/rejected": 424164220.28215766, "logps/chosen": -1403.44769874477, "logps/rejected": -3883.286307053942, "loss": 0.007, "rewards/chosen": 6.954869226431747, "rewards/margins": 42810608.64781529, "rewards/rejected": -42810601.69294606, "step": 915 }, { "epoch": 1.1499063085571517, "grad_norm": 0.05815276503562927, "kl": 12.633333206176758, "learning_rate": 2.2821106431308546e-06, "logits/chosen": -1078428576.6477733, "logits/rejected": 431689288.51502144, "logps/chosen": -1423.5465587044534, "logps/rejected": -4039.6909871244634, "loss": 0.0103, "rewards/chosen": 6.802721525493421, "rewards/margins": 29919731.02589749, "rewards/rejected": -29919724.223175965, "step": 920 }, { "epoch": 1.156152404747033, "grad_norm": 0.14204278588294983, "kl": 0.0, "learning_rate": 2.2549571491760985e-06, "logits/chosen": -1101822333.8305085, "logits/rejected": 492673863.3442623, "logps/chosen": -1459.050847457627, "logps/rejected": -4242.360655737705, "loss": 0.0094, "rewards/chosen": 6.55073831849179, "rewards/margins": 50274793.17368914, "rewards/rejected": -50274786.62295082, "step": 925 }, { "epoch": 1.1623985009369144, "grad_norm": 0.034907300025224686, "kl": 0.0, "learning_rate": 2.2278328128700893e-06, "logits/chosen": -1120958714.5991561, "logits/rejected": 516123778.63374484, "logps/chosen": -1365.0632911392406, "logps/rejected": -4247.7037037037035, "loss": 0.0061, "rewards/chosen": 6.981713661161656, "rewards/margins": 10248818.438503785, "rewards/rejected": -10248811.456790123, "step": 930 }, { "epoch": 1.1686445971267958, "grad_norm": 0.05778981000185013, "kl": 0.0, "learning_rate": 2.2007408617375944e-06, "logits/chosen": -1153482944.7529411, "logits/rejected": 333381923.27111113, "logps/chosen": -1441.3803921568629, "logps/rejected": -4042.5244444444443, "loss": 0.0044, "rewards/chosen": 6.869606885723039, "rewards/margins": 59051943.29627355, "rewards/rejected": -59051936.42666667, "step": 935 }, { "epoch": 1.1748906933166772, "grad_norm": 0.05797224119305611, "kl": 0.0, "learning_rate": 2.173684519449872e-06, "logits/chosen": -1122529125.757322, "logits/rejected": 264763264.53112033, "logps/chosen": -1340.7866108786611, "logps/rejected": -3995.0871369294605, "loss": 0.0052, "rewards/chosen": 6.821588364605126, "rewards/margins": 60523131.6680614, "rewards/rejected": -60523124.84647303, "step": 940 }, { "epoch": 1.1811367895065583, "grad_norm": 0.3804548680782318, "kl": 6.166666507720947, "learning_rate": 2.146667005441082e-06, "logits/chosen": -1062347298.1333333, "logits/rejected": 383971054.93333334, "logps/chosen": -1508.7333333333333, "logps/rejected": -4347.2, "loss": 0.0089, "rewards/chosen": 6.360628763834636, "rewards/margins": 30816569.560628764, "rewards/rejected": -30816563.2, "step": 945 }, { "epoch": 1.1873828856964397, "grad_norm": 0.04914607107639313, "kl": 0.0, "learning_rate": 2.1196915345252085e-06, "logits/chosen": -1130330548.4590163, "logits/rejected": 303704931.7966102, "logps/chosen": -1441.9016393442623, "logps/rejected": -4298.3050847457625, "loss": 0.0046, "rewards/chosen": 6.969489425909324, "rewards/margins": 28575672.86779451, "rewards/rejected": -28575665.898305085, "step": 950 }, { "epoch": 1.193628981886321, "grad_norm": 0.033815231174230576, "kl": 0.0, "learning_rate": 2.0927613165135285e-06, "logits/chosen": -1095581768.1762114, "logits/rejected": 306727130.5612648, "logps/chosen": -1453.0396475770924, "logps/rejected": -4572.584980237154, "loss": 0.0066, "rewards/chosen": 6.561520210971916, "rewards/margins": 32260731.93701428, "rewards/rejected": -32260725.37549407, "step": 955 }, { "epoch": 1.1998750780762024, "grad_norm": 0.28993692994117737, "kl": 3.1666667461395264, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1137361027.072, "logits/rejected": 360618963.4782609, "logps/chosen": -1458.944, "logps/rejected": -4253.495652173913, "loss": 0.0061, "rewards/chosen": 6.25701904296875, "rewards/margins": 26442014.743975565, "rewards/rejected": -26442008.486956522, "step": 960 }, { "epoch": 1.2061211742660838, "grad_norm": 0.048584070056676865, "kl": 2.433333396911621, "learning_rate": 2.039049451143342e-06, "logits/chosen": -1235638418.9029536, "logits/rejected": 253565526.38683128, "logps/chosen": -1440.1350210970463, "logps/rejected": -3978.008230452675, "loss": 0.004, "rewards/chosen": 6.875963170820148, "rewards/margins": 24204303.303946707, "rewards/rejected": -24204296.427983537, "step": 965 }, { "epoch": 1.212367270455965, "grad_norm": 0.1172674149274826, "kl": 2.883333444595337, "learning_rate": 2.01227419495968e-06, "logits/chosen": -1092509557.1525424, "logits/rejected": 51586501.24590164, "logps/chosen": -1448.8813559322034, "logps/rejected": -3612.8524590163934, "loss": 0.0083, "rewards/chosen": 6.858350850768009, "rewards/margins": 14972855.120645933, "rewards/rejected": -14972848.262295082, "step": 970 }, { "epoch": 1.2186133666458463, "grad_norm": 0.13202552497386932, "kl": 22.200000762939453, "learning_rate": 1.985556973269413e-06, "logits/chosen": -1104590253.419355, "logits/rejected": 359010727.7241379, "logps/chosen": -1484.774193548387, "logps/rejected": -3469.793103448276, "loss": 0.0057, "rewards/chosen": 7.040771484375, "rewards/margins": 48334387.17870252, "rewards/rejected": -48334380.137931034, "step": 975 }, { "epoch": 1.2248594628357277, "grad_norm": 0.08171664923429489, "kl": 11.266666412353516, "learning_rate": 1.958900965154743e-06, "logits/chosen": -1088931073.053498, "logits/rejected": 97844971.47679324, "logps/chosen": -1347.6213991769548, "logps/rejected": -3505.1476793248944, "loss": 0.0062, "rewards/chosen": 6.958841708461934, "rewards/margins": 21401479.498926096, "rewards/rejected": -21401472.540084388, "step": 980 }, { "epoch": 1.231105559025609, "grad_norm": 0.13173836469650269, "kl": 46.71666717529297, "learning_rate": 1.9323093424140673e-06, "logits/chosen": -1094815644.097561, "logits/rejected": 227897238.97435898, "logps/chosen": -1464.0650406504064, "logps/rejected": -3056.136752136752, "loss": 0.011, "rewards/chosen": 7.490336596481199, "rewards/margins": 25790834.70401181, "rewards/rejected": -25790827.213675212, "step": 985 }, { "epoch": 1.2373516552154904, "grad_norm": 0.09634287655353546, "kl": 29.266666412353516, "learning_rate": 1.9057852691845677e-06, "logits/chosen": -1083139849.560166, "logits/rejected": 329840768.53556484, "logps/chosen": -1380.5809128630706, "logps/rejected": -3287.297071129707, "loss": 0.0128, "rewards/chosen": 6.732099730938796, "rewards/margins": 21379312.23837588, "rewards/rejected": -21379305.50627615, "step": 990 }, { "epoch": 1.2435977514053715, "grad_norm": 0.034083809703588486, "kl": 2.058333396911621, "learning_rate": 1.8793319015657091e-06, "logits/chosen": -1052806152.7521367, "logits/rejected": 341929551.08943087, "logps/chosen": -1385.7094017094016, "logps/rejected": -3508.5528455284552, "loss": 0.0068, "rewards/chosen": 6.91802978515625, "rewards/margins": 45265156.836728975, "rewards/rejected": -45265149.91869919, "step": 995 }, { "epoch": 1.249843847595253, "grad_norm": 0.05067850649356842, "kl": 0.0, "learning_rate": 1.852952387243698e-06, "logits/chosen": -1049794857.5726496, "logits/rejected": 357337437.6585366, "logps/chosen": -1484.034188034188, "logps/rejected": -3700.8130081300815, "loss": 0.0035, "rewards/chosen": 6.565341525607639, "rewards/margins": 30968287.02062608, "rewards/rejected": -30968280.455284555, "step": 1000 }, { "epoch": 1.249843847595253, "eval_kl": 6.689723491668701, "eval_logits/chosen": -1030192454.8085107, "eval_logits/rejected": 341122426.56969696, "eval_logps/chosen": -1450.943907156673, "eval_logps/rejected": -3746.327272727273, "eval_loss": 0.007318771444261074, "eval_rewards/chosen": 6.702488439917795, "eval_rewards/margins": 34027874.24188238, "eval_rewards/rejected": -34027867.53939394, "eval_runtime": 640.6156, "eval_samples_per_second": 6.314, "eval_steps_per_second": 0.395, "step": 1000 }, { "epoch": 1.2560899437851343, "grad_norm": 0.5064871907234192, "kl": 0.6333333253860474, "learning_rate": 1.8266498651169352e-06, "logits/chosen": -1037007838.967742, "logits/rejected": 415172819.86206895, "logps/chosen": -1417.4193548387098, "logps/rejected": -4225.103448275862, "loss": 0.0039, "rewards/chosen": 7.380758962323589, "rewards/margins": 24907485.44972448, "rewards/rejected": -24907478.068965517, "step": 1005 }, { "epoch": 1.2623360399750156, "grad_norm": 0.06193430721759796, "kl": 0.0, "learning_rate": 1.8004274649225201e-06, "logits/chosen": -985542573.0850202, "logits/rejected": 323438442.5751073, "logps/chosen": -1514.5587044534414, "logps/rejected": -3861.1502145922746, "loss": 0.0061, "rewards/chosen": 6.887757088973937, "rewards/margins": 33208175.26543949, "rewards/rejected": -33208168.377682403, "step": 1010 }, { "epoch": 1.268582136164897, "grad_norm": 0.03230896219611168, "kl": 7.991666793823242, "learning_rate": 1.7742883068638447e-06, "logits/chosen": -1005395296.5245901, "logits/rejected": 510674284.4745763, "logps/chosen": -1370.4262295081967, "logps/rejected": -3835.6610169491523, "loss": 0.0075, "rewards/chosen": 6.63354242043417, "rewards/margins": 29501325.142017, "rewards/rejected": -29501318.508474577, "step": 1015 }, { "epoch": 1.2748282323547784, "grad_norm": 0.03562912717461586, "kl": 19.53333282470703, "learning_rate": 1.7482355012393177e-06, "logits/chosen": -955716859.8032787, "logits/rejected": 323245767.59322035, "logps/chosen": -1482.0983606557377, "logps/rejected": -3633.898305084746, "loss": 0.0073, "rewards/chosen": 6.989379382524334, "rewards/margins": 17213807.124972604, "rewards/rejected": -17213800.13559322, "step": 1020 }, { "epoch": 1.2810743285446595, "grad_norm": 0.05819810554385185, "kl": 5.400000095367432, "learning_rate": 1.722272148072273e-06, "logits/chosen": -1113979296.3983402, "logits/rejected": 439603422.7949791, "logps/chosen": -1445.5767634854772, "logps/rejected": -3578.1087866108787, "loss": 0.0048, "rewards/chosen": 6.725795635049274, "rewards/margins": 81843742.2906492, "rewards/rejected": -81843735.56485356, "step": 1025 }, { "epoch": 1.287320424734541, "grad_norm": 0.03748779371380806, "kl": 3.299999952316284, "learning_rate": 1.6964013367420967e-06, "logits/chosen": -988995373.948718, "logits/rejected": 310352920.9756098, "logps/chosen": -1538.4615384615386, "logps/rejected": -3188.2926829268295, "loss": 0.0066, "rewards/chosen": 6.511683276575854, "rewards/margins": 55828161.82875645, "rewards/rejected": -55828155.317073174, "step": 1030 }, { "epoch": 1.2935665209244223, "grad_norm": 0.032022129744291306, "kl": 3.616666555404663, "learning_rate": 1.6706261456166205e-06, "logits/chosen": -1067770272.5423728, "logits/rejected": 299978684.852459, "logps/chosen": -1298.3050847457628, "logps/rejected": -3424.5245901639346, "loss": 0.0049, "rewards/chosen": 6.950107380495233, "rewards/margins": 31364489.048468035, "rewards/rejected": -31364482.098360654, "step": 1035 }, { "epoch": 1.2998126171143036, "grad_norm": 1.6918329000473022, "kl": 17.450000762939453, "learning_rate": 1.6449496416858285e-06, "logits/chosen": -1045500177.0666667, "logits/rejected": 558655078.4, "logps/chosen": -1459.0166666666667, "logps/rejected": -3562.133333333333, "loss": 0.0086, "rewards/chosen": 6.9263356526692705, "rewards/margins": 35992681.05966899, "rewards/rejected": -35992674.13333333, "step": 1040 }, { "epoch": 1.3060587133041848, "grad_norm": 0.08456436544656754, "kl": 12.133333206176758, "learning_rate": 1.6193748801969164e-06, "logits/chosen": -949660330.6666666, "logits/rejected": 287388467.2, "logps/chosen": -1376.4, "logps/rejected": -3333.0666666666666, "loss": 0.0053, "rewards/chosen": 6.561256408691406, "rewards/margins": 14701292.961256409, "rewards/rejected": -14701286.4, "step": 1045 }, { "epoch": 1.3123048094940661, "grad_norm": 0.0434587299823761, "kl": 20.799999237060547, "learning_rate": 1.5939049042907463e-06, "logits/chosen": -961389484.0655738, "logits/rejected": 351912769.08474576, "logps/chosen": -1404.8524590163934, "logps/rejected": -3683.7966101694915, "loss": 0.008, "rewards/chosen": 6.551050405033299, "rewards/margins": 32163902.957830068, "rewards/rejected": -32163896.40677966, "step": 1050 }, { "epoch": 1.3185509056839475, "grad_norm": 0.05942801013588905, "kl": 13.866666793823242, "learning_rate": 1.5685427446397427e-06, "logits/chosen": -1025611711.1312218, "logits/rejected": 455446354.03861004, "logps/chosen": -1561.9185520361991, "logps/rejected": -3815.289575289575, "loss": 0.0066, "rewards/chosen": 6.581945082720588, "rewards/margins": 32960453.34642385, "rewards/rejected": -32960446.764478765, "step": 1055 }, { "epoch": 1.3247970018738289, "grad_norm": 0.0555511899292469, "kl": 7.133333206176758, "learning_rate": 1.5432914190872757e-06, "logits/chosen": -1093971142.0576131, "logits/rejected": 627083842.9704641, "logps/chosen": -1495.9670781893003, "logps/rejected": -3863.7637130801686, "loss": 0.0055, "rewards/chosen": 6.9783146862139915, "rewards/margins": 25399112.868610047, "rewards/rejected": -25399105.89029536, "step": 1060 }, { "epoch": 1.3310430980637102, "grad_norm": 0.05366786941885948, "kl": 8.399999618530273, "learning_rate": 1.5181539322885652e-06, "logits/chosen": -972289011.9529412, "logits/rejected": 415688148.7644445, "logps/chosen": -1533.7411764705882, "logps/rejected": -3943.5377777777776, "loss": 0.0075, "rewards/chosen": 6.679204963235295, "rewards/margins": 57861982.28809385, "rewards/rejected": -57861975.60888889, "step": 1065 }, { "epoch": 1.3372891942535916, "grad_norm": 0.03548969328403473, "kl": 0.0, "learning_rate": 1.4931332753531575e-06, "logits/chosen": -974249031.4418604, "logits/rejected": 311672399.2150943, "logps/chosen": -1517.3953488372092, "logps/rejected": -3859.8037735849057, "loss": 0.0036, "rewards/chosen": 7.6878400935683135, "rewards/margins": 18645339.733123112, "rewards/rejected": -18645332.04528302, "step": 1070 }, { "epoch": 1.3435352904434728, "grad_norm": 0.022944239899516106, "kl": 5.241666793823242, "learning_rate": 1.4682324254890135e-06, "logits/chosen": -901207572.1222707, "logits/rejected": 596886222.0239043, "logps/chosen": -1441.6768558951965, "logps/rejected": -3946.5816733067727, "loss": 0.004, "rewards/chosen": 6.79593521851119, "rewards/margins": 43888491.768046774, "rewards/rejected": -43888484.97211155, "step": 1075 }, { "epoch": 1.3497813866333541, "grad_norm": 0.03675493597984314, "kl": 0.0, "learning_rate": 1.443454345648252e-06, "logits/chosen": -1049132766.0176991, "logits/rejected": 458731358.7401575, "logps/chosen": -1401.9823008849557, "logps/rejected": -3990.4251968503936, "loss": 0.0041, "rewards/chosen": 6.76517830907771, "rewards/margins": 32297702.513209805, "rewards/rejected": -32297695.748031497, "step": 1080 }, { "epoch": 1.3560274828232355, "grad_norm": 0.03743245080113411, "kl": 20.516666412353516, "learning_rate": 1.4188019841745842e-06, "logits/chosen": -938349038.0700389, "logits/rejected": 473034733.63228697, "logps/chosen": -1411.1128404669262, "logps/rejected": -3885.345291479821, "loss": 0.0057, "rewards/chosen": 6.771459675948444, "rewards/margins": 14190376.062939497, "rewards/rejected": -14190369.29147982, "step": 1085 }, { "epoch": 1.3622735790131169, "grad_norm": 0.1011684238910675, "kl": 21.53333282470703, "learning_rate": 1.3942782744524974e-06, "logits/chosen": -1010349431.8987342, "logits/rejected": 668188874.2716049, "logps/chosen": -1350.210970464135, "logps/rejected": -4185.020576131687, "loss": 0.0073, "rewards/chosen": 6.38992760251846, "rewards/margins": 14171458.43931032, "rewards/rejected": -14171452.049382716, "step": 1090 }, { "epoch": 1.368519675202998, "grad_norm": 0.050006281584501266, "kl": 0.0, "learning_rate": 1.369886134558201e-06, "logits/chosen": -1023652155.0769231, "logits/rejected": 468892732.2352941, "logps/chosen": -1394.6538461538462, "logps/rejected": -3811.0588235294117, "loss": 0.0038, "rewards/chosen": 7.336665813739483, "rewards/margins": 22742599.336665813, "rewards/rejected": -22742592.0, "step": 1095 }, { "epoch": 1.3747657713928794, "grad_norm": 0.04441044107079506, "kl": 7.0, "learning_rate": 1.3456284669124159e-06, "logits/chosen": -1112350480.0669456, "logits/rejected": 470060087.2365145, "logps/chosen": -1366.7615062761506, "logps/rejected": -3846.3734439834025, "loss": 0.0058, "rewards/chosen": 6.793774209760722, "rewards/margins": 25733330.743981678, "rewards/rejected": -25733323.950207468, "step": 1100 }, { "epoch": 1.3810118675827607, "grad_norm": 0.04001372680068016, "kl": 0.550000011920929, "learning_rate": 1.3215081579350058e-06, "logits/chosen": -1090801010.0168068, "logits/rejected": 500092758.74380165, "logps/chosen": -1313.4789915966387, "logps/rejected": -4148.892561983471, "loss": 0.0048, "rewards/chosen": 6.737123633633141, "rewards/margins": 30996945.84456165, "rewards/rejected": -30996939.107438017, "step": 1105 }, { "epoch": 1.387257963772642, "grad_norm": 0.05370747670531273, "kl": 6.116666793823242, "learning_rate": 1.2975280777015315e-06, "logits/chosen": -955014613.1633466, "logits/rejected": 278692269.2751092, "logps/chosen": -1469.4501992031871, "logps/rejected": -3723.598253275109, "loss": 0.0082, "rewards/chosen": 6.8361475971115535, "rewards/margins": 18840817.595972925, "rewards/rejected": -18840810.759825327, "step": 1110 }, { "epoch": 1.3935040599625235, "grad_norm": 0.10151516646146774, "kl": 5.0333333015441895, "learning_rate": 1.2736910796017302e-06, "logits/chosen": -874066532.7874016, "logits/rejected": 606169722.3362832, "logps/chosen": -1502.4881889763778, "logps/rejected": -3978.761061946903, "loss": 0.0058, "rewards/chosen": 6.727448711245079, "rewards/margins": 24209047.718599156, "rewards/rejected": -24209040.991150443, "step": 1115 }, { "epoch": 1.3997501561524048, "grad_norm": 0.03879309073090553, "kl": 4.183333396911621, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -974429236.0677966, "logits/rejected": 597739889.3114754, "logps/chosen": -1249.3559322033898, "logps/rejected": -3902.688524590164, "loss": 0.0045, "rewards/chosen": 6.744752011056674, "rewards/margins": 29603798.48245693, "rewards/rejected": -29603791.737704918, "step": 1120 }, { "epoch": 1.4059962523422862, "grad_norm": 0.04185258969664574, "kl": 13.066666603088379, "learning_rate": 1.2264576578978956e-06, "logits/chosen": -792940402.7586207, "logits/rejected": 305645105.5483871, "logps/chosen": -1571.0344827586207, "logps/rejected": -3693.4193548387098, "loss": 0.0082, "rewards/chosen": 6.826761706122037, "rewards/margins": 13727978.955793964, "rewards/rejected": -13727972.129032258, "step": 1125 }, { "epoch": 1.4122423485321673, "grad_norm": 0.03908821567893028, "kl": 6.266666889190674, "learning_rate": 1.203066854598696e-06, "logits/chosen": -893589702.1935484, "logits/rejected": 462767774.8965517, "logps/chosen": -1564.516129032258, "logps/rejected": -3881.103448275862, "loss": 0.0087, "rewards/chosen": 6.733085386214718, "rewards/margins": 33438053.62963711, "rewards/rejected": -33438046.896551725, "step": 1130 }, { "epoch": 1.4184884447220487, "grad_norm": 0.05348537489771843, "kl": 0.0, "learning_rate": 1.1798303733740801e-06, "logits/chosen": -912541306.2995951, "logits/rejected": 547716698.0944206, "logps/chosen": -1501.7327935222672, "logps/rejected": -4242.403433476395, "loss": 0.0047, "rewards/chosen": 6.570156329073886, "rewards/margins": 41152707.5486971, "rewards/rejected": -41152700.97854077, "step": 1135 }, { "epoch": 1.42473454091193, "grad_norm": 0.03471559286117554, "kl": 0.0, "learning_rate": 1.1567509791329402e-06, "logits/chosen": -936182292.8130082, "logits/rejected": 496272200.2051282, "logps/chosen": -1443.3821138211383, "logps/rejected": -3950.4957264957266, "loss": 0.0067, "rewards/chosen": 6.8144675154026935, "rewards/margins": 13923291.053783756, "rewards/rejected": -13923284.23931624, "step": 1140 }, { "epoch": 1.4309806371018114, "grad_norm": 0.03510862588882446, "kl": 3.0999999046325684, "learning_rate": 1.1338314180923917e-06, "logits/chosen": -1019970495.7322176, "logits/rejected": 594355501.6763486, "logps/chosen": -1436.1171548117154, "logps/rejected": -4024.033195020747, "loss": 0.0038, "rewards/chosen": 6.848525043311977, "rewards/margins": 26934969.304956578, "rewards/rejected": -26934962.456431534, "step": 1145 }, { "epoch": 1.4372267332916926, "grad_norm": 0.045122962445020676, "kl": 13.449999809265137, "learning_rate": 1.1110744174509952e-06, "logits/chosen": -953595309.4193548, "logits/rejected": 422915107.3103448, "logps/chosen": -1544.6451612903227, "logps/rejected": -4019.310344827586, "loss": 0.0101, "rewards/chosen": 6.858165125693044, "rewards/margins": 35274222.582303055, "rewards/rejected": -35274215.72413793, "step": 1150 }, { "epoch": 1.443472829481574, "grad_norm": 0.04991288483142853, "kl": 13.316666603088379, "learning_rate": 1.0884826850642492e-06, "logits/chosen": -982462394.5762712, "logits/rejected": 519917500.852459, "logps/chosen": -1360.2372881355932, "logps/rejected": -4078.688524590164, "loss": 0.0068, "rewards/chosen": 6.708972478317002, "rewards/margins": 27566999.495857727, "rewards/rejected": -27566992.786885247, "step": 1155 }, { "epoch": 1.4497189256714553, "grad_norm": 0.0597245953977108, "kl": 0.0, "learning_rate": 1.0660589091223854e-06, "logits/chosen": -956790647.4666667, "logits/rejected": 440581051.73333335, "logps/chosen": -1363.0666666666666, "logps/rejected": -4116.8, "loss": 0.009, "rewards/chosen": 6.348196411132813, "rewards/margins": 36036477.81486308, "rewards/rejected": -36036471.46666667, "step": 1160 }, { "epoch": 1.4559650218613367, "grad_norm": 0.03371795266866684, "kl": 6.150000095367432, "learning_rate": 1.043805757830495e-06, "logits/chosen": -1058097247.0548524, "logits/rejected": 537000365.8271605, "logps/chosen": -1570.8354430379748, "logps/rejected": -4048.329218106996, "loss": 0.0037, "rewards/chosen": 7.199290343980749, "rewards/margins": 49041229.783652484, "rewards/rejected": -49041222.58436214, "step": 1165 }, { "epoch": 1.462211118051218, "grad_norm": 0.05939038470387459, "kl": 6.483333110809326, "learning_rate": 1.0217258790910447e-06, "logits/chosen": -901359395.9669422, "logits/rejected": 502541062.4537815, "logps/chosen": -1473.8512396694214, "logps/rejected": -3943.529411764706, "loss": 0.0118, "rewards/chosen": 6.762523304332387, "rewards/margins": 40793770.25832162, "rewards/rejected": -40793763.49579832, "step": 1170 }, { "epoch": 1.4684572142410994, "grad_norm": 0.1239381730556488, "kl": 10.458333015441895, "learning_rate": 9.99821900188798e-07, "logits/chosen": -941830127.6812749, "logits/rejected": 502061851.9475983, "logps/chosen": -1454.406374501992, "logps/rejected": -4060.5065502183406, "loss": 0.0094, "rewards/chosen": 6.164070281374502, "rewards/margins": 3046339.0898344736, "rewards/rejected": -3046332.9257641924, "step": 1175 }, { "epoch": 1.4747033104309806, "grad_norm": 0.04187585785984993, "kl": 5.474999904632568, "learning_rate": 9.780964274781984e-07, "logits/chosen": -1160314880.0, "logits/rejected": 799277056.0, "logps/chosen": -1435.3125, "logps/rejected": -4670.285714285715, "loss": 0.0069, "rewards/chosen": 7.046051502227783, "rewards/margins": 23172132.760337215, "rewards/rejected": -23172125.714285713, "step": 1180 }, { "epoch": 1.480949406620862, "grad_norm": 0.032712046056985855, "kl": 0.9333333373069763, "learning_rate": 9.56552046073238e-07, "logits/chosen": -1228912756.2620087, "logits/rejected": 683057445.7370518, "logps/chosen": -1331.8427947598254, "logps/rejected": -4543.235059760957, "loss": 0.0037, "rewards/chosen": 7.146233304619269, "rewards/margins": 35616020.40519745, "rewards/rejected": -35616013.25896414, "step": 1185 }, { "epoch": 1.4871955028107433, "grad_norm": 0.041652340441942215, "kl": 4.083333492279053, "learning_rate": 9.351913195398523e-07, "logits/chosen": -1104601120.8995984, "logits/rejected": 549363038.1991342, "logps/chosen": -1428.0481927710844, "logps/rejected": -4372.779220779221, "loss": 0.0059, "rewards/chosen": 7.262572261703062, "rewards/margins": 39940559.85131685, "rewards/rejected": -39940552.58874459, "step": 1190 }, { "epoch": 1.4934415990006247, "grad_norm": 0.0400969497859478, "kl": 0.0, "learning_rate": 9.140167895908867e-07, "logits/chosen": -1067380794.2369668, "logits/rejected": 536605844.4609665, "logps/chosen": -1447.9620853080569, "logps/rejected": -4313.457249070632, "loss": 0.0036, "rewards/chosen": 6.994773539321683, "rewards/margins": 27280765.953881346, "rewards/rejected": -27280758.959107805, "step": 1195 }, { "epoch": 1.4996876951905058, "grad_norm": 0.03562096133828163, "kl": 0.0, "learning_rate": 8.930309757836517e-07, "logits/chosen": -970590381.559322, "logits/rejected": 498589293.1147541, "logps/chosen": -1501.8305084745762, "logps/rejected": -4478.426229508197, "loss": 0.0041, "rewards/chosen": 7.16283196918035, "rewards/margins": 43964162.96611066, "rewards/rejected": -43964155.80327869, "step": 1200 }, { "epoch": 1.5059337913803872, "grad_norm": 0.04305117204785347, "kl": 2.2333333492279053, "learning_rate": 8.722363752201277e-07, "logits/chosen": -984962389.3333334, "logits/rejected": 612717909.3333334, "logps/chosen": -1455.8699186991869, "logps/rejected": -4411.076923076923, "loss": 0.009, "rewards/chosen": 6.036764005335366, "rewards/margins": 34317619.985481955, "rewards/rejected": -34317613.94871795, "step": 1205 }, { "epoch": 1.5121798875702686, "grad_norm": 0.04826142638921738, "kl": 0.0, "learning_rate": 8.516354622498279e-07, "logits/chosen": -991059172.9957806, "logits/rejected": 368671554.3703704, "logps/chosen": -1475.0379746835442, "logps/rejected": -4335.14403292181, "loss": 0.0046, "rewards/chosen": 6.493236171545359, "rewards/margins": 28039880.336857572, "rewards/rejected": -28039873.8436214, "step": 1210 }, { "epoch": 1.51842598376015, "grad_norm": 0.04798099398612976, "kl": 2.4666666984558105, "learning_rate": 8.31230688175382e-07, "logits/chosen": -1036176902.1195219, "logits/rejected": 545987570.5851529, "logps/chosen": -1544.03187250996, "logps/rejected": -4271.371179039302, "loss": 0.0041, "rewards/chosen": 6.812338050143177, "rewards/margins": 59293774.358189575, "rewards/rejected": -59293767.54585153, "step": 1215 }, { "epoch": 1.5246720799500313, "grad_norm": 0.04589550942182541, "kl": 9.600000381469727, "learning_rate": 8.110244809608494e-07, "logits/chosen": -1033476505.6, "logits/rejected": 549855778.1333333, "logps/chosen": -1523.4666666666667, "logps/rejected": -4341.6, "loss": 0.0038, "rewards/chosen": 7.062318929036459, "rewards/margins": 54005668.928985596, "rewards/rejected": -54005661.86666667, "step": 1220 }, { "epoch": 1.5309181761399127, "grad_norm": 0.035851314663887024, "kl": 0.0, "learning_rate": 7.910192449428216e-07, "logits/chosen": -989924222.4326531, "logits/rejected": 594846009.7361702, "logps/chosen": -1455.6734693877552, "logps/rejected": -4535.2851063829785, "loss": 0.0092, "rewards/chosen": 6.187053571428572, "rewards/margins": 61192067.46364931, "rewards/rejected": -61192061.27659574, "step": 1225 }, { "epoch": 1.537164272329794, "grad_norm": 0.053536731749773026, "kl": 0.0, "learning_rate": 7.712173605443269e-07, "logits/chosen": -1039319604.9655173, "logits/rejected": 441644989.9354839, "logps/chosen": -1326.2068965517242, "logps/rejected": -4275.612903225807, "loss": 0.0057, "rewards/chosen": 6.277499494881465, "rewards/margins": 36871186.664596274, "rewards/rejected": -36871180.38709678, "step": 1230 }, { "epoch": 1.5434103685196752, "grad_norm": 0.05226130411028862, "kl": 4.4666666984558105, "learning_rate": 7.516211839915821e-07, "logits/chosen": -1063635067.373494, "logits/rejected": 564147505.8701298, "logps/chosen": -1477.012048192771, "logps/rejected": -4608.554112554112, "loss": 0.0074, "rewards/chosen": 6.572622521335341, "rewards/margins": 44541413.32586928, "rewards/rejected": -44541406.753246754, "step": 1235 }, { "epoch": 1.5496564647095565, "grad_norm": 0.05410970002412796, "kl": 13.733333587646484, "learning_rate": 7.322330470336314e-07, "logits/chosen": -1117790940.0510638, "logits/rejected": 435368755.2, "logps/chosen": -1283.9489361702128, "logps/rejected": -4281.0775510204085, "loss": 0.0102, "rewards/chosen": 7.458873213098404, "rewards/margins": 6269761.450709948, "rewards/rejected": -6269753.991836735, "step": 1240 }, { "epoch": 1.555902560899438, "grad_norm": 0.038916345685720444, "kl": 1.7333333492279053, "learning_rate": 7.130552566648847e-07, "logits/chosen": -1104022652.878049, "logits/rejected": 367427303.93162394, "logps/chosen": -1533.4634146341464, "logps/rejected": -4443.076923076923, "loss": 0.0062, "rewards/chosen": 6.831621371633638, "rewards/margins": 49302145.3615359, "rewards/rejected": -49302138.52991453, "step": 1245 }, { "epoch": 1.562148657089319, "grad_norm": 0.057752642780542374, "kl": 4.333333492279053, "learning_rate": 6.940900948506113e-07, "logits/chosen": -1084643750.3501945, "logits/rejected": 301049461.0941704, "logps/chosen": -1523.4241245136186, "logps/rejected": -4271.354260089686, "loss": 0.0056, "rewards/chosen": 6.883037641354572, "rewards/margins": 76870272.56913629, "rewards/rejected": -76870265.68609865, "step": 1250 }, { "epoch": 1.5683947532792004, "grad_norm": 0.04232333227992058, "kl": 0.0, "learning_rate": 6.753398182554116e-07, "logits/chosen": -1142273104.1391304, "logits/rejected": 393610264.576, "logps/chosen": -1470.3304347826088, "logps/rejected": -4278.528, "loss": 0.0041, "rewards/chosen": 6.700403893512228, "rewards/margins": 32329095.724403895, "rewards/rejected": -32329089.024, "step": 1255 }, { "epoch": 1.5746408494690818, "grad_norm": 0.05231759324669838, "kl": 5.699999809265137, "learning_rate": 6.568066579746901e-07, "logits/chosen": -1081600296.9219332, "logits/rejected": 481572194.27488154, "logps/chosen": -1511.1375464684015, "logps/rejected": -4275.109004739336, "loss": 0.0077, "rewards/chosen": 6.582321677509293, "rewards/margins": 64165729.577582344, "rewards/rejected": -64165722.99526066, "step": 1260 }, { "epoch": 1.5808869456589631, "grad_norm": 0.05123298987746239, "kl": 16.875, "learning_rate": 6.384928192691844e-07, "logits/chosen": -1078488431.389313, "logits/rejected": 398170281.10091746, "logps/chosen": -1525.9236641221373, "logps/rejected": -4215.339449541284, "loss": 0.0097, "rewards/chosen": 7.378648131858301, "rewards/margins": 95379745.21351051, "rewards/rejected": -95379737.83486238, "step": 1265 }, { "epoch": 1.5871330418488445, "grad_norm": 0.08783072233200073, "kl": 9.783333778381348, "learning_rate": 6.204004813025569e-07, "logits/chosen": -984671368.2403433, "logits/rejected": 288396607.22267205, "logps/chosen": -1377.7854077253219, "logps/rejected": -4215.708502024291, "loss": 0.0092, "rewards/chosen": 6.247759774007511, "rewards/margins": 38891629.89148447, "rewards/rejected": -38891623.643724695, "step": 1270 }, { "epoch": 1.5933791380387259, "grad_norm": 0.05191274732351303, "kl": 0.375, "learning_rate": 6.025317968820954e-07, "logits/chosen": -1148081036.3179917, "logits/rejected": 496389787.0871369, "logps/chosen": -1394.878661087866, "logps/rejected": -4440.165975103734, "loss": 0.0043, "rewards/chosen": 7.052397277065899, "rewards/margins": 31868582.38849686, "rewards/rejected": -31868575.336099584, "step": 1275 }, { "epoch": 1.5996252342286073, "grad_norm": 0.07014898210763931, "kl": 14.300000190734863, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1002822994.6135458, "logits/rejected": 517804228.7510917, "logps/chosen": -1539.187250996016, "logps/rejected": -4211.423580786026, "loss": 0.0095, "rewards/chosen": 6.328193087026892, "rewards/margins": 36380255.760507494, "rewards/rejected": -36380249.43231441, "step": 1280 }, { "epoch": 1.6058713304184884, "grad_norm": 0.039311449974775314, "kl": 4.733333110809326, "learning_rate": 5.674738665931575e-07, "logits/chosen": -1064598241.28, "logits/rejected": 489178940.1043478, "logps/chosen": -1587.584, "logps/rejected": -4630.539130434782, "loss": 0.0038, "rewards/chosen": 6.928841796875, "rewards/margins": 36989088.5288418, "rewards/rejected": -36989081.6, "step": 1285 }, { "epoch": 1.6121174266083698, "grad_norm": 0.06494542211294174, "kl": 17.399999618530273, "learning_rate": 5.50288792267796e-07, "logits/chosen": -1104234254.6007605, "logits/rejected": 231532345.80645162, "logps/chosen": -1379.406844106464, "logps/rejected": -3870.672811059908, "loss": 0.0078, "rewards/chosen": 6.449863448342443, "rewards/margins": 20531010.615762066, "rewards/rejected": -20531004.165898617, "step": 1290 }, { "epoch": 1.6183635227982511, "grad_norm": 0.04059774428606033, "kl": 0.0, "learning_rate": 5.333357140784576e-07, "logits/chosen": -1077796317.8666666, "logits/rejected": 325858099.2, "logps/chosen": -1426.5333333333333, "logps/rejected": -4271.466666666666, "loss": 0.0045, "rewards/chosen": 6.975811258951823, "rewards/margins": 33188625.642477926, "rewards/rejected": -33188618.666666668, "step": 1295 }, { "epoch": 1.6246096189881323, "grad_norm": 0.05548970028758049, "kl": 0.0, "learning_rate": 5.166166492719124e-07, "logits/chosen": -1096140451.5850623, "logits/rejected": 495085816.50209206, "logps/chosen": -1395.3858921161825, "logps/rejected": -4268.18410041841, "loss": 0.0069, "rewards/chosen": 6.420809797231587, "rewards/margins": 19974116.14465917, "rewards/rejected": -19974109.72384937, "step": 1300 }, { "epoch": 1.6308557151780136, "grad_norm": 0.06512665003538132, "kl": 5.791666507720947, "learning_rate": 5.001335872496759e-07, "logits/chosen": -1032362445.4071146, "logits/rejected": 395955231.5770925, "logps/chosen": -1337.4229249011858, "logps/rejected": -4086.1321585903083, "loss": 0.0073, "rewards/chosen": 6.668516377686512, "rewards/margins": 77927011.1442873, "rewards/rejected": -77927004.47577092, "step": 1305 }, { "epoch": 1.637101811367895, "grad_norm": 0.046625133603811264, "kl": 0.0, "learning_rate": 4.838884893312934e-07, "logits/chosen": -972410863.2816327, "logits/rejected": 417935621.4468085, "logps/chosen": -1453.9755102040817, "logps/rejected": -4156.4595744680855, "loss": 0.0047, "rewards/chosen": 6.839940409757653, "rewards/margins": 22208490.516536154, "rewards/rejected": -22208483.676595744, "step": 1310 }, { "epoch": 1.6433479075577764, "grad_norm": 0.038477230817079544, "kl": 2.741666555404663, "learning_rate": 4.678832885209622e-07, "logits/chosen": -1002685379.7647059, "logits/rejected": 291196487.9338843, "logps/chosen": -1523.9663865546217, "logps/rejected": -4202.3140495867765, "loss": 0.0042, "rewards/chosen": 6.712801380317752, "rewards/margins": 19249440.101231135, "rewards/rejected": -19249433.388429753, "step": 1315 }, { "epoch": 1.6495940037476577, "grad_norm": 0.04708554968237877, "kl": 1.2333333492279053, "learning_rate": 4.5211988927752026e-07, "logits/chosen": -1077693790.4355555, "logits/rejected": 553812610.5098039, "logps/chosen": -1498.4533333333334, "logps/rejected": -4704.376470588235, "loss": 0.0043, "rewards/chosen": 6.698840060763889, "rewards/margins": 9807112.706683198, "rewards/rejected": -9807106.007843137, "step": 1320 }, { "epoch": 1.6558400999375391, "grad_norm": 0.04353487491607666, "kl": 9.600000381469727, "learning_rate": 4.366001672878406e-07, "logits/chosen": -1135099546.2707424, "logits/rejected": 389176266.96414346, "logps/chosen": -1425.467248908297, "logps/rejected": -4486.629482071713, "loss": 0.0059, "rewards/chosen": 6.637555651269105, "rewards/margins": 30159156.533969995, "rewards/rejected": -30159149.896414343, "step": 1325 }, { "epoch": 1.6620861961274205, "grad_norm": 0.03297581896185875, "kl": 1.3333333730697632, "learning_rate": 4.2132596924363666e-07, "logits/chosen": -1105496125.527897, "logits/rejected": 518404087.708502, "logps/chosen": -1485.3218884120172, "logps/rejected": -4637.538461538462, "loss": 0.0065, "rewards/chosen": 6.355323103876073, "rewards/margins": 54210093.73993849, "rewards/rejected": -54210087.384615384, "step": 1330 }, { "epoch": 1.6683322923173018, "grad_norm": 0.11054141819477081, "kl": 0.0, "learning_rate": 4.0629911262173053e-07, "logits/chosen": -1095594500.302521, "logits/rejected": 497796290.6446281, "logps/chosen": -1463.126050420168, "logps/rejected": -4335.074380165289, "loss": 0.0076, "rewards/chosen": 6.8984651966255255, "rewards/margins": 29223174.898465198, "rewards/rejected": -29223168.0, "step": 1335 }, { "epoch": 1.674578388507183, "grad_norm": 0.05203811824321747, "kl": 0.0, "learning_rate": 3.915213854677863e-07, "logits/chosen": -1098504994.816, "logits/rejected": 172379055.86086956, "logps/chosen": -1367.936, "logps/rejected": -4157.217391304348, "loss": 0.005, "rewards/chosen": 6.6947431640625, "rewards/margins": 6683829.99909099, "rewards/rejected": -6683823.304347826, "step": 1340 }, { "epoch": 1.6808244846970644, "grad_norm": 0.03993624076247215, "kl": 0.0, "learning_rate": 3.7699454618355306e-07, "logits/chosen": -946833671.4356847, "logits/rejected": 229857511.36401674, "logps/chosen": -1473.0622406639004, "logps/rejected": -4066.543933054393, "loss": 0.0066, "rewards/chosen": 6.581208161793309, "rewards/margins": 39973453.27576883, "rewards/rejected": -39973446.69456067, "step": 1345 }, { "epoch": 1.6870705808869455, "grad_norm": 0.04042872413992882, "kl": 5.699999809265137, "learning_rate": 3.627203233176341e-07, "logits/chosen": -964882025.5267175, "logits/rejected": 379531602.20183486, "logps/chosen": -1474.6870229007634, "logps/rejected": -4828.477064220184, "loss": 0.007, "rewards/chosen": 6.037196501520754, "rewards/margins": 20895698.367471732, "rewards/rejected": -20895692.33027523, "step": 1350 }, { "epoch": 1.6933166770768269, "grad_norm": 0.05504531040787697, "kl": 0.0, "learning_rate": 3.4870041535980283e-07, "logits/chosen": -1005325644.4675325, "logits/rejected": 160775336.61044177, "logps/chosen": -1396.2943722943724, "logps/rejected": -4198.29718875502, "loss": 0.0061, "rewards/chosen": 6.7330190154897185, "rewards/margins": 31472147.070368413, "rewards/rejected": -31472140.337349396, "step": 1355 }, { "epoch": 1.6995627732667082, "grad_norm": 0.2234458178281784, "kl": 30.433332443237305, "learning_rate": 3.3493649053890325e-07, "logits/chosen": -1036081575.4261603, "logits/rejected": 231148438.65020576, "logps/chosen": -1513.9915611814347, "logps/rejected": -3994.864197530864, "loss": 0.0134, "rewards/chosen": 6.843573847903481, "rewards/margins": -1633841.1235043413, "rewards/rejected": 1633847.9670781894, "step": 1360 }, { "epoch": 1.7058088694565896, "grad_norm": 0.044682055711746216, "kl": 0.0, "learning_rate": 3.214301866243469e-07, "logits/chosen": -1072538395.0042194, "logits/rejected": 448950187.7201646, "logps/chosen": -1468.759493670886, "logps/rejected": -4584.2962962962965, "loss": 0.0042, "rewards/chosen": 6.981564807489452, "rewards/margins": 8730029.960988674, "rewards/rejected": -8730022.979423868, "step": 1365 }, { "epoch": 1.712054965646471, "grad_norm": 0.04081505164504051, "kl": 0.0, "learning_rate": 3.081831107312308e-07, "logits/chosen": -993211187.2, "logits/rejected": 445330227.2, "logps/chosen": -1481.2521739130434, "logps/rejected": -4459.52, "loss": 0.0041, "rewards/chosen": 6.918339737601902, "rewards/margins": 13266534.150339738, "rewards/rejected": -13266527.232, "step": 1370 }, { "epoch": 1.7183010618363523, "grad_norm": 0.04136398062109947, "kl": 6.233333110809326, "learning_rate": 2.9519683912911267e-07, "logits/chosen": -1032674117.0086956, "logits/rejected": 353256865.792, "logps/chosen": -1485.0782608695652, "logps/rejected": -4357.376, "loss": 0.0072, "rewards/chosen": 7.032565174932065, "rewards/margins": 18321705.848565176, "rewards/rejected": -18321698.816, "step": 1375 }, { "epoch": 1.7245471580262337, "grad_norm": 0.037806980311870575, "kl": 0.0, "learning_rate": 2.8247291705444575e-07, "logits/chosen": -1073811729.0666666, "logits/rejected": 381803997.8666667, "logps/chosen": -1532.9333333333334, "logps/rejected": -4173.866666666667, "loss": 0.0033, "rewards/chosen": 6.961608378092448, "rewards/margins": 17027739.228275042, "rewards/rejected": -17027732.266666666, "step": 1380 }, { "epoch": 1.730793254216115, "grad_norm": 0.04734532907605171, "kl": 0.0, "learning_rate": 2.700128585267148e-07, "logits/chosen": -1064232157.7880185, "logits/rejected": 400121451.07224333, "logps/chosen": -1503.26267281106, "logps/rejected": -4154.6463878326995, "loss": 0.004, "rewards/chosen": 7.132465415286578, "rewards/margins": 24937706.904328533, "rewards/rejected": -24937699.771863118, "step": 1385 }, { "epoch": 1.7370393504059962, "grad_norm": 0.0439315065741539, "kl": 0.0, "learning_rate": 2.5781814616827936e-07, "logits/chosen": -1055812524.4978541, "logits/rejected": 363995965.14979756, "logps/chosen": -1543.4849785407725, "logps/rejected": -4515.757085020243, "loss": 0.0061, "rewards/chosen": 6.714240209227468, "rewards/margins": 19813420.0259811, "rewards/rejected": -19813413.31174089, "step": 1390 }, { "epoch": 1.7432854465958776, "grad_norm": 0.05658557638525963, "kl": 2.4833333492279053, "learning_rate": 2.458902310279601e-07, "logits/chosen": -1039975914.4873949, "logits/rejected": 141531762.2479339, "logps/chosen": -1433.142857142857, "logps/rejected": -4137.520661157025, "loss": 0.0053, "rewards/chosen": 6.4333219127494745, "rewards/margins": 29758231.358941745, "rewards/rejected": -29758224.925619833, "step": 1395 }, { "epoch": 1.749531542785759, "grad_norm": 0.18774373829364777, "kl": 13.333333015441895, "learning_rate": 2.3423053240837518e-07, "logits/chosen": -1019430049.2255319, "logits/rejected": 447735532.1469388, "logps/chosen": -1531.3021276595746, "logps/rejected": -4217.404081632653, "loss": 0.0092, "rewards/chosen": 6.913489029255319, "rewards/margins": 26482598.782876782, "rewards/rejected": -26482591.869387753, "step": 1400 }, { "epoch": 1.75577763897564, "grad_norm": 0.043808355927467346, "kl": 0.0, "learning_rate": 2.2284043769706026e-07, "logits/chosen": -1011416251.3702128, "logits/rejected": 144172780.14693877, "logps/chosen": -1641.531914893617, "logps/rejected": -4018.6775510204084, "loss": 0.0043, "rewards/chosen": 7.178429916057181, "rewards/margins": 31332219.162103385, "rewards/rejected": -31332211.98367347, "step": 1405 }, { "epoch": 1.7620237351655215, "grad_norm": 0.048849448561668396, "kl": 0.0, "learning_rate": 2.1172130220138227e-07, "logits/chosen": -1013900822.8016194, "logits/rejected": 300855805.8025751, "logps/chosen": -1627.076923076923, "logps/rejected": -4419.021459227468, "loss": 0.0047, "rewards/chosen": 7.054942513284413, "rewards/margins": 24462578.179406036, "rewards/rejected": -24462571.12446352, "step": 1410 }, { "epoch": 1.7682698313554028, "grad_norm": 0.049721091985702515, "kl": 28.33333396911621, "learning_rate": 2.0087444898726938e-07, "logits/chosen": -995293418.7509881, "logits/rejected": 524846932.5814978, "logps/chosen": -1456.5691699604743, "logps/rejected": -4499.101321585903, "loss": 0.0116, "rewards/chosen": 6.656613798480731, "rewards/margins": 43495953.93414684, "rewards/rejected": -43495947.27753304, "step": 1415 }, { "epoch": 1.7745159275452842, "grad_norm": 0.022943388670682907, "kl": 0.0, "learning_rate": 1.9030116872178317e-07, "logits/chosen": -1116324673.0847456, "logits/rejected": 419383128.13114756, "logps/chosen": -1456.2711864406779, "logps/rejected": -4149.639344262295, "loss": 0.0041, "rewards/chosen": 7.154229632878708, "rewards/margins": 12445927.678819798, "rewards/rejected": -12445920.524590164, "step": 1420 }, { "epoch": 1.7807620237351656, "grad_norm": 0.04606853052973747, "kl": 0.0, "learning_rate": 1.800027195195389e-07, "logits/chosen": -1112389339.4285715, "logits/rejected": 345240657.76425856, "logps/chosen": -1370.8387096774193, "logps/rejected": -4172.897338403041, "loss": 0.0042, "rewards/chosen": 6.945747902865784, "rewards/margins": 47055758.246128134, "rewards/rejected": -47055751.30038023, "step": 1425 }, { "epoch": 1.787008119925047, "grad_norm": 0.041725583374500275, "kl": 6.366666793823242, "learning_rate": 1.699803267930039e-07, "logits/chosen": -978041787.7333333, "logits/rejected": 322212113.06666666, "logps/chosen": -1522.6666666666667, "logps/rejected": -4063.733333333333, "loss": 0.0051, "rewards/chosen": 7.036024983723959, "rewards/margins": 30979183.56935832, "rewards/rejected": -30979176.533333335, "step": 1430 }, { "epoch": 1.7932542161149283, "grad_norm": 0.03576849400997162, "kl": 5.733333110809326, "learning_rate": 1.602351831066862e-07, "logits/chosen": -1131575479.1544716, "logits/rejected": 228284853.6068376, "logps/chosen": -1475.2520325203252, "logps/rejected": -3997.811965811966, "loss": 0.0049, "rewards/chosen": 6.8408907758511175, "rewards/margins": 40579949.67849761, "rewards/rejected": -40579942.83760684, "step": 1435 }, { "epoch": 1.7995003123048094, "grad_norm": 0.05014333873987198, "kl": 23.66666603088379, "learning_rate": 1.507684480352292e-07, "logits/chosen": -1127743488.0, "logits/rejected": 292159488.0, "logps/chosen": -1444.25, "logps/rejected": -4205.714285714285, "loss": 0.007, "rewards/chosen": 6.619314193725586, "rewards/margins": 43717551.762171336, "rewards/rejected": -43717545.14285714, "step": 1440 }, { "epoch": 1.8057464084946908, "grad_norm": 0.04119168221950531, "kl": 1.6333333253860474, "learning_rate": 1.4158124802543693e-07, "logits/chosen": -911179027.8721461, "logits/rejected": 418642963.61685824, "logps/chosen": -1645.7351598173516, "logps/rejected": -4242.145593869732, "loss": 0.0037, "rewards/chosen": 7.1187459867294525, "rewards/margins": -1617341.9080739366, "rewards/rejected": 1617349.0268199234, "step": 1445 }, { "epoch": 1.8119925046845722, "grad_norm": 0.04770239442586899, "kl": 0.0, "learning_rate": 1.3267467626223606e-07, "logits/chosen": -1013024280.3809524, "logits/rejected": 458046632.61044174, "logps/chosen": -1338.1818181818182, "logps/rejected": -4339.919678714859, "loss": 0.0055, "rewards/chosen": 6.514653721929113, "rewards/margins": 3903908.329914766, "rewards/rejected": -3903901.815261044, "step": 1450 }, { "epoch": 1.8182386008744533, "grad_norm": 0.05117342248558998, "kl": 2.0333333015441895, "learning_rate": 1.2404979253859722e-07, "logits/chosen": -1017058308.2139918, "logits/rejected": 364585893.26582277, "logps/chosen": -1433.9423868312758, "logps/rejected": -4159.729957805907, "loss": 0.0049, "rewards/chosen": 6.74083668901106, "rewards/margins": 12966103.213410528, "rewards/rejected": -12966096.47257384, "step": 1455 }, { "epoch": 1.8244846970643347, "grad_norm": 0.048183873295784, "kl": 11.966666221618652, "learning_rate": 1.1570762312943295e-07, "logits/chosen": -1041858429.4501992, "logits/rejected": 471474569.5021834, "logps/chosen": -1457.9123505976095, "logps/rejected": -4449.816593886463, "loss": 0.0062, "rewards/chosen": 6.686128532744024, "rewards/margins": 48752919.45468748, "rewards/rejected": -48752912.76855895, "step": 1460 }, { "epoch": 1.830730793254216, "grad_norm": 0.041397128254175186, "kl": 3.049999952316284, "learning_rate": 1.0764916066947795e-07, "logits/chosen": -1057709542.3011583, "logits/rejected": 425707621.9366516, "logps/chosen": -1529.6988416988418, "logps/rejected": -4262.226244343891, "loss": 0.0042, "rewards/chosen": 6.806276770632239, "rewards/margins": 84398012.67053016, "rewards/rejected": -84398005.86425339, "step": 1465 }, { "epoch": 1.8369768894440974, "grad_norm": 0.0291211549192667, "kl": 12.516666412353516, "learning_rate": 9.98753640351785e-08, "logits/chosen": -1084134188.5668015, "logits/rejected": 390043270.04291844, "logps/chosen": -1341.1497975708503, "logps/rejected": -4192.137339055794, "loss": 0.006, "rewards/chosen": 6.858956896824393, "rewards/margins": 62714258.39543758, "rewards/rejected": -62714251.53648069, "step": 1470 }, { "epoch": 1.8432229856339788, "grad_norm": 0.042660392820835114, "kl": 12.533333778381348, "learning_rate": 9.238715823059324e-08, "logits/chosen": -1036907895.0452675, "logits/rejected": 222360053.19831222, "logps/chosen": -1464.4938271604938, "logps/rejected": -4489.991561181435, "loss": 0.0065, "rewards/chosen": 7.0193659778485085, "rewards/margins": 30820391.424429268, "rewards/rejected": -30820384.40506329, "step": 1475 }, { "epoch": 1.8494690818238602, "grad_norm": 0.04971213638782501, "kl": 0.0, "learning_rate": 8.518543427732951e-08, "logits/chosen": -1037584030.8965517, "logits/rejected": 420588907.3548387, "logps/chosen": -1572.896551724138, "logps/rejected": -4336.774193548387, "loss": 0.0044, "rewards/chosen": 6.730742355872845, "rewards/margins": 21317710.988806874, "rewards/rejected": -21317704.258064516, "step": 1480 }, { "epoch": 1.8557151780137415, "grad_norm": 0.0393749363720417, "kl": 2.6666667461395264, "learning_rate": 7.827104910851729e-08, "logits/chosen": -1214985907.4188035, "logits/rejected": 577688650.9268292, "logps/chosen": -1419.3504273504273, "logps/rejected": -4508.09756097561, "loss": 0.007, "rewards/chosen": 6.878979283520299, "rewards/margins": 57327797.70824757, "rewards/rejected": -57327790.82926829, "step": 1485 }, { "epoch": 1.861961274203623, "grad_norm": 0.037422481924295425, "kl": 9.25, "learning_rate": 7.164482546684642e-08, "logits/chosen": -1083562633.3658535, "logits/rejected": 255969052.44444445, "logps/chosen": -1367.5447154471544, "logps/rejected": -4047.042735042735, "loss": 0.0078, "rewards/chosen": 6.497047486344004, "rewards/margins": 20068045.6081586, "rewards/rejected": -20068039.111111112, "step": 1490 }, { "epoch": 1.868207370393504, "grad_norm": 0.06087194010615349, "kl": 5.900000095367432, "learning_rate": 6.530755180666593e-08, "logits/chosen": -1080703324.4149377, "logits/rejected": 321298603.38075316, "logps/chosen": -1386.323651452282, "logps/rejected": -4462.058577405858, "loss": 0.0047, "rewards/chosen": 7.002343142181017, "rewards/margins": 37324824.140418455, "rewards/rejected": -37324817.138075314, "step": 1495 }, { "epoch": 1.8744534665833854, "grad_norm": 0.04000187665224075, "kl": 2.7166666984558105, "learning_rate": 5.92599822001666e-08, "logits/chosen": -1044922896.516129, "logits/rejected": 480415037.79310346, "logps/chosen": -1473.2903225806451, "logps/rejected": -4401.6551724137935, "loss": 0.0047, "rewards/chosen": 6.6074873401272685, "rewards/margins": 46696542.88334941, "rewards/rejected": -46696536.27586207, "step": 1500 }, { "epoch": 1.8744534665833854, "eval_kl": 5.369565010070801, "eval_logits/chosen": -1053226647.5203094, "eval_logits/rejected": 314457350.7232323, "eval_logps/chosen": -1451.8375241779497, "eval_logps/rejected": -4126.286868686869, "eval_loss": 0.006971114315092564, "eval_rewards/chosen": 6.690913246494198, "eval_rewards/margins": 34923553.58384254, "eval_rewards/rejected": -34923546.89292929, "eval_runtime": 640.7129, "eval_samples_per_second": 6.313, "eval_steps_per_second": 0.395, "step": 1500 }, { "epoch": 1.8806995627732666, "grad_norm": 0.051798831671476364, "kl": 1.024999976158142, "learning_rate": 5.3502836247654176e-08, "logits/chosen": -997878037.2401747, "logits/rejected": 345052523.09163344, "logps/chosen": -1443.7729257641922, "logps/rejected": -4034.03984063745, "loss": 0.0041, "rewards/chosen": 7.0013470379025655, "rewards/margins": 18439719.63879724, "rewards/rejected": -18439712.6374502, "step": 1505 }, { "epoch": 1.886945658963148, "grad_norm": 0.05178893357515335, "kl": 0.0, "learning_rate": 4.8036798991923925e-08, "logits/chosen": -992199120.3238866, "logits/rejected": 275653979.19313306, "logps/chosen": -1448.161943319838, "logps/rejected": -4049.304721030043, "loss": 0.0047, "rewards/chosen": 6.675408120097419, "rewards/margins": 31989076.683991812, "rewards/rejected": -31989070.00858369, "step": 1510 }, { "epoch": 1.8931917551530293, "grad_norm": 0.04293132573366165, "kl": 0.0, "learning_rate": 4.2862520836747245e-08, "logits/chosen": -1123284873.520661, "logits/rejected": 473070789.9159664, "logps/chosen": -1461.685950413223, "logps/rejected": -4277.243697478992, "loss": 0.0059, "rewards/chosen": 6.628247978273502, "rewards/margins": 25677056.17446647, "rewards/rejected": -25677049.54621849, "step": 1515 }, { "epoch": 1.8994378513429107, "grad_norm": 0.062386684119701385, "kl": 0.0, "learning_rate": 3.798061746947995e-08, "logits/chosen": -1092597633.1327434, "logits/rejected": 353865502.2362205, "logps/chosen": -1405.0265486725664, "logps/rejected": -4292.031496062992, "loss": 0.0068, "rewards/chosen": 6.833754277862279, "rewards/margins": 36038900.692021996, "rewards/rejected": -36038893.85826772, "step": 1520 }, { "epoch": 1.905683947532792, "grad_norm": 0.04458368569612503, "kl": 0.11666666716337204, "learning_rate": 3.339166978780256e-08, "logits/chosen": -973501483.0252101, "logits/rejected": 412649319.6694215, "logps/chosen": -1451.4285714285713, "logps/rejected": -4935.93388429752, "loss": 0.0048, "rewards/chosen": 6.3846789448201156, "rewards/margins": 18261066.087158285, "rewards/rejected": -18261059.70247934, "step": 1525 }, { "epoch": 1.9119300437226734, "grad_norm": 0.06264102458953857, "kl": 0.0, "learning_rate": 2.9096223830598347e-08, "logits/chosen": -1078418658.5486727, "logits/rejected": 434630623.7480315, "logps/chosen": -1453.6637168141592, "logps/rejected": -4444.220472440945, "loss": 0.0066, "rewards/chosen": 6.742341978360066, "rewards/margins": 28229193.262027018, "rewards/rejected": -28229186.51968504, "step": 1530 }, { "epoch": 1.9181761399125548, "grad_norm": 0.03891793265938759, "kl": 0.0, "learning_rate": 2.5094790712980322e-08, "logits/chosen": -994888908.8, "logits/rejected": 331507302.4, "logps/chosen": -1478.8, "logps/rejected": -4330.133333333333, "loss": 0.0071, "rewards/chosen": 7.01343739827474, "rewards/margins": 25696160.6134374, "rewards/rejected": -25696153.6, "step": 1535 }, { "epoch": 1.9244222361024361, "grad_norm": 0.04806411266326904, "kl": 0.0, "learning_rate": 2.1387846565474047e-08, "logits/chosen": -1012147693.037037, "logits/rejected": 443452322.90909094, "logps/chosen": -1441.037037037037, "logps/rejected": -4375.272727272727, "loss": 0.0062, "rewards/chosen": 7.009937427662037, "rewards/margins": 45192074.8887253, "rewards/rejected": -45192067.878787875, "step": 1540 }, { "epoch": 1.9306683322923173, "grad_norm": 0.06165820360183716, "kl": 0.3916666805744171, "learning_rate": 1.79758324773624e-08, "logits/chosen": -996874101.2908367, "logits/rejected": 430538895.09170306, "logps/chosen": -1505.5298804780878, "logps/rejected": -4623.930131004367, "loss": 0.0076, "rewards/chosen": 6.439177858876992, "rewards/margins": 33095169.96756214, "rewards/rejected": -33095163.52838428, "step": 1545 }, { "epoch": 1.9369144284821986, "grad_norm": 0.04440110921859741, "kl": 2.866666555404663, "learning_rate": 1.4859154444200885e-08, "logits/chosen": -1060492863.7253219, "logits/rejected": 399448022.5425101, "logps/chosen": -1412.4291845493563, "logps/rejected": -4322.202429149797, "loss": 0.0061, "rewards/chosen": 6.6896883592073495, "rewards/margins": 48499838.91640901, "rewards/rejected": -48499832.226720646, "step": 1550 }, { "epoch": 1.94316052467208, "grad_norm": 0.03537024185061455, "kl": 0.0, "learning_rate": 1.2038183319507957e-08, "logits/chosen": -1090112543.5770924, "logits/rejected": 381984217.5494071, "logps/chosen": -1472.704845814978, "logps/rejected": -4046.418972332016, "loss": 0.0036, "rewards/chosen": 6.901054214275881, "rewards/margins": 61204207.62832694, "rewards/rejected": -61204200.72727273, "step": 1555 }, { "epoch": 1.9494066208619611, "grad_norm": 0.03937402740120888, "kl": 19.266666412353516, "learning_rate": 9.513254770636138e-09, "logits/chosen": -958182165.8116592, "logits/rejected": 389183878.4747082, "logps/chosen": -1551.2825112107623, "logps/rejected": -4418.988326848249, "loss": 0.0108, "rewards/chosen": 7.0115835420753925, "rewards/margins": 5401249.408470702, "rewards/rejected": -5401242.39688716, "step": 1560 }, { "epoch": 1.9556527170518425, "grad_norm": 0.5586203336715698, "kl": 0.0, "learning_rate": 7.284669238833419e-09, "logits/chosen": -1119640467.7723577, "logits/rejected": 510185997.1282051, "logps/chosen": -1490.1463414634147, "logps/rejected": -4648.478632478633, "loss": 0.0051, "rewards/chosen": 6.604302134940295, "rewards/margins": 39599342.53592607, "rewards/rejected": -39599335.93162393, "step": 1565 }, { "epoch": 1.9618988132417239, "grad_norm": 0.05282744765281677, "kl": 14.333333015441895, "learning_rate": 5.352691903491303e-09, "logits/chosen": -963496561.5397489, "logits/rejected": 376038497.7261411, "logps/chosen": -1471.7991631799164, "logps/rejected": -4203.817427385892, "loss": 0.011, "rewards/chosen": 6.948775415141213, "rewards/margins": 33892983.54628579, "rewards/rejected": -33892976.597510375, "step": 1570 }, { "epoch": 1.9681449094316052, "grad_norm": 0.04685989022254944, "kl": 3.25, "learning_rate": 3.71755265059226e-09, "logits/chosen": -1014598612.9747899, "logits/rejected": 395408477.09090906, "logps/chosen": -1472.2689075630253, "logps/rejected": -4229.289256198347, "loss": 0.0064, "rewards/chosen": 6.425900242909663, "rewards/margins": 16834238.492015947, "rewards/rejected": -16834232.066115703, "step": 1575 }, { "epoch": 1.9743910056214866, "grad_norm": 0.05169759690761566, "kl": 0.0, "learning_rate": 2.3794460453555046e-09, "logits/chosen": -900699429.8434782, "logits/rejected": 464678551.552, "logps/chosen": -1558.2608695652175, "logps/rejected": -4444.928, "loss": 0.0044, "rewards/chosen": 6.816860033118206, "rewards/margins": 14998663.840860033, "rewards/rejected": -14998657.024, "step": 1580 }, { "epoch": 1.980637101811368, "grad_norm": 0.04657047241926193, "kl": 0.0, "learning_rate": 1.3385313090857888e-09, "logits/chosen": -1086119681.1377778, "logits/rejected": 389585048.59607846, "logps/chosen": -1381.7955555555557, "logps/rejected": -4238.054901960784, "loss": 0.0047, "rewards/chosen": 6.858923068576389, "rewards/margins": 15058023.235393656, "rewards/rejected": -15058016.376470588, "step": 1585 }, { "epoch": 1.9868831980012494, "grad_norm": 0.055049166083335876, "kl": 3.799999952316284, "learning_rate": 5.94932300227169e-10, "logits/chosen": -1062919879.3282443, "logits/rejected": 403480500.8440367, "logps/chosen": -1391.5725190839694, "logps/rejected": -4085.7247706422017, "loss": 0.0064, "rewards/chosen": 6.819431916447996, "rewards/margins": 56341567.18640439, "rewards/rejected": -56341560.366972476, "step": 1590 }, { "epoch": 1.9931292941911305, "grad_norm": 0.05392511561512947, "kl": 2.633333444595337, "learning_rate": 1.4873749962562855e-10, "logits/chosen": -1080439704.8062015, "logits/rejected": 458492217.6576577, "logps/chosen": -1620.3410852713178, "logps/rejected": -4351.711711711711, "loss": 0.0114, "rewards/chosen": 6.508843000545058, "rewards/margins": 68250289.17550968, "rewards/rejected": -68250282.66666667, "step": 1595 }, { "epoch": 1.9993753903810119, "grad_norm": 0.0583941750228405, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -1059895126.0786027, "logits/rejected": 462384417.6573705, "logps/chosen": -1498.8296943231442, "logps/rejected": -4708.462151394422, "loss": 0.0044, "rewards/chosen": 6.921540239492358, "rewards/margins": 25431611.957396813, "rewards/rejected": -25431605.035856575, "step": 1600 } ], "logging_steps": 5, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }