{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 144, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021052631578947368, "grad_norm": 1.224082589149475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 270973525.3333333, "logits/rejected": 308584228.5714286, "logps/chosen": -743.7762586805555, "logps/rejected": -327.42550223214283, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.042105263157894736, "grad_norm": 1.7011622190475464, "kl": 0.08848989009857178, "learning_rate": 4.999405067699773e-06, "logits/chosen": 283945024.0, "logits/rejected": 300706848.0, "logps/chosen": -670.29150390625, "logps/rejected": -340.8790283203125, "loss": 0.5072, "rewards/chosen": -0.04161600396037102, "rewards/margins": -0.051780181005597115, "rewards/rejected": 0.010164177045226097, "step": 2 }, { "epoch": 0.06315789473684211, "grad_norm": 1.2693746089935303, "kl": 0.17015418410301208, "learning_rate": 4.997620553954645e-06, "logits/chosen": 268529444.5714286, "logits/rejected": 305261397.3333333, "logps/chosen": -813.0807756696429, "logps/rejected": -319.21929253472223, "loss": 0.496, "rewards/chosen": 0.018497141344206675, "rewards/margins": 0.028634450974918547, "rewards/rejected": -0.010137309630711874, "step": 3 }, { "epoch": 0.08421052631578947, "grad_norm": 2.279048442840576, "kl": 0.06164896488189697, "learning_rate": 4.994647308096509e-06, "logits/chosen": 252110563.55555555, "logits/rejected": 306722450.28571427, "logps/chosen": -748.6314019097222, "logps/rejected": -325.29725864955356, "loss": 0.4976, "rewards/chosen": 0.011831367181407081, "rewards/margins": 0.011405953797437841, "rewards/rejected": 0.0004254133839692388, "step": 4 }, { "epoch": 0.10526315789473684, "grad_norm": 1.5578685998916626, "kl": 0.06295323371887207, "learning_rate": 4.990486745229364e-06, "logits/chosen": 292113728.0, "logits/rejected": 316883904.0, "logps/chosen": -815.0404663085938, "logps/rejected": -337.57080078125, "loss": 0.499, "rewards/chosen": 0.006664060987532139, "rewards/margins": 0.011198383755981922, "rewards/rejected": -0.004534322768449783, "step": 5 }, { "epoch": 0.12631578947368421, "grad_norm": 1.006475567817688, "kl": 0.049591064453125, "learning_rate": 4.985140845555799e-06, "logits/chosen": 306853741.71428573, "logits/rejected": 325227320.8888889, "logps/chosen": -479.3779994419643, "logps/rejected": -286.5568576388889, "loss": 0.4958, "rewards/chosen": 0.026198712842805044, "rewards/margins": 0.030564389649837737, "rewards/rejected": -0.004365676807032691, "step": 6 }, { "epoch": 0.14736842105263157, "grad_norm": 1.2592947483062744, "kl": 0.15429818630218506, "learning_rate": 4.978612153434527e-06, "logits/chosen": 305409763.5555556, "logits/rejected": 309714139.4285714, "logps/chosen": -609.1155056423611, "logps/rejected": -277.2419956752232, "loss": 0.4977, "rewards/chosen": 0.01816416945722368, "rewards/margins": 0.028014377705634586, "rewards/rejected": -0.009850208248410906, "step": 7 }, { "epoch": 0.16842105263157894, "grad_norm": 1.420854926109314, "kl": 0.20131784677505493, "learning_rate": 4.970903776169403e-06, "logits/chosen": 300502112.0, "logits/rejected": 337999712.0, "logps/chosen": -599.28125, "logps/rejected": -321.2285461425781, "loss": 0.5003, "rewards/chosen": 0.006110990885645151, "rewards/margins": 0.016330440063029528, "rewards/rejected": -0.010219449177384377, "step": 8 }, { "epoch": 0.18947368421052632, "grad_norm": 1.0294359922409058, "kl": 0.15416035056114197, "learning_rate": 4.962019382530521e-06, "logits/chosen": 280579858.28571427, "logits/rejected": 310000554.6666667, "logps/chosen": -656.5274135044643, "logps/rejected": -314.7470703125, "loss": 0.4943, "rewards/chosen": 0.05247082880565098, "rewards/margins": 0.061781181111222216, "rewards/rejected": -0.009310352305571238, "step": 9 }, { "epoch": 0.21052631578947367, "grad_norm": 1.8129311800003052, "kl": 0.1927608847618103, "learning_rate": 4.9519632010080765e-06, "logits/chosen": 238834005.33333334, "logits/rejected": 301916818.28571427, "logps/chosen": -759.1360134548611, "logps/rejected": -292.9808872767857, "loss": 0.5004, "rewards/chosen": 0.014881134033203125, "rewards/margins": 0.019827809184789658, "rewards/rejected": -0.004946675151586533, "step": 10 }, { "epoch": 0.23157894736842105, "grad_norm": 1.232249140739441, "kl": 0.23234114050865173, "learning_rate": 4.9407400177998335e-06, "logits/chosen": 286699296.0, "logits/rejected": 300436576.0, "logps/chosen": -627.421875, "logps/rejected": -282.529296875, "loss": 0.4981, "rewards/chosen": 0.03665875270962715, "rewards/margins": 0.0362735278904438, "rewards/rejected": 0.0003852248191833496, "step": 11 }, { "epoch": 0.25263157894736843, "grad_norm": 1.2517226934432983, "kl": 0.2269219160079956, "learning_rate": 4.928355174533153e-06, "logits/chosen": 283396937.14285713, "logits/rejected": 298269838.2222222, "logps/chosen": -700.8916015625, "logps/rejected": -346.30750868055554, "loss": 0.4953, "rewards/chosen": 0.052171528339385986, "rewards/margins": 0.05235843691560957, "rewards/rejected": -0.00018690857622358535, "step": 12 }, { "epoch": 0.2736842105263158, "grad_norm": 1.2330269813537598, "kl": 0.2519031763076782, "learning_rate": 4.914814565722671e-06, "logits/chosen": 302669795.5555556, "logits/rejected": 313485238.85714287, "logps/chosen": -654.2307942708334, "logps/rejected": -330.16427176339283, "loss": 0.4958, "rewards/chosen": 0.0363319648636712, "rewards/margins": 0.07227063652068849, "rewards/rejected": -0.0359386716570173, "step": 13 }, { "epoch": 0.29473684210526313, "grad_norm": 1.4144365787506104, "kl": 0.130226731300354, "learning_rate": 4.900124635964823e-06, "logits/chosen": 276869248.0, "logits/rejected": 289425024.0, "logps/chosen": -519.8916015625, "logps/rejected": -265.8385314941406, "loss": 0.496, "rewards/chosen": 0.03466583415865898, "rewards/margins": 0.0368356395047158, "rewards/rejected": -0.002169805346056819, "step": 14 }, { "epoch": 0.3157894736842105, "grad_norm": 1.305309772491455, "kl": 0.12101024389266968, "learning_rate": 4.884292376870567e-06, "logits/chosen": 293158875.4285714, "logits/rejected": 304345201.7777778, "logps/chosen": -500.9711216517857, "logps/rejected": -344.5473361545139, "loss": 0.4944, "rewards/chosen": 0.06058854716164725, "rewards/margins": 0.07498784883627815, "rewards/rejected": -0.014399301674630906, "step": 15 }, { "epoch": 0.3368421052631579, "grad_norm": 1.447916865348816, "kl": 0.1458943486213684, "learning_rate": 4.867325323737765e-06, "logits/chosen": 290241507.5555556, "logits/rejected": 316717494.85714287, "logps/chosen": -713.4215494791666, "logps/rejected": -295.7858189174107, "loss": 0.4897, "rewards/chosen": 0.08352628681394789, "rewards/margins": 0.09608901836096294, "rewards/rejected": -0.012562731547015054, "step": 16 }, { "epoch": 0.35789473684210527, "grad_norm": 1.3701616525650024, "kl": 0.3826329708099365, "learning_rate": 4.849231551964771e-06, "logits/chosen": 279923008.0, "logits/rejected": 295235712.0, "logps/chosen": -549.1046752929688, "logps/rejected": -335.35260009765625, "loss": 0.4927, "rewards/chosen": 0.06883127987384796, "rewards/margins": 0.08687522634863853, "rewards/rejected": -0.018043946474790573, "step": 17 }, { "epoch": 0.37894736842105264, "grad_norm": 1.467745065689087, "kl": 0.5873703956604004, "learning_rate": 4.830019673206997e-06, "logits/chosen": 278496658.28571427, "logits/rejected": 300650268.4444444, "logps/chosen": -629.6449497767857, "logps/rejected": -360.1111653645833, "loss": 0.4923, "rewards/chosen": 0.06821728178433009, "rewards/margins": 0.0920389105403234, "rewards/rejected": -0.023821628755993314, "step": 18 }, { "epoch": 0.4, "grad_norm": 1.9833446741104126, "kl": 0.7327308654785156, "learning_rate": 4.809698831278217e-06, "logits/chosen": 277803520.0, "logits/rejected": 308114651.4285714, "logps/chosen": -737.9443901909722, "logps/rejected": -338.15098353794644, "loss": 0.4907, "rewards/chosen": 0.10881086852815416, "rewards/margins": 0.15786849695538718, "rewards/rejected": -0.04905762842723301, "step": 19 }, { "epoch": 0.42105263157894735, "grad_norm": 1.154601812362671, "kl": 0.1896182894706726, "learning_rate": 4.788278697798619e-06, "logits/chosen": 283599232.0, "logits/rejected": 313346368.0, "logps/chosen": -707.3074951171875, "logps/rejected": -316.3603515625, "loss": 0.4943, "rewards/chosen": 0.04999881610274315, "rewards/margins": 0.0727224051952362, "rewards/rejected": -0.022723589092493057, "step": 20 }, { "epoch": 0.4421052631578947, "grad_norm": 1.574962854385376, "kl": 0.4762837886810303, "learning_rate": 4.765769467591626e-06, "logits/chosen": 287853293.71428573, "logits/rejected": 302228480.0, "logps/chosen": -572.1301618303571, "logps/rejected": -282.95513237847223, "loss": 0.488, "rewards/chosen": 0.12458467483520508, "rewards/margins": 0.15258528788884482, "rewards/rejected": -0.02800061305363973, "step": 21 }, { "epoch": 0.4631578947368421, "grad_norm": 1.5850838422775269, "kl": 0.5586809515953064, "learning_rate": 4.742181853831721e-06, "logits/chosen": 290416867.5555556, "logits/rejected": 309413156.5714286, "logps/chosen": -668.2986653645834, "logps/rejected": -324.6029575892857, "loss": 0.4884, "rewards/chosen": 0.09367326895395915, "rewards/margins": 0.1388323534102667, "rewards/rejected": -0.04515908445630755, "step": 22 }, { "epoch": 0.4842105263157895, "grad_norm": 1.4497779607772827, "kl": 0.5230355262756348, "learning_rate": 4.717527082945555e-06, "logits/chosen": 280489216.0, "logits/rejected": 309412736.0, "logps/chosen": -682.4186401367188, "logps/rejected": -332.3192138671875, "loss": 0.488, "rewards/chosen": 0.1085066944360733, "rewards/margins": 0.14131877198815346, "rewards/rejected": -0.032812077552080154, "step": 23 }, { "epoch": 0.5052631578947369, "grad_norm": 1.3657130002975464, "kl": 0.5021036863327026, "learning_rate": 4.69181688926877e-06, "logits/chosen": 239562038.85714287, "logits/rejected": 313432405.3333333, "logps/chosen": -761.9135044642857, "logps/rejected": -289.5192057291667, "loss": 0.4848, "rewards/chosen": 0.1538386004311698, "rewards/margins": 0.17894491955401406, "rewards/rejected": -0.02510631912284427, "step": 24 }, { "epoch": 0.5263157894736842, "grad_norm": 1.446842074394226, "kl": 0.6509883999824524, "learning_rate": 4.665063509461098e-06, "logits/chosen": 293611036.4444444, "logits/rejected": 316255341.71428573, "logps/chosen": -721.4010416666666, "logps/rejected": -285.1228724888393, "loss": 0.4881, "rewards/chosen": 0.12438484032948811, "rewards/margins": 0.1453892659573328, "rewards/rejected": -0.021004425627844676, "step": 25 }, { "epoch": 0.5473684210526316, "grad_norm": 1.2756037712097168, "kl": 0.4022580087184906, "learning_rate": 4.637279676682367e-06, "logits/chosen": 292368704.0, "logits/rejected": 317329792.0, "logps/chosen": -469.4183044433594, "logps/rejected": -302.54144287109375, "loss": 0.4966, "rewards/chosen": 0.07243937253952026, "rewards/margins": 0.10708872973918915, "rewards/rejected": -0.034649357199668884, "step": 26 }, { "epoch": 0.5684210526315789, "grad_norm": 1.2571344375610352, "kl": 1.006415605545044, "learning_rate": 4.608478614532215e-06, "logits/chosen": 252147291.42857143, "logits/rejected": 306243811.5555556, "logps/chosen": -751.1729910714286, "logps/rejected": -298.89708116319446, "loss": 0.4868, "rewards/chosen": 0.15643044880458287, "rewards/margins": 0.2008941164092412, "rewards/rejected": -0.044463667604658336, "step": 27 }, { "epoch": 0.5894736842105263, "grad_norm": 1.2348135709762573, "kl": 0.844231903553009, "learning_rate": 4.578674030756364e-06, "logits/chosen": 328972714.6666667, "logits/rejected": 334514980.5714286, "logps/chosen": -661.2038302951389, "logps/rejected": -348.25048828125, "loss": 0.4897, "rewards/chosen": 0.1231810384326511, "rewards/margins": 0.1448404531157206, "rewards/rejected": -0.021659414683069502, "step": 28 }, { "epoch": 0.6105263157894737, "grad_norm": 1.038001298904419, "kl": 0.5513710379600525, "learning_rate": 4.54788011072248e-06, "logits/chosen": 276993920.0, "logits/rejected": 302539648.0, "logps/chosen": -632.3786010742188, "logps/rejected": -318.97100830078125, "loss": 0.489, "rewards/chosen": 0.10169073939323425, "rewards/margins": 0.1353834606707096, "rewards/rejected": -0.03369272127747536, "step": 29 }, { "epoch": 0.631578947368421, "grad_norm": 1.5136151313781738, "kl": 0.7220326066017151, "learning_rate": 4.516111510668707e-06, "logits/chosen": 276634130.28571427, "logits/rejected": 300450048.0, "logps/chosen": -667.6506696428571, "logps/rejected": -341.21375868055554, "loss": 0.4831, "rewards/chosen": 0.2004882778440203, "rewards/margins": 0.24258499533411054, "rewards/rejected": -0.042096717490090266, "step": 30 }, { "epoch": 0.6526315789473685, "grad_norm": 1.0843844413757324, "kl": 0.6163355112075806, "learning_rate": 4.4833833507280884e-06, "logits/chosen": 307501454.2222222, "logits/rejected": 303213787.4285714, "logps/chosen": -486.1819661458333, "logps/rejected": -337.0064174107143, "loss": 0.4833, "rewards/chosen": 0.12439311875237359, "rewards/margins": 0.1898724100892506, "rewards/rejected": -0.06547929133687701, "step": 31 }, { "epoch": 0.6736842105263158, "grad_norm": 1.1725709438323975, "kl": 0.6453120708465576, "learning_rate": 4.4497112077322045e-06, "logits/chosen": 314310528.0, "logits/rejected": 323297440.0, "logps/chosen": -570.1083374023438, "logps/rejected": -277.752197265625, "loss": 0.4858, "rewards/chosen": 0.13651692867279053, "rewards/margins": 0.190122302621603, "rewards/rejected": -0.053605373948812485, "step": 32 }, { "epoch": 0.6947368421052632, "grad_norm": 1.653479814529419, "kl": 1.0490498542785645, "learning_rate": 4.415111107797445e-06, "logits/chosen": 311787922.28571427, "logits/rejected": 350929635.5555556, "logps/chosen": -676.2388392857143, "logps/rejected": -374.4116482204861, "loss": 0.4785, "rewards/chosen": 0.2800070898873465, "rewards/margins": 0.29183324911291636, "rewards/rejected": -0.011826159225569831, "step": 33 }, { "epoch": 0.7157894736842105, "grad_norm": 1.5196908712387085, "kl": 0.8070676922798157, "learning_rate": 4.379599518697444e-06, "logits/chosen": 278536049.7777778, "logits/rejected": 313191350.85714287, "logps/chosen": -602.8457573784722, "logps/rejected": -309.17257254464283, "loss": 0.475, "rewards/chosen": 0.19871669345431858, "rewards/margins": 0.28476871952177985, "rewards/rejected": -0.08605202606746129, "step": 34 }, { "epoch": 0.7368421052631579, "grad_norm": 1.0484706163406372, "kl": 0.807353138923645, "learning_rate": 4.34319334202531e-06, "logits/chosen": 299660160.0, "logits/rejected": 306065632.0, "logps/chosen": -465.7568359375, "logps/rejected": -229.71621704101562, "loss": 0.4919, "rewards/chosen": 0.11409495025873184, "rewards/margins": 0.16953209787607193, "rewards/rejected": -0.05543714761734009, "step": 35 }, { "epoch": 0.7578947368421053, "grad_norm": 1.3274704217910767, "kl": 0.8778089880943298, "learning_rate": 4.305909905149389e-06, "logits/chosen": 270111561.14285713, "logits/rejected": 306747079.1111111, "logps/chosen": -576.6547502790179, "logps/rejected": -307.5883517795139, "loss": 0.4756, "rewards/chosen": 0.23983100482395717, "rewards/margins": 0.28282778887521653, "rewards/rejected": -0.04299678405125936, "step": 36 }, { "epoch": 0.7789473684210526, "grad_norm": 1.6128290891647339, "kl": 1.20198392868042, "learning_rate": 4.267766952966369e-06, "logits/chosen": 279583146.6666667, "logits/rejected": 321376694.85714287, "logps/chosen": -716.5959201388889, "logps/rejected": -284.7782505580357, "loss": 0.4762, "rewards/chosen": 0.2411472267574734, "rewards/margins": 0.34859214321015375, "rewards/rejected": -0.10744491645268031, "step": 37 }, { "epoch": 0.8, "grad_norm": 1.4675477743148804, "kl": 0.7991436719894409, "learning_rate": 4.228782639455674e-06, "logits/chosen": 289123392.0, "logits/rejected": 312878304.0, "logps/chosen": -548.8831176757812, "logps/rejected": -298.0077819824219, "loss": 0.4854, "rewards/chosen": 0.1557559221982956, "rewards/margins": 0.20764141902327538, "rewards/rejected": -0.05188549682497978, "step": 38 }, { "epoch": 0.8210526315789474, "grad_norm": 1.5274215936660767, "kl": 1.3201950788497925, "learning_rate": 4.188975519039151e-06, "logits/chosen": 306054198.85714287, "logits/rejected": 318293873.7777778, "logps/chosen": -641.423828125, "logps/rejected": -323.07017686631946, "loss": 0.4675, "rewards/chosen": 0.2957319532121931, "rewards/margins": 0.35744103268971517, "rewards/rejected": -0.061709079477522105, "step": 39 }, { "epoch": 0.8421052631578947, "grad_norm": 1.8003302812576294, "kl": 1.431631326675415, "learning_rate": 4.1483645377501726e-06, "logits/chosen": 286558065.7777778, "logits/rejected": 281730944.0, "logps/chosen": -683.1265190972222, "logps/rejected": -394.296875, "loss": 0.4595, "rewards/chosen": 0.2565513981713189, "rewards/margins": 0.4815535848102872, "rewards/rejected": -0.22500218663896834, "step": 40 }, { "epoch": 0.8631578947368421, "grad_norm": 1.385386347770691, "kl": 1.2051702737808228, "learning_rate": 4.106969024216348e-06, "logits/chosen": 297838432.0, "logits/rejected": 317846336.0, "logps/chosen": -683.0381469726562, "logps/rejected": -333.70623779296875, "loss": 0.4735, "rewards/chosen": 0.30369484424591064, "rewards/margins": 0.38917434215545654, "rewards/rejected": -0.0854794979095459, "step": 41 }, { "epoch": 0.8842105263157894, "grad_norm": 2.4110379219055176, "kl": 1.4937351942062378, "learning_rate": 4.064808680460149e-06, "logits/chosen": 275613819.5862069, "logits/rejected": 329032821.0285714, "logps/chosen": -821.4407327586207, "logps/rejected": -363.7448660714286, "loss": 0.4399, "rewards/chosen": 0.4741830825805664, "rewards/margins": 0.6639985765729631, "rewards/rejected": -0.18981549399239675, "step": 42 }, { "epoch": 0.9052631578947369, "grad_norm": 1.4786880016326904, "kl": 1.3580402135849, "learning_rate": 4.021903572521802e-06, "logits/chosen": 292805532.9032258, "logits/rejected": 320448667.1515151, "logps/chosen": -615.5443548387096, "logps/rejected": -311.78767163825756, "loss": 0.4669, "rewards/chosen": 0.3574201829971806, "rewards/margins": 0.4636997006622344, "rewards/rejected": -0.10627951766505386, "step": 43 }, { "epoch": 0.9263157894736842, "grad_norm": 1.0773484706878662, "kl": 1.1364717483520508, "learning_rate": 3.978274120908957e-06, "logits/chosen": 298700241.45454544, "logits/rejected": 323973846.7096774, "logps/chosen": -521.9517045454545, "logps/rejected": -351.1326864919355, "loss": 0.4819, "rewards/chosen": 0.21399710395119406, "rewards/margins": 0.30352772575669273, "rewards/rejected": -0.08953062180549867, "step": 44 }, { "epoch": 0.9473684210526315, "grad_norm": 1.8579978942871094, "kl": 1.5150351524353027, "learning_rate": 3.933941090877615e-06, "logits/chosen": 250053461.33333334, "logits/rejected": 297370359.7419355, "logps/chosen": -674.4289180871212, "logps/rejected": -328.01861769153226, "loss": 0.4562, "rewards/chosen": 0.3749615929343484, "rewards/margins": 0.5518351999545726, "rewards/rejected": -0.17687360702022428, "step": 45 }, { "epoch": 0.968421052631579, "grad_norm": 1.442551612854004, "kl": 1.4570834636688232, "learning_rate": 3.888925582549006e-06, "logits/chosen": 296643956.3636364, "logits/rejected": 300621658.83870965, "logps/chosen": -726.8507339015151, "logps/rejected": -310.6574470766129, "loss": 0.4584, "rewards/chosen": 0.3162169022993608, "rewards/margins": 0.4777500119027504, "rewards/rejected": -0.16153310960338962, "step": 46 }, { "epoch": 0.9894736842105263, "grad_norm": 1.5954558849334717, "kl": 1.3480093479156494, "learning_rate": 3.8432490208670605e-06, "logits/chosen": 266226656.96969697, "logits/rejected": 327847011.0967742, "logps/chosen": -818.7182765151515, "logps/rejected": -304.5207283266129, "loss": 0.4383, "rewards/chosen": 0.4229244463371508, "rewards/margins": 0.5965663433541067, "rewards/rejected": -0.17364189701695595, "step": 47 }, { "epoch": 1.0, "grad_norm": 0.6833202838897705, "kl": 0.490053653717041, "learning_rate": 3.796933145401304e-06, "logits/chosen": 347558865.45454544, "logits/rejected": 323800656.84210527, "logps/chosen": -481.98979048295456, "logps/rejected": -379.3189761513158, "loss": 0.231, "rewards/chosen": 0.20139399441805753, "rewards/margins": 0.35350871200196476, "rewards/rejected": -0.15211471758390727, "step": 48 }, { "epoch": 1.0210526315789474, "grad_norm": 1.429196834564209, "kl": 2.1841373443603516, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 271206684.4444444, "logits/rejected": 308082724.5714286, "logps/chosen": -739.5323893229166, "logps/rejected": -329.6499720982143, "loss": 0.452, "rewards/chosen": 0.4243852562374539, "rewards/margins": 0.6468354274356176, "rewards/rejected": -0.2224501711981637, "step": 49 }, { "epoch": 1.0421052631578946, "grad_norm": 1.9531340599060059, "kl": 2.2712414264678955, "learning_rate": 3.7024719222984696e-06, "logits/chosen": 283599008.0, "logits/rejected": 300947008.0, "logps/chosen": -664.508056640625, "logps/rejected": -342.87835693359375, "loss": 0.4543, "rewards/chosen": 0.5367215871810913, "rewards/margins": 0.7264900505542755, "rewards/rejected": -0.1897684633731842, "step": 50 }, { "epoch": 1.063157894736842, "grad_norm": 1.3136494159698486, "kl": 1.6561638116836548, "learning_rate": 3.654371533087586e-06, "logits/chosen": 269414235.4285714, "logits/rejected": 305372387.5555556, "logps/chosen": -808.3594447544643, "logps/rejected": -321.4450412326389, "loss": 0.4417, "rewards/chosen": 0.4906304563794817, "rewards/margins": 0.7233401290954106, "rewards/rejected": -0.23270967271592882, "step": 51 }, { "epoch": 1.0842105263157895, "grad_norm": 2.3089241981506348, "kl": 3.5582315921783447, "learning_rate": 3.6057217255475034e-06, "logits/chosen": 253091982.2222222, "logits/rejected": 307139657.14285713, "logps/chosen": -741.5936957465278, "logps/rejected": -328.35609654017856, "loss": 0.4255, "rewards/chosen": 0.7155967818366157, "rewards/margins": 1.0210523643190899, "rewards/rejected": -0.3054555824824742, "step": 52 }, { "epoch": 1.1052631578947367, "grad_norm": 1.4406310319900513, "kl": 2.404527425765991, "learning_rate": 3.556545654351749e-06, "logits/chosen": 293073600.0, "logits/rejected": 317258272.0, "logps/chosen": -809.40478515625, "logps/rejected": -338.89068603515625, "loss": 0.4662, "rewards/chosen": 0.570232629776001, "rewards/margins": 0.706758126616478, "rewards/rejected": -0.136525496840477, "step": 53 }, { "epoch": 1.1263157894736842, "grad_norm": 0.8961158394813538, "kl": 0.712938666343689, "learning_rate": 3.5068667246468437e-06, "logits/chosen": 307262500.5714286, "logits/rejected": 325764835.5555556, "logps/chosen": -476.66078404017856, "logps/rejected": -288.21210394965277, "loss": 0.4594, "rewards/chosen": 0.2979165826525007, "rewards/margins": 0.4678047744054643, "rewards/rejected": -0.1698881917529636, "step": 54 }, { "epoch": 1.1473684210526316, "grad_norm": 1.1575278043746948, "kl": 1.8655436038970947, "learning_rate": 3.4567085809127247e-06, "logits/chosen": 305821866.6666667, "logits/rejected": 310424832.0, "logps/chosen": -604.8497721354166, "logps/rejected": -279.33816964285717, "loss": 0.4503, "rewards/chosen": 0.44473473230997723, "rewards/margins": 0.6642032237279983, "rewards/rejected": -0.21946849141802108, "step": 55 }, { "epoch": 1.168421052631579, "grad_norm": 1.4242761135101318, "kl": 2.379883289337158, "learning_rate": 3.406095095709254e-06, "logits/chosen": 300870336.0, "logits/rejected": 338083168.0, "logps/chosen": -594.4110107421875, "logps/rejected": -323.47467041015625, "loss": 0.4548, "rewards/chosen": 0.4931264817714691, "rewards/margins": 0.7279564440250397, "rewards/rejected": -0.23482996225357056, "step": 56 }, { "epoch": 1.1894736842105262, "grad_norm": 1.1814433336257935, "kl": 1.7233543395996094, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 281220224.0, "logits/rejected": 310704924.4444444, "logps/chosen": -652.3648158482143, "logps/rejected": -317.59087456597223, "loss": 0.4316, "rewards/chosen": 0.4687236377171108, "rewards/margins": 0.7624165973966084, "rewards/rejected": -0.2936929596794976, "step": 57 }, { "epoch": 1.2105263157894737, "grad_norm": 1.7414805889129639, "kl": 3.008741855621338, "learning_rate": 3.303598663257904e-06, "logits/chosen": 240169557.33333334, "logits/rejected": 302531145.14285713, "logps/chosen": -752.9740668402778, "logps/rejected": -296.3207310267857, "loss": 0.4323, "rewards/chosen": 0.6310732099745009, "rewards/margins": 0.9700051110888285, "rewards/rejected": -0.3389319011143276, "step": 58 }, { "epoch": 1.231578947368421, "grad_norm": 1.3317354917526245, "kl": 2.0062644481658936, "learning_rate": 3.2517644987606827e-06, "logits/chosen": 287379040.0, "logits/rejected": 301314176.0, "logps/chosen": -622.8872680664062, "logps/rejected": -286.7091064453125, "loss": 0.4326, "rewards/chosen": 0.49011722207069397, "rewards/margins": 0.9077150523662567, "rewards/rejected": -0.41759783029556274, "step": 59 }, { "epoch": 1.2526315789473683, "grad_norm": 1.2843542098999023, "kl": 1.9442932605743408, "learning_rate": 3.199572535077481e-06, "logits/chosen": 284359917.71428573, "logits/rejected": 299180487.1111111, "logps/chosen": -695.6732700892857, "logps/rejected": -348.7754177517361, "loss": 0.4422, "rewards/chosen": 0.5740087372916085, "rewards/margins": 0.8209883712586902, "rewards/rejected": -0.2469796339670817, "step": 60 }, { "epoch": 1.2736842105263158, "grad_norm": 1.2031817436218262, "kl": 2.3849363327026367, "learning_rate": 3.147047612756302e-06, "logits/chosen": 303765760.0, "logits/rejected": 314450102.85714287, "logps/chosen": -649.5403103298611, "logps/rejected": -333.43558175223217, "loss": 0.4408, "rewards/chosen": 0.5053782992892795, "rewards/margins": 0.8684476292322552, "rewards/rejected": -0.36306932994297575, "step": 61 }, { "epoch": 1.2947368421052632, "grad_norm": 1.274133563041687, "kl": 1.325626254081726, "learning_rate": 3.094214730815433e-06, "logits/chosen": 278172608.0, "logits/rejected": 290380704.0, "logps/chosen": -515.3540649414062, "logps/rejected": -267.65130615234375, "loss": 0.4527, "rewards/chosen": 0.4884217381477356, "rewards/margins": 0.6718673706054688, "rewards/rejected": -0.18344563245773315, "step": 62 }, { "epoch": 1.3157894736842106, "grad_norm": 1.1882749795913696, "kl": 1.1256272792816162, "learning_rate": 3.0410990348452572e-06, "logits/chosen": 294844745.14285713, "logits/rejected": 305883420.4444444, "logps/chosen": -496.18729073660717, "logps/rejected": -349.1206325954861, "loss": 0.4201, "rewards/chosen": 0.5389730930328369, "rewards/margins": 1.0107019477420383, "rewards/rejected": -0.4717288547092014, "step": 63 }, { "epoch": 1.3368421052631578, "grad_norm": 1.1736260652542114, "kl": 2.3068413734436035, "learning_rate": 2.9877258050403214e-06, "logits/chosen": 291604650.6666667, "logits/rejected": 318462098.28571427, "logps/chosen": -707.8668619791666, "logps/rejected": -299.36582728794644, "loss": 0.4267, "rewards/chosen": 0.638993157280816, "rewards/margins": 1.0095570049588642, "rewards/rejected": -0.3705638476780483, "step": 64 }, { "epoch": 1.3578947368421053, "grad_norm": 1.2752070426940918, "kl": 2.008157253265381, "learning_rate": 2.9341204441673267e-06, "logits/chosen": 281887584.0, "logits/rejected": 296646016.0, "logps/chosen": -544.8499145507812, "logps/rejected": -338.8497619628906, "loss": 0.4418, "rewards/chosen": 0.4943011403083801, "rewards/margins": 0.862060546875, "rewards/rejected": -0.3677594065666199, "step": 65 }, { "epoch": 1.3789473684210527, "grad_norm": 1.292263150215149, "kl": 2.7316248416900635, "learning_rate": 2.880308465474792e-06, "logits/chosen": 280081645.71428573, "logits/rejected": 302065777.7777778, "logps/chosen": -623.3849748883929, "logps/rejected": -363.41002061631946, "loss": 0.4141, "rewards/chosen": 0.6942225183759417, "rewards/margins": 1.0479256463429285, "rewards/rejected": -0.3537031279669868, "step": 66 }, { "epoch": 1.4, "grad_norm": 1.6679728031158447, "kl": 3.688748836517334, "learning_rate": 2.82631548055013e-06, "logits/chosen": 279772216.8888889, "logits/rejected": 309287862.85714287, "logps/chosen": -730.6381293402778, "logps/rejected": -341.89644949776783, "loss": 0.4255, "rewards/chosen": 0.8394391271803114, "rewards/margins": 1.263045250423371, "rewards/rejected": -0.4236061232430594, "step": 67 }, { "epoch": 1.4210526315789473, "grad_norm": 1.2063454389572144, "kl": 2.190333366394043, "learning_rate": 2.7721671871299115e-06, "logits/chosen": 284919776.0, "logits/rejected": 314707328.0, "logps/chosen": -701.7225952148438, "logps/rejected": -319.76239013671875, "loss": 0.4379, "rewards/chosen": 0.6084889769554138, "rewards/margins": 0.9714170694351196, "rewards/rejected": -0.3629280924797058, "step": 68 }, { "epoch": 1.4421052631578948, "grad_norm": 1.0976450443267822, "kl": 3.1681458950042725, "learning_rate": 2.717889356869146e-06, "logits/chosen": 288710180.5714286, "logits/rejected": 303286584.8888889, "logps/chosen": -564.9296177455357, "logps/rejected": -287.56187608506946, "loss": 0.41, "rewards/chosen": 0.8446392331804548, "rewards/margins": 1.333316681877015, "rewards/rejected": -0.4886774486965603, "step": 69 }, { "epoch": 1.4631578947368422, "grad_norm": 1.4227020740509033, "kl": 2.573333263397217, "learning_rate": 2.663507823075358e-06, "logits/chosen": 292599864.8888889, "logits/rejected": 310929920.0, "logps/chosen": -662.43212890625, "logps/rejected": -329.9785853794643, "loss": 0.4136, "rewards/chosen": 0.6803323957655165, "rewards/margins": 1.2630558770800393, "rewards/rejected": -0.5827234813145229, "step": 70 }, { "epoch": 1.4842105263157894, "grad_norm": 1.2122981548309326, "kl": 2.633822202682495, "learning_rate": 2.6090484684133406e-06, "logits/chosen": 282540576.0, "logits/rejected": 311218656.0, "logps/chosen": -676.41015625, "logps/rejected": -336.9644775390625, "loss": 0.4216, "rewards/chosen": 0.7093576192855835, "rewards/margins": 1.2067002058029175, "rewards/rejected": -0.497342586517334, "step": 71 }, { "epoch": 1.5052631578947369, "grad_norm": 1.1785283088684082, "kl": 2.1220672130584717, "learning_rate": 2.554537212586403e-06, "logits/chosen": 241519908.57142857, "logits/rejected": 314742869.3333333, "logps/chosen": -754.8909737723214, "logps/rejected": -293.48790147569446, "loss": 0.3964, "rewards/chosen": 0.8560841424124581, "rewards/margins": 1.278062933967227, "rewards/rejected": -0.42197879155476886, "step": 72 }, { "epoch": 1.526315789473684, "grad_norm": 1.2416644096374512, "kl": 2.510000467300415, "learning_rate": 2.5e-06, "logits/chosen": 295481742.2222222, "logits/rejected": 317371611.4285714, "logps/chosen": -715.8388671875, "logps/rejected": -289.58375767299106, "loss": 0.4226, "rewards/chosen": 0.680607000986735, "rewards/margins": 1.1476976417359852, "rewards/rejected": -0.46709064074925016, "step": 73 }, { "epoch": 1.5473684210526315, "grad_norm": 0.9503583908081055, "kl": 1.7678486108779907, "learning_rate": 2.4454627874135976e-06, "logits/chosen": 294551104.0, "logits/rejected": 318992832.0, "logps/chosen": -464.9443359375, "logps/rejected": -306.6573181152344, "loss": 0.4345, "rewards/chosen": 0.5198342204093933, "rewards/margins": 0.9660733342170715, "rewards/rejected": -0.4462391138076782, "step": 74 }, { "epoch": 1.568421052631579, "grad_norm": 1.0121111869812012, "kl": 3.102855920791626, "learning_rate": 2.3909515315866606e-06, "logits/chosen": 253119268.57142857, "logits/rejected": 307017955.5555556, "logps/chosen": -744.9547293526786, "logps/rejected": -303.27305772569446, "loss": 0.4184, "rewards/chosen": 0.7782654762268066, "rewards/margins": 1.2603272332085504, "rewards/rejected": -0.4820617569817437, "step": 75 }, { "epoch": 1.5894736842105264, "grad_norm": 1.0430024862289429, "kl": 2.240968942642212, "learning_rate": 2.3364921769246423e-06, "logits/chosen": 331319495.1111111, "logits/rejected": 336348635.4285714, "logps/chosen": -656.6678602430555, "logps/rejected": -351.60421316964283, "loss": 0.4397, "rewards/chosen": 0.5767775111728244, "rewards/margins": 0.9338111726064531, "rewards/rejected": -0.35703366143362864, "step": 76 }, { "epoch": 1.6105263157894738, "grad_norm": 1.1261143684387207, "kl": 1.5933310985565186, "learning_rate": 2.2821106431308546e-06, "logits/chosen": 278966432.0, "logits/rejected": 303633792.0, "logps/chosen": -628.973876953125, "logps/rejected": -323.4996337890625, "loss": 0.4327, "rewards/chosen": 0.4421617388725281, "rewards/margins": 0.9287151098251343, "rewards/rejected": -0.4865533709526062, "step": 77 }, { "epoch": 1.631578947368421, "grad_norm": 1.2804371118545532, "kl": 2.7089781761169434, "learning_rate": 2.2278328128700893e-06, "logits/chosen": 278770578.28571427, "logits/rejected": 302573937.7777778, "logps/chosen": -661.4776785714286, "logps/rejected": -346.33751085069446, "loss": 0.4066, "rewards/chosen": 0.8177859442574638, "rewards/margins": 1.3722575959705172, "rewards/rejected": -0.5544716517130533, "step": 78 }, { "epoch": 1.6526315789473685, "grad_norm": 0.9231442213058472, "kl": 2.3125405311584473, "learning_rate": 2.173684519449872e-06, "logits/chosen": 310171477.3333333, "logits/rejected": 305176356.5714286, "logps/chosen": -482.10584852430554, "logps/rejected": -341.7206333705357, "loss": 0.4199, "rewards/chosen": 0.5320029788547092, "rewards/margins": 1.0689090168665325, "rewards/rejected": -0.5369060380118233, "step": 79 }, { "epoch": 1.6736842105263157, "grad_norm": 0.8038806319236755, "kl": 2.2327325344085693, "learning_rate": 2.1196915345252085e-06, "logits/chosen": 316725248.0, "logits/rejected": 324820768.0, "logps/chosen": -565.4083862304688, "logps/rejected": -283.5719299316406, "loss": 0.4239, "rewards/chosen": 0.606515645980835, "rewards/margins": 1.2420920133590698, "rewards/rejected": -0.6355763673782349, "step": 80 }, { "epoch": 1.694736842105263, "grad_norm": 1.22795569896698, "kl": 3.6404929161071777, "learning_rate": 2.0658795558326745e-06, "logits/chosen": 314771090.28571427, "logits/rejected": 352914432.0, "logps/chosen": -669.0993303571429, "logps/rejected": -378.419921875, "loss": 0.4146, "rewards/chosen": 0.9939588138035366, "rewards/margins": 1.4066120480734203, "rewards/rejected": -0.4126532342698839, "step": 81 }, { "epoch": 1.7157894736842105, "grad_norm": 1.2053930759429932, "kl": 2.856393575668335, "learning_rate": 2.01227419495968e-06, "logits/chosen": 281342748.4444444, "logits/rejected": 315115154.28571427, "logps/chosen": -597.5861545138889, "logps/rejected": -316.50892857142856, "loss": 0.386, "rewards/chosen": 0.7246790991889106, "rewards/margins": 1.5443669425116644, "rewards/rejected": -0.8196878433227539, "step": 82 }, { "epoch": 1.736842105263158, "grad_norm": 0.8006544709205627, "kl": 2.2694008350372314, "learning_rate": 1.958900965154743e-06, "logits/chosen": 301532064.0, "logits/rejected": 307345312.0, "logps/chosen": -462.2938232421875, "logps/rejected": -235.0488739013672, "loss": 0.4387, "rewards/chosen": 0.4603995978832245, "rewards/margins": 1.0491024553775787, "rewards/rejected": -0.5887028574943542, "step": 83 }, { "epoch": 1.7578947368421054, "grad_norm": 1.1402519941329956, "kl": 1.9562792778015137, "learning_rate": 1.9057852691845677e-06, "logits/chosen": 272010203.4285714, "logits/rejected": 308657237.3333333, "logps/chosen": -570.7827497209821, "logps/rejected": -313.10308159722223, "loss": 0.3831, "rewards/chosen": 0.827030863080706, "rewards/margins": 1.4214982910761758, "rewards/rejected": -0.5944674279954698, "step": 84 }, { "epoch": 1.7789473684210526, "grad_norm": 1.1353209018707275, "kl": 3.1249067783355713, "learning_rate": 1.852952387243698e-06, "logits/chosen": 282185585.7777778, "logits/rejected": 323195904.0, "logps/chosen": -710.0297309027778, "logps/rejected": -292.21461704799106, "loss": 0.3872, "rewards/chosen": 0.8977683385213217, "rewards/margins": 1.7488488242739724, "rewards/rejected": -0.8510804857526507, "step": 85 }, { "epoch": 1.8, "grad_norm": 1.0260812044143677, "kl": 1.7017306089401245, "learning_rate": 1.8004274649225201e-06, "logits/chosen": 291513664.0, "logits/rejected": 314734592.0, "logps/chosen": -544.3753662109375, "logps/rejected": -303.4842529296875, "loss": 0.4086, "rewards/chosen": 0.6065320372581482, "rewards/margins": 1.2060632109642029, "rewards/rejected": -0.5995311737060547, "step": 86 }, { "epoch": 1.8210526315789473, "grad_norm": 1.1943074464797974, "kl": 3.096353769302368, "learning_rate": 1.7482355012393177e-06, "logits/chosen": 308626578.28571427, "logits/rejected": 320611356.4444444, "logps/chosen": -635.6044921875, "logps/rejected": -328.5426974826389, "loss": 0.3767, "rewards/chosen": 0.8776694025312152, "rewards/margins": 1.486631302606492, "rewards/rejected": -0.6089619000752767, "step": 87 }, { "epoch": 1.8421052631578947, "grad_norm": 1.3380628824234009, "kl": 2.669947385787964, "learning_rate": 1.6964013367420967e-06, "logits/chosen": 289058076.4444444, "logits/rejected": 283605193.14285713, "logps/chosen": -677.7171223958334, "logps/rejected": -403.6229771205357, "loss": 0.3675, "rewards/chosen": 0.7974962128533257, "rewards/margins": 1.9551072423420255, "rewards/rejected": -1.1576110294886999, "step": 88 }, { "epoch": 1.8631578947368421, "grad_norm": 1.051780343055725, "kl": 2.4696009159088135, "learning_rate": 1.6449496416858285e-06, "logits/chosen": 300929024.0, "logits/rejected": 320133600.0, "logps/chosen": -677.1189575195312, "logps/rejected": -339.54608154296875, "loss": 0.4024, "rewards/chosen": 0.8956129550933838, "rewards/margins": 1.5650758743286133, "rewards/rejected": -0.6694629192352295, "step": 89 }, { "epoch": 1.8842105263157896, "grad_norm": 1.425748348236084, "kl": 2.594327926635742, "learning_rate": 1.5939049042907463e-06, "logits/chosen": 279002712.2758621, "logits/rejected": 331318096.45714283, "logps/chosen": -812.5680226293103, "logps/rejected": -373.0890345982143, "loss": 0.3106, "rewards/chosen": 1.3614521684317753, "rewards/margins": 2.4856834524370766, "rewards/rejected": -1.1242312840053013, "step": 90 }, { "epoch": 1.905263157894737, "grad_norm": 0.9643742442131042, "kl": 2.9081778526306152, "learning_rate": 1.5432914190872757e-06, "logits/chosen": 295409465.8064516, "logits/rejected": 322480624.4848485, "logps/chosen": -609.1846018145161, "logps/rejected": -318.19146543560606, "loss": 0.381, "rewards/chosen": 0.993392082952684, "rewards/margins": 1.7400533558686109, "rewards/rejected": -0.7466612729159269, "step": 91 }, { "epoch": 1.9263157894736842, "grad_norm": 0.9330692291259766, "kl": 2.2181286811828613, "learning_rate": 1.4931332753531575e-06, "logits/chosen": 301478136.24242425, "logits/rejected": 325849946.83870965, "logps/chosen": -518.4118726325758, "logps/rejected": -356.10846144153226, "loss": 0.4365, "rewards/chosen": 0.5679830493349017, "rewards/margins": 1.1550928304155783, "rewards/rejected": -0.5871097810806767, "step": 92 }, { "epoch": 1.9473684210526314, "grad_norm": 1.2292976379394531, "kl": 3.2448878288269043, "learning_rate": 1.443454345648252e-06, "logits/chosen": 252509028.84848484, "logits/rejected": 298726697.2903226, "logps/chosen": -668.6770833333334, "logps/rejected": -336.6697013608871, "loss": 0.3723, "rewards/chosen": 0.950144623265122, "rewards/margins": 1.9921284430420876, "rewards/rejected": -1.0419838197769657, "step": 93 }, { "epoch": 1.9684210526315788, "grad_norm": 1.0690096616744995, "kl": 2.7862634658813477, "learning_rate": 1.3942782744524974e-06, "logits/chosen": 298993601.93939394, "logits/rejected": 302201723.87096775, "logps/chosen": -721.709990530303, "logps/rejected": -317.60660282258067, "loss": 0.3709, "rewards/chosen": 0.8302892049153646, "rewards/margins": 1.68674196222777, "rewards/rejected": -0.8564527573124054, "step": 94 }, { "epoch": 1.9894736842105263, "grad_norm": 0.9422377943992615, "kl": 2.6181588172912598, "learning_rate": 1.3456284669124159e-06, "logits/chosen": 269021277.09090906, "logits/rejected": 329800010.32258064, "logps/chosen": -812.2279829545455, "logps/rejected": -312.94632056451616, "loss": 0.3429, "rewards/chosen": 1.0719468087861033, "rewards/margins": 2.088143951848572, "rewards/rejected": -1.0161971430624686, "step": 95 }, { "epoch": 2.0, "grad_norm": 0.6018558144569397, "kl": 0.3343966007232666, "learning_rate": 1.2975280777015315e-06, "logits/chosen": 350389713.45454544, "logits/rejected": 325354549.8947368, "logps/chosen": -479.17649147727275, "logps/rejected": -385.98116262335526, "loss": 0.186, "rewards/chosen": 0.4827239296653054, "rewards/margins": 1.3010570535249117, "rewards/rejected": -0.8183331238596063, "step": 96 }, { "epoch": 2.0210526315789474, "grad_norm": 0.9914832711219788, "kl": 4.49602746963501, "learning_rate": 1.2500000000000007e-06, "logits/chosen": 273368320.0, "logits/rejected": 309601865.14285713, "logps/chosen": -734.2335069444445, "logps/rejected": -337.99769810267856, "loss": 0.3757, "rewards/chosen": 0.9542705747816298, "rewards/margins": 2.011491783081539, "rewards/rejected": -1.0572212082999093, "step": 97 }, { "epoch": 2.042105263157895, "grad_norm": 1.185113787651062, "kl": 4.397549629211426, "learning_rate": 1.203066854598696e-06, "logits/chosen": 285693728.0, "logits/rejected": 302816000.0, "logps/chosen": -658.1407470703125, "logps/rejected": -351.9900207519531, "loss": 0.3642, "rewards/chosen": 1.173454761505127, "rewards/margins": 2.274389863014221, "rewards/rejected": -1.1009351015090942, "step": 98 }, { "epoch": 2.0631578947368423, "grad_norm": 0.975797176361084, "kl": 2.5340423583984375, "learning_rate": 1.1567509791329402e-06, "logits/chosen": 272200941.71428573, "logits/rejected": 307242979.5555556, "logps/chosen": -803.5295061383929, "logps/rejected": -330.32402886284723, "loss": 0.3608, "rewards/chosen": 0.9736276354108538, "rewards/margins": 2.0942357532561773, "rewards/rejected": -1.1206081178453233, "step": 99 }, { "epoch": 2.0842105263157893, "grad_norm": 1.179176688194275, "kl": 6.256747722625732, "learning_rate": 1.1110744174509952e-06, "logits/chosen": 255072796.44444445, "logits/rejected": 308663661.71428573, "logps/chosen": -734.2338324652778, "logps/rejected": -339.58851841517856, "loss": 0.3406, "rewards/chosen": 1.4515838623046875, "rewards/margins": 2.880281448364258, "rewards/rejected": -1.4286975860595703, "step": 100 }, { "epoch": 2.1052631578947367, "grad_norm": 1.2810099124908447, "kl": 3.9927051067352295, "learning_rate": 1.0660589091223854e-06, "logits/chosen": 296453824.0, "logits/rejected": 319576800.0, "logps/chosen": -804.0701904296875, "logps/rejected": -344.6459655761719, "loss": 0.4024, "rewards/chosen": 1.1036909818649292, "rewards/margins": 1.8157426118850708, "rewards/rejected": -0.7120516300201416, "step": 101 }, { "epoch": 2.126315789473684, "grad_norm": 0.7138159871101379, "kl": 1.1326825618743896, "learning_rate": 1.0217258790910447e-06, "logits/chosen": 309144685.71428573, "logits/rejected": 327665123.5555556, "logps/chosen": -474.47987583705356, "logps/rejected": -293.0481228298611, "loss": 0.4138, "rewards/chosen": 0.5160130092075893, "rewards/margins": 1.1695069434150818, "rewards/rejected": -0.6534939342074924, "step": 102 }, { "epoch": 2.1473684210526316, "grad_norm": 0.8066704869270325, "kl": 2.314944267272949, "learning_rate": 9.780964274781984e-07, "logits/chosen": 307828736.0, "logits/rejected": 312215113.14285713, "logps/chosen": -601.6102973090278, "logps/rejected": -286.4847935267857, "loss": 0.3993, "rewards/chosen": 0.7686821089850532, "rewards/margins": 1.7028116877116855, "rewards/rejected": -0.9341295787266323, "step": 103 }, { "epoch": 2.168421052631579, "grad_norm": 0.9273717403411865, "kl": 3.7996773719787598, "learning_rate": 9.351913195398523e-07, "logits/chosen": 303605472.0, "logits/rejected": 339657600.0, "logps/chosen": -590.1983642578125, "logps/rejected": -330.32666015625, "loss": 0.3999, "rewards/chosen": 0.9143981337547302, "rewards/margins": 1.8344270586967468, "rewards/rejected": -0.9200289249420166, "step": 104 }, { "epoch": 2.1894736842105265, "grad_norm": 0.8671090006828308, "kl": 2.5950183868408203, "learning_rate": 8.930309757836517e-07, "logits/chosen": 283258258.28571427, "logits/rejected": 312255089.7777778, "logps/chosen": -648.32666015625, "logps/rejected": -325.6108127170139, "loss": 0.3448, "rewards/chosen": 0.8725461278642926, "rewards/margins": 1.9682320867265974, "rewards/rejected": -1.0956859588623047, "step": 105 }, { "epoch": 2.2105263157894735, "grad_norm": 1.1705127954483032, "kl": 4.775852203369141, "learning_rate": 8.516354622498279e-07, "logits/chosen": 242087224.8888889, "logits/rejected": 304067876.5714286, "logps/chosen": -748.5776909722222, "logps/rejected": -303.85107421875, "loss": 0.3737, "rewards/chosen": 1.0707118776109483, "rewards/margins": 2.1626787109980508, "rewards/rejected": -1.0919668333871024, "step": 106 }, { "epoch": 2.231578947368421, "grad_norm": 0.8211517930030823, "kl": 2.75072979927063, "learning_rate": 8.110244809608494e-07, "logits/chosen": 289833088.0, "logits/rejected": 302552512.0, "logps/chosen": -619.5748291015625, "logps/rejected": -296.18695068359375, "loss": 0.3668, "rewards/chosen": 0.8213632702827454, "rewards/margins": 2.1867424845695496, "rewards/rejected": -1.3653792142868042, "step": 107 }, { "epoch": 2.2526315789473683, "grad_norm": 1.093108892440796, "kl": 3.059412956237793, "learning_rate": 7.712173605443269e-07, "logits/chosen": 285983762.28571427, "logits/rejected": 300334563.5555556, "logps/chosen": -692.2897600446429, "logps/rejected": -355.3886447482639, "loss": 0.376, "rewards/chosen": 0.9123561041695731, "rewards/margins": 1.820660742502364, "rewards/rejected": -0.9083046383327908, "step": 108 }, { "epoch": 2.2736842105263158, "grad_norm": 0.8700193762779236, "kl": 3.397562026977539, "learning_rate": 7.322330470336314e-07, "logits/chosen": 306071779.5555556, "logits/rejected": 316075373.71428573, "logps/chosen": -645.9636501736111, "logps/rejected": -340.6162806919643, "loss": 0.3865, "rewards/chosen": 0.863048235575358, "rewards/margins": 1.944187255132766, "rewards/rejected": -1.081139019557408, "step": 109 }, { "epoch": 2.294736842105263, "grad_norm": 0.9735374450683594, "kl": 1.8307335376739502, "learning_rate": 6.940900948506113e-07, "logits/chosen": 280107104.0, "logits/rejected": 291821312.0, "logps/chosen": -512.7600708007812, "logps/rejected": -272.31732177734375, "loss": 0.4089, "rewards/chosen": 0.7478194236755371, "rewards/margins": 1.3978694677352905, "rewards/rejected": -0.6500500440597534, "step": 110 }, { "epoch": 2.3157894736842106, "grad_norm": 0.8717783689498901, "kl": 1.6515731811523438, "learning_rate": 6.568066579746901e-07, "logits/chosen": 297068635.4285714, "logits/rejected": 307091399.1111111, "logps/chosen": -493.14170619419644, "logps/rejected": -356.8146701388889, "loss": 0.3656, "rewards/chosen": 0.8435340608869281, "rewards/margins": 2.0846710356455, "rewards/rejected": -1.241136974758572, "step": 111 }, { "epoch": 2.336842105263158, "grad_norm": 0.8384011387825012, "kl": 2.806180715560913, "learning_rate": 6.204004813025569e-07, "logits/chosen": 293585009.7777778, "logits/rejected": 319276032.0, "logps/chosen": -704.3138020833334, "logps/rejected": -306.95455496651783, "loss": 0.374, "rewards/chosen": 0.99429808722602, "rewards/margins": 2.1237333539932495, "rewards/rejected": -1.1294352667672294, "step": 112 }, { "epoch": 2.3578947368421055, "grad_norm": 1.0061078071594238, "kl": 2.242816686630249, "learning_rate": 5.848888922025553e-07, "logits/chosen": 284455776.0, "logits/rejected": 298613120.0, "logps/chosen": -542.8225708007812, "logps/rejected": -345.3612060546875, "loss": 0.3959, "rewards/chosen": 0.6970374584197998, "rewards/margins": 1.7159442901611328, "rewards/rejected": -1.018906831741333, "step": 113 }, { "epoch": 2.3789473684210525, "grad_norm": 1.0236337184906006, "kl": 3.477278232574463, "learning_rate": 5.50288792267796e-07, "logits/chosen": 281507035.4285714, "logits/rejected": 303031580.4444444, "logps/chosen": -620.07666015625, "logps/rejected": -369.61480034722223, "loss": 0.3544, "rewards/chosen": 1.025052615574428, "rewards/margins": 1.9992341086978005, "rewards/rejected": -0.9741814931233724, "step": 114 }, { "epoch": 2.4, "grad_norm": 1.2651081085205078, "kl": 4.5784382820129395, "learning_rate": 5.166166492719124e-07, "logits/chosen": 281606286.2222222, "logits/rejected": 310555867.4285714, "logps/chosen": -726.4253472222222, "logps/rejected": -348.73287527901783, "loss": 0.3689, "rewards/chosen": 1.2607183456420898, "rewards/margins": 2.3679630415780206, "rewards/rejected": -1.1072446959359306, "step": 115 }, { "epoch": 2.4210526315789473, "grad_norm": 0.9246846437454224, "kl": 2.923830032348633, "learning_rate": 4.838884893312934e-07, "logits/chosen": 286494016.0, "logits/rejected": 315544192.0, "logps/chosen": -698.613525390625, "logps/rejected": -325.4237365722656, "loss": 0.3879, "rewards/chosen": 0.9193922877311707, "rewards/margins": 1.8484562039375305, "rewards/rejected": -0.9290639162063599, "step": 116 }, { "epoch": 2.442105263157895, "grad_norm": 0.7015478014945984, "kl": 4.306000232696533, "learning_rate": 4.5211988927752026e-07, "logits/chosen": 291081782.85714287, "logits/rejected": 304304839.1111111, "logps/chosen": -561.2941545758929, "logps/rejected": -294.81694878472223, "loss": 0.3653, "rewards/chosen": 1.208183969770159, "rewards/margins": 2.422367989070832, "rewards/rejected": -1.2141840193006728, "step": 117 }, { "epoch": 2.463157894736842, "grad_norm": 1.10671865940094, "kl": 3.112697124481201, "learning_rate": 4.2132596924363666e-07, "logits/chosen": 294296291.5555556, "logits/rejected": 311906011.4285714, "logps/chosen": -659.6044921875, "logps/rejected": -337.13692801339283, "loss": 0.3648, "rewards/chosen": 0.9630860222710503, "rewards/margins": 2.2616418051341225, "rewards/rejected": -1.298555782863072, "step": 118 }, { "epoch": 2.4842105263157896, "grad_norm": 0.9874948859214783, "kl": 3.2510499954223633, "learning_rate": 3.915213854677863e-07, "logits/chosen": 284307872.0, "logits/rejected": 312279616.0, "logps/chosen": -673.400634765625, "logps/rejected": -343.1709289550781, "loss": 0.3752, "rewards/chosen": 1.0103015899658203, "rewards/margins": 2.128287434577942, "rewards/rejected": -1.1179858446121216, "step": 119 }, { "epoch": 2.5052631578947366, "grad_norm": 0.9072942733764648, "kl": 2.408541202545166, "learning_rate": 3.627203233176341e-07, "logits/chosen": 242695186.2857143, "logits/rejected": 315699996.4444444, "logps/chosen": -751.4161551339286, "logps/rejected": -298.6019694010417, "loss": 0.3405, "rewards/chosen": 1.203570774623326, "rewards/margins": 2.136957047477601, "rewards/rejected": -0.9333862728542752, "step": 120 }, { "epoch": 2.526315789473684, "grad_norm": 1.0498640537261963, "kl": 2.8332810401916504, "learning_rate": 3.3493649053890325e-07, "logits/chosen": 296178346.6666667, "logits/rejected": 317656722.28571427, "logps/chosen": -713.7129991319445, "logps/rejected": -295.0296107700893, "loss": 0.3833, "rewards/chosen": 0.8931880527072482, "rewards/margins": 1.9048690644521562, "rewards/rejected": -1.011681011744908, "step": 121 }, { "epoch": 2.5473684210526315, "grad_norm": 0.7236945033073425, "kl": 2.3001227378845215, "learning_rate": 3.081831107312308e-07, "logits/chosen": 295994208.0, "logits/rejected": 320040224.0, "logps/chosen": -463.0760192871094, "logps/rejected": -311.6647644042969, "loss": 0.4038, "rewards/chosen": 0.7066670656204224, "rewards/margins": 1.6536514163017273, "rewards/rejected": -0.9469843506813049, "step": 122 }, { "epoch": 2.568421052631579, "grad_norm": 0.809329092502594, "kl": 3.3746235370635986, "learning_rate": 2.8247291705444575e-07, "logits/chosen": 254174134.85714287, "logits/rejected": 307626439.1111111, "logps/chosen": -742.4175502232143, "logps/rejected": -308.20654296875, "loss": 0.3842, "rewards/chosen": 1.0319812638419015, "rewards/margins": 2.007392784905812, "rewards/rejected": -0.9754115210639106, "step": 123 }, { "epoch": 2.5894736842105264, "grad_norm": 0.935296356678009, "kl": 2.1826891899108887, "learning_rate": 2.5781814616827936e-07, "logits/chosen": 332361813.3333333, "logits/rejected": 337299017.14285713, "logps/chosen": -655.0251736111111, "logps/rejected": -355.64554268973217, "loss": 0.4048, "rewards/chosen": 0.7410464816623263, "rewards/margins": 1.5022159152560763, "rewards/rejected": -0.76116943359375, "step": 124 }, { "epoch": 2.610526315789474, "grad_norm": 0.984870195388794, "kl": 1.8854070901870728, "learning_rate": 2.3423053240837518e-07, "logits/chosen": 279978112.0, "logits/rejected": 304146432.0, "logps/chosen": -627.2296142578125, "logps/rejected": -328.7203369140625, "loss": 0.3887, "rewards/chosen": 0.6165910959243774, "rewards/margins": 1.625217080116272, "rewards/rejected": -1.0086259841918945, "step": 125 }, { "epoch": 2.6315789473684212, "grad_norm": 1.1392616033554077, "kl": 3.3097527027130127, "learning_rate": 2.1172130220138227e-07, "logits/chosen": 279993014.85714287, "logits/rejected": 303306353.7777778, "logps/chosen": -659.3328683035714, "logps/rejected": -350.6284450954861, "loss": 0.3736, "rewards/chosen": 1.0322659356253487, "rewards/margins": 2.0158348689003596, "rewards/rejected": -0.9835689332750108, "step": 126 }, { "epoch": 2.6526315789473687, "grad_norm": 0.7549173831939697, "kl": 2.572181224822998, "learning_rate": 1.9030116872178317e-07, "logits/chosen": 311775431.1111111, "logits/rejected": 305627556.5714286, "logps/chosen": -481.0133463541667, "logps/rejected": -346.57254464285717, "loss": 0.3882, "rewards/chosen": 0.6412492328219943, "rewards/margins": 1.663343141949366, "rewards/rejected": -1.0220939091273717, "step": 127 }, { "epoch": 2.6736842105263157, "grad_norm": 0.7030604481697083, "kl": 2.4721288681030273, "learning_rate": 1.699803267930039e-07, "logits/chosen": 318301792.0, "logits/rejected": 325693792.0, "logps/chosen": -564.3296508789062, "logps/rejected": -288.3941345214844, "loss": 0.4038, "rewards/chosen": 0.7143887877464294, "rewards/margins": 1.8321842551231384, "rewards/rejected": -1.117795467376709, "step": 128 }, { "epoch": 2.694736842105263, "grad_norm": 1.0730773210525513, "kl": 4.131094455718994, "learning_rate": 1.507684480352292e-07, "logits/chosen": 316674084.5714286, "logits/rejected": 354119111.1111111, "logps/chosen": -666.7138671875, "logps/rejected": -381.65391710069446, "loss": 0.3853, "rewards/chosen": 1.232508659362793, "rewards/margins": 1.9685664706759982, "rewards/rejected": -0.7360578113132052, "step": 129 }, { "epoch": 2.7157894736842105, "grad_norm": 0.9587432742118835, "kl": 3.2579798698425293, "learning_rate": 1.3267467626223606e-07, "logits/chosen": 282049649.7777778, "logits/rejected": 315219309.71428573, "logps/chosen": -596.02001953125, "logps/rejected": -321.43014090401783, "loss": 0.3558, "rewards/chosen": 0.8812927140129937, "rewards/margins": 2.193102798764668, "rewards/rejected": -1.311810084751674, "step": 130 }, { "epoch": 2.736842105263158, "grad_norm": 0.6875790357589722, "kl": 2.5100412368774414, "learning_rate": 1.1570762312943295e-07, "logits/chosen": 302621984.0, "logits/rejected": 307840000.0, "logps/chosen": -461.5149230957031, "logps/rejected": -238.98057556152344, "loss": 0.4222, "rewards/chosen": 0.538287878036499, "rewards/margins": 1.520159900188446, "rewards/rejected": -0.981872022151947, "step": 131 }, { "epoch": 2.7578947368421054, "grad_norm": 0.9666208028793335, "kl": 1.7118310928344727, "learning_rate": 9.98753640351785e-08, "logits/chosen": 272928329.14285713, "logits/rejected": 309160789.3333333, "logps/chosen": -569.4566127232143, "logps/rejected": -317.2120768229167, "loss": 0.3475, "rewards/chosen": 0.9596413884844098, "rewards/margins": 1.96501068084959, "rewards/rejected": -1.00536929236518, "step": 132 }, { "epoch": 2.7789473684210524, "grad_norm": 0.9755533933639526, "kl": 3.468967914581299, "learning_rate": 8.518543427732951e-08, "logits/chosen": 282519637.3333333, "logits/rejected": 323490450.28571427, "logps/chosen": -708.4093967013889, "logps/rejected": -296.62779017857144, "loss": 0.3604, "rewards/chosen": 1.059796651204427, "rewards/margins": 2.3521926516578313, "rewards/rejected": -1.292396000453404, "step": 133 }, { "epoch": 2.8, "grad_norm": 0.8586853742599487, "kl": 1.594870686531067, "learning_rate": 7.164482546684642e-08, "logits/chosen": 292587744.0, "logits/rejected": 315242752.0, "logps/chosen": -543.4364013671875, "logps/rejected": -306.8507385253906, "loss": 0.3866, "rewards/chosen": 0.7004275321960449, "rewards/margins": 1.6366068124771118, "rewards/rejected": -0.9361792802810669, "step": 134 }, { "epoch": 2.8210526315789473, "grad_norm": 1.0991864204406738, "kl": 3.3454222679138184, "learning_rate": 5.92599822001666e-08, "logits/chosen": 309429650.28571427, "logits/rejected": 321015552.0, "logps/chosen": -634.3738839285714, "logps/rejected": -331.81629774305554, "loss": 0.3457, "rewards/chosen": 1.0007305826459612, "rewards/margins": 1.937054808177645, "rewards/rejected": -0.936324225531684, "step": 135 }, { "epoch": 2.8421052631578947, "grad_norm": 1.3589946031570435, "kl": 2.5998637676239014, "learning_rate": 4.8036798991923925e-08, "logits/chosen": 289647502.2222222, "logits/rejected": 283804617.14285713, "logps/chosen": -676.5671657986111, "logps/rejected": -408.20846121651783, "loss": 0.3478, "rewards/chosen": 0.9124876658121744, "rewards/margins": 2.528648331051781, "rewards/rejected": -1.6161606652396066, "step": 136 }, { "epoch": 2.863157894736842, "grad_norm": 0.9792783260345459, "kl": 2.5762431621551514, "learning_rate": 3.798061746947995e-08, "logits/chosen": 301908608.0, "logits/rejected": 320611360.0, "logps/chosen": -675.7954711914062, "logps/rejected": -342.86993408203125, "loss": 0.3774, "rewards/chosen": 1.0279605388641357, "rewards/margins": 2.02980899810791, "rewards/rejected": -1.0018484592437744, "step": 137 }, { "epoch": 2.8842105263157896, "grad_norm": 1.141641616821289, "kl": 2.4281487464904785, "learning_rate": 2.9096223830598347e-08, "logits/chosen": 279651857.6551724, "logits/rejected": 331748410.51428574, "logps/chosen": -810.5805495689655, "logps/rejected": -377.9096261160714, "loss": 0.2789, "rewards/chosen": 1.5602044730350888, "rewards/margins": 3.166494363869352, "rewards/rejected": -1.6062898908342633, "step": 138 }, { "epoch": 2.905263157894737, "grad_norm": 0.8568317294120789, "kl": 2.945486307144165, "learning_rate": 2.1387846565474047e-08, "logits/chosen": 296125539.0967742, "logits/rejected": 323021482.6666667, "logps/chosen": -607.9652217741935, "logps/rejected": -321.35458096590907, "loss": 0.3593, "rewards/chosen": 1.1153303577053932, "rewards/margins": 2.1783015306161415, "rewards/rejected": -1.062971172910748, "step": 139 }, { "epoch": 2.9263157894736844, "grad_norm": 0.8753089308738708, "kl": 2.3184986114501953, "learning_rate": 1.4859154444200885e-08, "logits/chosen": 301763180.6060606, "logits/rejected": 325586547.61290324, "logps/chosen": -517.9040601325758, "logps/rejected": -358.8358114919355, "loss": 0.419, "rewards/chosen": 0.6187645883271189, "rewards/margins": 1.4786070117969086, "rewards/rejected": -0.8598424234697896, "step": 140 }, { "epoch": 2.9473684210526314, "grad_norm": 1.006534457206726, "kl": 3.3187642097473145, "learning_rate": 9.513254770636138e-09, "logits/chosen": 252854985.6969697, "logits/rejected": 298858562.0645161, "logps/chosen": -667.3155776515151, "logps/rejected": -339.79082661290323, "loss": 0.354, "rewards/chosen": 1.0862979310931582, "rewards/margins": 2.440396220336795, "rewards/rejected": -1.3540982892436366, "step": 141 }, { "epoch": 2.968421052631579, "grad_norm": 0.9552326202392578, "kl": 2.966846466064453, "learning_rate": 5.352691903491303e-09, "logits/chosen": 299796542.06060606, "logits/rejected": 302482795.3548387, "logps/chosen": -720.6031013257576, "logps/rejected": -320.2646484375, "loss": 0.3499, "rewards/chosen": 0.9409768653638435, "rewards/margins": 2.0632290527850885, "rewards/rejected": -1.122252187421245, "step": 142 }, { "epoch": 2.9894736842105263, "grad_norm": 0.8531591892242432, "kl": 2.526409149169922, "learning_rate": 2.3794460453555046e-09, "logits/chosen": 269314079.030303, "logits/rejected": 330170368.0, "logps/chosen": -811.1819957386364, "logps/rejected": -315.7971585181452, "loss": 0.3277, "rewards/chosen": 1.1765475417628433, "rewards/margins": 2.47783097819965, "rewards/rejected": -1.3012834364368069, "step": 143 }, { "epoch": 3.0, "grad_norm": 0.5618255734443665, "kl": 0.21782374382019043, "learning_rate": 5.94932300227169e-10, "logits/chosen": 351071092.3636364, "logits/rejected": 325673606.7368421, "logps/chosen": -478.63028231534093, "logps/rejected": -388.55186060855266, "loss": 0.1768, "rewards/chosen": 0.5373444123701616, "rewards/margins": 1.6127457185225054, "rewards/rejected": -1.0754013061523438, "step": 144 } ], "logging_steps": 1, "max_steps": 144, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }