{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 6.702973036038713, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06859037280082703, "logits/rejected": 0.14135734736919403, "logps/chosen": -1.716321349143982, "logps/rejected": -1.8896639347076416, "loss": 0.8499, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.716321349143982, "rewards/margins": 0.17334221303462982, "rewards/rejected": -1.8896639347076416, "sft_loss": 1.4685341119766235, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 9.644141530369794, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.006559779401868582, "logits/rejected": 0.11489315330982208, "logps/chosen": -1.802130937576294, "logps/rejected": -1.8450310230255127, "loss": 0.9404, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.802130937576294, "rewards/margins": 0.04290003329515457, "rewards/rejected": -1.8450310230255127, "sft_loss": 1.5081868171691895, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 10.051611837677612, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.03827698528766632, "logits/rejected": 0.061770737171173096, "logps/chosen": -1.6346393823623657, "logps/rejected": -1.7642667293548584, "loss": 0.9395, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6346393823623657, "rewards/margins": 0.12962760031223297, "rewards/rejected": -1.7642667293548584, "sft_loss": 1.500527024269104, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 6.647704350122245, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.039084356278181076, "logits/rejected": 0.04886917397379875, "logps/chosen": -1.7247798442840576, "logps/rejected": -1.8060953617095947, "loss": 0.9581, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7247798442840576, "rewards/margins": 0.08131532371044159, "rewards/rejected": -1.8060953617095947, "sft_loss": 1.5003466606140137, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 14.90062696498156, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.048385441303253174, "logits/rejected": 0.040979672223329544, "logps/chosen": -1.8698208332061768, "logps/rejected": -1.7785001993179321, "loss": 1.0964, "rewards/accuracies": 0.375, "rewards/chosen": -1.8698208332061768, "rewards/margins": -0.09132039546966553, "rewards/rejected": -1.7785001993179321, "sft_loss": 1.5456753969192505, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 8.344668876349031, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.08016878366470337, "logits/rejected": 0.016008157283067703, "logps/chosen": -1.908936858177185, "logps/rejected": -1.8325151205062866, "loss": 0.9981, "rewards/accuracies": 0.4375, "rewards/chosen": -1.908936858177185, "rewards/margins": -0.07642142474651337, "rewards/rejected": -1.8325151205062866, "sft_loss": 1.6472301483154297, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 8.961052501074231, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.05137736722826958, "logits/rejected": 0.11413037776947021, "logps/chosen": -1.8463821411132812, "logps/rejected": -1.9967739582061768, "loss": 0.9789, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8463821411132812, "rewards/margins": 0.15039141476154327, "rewards/rejected": -1.9967739582061768, "sft_loss": 1.5614362955093384, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 8.308376321072556, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.028514528647065163, "logits/rejected": 0.20667286217212677, "logps/chosen": -1.8810441493988037, "logps/rejected": -1.7430187463760376, "loss": 1.0385, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.8810441493988037, "rewards/margins": -0.138025164604187, "rewards/rejected": -1.7430187463760376, "sft_loss": 1.5189718008041382, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 13.986161027627224, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.020828912034630775, "logits/rejected": 0.22010770440101624, "logps/chosen": -1.836972951889038, "logps/rejected": -1.870734453201294, "loss": 0.9983, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.836972951889038, "rewards/margins": 0.03376161307096481, "rewards/rejected": -1.870734453201294, "sft_loss": 1.5358479022979736, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 11.407268722651331, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05352424457669258, "logits/rejected": 0.09972499310970306, "logps/chosen": -1.8980283737182617, "logps/rejected": -1.7786309719085693, "loss": 1.0466, "rewards/accuracies": 0.5, "rewards/chosen": -1.8980283737182617, "rewards/margins": -0.11939746141433716, "rewards/rejected": -1.7786309719085693, "sft_loss": 1.5829687118530273, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.521526699655726, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.11135170608758926, "logits/rejected": 0.11175274848937988, "logps/chosen": -1.8340307474136353, "logps/rejected": -1.8674598932266235, "loss": 1.0073, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8340307474136353, "rewards/margins": 0.033429183065891266, "rewards/rejected": -1.8674598932266235, "sft_loss": 1.5837171077728271, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.480466581705964, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08121562004089355, "logits/rejected": 0.11202778667211533, "logps/chosen": -1.787698745727539, "logps/rejected": -1.892869234085083, "loss": 0.9183, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.787698745727539, "rewards/margins": 0.10517048835754395, "rewards/rejected": -1.892869234085083, "sft_loss": 1.543967366218567, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 6.295629203227723, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.024999063462018967, "logits/rejected": 0.12647823989391327, "logps/chosen": -1.6358072757720947, "logps/rejected": -1.7663062810897827, "loss": 0.8902, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6358072757720947, "rewards/margins": 0.13049918413162231, "rewards/rejected": -1.7663062810897827, "sft_loss": 1.474139928817749, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 11.701055217688808, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.08039408177137375, "logits/rejected": 0.07166466116905212, "logps/chosen": -1.7646785974502563, "logps/rejected": -1.8103328943252563, "loss": 1.0079, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7646785974502563, "rewards/margins": 0.04565427824854851, "rewards/rejected": -1.8103328943252563, "sft_loss": 1.6289875507354736, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 13.253269721585566, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.0566411130130291, "logits/rejected": 0.12927421927452087, "logps/chosen": -1.7753467559814453, "logps/rejected": -2.0347797870635986, "loss": 0.8738, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7753467559814453, "rewards/margins": 0.2594330310821533, "rewards/rejected": -2.0347797870635986, "sft_loss": 1.5650640726089478, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 8.386802086569567, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.005293454043567181, "logits/rejected": 0.10028276592493057, "logps/chosen": -1.7161533832550049, "logps/rejected": -1.748827576637268, "loss": 0.9613, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7161533832550049, "rewards/margins": 0.03267427533864975, "rewards/rejected": -1.748827576637268, "sft_loss": 1.5250723361968994, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 6.061474731949857, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.14668717980384827, "logits/rejected": 0.10416042804718018, "logps/chosen": -1.7873996496200562, "logps/rejected": -1.9631637334823608, "loss": 0.9281, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7873996496200562, "rewards/margins": 0.1757640838623047, "rewards/rejected": -1.9631637334823608, "sft_loss": 1.4945319890975952, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 14.741981588462846, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08980236947536469, "logits/rejected": 0.05404208227992058, "logps/chosen": -1.7418218851089478, "logps/rejected": -1.771594762802124, "loss": 1.0002, "rewards/accuracies": 0.46875, "rewards/chosen": -1.7418218851089478, "rewards/margins": 0.02977297641336918, "rewards/rejected": -1.771594762802124, "sft_loss": 1.4563801288604736, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 5.61298553003334, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.07583034038543701, "logits/rejected": 0.07639636844396591, "logps/chosen": -1.788304090499878, "logps/rejected": -1.9007819890975952, "loss": 0.9449, "rewards/accuracies": 0.59375, "rewards/chosen": -1.788304090499878, "rewards/margins": 0.11247781664133072, "rewards/rejected": -1.9007819890975952, "sft_loss": 1.5202165842056274, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 5.958519674378306, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.02949271723628044, "logits/rejected": 0.037863839417696, "logps/chosen": -1.6825199127197266, "logps/rejected": -1.7904939651489258, "loss": 0.9087, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6825199127197266, "rewards/margins": 0.10797406733036041, "rewards/rejected": -1.7904939651489258, "sft_loss": 1.4878851175308228, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 8.883510402309446, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.054995059967041016, "logits/rejected": 0.08293595165014267, "logps/chosen": -1.6364829540252686, "logps/rejected": -1.803472876548767, "loss": 0.8893, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6364829540252686, "rewards/margins": 0.16699011623859406, "rewards/rejected": -1.803472876548767, "sft_loss": 1.4342707395553589, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.887732422302257, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.014778594486415386, "logits/rejected": 0.11244583129882812, "logps/chosen": -1.669877290725708, "logps/rejected": -1.7259242534637451, "loss": 0.9626, "rewards/accuracies": 0.5, "rewards/chosen": -1.669877290725708, "rewards/margins": 0.0560469925403595, "rewards/rejected": -1.7259242534637451, "sft_loss": 1.4641985893249512, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 10.053029624630458, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.01813269779086113, "logits/rejected": 0.22458188235759735, "logps/chosen": -1.657580018043518, "logps/rejected": -1.9396737813949585, "loss": 0.834, "rewards/accuracies": 0.625, "rewards/chosen": -1.657580018043518, "rewards/margins": 0.28209394216537476, "rewards/rejected": -1.9396737813949585, "sft_loss": 1.563565969467163, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 6.716771226271919, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.0731058418750763, "logits/rejected": 0.10547780990600586, "logps/chosen": -1.7298002243041992, "logps/rejected": -1.8506269454956055, "loss": 0.9118, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7298002243041992, "rewards/margins": 0.12082656472921371, "rewards/rejected": -1.8506269454956055, "sft_loss": 1.5554392337799072, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 7.206509063947677, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.0829402357339859, "logits/rejected": 0.049699828028678894, "logps/chosen": -1.6518990993499756, "logps/rejected": -1.6049795150756836, "loss": 0.992, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6518990993499756, "rewards/margins": -0.04691971093416214, "rewards/rejected": -1.6049795150756836, "sft_loss": 1.5169137716293335, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 9.398500000703004, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.039764903485774994, "logits/rejected": 0.1792357712984085, "logps/chosen": -1.6893419027328491, "logps/rejected": -1.8123197555541992, "loss": 0.8713, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6893419027328491, "rewards/margins": 0.12297798693180084, "rewards/rejected": -1.8123197555541992, "sft_loss": 1.577155590057373, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 16.324023931321047, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.047886431217193604, "logits/rejected": 0.07469049841165543, "logps/chosen": -1.7392584085464478, "logps/rejected": -1.7592836618423462, "loss": 0.9732, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7392584085464478, "rewards/margins": 0.020025230944156647, "rewards/rejected": -1.7592836618423462, "sft_loss": 1.5168216228485107, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 11.546083545636698, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.038810618221759796, "logits/rejected": 0.1334814727306366, "logps/chosen": -1.6959807872772217, "logps/rejected": -1.8378652334213257, "loss": 0.8841, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.6959807872772217, "rewards/margins": 0.141884446144104, "rewards/rejected": -1.8378652334213257, "sft_loss": 1.5605138540267944, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 10.449440333498194, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.02952158823609352, "logits/rejected": 0.12635311484336853, "logps/chosen": -1.611771583557129, "logps/rejected": -1.7252804040908813, "loss": 0.898, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.611771583557129, "rewards/margins": 0.11350883543491364, "rewards/rejected": -1.7252804040908813, "sft_loss": 1.5070559978485107, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 11.254343815302727, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.047827668488025665, "logits/rejected": 0.12055377662181854, "logps/chosen": -1.565514326095581, "logps/rejected": -1.564286470413208, "loss": 0.9705, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.565514326095581, "rewards/margins": -0.0012281477684155107, "rewards/rejected": -1.564286470413208, "sft_loss": 1.3673644065856934, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 9.262506789237806, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.07001235336065292, "logits/rejected": -0.018733523786067963, "logps/chosen": -1.5835145711898804, "logps/rejected": -1.6756055355072021, "loss": 0.9092, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5835145711898804, "rewards/margins": 0.09209098666906357, "rewards/rejected": -1.6756055355072021, "sft_loss": 1.4590342044830322, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 8.130608494491169, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.14981473982334137, "logits/rejected": -0.00554328877478838, "logps/chosen": -1.7048254013061523, "logps/rejected": -1.6821569204330444, "loss": 0.9968, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7048254013061523, "rewards/margins": -0.02266838401556015, "rewards/rejected": -1.6821569204330444, "sft_loss": 1.5138128995895386, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 8.446134985236093, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.0554620735347271, "logits/rejected": 0.11860129982233047, "logps/chosen": -1.5411068201065063, "logps/rejected": -1.6820226907730103, "loss": 0.9089, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5411068201065063, "rewards/margins": 0.14091593027114868, "rewards/rejected": -1.6820226907730103, "sft_loss": 1.4017772674560547, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 13.961121834810543, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.10742922127246857, "logits/rejected": -0.052524250000715256, "logps/chosen": -1.6910076141357422, "logps/rejected": -1.7397321462631226, "loss": 0.9506, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6910076141357422, "rewards/margins": 0.04872459918260574, "rewards/rejected": -1.7397321462631226, "sft_loss": 1.5169093608856201, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 10.42747131693837, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.04428332671523094, "logits/rejected": 0.042808979749679565, "logps/chosen": -1.5634596347808838, "logps/rejected": -1.6661045551300049, "loss": 0.9397, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5634596347808838, "rewards/margins": 0.10264496505260468, "rewards/rejected": -1.6661045551300049, "sft_loss": 1.4734010696411133, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 8.78321914975296, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.012711775489151478, "logits/rejected": 0.014043694362044334, "logps/chosen": -1.5782628059387207, "logps/rejected": -1.7380142211914062, "loss": 0.9172, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5782628059387207, "rewards/margins": 0.15975116193294525, "rewards/rejected": -1.7380142211914062, "sft_loss": 1.4693598747253418, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 8.723622382486115, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.15454931557178497, "logits/rejected": -0.06420670449733734, "logps/chosen": -1.5361636877059937, "logps/rejected": -1.6049896478652954, "loss": 0.9592, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5361636877059937, "rewards/margins": 0.06882590800523758, "rewards/rejected": -1.6049896478652954, "sft_loss": 1.4361121654510498, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 9.379186782623458, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.06259158998727798, "logits/rejected": 0.0626489445567131, "logps/chosen": -1.5418685674667358, "logps/rejected": -1.6475862264633179, "loss": 0.8949, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5418685674667358, "rewards/margins": 0.1057177409529686, "rewards/rejected": -1.6475862264633179, "sft_loss": 1.4272186756134033, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 6.312300436261769, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.009148378856480122, "logits/rejected": 0.16754285991191864, "logps/chosen": -1.3938941955566406, "logps/rejected": -1.554271936416626, "loss": 0.8745, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3938941955566406, "rewards/margins": 0.16037783026695251, "rewards/rejected": -1.554271936416626, "sft_loss": 1.3546264171600342, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 14.617155597119094, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.08748116344213486, "logits/rejected": 0.05105700343847275, "logps/chosen": -1.512632131576538, "logps/rejected": -1.541290283203125, "loss": 0.944, "rewards/accuracies": 0.5625, "rewards/chosen": -1.512632131576538, "rewards/margins": 0.028658073395490646, "rewards/rejected": -1.541290283203125, "sft_loss": 1.4626529216766357, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 14.83212977546729, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.060840390622615814, "logits/rejected": 0.08631278574466705, "logps/chosen": -1.433002233505249, "logps/rejected": -1.4933949708938599, "loss": 0.9357, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.433002233505249, "rewards/margins": 0.060392655432224274, "rewards/rejected": -1.4933949708938599, "sft_loss": 1.3502676486968994, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 12.073571859501529, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.16160300374031067, "logits/rejected": 0.02457263693213463, "logps/chosen": -1.4971989393234253, "logps/rejected": -1.664298415184021, "loss": 0.8794, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4971989393234253, "rewards/margins": 0.16709943115711212, "rewards/rejected": -1.664298415184021, "sft_loss": 1.4122134447097778, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 8.053391971103682, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.20213842391967773, "logits/rejected": 0.04158937931060791, "logps/chosen": -1.5002050399780273, "logps/rejected": -1.5914572477340698, "loss": 0.875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5002050399780273, "rewards/margins": 0.09125222265720367, "rewards/rejected": -1.5914572477340698, "sft_loss": 1.4370949268341064, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 17.92140582684194, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.04896925762295723, "logits/rejected": 0.15036346018314362, "logps/chosen": -1.4815274477005005, "logps/rejected": -1.6879806518554688, "loss": 0.8464, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4815274477005005, "rewards/margins": 0.2064533233642578, "rewards/rejected": -1.6879806518554688, "sft_loss": 1.4312843084335327, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.819571995837769, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.10190838575363159, "logits/rejected": 0.0719117596745491, "logps/chosen": -1.4662240743637085, "logps/rejected": -1.6161525249481201, "loss": 0.8542, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4662240743637085, "rewards/margins": 0.1499285250902176, "rewards/rejected": -1.6161525249481201, "sft_loss": 1.3879258632659912, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 6.60282038631885, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.038417570292949677, "logits/rejected": 0.04065591096878052, "logps/chosen": -1.5287741422653198, "logps/rejected": -1.698678970336914, "loss": 0.882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5287741422653198, "rewards/margins": 0.16990481317043304, "rewards/rejected": -1.698678970336914, "sft_loss": 1.3734486103057861, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 11.553765477929723, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.01179618202149868, "logits/rejected": 0.15246328711509705, "logps/chosen": -1.4749536514282227, "logps/rejected": -1.6516231298446655, "loss": 0.8382, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4749536514282227, "rewards/margins": 0.17666950821876526, "rewards/rejected": -1.6516231298446655, "sft_loss": 1.3738255500793457, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 7.820091999170089, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.04290889948606491, "logits/rejected": 0.08651427924633026, "logps/chosen": -1.4832552671432495, "logps/rejected": -1.678340196609497, "loss": 0.8709, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4832552671432495, "rewards/margins": 0.19508466124534607, "rewards/rejected": -1.678340196609497, "sft_loss": 1.446438193321228, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.853590240194409, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.032018642872571945, "logits/rejected": 0.1550910770893097, "logps/chosen": -1.581587791442871, "logps/rejected": -1.6717331409454346, "loss": 0.8975, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.581587791442871, "rewards/margins": 0.09014533460140228, "rewards/rejected": -1.6717331409454346, "sft_loss": 1.5218582153320312, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 13.935759547676863, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.06505148857831955, "logits/rejected": 0.10135525465011597, "logps/chosen": -1.5776017904281616, "logps/rejected": -1.6428560018539429, "loss": 0.9479, "rewards/accuracies": 0.5, "rewards/chosen": -1.5776017904281616, "rewards/margins": 0.06525401026010513, "rewards/rejected": -1.6428560018539429, "sft_loss": 1.4219996929168701, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 8.872910049052326, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.025789733976125717, "logits/rejected": 0.12136568874120712, "logps/chosen": -1.4498792886734009, "logps/rejected": -1.6182514429092407, "loss": 0.8624, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4498792886734009, "rewards/margins": 0.16837215423583984, "rewards/rejected": -1.6182514429092407, "sft_loss": 1.334236741065979, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.407385018535111, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2350751906633377, "logits/rejected": -0.12682147324085236, "logps/chosen": -1.6020698547363281, "logps/rejected": -1.7310168743133545, "loss": 0.8466, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6020698547363281, "rewards/margins": 0.12894682586193085, "rewards/rejected": -1.7310168743133545, "sft_loss": 1.4834848642349243, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 15.909087871185044, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.0795278325676918, "logits/rejected": 0.006105656735599041, "logps/chosen": -1.6134687662124634, "logps/rejected": -1.7488367557525635, "loss": 0.8955, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6134687662124634, "rewards/margins": 0.13536801934242249, "rewards/rejected": -1.7488367557525635, "sft_loss": 1.5514605045318604, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 6.423043314074773, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.0875733345746994, "logits/rejected": 0.049231600016355515, "logps/chosen": -1.5094425678253174, "logps/rejected": -1.632433295249939, "loss": 0.8701, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5094425678253174, "rewards/margins": 0.12299074977636337, "rewards/rejected": -1.632433295249939, "sft_loss": 1.4312238693237305, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 8.545207155313289, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.03895934298634529, "logits/rejected": 0.05629078298807144, "logps/chosen": -1.4638268947601318, "logps/rejected": -1.689480185508728, "loss": 0.8625, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4638268947601318, "rewards/margins": 0.2256532907485962, "rewards/rejected": -1.689480185508728, "sft_loss": 1.37359619140625, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 15.708485896926108, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.10516306012868881, "logits/rejected": 0.05365392565727234, "logps/chosen": -1.5601650476455688, "logps/rejected": -1.680772066116333, "loss": 0.8802, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5601650476455688, "rewards/margins": 0.12060710042715073, "rewards/rejected": -1.680772066116333, "sft_loss": 1.4628015756607056, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 7.9296611950593885, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.059190742671489716, "logits/rejected": 0.08300630003213882, "logps/chosen": -1.5395736694335938, "logps/rejected": -1.6552746295928955, "loss": 0.9105, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5395736694335938, "rewards/margins": 0.11570099741220474, "rewards/rejected": -1.6552746295928955, "sft_loss": 1.4917641878128052, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 8.512531880461733, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.14478448033332825, "logits/rejected": 0.14671705663204193, "logps/chosen": -1.5264160633087158, "logps/rejected": -1.7059307098388672, "loss": 0.8277, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5264160633087158, "rewards/margins": 0.1795145720243454, "rewards/rejected": -1.7059307098388672, "sft_loss": 1.465397834777832, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 11.31996235651409, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.0691608339548111, "logits/rejected": -0.006187940947711468, "logps/chosen": -1.4758808612823486, "logps/rejected": -1.6043269634246826, "loss": 0.8655, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4758808612823486, "rewards/margins": 0.1284460872411728, "rewards/rejected": -1.6043269634246826, "sft_loss": 1.3975608348846436, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 9.416282928218743, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.07168702781200409, "logits/rejected": 0.10054339468479156, "logps/chosen": -1.5283973217010498, "logps/rejected": -1.6419986486434937, "loss": 0.8884, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5283973217010498, "rewards/margins": 0.11360123008489609, "rewards/rejected": -1.6419986486434937, "sft_loss": 1.483135461807251, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 7.35648695914466, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.009979024529457092, "logits/rejected": 0.06142239645123482, "logps/chosen": -1.6317169666290283, "logps/rejected": -1.6297073364257812, "loss": 0.9396, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.6317169666290283, "rewards/margins": -0.0020095347426831722, "rewards/rejected": -1.6297073364257812, "sft_loss": 1.5488479137420654, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 10.439877376996783, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.18711742758750916, "logits/rejected": -0.08916506916284561, "logps/chosen": -1.6041101217269897, "logps/rejected": -1.719856858253479, "loss": 0.8965, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.6041101217269897, "rewards/margins": 0.11574681848287582, "rewards/rejected": -1.719856858253479, "sft_loss": 1.5262537002563477, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 12.40055960906028, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.015852855518460274, "logits/rejected": 0.14568349719047546, "logps/chosen": -1.6016238927841187, "logps/rejected": -1.7875289916992188, "loss": 0.8617, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6016238927841187, "rewards/margins": 0.18590494990348816, "rewards/rejected": -1.7875289916992188, "sft_loss": 1.5366556644439697, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 6.813830943157419, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.05537568777799606, "logits/rejected": 0.07943960279226303, "logps/chosen": -1.5311832427978516, "logps/rejected": -1.5911250114440918, "loss": 0.9035, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5311832427978516, "rewards/margins": 0.05994180589914322, "rewards/rejected": -1.5911250114440918, "sft_loss": 1.47402822971344, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 10.004743649510763, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.11411911249160767, "logits/rejected": 0.006765827536582947, "logps/chosen": -1.557644248008728, "logps/rejected": -1.8630332946777344, "loss": 0.8201, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.557644248008728, "rewards/margins": 0.3053889870643616, "rewards/rejected": -1.8630332946777344, "sft_loss": 1.547539234161377, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 12.547787288359798, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.013900229707360268, "logits/rejected": 0.14141085743904114, "logps/chosen": -1.5468895435333252, "logps/rejected": -1.8233592510223389, "loss": 0.8041, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5468895435333252, "rewards/margins": 0.2764698565006256, "rewards/rejected": -1.8233592510223389, "sft_loss": 1.4977320432662964, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 19.340150760011156, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.04217713326215744, "logits/rejected": 0.15277662873268127, "logps/chosen": -1.6085857152938843, "logps/rejected": -1.6779794692993164, "loss": 0.8962, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6085857152938843, "rewards/margins": 0.06939395517110825, "rewards/rejected": -1.6779794692993164, "sft_loss": 1.4822218418121338, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 14.640742158325542, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.02979375422000885, "logits/rejected": 0.12206296622753143, "logps/chosen": -1.6841856241226196, "logps/rejected": -1.7910856008529663, "loss": 0.9172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6841856241226196, "rewards/margins": 0.10689990222454071, "rewards/rejected": -1.7910856008529663, "sft_loss": 1.5484795570373535, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 12.93329168947757, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.05653851106762886, "logits/rejected": 0.08597005903720856, "logps/chosen": -1.5816501379013062, "logps/rejected": -1.7690460681915283, "loss": 0.865, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5816501379013062, "rewards/margins": 0.1873955875635147, "rewards/rejected": -1.7690460681915283, "sft_loss": 1.5128852128982544, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 11.403163971079575, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.023158203810453415, "logits/rejected": 0.070060595870018, "logps/chosen": -1.537097692489624, "logps/rejected": -1.6681251525878906, "loss": 0.8959, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.537097692489624, "rewards/margins": 0.13102751970291138, "rewards/rejected": -1.6681251525878906, "sft_loss": 1.4899991750717163, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 10.269830532082047, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.09361349791288376, "logits/rejected": 0.12785111367702484, "logps/chosen": -1.6137135028839111, "logps/rejected": -1.6899824142456055, "loss": 0.9026, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6137135028839111, "rewards/margins": 0.07626868039369583, "rewards/rejected": -1.6899824142456055, "sft_loss": 1.5542749166488647, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 9.251468349092507, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.06010516732931137, "logits/rejected": 0.021687399595975876, "logps/chosen": -1.5847867727279663, "logps/rejected": -1.755059003829956, "loss": 0.8823, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5847867727279663, "rewards/margins": 0.17027229070663452, "rewards/rejected": -1.755059003829956, "sft_loss": 1.4727002382278442, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 14.888840674530762, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.00939253717660904, "logits/rejected": 0.09567335247993469, "logps/chosen": -1.546452283859253, "logps/rejected": -1.636813759803772, "loss": 0.9025, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.546452283859253, "rewards/margins": 0.09036144614219666, "rewards/rejected": -1.636813759803772, "sft_loss": 1.4358354806900024, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 12.156898971625248, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.028807152062654495, "logits/rejected": 0.061083655804395676, "logps/chosen": -1.530022382736206, "logps/rejected": -1.6157872676849365, "loss": 0.9193, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.530022382736206, "rewards/margins": 0.08576497435569763, "rewards/rejected": -1.6157872676849365, "sft_loss": 1.4403166770935059, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 11.627248013410282, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.05842500180006027, "logits/rejected": 0.09691628813743591, "logps/chosen": -1.5447757244110107, "logps/rejected": -1.7703924179077148, "loss": 0.8379, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5447757244110107, "rewards/margins": 0.22561664879322052, "rewards/rejected": -1.7703924179077148, "sft_loss": 1.4894663095474243, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 10.537843750135279, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.03339865058660507, "logits/rejected": 0.054922886192798615, "logps/chosen": -1.6277233362197876, "logps/rejected": -1.8356781005859375, "loss": 0.8116, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6277233362197876, "rewards/margins": 0.20795480906963348, "rewards/rejected": -1.8356781005859375, "sft_loss": 1.512076497077942, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.665583910535334, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.006865252740681171, "logits/rejected": 0.08682769536972046, "logps/chosen": -1.678415060043335, "logps/rejected": -1.740708351135254, "loss": 0.8908, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.678415060043335, "rewards/margins": 0.06229352951049805, "rewards/rejected": -1.740708351135254, "sft_loss": 1.5830042362213135, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 10.41529627121867, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.038954682648181915, "logits/rejected": 0.19942323863506317, "logps/chosen": -1.7041614055633545, "logps/rejected": -1.8604240417480469, "loss": 0.8541, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.7041614055633545, "rewards/margins": 0.15626260638237, "rewards/rejected": -1.8604240417480469, "sft_loss": 1.5955750942230225, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 10.077546406750734, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.07024117559194565, "logits/rejected": 0.08584646135568619, "logps/chosen": -1.6505581140518188, "logps/rejected": -1.7517807483673096, "loss": 0.8694, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6505581140518188, "rewards/margins": 0.1012226939201355, "rewards/rejected": -1.7517807483673096, "sft_loss": 1.5330431461334229, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 8.621579993141502, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.07951502501964569, "logits/rejected": 0.17376390099525452, "logps/chosen": -1.7101367712020874, "logps/rejected": -1.8504247665405273, "loss": 0.839, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7101367712020874, "rewards/margins": 0.14028772711753845, "rewards/rejected": -1.8504247665405273, "sft_loss": 1.5114778280258179, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.28865498304367065, "eval_logits/rejected": 0.37876713275909424, "eval_logps/chosen": -1.7061017751693726, "eval_logps/rejected": -1.9017139673233032, "eval_loss": 0.8444245457649231, "eval_rewards/accuracies": 0.5563797950744629, "eval_rewards/chosen": -1.7061017751693726, "eval_rewards/margins": 0.1956121325492859, "eval_rewards/rejected": -1.9017139673233032, "eval_runtime": 43.5944, "eval_samples_per_second": 30.853, "eval_sft_loss": 1.5361721515655518, "eval_steps_per_second": 7.73, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 8.801414813330842, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.012902075424790382, "logits/rejected": 0.10784679651260376, "logps/chosen": -1.7358152866363525, "logps/rejected": -1.8969757556915283, "loss": 0.8908, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.7358152866363525, "rewards/margins": 0.1611604392528534, "rewards/rejected": -1.8969757556915283, "sft_loss": 1.5918419361114502, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 12.045545052797944, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.05082521587610245, "logits/rejected": 0.18324458599090576, "logps/chosen": -1.6618913412094116, "logps/rejected": -1.844369888305664, "loss": 0.8491, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6618913412094116, "rewards/margins": 0.18247857689857483, "rewards/rejected": -1.844369888305664, "sft_loss": 1.5552400350570679, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 8.576746510646535, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.03030974604189396, "logits/rejected": 0.07934688031673431, "logps/chosen": -1.669297218322754, "logps/rejected": -1.8434665203094482, "loss": 0.8484, "rewards/accuracies": 0.53125, "rewards/chosen": -1.669297218322754, "rewards/margins": 0.17416930198669434, "rewards/rejected": -1.8434665203094482, "sft_loss": 1.5341603755950928, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 10.63547091961648, "learning_rate": 7.486631016042781e-07, "logits/chosen": 0.008567921817302704, "logits/rejected": 0.2103685885667801, "logps/chosen": -1.561725378036499, "logps/rejected": -1.7165727615356445, "loss": 0.8708, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.561725378036499, "rewards/margins": 0.15484726428985596, "rewards/rejected": -1.7165727615356445, "sft_loss": 1.4999353885650635, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 9.932013113637575, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.008445126004517078, "logits/rejected": 0.1929190754890442, "logps/chosen": -1.6287310123443604, "logps/rejected": -1.8927959203720093, "loss": 0.7954, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6287310123443604, "rewards/margins": 0.26406508684158325, "rewards/rejected": -1.8927959203720093, "sft_loss": 1.5982462167739868, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 8.876616130983669, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.04979996010661125, "logits/rejected": 0.1489320695400238, "logps/chosen": -1.6180919408798218, "logps/rejected": -1.9462039470672607, "loss": 0.8028, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6180919408798218, "rewards/margins": 0.3281119763851166, "rewards/rejected": -1.9462039470672607, "sft_loss": 1.580506682395935, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 20.45419933146445, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.02379441447556019, "logits/rejected": 0.11689828336238861, "logps/chosen": -1.5695067644119263, "logps/rejected": -1.7231642007827759, "loss": 0.8429, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5695067644119263, "rewards/margins": 0.15365752577781677, "rewards/rejected": -1.7231642007827759, "sft_loss": 1.517533540725708, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 13.65531705237304, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.013252335600554943, "logits/rejected": 0.10951529443264008, "logps/chosen": -1.6240532398223877, "logps/rejected": -1.825933814048767, "loss": 0.8266, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6240532398223877, "rewards/margins": 0.201880544424057, "rewards/rejected": -1.825933814048767, "sft_loss": 1.5613758563995361, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 12.24154907085509, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.023964716121554375, "logits/rejected": 0.09082510322332382, "logps/chosen": -1.6749906539916992, "logps/rejected": -1.9429988861083984, "loss": 0.8244, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6749906539916992, "rewards/margins": 0.26800835132598877, "rewards/rejected": -1.9429988861083984, "sft_loss": 1.583179235458374, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 14.76664890654527, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.030245855450630188, "logits/rejected": 0.1601337343454361, "logps/chosen": -1.7110360860824585, "logps/rejected": -1.9090163707733154, "loss": 0.8058, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7110360860824585, "rewards/margins": 0.1979803740978241, "rewards/rejected": -1.9090163707733154, "sft_loss": 1.5668268203735352, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 19.790468019674638, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.006326199974864721, "logits/rejected": 0.07478408515453339, "logps/chosen": -1.6535755395889282, "logps/rejected": -1.9522926807403564, "loss": 0.7848, "rewards/accuracies": 0.625, "rewards/chosen": -1.6535755395889282, "rewards/margins": 0.2987171709537506, "rewards/rejected": -1.9522926807403564, "sft_loss": 1.537989616394043, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 9.616950707647714, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.11517757177352905, "logits/rejected": 0.003994188271462917, "logps/chosen": -1.746042013168335, "logps/rejected": -1.8898332118988037, "loss": 0.8426, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.746042013168335, "rewards/margins": 0.1437911093235016, "rewards/rejected": -1.8898332118988037, "sft_loss": 1.6806637048721313, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 11.900667307712698, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.14643609523773193, "logits/rejected": 0.16485807299613953, "logps/chosen": -1.7409279346466064, "logps/rejected": -1.9578319787979126, "loss": 0.8387, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7409279346466064, "rewards/margins": 0.21690388023853302, "rewards/rejected": -1.9578319787979126, "sft_loss": 1.6416947841644287, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 8.830181983289117, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.1644625961780548, "logits/rejected": 0.12652386724948883, "logps/chosen": -1.6838014125823975, "logps/rejected": -1.9070326089859009, "loss": 0.8349, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6838014125823975, "rewards/margins": 0.22323103249073029, "rewards/rejected": -1.9070326089859009, "sft_loss": 1.596195101737976, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 7.995168104534913, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.024302974343299866, "logits/rejected": 0.11604616791009903, "logps/chosen": -1.6668624877929688, "logps/rejected": -2.0832455158233643, "loss": 0.7536, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6668624877929688, "rewards/margins": 0.4163830280303955, "rewards/rejected": -2.0832455158233643, "sft_loss": 1.625457763671875, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 13.35967258064493, "learning_rate": 8.55614973262032e-07, "logits/chosen": 0.0026796311140060425, "logits/rejected": 0.19870570302009583, "logps/chosen": -1.6937164068222046, "logps/rejected": -1.8854515552520752, "loss": 0.822, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6937164068222046, "rewards/margins": 0.19173488020896912, "rewards/rejected": -1.8854515552520752, "sft_loss": 1.6137354373931885, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 14.777387782967626, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.02986701764166355, "logits/rejected": 0.07667236030101776, "logps/chosen": -1.900377631187439, "logps/rejected": -2.0468382835388184, "loss": 0.8557, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.900377631187439, "rewards/margins": 0.14646077156066895, "rewards/rejected": -2.0468382835388184, "sft_loss": 1.723910927772522, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 9.826571934223194, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.05959752947092056, "logits/rejected": 0.12927241623401642, "logps/chosen": -1.8736746311187744, "logps/rejected": -2.0128977298736572, "loss": 0.8551, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8736746311187744, "rewards/margins": 0.13922320306301117, "rewards/rejected": -2.0128977298736572, "sft_loss": 1.7088829278945923, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 12.412657506346838, "learning_rate": 8.823529411764705e-07, "logits/chosen": 0.02171451412141323, "logits/rejected": 0.05109437555074692, "logps/chosen": -1.885629653930664, "logps/rejected": -2.019071578979492, "loss": 0.8437, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.885629653930664, "rewards/margins": 0.13344189524650574, "rewards/rejected": -2.019071578979492, "sft_loss": 1.7931492328643799, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 9.482682538601056, "learning_rate": 8.912655971479501e-07, "logits/chosen": 0.008008052594959736, "logits/rejected": 0.10441839694976807, "logps/chosen": -1.82148015499115, "logps/rejected": -2.05816912651062, "loss": 0.809, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.82148015499115, "rewards/margins": 0.23668909072875977, "rewards/rejected": -2.05816912651062, "sft_loss": 1.700110673904419, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 12.48443598930354, "learning_rate": 9.001782531194295e-07, "logits/chosen": 0.012660378590226173, "logits/rejected": 0.16065539419651031, "logps/chosen": -1.8914144039154053, "logps/rejected": -2.045323610305786, "loss": 0.8174, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8914144039154053, "rewards/margins": 0.1539088636636734, "rewards/rejected": -2.045323610305786, "sft_loss": 1.7708075046539307, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 9.036453223871478, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.1636802852153778, "logits/rejected": 0.21730521321296692, "logps/chosen": -1.868528127670288, "logps/rejected": -2.1642730236053467, "loss": 0.7701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.868528127670288, "rewards/margins": 0.2957451343536377, "rewards/rejected": -2.1642730236053467, "sft_loss": 1.730780839920044, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 10.891419449579097, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.14418451488018036, "logits/rejected": 0.25062134861946106, "logps/chosen": -1.7637317180633545, "logps/rejected": -2.0156896114349365, "loss": 0.7843, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.7637317180633545, "rewards/margins": 0.2519580125808716, "rewards/rejected": -2.0156896114349365, "sft_loss": 1.6884260177612305, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 9.775456886285932, "learning_rate": 9.26916221033868e-07, "logits/chosen": 0.039710883051157, "logits/rejected": 0.17652074992656708, "logps/chosen": -1.8773629665374756, "logps/rejected": -2.135971784591675, "loss": 0.7854, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8773629665374756, "rewards/margins": 0.25860869884490967, "rewards/rejected": -2.135971784591675, "sft_loss": 1.8785078525543213, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 17.399966824971457, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.20017535984516144, "logits/rejected": 0.2788700759410858, "logps/chosen": -1.9711978435516357, "logps/rejected": -2.333888530731201, "loss": 0.7413, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9711978435516357, "rewards/margins": 0.3626905083656311, "rewards/rejected": -2.333888530731201, "sft_loss": 1.9129832983016968, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 16.108980962918235, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.18143267929553986, "logits/rejected": 0.2672853469848633, "logps/chosen": -1.9471544027328491, "logps/rejected": -2.2187883853912354, "loss": 0.7997, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9471544027328491, "rewards/margins": 0.2716338634490967, "rewards/rejected": -2.2187883853912354, "sft_loss": 1.801098108291626, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 14.859693867200447, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.002810171339660883, "logits/rejected": 0.24968405067920685, "logps/chosen": -1.9894745349884033, "logps/rejected": -2.2667486667633057, "loss": 0.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9894745349884033, "rewards/margins": 0.27727416157722473, "rewards/rejected": -2.2667486667633057, "sft_loss": 1.834472894668579, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 13.266783239205989, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.11695842444896698, "logits/rejected": 0.1895637810230255, "logps/chosen": -2.2287280559539795, "logps/rejected": -2.49006986618042, "loss": 0.781, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.2287280559539795, "rewards/margins": 0.2613416314125061, "rewards/rejected": -2.49006986618042, "sft_loss": 2.1219325065612793, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 10.108061461389307, "learning_rate": 9.714795008912655e-07, "logits/chosen": 0.00664617121219635, "logits/rejected": 0.19402989745140076, "logps/chosen": -2.17495059967041, "logps/rejected": -2.560760974884033, "loss": 0.6977, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.17495059967041, "rewards/margins": 0.38581031560897827, "rewards/rejected": -2.560760974884033, "sft_loss": 2.0547683238983154, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 11.156872123357896, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.14837445318698883, "logits/rejected": 0.21960575878620148, "logps/chosen": -2.2893216609954834, "logps/rejected": -2.6165287494659424, "loss": 0.7158, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2893216609954834, "rewards/margins": 0.32720714807510376, "rewards/rejected": -2.6165287494659424, "sft_loss": 2.1706032752990723, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 21.327228435759405, "learning_rate": 9.893048128342244e-07, "logits/chosen": 0.0808831974864006, "logits/rejected": 0.19863127171993256, "logps/chosen": -2.471397876739502, "logps/rejected": -2.6428585052490234, "loss": 0.7911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.471397876739502, "rewards/margins": 0.171460822224617, "rewards/rejected": -2.6428585052490234, "sft_loss": 2.363668441772461, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 15.817975460786185, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.17479531466960907, "logits/rejected": 0.19065909087657928, "logps/chosen": -2.38142728805542, "logps/rejected": -2.700078248977661, "loss": 0.7215, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.38142728805542, "rewards/margins": 0.31865087151527405, "rewards/rejected": -2.700078248977661, "sft_loss": 2.4128060340881348, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 11.167026754067493, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.15115851163864136, "logits/rejected": 0.2050451785326004, "logps/chosen": -2.5232253074645996, "logps/rejected": -2.9136128425598145, "loss": 0.6758, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.5232253074645996, "rewards/margins": 0.39038750529289246, "rewards/rejected": -2.9136128425598145, "sft_loss": 2.4833340644836426, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 29.251751625509854, "learning_rate": 9.999921413906797e-07, "logits/chosen": 0.06527841836214066, "logits/rejected": 0.26166507601737976, "logps/chosen": -2.654595375061035, "logps/rejected": -3.0485172271728516, "loss": 0.6882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.654595375061035, "rewards/margins": 0.39392179250717163, "rewards/rejected": -3.0485172271728516, "sft_loss": 2.623623847961426, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 14.777746420060607, "learning_rate": 9.999809841765644e-07, "logits/chosen": 0.10745315253734589, "logits/rejected": 0.16344238817691803, "logps/chosen": -2.7796332836151123, "logps/rejected": -3.1552608013153076, "loss": 0.7132, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7796332836151123, "rewards/margins": 0.37562793493270874, "rewards/rejected": -3.1552608013153076, "sft_loss": 2.741098403930664, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 12.739144398411966, "learning_rate": 9.999649761447477e-07, "logits/chosen": 0.09595675766468048, "logits/rejected": 0.24499674141407013, "logps/chosen": -2.9956271648406982, "logps/rejected": -3.428938627243042, "loss": 0.6871, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9956271648406982, "rewards/margins": 0.43331179022789, "rewards/rejected": -3.428938627243042, "sft_loss": 2.928342819213867, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 17.255767939413705, "learning_rate": 9.999441174505398e-07, "logits/chosen": 0.07325030118227005, "logits/rejected": 0.1624840795993805, "logps/chosen": -3.4379382133483887, "logps/rejected": -3.6763598918914795, "loss": 0.79, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.4379382133483887, "rewards/margins": 0.23842184245586395, "rewards/rejected": -3.6763598918914795, "sft_loss": 3.286072254180908, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 30.110227167602943, "learning_rate": 9.999184082963116e-07, "logits/chosen": 0.10350818932056427, "logits/rejected": 0.2149147093296051, "logps/chosen": -3.244036912918091, "logps/rejected": -3.4989047050476074, "loss": 0.7593, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.244036912918091, "rewards/margins": 0.25486817955970764, "rewards/rejected": -3.4989047050476074, "sft_loss": 3.1437976360321045, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 18.945077633365436, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.1482018530368805, "logits/rejected": 0.2577352523803711, "logps/chosen": -2.873711585998535, "logps/rejected": -3.2073111534118652, "loss": 0.7128, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.873711585998535, "rewards/margins": 0.3336000144481659, "rewards/rejected": -3.2073111534118652, "sft_loss": 2.8837180137634277, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 9.958916715868252, "learning_rate": 9.99852439652573e-07, "logits/chosen": 0.12357846647500992, "logits/rejected": 0.2520459294319153, "logps/chosen": -2.982365131378174, "logps/rejected": -3.3198161125183105, "loss": 0.6921, "rewards/accuracies": 0.65625, "rewards/chosen": -2.982365131378174, "rewards/margins": 0.33745133876800537, "rewards/rejected": -3.3198161125183105, "sft_loss": 2.9125170707702637, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 16.38786490932435, "learning_rate": 9.998121808030904e-07, "logits/chosen": 0.08862508088350296, "logits/rejected": 0.17105832695960999, "logps/chosen": -3.2011170387268066, "logps/rejected": -3.5023789405822754, "loss": 0.753, "rewards/accuracies": 0.59375, "rewards/chosen": -3.2011170387268066, "rewards/margins": 0.3012619912624359, "rewards/rejected": -3.5023789405822754, "sft_loss": 3.0842275619506836, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 34.784158500457494, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.14439894258975983, "logits/rejected": 0.30359843373298645, "logps/chosen": -3.0514113903045654, "logps/rejected": -3.3972702026367188, "loss": 0.7366, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.0514113903045654, "rewards/margins": 0.3458591103553772, "rewards/rejected": -3.3972702026367188, "sft_loss": 2.9700963497161865, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 11.02894563208569, "learning_rate": 9.99717116001853e-07, "logits/chosen": 0.07742391526699066, "logits/rejected": 0.1711132973432541, "logps/chosen": -3.041868209838867, "logps/rejected": -3.5872230529785156, "loss": 0.6433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.041868209838867, "rewards/margins": 0.5453550219535828, "rewards/rejected": -3.5872230529785156, "sft_loss": 2.9712107181549072, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 13.435575037564124, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.17571452260017395, "logits/rejected": 0.22912061214447021, "logps/chosen": -3.235530376434326, "logps/rejected": -3.6874687671661377, "loss": 0.6783, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.235530376434326, "rewards/margins": 0.4519377648830414, "rewards/rejected": -3.6874687671661377, "sft_loss": 3.3161187171936035, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 11.963965881026747, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.16318285465240479, "logits/rejected": 0.28264936804771423, "logps/chosen": -3.1633148193359375, "logps/rejected": -3.6431198120117188, "loss": 0.6719, "rewards/accuracies": 0.65625, "rewards/chosen": -3.1633148193359375, "rewards/margins": 0.4798053205013275, "rewards/rejected": -3.6431198120117188, "sft_loss": 3.2683768272399902, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 18.982814005299204, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.1366763710975647, "logits/rejected": 0.2427402287721634, "logps/chosen": -3.2480270862579346, "logps/rejected": -3.759871244430542, "loss": 0.6599, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.2480270862579346, "rewards/margins": 0.5118443369865417, "rewards/rejected": -3.759871244430542, "sft_loss": 3.296314239501953, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 14.820255923961053, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.12495915591716766, "logits/rejected": 0.33405324816703796, "logps/chosen": -3.371495008468628, "logps/rejected": -3.818183183670044, "loss": 0.6938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.371495008468628, "rewards/margins": 0.44668784737586975, "rewards/rejected": -3.818183183670044, "sft_loss": 3.383021831512451, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 17.167666064407857, "learning_rate": 9.993946196179912e-07, "logits/chosen": 0.02807859145104885, "logits/rejected": 0.20716509222984314, "logps/chosen": -3.329270601272583, "logps/rejected": -3.7738585472106934, "loss": 0.6796, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.329270601272583, "rewards/margins": 0.4445876479148865, "rewards/rejected": -3.7738585472106934, "sft_loss": 3.4169044494628906, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 12.591955459885122, "learning_rate": 9.993155822166455e-07, "logits/chosen": 0.0873405784368515, "logits/rejected": 0.1538153886795044, "logps/chosen": -3.218961715698242, "logps/rejected": -3.6624724864959717, "loss": 0.6938, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.218961715698242, "rewards/margins": 0.44351091980934143, "rewards/rejected": -3.6624724864959717, "sft_loss": 3.1909327507019043, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 17.348198575764926, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.12633930146694183, "logits/rejected": 0.23183250427246094, "logps/chosen": -3.4124233722686768, "logps/rejected": -3.8890254497528076, "loss": 0.6747, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.4124233722686768, "rewards/margins": 0.476602166891098, "rewards/rejected": -3.8890254497528076, "sft_loss": 3.488661289215088, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 19.495944078784344, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.15044409036636353, "logits/rejected": 0.1703573614358902, "logps/chosen": -3.2523608207702637, "logps/rejected": -3.721022844314575, "loss": 0.7206, "rewards/accuracies": 0.625, "rewards/chosen": -3.2523608207702637, "rewards/margins": 0.468661367893219, "rewards/rejected": -3.721022844314575, "sft_loss": 3.3398749828338623, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 12.739827332278542, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.10364080965518951, "logits/rejected": 0.20852389931678772, "logps/chosen": -3.156210422515869, "logps/rejected": -3.4665565490722656, "loss": 0.7243, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.156210422515869, "rewards/margins": 0.3103458881378174, "rewards/rejected": -3.4665565490722656, "sft_loss": 3.1437571048736572, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 9.131578117600116, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.09303895384073257, "logits/rejected": 0.2246369570493698, "logps/chosen": -3.0079312324523926, "logps/rejected": -3.3630542755126953, "loss": 0.7239, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.0079312324523926, "rewards/margins": 0.35512271523475647, "rewards/rejected": -3.3630542755126953, "sft_loss": 3.066347599029541, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 12.124152976286277, "learning_rate": 9.988477467616445e-07, "logits/chosen": 0.08354990184307098, "logits/rejected": 0.24085800349712372, "logps/chosen": -2.965276002883911, "logps/rejected": -3.3744754791259766, "loss": 0.6552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.965276002883911, "rewards/margins": 0.40919971466064453, "rewards/rejected": -3.3744754791259766, "sft_loss": 3.086196184158325, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 14.467133514240121, "learning_rate": 9.987396563355205e-07, "logits/chosen": 0.09642884135246277, "logits/rejected": 0.16047967970371246, "logps/chosen": -2.9351863861083984, "logps/rejected": -3.4026923179626465, "loss": 0.6612, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9351863861083984, "rewards/margins": 0.46750617027282715, "rewards/rejected": -3.4026923179626465, "sft_loss": 2.9935317039489746, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 12.228933979475576, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.1422261893749237, "logits/rejected": 0.28388795256614685, "logps/chosen": -3.123826503753662, "logps/rejected": -3.486147403717041, "loss": 0.7477, "rewards/accuracies": 0.625, "rewards/chosen": -3.123826503753662, "rewards/margins": 0.3623208999633789, "rewards/rejected": -3.486147403717041, "sft_loss": 3.068068027496338, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 22.57125542560206, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.13599984347820282, "logits/rejected": 0.2769896388053894, "logps/chosen": -3.2404942512512207, "logps/rejected": -3.6959774494171143, "loss": 0.6779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2404942512512207, "rewards/margins": 0.45548272132873535, "rewards/rejected": -3.6959774494171143, "sft_loss": 3.2109158039093018, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 18.452385824632163, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.16817834973335266, "logits/rejected": 0.19235409796237946, "logps/chosen": -3.385051727294922, "logps/rejected": -3.7719883918762207, "loss": 0.7072, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.385051727294922, "rewards/margins": 0.38693660497665405, "rewards/rejected": -3.7719883918762207, "sft_loss": 3.4286434650421143, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 11.372268524897146, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.08407465368509293, "logits/rejected": 0.15698882937431335, "logps/chosen": -3.1884450912475586, "logps/rejected": -3.640734910964966, "loss": 0.656, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.1884450912475586, "rewards/margins": 0.4522898197174072, "rewards/rejected": -3.640734910964966, "sft_loss": 3.3043384552001953, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 21.43770166832855, "learning_rate": 9.981266452066553e-07, "logits/chosen": 0.021909546107053757, "logits/rejected": 0.1328798085451126, "logps/chosen": -3.5039210319519043, "logps/rejected": -3.8362011909484863, "loss": 0.6862, "rewards/accuracies": 0.625, "rewards/chosen": -3.5039210319519043, "rewards/margins": 0.3322799503803253, "rewards/rejected": -3.8362011909484863, "sft_loss": 3.4569506645202637, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 14.27115720520455, "learning_rate": 9.979895395076608e-07, "logits/chosen": 0.07725761830806732, "logits/rejected": 0.2512449622154236, "logps/chosen": -3.555420398712158, "logps/rejected": -4.093667507171631, "loss": 0.6392, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.555420398712158, "rewards/margins": 0.5382481813430786, "rewards/rejected": -4.093667507171631, "sft_loss": 3.611607789993286, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 18.789545823364513, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.10939677059650421, "logits/rejected": 0.20701321959495544, "logps/chosen": -3.553346633911133, "logps/rejected": -4.05139684677124, "loss": 0.6367, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.553346633911133, "rewards/margins": 0.4980502128601074, "rewards/rejected": -4.05139684677124, "sft_loss": 3.549403429031372, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 20.283066722727373, "learning_rate": 9.97700834996658e-07, "logits/chosen": 0.0789613127708435, "logits/rejected": 0.2348795384168625, "logps/chosen": -3.755260467529297, "logps/rejected": -4.174482822418213, "loss": 0.6705, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.755260467529297, "rewards/margins": 0.41922205686569214, "rewards/rejected": -4.174482822418213, "sft_loss": 3.7192587852478027, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 17.71024407775548, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.1549479067325592, "logits/rejected": 0.32185259461402893, "logps/chosen": -3.6697444915771484, "logps/rejected": -4.121180057525635, "loss": 0.6921, "rewards/accuracies": 0.65625, "rewards/chosen": -3.6697444915771484, "rewards/margins": 0.45143526792526245, "rewards/rejected": -4.121180057525635, "sft_loss": 3.7580153942108154, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 27.931643171716484, "learning_rate": 9.973928157497674e-07, "logits/chosen": 0.07944132387638092, "logits/rejected": 0.20254309475421906, "logps/chosen": -3.4026761054992676, "logps/rejected": -3.8807315826416016, "loss": 0.6469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4026761054992676, "rewards/margins": 0.4780558943748474, "rewards/rejected": -3.8807315826416016, "sft_loss": 3.532989978790283, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 14.385729673838856, "learning_rate": 9.972315668065927e-07, "logits/chosen": 0.07879441976547241, "logits/rejected": 0.19542153179645538, "logps/chosen": -3.5237643718719482, "logps/rejected": -3.905362606048584, "loss": 0.7068, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.5237643718719482, "rewards/margins": 0.38159847259521484, "rewards/rejected": -3.905362606048584, "sft_loss": 3.5806221961975098, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 13.641550923885305, "learning_rate": 9.97065493720576e-07, "logits/chosen": 0.0845077782869339, "logits/rejected": 0.1788623034954071, "logps/chosen": -3.2255218029022217, "logps/rejected": -3.5273890495300293, "loss": 0.7208, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.2255218029022217, "rewards/margins": 0.3018674850463867, "rewards/rejected": -3.5273890495300293, "sft_loss": 3.248542308807373, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 26.105825510166056, "learning_rate": 9.968945981029594e-07, "logits/chosen": 0.12168808281421661, "logits/rejected": 0.2733016908168793, "logps/chosen": -3.3159337043762207, "logps/rejected": -3.775067090988159, "loss": 0.6729, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.3159337043762207, "rewards/margins": 0.4591336250305176, "rewards/rejected": -3.775067090988159, "sft_loss": 3.2262046337127686, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 11.117385711267735, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.17880137264728546, "logits/rejected": 0.2560071051120758, "logps/chosen": -3.331556797027588, "logps/rejected": -3.8179938793182373, "loss": 0.686, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.331556797027588, "rewards/margins": 0.4864373803138733, "rewards/rejected": -3.8179938793182373, "sft_loss": 3.3403239250183105, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 12.031202800814468, "learning_rate": 9.965383459518179e-07, "logits/chosen": 0.08572041988372803, "logits/rejected": 0.23330307006835938, "logps/chosen": -3.3558125495910645, "logps/rejected": -3.828129291534424, "loss": 0.6601, "rewards/accuracies": 0.6875, "rewards/chosen": -3.3558125495910645, "rewards/margins": 0.4723171293735504, "rewards/rejected": -3.828129291534424, "sft_loss": 3.398761749267578, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 13.472832754646447, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.15691408514976501, "logits/rejected": 0.2690011262893677, "logps/chosen": -3.389195203781128, "logps/rejected": -3.7743306159973145, "loss": 0.7096, "rewards/accuracies": 0.65625, "rewards/chosen": -3.389195203781128, "rewards/margins": 0.38513538241386414, "rewards/rejected": -3.7743306159973145, "sft_loss": 3.376270294189453, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 11.454275325872517, "learning_rate": 9.961628241785746e-07, "logits/chosen": 0.07620684802532196, "logits/rejected": 0.15791398286819458, "logps/chosen": -3.3654396533966064, "logps/rejected": -3.7810680866241455, "loss": 0.6903, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.3654396533966064, "rewards/margins": 0.4156281352043152, "rewards/rejected": -3.7810680866241455, "sft_loss": 3.4098827838897705, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 11.654710079787488, "learning_rate": 9.959678417085998e-07, "logits/chosen": 0.0927480012178421, "logits/rejected": 0.17140790820121765, "logps/chosen": -3.2945895195007324, "logps/rejected": -3.6890273094177246, "loss": 0.6735, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.2945895195007324, "rewards/margins": 0.3944377601146698, "rewards/rejected": -3.6890273094177246, "sft_loss": 3.3298301696777344, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 13.564142616226638, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.20415227115154266, "logits/rejected": 0.3103726804256439, "logps/chosen": -3.2019429206848145, "logps/rejected": -3.724005937576294, "loss": 0.6451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2019429206848145, "rewards/margins": 0.5220627784729004, "rewards/rejected": -3.724005937576294, "sft_loss": 3.1217987537384033, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 8.788907244047499, "learning_rate": 9.95563443060529e-07, "logits/chosen": 0.05300183221697807, "logits/rejected": 0.19994166493415833, "logps/chosen": -3.236523389816284, "logps/rejected": -3.5783581733703613, "loss": 0.7294, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.236523389816284, "rewards/margins": 0.3418344557285309, "rewards/rejected": -3.5783581733703613, "sft_loss": 3.2097830772399902, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 13.11357065397142, "learning_rate": 9.95354030805911e-07, "logits/chosen": 0.0014542639255523682, "logits/rejected": 0.12298593670129776, "logps/chosen": -3.181215286254883, "logps/rejected": -3.5314762592315674, "loss": 0.6809, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.181215286254883, "rewards/margins": 0.3502606749534607, "rewards/rejected": -3.5314762592315674, "sft_loss": 3.2443747520446777, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 11.935322142601875, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.12124574184417725, "logits/rejected": 0.22774991393089294, "logps/chosen": -3.2135891914367676, "logps/rejected": -3.64837384223938, "loss": 0.6878, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.2135891914367676, "rewards/margins": 0.43478527665138245, "rewards/rejected": -3.64837384223938, "sft_loss": 3.2128987312316895, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 11.666360539461333, "learning_rate": 9.94920790594082e-07, "logits/chosen": 0.07049150764942169, "logits/rejected": 0.1756301373243332, "logps/chosen": -3.114420175552368, "logps/rejected": -3.6367697715759277, "loss": 0.628, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.114420175552368, "rewards/margins": 0.5223496556282043, "rewards/rejected": -3.6367697715759277, "sft_loss": 3.0744128227233887, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 11.228129348682602, "learning_rate": 9.946969668401696e-07, "logits/chosen": 0.05934765189886093, "logits/rejected": 0.21110644936561584, "logps/chosen": -3.3124725818634033, "logps/rejected": -3.8809173107147217, "loss": 0.642, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.3124725818634033, "rewards/margins": 0.5684444904327393, "rewards/rejected": -3.8809173107147217, "sft_loss": 3.352839708328247, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 12.42314167315067, "learning_rate": 9.944683435341155e-07, "logits/chosen": 0.04263642802834511, "logits/rejected": 0.10475891828536987, "logps/chosen": -3.370643138885498, "logps/rejected": -3.90073823928833, "loss": 0.6224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.370643138885498, "rewards/margins": 0.5300950407981873, "rewards/rejected": -3.90073823928833, "sft_loss": 3.352079391479492, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.30916711688041687, "eval_logits/rejected": 0.3857182264328003, "eval_logps/chosen": -3.476654529571533, "eval_logps/rejected": -4.0245490074157715, "eval_loss": 0.628761887550354, "eval_rewards/accuracies": 0.6869435906410217, "eval_rewards/chosen": -3.476654529571533, "eval_rewards/margins": 0.5478941798210144, "eval_rewards/rejected": -4.0245490074157715, "eval_runtime": 42.9456, "eval_samples_per_second": 31.319, "eval_sft_loss": 3.4512197971343994, "eval_steps_per_second": 7.847, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 15.285719140308519, "learning_rate": 9.942349228940236e-07, "logits/chosen": 0.029286552220582962, "logits/rejected": 0.15155065059661865, "logps/chosen": -3.489750623703003, "logps/rejected": -4.151576042175293, "loss": 0.5868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.489750623703003, "rewards/margins": 0.6618257164955139, "rewards/rejected": -4.151576042175293, "sft_loss": 3.473094940185547, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 16.26548162705179, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.09252439439296722, "logits/rejected": 0.16039565205574036, "logps/chosen": -3.74686861038208, "logps/rejected": -4.148434638977051, "loss": 0.6825, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.74686861038208, "rewards/margins": 0.40156635642051697, "rewards/rejected": -4.148434638977051, "sft_loss": 3.8165481090545654, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 12.521564977020468, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.10305075347423553, "logits/rejected": 0.20441201329231262, "logps/chosen": -3.5569655895233154, "logps/rejected": -4.217929840087891, "loss": 0.6361, "rewards/accuracies": 0.65625, "rewards/chosen": -3.5569655895233154, "rewards/margins": 0.6609641313552856, "rewards/rejected": -4.217929840087891, "sft_loss": 3.6337389945983887, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 16.53507037989396, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.12354373931884766, "logits/rejected": 0.16019897162914276, "logps/chosen": -3.6725382804870605, "logps/rejected": -4.262721061706543, "loss": 0.6498, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6725382804870605, "rewards/margins": 0.5901829600334167, "rewards/rejected": -4.262721061706543, "sft_loss": 3.66874623298645, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 20.293552669583075, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.10642262548208237, "logits/rejected": 0.2096632421016693, "logps/chosen": -3.6120095252990723, "logps/rejected": -4.203782081604004, "loss": 0.6397, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6120095252990723, "rewards/margins": 0.591772198677063, "rewards/rejected": -4.203782081604004, "sft_loss": 3.7229583263397217, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 14.193848823920673, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.12132594734430313, "logits/rejected": 0.25849542021751404, "logps/chosen": -3.58459210395813, "logps/rejected": -4.052337646484375, "loss": 0.6578, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.58459210395813, "rewards/margins": 0.46774548292160034, "rewards/rejected": -4.052337646484375, "sft_loss": 3.5832104682922363, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 17.16303931069294, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.08140738308429718, "logits/rejected": 0.18814049661159515, "logps/chosen": -3.5765693187713623, "logps/rejected": -4.066720485687256, "loss": 0.6598, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5765693187713623, "rewards/margins": 0.4901511073112488, "rewards/rejected": -4.066720485687256, "sft_loss": 3.7092864513397217, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 14.101710559555091, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.025357728824019432, "logits/rejected": 0.19356423616409302, "logps/chosen": -3.731132984161377, "logps/rejected": -4.178229808807373, "loss": 0.7033, "rewards/accuracies": 0.65625, "rewards/chosen": -3.731132984161377, "rewards/margins": 0.4470970034599304, "rewards/rejected": -4.178229808807373, "sft_loss": 3.84734845161438, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 11.084723277082443, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.11301273107528687, "logits/rejected": 0.16978143155574799, "logps/chosen": -3.6617302894592285, "logps/rejected": -4.03924036026001, "loss": 0.7095, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.6617302894592285, "rewards/margins": 0.3775102496147156, "rewards/rejected": -4.03924036026001, "sft_loss": 3.7795937061309814, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 12.211834887067432, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.0779627338051796, "logits/rejected": 0.14930522441864014, "logps/chosen": -3.7439913749694824, "logps/rejected": -4.1387457847595215, "loss": 0.6733, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.7439913749694824, "rewards/margins": 0.394754558801651, "rewards/rejected": -4.1387457847595215, "sft_loss": 3.746532917022705, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 31.70471938835657, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.0450650155544281, "logits/rejected": 0.16089345514774323, "logps/chosen": -3.6246726512908936, "logps/rejected": -4.134470462799072, "loss": 0.6901, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.6246726512908936, "rewards/margins": 0.5097973346710205, "rewards/rejected": -4.134470462799072, "sft_loss": 3.6062686443328857, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 8.598859788111836, "learning_rate": 9.913513527293234e-07, "logits/chosen": 0.006855173502117395, "logits/rejected": 0.14971241354942322, "logps/chosen": -3.5897128582000732, "logps/rejected": -4.210363388061523, "loss": 0.6252, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5897128582000732, "rewards/margins": 0.6206499934196472, "rewards/rejected": -4.210363388061523, "sft_loss": 3.5778141021728516, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 37.65213597743668, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.08587872982025146, "logits/rejected": 0.16899526119232178, "logps/chosen": -3.4890365600585938, "logps/rejected": -4.047313690185547, "loss": 0.6675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4890365600585938, "rewards/margins": 0.5582772493362427, "rewards/rejected": -4.047313690185547, "sft_loss": 3.3437390327453613, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 11.543128601737179, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.027017872780561447, "logits/rejected": 0.1710745394229889, "logps/chosen": -3.376879930496216, "logps/rejected": -3.9517147541046143, "loss": 0.6164, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.376879930496216, "rewards/margins": 0.5748350024223328, "rewards/rejected": -3.9517147541046143, "sft_loss": 3.3898417949676514, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 15.187532658706145, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.05295708775520325, "logits/rejected": 0.1010611280798912, "logps/chosen": -3.346054792404175, "logps/rejected": -3.7483839988708496, "loss": 0.7253, "rewards/accuracies": 0.625, "rewards/chosen": -3.346054792404175, "rewards/margins": 0.40232938528060913, "rewards/rejected": -3.7483839988708496, "sft_loss": 3.3345093727111816, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 8.845824171927392, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.007680124137550592, "logits/rejected": 0.1661723554134369, "logps/chosen": -3.394366502761841, "logps/rejected": -3.9658546447753906, "loss": 0.6212, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.394366502761841, "rewards/margins": 0.5714882612228394, "rewards/rejected": -3.9658546447753906, "sft_loss": 3.316911220550537, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 16.21241632674145, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.06916798651218414, "logits/rejected": 0.007078188471496105, "logps/chosen": -3.4882004261016846, "logps/rejected": -4.050824165344238, "loss": 0.605, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.4882004261016846, "rewards/margins": 0.5626236200332642, "rewards/rejected": -4.050824165344238, "sft_loss": 3.5420258045196533, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 12.373306544762638, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.14452961087226868, "logits/rejected": -0.04519026353955269, "logps/chosen": -3.458292007446289, "logps/rejected": -3.9274654388427734, "loss": 0.6591, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.458292007446289, "rewards/margins": 0.46917352080345154, "rewards/rejected": -3.9274654388427734, "sft_loss": 3.5802388191223145, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 13.304136856842742, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.01676345430314541, "logits/rejected": 0.08211679756641388, "logps/chosen": -3.4115653038024902, "logps/rejected": -3.8524603843688965, "loss": 0.6597, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.4115653038024902, "rewards/margins": 0.4408953785896301, "rewards/rejected": -3.8524603843688965, "sft_loss": 3.483126401901245, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 16.428604159117846, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.055256836116313934, "logits/rejected": 0.018062064424157143, "logps/chosen": -3.339559555053711, "logps/rejected": -3.853358030319214, "loss": 0.6395, "rewards/accuracies": 0.71875, "rewards/chosen": -3.339559555053711, "rewards/margins": 0.5137983560562134, "rewards/rejected": -3.853358030319214, "sft_loss": 3.397245407104492, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 14.82089637749487, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.007930848747491837, "logits/rejected": 0.14382946491241455, "logps/chosen": -3.4519202709198, "logps/rejected": -4.07681941986084, "loss": 0.6226, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4519202709198, "rewards/margins": 0.62489914894104, "rewards/rejected": -4.07681941986084, "sft_loss": 3.538830280303955, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 13.842999888489807, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.04674383997917175, "logits/rejected": 0.027312126010656357, "logps/chosen": -3.600853681564331, "logps/rejected": -4.066799640655518, "loss": 0.6763, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.600853681564331, "rewards/margins": 0.4659459590911865, "rewards/rejected": -4.066799640655518, "sft_loss": 3.625293731689453, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 13.496629124081092, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.03229089826345444, "logits/rejected": 0.11424863338470459, "logps/chosen": -3.6046853065490723, "logps/rejected": -4.177473068237305, "loss": 0.6267, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.6046853065490723, "rewards/margins": 0.572787880897522, "rewards/rejected": -4.177473068237305, "sft_loss": 3.654684066772461, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 14.754415801413213, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.013253210112452507, "logits/rejected": 0.14170649647712708, "logps/chosen": -3.6022396087646484, "logps/rejected": -4.2444329261779785, "loss": 0.6264, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.6022396087646484, "rewards/margins": 0.6421931982040405, "rewards/rejected": -4.2444329261779785, "sft_loss": 3.753528594970703, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 16.88476581071326, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.02223074436187744, "logits/rejected": 0.0964256003499031, "logps/chosen": -3.945333957672119, "logps/rejected": -4.436131954193115, "loss": 0.6546, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.945333957672119, "rewards/margins": 0.49079805612564087, "rewards/rejected": -4.436131954193115, "sft_loss": 3.862926483154297, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 20.326567899701928, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.028742101043462753, "logits/rejected": 0.03769497200846672, "logps/chosen": -3.7966148853302, "logps/rejected": -4.435694694519043, "loss": 0.6393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7966148853302, "rewards/margins": 0.6390798687934875, "rewards/rejected": -4.435694694519043, "sft_loss": 3.8229899406433105, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 13.99872257148215, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.010284209623932838, "logits/rejected": 0.044766783714294434, "logps/chosen": -3.979724884033203, "logps/rejected": -4.445012092590332, "loss": 0.6638, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.979724884033203, "rewards/margins": 0.4652870297431946, "rewards/rejected": -4.445012092590332, "sft_loss": 4.037301063537598, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 16.86367004869719, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.012009387835860252, "logits/rejected": 0.061289478093385696, "logps/chosen": -3.8050410747528076, "logps/rejected": -4.29853630065918, "loss": 0.6353, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.8050410747528076, "rewards/margins": 0.49349552392959595, "rewards/rejected": -4.29853630065918, "sft_loss": 3.802018642425537, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 13.932891382776969, "learning_rate": 9.857618932900502e-07, "logits/chosen": 0.0036790785379707813, "logits/rejected": 0.10150575637817383, "logps/chosen": -3.826122283935547, "logps/rejected": -4.335418701171875, "loss": 0.6438, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.826122283935547, "rewards/margins": 0.5092965364456177, "rewards/rejected": -4.335418701171875, "sft_loss": 3.819323778152466, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 13.970445556471764, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.023456787690520287, "logits/rejected": 0.14656312763690948, "logps/chosen": -3.714625835418701, "logps/rejected": -4.2214674949646, "loss": 0.6547, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.714625835418701, "rewards/margins": 0.5068413615226746, "rewards/rejected": -4.2214674949646, "sft_loss": 3.676867961883545, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 13.66631939092454, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.08379775285720825, "logits/rejected": 0.22799447178840637, "logps/chosen": -3.87310791015625, "logps/rejected": -4.435972690582275, "loss": 0.6092, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.87310791015625, "rewards/margins": 0.5628647804260254, "rewards/rejected": -4.435972690582275, "sft_loss": 3.9427828788757324, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 16.58644478757467, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.08455801755189896, "logits/rejected": 0.18910866975784302, "logps/chosen": -3.9857611656188965, "logps/rejected": -4.546139717102051, "loss": 0.6454, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.9857611656188965, "rewards/margins": 0.5603781342506409, "rewards/rejected": -4.546139717102051, "sft_loss": 4.061606407165527, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 13.756191723689144, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.09445828199386597, "logits/rejected": 0.11051924526691437, "logps/chosen": -4.213374137878418, "logps/rejected": -4.733959197998047, "loss": 0.694, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.213374137878418, "rewards/margins": 0.520584762096405, "rewards/rejected": -4.733959197998047, "sft_loss": 4.2486796379089355, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 13.816243024663285, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.09202511608600616, "logits/rejected": 0.09299103915691376, "logps/chosen": -3.9934258460998535, "logps/rejected": -4.433906078338623, "loss": 0.6682, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.9934258460998535, "rewards/margins": 0.4404802918434143, "rewards/rejected": -4.433906078338623, "sft_loss": 4.066201686859131, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 10.847829039403418, "learning_rate": 9.834631080014457e-07, "logits/chosen": 0.015875589102506638, "logits/rejected": 0.1718440055847168, "logps/chosen": -3.8322250843048096, "logps/rejected": -4.449212551116943, "loss": 0.5865, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.8322250843048096, "rewards/margins": 0.6169876456260681, "rewards/rejected": -4.449212551116943, "sft_loss": 3.9336464405059814, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 20.731197033134084, "learning_rate": 9.830635380734312e-07, "logits/chosen": 0.017926190048456192, "logits/rejected": 0.14910198748111725, "logps/chosen": -3.856818675994873, "logps/rejected": -4.3431854248046875, "loss": 0.6517, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.856818675994873, "rewards/margins": 0.4863665699958801, "rewards/rejected": -4.3431854248046875, "sft_loss": 3.9083619117736816, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 12.502144291648296, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.09634266048669815, "logits/rejected": 0.2468501776456833, "logps/chosen": -3.7185311317443848, "logps/rejected": -4.257325172424316, "loss": 0.6265, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.7185311317443848, "rewards/margins": 0.5387943983078003, "rewards/rejected": -4.257325172424316, "sft_loss": 3.7992255687713623, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 10.0780790134523, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.17207573354244232, "logits/rejected": 0.18692907691001892, "logps/chosen": -3.6508517265319824, "logps/rejected": -4.257552146911621, "loss": 0.6139, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.6508517265319824, "rewards/margins": 0.6067003011703491, "rewards/rejected": -4.257552146911621, "sft_loss": 3.8181662559509277, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 13.754268295572302, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.15358421206474304, "logits/rejected": 0.20071625709533691, "logps/chosen": -3.7733490467071533, "logps/rejected": -4.283802032470703, "loss": 0.6637, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.7733490467071533, "rewards/margins": 0.5104531049728394, "rewards/rejected": -4.283802032470703, "sft_loss": 3.8505585193634033, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 14.788821967814014, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.1338832527399063, "logits/rejected": 0.23015956580638885, "logps/chosen": -4.02156925201416, "logps/rejected": -4.630791664123535, "loss": 0.6083, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.02156925201416, "rewards/margins": 0.6092226505279541, "rewards/rejected": -4.630791664123535, "sft_loss": 4.029107093811035, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 19.007935191666654, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.12174554914236069, "logits/rejected": 0.23447482287883759, "logps/chosen": -4.086304664611816, "logps/rejected": -4.584083557128906, "loss": 0.6822, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.086304664611816, "rewards/margins": 0.49777883291244507, "rewards/rejected": -4.584083557128906, "sft_loss": 4.131983757019043, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 12.754319708012039, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.12806352972984314, "logits/rejected": 0.26299232244491577, "logps/chosen": -3.8444855213165283, "logps/rejected": -4.5119428634643555, "loss": 0.621, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.8444855213165283, "rewards/margins": 0.6674574017524719, "rewards/rejected": -4.5119428634643555, "sft_loss": 3.909728527069092, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 11.84982709117696, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.1154123991727829, "logits/rejected": 0.22723989188671112, "logps/chosen": -3.721278429031372, "logps/rejected": -4.259707927703857, "loss": 0.6454, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.721278429031372, "rewards/margins": 0.5384299159049988, "rewards/rejected": -4.259707927703857, "sft_loss": 3.7684149742126465, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 16.10253215157953, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.09676255285739899, "logits/rejected": 0.21589374542236328, "logps/chosen": -3.759204864501953, "logps/rejected": -4.318037986755371, "loss": 0.622, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.759204864501953, "rewards/margins": 0.5588343739509583, "rewards/rejected": -4.318037986755371, "sft_loss": 3.8170742988586426, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 16.177105588352344, "learning_rate": 9.792569880987724e-07, "logits/chosen": 0.04962032288312912, "logits/rejected": 0.1333126276731491, "logps/chosen": -3.7688231468200684, "logps/rejected": -4.422165393829346, "loss": 0.6031, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.7688231468200684, "rewards/margins": 0.6533424258232117, "rewards/rejected": -4.422165393829346, "sft_loss": 3.7961933612823486, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 20.099779652750147, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.10792167484760284, "logits/rejected": 0.17767611145973206, "logps/chosen": -3.7205605506896973, "logps/rejected": -4.1855549812316895, "loss": 0.6964, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.7205605506896973, "rewards/margins": 0.464994341135025, "rewards/rejected": -4.1855549812316895, "sft_loss": 3.749657392501831, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 11.803497746097289, "learning_rate": 9.783598330038924e-07, "logits/chosen": 0.08459456264972687, "logits/rejected": 0.16978034377098083, "logps/chosen": -3.7540791034698486, "logps/rejected": -4.233614444732666, "loss": 0.6499, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.7540791034698486, "rewards/margins": 0.47953516244888306, "rewards/rejected": -4.233614444732666, "sft_loss": 3.757070541381836, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 16.181353128715678, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.088498555123806, "logits/rejected": 0.21762952208518982, "logps/chosen": -3.5868847370147705, "logps/rejected": -4.343627452850342, "loss": 0.5831, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.5868847370147705, "rewards/margins": 0.7567430734634399, "rewards/rejected": -4.343627452850342, "sft_loss": 3.650301456451416, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 12.80569022038835, "learning_rate": 9.774441137572487e-07, "logits/chosen": 0.022781116887927055, "logits/rejected": 0.12793084979057312, "logps/chosen": -3.790767192840576, "logps/rejected": -4.484066486358643, "loss": 0.5885, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.790767192840576, "rewards/margins": 0.6932995915412903, "rewards/rejected": -4.484066486358643, "sft_loss": 3.834791660308838, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 16.985779571284176, "learning_rate": 9.76979303654274e-07, "logits/chosen": 0.011258067563176155, "logits/rejected": 0.09075259417295456, "logps/chosen": -4.021671295166016, "logps/rejected": -4.701707363128662, "loss": 0.6082, "rewards/accuracies": 0.71875, "rewards/chosen": -4.021671295166016, "rewards/margins": 0.6800357103347778, "rewards/rejected": -4.701707363128662, "sft_loss": 4.067013740539551, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 17.764033387407114, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.05323363095521927, "logits/rejected": 0.10457701981067657, "logps/chosen": -4.006975173950195, "logps/rejected": -4.602959156036377, "loss": 0.6252, "rewards/accuracies": 0.71875, "rewards/chosen": -4.006975173950195, "rewards/margins": 0.5959838628768921, "rewards/rejected": -4.602959156036377, "sft_loss": 4.014351844787598, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 21.003003227643912, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.15413573384284973, "logits/rejected": 0.2889839708805084, "logps/chosen": -4.063313961029053, "logps/rejected": -4.599331855773926, "loss": 0.6403, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.063313961029053, "rewards/margins": 0.5360177755355835, "rewards/rejected": -4.599331855773926, "sft_loss": 4.007753849029541, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 14.649343042640266, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.16708219051361084, "logits/rejected": 0.27574026584625244, "logps/chosen": -3.8334312438964844, "logps/rejected": -4.484501838684082, "loss": 0.6036, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8334312438964844, "rewards/margins": 0.6510700583457947, "rewards/rejected": -4.484501838684082, "sft_loss": 3.9353995323181152, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 11.955416009251634, "learning_rate": 9.750738324585097e-07, "logits/chosen": 0.06077583506703377, "logits/rejected": 0.24156561493873596, "logps/chosen": -3.7830872535705566, "logps/rejected": -4.326009273529053, "loss": 0.6281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7830872535705566, "rewards/margins": 0.5429221391677856, "rewards/rejected": -4.326009273529053, "sft_loss": 3.7620186805725098, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 9.182424894003129, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.13045711815357208, "logits/rejected": 0.22782549262046814, "logps/chosen": -3.610170841217041, "logps/rejected": -4.254181861877441, "loss": 0.6351, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.610170841217041, "rewards/margins": 0.6440110206604004, "rewards/rejected": -4.254181861877441, "sft_loss": 3.6286041736602783, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 12.76745223701187, "learning_rate": 9.740934232511892e-07, "logits/chosen": 0.013227030634880066, "logits/rejected": 0.09067533910274506, "logps/chosen": -3.8230583667755127, "logps/rejected": -4.390995025634766, "loss": 0.6306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.8230583667755127, "rewards/margins": 0.5679364204406738, "rewards/rejected": -4.390995025634766, "sft_loss": 3.8801512718200684, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 14.137757785460963, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.10038147866725922, "logits/rejected": 0.23114939033985138, "logps/chosen": -3.847576856613159, "logps/rejected": -4.33864164352417, "loss": 0.6556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.847576856613159, "rewards/margins": 0.4910648465156555, "rewards/rejected": -4.33864164352417, "sft_loss": 3.834911823272705, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 13.201256161784015, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.13253948092460632, "logits/rejected": 0.19316302239894867, "logps/chosen": -3.913498640060425, "logps/rejected": -4.428316116333008, "loss": 0.6718, "rewards/accuracies": 0.65625, "rewards/chosen": -3.913498640060425, "rewards/margins": 0.5148173570632935, "rewards/rejected": -4.428316116333008, "sft_loss": 3.9783337116241455, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 13.821285355110062, "learning_rate": 9.725883241855117e-07, "logits/chosen": 0.04290536046028137, "logits/rejected": 0.16766893863677979, "logps/chosen": -4.017451286315918, "logps/rejected": -4.656888008117676, "loss": 0.6224, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.017451286315918, "rewards/margins": 0.6394359469413757, "rewards/rejected": -4.656888008117676, "sft_loss": 4.158010482788086, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 13.869053395478524, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.10910507291555405, "logits/rejected": 0.21823792159557343, "logps/chosen": -3.9000885486602783, "logps/rejected": -4.603386878967285, "loss": 0.6111, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.9000885486602783, "rewards/margins": 0.703298807144165, "rewards/rejected": -4.603386878967285, "sft_loss": 3.998074769973755, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 14.52641753953559, "learning_rate": 9.715619914258624e-07, "logits/chosen": 0.04893391951918602, "logits/rejected": 0.1237044706940651, "logps/chosen": -4.041838645935059, "logps/rejected": -4.510806083679199, "loss": 0.6814, "rewards/accuracies": 0.65625, "rewards/chosen": -4.041838645935059, "rewards/margins": 0.4689674377441406, "rewards/rejected": -4.510806083679199, "sft_loss": 3.9709079265594482, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 18.355595445420242, "learning_rate": 9.710419599007937e-07, "logits/chosen": 0.07414282858371735, "logits/rejected": 0.19301927089691162, "logps/chosen": -3.9801788330078125, "logps/rejected": -4.495860576629639, "loss": 0.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.9801788330078125, "rewards/margins": 0.5156816244125366, "rewards/rejected": -4.495860576629639, "sft_loss": 4.034372329711914, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 22.088214219073393, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.0930425375699997, "logits/rejected": 0.22560659050941467, "logps/chosen": -3.8769278526306152, "logps/rejected": -4.395509243011475, "loss": 0.6778, "rewards/accuracies": 0.65625, "rewards/chosen": -3.8769278526306152, "rewards/margins": 0.518581211566925, "rewards/rejected": -4.395509243011475, "sft_loss": 3.9343044757843018, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 11.505160450043082, "learning_rate": 9.699881917868609e-07, "logits/chosen": 0.007545255124568939, "logits/rejected": 0.0969097688794136, "logps/chosen": -3.6723945140838623, "logps/rejected": -4.290095806121826, "loss": 0.6219, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.6723945140838623, "rewards/margins": 0.6177011132240295, "rewards/rejected": -4.290095806121826, "sft_loss": 3.7624449729919434, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 14.851585818047047, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.007628323044627905, "logits/rejected": 0.13381069898605347, "logps/chosen": -3.7548370361328125, "logps/rejected": -4.348697185516357, "loss": 0.6055, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.7548370361328125, "rewards/margins": 0.5938605666160583, "rewards/rejected": -4.348697185516357, "sft_loss": 3.7085232734680176, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 30.9003565366131, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.12496396154165268, "logits/rejected": 0.18439385294914246, "logps/chosen": -3.555870532989502, "logps/rejected": -4.09328031539917, "loss": 0.6437, "rewards/accuracies": 0.6875, "rewards/chosen": -3.555870532989502, "rewards/margins": 0.5374095439910889, "rewards/rejected": -4.09328031539917, "sft_loss": 3.5397467613220215, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 13.645012557766098, "learning_rate": 9.683733539658138e-07, "logits/chosen": 0.030058467760682106, "logits/rejected": 0.16403785347938538, "logps/chosen": -3.690871000289917, "logps/rejected": -4.289731979370117, "loss": 0.6233, "rewards/accuracies": 0.71875, "rewards/chosen": -3.690871000289917, "rewards/margins": 0.5988608598709106, "rewards/rejected": -4.289731979370117, "sft_loss": 3.603515148162842, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 14.376364101322325, "learning_rate": 9.678259793641178e-07, "logits/chosen": 0.06398359686136246, "logits/rejected": 0.09742051362991333, "logps/chosen": -3.7032439708709717, "logps/rejected": -4.105428218841553, "loss": 0.6627, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.7032439708709717, "rewards/margins": 0.40218430757522583, "rewards/rejected": -4.105428218841553, "sft_loss": 3.766770124435425, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 13.032289343771657, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.04188089817762375, "logits/rejected": 0.06649746745824814, "logps/chosen": -3.700268507003784, "logps/rejected": -4.40798807144165, "loss": 0.6147, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.700268507003784, "rewards/margins": 0.7077193260192871, "rewards/rejected": -4.40798807144165, "sft_loss": 3.8044610023498535, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 15.503107221557544, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.032415203750133514, "logits/rejected": 0.07029138505458832, "logps/chosen": -3.8163063526153564, "logps/rejected": -4.493452548980713, "loss": 0.5884, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8163063526153564, "rewards/margins": 0.6771461963653564, "rewards/rejected": -4.493452548980713, "sft_loss": 3.8472914695739746, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 10.769101243354882, "learning_rate": 9.661566439234592e-07, "logits/chosen": 0.024549299851059914, "logits/rejected": 0.11254332214593887, "logps/chosen": -3.799016237258911, "logps/rejected": -4.320359706878662, "loss": 0.64, "rewards/accuracies": 0.65625, "rewards/chosen": -3.799016237258911, "rewards/margins": 0.5213435888290405, "rewards/rejected": -4.320359706878662, "sft_loss": 3.937115430831909, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 12.10636480168215, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.07037197053432465, "logits/rejected": 0.15609976649284363, "logps/chosen": -3.7353408336639404, "logps/rejected": -4.422998905181885, "loss": 0.5857, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.7353408336639404, "rewards/margins": 0.6876574158668518, "rewards/rejected": -4.422998905181885, "sft_loss": 3.93669056892395, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 13.191549758318944, "learning_rate": 9.650211313628636e-07, "logits/chosen": 0.0054580033756792545, "logits/rejected": 0.08709286153316498, "logps/chosen": -3.783989429473877, "logps/rejected": -4.179694652557373, "loss": 0.6961, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.783989429473877, "rewards/margins": 0.3957056999206543, "rewards/rejected": -4.179694652557373, "sft_loss": 3.9875073432922363, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 13.307987308972477, "learning_rate": 9.644466048618386e-07, "logits/chosen": 0.01374664343893528, "logits/rejected": 0.1347121298313141, "logps/chosen": -3.83042573928833, "logps/rejected": -4.345409870147705, "loss": 0.6669, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.83042573928833, "rewards/margins": 0.5149842500686646, "rewards/rejected": -4.345409870147705, "sft_loss": 3.802509307861328, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 12.39337562882268, "learning_rate": 9.63867572297816e-07, "logits/chosen": 0.009365784004330635, "logits/rejected": 0.1598575860261917, "logps/chosen": -3.5629711151123047, "logps/rejected": -4.108077049255371, "loss": 0.6313, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.5629711151123047, "rewards/margins": 0.5451056957244873, "rewards/rejected": -4.108077049255371, "sft_loss": 3.6623432636260986, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 14.120963984377784, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.0034040785394608974, "logits/rejected": 0.11227305233478546, "logps/chosen": -3.8640055656433105, "logps/rejected": -4.4511613845825195, "loss": 0.6424, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.8640055656433105, "rewards/margins": 0.5871554613113403, "rewards/rejected": -4.4511613845825195, "sft_loss": 3.903380870819092, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 11.539496715161526, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.048967987298965454, "logits/rejected": 0.16037827730178833, "logps/chosen": -3.79237699508667, "logps/rejected": -4.49746036529541, "loss": 0.5889, "rewards/accuracies": 0.71875, "rewards/chosen": -3.79237699508667, "rewards/margins": 0.7050831913948059, "rewards/rejected": -4.49746036529541, "sft_loss": 3.8400700092315674, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 16.150703871819903, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.017244238406419754, "logits/rejected": 0.08983320742845535, "logps/chosen": -3.9456849098205566, "logps/rejected": -4.6313605308532715, "loss": 0.5918, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.9456849098205566, "rewards/margins": 0.6856756210327148, "rewards/rejected": -4.6313605308532715, "sft_loss": 4.094405651092529, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 12.931679999737772, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.038251686841249466, "logits/rejected": 0.12875883281230927, "logps/chosen": -3.786808729171753, "logps/rejected": -4.453344821929932, "loss": 0.5906, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.786808729171753, "rewards/margins": 0.6665353178977966, "rewards/rejected": -4.453344821929932, "sft_loss": 3.9314029216766357, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 17.215641703722756, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.005074346903711557, "logits/rejected": 0.059091150760650635, "logps/chosen": -3.9514594078063965, "logps/rejected": -4.556410789489746, "loss": 0.6292, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.9514594078063965, "rewards/margins": 0.6049517393112183, "rewards/rejected": -4.556410789489746, "sft_loss": 3.9630367755889893, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.2461836338043213, "eval_logits/rejected": 0.3271760046482086, "eval_logps/chosen": -3.8912527561187744, "eval_logps/rejected": -4.595021724700928, "eval_loss": 0.5942531824111938, "eval_rewards/accuracies": 0.721068263053894, "eval_rewards/chosen": -3.8912527561187744, "eval_rewards/margins": 0.7037691473960876, "eval_rewards/rejected": -4.595021724700928, "eval_runtime": 42.9784, "eval_samples_per_second": 31.295, "eval_sft_loss": 3.991314649581909, "eval_steps_per_second": 7.841, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 16.845520637315555, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.09139742702245712, "logits/rejected": 0.06181849166750908, "logps/chosen": -3.8685569763183594, "logps/rejected": -4.536148548126221, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8685569763183594, "rewards/margins": 0.6675916910171509, "rewards/rejected": -4.536148548126221, "sft_loss": 3.933539628982544, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 13.30971595013577, "learning_rate": 9.59688651973581e-07, "logits/chosen": 0.0002502262650523335, "logits/rejected": 0.15695317089557648, "logps/chosen": -3.790062427520752, "logps/rejected": -4.361540794372559, "loss": 0.6162, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.790062427520752, "rewards/margins": 0.5714787244796753, "rewards/rejected": -4.361540794372559, "sft_loss": 3.882965087890625, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 13.25167322757119, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.046138983219861984, "logits/rejected": 0.041084617376327515, "logps/chosen": -3.9596238136291504, "logps/rejected": -4.504553318023682, "loss": 0.6367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.9596238136291504, "rewards/margins": 0.5449296236038208, "rewards/rejected": -4.504553318023682, "sft_loss": 3.9218735694885254, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 12.924722270033136, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.08105846494436264, "logits/rejected": 0.16998329758644104, "logps/chosen": -3.8064639568328857, "logps/rejected": -4.354712963104248, "loss": 0.6409, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.8064639568328857, "rewards/margins": 0.5482488870620728, "rewards/rejected": -4.354712963104248, "sft_loss": 3.770095109939575, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 15.684642791031575, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.07153487950563431, "logits/rejected": 0.09660569578409195, "logps/chosen": -3.9475014209747314, "logps/rejected": -4.414361000061035, "loss": 0.6879, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.9475014209747314, "rewards/margins": 0.4668591618537903, "rewards/rejected": -4.414361000061035, "sft_loss": 3.897876024246216, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 15.03767424806433, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.0005132406949996948, "logits/rejected": 0.07582568377256393, "logps/chosen": -3.9208877086639404, "logps/rejected": -4.437623500823975, "loss": 0.6546, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.9208877086639404, "rewards/margins": 0.5167354941368103, "rewards/rejected": -4.437623500823975, "sft_loss": 3.898923873901367, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 13.714925325727796, "learning_rate": 9.565697970821593e-07, "logits/chosen": 0.01927166059613228, "logits/rejected": 0.13673178851604462, "logps/chosen": -3.84504771232605, "logps/rejected": -4.373183250427246, "loss": 0.6295, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.84504771232605, "rewards/margins": 0.5281357765197754, "rewards/rejected": -4.373183250427246, "sft_loss": 3.883021593093872, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 10.639540991446472, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.041995711624622345, "logits/rejected": 0.0656842365860939, "logps/chosen": -3.769777297973633, "logps/rejected": -4.308537483215332, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -3.769777297973633, "rewards/margins": 0.5387598276138306, "rewards/rejected": -4.308537483215332, "sft_loss": 3.8884778022766113, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 15.509107129403525, "learning_rate": 9.552912049655789e-07, "logits/chosen": 0.004844689276069403, "logits/rejected": 0.1393628716468811, "logps/chosen": -3.6201400756835938, "logps/rejected": -4.24270486831665, "loss": 0.614, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.6201400756835938, "rewards/margins": 0.6225649118423462, "rewards/rejected": -4.24270486831665, "sft_loss": 3.6522445678710938, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 17.954875027648637, "learning_rate": 9.546452799394315e-07, "logits/chosen": 0.05819585919380188, "logits/rejected": 0.19612844288349152, "logps/chosen": -3.7995667457580566, "logps/rejected": -4.250999450683594, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": -3.7995667457580566, "rewards/margins": 0.4514332413673401, "rewards/rejected": -4.250999450683594, "sft_loss": 3.818459987640381, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 11.497282803803873, "learning_rate": 9.539949439427846e-07, "logits/chosen": 0.028142839670181274, "logits/rejected": 0.12065265327692032, "logps/chosen": -3.639207363128662, "logps/rejected": -4.279654502868652, "loss": 0.5895, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.639207363128662, "rewards/margins": 0.6404469609260559, "rewards/rejected": -4.279654502868652, "sft_loss": 3.8028016090393066, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 10.502555017466102, "learning_rate": 9.533402032852002e-07, "logits/chosen": 0.00669657438993454, "logits/rejected": 0.1235589012503624, "logps/chosen": -3.6829135417938232, "logps/rejected": -4.4046783447265625, "loss": 0.5923, "rewards/accuracies": 0.71875, "rewards/chosen": -3.6829135417938232, "rewards/margins": 0.721764862537384, "rewards/rejected": -4.4046783447265625, "sft_loss": 3.843273639678955, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 13.183562797082082, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.08398241549730301, "logits/rejected": 0.2003088742494583, "logps/chosen": -3.769449234008789, "logps/rejected": -4.366641998291016, "loss": 0.6093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.769449234008789, "rewards/margins": 0.5971931219100952, "rewards/rejected": -4.366641998291016, "sft_loss": 3.8731276988983154, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 13.682588226342201, "learning_rate": 9.52017533439079e-07, "logits/chosen": 0.02536749839782715, "logits/rejected": 0.0881018415093422, "logps/chosen": -3.840707778930664, "logps/rejected": -4.396668910980225, "loss": 0.6311, "rewards/accuracies": 0.71875, "rewards/chosen": -3.840707778930664, "rewards/margins": 0.5559613704681396, "rewards/rejected": -4.396668910980225, "sft_loss": 3.9200286865234375, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 11.828789139592562, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.011203656904399395, "logits/rejected": 0.0779070183634758, "logps/chosen": -3.9936225414276123, "logps/rejected": -4.510663986206055, "loss": 0.6742, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.9936225414276123, "rewards/margins": 0.5170412659645081, "rewards/rejected": -4.510663986206055, "sft_loss": 3.9869353771209717, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 17.434450544438157, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.009508686140179634, "logits/rejected": 0.11930382251739502, "logps/chosen": -3.8271725177764893, "logps/rejected": -4.379973411560059, "loss": 0.6282, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.8271725177764893, "rewards/margins": 0.5528005361557007, "rewards/rejected": -4.379973411560059, "sft_loss": 3.8565165996551514, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 15.854921858999106, "learning_rate": 9.500006539058334e-07, "logits/chosen": 0.02335262857377529, "logits/rejected": 0.12421506643295288, "logps/chosen": -3.8039650917053223, "logps/rejected": -4.209416389465332, "loss": 0.6594, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.8039650917053223, "rewards/margins": 0.4054519534111023, "rewards/rejected": -4.209416389465332, "sft_loss": 3.7736048698425293, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 16.02613111579297, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.07322710007429123, "logits/rejected": 0.04123363643884659, "logps/chosen": -3.893324613571167, "logps/rejected": -4.322262763977051, "loss": 0.6719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.893324613571167, "rewards/margins": 0.42893800139427185, "rewards/rejected": -4.322262763977051, "sft_loss": 3.895965099334717, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 11.795083046033852, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.014591937884688377, "logits/rejected": -0.003014406654983759, "logps/chosen": -3.850743055343628, "logps/rejected": -4.518239498138428, "loss": 0.594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.850743055343628, "rewards/margins": 0.6674960851669312, "rewards/rejected": -4.518239498138428, "sft_loss": 3.830537796020508, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 17.252417092397284, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.015925895422697067, "logits/rejected": 0.18819871544837952, "logps/chosen": -3.8830389976501465, "logps/rejected": -4.609259128570557, "loss": 0.5934, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.8830389976501465, "rewards/margins": 0.7262201905250549, "rewards/rejected": -4.609259128570557, "sft_loss": 3.906867265701294, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 10.17591526538395, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.08815959841012955, "logits/rejected": 0.1264752745628357, "logps/chosen": -3.9305195808410645, "logps/rejected": -4.479735374450684, "loss": 0.6485, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.9305195808410645, "rewards/margins": 0.5492160320281982, "rewards/rejected": -4.479735374450684, "sft_loss": 3.8741016387939453, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 14.198842418229422, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.06211893633008003, "logits/rejected": 0.12285672128200531, "logps/chosen": -3.9210121631622314, "logps/rejected": -4.57586145401001, "loss": 0.6043, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.9210121631622314, "rewards/margins": 0.6548491716384888, "rewards/rejected": -4.57586145401001, "sft_loss": 3.9747262001037598, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 18.363856784951615, "learning_rate": 9.458491956169914e-07, "logits/chosen": 0.0014732598792761564, "logits/rejected": 0.11077898740768433, "logps/chosen": -4.034459590911865, "logps/rejected": -4.670225620269775, "loss": 0.6237, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.034459590911865, "rewards/margins": 0.6357653737068176, "rewards/rejected": -4.670225620269775, "sft_loss": 4.082320690155029, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 13.852905792988352, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.10090867429971695, "logits/rejected": 0.035910557955503464, "logps/chosen": -3.940913677215576, "logps/rejected": -4.5349016189575195, "loss": 0.6018, "rewards/accuracies": 0.71875, "rewards/chosen": -3.940913677215576, "rewards/margins": 0.5939876437187195, "rewards/rejected": -4.5349016189575195, "sft_loss": 4.0024943351745605, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 16.397248975860062, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.003956289496272802, "logits/rejected": 0.10998527705669403, "logps/chosen": -3.8461742401123047, "logps/rejected": -4.399134635925293, "loss": 0.6786, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.8461742401123047, "rewards/margins": 0.552960216999054, "rewards/rejected": -4.399134635925293, "sft_loss": 3.8427023887634277, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 16.470232882281454, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.0042040422558784485, "logits/rejected": 0.07606049627065659, "logps/chosen": -3.7906928062438965, "logps/rejected": -4.396378993988037, "loss": 0.6244, "rewards/accuracies": 0.65625, "rewards/chosen": -3.7906928062438965, "rewards/margins": 0.6056860685348511, "rewards/rejected": -4.396378993988037, "sft_loss": 3.8501904010772705, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 14.169085969543985, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.02851545251905918, "logits/rejected": 0.04445397108793259, "logps/chosen": -3.810012102127075, "logps/rejected": -4.389849662780762, "loss": 0.633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.810012102127075, "rewards/margins": 0.5798380970954895, "rewards/rejected": -4.389849662780762, "sft_loss": 3.8390088081359863, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 12.805135756811211, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.01615220494568348, "logits/rejected": 0.0676146000623703, "logps/chosen": -3.7289607524871826, "logps/rejected": -4.293822288513184, "loss": 0.6407, "rewards/accuracies": 0.65625, "rewards/chosen": -3.7289607524871826, "rewards/margins": 0.5648613572120667, "rewards/rejected": -4.293822288513184, "sft_loss": 3.74847412109375, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 14.163967951041753, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.018159478902816772, "logits/rejected": 0.15642356872558594, "logps/chosen": -3.7944741249084473, "logps/rejected": -4.561646461486816, "loss": 0.5273, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7944741249084473, "rewards/margins": 0.7671729326248169, "rewards/rejected": -4.561646461486816, "sft_loss": 3.849836826324463, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 17.43992081275538, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.03253094106912613, "logits/rejected": 0.08874372392892838, "logps/chosen": -3.867936611175537, "logps/rejected": -4.314837455749512, "loss": 0.6665, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.867936611175537, "rewards/margins": 0.4469006657600403, "rewards/rejected": -4.314837455749512, "sft_loss": 3.9457309246063232, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 18.167281348220694, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.030413394793868065, "logits/rejected": 0.18014629185199738, "logps/chosen": -4.015583038330078, "logps/rejected": -4.593112945556641, "loss": 0.6096, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.015583038330078, "rewards/margins": 0.5775297284126282, "rewards/rejected": -4.593112945556641, "sft_loss": 3.978579044342041, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 15.974570997045625, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.04908815771341324, "logits/rejected": 0.03998412936925888, "logps/chosen": -4.031233310699463, "logps/rejected": -4.735138893127441, "loss": 0.5853, "rewards/accuracies": 0.6875, "rewards/chosen": -4.031233310699463, "rewards/margins": 0.7039054036140442, "rewards/rejected": -4.735138893127441, "sft_loss": 4.132689476013184, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 8.94370333182499, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.02070789411664009, "logits/rejected": 0.09080308675765991, "logps/chosen": -3.9505438804626465, "logps/rejected": -4.525331497192383, "loss": 0.6236, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.9505438804626465, "rewards/margins": 0.574787437915802, "rewards/rejected": -4.525331497192383, "sft_loss": 3.9485411643981934, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 26.678755989582026, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.03940100595355034, "logits/rejected": 0.05745028704404831, "logps/chosen": -3.922192096710205, "logps/rejected": -4.713294982910156, "loss": 0.6099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.922192096710205, "rewards/margins": 0.7911027073860168, "rewards/rejected": -4.713294982910156, "sft_loss": 3.9023406505584717, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 15.42526326583326, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.08443541824817657, "logits/rejected": 0.0254441536962986, "logps/chosen": -3.8435254096984863, "logps/rejected": -4.57711124420166, "loss": 0.579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8435254096984863, "rewards/margins": 0.73358553647995, "rewards/rejected": -4.57711124420166, "sft_loss": 3.8551177978515625, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 12.582495846995794, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.0869092345237732, "logits/rejected": 0.005117898341268301, "logps/chosen": -3.783353805541992, "logps/rejected": -4.234916687011719, "loss": 0.7033, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.783353805541992, "rewards/margins": 0.4515630304813385, "rewards/rejected": -4.234916687011719, "sft_loss": 3.7639663219451904, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 17.41495253319472, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.0802505686879158, "logits/rejected": 0.007249271962791681, "logps/chosen": -3.5686416625976562, "logps/rejected": -4.24164342880249, "loss": 0.597, "rewards/accuracies": 0.71875, "rewards/chosen": -3.5686416625976562, "rewards/margins": 0.6730014085769653, "rewards/rejected": -4.24164342880249, "sft_loss": 3.5927505493164062, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 12.243772262322084, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.04855138063430786, "logits/rejected": 0.0416254922747612, "logps/chosen": -3.685039520263672, "logps/rejected": -4.1918182373046875, "loss": 0.6482, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.685039520263672, "rewards/margins": 0.5067787170410156, "rewards/rejected": -4.1918182373046875, "sft_loss": 3.6339497566223145, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 15.180696863080362, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.03241829201579094, "logits/rejected": 0.040210507810115814, "logps/chosen": -3.7386927604675293, "logps/rejected": -4.187805652618408, "loss": 0.6493, "rewards/accuracies": 0.6875, "rewards/chosen": -3.7386927604675293, "rewards/margins": 0.44911304116249084, "rewards/rejected": -4.187805652618408, "sft_loss": 3.7067184448242188, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 8.992479929116369, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.2001856565475464, "logits/rejected": -0.11754343658685684, "logps/chosen": -3.778327226638794, "logps/rejected": -4.2554402351379395, "loss": 0.6238, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.778327226638794, "rewards/margins": 0.47711247205734253, "rewards/rejected": -4.2554402351379395, "sft_loss": 3.767138719558716, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 11.86927774449835, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.11683745682239532, "logits/rejected": 0.019382378086447716, "logps/chosen": -3.5938785076141357, "logps/rejected": -4.3098320960998535, "loss": 0.6299, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.5938785076141357, "rewards/margins": 0.7159535884857178, "rewards/rejected": -4.3098320960998535, "sft_loss": 3.6116302013397217, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 15.387992935443528, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.10649319738149643, "logits/rejected": 0.005260472185909748, "logps/chosen": -3.74613618850708, "logps/rejected": -4.398713111877441, "loss": 0.5803, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.74613618850708, "rewards/margins": 0.6525768041610718, "rewards/rejected": -4.398713111877441, "sft_loss": 3.7421631813049316, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 21.090362384980526, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.041864484548568726, "logits/rejected": 0.04505940526723862, "logps/chosen": -3.701702833175659, "logps/rejected": -4.130052089691162, "loss": 0.6612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.701702833175659, "rewards/margins": 0.42834967374801636, "rewards/rejected": -4.130052089691162, "sft_loss": 3.621180772781372, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 11.325300022922676, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.04674559831619263, "logits/rejected": 0.055222172290086746, "logps/chosen": -3.793757915496826, "logps/rejected": -4.501084804534912, "loss": 0.6039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.793757915496826, "rewards/margins": 0.7073268890380859, "rewards/rejected": -4.501084804534912, "sft_loss": 3.869464874267578, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 9.102658850824849, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.03859156370162964, "logits/rejected": 0.033886075019836426, "logps/chosen": -3.7963879108428955, "logps/rejected": -4.407050609588623, "loss": 0.6331, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.7963879108428955, "rewards/margins": 0.6106627583503723, "rewards/rejected": -4.407050609588623, "sft_loss": 3.7996819019317627, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 14.4711171960013, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.14051930606365204, "logits/rejected": -0.05075984075665474, "logps/chosen": -3.8657398223876953, "logps/rejected": -4.398106575012207, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.8657398223876953, "rewards/margins": 0.5323665738105774, "rewards/rejected": -4.398106575012207, "sft_loss": 3.8228697776794434, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 12.52167523723316, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.0580422468483448, "logits/rejected": -0.019903894513845444, "logps/chosen": -3.8900628089904785, "logps/rejected": -4.588275909423828, "loss": 0.5952, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.8900628089904785, "rewards/margins": 0.6982136368751526, "rewards/rejected": -4.588275909423828, "sft_loss": 3.9355030059814453, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 9.415628292863678, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.1376069039106369, "logits/rejected": -0.05684811994433403, "logps/chosen": -3.823603391647339, "logps/rejected": -4.432217121124268, "loss": 0.6246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.823603391647339, "rewards/margins": 0.6086133122444153, "rewards/rejected": -4.432217121124268, "sft_loss": 3.8841118812561035, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 13.700848888870734, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.1622910499572754, "logits/rejected": -0.034788988530635834, "logps/chosen": -3.89607310295105, "logps/rejected": -4.454383850097656, "loss": 0.6166, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.89607310295105, "rewards/margins": 0.5583103895187378, "rewards/rejected": -4.454383850097656, "sft_loss": 3.9466094970703125, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 15.693301225581965, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.08934115618467331, "logits/rejected": -0.002043300773948431, "logps/chosen": -3.997868776321411, "logps/rejected": -4.518886566162109, "loss": 0.6649, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.997868776321411, "rewards/margins": 0.5210171341896057, "rewards/rejected": -4.518886566162109, "sft_loss": 4.018801689147949, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 14.661747537082288, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.09901993721723557, "logits/rejected": 0.014294229447841644, "logps/chosen": -3.921424388885498, "logps/rejected": -4.539112567901611, "loss": 0.6357, "rewards/accuracies": 0.65625, "rewards/chosen": -3.921424388885498, "rewards/margins": 0.617688775062561, "rewards/rejected": -4.539112567901611, "sft_loss": 3.9897141456604004, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 12.04009641192229, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.09351952373981476, "logits/rejected": 0.015524087473750114, "logps/chosen": -3.9031898975372314, "logps/rejected": -4.5179572105407715, "loss": 0.6171, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.9031898975372314, "rewards/margins": 0.6147674322128296, "rewards/rejected": -4.5179572105407715, "sft_loss": 3.8708298206329346, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 10.032646768408965, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.1010308712720871, "logits/rejected": -0.005897210445255041, "logps/chosen": -3.9854037761688232, "logps/rejected": -4.480762958526611, "loss": 0.6496, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.9854037761688232, "rewards/margins": 0.49535924196243286, "rewards/rejected": -4.480762958526611, "sft_loss": 3.9827957153320312, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 9.168216544574479, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.13384617865085602, "logits/rejected": -0.10935819149017334, "logps/chosen": -3.9713332653045654, "logps/rejected": -4.536910533905029, "loss": 0.6398, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.9713332653045654, "rewards/margins": 0.5655772089958191, "rewards/rejected": -4.536910533905029, "sft_loss": 3.9855117797851562, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 11.98976821599669, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.16908545792102814, "logits/rejected": -0.044844694435596466, "logps/chosen": -4.051001071929932, "logps/rejected": -4.692683219909668, "loss": 0.6194, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.051001071929932, "rewards/margins": 0.6416821479797363, "rewards/rejected": -4.692683219909668, "sft_loss": 4.038779258728027, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 13.081822544871454, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.04509899392724037, "logits/rejected": 0.01662004180252552, "logps/chosen": -3.9795117378234863, "logps/rejected": -4.6536149978637695, "loss": 0.6271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9795117378234863, "rewards/margins": 0.6741029620170593, "rewards/rejected": -4.6536149978637695, "sft_loss": 4.023516654968262, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 14.210973536735354, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.07396290451288223, "logits/rejected": 0.04083425551652908, "logps/chosen": -4.056087493896484, "logps/rejected": -4.83945369720459, "loss": 0.5591, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.056087493896484, "rewards/margins": 0.7833663821220398, "rewards/rejected": -4.83945369720459, "sft_loss": 4.060484409332275, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 15.471031754520801, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.12588518857955933, "logits/rejected": -0.04100862145423889, "logps/chosen": -4.009485244750977, "logps/rejected": -4.681950569152832, "loss": 0.6228, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.009485244750977, "rewards/margins": 0.6724649667739868, "rewards/rejected": -4.681950569152832, "sft_loss": 4.088533401489258, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 14.561177242848148, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.09878459572792053, "logits/rejected": -0.009741676971316338, "logps/chosen": -4.209200382232666, "logps/rejected": -4.6544108390808105, "loss": 0.6626, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.209200382232666, "rewards/margins": 0.4452107548713684, "rewards/rejected": -4.6544108390808105, "sft_loss": 4.294915676116943, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 12.949238093080922, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.12456649541854858, "logits/rejected": -0.03368541598320007, "logps/chosen": -4.077846527099609, "logps/rejected": -4.671810150146484, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -4.077846527099609, "rewards/margins": 0.593963086605072, "rewards/rejected": -4.671810150146484, "sft_loss": 4.222781658172607, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 12.368100727938941, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.07127787172794342, "logits/rejected": -0.024446146562695503, "logps/chosen": -4.05262565612793, "logps/rejected": -4.659695625305176, "loss": 0.6252, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.05262565612793, "rewards/margins": 0.6070703268051147, "rewards/rejected": -4.659695625305176, "sft_loss": 4.069108963012695, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 12.448223400661613, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.12020577490329742, "logits/rejected": 0.08523955196142197, "logps/chosen": -3.913705348968506, "logps/rejected": -4.676518440246582, "loss": 0.5621, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.913705348968506, "rewards/margins": 0.7628134489059448, "rewards/rejected": -4.676518440246582, "sft_loss": 3.920109272003174, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 12.886115881341064, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.06894917786121368, "logits/rejected": -0.016827654093503952, "logps/chosen": -3.890366792678833, "logps/rejected": -4.506114482879639, "loss": 0.6142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.890366792678833, "rewards/margins": 0.6157475113868713, "rewards/rejected": -4.506114482879639, "sft_loss": 3.963930606842041, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 12.071807685749667, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.10245764255523682, "logits/rejected": -0.023907829076051712, "logps/chosen": -3.8091113567352295, "logps/rejected": -4.430354118347168, "loss": 0.5976, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8091113567352295, "rewards/margins": 0.6212425827980042, "rewards/rejected": -4.430354118347168, "sft_loss": 3.9815516471862793, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 10.834734052131708, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.13426610827445984, "logits/rejected": -0.01993221417069435, "logps/chosen": -3.874058961868286, "logps/rejected": -4.447755336761475, "loss": 0.6235, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.874058961868286, "rewards/margins": 0.5736962556838989, "rewards/rejected": -4.447755336761475, "sft_loss": 3.925154447555542, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 30.6039357353955, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.1942942887544632, "logits/rejected": -0.08810271322727203, "logps/chosen": -3.9921023845672607, "logps/rejected": -4.488032341003418, "loss": 0.637, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.9921023845672607, "rewards/margins": 0.49593037366867065, "rewards/rejected": -4.488032341003418, "sft_loss": 4.042372703552246, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 12.922402171519687, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.17460720241069794, "logits/rejected": -0.05064401775598526, "logps/chosen": -3.8689498901367188, "logps/rejected": -4.4816694259643555, "loss": 0.5821, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.8689498901367188, "rewards/margins": 0.6127195358276367, "rewards/rejected": -4.4816694259643555, "sft_loss": 3.943500518798828, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 16.343723546981533, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.10426691919565201, "logits/rejected": -0.00016860663890838623, "logps/chosen": -3.905703067779541, "logps/rejected": -4.731764793395996, "loss": 0.5837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.905703067779541, "rewards/margins": 0.8260625004768372, "rewards/rejected": -4.731764793395996, "sft_loss": 3.9430885314941406, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 10.650171573281304, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.11272887140512466, "logits/rejected": -0.01053983997553587, "logps/chosen": -3.8744194507598877, "logps/rejected": -4.515874862670898, "loss": 0.6116, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8744194507598877, "rewards/margins": 0.6414551734924316, "rewards/rejected": -4.515874862670898, "sft_loss": 3.9839954376220703, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 11.204609395323448, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.2166566550731659, "logits/rejected": -0.030534937977790833, "logps/chosen": -3.8562233448028564, "logps/rejected": -4.782811164855957, "loss": 0.5023, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.8562233448028564, "rewards/margins": 0.9265871047973633, "rewards/rejected": -4.782811164855957, "sft_loss": 3.879866123199463, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 11.280835645464546, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.17664167284965515, "logits/rejected": -0.0007077917689457536, "logps/chosen": -3.9161293506622314, "logps/rejected": -4.722474098205566, "loss": 0.6143, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.9161293506622314, "rewards/margins": 0.8063453435897827, "rewards/rejected": -4.722474098205566, "sft_loss": 3.895829677581787, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 12.603156990040194, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.10256105661392212, "logits/rejected": -0.045074738562107086, "logps/chosen": -4.097698211669922, "logps/rejected": -4.654773712158203, "loss": 0.6505, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.097698211669922, "rewards/margins": 0.5570759177207947, "rewards/rejected": -4.654773712158203, "sft_loss": 4.164440155029297, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 10.578843607292864, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.06902100145816803, "logits/rejected": 0.005874344613403082, "logps/chosen": -4.0252180099487305, "logps/rejected": -4.597375392913818, "loss": 0.6331, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.0252180099487305, "rewards/margins": 0.5721582174301147, "rewards/rejected": -4.597375392913818, "sft_loss": 4.069859504699707, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 16.18310307037792, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.0919872522354126, "logits/rejected": -0.05589014291763306, "logps/chosen": -4.019768238067627, "logps/rejected": -4.503331184387207, "loss": 0.7016, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.019768238067627, "rewards/margins": 0.48356255888938904, "rewards/rejected": -4.503331184387207, "sft_loss": 4.123537063598633, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 10.830456959519374, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.08890876919031143, "logits/rejected": -0.015451954677700996, "logps/chosen": -3.699070453643799, "logps/rejected": -4.209559440612793, "loss": 0.626, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.699070453643799, "rewards/margins": 0.5104890465736389, "rewards/rejected": -4.209559440612793, "sft_loss": 3.7042746543884277, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 9.401950567422222, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.03602803871035576, "logits/rejected": 0.11538251489400864, "logps/chosen": -3.798994541168213, "logps/rejected": -4.442479133605957, "loss": 0.5996, "rewards/accuracies": 0.71875, "rewards/chosen": -3.798994541168213, "rewards/margins": 0.6434845328330994, "rewards/rejected": -4.442479133605957, "sft_loss": 3.7785956859588623, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 10.214149261214663, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.08980683982372284, "logits/rejected": 0.01351415365934372, "logps/chosen": -3.624281644821167, "logps/rejected": -4.2779221534729, "loss": 0.5905, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.624281644821167, "rewards/margins": 0.6536401510238647, "rewards/rejected": -4.2779221534729, "sft_loss": 3.7103257179260254, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 14.182967548181901, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.15173068642616272, "logits/rejected": -0.08541973680257797, "logps/chosen": -3.6514763832092285, "logps/rejected": -4.157805442810059, "loss": 0.6305, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6514763832092285, "rewards/margins": 0.5063292384147644, "rewards/rejected": -4.157805442810059, "sft_loss": 3.675691604614258, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 16.02994223042789, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.04729296639561653, "logits/rejected": -0.004415331874042749, "logps/chosen": -3.785186767578125, "logps/rejected": -4.244810104370117, "loss": 0.6665, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.785186767578125, "rewards/margins": 0.45962339639663696, "rewards/rejected": -4.244810104370117, "sft_loss": 3.740236759185791, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 10.885918050761497, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.14420801401138306, "logits/rejected": -0.02436070702970028, "logps/chosen": -3.710519313812256, "logps/rejected": -4.561938762664795, "loss": 0.5282, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.710519313812256, "rewards/margins": 0.8514199256896973, "rewards/rejected": -4.561938762664795, "sft_loss": 3.786695957183838, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.14557655155658722, "eval_logits/rejected": 0.21837201714515686, "eval_logps/chosen": -3.799365758895874, "eval_logps/rejected": -4.488193988800049, "eval_loss": 0.5851835608482361, "eval_rewards/accuracies": 0.7173590660095215, "eval_rewards/chosen": -3.799365758895874, "eval_rewards/margins": 0.68882817029953, "eval_rewards/rejected": -4.488193988800049, "eval_runtime": 42.9857, "eval_samples_per_second": 31.289, "eval_sft_loss": 3.8604371547698975, "eval_steps_per_second": 7.84, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 12.27828649507042, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.10497484356164932, "logits/rejected": 0.056699056178331375, "logps/chosen": -3.9257049560546875, "logps/rejected": -4.624436855316162, "loss": 0.5789, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.9257049560546875, "rewards/margins": 0.6987317204475403, "rewards/rejected": -4.624436855316162, "sft_loss": 3.912734270095825, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 11.632477594854953, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.12709620594978333, "logits/rejected": 0.0015153981512412429, "logps/chosen": -3.961102247238159, "logps/rejected": -4.7628655433654785, "loss": 0.5721, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.961102247238159, "rewards/margins": 0.8017629384994507, "rewards/rejected": -4.7628655433654785, "sft_loss": 4.05449914932251, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 17.797718688717936, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.07080388069152832, "logits/rejected": 0.014102371409535408, "logps/chosen": -4.029933452606201, "logps/rejected": -4.655080318450928, "loss": 0.6526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.029933452606201, "rewards/margins": 0.6251465082168579, "rewards/rejected": -4.655080318450928, "sft_loss": 4.067551136016846, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 13.620376101090379, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.07276565581560135, "logits/rejected": 0.03689710050821304, "logps/chosen": -3.949676513671875, "logps/rejected": -4.511597156524658, "loss": 0.6814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.949676513671875, "rewards/margins": 0.5619208812713623, "rewards/rejected": -4.511597156524658, "sft_loss": 4.0169172286987305, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 15.277906245685788, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.05725591629743576, "logits/rejected": -0.01894243434071541, "logps/chosen": -3.93320894241333, "logps/rejected": -4.239479064941406, "loss": 0.7493, "rewards/accuracies": 0.59375, "rewards/chosen": -3.93320894241333, "rewards/margins": 0.30627012252807617, "rewards/rejected": -4.239479064941406, "sft_loss": 3.906848192214966, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 10.028521696122647, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.05407093092799187, "logits/rejected": 0.039415888488292694, "logps/chosen": -3.6694939136505127, "logps/rejected": -4.256445407867432, "loss": 0.6118, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.6694939136505127, "rewards/margins": 0.5869513750076294, "rewards/rejected": -4.256445407867432, "sft_loss": 3.7156128883361816, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 9.740621126018869, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.13916271924972534, "logits/rejected": -0.03805696219205856, "logps/chosen": -3.4495654106140137, "logps/rejected": -4.118634223937988, "loss": 0.579, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4495654106140137, "rewards/margins": 0.6690683364868164, "rewards/rejected": -4.118634223937988, "sft_loss": 3.4922516345977783, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 16.928582236655778, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.14447686076164246, "logits/rejected": -0.056305646896362305, "logps/chosen": -3.626981258392334, "logps/rejected": -4.2726945877075195, "loss": 0.6124, "rewards/accuracies": 0.6875, "rewards/chosen": -3.626981258392334, "rewards/margins": 0.6457129716873169, "rewards/rejected": -4.2726945877075195, "sft_loss": 3.6541404724121094, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 11.880923542733687, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.10572276264429092, "logits/rejected": -0.07693799585103989, "logps/chosen": -3.740095853805542, "logps/rejected": -4.2216291427612305, "loss": 0.6411, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.740095853805542, "rewards/margins": 0.48153313994407654, "rewards/rejected": -4.2216291427612305, "sft_loss": 3.799666166305542, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 13.283487323354745, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.010224530473351479, "logits/rejected": 0.01489633321762085, "logps/chosen": -3.5730910301208496, "logps/rejected": -4.23054313659668, "loss": 0.5645, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.5730910301208496, "rewards/margins": 0.6574516296386719, "rewards/rejected": -4.23054313659668, "sft_loss": 3.5423073768615723, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 12.396900074714226, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.13912677764892578, "logits/rejected": -0.05157994478940964, "logps/chosen": -3.792473554611206, "logps/rejected": -4.402674674987793, "loss": 0.5987, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.792473554611206, "rewards/margins": 0.6102014183998108, "rewards/rejected": -4.402674674987793, "sft_loss": 3.8069300651550293, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 16.16834676319544, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.06999781727790833, "logits/rejected": -0.01696675829589367, "logps/chosen": -3.7842602729797363, "logps/rejected": -4.433149337768555, "loss": 0.583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7842602729797363, "rewards/margins": 0.6488891839981079, "rewards/rejected": -4.433149337768555, "sft_loss": 3.8168907165527344, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 18.052223178294405, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.09506978839635849, "logits/rejected": -0.04939929395914078, "logps/chosen": -4.108767032623291, "logps/rejected": -4.559029579162598, "loss": 0.6721, "rewards/accuracies": 0.65625, "rewards/chosen": -4.108767032623291, "rewards/margins": 0.45026326179504395, "rewards/rejected": -4.559029579162598, "sft_loss": 4.11429500579834, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 12.903552574200932, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.02410072088241577, "logits/rejected": 0.08524173498153687, "logps/chosen": -4.105332374572754, "logps/rejected": -4.600467681884766, "loss": 0.6741, "rewards/accuracies": 0.625, "rewards/chosen": -4.105332374572754, "rewards/margins": 0.49513569474220276, "rewards/rejected": -4.600467681884766, "sft_loss": 4.071771621704102, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 15.292369300212767, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.06857451051473618, "logits/rejected": 0.05873778462409973, "logps/chosen": -3.989157199859619, "logps/rejected": -4.681168079376221, "loss": 0.602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.989157199859619, "rewards/margins": 0.6920109391212463, "rewards/rejected": -4.681168079376221, "sft_loss": 4.073268890380859, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 17.441981364376606, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.14238521456718445, "logits/rejected": -0.06444496661424637, "logps/chosen": -3.9185664653778076, "logps/rejected": -4.394036293029785, "loss": 0.659, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.9185664653778076, "rewards/margins": 0.47546929121017456, "rewards/rejected": -4.394036293029785, "sft_loss": 3.975456953048706, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 15.180284082109292, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.06266282498836517, "logits/rejected": 0.06844954192638397, "logps/chosen": -3.8134632110595703, "logps/rejected": -4.475861549377441, "loss": 0.6285, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8134632110595703, "rewards/margins": 0.6623983979225159, "rewards/rejected": -4.475861549377441, "sft_loss": 3.880690097808838, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 15.740476534234624, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.10209021717309952, "logits/rejected": 0.04368484765291214, "logps/chosen": -3.8531203269958496, "logps/rejected": -4.5683064460754395, "loss": 0.5769, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8531203269958496, "rewards/margins": 0.7151857614517212, "rewards/rejected": -4.5683064460754395, "sft_loss": 3.9027419090270996, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 21.63075410350149, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.20173931121826172, "logits/rejected": -0.07687047868967056, "logps/chosen": -3.8959603309631348, "logps/rejected": -4.4894514083862305, "loss": 0.6031, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.8959603309631348, "rewards/margins": 0.5934913754463196, "rewards/rejected": -4.4894514083862305, "sft_loss": 3.912142276763916, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 13.107913392579063, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.0669541209936142, "logits/rejected": -0.09892503172159195, "logps/chosen": -3.954420566558838, "logps/rejected": -4.294497966766357, "loss": 0.6963, "rewards/accuracies": 0.65625, "rewards/chosen": -3.954420566558838, "rewards/margins": 0.3400774300098419, "rewards/rejected": -4.294497966766357, "sft_loss": 4.035944938659668, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 13.270751912496983, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.06592990458011627, "logits/rejected": 0.03833655267953873, "logps/chosen": -3.835791826248169, "logps/rejected": -4.498007774353027, "loss": 0.589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.835791826248169, "rewards/margins": 0.6622158288955688, "rewards/rejected": -4.498007774353027, "sft_loss": 3.873680830001831, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 12.413522513995133, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.09897182881832123, "logits/rejected": -0.014216387644410133, "logps/chosen": -4.089080810546875, "logps/rejected": -4.6123456954956055, "loss": 0.6242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.089080810546875, "rewards/margins": 0.52326500415802, "rewards/rejected": -4.6123456954956055, "sft_loss": 4.189183235168457, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 17.5216145626001, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.11287244409322739, "logits/rejected": 0.005205526947975159, "logps/chosen": -4.19803524017334, "logps/rejected": -4.849751949310303, "loss": 0.627, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.19803524017334, "rewards/margins": 0.6517167687416077, "rewards/rejected": -4.849751949310303, "sft_loss": 4.22841739654541, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 14.685549172591013, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.08108510076999664, "logits/rejected": -0.023788919672369957, "logps/chosen": -4.0276780128479, "logps/rejected": -4.748166084289551, "loss": 0.5887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.0276780128479, "rewards/margins": 0.7204879522323608, "rewards/rejected": -4.748166084289551, "sft_loss": 4.157613754272461, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 12.536529497993769, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.09019587934017181, "logits/rejected": -0.009882120415568352, "logps/chosen": -4.088213920593262, "logps/rejected": -4.951138019561768, "loss": 0.5387, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.088213920593262, "rewards/margins": 0.8629242777824402, "rewards/rejected": -4.951138019561768, "sft_loss": 4.194894790649414, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 11.991754829936772, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.048121221363544464, "logits/rejected": 0.014382058754563332, "logps/chosen": -3.8855576515197754, "logps/rejected": -4.534394264221191, "loss": 0.5889, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.8855576515197754, "rewards/margins": 0.6488367319107056, "rewards/rejected": -4.534394264221191, "sft_loss": 4.058175563812256, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 15.905985513002536, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.15635357797145844, "logits/rejected": -0.009334458038210869, "logps/chosen": -4.011080741882324, "logps/rejected": -4.760368824005127, "loss": 0.6137, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.011080741882324, "rewards/margins": 0.7492876052856445, "rewards/rejected": -4.760368824005127, "sft_loss": 4.057827949523926, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 13.754476655611992, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.07682603597640991, "logits/rejected": 0.05229213088750839, "logps/chosen": -3.996446132659912, "logps/rejected": -4.582263946533203, "loss": 0.6005, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.996446132659912, "rewards/margins": 0.5858179330825806, "rewards/rejected": -4.582263946533203, "sft_loss": 4.014688491821289, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 10.067335179362507, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.07593150436878204, "logits/rejected": -0.04116532951593399, "logps/chosen": -3.863304853439331, "logps/rejected": -4.45562219619751, "loss": 0.5992, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.863304853439331, "rewards/margins": 0.5923171639442444, "rewards/rejected": -4.45562219619751, "sft_loss": 3.9426918029785156, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 18.353916912030076, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.08506940305233002, "logits/rejected": -0.035322219133377075, "logps/chosen": -4.098275184631348, "logps/rejected": -4.524683952331543, "loss": 0.699, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.098275184631348, "rewards/margins": 0.42640891671180725, "rewards/rejected": -4.524683952331543, "sft_loss": 4.1536760330200195, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 13.388975189982899, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.10464473068714142, "logits/rejected": -0.030910471454262733, "logps/chosen": -3.925689697265625, "logps/rejected": -4.58644962310791, "loss": 0.5593, "rewards/accuracies": 0.75, "rewards/chosen": -3.925689697265625, "rewards/margins": 0.6607602834701538, "rewards/rejected": -4.58644962310791, "sft_loss": 4.019021987915039, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 13.16293333208723, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.08910728991031647, "logits/rejected": -0.0004989765584468842, "logps/chosen": -3.8630428314208984, "logps/rejected": -4.422264575958252, "loss": 0.6096, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8630428314208984, "rewards/margins": 0.5592218637466431, "rewards/rejected": -4.422264575958252, "sft_loss": 3.9234790802001953, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 14.103884346044767, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.03781446814537048, "logits/rejected": -0.0019023215863853693, "logps/chosen": -3.9329593181610107, "logps/rejected": -4.5812506675720215, "loss": 0.635, "rewards/accuracies": 0.65625, "rewards/chosen": -3.9329593181610107, "rewards/margins": 0.6482918858528137, "rewards/rejected": -4.5812506675720215, "sft_loss": 3.982529401779175, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 10.813112953658766, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.030758926644921303, "logits/rejected": 0.08899354934692383, "logps/chosen": -3.8885810375213623, "logps/rejected": -4.4750494956970215, "loss": 0.6233, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8885810375213623, "rewards/margins": 0.5864687561988831, "rewards/rejected": -4.4750494956970215, "sft_loss": 3.9688422679901123, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 11.772991974358334, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.11877866089344025, "logits/rejected": -0.031002437695860863, "logps/chosen": -3.7342638969421387, "logps/rejected": -4.362320899963379, "loss": 0.6135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7342638969421387, "rewards/margins": 0.6280564069747925, "rewards/rejected": -4.362320899963379, "sft_loss": 3.768057346343994, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 12.658512953340415, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.05375178903341293, "logits/rejected": 0.04362834244966507, "logps/chosen": -3.6902244091033936, "logps/rejected": -4.343583106994629, "loss": 0.5861, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.6902244091033936, "rewards/margins": 0.6533581018447876, "rewards/rejected": -4.343583106994629, "sft_loss": 3.7296364307403564, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 14.179960732016779, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.037360578775405884, "logits/rejected": 0.00016373097605537623, "logps/chosen": -3.7651381492614746, "logps/rejected": -4.201287746429443, "loss": 0.6528, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.7651381492614746, "rewards/margins": 0.436149537563324, "rewards/rejected": -4.201287746429443, "sft_loss": 3.768385648727417, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 16.424368962312393, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.03774772211909294, "logits/rejected": 0.029529806226491928, "logps/chosen": -3.819180727005005, "logps/rejected": -4.342913627624512, "loss": 0.624, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.819180727005005, "rewards/margins": 0.5237328410148621, "rewards/rejected": -4.342913627624512, "sft_loss": 3.8550522327423096, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 10.137170437138185, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.11584246158599854, "logits/rejected": -0.008330103941261768, "logps/chosen": -3.6586318016052246, "logps/rejected": -4.388864994049072, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": -3.6586318016052246, "rewards/margins": 0.7302330732345581, "rewards/rejected": -4.388864994049072, "sft_loss": 3.8012499809265137, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 11.336523700086135, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.06548066437244415, "logits/rejected": 0.008463447913527489, "logps/chosen": -3.7641005516052246, "logps/rejected": -4.285780429840088, "loss": 0.618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7641005516052246, "rewards/margins": 0.5216798782348633, "rewards/rejected": -4.285780429840088, "sft_loss": 3.7976505756378174, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 17.348790285283606, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.1233416348695755, "logits/rejected": -0.029293501749634743, "logps/chosen": -3.632089138031006, "logps/rejected": -4.459404945373535, "loss": 0.5263, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.632089138031006, "rewards/margins": 0.8273167610168457, "rewards/rejected": -4.459404945373535, "sft_loss": 3.6921048164367676, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 11.301010273536113, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.07567024976015091, "logits/rejected": 0.0327785387635231, "logps/chosen": -3.817183256149292, "logps/rejected": -4.456064701080322, "loss": 0.5807, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.817183256149292, "rewards/margins": 0.6388810873031616, "rewards/rejected": -4.456064701080322, "sft_loss": 3.8272976875305176, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 12.81936248461788, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.07301422208547592, "logits/rejected": 0.004421988967806101, "logps/chosen": -3.991558790206909, "logps/rejected": -4.765776634216309, "loss": 0.5697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.991558790206909, "rewards/margins": 0.7742177248001099, "rewards/rejected": -4.765776634216309, "sft_loss": 4.0272016525268555, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 17.547856262711125, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.06253395229578018, "logits/rejected": -0.03671478480100632, "logps/chosen": -4.0807695388793945, "logps/rejected": -4.742793560028076, "loss": 0.5886, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.0807695388793945, "rewards/margins": 0.6620240211486816, "rewards/rejected": -4.742793560028076, "sft_loss": 4.02866268157959, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 13.651133484117285, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.1057223305106163, "logits/rejected": -0.010374218225479126, "logps/chosen": -3.9620888233184814, "logps/rejected": -4.94193172454834, "loss": 0.5389, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.9620888233184814, "rewards/margins": 0.9798428416252136, "rewards/rejected": -4.94193172454834, "sft_loss": 4.024913787841797, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 14.638181690700069, "learning_rate": 8.51731666796467e-07, "logits/chosen": -0.07686323672533035, "logits/rejected": -0.020837152376770973, "logps/chosen": -4.231797218322754, "logps/rejected": -4.9157586097717285, "loss": 0.6144, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.231797218322754, "rewards/margins": 0.683961033821106, "rewards/rejected": -4.9157586097717285, "sft_loss": 4.229867935180664, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 14.737203631162844, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.116435207426548, "logits/rejected": -0.06153398007154465, "logps/chosen": -4.225415229797363, "logps/rejected": -4.778110504150391, "loss": 0.6326, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.225415229797363, "rewards/margins": 0.5526946783065796, "rewards/rejected": -4.778110504150391, "sft_loss": 4.249210834503174, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 12.484086685506309, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.07110501825809479, "logits/rejected": 0.061887145042419434, "logps/chosen": -4.253946781158447, "logps/rejected": -5.050871849060059, "loss": 0.5617, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.253946781158447, "rewards/margins": 0.7969244122505188, "rewards/rejected": -5.050871849060059, "sft_loss": 4.325262069702148, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 15.068150123718786, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.10341789573431015, "logits/rejected": -0.020474877208471298, "logps/chosen": -4.231016635894775, "logps/rejected": -4.958784103393555, "loss": 0.5979, "rewards/accuracies": 0.71875, "rewards/chosen": -4.231016635894775, "rewards/margins": 0.7277677655220032, "rewards/rejected": -4.958784103393555, "sft_loss": 4.313578128814697, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 21.8315650041553, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.20346875488758087, "logits/rejected": -0.09715770184993744, "logps/chosen": -4.41485071182251, "logps/rejected": -5.1168389320373535, "loss": 0.6072, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.41485071182251, "rewards/margins": 0.7019883394241333, "rewards/rejected": -5.1168389320373535, "sft_loss": 4.39910888671875, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 20.59256668239971, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.09526954591274261, "logits/rejected": 0.004205456469208002, "logps/chosen": -4.31170654296875, "logps/rejected": -5.111785888671875, "loss": 0.641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.31170654296875, "rewards/margins": 0.8000794649124146, "rewards/rejected": -5.111785888671875, "sft_loss": 4.410150527954102, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 15.729252198391519, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.09368324279785156, "logits/rejected": -0.002443189499899745, "logps/chosen": -4.455938816070557, "logps/rejected": -5.167759895324707, "loss": 0.5937, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.455938816070557, "rewards/margins": 0.7118209600448608, "rewards/rejected": -5.167759895324707, "sft_loss": 4.464524269104004, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 19.667161127776616, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.08779771625995636, "logits/rejected": -0.034873414784669876, "logps/chosen": -4.279250144958496, "logps/rejected": -5.026732444763184, "loss": 0.604, "rewards/accuracies": 0.71875, "rewards/chosen": -4.279250144958496, "rewards/margins": 0.747482419013977, "rewards/rejected": -5.026732444763184, "sft_loss": 4.387712001800537, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 14.558493175272398, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.12827758491039276, "logits/rejected": -0.027052491903305054, "logps/chosen": -4.180931091308594, "logps/rejected": -4.950901031494141, "loss": 0.5554, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.180931091308594, "rewards/margins": 0.7699697017669678, "rewards/rejected": -4.950901031494141, "sft_loss": 4.20839786529541, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 18.27663410760242, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.17196258902549744, "logits/rejected": -0.06464549154043198, "logps/chosen": -4.138596534729004, "logps/rejected": -4.672633171081543, "loss": 0.6435, "rewards/accuracies": 0.6875, "rewards/chosen": -4.138596534729004, "rewards/margins": 0.5340362787246704, "rewards/rejected": -4.672633171081543, "sft_loss": 4.115258693695068, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 12.116387685411857, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.055260974913835526, "logits/rejected": 0.04977753013372421, "logps/chosen": -3.918208360671997, "logps/rejected": -4.867048740386963, "loss": 0.5035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.918208360671997, "rewards/margins": 0.9488400220870972, "rewards/rejected": -4.867048740386963, "sft_loss": 3.9469428062438965, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 13.712602815240812, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.09575023502111435, "logits/rejected": -0.02423889935016632, "logps/chosen": -3.929525375366211, "logps/rejected": -4.6153106689453125, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.929525375366211, "rewards/margins": 0.6857854723930359, "rewards/rejected": -4.6153106689453125, "sft_loss": 3.9949851036071777, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 15.553912642870277, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.13556435704231262, "logits/rejected": 0.03311797231435776, "logps/chosen": -3.945952892303467, "logps/rejected": -4.7006120681762695, "loss": 0.5602, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.945952892303467, "rewards/margins": 0.7546585202217102, "rewards/rejected": -4.7006120681762695, "sft_loss": 4.093667030334473, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 13.24083869530128, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.04542506858706474, "logits/rejected": -0.024010324850678444, "logps/chosen": -3.9064109325408936, "logps/rejected": -4.599078178405762, "loss": 0.562, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9064109325408936, "rewards/margins": 0.6926669478416443, "rewards/rejected": -4.599078178405762, "sft_loss": 3.8920929431915283, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 12.759117673717622, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.15118056535720825, "logits/rejected": 0.05147156864404678, "logps/chosen": -4.047581195831299, "logps/rejected": -4.806788444519043, "loss": 0.5797, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.047581195831299, "rewards/margins": 0.7592069506645203, "rewards/rejected": -4.806788444519043, "sft_loss": 4.174335956573486, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 13.883968917337462, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.10345200449228287, "logits/rejected": 0.0034005953930318356, "logps/chosen": -4.144354820251465, "logps/rejected": -4.889418125152588, "loss": 0.5943, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.144354820251465, "rewards/margins": 0.7450634837150574, "rewards/rejected": -4.889418125152588, "sft_loss": 4.188509941101074, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 10.731596419107385, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.20840208232402802, "logits/rejected": -0.11927944421768188, "logps/chosen": -4.063103675842285, "logps/rejected": -4.731971263885498, "loss": 0.5719, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.063103675842285, "rewards/margins": 0.6688679456710815, "rewards/rejected": -4.731971263885498, "sft_loss": 4.155635356903076, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 10.963254128019047, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.18590185046195984, "logits/rejected": -0.08720795810222626, "logps/chosen": -4.1862921714782715, "logps/rejected": -4.918190956115723, "loss": 0.5674, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.1862921714782715, "rewards/margins": 0.7318993806838989, "rewards/rejected": -4.918190956115723, "sft_loss": 4.248475074768066, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 11.916215877231483, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.1482846736907959, "logits/rejected": -0.08977895975112915, "logps/chosen": -4.116065502166748, "logps/rejected": -4.904284477233887, "loss": 0.5654, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.116065502166748, "rewards/margins": 0.7882182002067566, "rewards/rejected": -4.904284477233887, "sft_loss": 4.303043842315674, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 18.505577200312903, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2550107538700104, "logits/rejected": -0.09487710893154144, "logps/chosen": -4.230193138122559, "logps/rejected": -4.9468584060668945, "loss": 0.5762, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.230193138122559, "rewards/margins": 0.7166647911071777, "rewards/rejected": -4.9468584060668945, "sft_loss": 4.287786960601807, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 15.467327289614014, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.2060461938381195, "logits/rejected": -0.09747409075498581, "logps/chosen": -4.046446800231934, "logps/rejected": -4.801242828369141, "loss": 0.5648, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.046446800231934, "rewards/margins": 0.7547962069511414, "rewards/rejected": -4.801242828369141, "sft_loss": 4.178564548492432, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 14.620059094646697, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.23246657848358154, "logits/rejected": -0.07202459126710892, "logps/chosen": -4.032923698425293, "logps/rejected": -4.800480842590332, "loss": 0.5747, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.032923698425293, "rewards/margins": 0.7675572633743286, "rewards/rejected": -4.800480842590332, "sft_loss": 4.033566951751709, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 15.895269581813086, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.16594497859477997, "logits/rejected": -0.07534348219633102, "logps/chosen": -3.938028335571289, "logps/rejected": -4.824290752410889, "loss": 0.5395, "rewards/accuracies": 0.71875, "rewards/chosen": -3.938028335571289, "rewards/margins": 0.8862627744674683, "rewards/rejected": -4.824290752410889, "sft_loss": 3.8248863220214844, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 11.005947997620881, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.2246125191450119, "logits/rejected": -0.10327408462762833, "logps/chosen": -3.947735548019409, "logps/rejected": -4.890450477600098, "loss": 0.5051, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.947735548019409, "rewards/margins": 0.942714512348175, "rewards/rejected": -4.890450477600098, "sft_loss": 3.9465606212615967, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 13.930596343210329, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.1279004067182541, "logits/rejected": 0.03352883830666542, "logps/chosen": -4.045596122741699, "logps/rejected": -4.859705924987793, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": -4.045596122741699, "rewards/margins": 0.814110279083252, "rewards/rejected": -4.859705924987793, "sft_loss": 4.0374040603637695, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 16.839294295183258, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.16892042756080627, "logits/rejected": -0.10787680000066757, "logps/chosen": -4.0660200119018555, "logps/rejected": -4.787367820739746, "loss": 0.5484, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.0660200119018555, "rewards/margins": 0.7213469743728638, "rewards/rejected": -4.787367820739746, "sft_loss": 4.065528869628906, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 17.413554977238082, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.1649189293384552, "logits/rejected": -0.13256001472473145, "logps/chosen": -4.049285411834717, "logps/rejected": -4.7529191970825195, "loss": 0.5938, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.049285411834717, "rewards/margins": 0.7036335468292236, "rewards/rejected": -4.7529191970825195, "sft_loss": 4.0807366371154785, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 13.4636615396051, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.12577806413173676, "logits/rejected": -0.05988551303744316, "logps/chosen": -4.122179985046387, "logps/rejected": -5.196074485778809, "loss": 0.4992, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.122179985046387, "rewards/margins": 1.0738941431045532, "rewards/rejected": -5.196074485778809, "sft_loss": 4.1924967765808105, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 14.840492992172235, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.18502992391586304, "logits/rejected": -0.07532224804162979, "logps/chosen": -4.094423770904541, "logps/rejected": -4.795498847961426, "loss": 0.5868, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.094423770904541, "rewards/margins": 0.7010759115219116, "rewards/rejected": -4.795498847961426, "sft_loss": 4.141739845275879, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 14.625104911300314, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.027142172679305077, "logits/rejected": 0.014019886963069439, "logps/chosen": -4.086574554443359, "logps/rejected": -4.903563499450684, "loss": 0.5701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.086574554443359, "rewards/margins": 0.8169891238212585, "rewards/rejected": -4.903563499450684, "sft_loss": 4.168776035308838, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 17.926558844975595, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.07390134036540985, "logits/rejected": 0.005565158557146788, "logps/chosen": -4.23164176940918, "logps/rejected": -5.028608322143555, "loss": 0.5973, "rewards/accuracies": 0.71875, "rewards/chosen": -4.23164176940918, "rewards/margins": 0.796966016292572, "rewards/rejected": -5.028608322143555, "sft_loss": 4.260631084442139, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 15.960753170538107, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.10266276448965073, "logits/rejected": 0.004123342223465443, "logps/chosen": -4.1737165451049805, "logps/rejected": -4.8127055168151855, "loss": 0.6381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.1737165451049805, "rewards/margins": 0.6389890909194946, "rewards/rejected": -4.8127055168151855, "sft_loss": 4.173183441162109, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 19.160626139055477, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.15659977495670319, "logits/rejected": -0.07845531404018402, "logps/chosen": -4.23665714263916, "logps/rejected": -4.884160041809082, "loss": 0.6013, "rewards/accuracies": 0.6875, "rewards/chosen": -4.23665714263916, "rewards/margins": 0.6475027799606323, "rewards/rejected": -4.884160041809082, "sft_loss": 4.255043983459473, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 21.29535366361702, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.08475625514984131, "logits/rejected": 0.04252525418996811, "logps/chosen": -4.185021877288818, "logps/rejected": -5.140681266784668, "loss": 0.5477, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.185021877288818, "rewards/margins": 0.9556596875190735, "rewards/rejected": -5.140681266784668, "sft_loss": 4.271013259887695, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 18.12803368124785, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.19395044445991516, "logits/rejected": -0.06716791540384293, "logps/chosen": -4.197428226470947, "logps/rejected": -4.901777744293213, "loss": 0.6187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.197428226470947, "rewards/margins": 0.7043498158454895, "rewards/rejected": -4.901777744293213, "sft_loss": 4.247593402862549, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.0695294588804245, "eval_logits/rejected": 0.14971204102039337, "eval_logps/chosen": -4.1031975746154785, "eval_logps/rejected": -4.878896236419678, "eval_loss": 0.5857792496681213, "eval_rewards/accuracies": 0.715133547782898, "eval_rewards/chosen": -4.1031975746154785, "eval_rewards/margins": 0.7756983637809753, "eval_rewards/rejected": -4.878896236419678, "eval_runtime": 42.9964, "eval_samples_per_second": 31.282, "eval_sft_loss": 4.131138801574707, "eval_steps_per_second": 7.838, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 17.05338147800005, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.14190088212490082, "logits/rejected": -0.09127983450889587, "logps/chosen": -4.014540672302246, "logps/rejected": -4.691771507263184, "loss": 0.6169, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.014540672302246, "rewards/margins": 0.6772304177284241, "rewards/rejected": -4.691771507263184, "sft_loss": 4.064333438873291, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 19.87084053942476, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.11133239418268204, "logits/rejected": -0.011872517876327038, "logps/chosen": -3.922724485397339, "logps/rejected": -4.658257961273193, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.922724485397339, "rewards/margins": 0.7355332970619202, "rewards/rejected": -4.658257961273193, "sft_loss": 3.9572842121124268, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 15.774688448078022, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.09978102147579193, "logits/rejected": 0.0001321844756603241, "logps/chosen": -3.9842612743377686, "logps/rejected": -4.7468109130859375, "loss": 0.5523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.9842612743377686, "rewards/margins": 0.7625495791435242, "rewards/rejected": -4.7468109130859375, "sft_loss": 3.9912009239196777, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 14.969559946626333, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.20114025473594666, "logits/rejected": -0.046921707689762115, "logps/chosen": -4.055922508239746, "logps/rejected": -4.940064430236816, "loss": 0.5027, "rewards/accuracies": 0.78125, "rewards/chosen": -4.055922508239746, "rewards/margins": 0.884142279624939, "rewards/rejected": -4.940064430236816, "sft_loss": 4.046994686126709, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 13.207095475491212, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.0987638384103775, "logits/rejected": 0.012016674503684044, "logps/chosen": -4.053750038146973, "logps/rejected": -4.846047401428223, "loss": 0.5785, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.053750038146973, "rewards/margins": 0.7922976613044739, "rewards/rejected": -4.846047401428223, "sft_loss": 4.142302513122559, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 11.154188596460584, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.19130758941173553, "logits/rejected": -0.14506961405277252, "logps/chosen": -3.9372031688690186, "logps/rejected": -4.668862342834473, "loss": 0.5785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9372031688690186, "rewards/margins": 0.7316591143608093, "rewards/rejected": -4.668862342834473, "sft_loss": 3.939913511276245, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 17.533100647290997, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.16748812794685364, "logits/rejected": -0.051542095839977264, "logps/chosen": -4.124471187591553, "logps/rejected": -4.820754051208496, "loss": 0.6085, "rewards/accuracies": 0.71875, "rewards/chosen": -4.124471187591553, "rewards/margins": 0.6962825059890747, "rewards/rejected": -4.820754051208496, "sft_loss": 4.2142438888549805, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 26.039351885496657, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.24009211361408234, "logits/rejected": -0.18795573711395264, "logps/chosen": -4.091917514801025, "logps/rejected": -4.927672386169434, "loss": 0.5537, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.091917514801025, "rewards/margins": 0.8357549905776978, "rewards/rejected": -4.927672386169434, "sft_loss": 4.069763660430908, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 18.31897348268519, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.155735582113266, "logits/rejected": -0.10973642021417618, "logps/chosen": -4.329349040985107, "logps/rejected": -5.011044025421143, "loss": 0.589, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.329349040985107, "rewards/margins": 0.6816949844360352, "rewards/rejected": -5.011044025421143, "sft_loss": 4.351229667663574, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 20.184438837605725, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.18872496485710144, "logits/rejected": -0.08053840696811676, "logps/chosen": -4.147758483886719, "logps/rejected": -4.978602409362793, "loss": 0.5597, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.147758483886719, "rewards/margins": 0.8308441042900085, "rewards/rejected": -4.978602409362793, "sft_loss": 4.277966499328613, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 20.94320682853988, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.15364764630794525, "logits/rejected": -0.024008695036172867, "logps/chosen": -4.2366251945495605, "logps/rejected": -5.171116352081299, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -4.2366251945495605, "rewards/margins": 0.934490978717804, "rewards/rejected": -5.171116352081299, "sft_loss": 4.434886932373047, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 22.250093128795292, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.14576201140880585, "logits/rejected": -0.08288852870464325, "logps/chosen": -4.201880931854248, "logps/rejected": -5.092720985412598, "loss": 0.5352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.201880931854248, "rewards/margins": 0.8908407092094421, "rewards/rejected": -5.092720985412598, "sft_loss": 4.3396759033203125, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 21.242304741827414, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.18846619129180908, "logits/rejected": -0.06018044799566269, "logps/chosen": -4.2880449295043945, "logps/rejected": -5.2668890953063965, "loss": 0.5225, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.2880449295043945, "rewards/margins": 0.9788439869880676, "rewards/rejected": -5.2668890953063965, "sft_loss": 4.337084770202637, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 19.31570145078924, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.10848329961299896, "logits/rejected": 0.021837735548615456, "logps/chosen": -4.196038246154785, "logps/rejected": -5.2210283279418945, "loss": 0.4841, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.196038246154785, "rewards/margins": 1.024989128112793, "rewards/rejected": -5.2210283279418945, "sft_loss": 4.274868965148926, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 22.94013542586667, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.18677277863025665, "logits/rejected": -0.09163932502269745, "logps/chosen": -4.378596782684326, "logps/rejected": -5.164307117462158, "loss": 0.5738, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.378596782684326, "rewards/margins": 0.7857100963592529, "rewards/rejected": -5.164307117462158, "sft_loss": 4.5541205406188965, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 17.167234712434464, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.21692046523094177, "logits/rejected": -0.08329122513532639, "logps/chosen": -4.143950462341309, "logps/rejected": -5.008641242980957, "loss": 0.5503, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.143950462341309, "rewards/margins": 0.8646903038024902, "rewards/rejected": -5.008641242980957, "sft_loss": 4.27891206741333, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 19.34770237312849, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.16237115859985352, "logits/rejected": -0.08700194954872131, "logps/chosen": -4.205402374267578, "logps/rejected": -5.136702537536621, "loss": 0.5444, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.205402374267578, "rewards/margins": 0.9313004612922668, "rewards/rejected": -5.136702537536621, "sft_loss": 4.245094299316406, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 17.22780023928489, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.23272278904914856, "logits/rejected": -0.21367502212524414, "logps/chosen": -4.102339744567871, "logps/rejected": -4.745957374572754, "loss": 0.5998, "rewards/accuracies": 0.71875, "rewards/chosen": -4.102339744567871, "rewards/margins": 0.6436169743537903, "rewards/rejected": -4.745957374572754, "sft_loss": 4.200228691101074, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 17.10567437014088, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.22676539421081543, "logits/rejected": -0.12468956410884857, "logps/chosen": -4.255773067474365, "logps/rejected": -4.978231430053711, "loss": 0.5951, "rewards/accuracies": 0.71875, "rewards/chosen": -4.255773067474365, "rewards/margins": 0.722458004951477, "rewards/rejected": -4.978231430053711, "sft_loss": 4.269195079803467, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 20.0998741552246, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.16979366540908813, "logits/rejected": -0.14117419719696045, "logps/chosen": -4.149965763092041, "logps/rejected": -4.847676753997803, "loss": 0.5978, "rewards/accuracies": 0.71875, "rewards/chosen": -4.149965763092041, "rewards/margins": 0.6977112293243408, "rewards/rejected": -4.847676753997803, "sft_loss": 4.136035919189453, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 12.754923904543626, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.12392155826091766, "logits/rejected": -0.01042869407683611, "logps/chosen": -3.959754467010498, "logps/rejected": -4.914425849914551, "loss": 0.5009, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.959754467010498, "rewards/margins": 0.9546712636947632, "rewards/rejected": -4.914425849914551, "sft_loss": 3.9639930725097656, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 17.556772490752582, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.1658501923084259, "logits/rejected": -0.06590066850185394, "logps/chosen": -4.020123481750488, "logps/rejected": -4.954275131225586, "loss": 0.5673, "rewards/accuracies": 0.75, "rewards/chosen": -4.020123481750488, "rewards/margins": 0.9341517686843872, "rewards/rejected": -4.954275131225586, "sft_loss": 4.090924263000488, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 16.578758847117033, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.10159929096698761, "logits/rejected": -0.06274596601724625, "logps/chosen": -4.219315052032471, "logps/rejected": -5.10249137878418, "loss": 0.5599, "rewards/accuracies": 0.75, "rewards/chosen": -4.219315052032471, "rewards/margins": 0.883176326751709, "rewards/rejected": -5.10249137878418, "sft_loss": 4.273227691650391, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 16.2793904905083, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.19755463302135468, "logits/rejected": -0.15835200250148773, "logps/chosen": -4.155510425567627, "logps/rejected": -4.86837100982666, "loss": 0.5639, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.155510425567627, "rewards/margins": 0.7128608822822571, "rewards/rejected": -4.86837100982666, "sft_loss": 4.283026695251465, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 14.055977009848089, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.18170380592346191, "logits/rejected": -0.10592323541641235, "logps/chosen": -4.295332431793213, "logps/rejected": -5.149449348449707, "loss": 0.5547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.295332431793213, "rewards/margins": 0.8541167974472046, "rewards/rejected": -5.149449348449707, "sft_loss": 4.373181343078613, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 17.30927241380946, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.11262079328298569, "logits/rejected": 0.050410158932209015, "logps/chosen": -4.156128883361816, "logps/rejected": -5.0874128341674805, "loss": 0.5383, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.156128883361816, "rewards/margins": 0.9312840700149536, "rewards/rejected": -5.0874128341674805, "sft_loss": 4.226848602294922, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 19.103225819609648, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.14864888787269592, "logits/rejected": 0.01239332277327776, "logps/chosen": -3.9819626808166504, "logps/rejected": -4.8957133293151855, "loss": 0.5437, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.9819626808166504, "rewards/margins": 0.9137503504753113, "rewards/rejected": -4.8957133293151855, "sft_loss": 3.9962246417999268, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 22.11739512358915, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.12412190437316895, "logits/rejected": -0.02263473905622959, "logps/chosen": -3.9432873725891113, "logps/rejected": -4.75213098526001, "loss": 0.57, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.9432873725891113, "rewards/margins": 0.8088437914848328, "rewards/rejected": -4.75213098526001, "sft_loss": 4.006190776824951, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 14.002264234409363, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.11869798600673676, "logits/rejected": -0.03148897737264633, "logps/chosen": -3.889836549758911, "logps/rejected": -4.979212760925293, "loss": 0.5115, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.889836549758911, "rewards/margins": 1.08937668800354, "rewards/rejected": -4.979212760925293, "sft_loss": 3.909101963043213, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 15.202302167744714, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.15758922696113586, "logits/rejected": -0.09596370160579681, "logps/chosen": -4.030735015869141, "logps/rejected": -4.90773344039917, "loss": 0.5131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.030735015869141, "rewards/margins": 0.8769989013671875, "rewards/rejected": -4.90773344039917, "sft_loss": 3.9965927600860596, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 18.19913795403612, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.12692536413669586, "logits/rejected": -0.056444037705659866, "logps/chosen": -4.315699577331543, "logps/rejected": -4.95998477935791, "loss": 0.6302, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.315699577331543, "rewards/margins": 0.6442848443984985, "rewards/rejected": -4.95998477935791, "sft_loss": 4.314980506896973, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 17.004668760738433, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.12733769416809082, "logits/rejected": -0.060163237154483795, "logps/chosen": -4.17328405380249, "logps/rejected": -5.150243282318115, "loss": 0.5492, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.17328405380249, "rewards/margins": 0.976959228515625, "rewards/rejected": -5.150243282318115, "sft_loss": 4.155553817749023, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 18.08265422362301, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.1178988590836525, "logits/rejected": 0.03894580155611038, "logps/chosen": -4.1134514808654785, "logps/rejected": -5.065721035003662, "loss": 0.5513, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.1134514808654785, "rewards/margins": 0.9522699117660522, "rewards/rejected": -5.065721035003662, "sft_loss": 4.160314083099365, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 10.545918570379506, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.09397809207439423, "logits/rejected": 0.013970824889838696, "logps/chosen": -4.077818870544434, "logps/rejected": -5.0350213050842285, "loss": 0.5459, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.077818870544434, "rewards/margins": 0.9572019577026367, "rewards/rejected": -5.0350213050842285, "sft_loss": 4.102574825286865, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 13.748822446211006, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.18362076580524445, "logits/rejected": -0.025226274505257607, "logps/chosen": -4.079222679138184, "logps/rejected": -4.883854389190674, "loss": 0.562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.079222679138184, "rewards/margins": 0.8046321868896484, "rewards/rejected": -4.883854389190674, "sft_loss": 4.105748176574707, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 17.97959176978472, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.13779087364673615, "logits/rejected": -0.06253369152545929, "logps/chosen": -4.060021877288818, "logps/rejected": -5.039783954620361, "loss": 0.5132, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.060021877288818, "rewards/margins": 0.9797617793083191, "rewards/rejected": -5.039783954620361, "sft_loss": 4.161438941955566, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 17.599251235883298, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.06856133043766022, "logits/rejected": 0.05627395957708359, "logps/chosen": -4.204378604888916, "logps/rejected": -4.9699811935424805, "loss": 0.5685, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.204378604888916, "rewards/margins": 0.7656024694442749, "rewards/rejected": -4.9699811935424805, "sft_loss": 4.238471031188965, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 17.30284692112705, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.033752650022506714, "logits/rejected": -0.008529474027454853, "logps/chosen": -4.111490249633789, "logps/rejected": -5.000385284423828, "loss": 0.5555, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.111490249633789, "rewards/margins": 0.8888949155807495, "rewards/rejected": -5.000385284423828, "sft_loss": 4.2806501388549805, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 16.467697801821306, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.11014772951602936, "logits/rejected": -0.02117350697517395, "logps/chosen": -4.215609550476074, "logps/rejected": -5.145616054534912, "loss": 0.5854, "rewards/accuracies": 0.6875, "rewards/chosen": -4.215609550476074, "rewards/margins": 0.9300066232681274, "rewards/rejected": -5.145616054534912, "sft_loss": 4.365790367126465, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 15.112508542730295, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.10207013040781021, "logits/rejected": -0.10883428156375885, "logps/chosen": -4.202320575714111, "logps/rejected": -5.06498384475708, "loss": 0.5486, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.202320575714111, "rewards/margins": 0.8626632690429688, "rewards/rejected": -5.06498384475708, "sft_loss": 4.304457664489746, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 20.42638434211248, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.16497863829135895, "logits/rejected": -0.026081090793013573, "logps/chosen": -4.316076755523682, "logps/rejected": -5.126869201660156, "loss": 0.5781, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.316076755523682, "rewards/margins": 0.8107931017875671, "rewards/rejected": -5.126869201660156, "sft_loss": 4.403500556945801, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 21.271489259140914, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.17145070433616638, "logits/rejected": -0.0644187331199646, "logps/chosen": -4.0592265129089355, "logps/rejected": -5.11184024810791, "loss": 0.4993, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.0592265129089355, "rewards/margins": 1.052613615989685, "rewards/rejected": -5.11184024810791, "sft_loss": 4.157874584197998, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 16.32211937915042, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.158425971865654, "logits/rejected": -0.09455561637878418, "logps/chosen": -4.253739833831787, "logps/rejected": -5.023743629455566, "loss": 0.587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.253739833831787, "rewards/margins": 0.7700031995773315, "rewards/rejected": -5.023743629455566, "sft_loss": 4.278241157531738, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 17.160418022349273, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.18763625621795654, "logits/rejected": -0.08890589326620102, "logps/chosen": -4.00942325592041, "logps/rejected": -4.843209266662598, "loss": 0.5768, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.00942325592041, "rewards/margins": 0.833785891532898, "rewards/rejected": -4.843209266662598, "sft_loss": 4.075999736785889, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 12.458562499903586, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.20564058423042297, "logits/rejected": 0.01137080229818821, "logps/chosen": -3.936244487762451, "logps/rejected": -5.025080680847168, "loss": 0.4974, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.936244487762451, "rewards/margins": 1.088836908340454, "rewards/rejected": -5.025080680847168, "sft_loss": 4.006060600280762, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 16.745086145900924, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.18450811505317688, "logits/rejected": -0.04890236631035805, "logps/chosen": -3.865755558013916, "logps/rejected": -4.929928779602051, "loss": 0.5334, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.865755558013916, "rewards/margins": 1.0641729831695557, "rewards/rejected": -4.929928779602051, "sft_loss": 3.895325183868408, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 29.180103058843205, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.14506229758262634, "logits/rejected": 0.006264629773795605, "logps/chosen": -3.9540493488311768, "logps/rejected": -4.871550559997559, "loss": 0.5514, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.9540493488311768, "rewards/margins": 0.9175008535385132, "rewards/rejected": -4.871550559997559, "sft_loss": 3.948908567428589, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 16.713002193962918, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.13637515902519226, "logits/rejected": 0.00962438527494669, "logps/chosen": -4.014534950256348, "logps/rejected": -4.793184757232666, "loss": 0.5942, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.014534950256348, "rewards/margins": 0.7786496877670288, "rewards/rejected": -4.793184757232666, "sft_loss": 4.10478401184082, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 22.288147043431373, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.10160605609416962, "logits/rejected": -0.07597693055868149, "logps/chosen": -3.868107318878174, "logps/rejected": -4.712255954742432, "loss": 0.5491, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.868107318878174, "rewards/margins": 0.8441489338874817, "rewards/rejected": -4.712255954742432, "sft_loss": 3.9633774757385254, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 15.5423927398474, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.05929984897375107, "logits/rejected": -0.025438731536269188, "logps/chosen": -3.977214813232422, "logps/rejected": -4.813974380493164, "loss": 0.5684, "rewards/accuracies": 0.71875, "rewards/chosen": -3.977214813232422, "rewards/margins": 0.8367594480514526, "rewards/rejected": -4.813974380493164, "sft_loss": 4.0244035720825195, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 23.989756074132227, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.0868724137544632, "logits/rejected": -0.020146001130342484, "logps/chosen": -3.998081684112549, "logps/rejected": -4.8043036460876465, "loss": 0.5908, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.998081684112549, "rewards/margins": 0.8062219619750977, "rewards/rejected": -4.8043036460876465, "sft_loss": 4.047082424163818, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 15.013028141264995, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.07611582428216934, "logits/rejected": -0.07014383375644684, "logps/chosen": -3.9512946605682373, "logps/rejected": -4.656590461730957, "loss": 0.5899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9512946605682373, "rewards/margins": 0.7052956223487854, "rewards/rejected": -4.656590461730957, "sft_loss": 4.055122375488281, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 12.295896409912185, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.11540064960718155, "logits/rejected": -0.031953103840351105, "logps/chosen": -3.908743381500244, "logps/rejected": -4.824851989746094, "loss": 0.5311, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.908743381500244, "rewards/margins": 0.916108250617981, "rewards/rejected": -4.824851989746094, "sft_loss": 4.039055824279785, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 20.32458374752934, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.19283311069011688, "logits/rejected": -0.04803388565778732, "logps/chosen": -3.9939074516296387, "logps/rejected": -4.994405269622803, "loss": 0.5022, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.9939074516296387, "rewards/margins": 1.000497579574585, "rewards/rejected": -4.994405269622803, "sft_loss": 4.019697666168213, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 14.210381238979735, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.14989027380943298, "logits/rejected": -0.026964152231812477, "logps/chosen": -3.871859073638916, "logps/rejected": -4.951968193054199, "loss": 0.4942, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.871859073638916, "rewards/margins": 1.0801092386245728, "rewards/rejected": -4.951968193054199, "sft_loss": 3.885855197906494, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 18.383099955253005, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.14752139151096344, "logits/rejected": 0.008709174580872059, "logps/chosen": -3.8431155681610107, "logps/rejected": -4.750420570373535, "loss": 0.5279, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.8431155681610107, "rewards/margins": 0.9073052406311035, "rewards/rejected": -4.750420570373535, "sft_loss": 3.9268157482147217, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 11.770440842002662, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.13080976903438568, "logits/rejected": 0.026985500007867813, "logps/chosen": -4.05759334564209, "logps/rejected": -4.9776129722595215, "loss": 0.533, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.05759334564209, "rewards/margins": 0.9200197458267212, "rewards/rejected": -4.9776129722595215, "sft_loss": 4.099170684814453, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 19.709789959596435, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.19222676753997803, "logits/rejected": -0.062310922890901566, "logps/chosen": -3.993795871734619, "logps/rejected": -4.981151103973389, "loss": 0.5684, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.993795871734619, "rewards/margins": 0.9873548746109009, "rewards/rejected": -4.981151103973389, "sft_loss": 4.073160171508789, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 20.5759194425081, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.15541115403175354, "logits/rejected": -0.08983412384986877, "logps/chosen": -4.1916046142578125, "logps/rejected": -4.978062629699707, "loss": 0.6232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.1916046142578125, "rewards/margins": 0.7864580750465393, "rewards/rejected": -4.978062629699707, "sft_loss": 4.301588535308838, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 18.397865902564224, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.1119125708937645, "logits/rejected": 0.007355786859989166, "logps/chosen": -4.03453254699707, "logps/rejected": -4.903344631195068, "loss": 0.5729, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.03453254699707, "rewards/margins": 0.8688125610351562, "rewards/rejected": -4.903344631195068, "sft_loss": 4.140660285949707, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 13.963771138170584, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.12454034388065338, "logits/rejected": -0.025773998349905014, "logps/chosen": -4.224725723266602, "logps/rejected": -5.127718925476074, "loss": 0.5521, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.224725723266602, "rewards/margins": 0.9029935598373413, "rewards/rejected": -5.127718925476074, "sft_loss": 4.250973701477051, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 16.055123565220338, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.11387109756469727, "logits/rejected": 0.07982424646615982, "logps/chosen": -4.00413179397583, "logps/rejected": -5.042935371398926, "loss": 0.5002, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.00413179397583, "rewards/margins": 1.0388035774230957, "rewards/rejected": -5.042935371398926, "sft_loss": 4.035943031311035, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 13.380973620447216, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.1749085783958435, "logits/rejected": 0.003732487093657255, "logps/chosen": -4.17561674118042, "logps/rejected": -5.078269004821777, "loss": 0.5367, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.17561674118042, "rewards/margins": 0.9026520848274231, "rewards/rejected": -5.078269004821777, "sft_loss": 4.265914440155029, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 13.500724126105808, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.04895884171128273, "logits/rejected": -0.010584674775600433, "logps/chosen": -4.067770481109619, "logps/rejected": -5.008973598480225, "loss": 0.5079, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.067770481109619, "rewards/margins": 0.9412034153938293, "rewards/rejected": -5.008973598480225, "sft_loss": 4.119451999664307, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 13.896592196030422, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.03926707059144974, "logits/rejected": 0.00505078723654151, "logps/chosen": -4.068405628204346, "logps/rejected": -5.059308052062988, "loss": 0.4772, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.068405628204346, "rewards/margins": 0.9909025430679321, "rewards/rejected": -5.059308052062988, "sft_loss": 4.019812107086182, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 18.523891805911703, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.13083158433437347, "logits/rejected": -0.04379258677363396, "logps/chosen": -4.155807971954346, "logps/rejected": -5.062368869781494, "loss": 0.5736, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.155807971954346, "rewards/margins": 0.9065613746643066, "rewards/rejected": -5.062368869781494, "sft_loss": 4.245884895324707, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 16.341347028939982, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.08317460119724274, "logits/rejected": -0.008221238851547241, "logps/chosen": -4.245736122131348, "logps/rejected": -5.182717800140381, "loss": 0.5387, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.245736122131348, "rewards/margins": 0.9369821548461914, "rewards/rejected": -5.182717800140381, "sft_loss": 4.267935752868652, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 16.1921418325257, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.21621200442314148, "logits/rejected": -0.05815199017524719, "logps/chosen": -4.295234680175781, "logps/rejected": -5.175727844238281, "loss": 0.5664, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.295234680175781, "rewards/margins": 0.8804932832717896, "rewards/rejected": -5.175727844238281, "sft_loss": 4.34125280380249, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 14.671338560540171, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.1463036984205246, "logits/rejected": 0.0023394287563860416, "logps/chosen": -4.261254787445068, "logps/rejected": -5.200662612915039, "loss": 0.5502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.261254787445068, "rewards/margins": 0.9394083023071289, "rewards/rejected": -5.200662612915039, "sft_loss": 4.298597812652588, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 14.786302419959824, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.12140548229217529, "logits/rejected": 0.021579179912805557, "logps/chosen": -4.222411155700684, "logps/rejected": -5.156377792358398, "loss": 0.5386, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.222411155700684, "rewards/margins": 0.9339669346809387, "rewards/rejected": -5.156377792358398, "sft_loss": 4.266728401184082, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 22.892584684084042, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.12190501391887665, "logits/rejected": 0.003065797733142972, "logps/chosen": -4.157485485076904, "logps/rejected": -5.230223655700684, "loss": 0.5172, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.157485485076904, "rewards/margins": 1.0727381706237793, "rewards/rejected": -5.230223655700684, "sft_loss": 4.248648643493652, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 21.165580208370535, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.12997111678123474, "logits/rejected": 0.008958925493061543, "logps/chosen": -4.3522419929504395, "logps/rejected": -5.354263782501221, "loss": 0.5413, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.3522419929504395, "rewards/margins": 1.0020217895507812, "rewards/rejected": -5.354263782501221, "sft_loss": 4.560755729675293, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 18.520445412017608, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.12115871906280518, "logits/rejected": 0.006101625971496105, "logps/chosen": -4.234294891357422, "logps/rejected": -5.240196228027344, "loss": 0.5312, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.234294891357422, "rewards/margins": 1.005900502204895, "rewards/rejected": -5.240196228027344, "sft_loss": 4.323896408081055, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 18.94162664238945, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.060700200498104095, "logits/rejected": 0.08092973381280899, "logps/chosen": -4.362667560577393, "logps/rejected": -5.29547643661499, "loss": 0.5441, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.362667560577393, "rewards/margins": 0.9328088760375977, "rewards/rejected": -5.29547643661499, "sft_loss": 4.4554314613342285, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 16.089707699168432, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.17727595567703247, "logits/rejected": -0.028062384575605392, "logps/chosen": -4.135886192321777, "logps/rejected": -5.120160102844238, "loss": 0.5319, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.135886192321777, "rewards/margins": 0.9842736124992371, "rewards/rejected": -5.120160102844238, "sft_loss": 4.2006707191467285, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 16.925599283103264, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.1306312084197998, "logits/rejected": 0.017275024205446243, "logps/chosen": -4.3715691566467285, "logps/rejected": -5.420563697814941, "loss": 0.5397, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.3715691566467285, "rewards/margins": 1.0489943027496338, "rewards/rejected": -5.420563697814941, "sft_loss": 4.410833835601807, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 17.298361405272146, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.0684792622923851, "logits/rejected": 0.0437152236700058, "logps/chosen": -4.312630653381348, "logps/rejected": -5.2664313316345215, "loss": 0.5355, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.312630653381348, "rewards/margins": 0.9538006782531738, "rewards/rejected": -5.2664313316345215, "sft_loss": 4.343523979187012, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 16.894139267941664, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.0749233216047287, "logits/rejected": -0.028645822778344154, "logps/chosen": -4.292330741882324, "logps/rejected": -5.397002696990967, "loss": 0.4933, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.292330741882324, "rewards/margins": 1.1046717166900635, "rewards/rejected": -5.397002696990967, "sft_loss": 4.319766044616699, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 20.57306013082817, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.19145503640174866, "logits/rejected": -0.09005744755268097, "logps/chosen": -4.161319255828857, "logps/rejected": -4.979227542877197, "loss": 0.5577, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.161319255828857, "rewards/margins": 0.8179081678390503, "rewards/rejected": -4.979227542877197, "sft_loss": 4.277467250823975, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 24.941469477287953, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.10934241861104965, "logits/rejected": -0.02125650644302368, "logps/chosen": -4.162993431091309, "logps/rejected": -4.937913417816162, "loss": 0.5774, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.162993431091309, "rewards/margins": 0.7749199867248535, "rewards/rejected": -4.937913417816162, "sft_loss": 4.256572723388672, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.1579088568687439, "eval_logits/rejected": 0.24521751701831818, "eval_logps/chosen": -4.261503219604492, "eval_logps/rejected": -5.1611127853393555, "eval_loss": 0.5776896476745605, "eval_rewards/accuracies": 0.7277448177337646, "eval_rewards/chosen": -4.261503219604492, "eval_rewards/margins": 0.8996095657348633, "eval_rewards/rejected": -5.1611127853393555, "eval_runtime": 43.0968, "eval_samples_per_second": 31.209, "eval_sft_loss": 4.317946434020996, "eval_steps_per_second": 7.82, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 13.250023970937631, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.09466713666915894, "logits/rejected": 0.034652967005968094, "logps/chosen": -4.163640022277832, "logps/rejected": -5.244973659515381, "loss": 0.5412, "rewards/accuracies": 0.75, "rewards/chosen": -4.163640022277832, "rewards/margins": 1.081333875656128, "rewards/rejected": -5.244973659515381, "sft_loss": 4.2320756912231445, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 18.225924804438332, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.15738347172737122, "logits/rejected": 0.001196927623823285, "logps/chosen": -4.119842052459717, "logps/rejected": -4.949393272399902, "loss": 0.5431, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.119842052459717, "rewards/margins": 0.8295515775680542, "rewards/rejected": -4.949393272399902, "sft_loss": 4.182238578796387, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 13.813377337904992, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.02098652347922325, "logits/rejected": 0.05877337604761124, "logps/chosen": -4.1416335105896, "logps/rejected": -5.199089527130127, "loss": 0.5336, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.1416335105896, "rewards/margins": 1.0574554204940796, "rewards/rejected": -5.199089527130127, "sft_loss": 4.261752605438232, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 13.803338758968538, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.13239726424217224, "logits/rejected": -0.02377437800168991, "logps/chosen": -4.052218914031982, "logps/rejected": -5.084633827209473, "loss": 0.5173, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.052218914031982, "rewards/margins": 1.0324145555496216, "rewards/rejected": -5.084633827209473, "sft_loss": 4.154684066772461, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 15.467388748261115, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.08688588440418243, "logits/rejected": 0.0056015015579760075, "logps/chosen": -3.8696110248565674, "logps/rejected": -4.658940315246582, "loss": 0.5586, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.8696110248565674, "rewards/margins": 0.7893290519714355, "rewards/rejected": -4.658940315246582, "sft_loss": 4.0217156410217285, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 24.121078239161786, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.05252006649971008, "logits/rejected": 0.01766449585556984, "logps/chosen": -3.976597547531128, "logps/rejected": -4.868682861328125, "loss": 0.5583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.976597547531128, "rewards/margins": 0.8920855522155762, "rewards/rejected": -4.868682861328125, "sft_loss": 3.9951343536376953, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 13.581396643669095, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.07369393110275269, "logits/rejected": 0.05318892002105713, "logps/chosen": -3.871485948562622, "logps/rejected": -4.692887783050537, "loss": 0.5965, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.871485948562622, "rewards/margins": 0.8214018940925598, "rewards/rejected": -4.692887783050537, "sft_loss": 3.947432279586792, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 16.035934350930617, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.07468081265687943, "logits/rejected": 0.03066675364971161, "logps/chosen": -3.855999708175659, "logps/rejected": -4.732884883880615, "loss": 0.5653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.855999708175659, "rewards/margins": 0.8768857717514038, "rewards/rejected": -4.732884883880615, "sft_loss": 3.871295928955078, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 22.99980782696124, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.1008199006319046, "logits/rejected": 0.03632703796029091, "logps/chosen": -3.902106761932373, "logps/rejected": -4.686600208282471, "loss": 0.5728, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.902106761932373, "rewards/margins": 0.7844932675361633, "rewards/rejected": -4.686600208282471, "sft_loss": 3.9292640686035156, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 23.013156543428995, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.03523783013224602, "logits/rejected": 0.08227065950632095, "logps/chosen": -3.7815330028533936, "logps/rejected": -4.665060520172119, "loss": 0.536, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.7815330028533936, "rewards/margins": 0.8835276365280151, "rewards/rejected": -4.665060520172119, "sft_loss": 3.8153178691864014, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 15.170455557689014, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.1345018893480301, "logits/rejected": 0.09576258063316345, "logps/chosen": -3.930418014526367, "logps/rejected": -4.801162242889404, "loss": 0.5256, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.930418014526367, "rewards/margins": 0.8707441091537476, "rewards/rejected": -4.801162242889404, "sft_loss": 3.9338157176971436, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 25.67041425851637, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.059803593903779984, "logits/rejected": 0.02915279194712639, "logps/chosen": -3.8801674842834473, "logps/rejected": -4.6841230392456055, "loss": 0.586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8801674842834473, "rewards/margins": 0.8039555549621582, "rewards/rejected": -4.6841230392456055, "sft_loss": 3.8844990730285645, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 14.242351838450366, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.06666535884141922, "logits/rejected": 0.013455508276820183, "logps/chosen": -3.9759879112243652, "logps/rejected": -4.992976665496826, "loss": 0.4987, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9759879112243652, "rewards/margins": 1.0169888734817505, "rewards/rejected": -4.992976665496826, "sft_loss": 3.9486610889434814, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 19.94100490761148, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.14499233663082123, "logits/rejected": -0.04203890636563301, "logps/chosen": -4.115070343017578, "logps/rejected": -4.816925525665283, "loss": 0.6046, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.115070343017578, "rewards/margins": 0.7018548250198364, "rewards/rejected": -4.816925525665283, "sft_loss": 4.2145185470581055, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 19.44598133863135, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.04176859185099602, "logits/rejected": 0.0038140460383147, "logps/chosen": -4.126821041107178, "logps/rejected": -4.9137067794799805, "loss": 0.5781, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.126821041107178, "rewards/margins": 0.7868860960006714, "rewards/rejected": -4.9137067794799805, "sft_loss": 4.189349174499512, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 13.139796574332802, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.12218749523162842, "logits/rejected": 0.060198068618774414, "logps/chosen": -4.126533031463623, "logps/rejected": -5.21274471282959, "loss": 0.5015, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.126533031463623, "rewards/margins": 1.086211085319519, "rewards/rejected": -5.21274471282959, "sft_loss": 4.309605121612549, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 15.664523423520645, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.04256322979927063, "logits/rejected": 0.18730340898036957, "logps/chosen": -3.963308811187744, "logps/rejected": -5.152323246002197, "loss": 0.4905, "rewards/accuracies": 0.75, "rewards/chosen": -3.963308811187744, "rewards/margins": 1.1890140771865845, "rewards/rejected": -5.152323246002197, "sft_loss": 4.1547064781188965, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 14.557563994969184, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.08105379343032837, "logits/rejected": -0.021051811054348946, "logps/chosen": -4.075659275054932, "logps/rejected": -5.015525817871094, "loss": 0.5219, "rewards/accuracies": 0.75, "rewards/chosen": -4.075659275054932, "rewards/margins": 0.9398663640022278, "rewards/rejected": -5.015525817871094, "sft_loss": 4.149325370788574, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 16.459628588276328, "learning_rate": 6.789367763543292e-07, "logits/chosen": 0.0012557022273540497, "logits/rejected": 0.03740160912275314, "logps/chosen": -4.035196781158447, "logps/rejected": -4.862973213195801, "loss": 0.5951, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.035196781158447, "rewards/margins": 0.827777087688446, "rewards/rejected": -4.862973213195801, "sft_loss": 4.138479232788086, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 20.5569062334297, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.02261544018983841, "logits/rejected": 0.12358323484659195, "logps/chosen": -3.9922146797180176, "logps/rejected": -4.67208194732666, "loss": 0.5743, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.9922146797180176, "rewards/margins": 0.6798672676086426, "rewards/rejected": -4.67208194732666, "sft_loss": 4.113407135009766, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 14.390536620434528, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.03141166269779205, "logits/rejected": 0.12173942476511002, "logps/chosen": -3.8743910789489746, "logps/rejected": -4.843568325042725, "loss": 0.5178, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.8743910789489746, "rewards/margins": 0.9691771268844604, "rewards/rejected": -4.843568325042725, "sft_loss": 3.890059232711792, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 14.954853828357361, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.09151516109704971, "logits/rejected": 0.03339110687375069, "logps/chosen": -3.8521568775177, "logps/rejected": -4.75100040435791, "loss": 0.5163, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.8521568775177, "rewards/margins": 0.8988434672355652, "rewards/rejected": -4.75100040435791, "sft_loss": 3.9217453002929688, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 14.922121712243158, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.00676008453592658, "logits/rejected": 0.0773114562034607, "logps/chosen": -3.9265663623809814, "logps/rejected": -4.898476600646973, "loss": 0.5228, "rewards/accuracies": 0.75, "rewards/chosen": -3.9265663623809814, "rewards/margins": 0.971910297870636, "rewards/rejected": -4.898476600646973, "sft_loss": 4.014676094055176, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 17.26823540006087, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.07249848544597626, "logits/rejected": 0.0011456996435299516, "logps/chosen": -4.010064125061035, "logps/rejected": -4.87760066986084, "loss": 0.52, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.010064125061035, "rewards/margins": 0.8675360679626465, "rewards/rejected": -4.87760066986084, "sft_loss": 3.9492244720458984, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 18.332281848753983, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.05388345569372177, "logits/rejected": 0.013353681191802025, "logps/chosen": -4.10679817199707, "logps/rejected": -5.018848419189453, "loss": 0.5434, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.10679817199707, "rewards/margins": 0.9120498895645142, "rewards/rejected": -5.018848419189453, "sft_loss": 4.238326072692871, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 23.35264730325701, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.0372026227414608, "logits/rejected": 0.033376261591911316, "logps/chosen": -4.218406677246094, "logps/rejected": -5.008605003356934, "loss": 0.6154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.218406677246094, "rewards/margins": 0.7901977896690369, "rewards/rejected": -5.008605003356934, "sft_loss": 4.312189102172852, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 23.98823359034702, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.11207201331853867, "logits/rejected": 0.017515579238533974, "logps/chosen": -4.183786869049072, "logps/rejected": -5.026003837585449, "loss": 0.5648, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.183786869049072, "rewards/margins": 0.8422168493270874, "rewards/rejected": -5.026003837585449, "sft_loss": 4.257922649383545, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 18.23902876924883, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.07603417336940765, "logits/rejected": 0.06636123359203339, "logps/chosen": -4.205514907836914, "logps/rejected": -5.013917446136475, "loss": 0.5752, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.205514907836914, "rewards/margins": 0.8084025382995605, "rewards/rejected": -5.013917446136475, "sft_loss": 4.290884971618652, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 17.576043961769646, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.07821284234523773, "logits/rejected": 0.10023043304681778, "logps/chosen": -4.091734886169434, "logps/rejected": -5.130631923675537, "loss": 0.4947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.091734886169434, "rewards/margins": 1.0388973951339722, "rewards/rejected": -5.130631923675537, "sft_loss": 4.176156044006348, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 17.31891971556111, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.041422732174396515, "logits/rejected": 0.0944138616323471, "logps/chosen": -4.169663429260254, "logps/rejected": -4.939678192138672, "loss": 0.575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.169663429260254, "rewards/margins": 0.7700151205062866, "rewards/rejected": -4.939678192138672, "sft_loss": 4.2203521728515625, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 17.980562333891605, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.026197027415037155, "logits/rejected": 0.08606614917516708, "logps/chosen": -3.9951393604278564, "logps/rejected": -5.0169453620910645, "loss": 0.5218, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.9951393604278564, "rewards/margins": 1.0218056440353394, "rewards/rejected": -5.0169453620910645, "sft_loss": 4.092293739318848, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 18.965850339952567, "learning_rate": 6.598899386669395e-07, "logits/chosen": 0.0009241849184036255, "logits/rejected": 0.10314282029867172, "logps/chosen": -3.983647584915161, "logps/rejected": -4.829386234283447, "loss": 0.5697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.983647584915161, "rewards/margins": 0.8457382321357727, "rewards/rejected": -4.829386234283447, "sft_loss": 4.019073486328125, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 22.059341130807468, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.016916906461119652, "logits/rejected": 0.09965212643146515, "logps/chosen": -3.9172863960266113, "logps/rejected": -4.917275428771973, "loss": 0.5232, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9172863960266113, "rewards/margins": 0.9999886751174927, "rewards/rejected": -4.917275428771973, "sft_loss": 3.9654273986816406, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 12.239812031955307, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.04140595346689224, "logits/rejected": 0.042737144976854324, "logps/chosen": -3.9178566932678223, "logps/rejected": -4.855074405670166, "loss": 0.5396, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.9178566932678223, "rewards/margins": 0.9372177124023438, "rewards/rejected": -4.855074405670166, "sft_loss": 3.9111125469207764, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 16.9497595753261, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.11973029375076294, "logits/rejected": 0.015954116359353065, "logps/chosen": -4.019374370574951, "logps/rejected": -4.899747848510742, "loss": 0.5561, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.019374370574951, "rewards/margins": 0.8803732991218567, "rewards/rejected": -4.899747848510742, "sft_loss": 4.068663120269775, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 21.405099023095033, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.06440500169992447, "logits/rejected": 0.015679899603128433, "logps/chosen": -4.161512851715088, "logps/rejected": -4.838021278381348, "loss": 0.5979, "rewards/accuracies": 0.6875, "rewards/chosen": -4.161512851715088, "rewards/margins": 0.6765087842941284, "rewards/rejected": -4.838021278381348, "sft_loss": 4.2326836585998535, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 18.492800267020584, "learning_rate": 6.524927148842602e-07, "logits/chosen": 0.027476992458105087, "logits/rejected": 0.166078582406044, "logps/chosen": -4.085366249084473, "logps/rejected": -4.959372520446777, "loss": 0.5506, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.085366249084473, "rewards/margins": 0.8740068674087524, "rewards/rejected": -4.959372520446777, "sft_loss": 4.118053913116455, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 22.051717275566087, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.07167111337184906, "logits/rejected": 0.03960205242037773, "logps/chosen": -3.994276523590088, "logps/rejected": -4.847044944763184, "loss": 0.5662, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.994276523590088, "rewards/margins": 0.8527683019638062, "rewards/rejected": -4.847044944763184, "sft_loss": 4.117629051208496, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 18.763676766189004, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.029994597658514977, "logits/rejected": 0.01790265180170536, "logps/chosen": -4.043631553649902, "logps/rejected": -4.825007438659668, "loss": 0.5664, "rewards/accuracies": 0.71875, "rewards/chosen": -4.043631553649902, "rewards/margins": 0.7813760042190552, "rewards/rejected": -4.825007438659668, "sft_loss": 4.099611759185791, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 22.383869694507865, "learning_rate": 6.480365119346011e-07, "logits/chosen": 0.072385273873806, "logits/rejected": 0.1959836781024933, "logps/chosen": -4.041221618652344, "logps/rejected": -4.9139838218688965, "loss": 0.5435, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.041221618652344, "rewards/margins": 0.8727631568908691, "rewards/rejected": -4.9139838218688965, "sft_loss": 4.137312889099121, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 14.148402012039993, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.07596292346715927, "logits/rejected": 0.14356279373168945, "logps/chosen": -4.043821811676025, "logps/rejected": -4.930019378662109, "loss": 0.5302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.043821811676025, "rewards/margins": 0.8861970901489258, "rewards/rejected": -4.930019378662109, "sft_loss": 4.203348159790039, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 24.014892783435013, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.05701693147420883, "logits/rejected": 0.10021479427814484, "logps/chosen": -4.121259689331055, "logps/rejected": -5.054667949676514, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": -4.121259689331055, "rewards/margins": 0.9334084391593933, "rewards/rejected": -5.054667949676514, "sft_loss": 4.234942436218262, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 20.26942892267099, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.007051569409668446, "logits/rejected": 0.11115667968988419, "logps/chosen": -4.124650001525879, "logps/rejected": -5.125641822814941, "loss": 0.5469, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.124650001525879, "rewards/margins": 1.0009920597076416, "rewards/rejected": -5.125641822814941, "sft_loss": 4.286810874938965, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 17.584308254258314, "learning_rate": 6.420748680351763e-07, "logits/chosen": 0.08284053951501846, "logits/rejected": 0.036834098398685455, "logps/chosen": -4.212587833404541, "logps/rejected": -4.950636863708496, "loss": 0.5971, "rewards/accuracies": 0.71875, "rewards/chosen": -4.212587833404541, "rewards/margins": 0.7380497455596924, "rewards/rejected": -4.950636863708496, "sft_loss": 4.395787239074707, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 26.301576396692624, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.02107442170381546, "logits/rejected": 0.10732152312994003, "logps/chosen": -4.238432884216309, "logps/rejected": -5.290383815765381, "loss": 0.5606, "rewards/accuracies": 0.75, "rewards/chosen": -4.238432884216309, "rewards/margins": 1.0519508123397827, "rewards/rejected": -5.290383815765381, "sft_loss": 4.277993202209473, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 17.321515931340176, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.10459005832672119, "logits/rejected": 0.06394404172897339, "logps/chosen": -4.208449363708496, "logps/rejected": -5.058264255523682, "loss": 0.5599, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.208449363708496, "rewards/margins": 0.849814772605896, "rewards/rejected": -5.058264255523682, "sft_loss": 4.295122146606445, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 19.571155222145673, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.10111421346664429, "logits/rejected": -0.006807476282119751, "logps/chosen": -4.322181224822998, "logps/rejected": -5.2172675132751465, "loss": 0.5485, "rewards/accuracies": 0.78125, "rewards/chosen": -4.322181224822998, "rewards/margins": 0.8950859904289246, "rewards/rejected": -5.2172675132751465, "sft_loss": 4.372712135314941, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 16.4871101876668, "learning_rate": 6.360911698273326e-07, "logits/chosen": 0.013696163892745972, "logits/rejected": 0.12533724308013916, "logps/chosen": -4.365028381347656, "logps/rejected": -5.152103424072266, "loss": 0.5814, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.365028381347656, "rewards/margins": 0.7870752215385437, "rewards/rejected": -5.152103424072266, "sft_loss": 4.409226894378662, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 14.975019621922833, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.014929696917533875, "logits/rejected": 0.05198700353503227, "logps/chosen": -4.0075507164001465, "logps/rejected": -5.04404354095459, "loss": 0.5027, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.0075507164001465, "rewards/margins": 1.0364933013916016, "rewards/rejected": -5.04404354095459, "sft_loss": 4.074658393859863, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 21.222758938780224, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.0078008463606238365, "logits/rejected": 0.10538405179977417, "logps/chosen": -4.141799449920654, "logps/rejected": -5.210144996643066, "loss": 0.5247, "rewards/accuracies": 0.75, "rewards/chosen": -4.141799449920654, "rewards/margins": 1.0683459043502808, "rewards/rejected": -5.210144996643066, "sft_loss": 4.205315589904785, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 16.952020669846306, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.006896118633449078, "logits/rejected": 0.13101527094841003, "logps/chosen": -4.1590070724487305, "logps/rejected": -5.018160820007324, "loss": 0.5278, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.1590070724487305, "rewards/margins": 0.859154224395752, "rewards/rejected": -5.018160820007324, "sft_loss": 4.295965671539307, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 15.126805683593512, "learning_rate": 6.300863461616657e-07, "logits/chosen": 0.029642198234796524, "logits/rejected": 0.10298113524913788, "logps/chosen": -4.103089809417725, "logps/rejected": -4.830977439880371, "loss": 0.616, "rewards/accuracies": 0.71875, "rewards/chosen": -4.103089809417725, "rewards/margins": 0.7278872132301331, "rewards/rejected": -4.830977439880371, "sft_loss": 4.179484844207764, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 15.129423535610094, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.0842280238866806, "logits/rejected": 0.032838691025972366, "logps/chosen": -4.095457553863525, "logps/rejected": -5.005012035369873, "loss": 0.5048, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.095457553863525, "rewards/margins": 0.9095550775527954, "rewards/rejected": -5.005012035369873, "sft_loss": 4.185179710388184, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 18.110220036830906, "learning_rate": 6.270763034485986e-07, "logits/chosen": 0.041431643068790436, "logits/rejected": 0.12850026786327362, "logps/chosen": -4.218494892120361, "logps/rejected": -5.118128776550293, "loss": 0.533, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.218494892120361, "rewards/margins": 0.8996332287788391, "rewards/rejected": -5.118128776550293, "sft_loss": 4.214810848236084, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 35.430231259525655, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.015107926912605762, "logits/rejected": 0.11170251667499542, "logps/chosen": -4.227860927581787, "logps/rejected": -5.089137554168701, "loss": 0.5851, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.227860927581787, "rewards/margins": 0.8612769246101379, "rewards/rejected": -5.089137554168701, "sft_loss": 4.147841453552246, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 19.98694884716753, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.07201234996318817, "logits/rejected": 0.07067441195249557, "logps/chosen": -4.084803581237793, "logps/rejected": -4.9614481925964355, "loss": 0.5871, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.084803581237793, "rewards/margins": 0.8766444325447083, "rewards/rejected": -4.9614481925964355, "sft_loss": 4.110752105712891, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 12.298144029591253, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.05858708545565605, "logits/rejected": 0.1349249631166458, "logps/chosen": -3.9142913818359375, "logps/rejected": -5.051329612731934, "loss": 0.4604, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.9142913818359375, "rewards/margins": 1.137038230895996, "rewards/rejected": -5.051329612731934, "sft_loss": 4.005929946899414, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 31.511770228889546, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.1398880034685135, "logits/rejected": 0.0756937712430954, "logps/chosen": -4.080639362335205, "logps/rejected": -5.032453536987305, "loss": 0.5697, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.080639362335205, "rewards/margins": 0.9518140554428101, "rewards/rejected": -5.032453536987305, "sft_loss": 4.059521198272705, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 19.41232862754573, "learning_rate": 6.195298770577415e-07, "logits/chosen": 0.059480488300323486, "logits/rejected": 0.07041338831186295, "logps/chosen": -4.093963146209717, "logps/rejected": -5.036952495574951, "loss": 0.5553, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.093963146209717, "rewards/margins": 0.9429893493652344, "rewards/rejected": -5.036952495574951, "sft_loss": 4.13309907913208, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 17.476186313500772, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.04353252798318863, "logits/rejected": 0.11172328144311905, "logps/chosen": -4.150980472564697, "logps/rejected": -5.078780174255371, "loss": 0.5423, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.150980472564697, "rewards/margins": 0.9278000593185425, "rewards/rejected": -5.078780174255371, "sft_loss": 4.235171318054199, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 15.38466246175128, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.0989823192358017, "logits/rejected": 0.08282925188541412, "logps/chosen": -4.078875541687012, "logps/rejected": -5.304009437561035, "loss": 0.4665, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.078875541687012, "rewards/margins": 1.2251341342926025, "rewards/rejected": -5.304009437561035, "sft_loss": 4.070713996887207, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 18.130730530155056, "learning_rate": 6.149879879003876e-07, "logits/chosen": 0.03766113892197609, "logits/rejected": 0.0644763633608818, "logps/chosen": -4.121838569641113, "logps/rejected": -5.137380123138428, "loss": 0.5099, "rewards/accuracies": 0.8125, "rewards/chosen": -4.121838569641113, "rewards/margins": 1.0155417919158936, "rewards/rejected": -5.137380123138428, "sft_loss": 4.17519998550415, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 15.753038422112043, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.1554158627986908, "logits/rejected": -0.05729420855641365, "logps/chosen": -4.0880632400512695, "logps/rejected": -5.01497745513916, "loss": 0.5179, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.0880632400512695, "rewards/margins": 0.9269140958786011, "rewards/rejected": -5.01497745513916, "sft_loss": 4.160233020782471, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 14.711083719896562, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.08170298486948013, "logits/rejected": 0.03274992108345032, "logps/chosen": -4.147464752197266, "logps/rejected": -4.928176403045654, "loss": 0.5773, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.147464752197266, "rewards/margins": 0.78071129322052, "rewards/rejected": -4.928176403045654, "sft_loss": 4.148330211639404, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 20.865595547653733, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.0227263942360878, "logits/rejected": 0.05186920240521431, "logps/chosen": -4.170312881469727, "logps/rejected": -5.016467094421387, "loss": 0.5475, "rewards/accuracies": 0.71875, "rewards/chosen": -4.170312881469727, "rewards/margins": 0.8461543321609497, "rewards/rejected": -5.016467094421387, "sft_loss": 4.219161033630371, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 17.644714739641714, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.05663248151540756, "logits/rejected": 0.08762215077877045, "logps/chosen": -4.0606608390808105, "logps/rejected": -5.053631782531738, "loss": 0.5428, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.0606608390808105, "rewards/margins": 0.9929712414741516, "rewards/rejected": -5.053631782531738, "sft_loss": 4.119710445404053, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 12.892354129619571, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.025140320882201195, "logits/rejected": 0.13027231395244598, "logps/chosen": -3.8592910766601562, "logps/rejected": -4.957719802856445, "loss": 0.4691, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.8592910766601562, "rewards/margins": 1.0984289646148682, "rewards/rejected": -4.957719802856445, "sft_loss": 3.962966203689575, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 14.579312955072842, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.08685266226530075, "logits/rejected": -0.0053533404134213924, "logps/chosen": -4.053783416748047, "logps/rejected": -5.094172477722168, "loss": 0.5043, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.053783416748047, "rewards/margins": 1.0403884649276733, "rewards/rejected": -5.094172477722168, "sft_loss": 4.1353960037231445, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 16.287696529068228, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.05991528183221817, "logits/rejected": 0.06272322684526443, "logps/chosen": -3.9453110694885254, "logps/rejected": -4.837578773498535, "loss": 0.5177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9453110694885254, "rewards/margins": 0.8922680020332336, "rewards/rejected": -4.837578773498535, "sft_loss": 3.940615177154541, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 20.80843033969584, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.06943833827972412, "logits/rejected": 0.028343399986624718, "logps/chosen": -4.107184410095215, "logps/rejected": -5.092995643615723, "loss": 0.5261, "rewards/accuracies": 0.78125, "rewards/chosen": -4.107184410095215, "rewards/margins": 0.9858118891716003, "rewards/rejected": -5.092995643615723, "sft_loss": 4.021084785461426, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 25.24141079264102, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.022642286494374275, "logits/rejected": 0.06688891351222992, "logps/chosen": -4.051756381988525, "logps/rejected": -5.051907062530518, "loss": 0.5089, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.051756381988525, "rewards/margins": 1.0001503229141235, "rewards/rejected": -5.051907062530518, "sft_loss": 4.085324764251709, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 18.25910620581888, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.11980477720499039, "logits/rejected": 0.024762999266386032, "logps/chosen": -4.187039375305176, "logps/rejected": -5.186661720275879, "loss": 0.5251, "rewards/accuracies": 0.75, "rewards/chosen": -4.187039375305176, "rewards/margins": 0.9996216893196106, "rewards/rejected": -5.186661720275879, "sft_loss": 4.254944801330566, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 23.297284899305616, "learning_rate": 5.982515179388486e-07, "logits/chosen": 0.014777600765228271, "logits/rejected": 0.12283160537481308, "logps/chosen": -4.188802242279053, "logps/rejected": -5.0527024269104, "loss": 0.5634, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.188802242279053, "rewards/margins": 0.8639005422592163, "rewards/rejected": -5.0527024269104, "sft_loss": 4.229619026184082, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 15.362375581310394, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.08131209015846252, "logits/rejected": -0.010910587385296822, "logps/chosen": -4.099891662597656, "logps/rejected": -5.010560035705566, "loss": 0.5453, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.099891662597656, "rewards/margins": 0.9106694459915161, "rewards/rejected": -5.010560035705566, "sft_loss": 4.198235511779785, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 18.285294232389813, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.021242473274469376, "logits/rejected": 0.047610148787498474, "logps/chosen": -4.324985504150391, "logps/rejected": -5.272888660430908, "loss": 0.5546, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.324985504150391, "rewards/margins": 0.9479031562805176, "rewards/rejected": -5.272888660430908, "sft_loss": 4.410521507263184, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 14.359212704553558, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.0435919351875782, "logits/rejected": 0.043762434273958206, "logps/chosen": -4.3246612548828125, "logps/rejected": -5.148694038391113, "loss": 0.5852, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.3246612548828125, "rewards/margins": 0.824032187461853, "rewards/rejected": -5.148694038391113, "sft_loss": 4.269598960876465, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 14.576563616406881, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.05614321678876877, "logits/rejected": 0.0751347541809082, "logps/chosen": -4.400944709777832, "logps/rejected": -5.452885627746582, "loss": 0.5033, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.400944709777832, "rewards/margins": 1.0519407987594604, "rewards/rejected": -5.452885627746582, "sft_loss": 4.369610786437988, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 20.812593709518424, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.05414440482854843, "logits/rejected": -0.05643627047538757, "logps/chosen": -4.343169689178467, "logps/rejected": -5.196185111999512, "loss": 0.5817, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.343169689178467, "rewards/margins": 0.8530157208442688, "rewards/rejected": -5.196185111999512, "sft_loss": 4.5007429122924805, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 15.582274519884054, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.011871114373207092, "logits/rejected": 0.017706722021102905, "logps/chosen": -4.198437690734863, "logps/rejected": -5.182080268859863, "loss": 0.5529, "rewards/accuracies": 0.71875, "rewards/chosen": -4.198437690734863, "rewards/margins": 0.9836423993110657, "rewards/rejected": -5.182080268859863, "sft_loss": 4.292941093444824, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 17.999513649508582, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.06686706840991974, "logits/rejected": 0.0006932914257049561, "logps/chosen": -4.182742595672607, "logps/rejected": -5.087394714355469, "loss": 0.5393, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.182742595672607, "rewards/margins": 0.9046524167060852, "rewards/rejected": -5.087394714355469, "sft_loss": 4.188513278961182, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.2569299042224884, "eval_logits/rejected": 0.3460070788860321, "eval_logps/chosen": -4.325809955596924, "eval_logps/rejected": -5.222593784332275, "eval_loss": 0.5736204981803894, "eval_rewards/accuracies": 0.7255192995071411, "eval_rewards/chosen": -4.325809955596924, "eval_rewards/margins": 0.8967837691307068, "eval_rewards/rejected": -5.222593784332275, "eval_runtime": 43.14, "eval_samples_per_second": 31.178, "eval_sft_loss": 4.350619316101074, "eval_steps_per_second": 7.812, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 14.121410420431141, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.13204196095466614, "logits/rejected": 0.014378545805811882, "logps/chosen": -4.165450096130371, "logps/rejected": -5.127333164215088, "loss": 0.5343, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.165450096130371, "rewards/margins": 0.9618828892707825, "rewards/rejected": -5.127333164215088, "sft_loss": 4.18618106842041, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 19.46324147029196, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.035107698291540146, "logits/rejected": 0.040776319801807404, "logps/chosen": -4.294785022735596, "logps/rejected": -5.255988121032715, "loss": 0.5356, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.294785022735596, "rewards/margins": 0.9612032771110535, "rewards/rejected": -5.255988121032715, "sft_loss": 4.381821632385254, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 14.028530499360638, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.010254351422190666, "logits/rejected": 0.11219009011983871, "logps/chosen": -4.247211456298828, "logps/rejected": -5.452017307281494, "loss": 0.4634, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.247211456298828, "rewards/margins": 1.2048051357269287, "rewards/rejected": -5.452017307281494, "sft_loss": 4.285406112670898, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 16.72202278142961, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.039016492664813995, "logits/rejected": 0.13155677914619446, "logps/chosen": -4.415073394775391, "logps/rejected": -5.258774757385254, "loss": 0.5975, "rewards/accuracies": 0.71875, "rewards/chosen": -4.415073394775391, "rewards/margins": 0.8437017202377319, "rewards/rejected": -5.258774757385254, "sft_loss": 4.524782657623291, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 16.689398458973308, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.10748390853404999, "logits/rejected": 0.059959232807159424, "logps/chosen": -4.438462734222412, "logps/rejected": -5.426039695739746, "loss": 0.5321, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.438462734222412, "rewards/margins": 0.9875761866569519, "rewards/rejected": -5.426039695739746, "sft_loss": 4.467665195465088, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 14.3991656602208, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.08068076521158218, "logits/rejected": 0.029455062001943588, "logps/chosen": -4.187173843383789, "logps/rejected": -5.219720363616943, "loss": 0.513, "rewards/accuracies": 0.78125, "rewards/chosen": -4.187173843383789, "rewards/margins": 1.032546043395996, "rewards/rejected": -5.219720363616943, "sft_loss": 4.1944260597229, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 21.016670098652725, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.06701847165822983, "logits/rejected": 0.07092493772506714, "logps/chosen": -4.410538196563721, "logps/rejected": -5.553727149963379, "loss": 0.4838, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.410538196563721, "rewards/margins": 1.1431890726089478, "rewards/rejected": -5.553727149963379, "sft_loss": 4.4488654136657715, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 19.997525903684426, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.07559563219547272, "logits/rejected": -0.018185529857873917, "logps/chosen": -4.372363090515137, "logps/rejected": -5.206852912902832, "loss": 0.6082, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.372363090515137, "rewards/margins": 0.8344897031784058, "rewards/rejected": -5.206852912902832, "sft_loss": 4.432497978210449, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 18.233783403550085, "learning_rate": 5.737070283532444e-07, "logits/chosen": 0.005519936792552471, "logits/rejected": 0.06497209519147873, "logps/chosen": -4.373464584350586, "logps/rejected": -5.253887176513672, "loss": 0.6108, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.373464584350586, "rewards/margins": 0.8804221153259277, "rewards/rejected": -5.253887176513672, "sft_loss": 4.2910027503967285, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 15.533337761409932, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.05558420345187187, "logits/rejected": 0.037349000573158264, "logps/chosen": -4.265897750854492, "logps/rejected": -5.312526226043701, "loss": 0.5117, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.265897750854492, "rewards/margins": 1.046628713607788, "rewards/rejected": -5.312526226043701, "sft_loss": 4.260438442230225, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 15.660868862531949, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.1164829283952713, "logits/rejected": 0.04899804666638374, "logps/chosen": -4.255067825317383, "logps/rejected": -5.125923156738281, "loss": 0.5641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.255067825317383, "rewards/margins": 0.8708555102348328, "rewards/rejected": -5.125923156738281, "sft_loss": 4.302035331726074, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 19.828532059494787, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.12655727565288544, "logits/rejected": 0.047389380633831024, "logps/chosen": -4.236365795135498, "logps/rejected": -5.012572288513184, "loss": 0.559, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.236365795135498, "rewards/margins": 0.776206910610199, "rewards/rejected": -5.012572288513184, "sft_loss": 4.229098320007324, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 21.900113408790784, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.11166088283061981, "logits/rejected": -0.0014119551051408052, "logps/chosen": -4.32798433303833, "logps/rejected": -5.046576499938965, "loss": 0.6016, "rewards/accuracies": 0.71875, "rewards/chosen": -4.32798433303833, "rewards/margins": 0.7185924649238586, "rewards/rejected": -5.046576499938965, "sft_loss": 4.378843784332275, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 17.076733572048365, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.06736056506633759, "logits/rejected": 0.03894350677728653, "logps/chosen": -4.179556846618652, "logps/rejected": -5.024938583374023, "loss": 0.5693, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.179556846618652, "rewards/margins": 0.8453825116157532, "rewards/rejected": -5.024938583374023, "sft_loss": 4.135404109954834, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 18.572646119062384, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.08097034692764282, "logits/rejected": 0.03127686679363251, "logps/chosen": -4.0773797035217285, "logps/rejected": -4.944389820098877, "loss": 0.5427, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.0773797035217285, "rewards/margins": 0.867010772228241, "rewards/rejected": -4.944389820098877, "sft_loss": 4.0974225997924805, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 15.435970154352313, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.0010766386985778809, "logits/rejected": 0.09980317950248718, "logps/chosen": -3.981600284576416, "logps/rejected": -4.928833961486816, "loss": 0.5176, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.981600284576416, "rewards/margins": 0.9472335577011108, "rewards/rejected": -4.928833961486816, "sft_loss": 4.042459011077881, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 16.972547805466146, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.06025050953030586, "logits/rejected": 0.05759192630648613, "logps/chosen": -4.010976314544678, "logps/rejected": -4.981903076171875, "loss": 0.5245, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.010976314544678, "rewards/margins": 0.9709264636039734, "rewards/rejected": -4.981903076171875, "sft_loss": 4.036219596862793, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 15.801610924913678, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.10592453181743622, "logits/rejected": 0.04858104884624481, "logps/chosen": -4.068356037139893, "logps/rejected": -5.140035152435303, "loss": 0.5061, "rewards/accuracies": 0.78125, "rewards/chosen": -4.068356037139893, "rewards/margins": 1.0716798305511475, "rewards/rejected": -5.140035152435303, "sft_loss": 4.047433376312256, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 17.01761497153551, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.060587383806705475, "logits/rejected": 0.06671730428934097, "logps/chosen": -4.050535678863525, "logps/rejected": -4.982683181762695, "loss": 0.5429, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.050535678863525, "rewards/margins": 0.9321478009223938, "rewards/rejected": -4.982683181762695, "sft_loss": 4.0685577392578125, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 15.744499216863726, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.02025572769343853, "logits/rejected": 0.0811544805765152, "logps/chosen": -4.085963726043701, "logps/rejected": -5.134238243103027, "loss": 0.4891, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.085963726043701, "rewards/margins": 1.0482741594314575, "rewards/rejected": -5.134238243103027, "sft_loss": 4.075760841369629, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 18.491593244869883, "learning_rate": 5.551751964760838e-07, "logits/chosen": 0.0378764346241951, "logits/rejected": 0.06506892293691635, "logps/chosen": -4.117175102233887, "logps/rejected": -5.12311315536499, "loss": 0.5095, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.117175102233887, "rewards/margins": 1.0059373378753662, "rewards/rejected": -5.12311315536499, "sft_loss": 4.185202598571777, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 23.070900818143077, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.073255255818367, "logits/rejected": 0.017674123868346214, "logps/chosen": -4.214170932769775, "logps/rejected": -5.127688884735107, "loss": 0.5452, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.214170932769775, "rewards/margins": 0.9135181307792664, "rewards/rejected": -5.127688884735107, "sft_loss": 4.309821605682373, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 23.941503621141138, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.02814924158155918, "logits/rejected": 0.14941860735416412, "logps/chosen": -4.352537155151367, "logps/rejected": -5.3630266189575195, "loss": 0.5536, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.352537155151367, "rewards/margins": 1.0104889869689941, "rewards/rejected": -5.3630266189575195, "sft_loss": 4.485054969787598, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 21.75450442026472, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.05518549680709839, "logits/rejected": 0.041710685938596725, "logps/chosen": -4.426823139190674, "logps/rejected": -5.402629375457764, "loss": 0.5684, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.426823139190674, "rewards/margins": 0.9758065938949585, "rewards/rejected": -5.402629375457764, "sft_loss": 4.535459041595459, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 19.47160041089576, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.007011513225734234, "logits/rejected": 0.0902138352394104, "logps/chosen": -4.3132548332214355, "logps/rejected": -5.257037162780762, "loss": 0.559, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.3132548332214355, "rewards/margins": 0.9437819719314575, "rewards/rejected": -5.257037162780762, "sft_loss": 4.44458532333374, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 16.69099412236686, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.1297324001789093, "logits/rejected": 0.018423620611429214, "logps/chosen": -4.200294494628906, "logps/rejected": -5.5349555015563965, "loss": 0.4807, "rewards/accuracies": 0.8125, "rewards/chosen": -4.200294494628906, "rewards/margins": 1.3346607685089111, "rewards/rejected": -5.5349555015563965, "sft_loss": 4.3675408363342285, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 17.542272466508237, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.07640831172466278, "logits/rejected": 0.025533486157655716, "logps/chosen": -4.482228755950928, "logps/rejected": -5.5046586990356445, "loss": 0.5226, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.482228755950928, "rewards/margins": 1.022430181503296, "rewards/rejected": -5.5046586990356445, "sft_loss": 4.561526298522949, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 18.032181115430728, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.10058959573507309, "logits/rejected": 0.06195586919784546, "logps/chosen": -4.4972920417785645, "logps/rejected": -5.676342964172363, "loss": 0.5305, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.4972920417785645, "rewards/margins": 1.1790510416030884, "rewards/rejected": -5.676342964172363, "sft_loss": 4.620232582092285, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 26.202395628359643, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.039661847054958344, "logits/rejected": 0.07111156731843948, "logps/chosen": -4.454710960388184, "logps/rejected": -5.439589023590088, "loss": 0.5548, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.454710960388184, "rewards/margins": 0.9848777055740356, "rewards/rejected": -5.439589023590088, "sft_loss": 4.483078956604004, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 19.59733889441095, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.05810501426458359, "logits/rejected": 0.1393880695104599, "logps/chosen": -4.310373306274414, "logps/rejected": -5.385622501373291, "loss": 0.4906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.310373306274414, "rewards/margins": 1.0752495527267456, "rewards/rejected": -5.385622501373291, "sft_loss": 4.3996477127075195, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 22.137994578470998, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.016352495178580284, "logits/rejected": 0.0893014445900917, "logps/chosen": -4.396595478057861, "logps/rejected": -5.309807777404785, "loss": 0.5674, "rewards/accuracies": 0.71875, "rewards/chosen": -4.396595478057861, "rewards/margins": 0.9132122993469238, "rewards/rejected": -5.309807777404785, "sft_loss": 4.540419101715088, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 14.24671095972288, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.11190303415060043, "logits/rejected": 0.024014584720134735, "logps/chosen": -4.204039096832275, "logps/rejected": -5.1999311447143555, "loss": 0.4986, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.204039096832275, "rewards/margins": 0.9958921670913696, "rewards/rejected": -5.1999311447143555, "sft_loss": 4.269665718078613, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 17.965126451922142, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.09304491430521011, "logits/rejected": 0.029250601306557655, "logps/chosen": -4.340491771697998, "logps/rejected": -5.416264057159424, "loss": 0.5196, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.340491771697998, "rewards/margins": 1.0757719278335571, "rewards/rejected": -5.416264057159424, "sft_loss": 4.462465763092041, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 18.49642126158975, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.11003684997558594, "logits/rejected": 0.03975386545062065, "logps/chosen": -4.333467483520508, "logps/rejected": -5.315537452697754, "loss": 0.5351, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.333467483520508, "rewards/margins": 0.9820694923400879, "rewards/rejected": -5.315537452697754, "sft_loss": 4.381396293640137, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 22.539386691478214, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.0853196457028389, "logits/rejected": 0.05843697860836983, "logps/chosen": -4.250127792358398, "logps/rejected": -5.1255669593811035, "loss": 0.5554, "rewards/accuracies": 0.71875, "rewards/chosen": -4.250127792358398, "rewards/margins": 0.8754390478134155, "rewards/rejected": -5.1255669593811035, "sft_loss": 4.269168853759766, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 14.453356313159919, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.113250732421875, "logits/rejected": 0.03299799561500549, "logps/chosen": -4.246047496795654, "logps/rejected": -5.113133430480957, "loss": 0.5179, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.246047496795654, "rewards/margins": 0.8670861124992371, "rewards/rejected": -5.113133430480957, "sft_loss": 4.2835211753845215, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 22.56856552556845, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.09829998016357422, "logits/rejected": 0.08882582932710648, "logps/chosen": -4.22990608215332, "logps/rejected": -5.405078887939453, "loss": 0.5154, "rewards/accuracies": 0.78125, "rewards/chosen": -4.22990608215332, "rewards/margins": 1.1751729249954224, "rewards/rejected": -5.405078887939453, "sft_loss": 4.287569046020508, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 15.64837650021201, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.11544670164585114, "logits/rejected": 0.04846612364053726, "logps/chosen": -4.176478385925293, "logps/rejected": -5.209988117218018, "loss": 0.498, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.176478385925293, "rewards/margins": 1.0335088968276978, "rewards/rejected": -5.209988117218018, "sft_loss": 4.232048034667969, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 20.047395770434285, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.0673823282122612, "logits/rejected": 0.05862687900662422, "logps/chosen": -4.264988899230957, "logps/rejected": -5.426611423492432, "loss": 0.5053, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.264988899230957, "rewards/margins": 1.1616226434707642, "rewards/rejected": -5.426611423492432, "sft_loss": 4.324214935302734, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 12.420600674643524, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.08667416870594025, "logits/rejected": 0.07295207679271698, "logps/chosen": -4.2573137283325195, "logps/rejected": -5.147179126739502, "loss": 0.5647, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.2573137283325195, "rewards/margins": 0.8898651003837585, "rewards/rejected": -5.147179126739502, "sft_loss": 4.249060153961182, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 34.817428653201006, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.17020973563194275, "logits/rejected": -0.02446267567574978, "logps/chosen": -4.2911906242370605, "logps/rejected": -5.05643892288208, "loss": 0.6009, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.2911906242370605, "rewards/margins": 0.7652486562728882, "rewards/rejected": -5.05643892288208, "sft_loss": 4.294539451599121, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 25.68838864785532, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.13014598190784454, "logits/rejected": -0.024837497621774673, "logps/chosen": -4.2041826248168945, "logps/rejected": -5.022336006164551, "loss": 0.6079, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.2041826248168945, "rewards/margins": 0.8181535005569458, "rewards/rejected": -5.022336006164551, "sft_loss": 4.219718933105469, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 21.011750396811554, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.0073079378344118595, "logits/rejected": 0.06562227755784988, "logps/chosen": -4.269503593444824, "logps/rejected": -5.181615352630615, "loss": 0.5409, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.269503593444824, "rewards/margins": 0.912111759185791, "rewards/rejected": -5.181615352630615, "sft_loss": 4.28148889541626, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 14.804538548601718, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.07849368453025818, "logits/rejected": 0.01196741871535778, "logps/chosen": -4.193101406097412, "logps/rejected": -4.992875099182129, "loss": 0.5668, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.193101406097412, "rewards/margins": 0.7997739315032959, "rewards/rejected": -4.992875099182129, "sft_loss": 4.28323221206665, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 15.709663615878485, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.11467244476079941, "logits/rejected": 0.026135023683309555, "logps/chosen": -4.037603855133057, "logps/rejected": -4.962246894836426, "loss": 0.5525, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.037603855133057, "rewards/margins": 0.9246425628662109, "rewards/rejected": -4.962246894836426, "sft_loss": 4.146195411682129, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 18.153743062770673, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.049486517906188965, "logits/rejected": 0.07292356342077255, "logps/chosen": -4.202414512634277, "logps/rejected": -5.069171905517578, "loss": 0.5718, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.202414512634277, "rewards/margins": 0.866757869720459, "rewards/rejected": -5.069171905517578, "sft_loss": 4.199142932891846, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 19.154269364694123, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.005182699766010046, "logits/rejected": 0.10528840124607086, "logps/chosen": -4.11621618270874, "logps/rejected": -4.913527488708496, "loss": 0.5481, "rewards/accuracies": 0.75, "rewards/chosen": -4.11621618270874, "rewards/margins": 0.7973116636276245, "rewards/rejected": -4.913527488708496, "sft_loss": 4.1073455810546875, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 20.68341814608387, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.10306306183338165, "logits/rejected": -0.03484489768743515, "logps/chosen": -4.04842472076416, "logps/rejected": -5.121028423309326, "loss": 0.5202, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.04842472076416, "rewards/margins": 1.0726039409637451, "rewards/rejected": -5.121028423309326, "sft_loss": 4.06124210357666, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 29.02963400980283, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.060677748173475266, "logits/rejected": 0.045880045741796494, "logps/chosen": -3.9956302642822266, "logps/rejected": -4.8577165603637695, "loss": 0.5514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.9956302642822266, "rewards/margins": 0.8620861768722534, "rewards/rejected": -4.8577165603637695, "sft_loss": 4.059117317199707, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 13.869042600567854, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.07129235565662384, "logits/rejected": 0.05507662147283554, "logps/chosen": -4.214221954345703, "logps/rejected": -5.314478397369385, "loss": 0.4822, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.214221954345703, "rewards/margins": 1.1002566814422607, "rewards/rejected": -5.314478397369385, "sft_loss": 4.279477596282959, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 16.093245713923835, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.11817373335361481, "logits/rejected": 0.01352071762084961, "logps/chosen": -4.390268325805664, "logps/rejected": -5.223195552825928, "loss": 0.5693, "rewards/accuracies": 0.71875, "rewards/chosen": -4.390268325805664, "rewards/margins": 0.8329275250434875, "rewards/rejected": -5.223195552825928, "sft_loss": 4.474215507507324, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 17.82188236509395, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.07951287925243378, "logits/rejected": 0.07106980681419373, "logps/chosen": -4.244257926940918, "logps/rejected": -5.166312217712402, "loss": 0.5795, "rewards/accuracies": 0.75, "rewards/chosen": -4.244257926940918, "rewards/margins": 0.9220544695854187, "rewards/rejected": -5.166312217712402, "sft_loss": 4.341263294219971, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 19.49501064735196, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.05764093995094299, "logits/rejected": 0.01515720784664154, "logps/chosen": -4.19266939163208, "logps/rejected": -5.072085380554199, "loss": 0.5839, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.19266939163208, "rewards/margins": 0.8794158101081848, "rewards/rejected": -5.072085380554199, "sft_loss": 4.263916015625, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 23.97253984730568, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.11488769948482513, "logits/rejected": -0.014919767156243324, "logps/chosen": -4.181607246398926, "logps/rejected": -5.144347190856934, "loss": 0.5614, "rewards/accuracies": 0.71875, "rewards/chosen": -4.181607246398926, "rewards/margins": 0.9627391695976257, "rewards/rejected": -5.144347190856934, "sft_loss": 4.271900653839111, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 19.499945514414925, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.11179818958044052, "logits/rejected": -0.02782334014773369, "logps/chosen": -4.056575298309326, "logps/rejected": -4.9648356437683105, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.056575298309326, "rewards/margins": 0.9082603454589844, "rewards/rejected": -4.9648356437683105, "sft_loss": 4.106502532958984, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 15.566938456996795, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.15935197472572327, "logits/rejected": -0.006320520304143429, "logps/chosen": -3.8887240886688232, "logps/rejected": -4.9238152503967285, "loss": 0.4759, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8887240886688232, "rewards/margins": 1.035091757774353, "rewards/rejected": -4.9238152503967285, "sft_loss": 3.980847120285034, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 18.645578002041546, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.04259735345840454, "logits/rejected": -0.012678694911301136, "logps/chosen": -4.119019985198975, "logps/rejected": -4.899888038635254, "loss": 0.5709, "rewards/accuracies": 0.75, "rewards/chosen": -4.119019985198975, "rewards/margins": 0.7808682322502136, "rewards/rejected": -4.899888038635254, "sft_loss": 4.167031288146973, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 21.47625387200884, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.05810439586639404, "logits/rejected": 0.004169926047325134, "logps/chosen": -4.052678108215332, "logps/rejected": -4.817889213562012, "loss": 0.6303, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.052678108215332, "rewards/margins": 0.7652114629745483, "rewards/rejected": -4.817889213562012, "sft_loss": 4.158685684204102, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 19.301067077983053, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.06106138974428177, "logits/rejected": 0.04724999517202377, "logps/chosen": -4.15666389465332, "logps/rejected": -4.9704060554504395, "loss": 0.5664, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.15666389465332, "rewards/margins": 0.8137421607971191, "rewards/rejected": -4.9704060554504395, "sft_loss": 4.189563751220703, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 18.182766248763908, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.16863279044628143, "logits/rejected": -0.05148882791399956, "logps/chosen": -4.012679576873779, "logps/rejected": -4.946879863739014, "loss": 0.5421, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.012679576873779, "rewards/margins": 0.9341999292373657, "rewards/rejected": -4.946879863739014, "sft_loss": 4.170138359069824, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 19.05104896048302, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.15180808305740356, "logits/rejected": -0.02108634077012539, "logps/chosen": -3.9979186058044434, "logps/rejected": -4.824479103088379, "loss": 0.5475, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.9979186058044434, "rewards/margins": 0.8265607953071594, "rewards/rejected": -4.824479103088379, "sft_loss": 4.137550354003906, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 15.934164660261775, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.08919097483158112, "logits/rejected": 0.017639994621276855, "logps/chosen": -4.064725875854492, "logps/rejected": -5.006443500518799, "loss": 0.516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.064725875854492, "rewards/margins": 0.9417168498039246, "rewards/rejected": -5.006443500518799, "sft_loss": 4.004981994628906, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 17.991808063740628, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.08161447197198868, "logits/rejected": 0.01697484403848648, "logps/chosen": -4.060294151306152, "logps/rejected": -5.105404376983643, "loss": 0.4824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.060294151306152, "rewards/margins": 1.0451104640960693, "rewards/rejected": -5.105404376983643, "sft_loss": 4.122883319854736, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 12.433180956157578, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.17363722622394562, "logits/rejected": -0.05253412574529648, "logps/chosen": -4.116326332092285, "logps/rejected": -5.152121067047119, "loss": 0.505, "rewards/accuracies": 0.78125, "rewards/chosen": -4.116326332092285, "rewards/margins": 1.0357939004898071, "rewards/rejected": -5.152121067047119, "sft_loss": 4.167025566101074, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 17.926724117264285, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.11365020275115967, "logits/rejected": -0.02593514882028103, "logps/chosen": -4.264039039611816, "logps/rejected": -5.186556816101074, "loss": 0.5399, "rewards/accuracies": 0.8125, "rewards/chosen": -4.264039039611816, "rewards/margins": 0.9225172996520996, "rewards/rejected": -5.186556816101074, "sft_loss": 4.256202697753906, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 21.535233328316657, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.0966586321592331, "logits/rejected": 0.0101277781650424, "logps/chosen": -4.302945137023926, "logps/rejected": -5.152838230133057, "loss": 0.5384, "rewards/accuracies": 0.75, "rewards/chosen": -4.302945137023926, "rewards/margins": 0.8498929738998413, "rewards/rejected": -5.152838230133057, "sft_loss": 4.336602210998535, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 19.348539479077594, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.04772127419710159, "logits/rejected": 0.019659971818327904, "logps/chosen": -4.417842864990234, "logps/rejected": -5.282958984375, "loss": 0.5443, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.417842864990234, "rewards/margins": 0.8651165962219238, "rewards/rejected": -5.282958984375, "sft_loss": 4.4471940994262695, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 23.45924489258539, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.22262386977672577, "logits/rejected": -0.12868838012218475, "logps/chosen": -4.260302543640137, "logps/rejected": -5.084171772003174, "loss": 0.549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.260302543640137, "rewards/margins": 0.8238682746887207, "rewards/rejected": -5.084171772003174, "sft_loss": 4.291085720062256, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 19.19538120962937, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.0920620784163475, "logits/rejected": 0.010093789547681808, "logps/chosen": -4.094933986663818, "logps/rejected": -5.171164512634277, "loss": 0.4992, "rewards/accuracies": 0.8125, "rewards/chosen": -4.094933986663818, "rewards/margins": 1.0762296915054321, "rewards/rejected": -5.171164512634277, "sft_loss": 4.0767130851745605, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 22.70065476135362, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.1846626251935959, "logits/rejected": -0.0954296737909317, "logps/chosen": -4.166723728179932, "logps/rejected": -5.224819183349609, "loss": 0.497, "rewards/accuracies": 0.8125, "rewards/chosen": -4.166723728179932, "rewards/margins": 1.0580958127975464, "rewards/rejected": -5.224819183349609, "sft_loss": 4.194915771484375, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 15.588264832781391, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.18257483839988708, "logits/rejected": -0.06619922071695328, "logps/chosen": -4.142666816711426, "logps/rejected": -5.177772045135498, "loss": 0.4982, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.142666816711426, "rewards/margins": 1.0351049900054932, "rewards/rejected": -5.177772045135498, "sft_loss": 4.206771373748779, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 21.630102221903424, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.10724954307079315, "logits/rejected": -0.12234711647033691, "logps/chosen": -4.20287561416626, "logps/rejected": -5.214568138122559, "loss": 0.5072, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.20287561416626, "rewards/margins": 1.0116924047470093, "rewards/rejected": -5.214568138122559, "sft_loss": 4.358765602111816, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 20.033921886742174, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.2289537489414215, "logits/rejected": -0.10383538901805878, "logps/chosen": -4.348308563232422, "logps/rejected": -5.09984016418457, "loss": 0.6191, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.348308563232422, "rewards/margins": 0.7515309453010559, "rewards/rejected": -5.09984016418457, "sft_loss": 4.463630199432373, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 17.134255878271137, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.11890985071659088, "logits/rejected": -0.046903759241104126, "logps/chosen": -4.308679103851318, "logps/rejected": -5.342099189758301, "loss": 0.5077, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.308679103851318, "rewards/margins": 1.0334198474884033, "rewards/rejected": -5.342099189758301, "sft_loss": 4.364779472351074, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 23.161647576779664, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.16019386053085327, "logits/rejected": -0.0389467254281044, "logps/chosen": -4.390309810638428, "logps/rejected": -5.0757060050964355, "loss": 0.6471, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.390309810638428, "rewards/margins": 0.685396134853363, "rewards/rejected": -5.0757060050964355, "sft_loss": 4.451899528503418, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 16.909814275556286, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.045469336211681366, "logits/rejected": 0.010172396898269653, "logps/chosen": -4.298305511474609, "logps/rejected": -5.328028678894043, "loss": 0.5117, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.298305511474609, "rewards/margins": 1.029723882675171, "rewards/rejected": -5.328028678894043, "sft_loss": 4.3115234375, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 20.0691172417075, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.11708267033100128, "logits/rejected": -0.07965598255395889, "logps/chosen": -4.334473133087158, "logps/rejected": -4.946959018707275, "loss": 0.6378, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.334473133087158, "rewards/margins": 0.6124860644340515, "rewards/rejected": -4.946959018707275, "sft_loss": 4.341203689575195, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 14.375996759345945, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.17368324100971222, "logits/rejected": -0.048559121787548065, "logps/chosen": -4.239495277404785, "logps/rejected": -5.280592441558838, "loss": 0.5036, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.239495277404785, "rewards/margins": 1.0410963296890259, "rewards/rejected": -5.280592441558838, "sft_loss": 4.296621799468994, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 15.178072771537483, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.17286744713783264, "logits/rejected": -0.07282562553882599, "logps/chosen": -4.2494306564331055, "logps/rejected": -5.157985687255859, "loss": 0.5495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.2494306564331055, "rewards/margins": 0.9085548520088196, "rewards/rejected": -5.157985687255859, "sft_loss": 4.245641231536865, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 17.146266923159065, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.07511468231678009, "logits/rejected": 0.02954399213194847, "logps/chosen": -4.374552249908447, "logps/rejected": -5.253323078155518, "loss": 0.5981, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.374552249908447, "rewards/margins": 0.8787716031074524, "rewards/rejected": -5.253323078155518, "sft_loss": 4.383390426635742, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.1183885782957077, "eval_logits/rejected": 0.19278573989868164, "eval_logps/chosen": -4.256961822509766, "eval_logps/rejected": -5.173403739929199, "eval_loss": 0.5694618225097656, "eval_rewards/accuracies": 0.7270029783248901, "eval_rewards/chosen": -4.256961822509766, "eval_rewards/margins": 0.9164420366287231, "eval_rewards/rejected": -5.173403739929199, "eval_runtime": 43.1761, "eval_samples_per_second": 31.152, "eval_sft_loss": 4.277877330780029, "eval_steps_per_second": 7.805, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 13.739952321881674, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.18275292217731476, "logits/rejected": -0.08163726329803467, "logps/chosen": -4.217829704284668, "logps/rejected": -5.261086940765381, "loss": 0.5194, "rewards/accuracies": 0.75, "rewards/chosen": -4.217829704284668, "rewards/margins": 1.0432568788528442, "rewards/rejected": -5.261086940765381, "sft_loss": 4.270330429077148, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 21.115462789349642, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.1848531812429428, "logits/rejected": -0.0920737236738205, "logps/chosen": -4.141186237335205, "logps/rejected": -4.958422660827637, "loss": 0.6006, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.141186237335205, "rewards/margins": 0.8172367215156555, "rewards/rejected": -4.958422660827637, "sft_loss": 4.176085472106934, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 17.527792344129246, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.08552250266075134, "logits/rejected": -0.0013067282270640135, "logps/chosen": -4.216039657592773, "logps/rejected": -5.193731307983398, "loss": 0.5323, "rewards/accuracies": 0.78125, "rewards/chosen": -4.216039657592773, "rewards/margins": 0.9776918292045593, "rewards/rejected": -5.193731307983398, "sft_loss": 4.2045440673828125, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 16.395034924437546, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.13056273758411407, "logits/rejected": 0.05236934870481491, "logps/chosen": -4.405067443847656, "logps/rejected": -5.288483619689941, "loss": 0.6144, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.405067443847656, "rewards/margins": 0.8834161758422852, "rewards/rejected": -5.288483619689941, "sft_loss": 4.329681396484375, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 17.388519714863275, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.09959354251623154, "logits/rejected": 0.0017538412939757109, "logps/chosen": -4.105210304260254, "logps/rejected": -4.967142105102539, "loss": 0.5609, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.105210304260254, "rewards/margins": 0.8619323968887329, "rewards/rejected": -4.967142105102539, "sft_loss": 4.156649589538574, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 24.155114988120655, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.13659390807151794, "logits/rejected": -0.026909206062555313, "logps/chosen": -4.200619220733643, "logps/rejected": -5.389188289642334, "loss": 0.5156, "rewards/accuracies": 0.71875, "rewards/chosen": -4.200619220733643, "rewards/margins": 1.1885693073272705, "rewards/rejected": -5.389188289642334, "sft_loss": 4.245565891265869, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 19.98274438302079, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.10791487991809845, "logits/rejected": -0.030868541449308395, "logps/chosen": -4.1000776290893555, "logps/rejected": -5.038941383361816, "loss": 0.5404, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.1000776290893555, "rewards/margins": 0.9388639330863953, "rewards/rejected": -5.038941383361816, "sft_loss": 4.160983562469482, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 16.181202230471833, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.09710139781236649, "logits/rejected": -0.011764958500862122, "logps/chosen": -4.07378625869751, "logps/rejected": -5.1709818840026855, "loss": 0.4991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.07378625869751, "rewards/margins": 1.0971953868865967, "rewards/rejected": -5.1709818840026855, "sft_loss": 4.0234856605529785, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 19.089885220258388, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.2048162966966629, "logits/rejected": -0.12096717208623886, "logps/chosen": -4.21279239654541, "logps/rejected": -5.121549606323242, "loss": 0.5607, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.21279239654541, "rewards/margins": 0.9087567329406738, "rewards/rejected": -5.121549606323242, "sft_loss": 4.211321830749512, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 16.273027199291345, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.08466310799121857, "logits/rejected": 0.045869845896959305, "logps/chosen": -4.143354892730713, "logps/rejected": -5.182480812072754, "loss": 0.5185, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.143354892730713, "rewards/margins": 1.0391263961791992, "rewards/rejected": -5.182480812072754, "sft_loss": 4.090240001678467, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 13.724951195026941, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.09971622377634048, "logits/rejected": -0.024418365210294724, "logps/chosen": -4.057034492492676, "logps/rejected": -5.0171966552734375, "loss": 0.5262, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.057034492492676, "rewards/margins": 0.9601619839668274, "rewards/rejected": -5.0171966552734375, "sft_loss": 4.123260974884033, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 20.907854592456673, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.2061711847782135, "logits/rejected": -0.08672699332237244, "logps/chosen": -4.1651105880737305, "logps/rejected": -5.111360549926758, "loss": 0.5392, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.1651105880737305, "rewards/margins": 0.9462505578994751, "rewards/rejected": -5.111360549926758, "sft_loss": 4.181136608123779, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 23.650308827928992, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.11200736463069916, "logits/rejected": -0.010071463882923126, "logps/chosen": -4.173468589782715, "logps/rejected": -5.128382682800293, "loss": 0.5304, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.173468589782715, "rewards/margins": 0.9549140930175781, "rewards/rejected": -5.128382682800293, "sft_loss": 4.241128444671631, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 22.109028806365686, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.09953314810991287, "logits/rejected": -0.010271935723721981, "logps/chosen": -4.216373920440674, "logps/rejected": -5.176015853881836, "loss": 0.5498, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.216373920440674, "rewards/margins": 0.9596416354179382, "rewards/rejected": -5.176015853881836, "sft_loss": 4.237557411193848, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 19.706781989174747, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.1606660634279251, "logits/rejected": 0.051355648785829544, "logps/chosen": -4.122914791107178, "logps/rejected": -5.148131370544434, "loss": 0.5219, "rewards/accuracies": 0.71875, "rewards/chosen": -4.122914791107178, "rewards/margins": 1.0252161026000977, "rewards/rejected": -5.148131370544434, "sft_loss": 4.13117790222168, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 21.56296392336687, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.1537935584783554, "logits/rejected": -0.09401834011077881, "logps/chosen": -4.051243782043457, "logps/rejected": -5.064203262329102, "loss": 0.5357, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.051243782043457, "rewards/margins": 1.0129592418670654, "rewards/rejected": -5.064203262329102, "sft_loss": 4.121866226196289, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 17.711612391458424, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.16775628924369812, "logits/rejected": -0.014869133941829205, "logps/chosen": -4.1766228675842285, "logps/rejected": -5.212665557861328, "loss": 0.5093, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.1766228675842285, "rewards/margins": 1.0360426902770996, "rewards/rejected": -5.212665557861328, "sft_loss": 4.168249130249023, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 13.87655714330121, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.12899205088615417, "logits/rejected": -0.0327068492770195, "logps/chosen": -4.141658782958984, "logps/rejected": -5.138970375061035, "loss": 0.4945, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.141658782958984, "rewards/margins": 0.9973119497299194, "rewards/rejected": -5.138970375061035, "sft_loss": 4.159889221191406, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 19.220576359739102, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.15308910608291626, "logits/rejected": -0.07327961176633835, "logps/chosen": -4.265786647796631, "logps/rejected": -5.289236068725586, "loss": 0.5383, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.265786647796631, "rewards/margins": 1.023449182510376, "rewards/rejected": -5.289236068725586, "sft_loss": 4.193299770355225, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 20.46061855557512, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.20100100338459015, "logits/rejected": -0.08928118646144867, "logps/chosen": -4.368012428283691, "logps/rejected": -5.361285209655762, "loss": 0.5534, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.368012428283691, "rewards/margins": 0.9932721853256226, "rewards/rejected": -5.361285209655762, "sft_loss": 4.412505149841309, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 21.193931033259123, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.2275840938091278, "logits/rejected": -0.08153820037841797, "logps/chosen": -4.363230228424072, "logps/rejected": -5.38161563873291, "loss": 0.5327, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.363230228424072, "rewards/margins": 1.0183861255645752, "rewards/rejected": -5.38161563873291, "sft_loss": 4.3429274559021, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 22.862480463300454, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.08414838463068008, "logits/rejected": -0.03929399698972702, "logps/chosen": -4.420595645904541, "logps/rejected": -5.371197700500488, "loss": 0.5192, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.420595645904541, "rewards/margins": 0.9506022334098816, "rewards/rejected": -5.371197700500488, "sft_loss": 4.414549827575684, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 29.09289354380726, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.16041435301303864, "logits/rejected": -0.0331757515668869, "logps/chosen": -4.369056701660156, "logps/rejected": -5.273859977722168, "loss": 0.5428, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.369056701660156, "rewards/margins": 0.9048035740852356, "rewards/rejected": -5.273859977722168, "sft_loss": 4.352242946624756, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 14.15545369611903, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.13385829329490662, "logits/rejected": 0.025810521095991135, "logps/chosen": -4.274466514587402, "logps/rejected": -5.444076061248779, "loss": 0.5061, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.274466514587402, "rewards/margins": 1.1696093082427979, "rewards/rejected": -5.444076061248779, "sft_loss": 4.29061222076416, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 17.259932516111828, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.14594252407550812, "logits/rejected": -0.017102601006627083, "logps/chosen": -4.262748718261719, "logps/rejected": -5.186878204345703, "loss": 0.559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.262748718261719, "rewards/margins": 0.9241297841072083, "rewards/rejected": -5.186878204345703, "sft_loss": 4.339690685272217, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 19.17161374595759, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.12570686638355255, "logits/rejected": -0.044585347175598145, "logps/chosen": -4.1848015785217285, "logps/rejected": -5.268584251403809, "loss": 0.5124, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.1848015785217285, "rewards/margins": 1.0837829113006592, "rewards/rejected": -5.268584251403809, "sft_loss": 4.249958515167236, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 23.22537086801625, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.155071422457695, "logits/rejected": -0.020992886275053024, "logps/chosen": -4.429194450378418, "logps/rejected": -5.286048889160156, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -4.429194450378418, "rewards/margins": 0.8568543195724487, "rewards/rejected": -5.286048889160156, "sft_loss": 4.3752641677856445, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 39.54595794572819, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.14056840538978577, "logits/rejected": -0.041243575513362885, "logps/chosen": -4.156649112701416, "logps/rejected": -5.233036994934082, "loss": 0.5217, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.156649112701416, "rewards/margins": 1.0763883590698242, "rewards/rejected": -5.233036994934082, "sft_loss": 4.208165645599365, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 15.607691512733764, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.1249227300286293, "logits/rejected": 0.013692038133740425, "logps/chosen": -4.091392993927002, "logps/rejected": -5.043036460876465, "loss": 0.5274, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.091392993927002, "rewards/margins": 0.9516437649726868, "rewards/rejected": -5.043036460876465, "sft_loss": 4.152836799621582, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 17.824995315805673, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.2082880288362503, "logits/rejected": -0.11089960485696793, "logps/chosen": -4.208667755126953, "logps/rejected": -5.084951877593994, "loss": 0.5505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.208667755126953, "rewards/margins": 0.8762838244438171, "rewards/rejected": -5.084951877593994, "sft_loss": 4.241647243499756, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 22.978605746214726, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.06792887300252914, "logits/rejected": -0.021539511159062386, "logps/chosen": -4.2966156005859375, "logps/rejected": -5.186400413513184, "loss": 0.5906, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.2966156005859375, "rewards/margins": 0.8897849917411804, "rewards/rejected": -5.186400413513184, "sft_loss": 4.250180244445801, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 17.4446702090662, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.17651358246803284, "logits/rejected": -0.05707705765962601, "logps/chosen": -3.9511427879333496, "logps/rejected": -5.0004706382751465, "loss": 0.5096, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.9511427879333496, "rewards/margins": 1.0493286848068237, "rewards/rejected": -5.0004706382751465, "sft_loss": 3.935319185256958, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 15.88468673012, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.17244072258472443, "logits/rejected": -0.03228010609745979, "logps/chosen": -4.157092094421387, "logps/rejected": -4.983794212341309, "loss": 0.5546, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.157092094421387, "rewards/margins": 0.8267024159431458, "rewards/rejected": -4.983794212341309, "sft_loss": 4.224639415740967, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 19.192023331560634, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.16004905104637146, "logits/rejected": -0.058913685381412506, "logps/chosen": -4.1700592041015625, "logps/rejected": -4.868165493011475, "loss": 0.6268, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.1700592041015625, "rewards/margins": 0.6981061697006226, "rewards/rejected": -4.868165493011475, "sft_loss": 4.16310977935791, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 20.894293928782755, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.06739739328622818, "logits/rejected": -0.001216635457240045, "logps/chosen": -4.290366172790527, "logps/rejected": -5.070502281188965, "loss": 0.6411, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.290366172790527, "rewards/margins": 0.7801357507705688, "rewards/rejected": -5.070502281188965, "sft_loss": 4.278425693511963, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 15.495752358141823, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.200698584318161, "logits/rejected": -0.05970926955342293, "logps/chosen": -4.080262660980225, "logps/rejected": -4.968288421630859, "loss": 0.555, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.080262660980225, "rewards/margins": 0.888026237487793, "rewards/rejected": -4.968288421630859, "sft_loss": 3.9992785453796387, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 14.9598019996281, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.16302593052387238, "logits/rejected": -0.14525838196277618, "logps/chosen": -4.063436985015869, "logps/rejected": -4.868227005004883, "loss": 0.5737, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.063436985015869, "rewards/margins": 0.8047906160354614, "rewards/rejected": -4.868227005004883, "sft_loss": 4.049282550811768, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 24.824050343951967, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.1567583680152893, "logits/rejected": -0.05262039229273796, "logps/chosen": -4.254889488220215, "logps/rejected": -5.024016380310059, "loss": 0.6236, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.254889488220215, "rewards/margins": 0.7691268920898438, "rewards/rejected": -5.024016380310059, "sft_loss": 4.200462818145752, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 14.986770011000203, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.20013757050037384, "logits/rejected": -0.08344296365976334, "logps/chosen": -4.004967212677002, "logps/rejected": -5.059557914733887, "loss": 0.49, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.004967212677002, "rewards/margins": 1.0545909404754639, "rewards/rejected": -5.059557914733887, "sft_loss": 4.046624183654785, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 17.686613090286425, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.15689019858837128, "logits/rejected": -0.055449776351451874, "logps/chosen": -4.107612133026123, "logps/rejected": -5.058051109313965, "loss": 0.5275, "rewards/accuracies": 0.78125, "rewards/chosen": -4.107612133026123, "rewards/margins": 0.9504392743110657, "rewards/rejected": -5.058051109313965, "sft_loss": 4.0827226638793945, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 20.619625811533965, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.16872528195381165, "logits/rejected": -0.0544901080429554, "logps/chosen": -4.091769695281982, "logps/rejected": -4.962288856506348, "loss": 0.5235, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.091769695281982, "rewards/margins": 0.8705183863639832, "rewards/rejected": -4.962288856506348, "sft_loss": 4.082983493804932, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 19.70934234192068, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.13666436076164246, "logits/rejected": 0.00520123029127717, "logps/chosen": -4.1989336013793945, "logps/rejected": -5.079361438751221, "loss": 0.5343, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.1989336013793945, "rewards/margins": 0.880427360534668, "rewards/rejected": -5.079361438751221, "sft_loss": 4.141883850097656, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 24.19864006279397, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.1475699245929718, "logits/rejected": -0.05400124937295914, "logps/chosen": -4.168356895446777, "logps/rejected": -5.142523765563965, "loss": 0.553, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.168356895446777, "rewards/margins": 0.9741671681404114, "rewards/rejected": -5.142523765563965, "sft_loss": 4.242750644683838, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 17.59114134719486, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.05430952459573746, "logits/rejected": 0.03733807057142258, "logps/chosen": -4.2969841957092285, "logps/rejected": -5.275949478149414, "loss": 0.5866, "rewards/accuracies": 0.6875, "rewards/chosen": -4.2969841957092285, "rewards/margins": 0.9789649844169617, "rewards/rejected": -5.275949478149414, "sft_loss": 4.335600852966309, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 17.66037474695137, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.15076126158237457, "logits/rejected": -0.10337958484888077, "logps/chosen": -4.24198055267334, "logps/rejected": -5.192962169647217, "loss": 0.5445, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.24198055267334, "rewards/margins": 0.9509812593460083, "rewards/rejected": -5.192962169647217, "sft_loss": 4.194026470184326, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 17.990633898393206, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.12183426320552826, "logits/rejected": -0.02019437588751316, "logps/chosen": -4.105195045471191, "logps/rejected": -5.314507961273193, "loss": 0.4936, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.105195045471191, "rewards/margins": 1.209313154220581, "rewards/rejected": -5.314507961273193, "sft_loss": 4.080663681030273, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 20.832243914758877, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.12777313590049744, "logits/rejected": -0.029532218351960182, "logps/chosen": -4.139020919799805, "logps/rejected": -5.124658107757568, "loss": 0.4992, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.139020919799805, "rewards/margins": 0.9856374859809875, "rewards/rejected": -5.124658107757568, "sft_loss": 4.1206889152526855, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 19.749623232301467, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.15492835640907288, "logits/rejected": -0.07339377701282501, "logps/chosen": -4.259757041931152, "logps/rejected": -5.20042085647583, "loss": 0.5948, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.259757041931152, "rewards/margins": 0.9406633377075195, "rewards/rejected": -5.20042085647583, "sft_loss": 4.314055442810059, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 21.03359954772715, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.1447829157114029, "logits/rejected": 0.0038142502307891846, "logps/chosen": -4.160075664520264, "logps/rejected": -5.173608779907227, "loss": 0.533, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.160075664520264, "rewards/margins": 1.013533353805542, "rewards/rejected": -5.173608779907227, "sft_loss": 4.226978302001953, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 16.25438738433027, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.20611099898815155, "logits/rejected": -0.06895715743303299, "logps/chosen": -4.167816162109375, "logps/rejected": -5.257494926452637, "loss": 0.5047, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.167816162109375, "rewards/margins": 1.089678406715393, "rewards/rejected": -5.257494926452637, "sft_loss": 4.187711715698242, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 21.90203147752327, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.12661200761795044, "logits/rejected": 0.0346415713429451, "logps/chosen": -4.3217668533325195, "logps/rejected": -5.258645057678223, "loss": 0.5709, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.3217668533325195, "rewards/margins": 0.9368780255317688, "rewards/rejected": -5.258645057678223, "sft_loss": 4.307793140411377, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 15.496954998388917, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.04699116200208664, "logits/rejected": 0.04914768785238266, "logps/chosen": -4.097477912902832, "logps/rejected": -5.124687194824219, "loss": 0.526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.097477912902832, "rewards/margins": 1.0272096395492554, "rewards/rejected": -5.124687194824219, "sft_loss": 4.059412956237793, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 17.05974309769058, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.11163085699081421, "logits/rejected": -0.026383381336927414, "logps/chosen": -4.076812744140625, "logps/rejected": -5.011106967926025, "loss": 0.5498, "rewards/accuracies": 0.78125, "rewards/chosen": -4.076812744140625, "rewards/margins": 0.9342945218086243, "rewards/rejected": -5.011106967926025, "sft_loss": 4.031030178070068, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 17.020735837715733, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.16300682723522186, "logits/rejected": -0.07162132114171982, "logps/chosen": -4.137753009796143, "logps/rejected": -5.187546730041504, "loss": 0.5061, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.137753009796143, "rewards/margins": 1.0497941970825195, "rewards/rejected": -5.187546730041504, "sft_loss": 4.171750545501709, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 20.478259568185432, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.12015841901302338, "logits/rejected": -0.07270057499408722, "logps/chosen": -4.1296234130859375, "logps/rejected": -5.0713019371032715, "loss": 0.5379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.1296234130859375, "rewards/margins": 0.9416790008544922, "rewards/rejected": -5.0713019371032715, "sft_loss": 4.139330863952637, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 19.231642583061976, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.0636029839515686, "logits/rejected": -0.03613414242863655, "logps/chosen": -4.1893086433410645, "logps/rejected": -5.160582065582275, "loss": 0.5445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.1893086433410645, "rewards/margins": 0.9712736010551453, "rewards/rejected": -5.160582065582275, "sft_loss": 4.223686695098877, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 20.455838066662558, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.1425260305404663, "logits/rejected": -0.016420168802142143, "logps/chosen": -4.095873832702637, "logps/rejected": -5.037788391113281, "loss": 0.5451, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.095873832702637, "rewards/margins": 0.9419152140617371, "rewards/rejected": -5.037788391113281, "sft_loss": 4.152099609375, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 20.59747876700388, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.08216296136379242, "logits/rejected": -0.01140972413122654, "logps/chosen": -3.9835262298583984, "logps/rejected": -5.026278018951416, "loss": 0.4817, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9835262298583984, "rewards/margins": 1.0427509546279907, "rewards/rejected": -5.026278018951416, "sft_loss": 4.098433017730713, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 15.707515872721908, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.15804629027843475, "logits/rejected": -0.008402202278375626, "logps/chosen": -4.166210651397705, "logps/rejected": -5.008130073547363, "loss": 0.5235, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.166210651397705, "rewards/margins": 0.8419189453125, "rewards/rejected": -5.008130073547363, "sft_loss": 4.215886116027832, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 17.958433949746656, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.08625032007694244, "logits/rejected": 0.042555466294288635, "logps/chosen": -4.064800262451172, "logps/rejected": -5.0926899909973145, "loss": 0.53, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.064800262451172, "rewards/margins": 1.0278890132904053, "rewards/rejected": -5.0926899909973145, "sft_loss": 4.08921480178833, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 23.990799056307704, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.13713189959526062, "logits/rejected": -0.024845337495207787, "logps/chosen": -4.0670576095581055, "logps/rejected": -5.089943885803223, "loss": 0.5002, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.0670576095581055, "rewards/margins": 1.0228854417800903, "rewards/rejected": -5.089943885803223, "sft_loss": 4.173640727996826, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 24.049200595342356, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.13475707173347473, "logits/rejected": -0.054293811321258545, "logps/chosen": -3.9485535621643066, "logps/rejected": -4.891864776611328, "loss": 0.5098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9485535621643066, "rewards/margins": 0.9433116912841797, "rewards/rejected": -4.891864776611328, "sft_loss": 4.00014591217041, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 27.11973407946868, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.13104240596294403, "logits/rejected": 0.018518714234232903, "logps/chosen": -4.24746561050415, "logps/rejected": -5.167351245880127, "loss": 0.5381, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.24746561050415, "rewards/margins": 0.9198853373527527, "rewards/rejected": -5.167351245880127, "sft_loss": 4.26193380355835, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 19.658703047840522, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.15275278687477112, "logits/rejected": -0.042724233120679855, "logps/chosen": -4.083222389221191, "logps/rejected": -5.0945048332214355, "loss": 0.5277, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.083222389221191, "rewards/margins": 1.01128351688385, "rewards/rejected": -5.0945048332214355, "sft_loss": 4.171942234039307, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 19.505670229478117, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.11327368021011353, "logits/rejected": 0.04385864734649658, "logps/chosen": -4.1887898445129395, "logps/rejected": -5.175089359283447, "loss": 0.538, "rewards/accuracies": 0.75, "rewards/chosen": -4.1887898445129395, "rewards/margins": 0.9862992167472839, "rewards/rejected": -5.175089359283447, "sft_loss": 4.245810031890869, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 21.70040417144261, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.09647150337696075, "logits/rejected": -0.030451273545622826, "logps/chosen": -4.218400955200195, "logps/rejected": -5.127978324890137, "loss": 0.5215, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.218400955200195, "rewards/margins": 0.9095777273178101, "rewards/rejected": -5.127978324890137, "sft_loss": 4.29921817779541, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 14.186563976709435, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.15827597677707672, "logits/rejected": -0.047300536185503006, "logps/chosen": -4.203457355499268, "logps/rejected": -5.193899631500244, "loss": 0.5356, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.203457355499268, "rewards/margins": 0.9904430508613586, "rewards/rejected": -5.193899631500244, "sft_loss": 4.272356986999512, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 23.872995455089058, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.17173142731189728, "logits/rejected": -0.015093426220119, "logps/chosen": -4.212442874908447, "logps/rejected": -5.019708633422852, "loss": 0.5935, "rewards/accuracies": 0.75, "rewards/chosen": -4.212442874908447, "rewards/margins": 0.8072662353515625, "rewards/rejected": -5.019708633422852, "sft_loss": 4.270554065704346, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 21.46918673011061, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.25270357728004456, "logits/rejected": -0.05684171989560127, "logps/chosen": -4.086533546447754, "logps/rejected": -5.053985595703125, "loss": 0.493, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.086533546447754, "rewards/margins": 0.9674515724182129, "rewards/rejected": -5.053985595703125, "sft_loss": 4.029145240783691, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 22.333149628635958, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.10256993770599365, "logits/rejected": -0.02860853634774685, "logps/chosen": -4.1633830070495605, "logps/rejected": -4.950932502746582, "loss": 0.598, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.1633830070495605, "rewards/margins": 0.7875494360923767, "rewards/rejected": -4.950932502746582, "sft_loss": 4.042609691619873, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 15.37692037295358, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.185903400182724, "logits/rejected": -0.11153104156255722, "logps/chosen": -4.057262420654297, "logps/rejected": -5.0266618728637695, "loss": 0.5535, "rewards/accuracies": 0.71875, "rewards/chosen": -4.057262420654297, "rewards/margins": 0.9693989753723145, "rewards/rejected": -5.0266618728637695, "sft_loss": 4.037737846374512, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 20.81202001686929, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.16729159653186798, "logits/rejected": -0.08528953790664673, "logps/chosen": -4.080422878265381, "logps/rejected": -4.926032066345215, "loss": 0.5463, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.080422878265381, "rewards/margins": 0.8456095457077026, "rewards/rejected": -4.926032066345215, "sft_loss": 4.083813667297363, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 15.231184553571023, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.1267562061548233, "logits/rejected": -0.03941858932375908, "logps/chosen": -4.1407952308654785, "logps/rejected": -5.257883548736572, "loss": 0.4893, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.1407952308654785, "rewards/margins": 1.1170886754989624, "rewards/rejected": -5.257883548736572, "sft_loss": 4.170060634613037, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 16.097207489875522, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.1753147691488266, "logits/rejected": -0.03046388551592827, "logps/chosen": -3.917523145675659, "logps/rejected": -4.898660182952881, "loss": 0.501, "rewards/accuracies": 0.78125, "rewards/chosen": -3.917523145675659, "rewards/margins": 0.9811370968818665, "rewards/rejected": -4.898660182952881, "sft_loss": 3.8685176372528076, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 15.107710878871499, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.09951204061508179, "logits/rejected": -0.008878534659743309, "logps/chosen": -3.9246585369110107, "logps/rejected": -4.985711097717285, "loss": 0.5155, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.9246585369110107, "rewards/margins": 1.0610524415969849, "rewards/rejected": -4.985711097717285, "sft_loss": 3.904801845550537, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 15.097741117008953, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.1479146033525467, "logits/rejected": -0.09697943180799484, "logps/chosen": -3.9680991172790527, "logps/rejected": -4.9519877433776855, "loss": 0.5071, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9680991172790527, "rewards/margins": 0.983887791633606, "rewards/rejected": -4.9519877433776855, "sft_loss": 4.009753704071045, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 16.027082366258657, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.09173516184091568, "logits/rejected": -0.032550834119319916, "logps/chosen": -4.174387454986572, "logps/rejected": -5.171200752258301, "loss": 0.5149, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.174387454986572, "rewards/margins": 0.9968129992485046, "rewards/rejected": -5.171200752258301, "sft_loss": 4.2629289627075195, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 19.16035789184787, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.09119559824466705, "logits/rejected": -0.04381603002548218, "logps/chosen": -4.111305236816406, "logps/rejected": -4.907462120056152, "loss": 0.5905, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.111305236816406, "rewards/margins": 0.79615718126297, "rewards/rejected": -4.907462120056152, "sft_loss": 4.138059139251709, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 13.659664145902656, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.1878870278596878, "logits/rejected": -0.03675522282719612, "logps/chosen": -4.168497562408447, "logps/rejected": -5.069736480712891, "loss": 0.5518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.168497562408447, "rewards/margins": 0.901239275932312, "rewards/rejected": -5.069736480712891, "sft_loss": 4.146317958831787, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 19.47720756254695, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.11716008186340332, "logits/rejected": 0.011091604828834534, "logps/chosen": -4.178882122039795, "logps/rejected": -5.08506965637207, "loss": 0.5856, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.178882122039795, "rewards/margins": 0.9061868786811829, "rewards/rejected": -5.08506965637207, "sft_loss": 4.272181510925293, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.08892910927534103, "eval_logits/rejected": 0.16326628625392914, "eval_logps/chosen": -4.089351177215576, "eval_logps/rejected": -4.974943161010742, "eval_loss": 0.5677979588508606, "eval_rewards/accuracies": 0.7336795330047607, "eval_rewards/chosen": -4.089351177215576, "eval_rewards/margins": 0.885591983795166, "eval_rewards/rejected": -4.974943161010742, "eval_runtime": 43.2056, "eval_samples_per_second": 31.13, "eval_sft_loss": 4.112879276275635, "eval_steps_per_second": 7.8, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 18.243432365338094, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.15508165955543518, "logits/rejected": -0.1456497758626938, "logps/chosen": -3.9924635887145996, "logps/rejected": -4.866345405578613, "loss": 0.543, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.9924635887145996, "rewards/margins": 0.8738818168640137, "rewards/rejected": -4.866345405578613, "sft_loss": 4.0659685134887695, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 15.494806094440678, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.1938861459493637, "logits/rejected": -0.08331102132797241, "logps/chosen": -3.9935669898986816, "logps/rejected": -4.946460723876953, "loss": 0.4851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9935669898986816, "rewards/margins": 0.9528937339782715, "rewards/rejected": -4.946460723876953, "sft_loss": 4.025681972503662, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 16.30131991335173, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.13050523400306702, "logits/rejected": -0.03008384443819523, "logps/chosen": -4.111358642578125, "logps/rejected": -5.042483329772949, "loss": 0.5328, "rewards/accuracies": 0.75, "rewards/chosen": -4.111358642578125, "rewards/margins": 0.9311251640319824, "rewards/rejected": -5.042483329772949, "sft_loss": 4.114981651306152, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 21.56330380356665, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1629718393087387, "logits/rejected": -0.08070691674947739, "logps/chosen": -4.215979099273682, "logps/rejected": -5.047354221343994, "loss": 0.567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.215979099273682, "rewards/margins": 0.8313754200935364, "rewards/rejected": -5.047354221343994, "sft_loss": 4.292203426361084, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 16.224892327360532, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.1000644713640213, "logits/rejected": -0.08347281068563461, "logps/chosen": -4.007589340209961, "logps/rejected": -4.895975589752197, "loss": 0.5239, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.007589340209961, "rewards/margins": 0.8883861303329468, "rewards/rejected": -4.895975589752197, "sft_loss": 4.015894412994385, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 18.77742441655429, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.15146948397159576, "logits/rejected": -0.12039715051651001, "logps/chosen": -4.100152492523193, "logps/rejected": -5.13730001449585, "loss": 0.5434, "rewards/accuracies": 0.75, "rewards/chosen": -4.100152492523193, "rewards/margins": 1.0371477603912354, "rewards/rejected": -5.13730001449585, "sft_loss": 4.252596378326416, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 16.75135604546139, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.21641497313976288, "logits/rejected": -0.11199178546667099, "logps/chosen": -4.030362129211426, "logps/rejected": -5.064505577087402, "loss": 0.5093, "rewards/accuracies": 0.78125, "rewards/chosen": -4.030362129211426, "rewards/margins": 1.0341436862945557, "rewards/rejected": -5.064505577087402, "sft_loss": 4.088961124420166, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 16.977008961327975, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.19566890597343445, "logits/rejected": -0.12340422719717026, "logps/chosen": -4.214592933654785, "logps/rejected": -5.046109676361084, "loss": 0.5407, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.214592933654785, "rewards/margins": 0.831516444683075, "rewards/rejected": -5.046109676361084, "sft_loss": 4.267856597900391, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 15.751632591599389, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.17926757037639618, "logits/rejected": -0.0764254480600357, "logps/chosen": -4.195579528808594, "logps/rejected": -4.90426778793335, "loss": 0.578, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.195579528808594, "rewards/margins": 0.7086880803108215, "rewards/rejected": -4.90426778793335, "sft_loss": 4.237638473510742, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 17.69253712852231, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.23232333362102509, "logits/rejected": -0.16036547720432281, "logps/chosen": -4.313710689544678, "logps/rejected": -5.077432632446289, "loss": 0.5754, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.313710689544678, "rewards/margins": 0.7637217044830322, "rewards/rejected": -5.077432632446289, "sft_loss": 4.4273176193237305, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 17.61474823443831, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.12839333713054657, "logits/rejected": -0.04960453137755394, "logps/chosen": -4.242055892944336, "logps/rejected": -5.083164691925049, "loss": 0.5837, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.242055892944336, "rewards/margins": 0.8411084413528442, "rewards/rejected": -5.083164691925049, "sft_loss": 4.243819236755371, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 18.573663769325606, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.16934773325920105, "logits/rejected": -0.07936234027147293, "logps/chosen": -4.232779026031494, "logps/rejected": -5.081835746765137, "loss": 0.5695, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.232779026031494, "rewards/margins": 0.8490568399429321, "rewards/rejected": -5.081835746765137, "sft_loss": 4.286954879760742, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 17.840037475857958, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.16358472406864166, "logits/rejected": -0.09386870265007019, "logps/chosen": -4.062561511993408, "logps/rejected": -5.115314960479736, "loss": 0.5287, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.062561511993408, "rewards/margins": 1.052753210067749, "rewards/rejected": -5.115314960479736, "sft_loss": 4.042852878570557, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 16.441670874267846, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.17569497227668762, "logits/rejected": -0.08433757722377777, "logps/chosen": -4.120484352111816, "logps/rejected": -5.193161964416504, "loss": 0.4853, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.120484352111816, "rewards/margins": 1.0726778507232666, "rewards/rejected": -5.193161964416504, "sft_loss": 4.253143787384033, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 17.437470155409795, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.2014768421649933, "logits/rejected": -0.08412306755781174, "logps/chosen": -4.331996440887451, "logps/rejected": -5.299182415008545, "loss": 0.5817, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.331996440887451, "rewards/margins": 0.9671859741210938, "rewards/rejected": -5.299182415008545, "sft_loss": 4.386224269866943, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 16.337209237194713, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.16797594726085663, "logits/rejected": -0.0618429072201252, "logps/chosen": -3.9733901023864746, "logps/rejected": -4.945980072021484, "loss": 0.493, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9733901023864746, "rewards/margins": 0.9725903272628784, "rewards/rejected": -4.945980072021484, "sft_loss": 4.024728298187256, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 22.587274153214803, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.19083619117736816, "logits/rejected": -0.11363337188959122, "logps/chosen": -4.246734142303467, "logps/rejected": -5.188324928283691, "loss": 0.5522, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.246734142303467, "rewards/margins": 0.9415907859802246, "rewards/rejected": -5.188324928283691, "sft_loss": 4.359339237213135, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 17.87721609193545, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.15688160061836243, "logits/rejected": -0.02689887024462223, "logps/chosen": -4.333113193511963, "logps/rejected": -5.23205041885376, "loss": 0.5568, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.333113193511963, "rewards/margins": 0.8989373445510864, "rewards/rejected": -5.23205041885376, "sft_loss": 4.344038963317871, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 15.53445740689773, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.15794029831886292, "logits/rejected": -0.0844997763633728, "logps/chosen": -4.22260856628418, "logps/rejected": -5.181676387786865, "loss": 0.519, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.22260856628418, "rewards/margins": 0.9590682983398438, "rewards/rejected": -5.181676387786865, "sft_loss": 4.298532962799072, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 15.416147657161021, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.14069563150405884, "logits/rejected": -0.03486606106162071, "logps/chosen": -4.119704246520996, "logps/rejected": -5.0963358879089355, "loss": 0.5201, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.119704246520996, "rewards/margins": 0.9766316413879395, "rewards/rejected": -5.0963358879089355, "sft_loss": 4.224505424499512, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 18.161096420990678, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.13422636687755585, "logits/rejected": -0.07517583668231964, "logps/chosen": -4.114509105682373, "logps/rejected": -5.2362751960754395, "loss": 0.4915, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.114509105682373, "rewards/margins": 1.1217658519744873, "rewards/rejected": -5.2362751960754395, "sft_loss": 4.160386085510254, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 18.852324743480146, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.05760626867413521, "logits/rejected": -0.020621730014681816, "logps/chosen": -4.1999030113220215, "logps/rejected": -5.0476579666137695, "loss": 0.5822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.1999030113220215, "rewards/margins": 0.847754955291748, "rewards/rejected": -5.0476579666137695, "sft_loss": 4.281563758850098, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 17.976192486737958, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.1256621778011322, "logits/rejected": -0.07518371194601059, "logps/chosen": -4.190843105316162, "logps/rejected": -5.2432451248168945, "loss": 0.5056, "rewards/accuracies": 0.75, "rewards/chosen": -4.190843105316162, "rewards/margins": 1.0524019002914429, "rewards/rejected": -5.2432451248168945, "sft_loss": 4.277374267578125, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 17.99746447214315, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.1489638090133667, "logits/rejected": -0.07571511715650558, "logps/chosen": -4.129316329956055, "logps/rejected": -5.018464088439941, "loss": 0.5469, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.129316329956055, "rewards/margins": 0.8891475796699524, "rewards/rejected": -5.018464088439941, "sft_loss": 4.103287696838379, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 16.142726001792713, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.18727155029773712, "logits/rejected": -0.0963721051812172, "logps/chosen": -4.147437572479248, "logps/rejected": -5.175294399261475, "loss": 0.4879, "rewards/accuracies": 0.78125, "rewards/chosen": -4.147437572479248, "rewards/margins": 1.0278565883636475, "rewards/rejected": -5.175294399261475, "sft_loss": 4.290009021759033, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 17.54951804309544, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.1390126645565033, "logits/rejected": -0.03979702293872833, "logps/chosen": -4.171058654785156, "logps/rejected": -5.114262104034424, "loss": 0.5063, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.171058654785156, "rewards/margins": 0.9432040452957153, "rewards/rejected": -5.114262104034424, "sft_loss": 4.278780937194824, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 24.23113549285882, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.1034446507692337, "logits/rejected": -0.03407427668571472, "logps/chosen": -4.303669452667236, "logps/rejected": -5.211419105529785, "loss": 0.5779, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.303669452667236, "rewards/margins": 0.9077495336532593, "rewards/rejected": -5.211419105529785, "sft_loss": 4.328175067901611, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 19.12008536365398, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.0796082392334938, "logits/rejected": -0.0246458537876606, "logps/chosen": -4.290400505065918, "logps/rejected": -5.158785820007324, "loss": 0.5272, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.290400505065918, "rewards/margins": 0.8683861494064331, "rewards/rejected": -5.158785820007324, "sft_loss": 4.287832736968994, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 16.47165153331192, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.12000022828578949, "logits/rejected": -0.058578480035066605, "logps/chosen": -4.189848899841309, "logps/rejected": -5.388810157775879, "loss": 0.4366, "rewards/accuracies": 0.84375, "rewards/chosen": -4.189848899841309, "rewards/margins": 1.198961615562439, "rewards/rejected": -5.388810157775879, "sft_loss": 4.208093643188477, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 15.261664502257158, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.18162012100219727, "logits/rejected": -0.09842869639396667, "logps/chosen": -4.083524703979492, "logps/rejected": -5.226982593536377, "loss": 0.4649, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.083524703979492, "rewards/margins": 1.1434574127197266, "rewards/rejected": -5.226982593536377, "sft_loss": 4.203307151794434, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 13.915547135421344, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.1771707683801651, "logits/rejected": -0.054735828191041946, "logps/chosen": -4.246321201324463, "logps/rejected": -5.439396858215332, "loss": 0.4377, "rewards/accuracies": 0.84375, "rewards/chosen": -4.246321201324463, "rewards/margins": 1.1930756568908691, "rewards/rejected": -5.439396858215332, "sft_loss": 4.391330242156982, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 13.860572104307709, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.1839606910943985, "logits/rejected": -0.12588202953338623, "logps/chosen": -4.3138556480407715, "logps/rejected": -5.5978102684021, "loss": 0.4409, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.3138556480407715, "rewards/margins": 1.2839547395706177, "rewards/rejected": -5.5978102684021, "sft_loss": 4.4018049240112305, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 20.756274256793866, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.14228351414203644, "logits/rejected": 0.011768890544772148, "logps/chosen": -4.53744649887085, "logps/rejected": -5.743522644042969, "loss": 0.4872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.53744649887085, "rewards/margins": 1.20607590675354, "rewards/rejected": -5.743522644042969, "sft_loss": 4.558747291564941, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 23.859097029142497, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.22040753066539764, "logits/rejected": -0.08732731640338898, "logps/chosen": -4.480866432189941, "logps/rejected": -5.846318244934082, "loss": 0.4618, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.480866432189941, "rewards/margins": 1.365451455116272, "rewards/rejected": -5.846318244934082, "sft_loss": 4.562276840209961, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 23.37780742989256, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.10928299278020859, "logits/rejected": -0.07061926275491714, "logps/chosen": -4.536766052246094, "logps/rejected": -5.758551597595215, "loss": 0.4649, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.536766052246094, "rewards/margins": 1.2217856645584106, "rewards/rejected": -5.758551597595215, "sft_loss": 4.541500091552734, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 16.277017836639978, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.15758976340293884, "logits/rejected": -0.11704935133457184, "logps/chosen": -4.346386909484863, "logps/rejected": -5.59005880355835, "loss": 0.4565, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.346386909484863, "rewards/margins": 1.2436727285385132, "rewards/rejected": -5.59005880355835, "sft_loss": 4.381421089172363, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 15.324472681567837, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.18570008873939514, "logits/rejected": -0.06011120602488518, "logps/chosen": -4.408538341522217, "logps/rejected": -5.611115455627441, "loss": 0.476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.408538341522217, "rewards/margins": 1.2025763988494873, "rewards/rejected": -5.611115455627441, "sft_loss": 4.448096752166748, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 13.550043619241059, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.20547811686992645, "logits/rejected": -0.07174699753522873, "logps/chosen": -4.346865177154541, "logps/rejected": -5.615252494812012, "loss": 0.4546, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.346865177154541, "rewards/margins": 1.268387794494629, "rewards/rejected": -5.615252494812012, "sft_loss": 4.4165849685668945, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 18.022751885568077, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.11128351837396622, "logits/rejected": -0.06616206467151642, "logps/chosen": -4.275948524475098, "logps/rejected": -5.372910976409912, "loss": 0.4918, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.275948524475098, "rewards/margins": 1.096962571144104, "rewards/rejected": -5.372910976409912, "sft_loss": 4.3458147048950195, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 22.99000433508882, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.1808985471725464, "logits/rejected": -0.0658467561006546, "logps/chosen": -4.3522162437438965, "logps/rejected": -5.611857891082764, "loss": 0.4523, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3522162437438965, "rewards/margins": 1.2596412897109985, "rewards/rejected": -5.611857891082764, "sft_loss": 4.430613040924072, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 24.666474774126513, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.13231855630874634, "logits/rejected": -0.009453452192246914, "logps/chosen": -4.427382469177246, "logps/rejected": -5.60190486907959, "loss": 0.4588, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.427382469177246, "rewards/margins": 1.1745221614837646, "rewards/rejected": -5.60190486907959, "sft_loss": 4.450659275054932, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 16.447166238503364, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.09605690091848373, "logits/rejected": 0.001893743872642517, "logps/chosen": -4.376564025878906, "logps/rejected": -5.6669602394104, "loss": 0.4185, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.376564025878906, "rewards/margins": 1.290395975112915, "rewards/rejected": -5.6669602394104, "sft_loss": 4.277453422546387, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 17.66655526695026, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.1871604323387146, "logits/rejected": -0.09622781723737717, "logps/chosen": -4.417603492736816, "logps/rejected": -5.642507076263428, "loss": 0.4296, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.417603492736816, "rewards/margins": 1.2249035835266113, "rewards/rejected": -5.642507076263428, "sft_loss": 4.344175815582275, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 17.318780900757023, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.22635912895202637, "logits/rejected": -0.02497316151857376, "logps/chosen": -4.55142879486084, "logps/rejected": -5.799140930175781, "loss": 0.4524, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.55142879486084, "rewards/margins": 1.2477116584777832, "rewards/rejected": -5.799140930175781, "sft_loss": 4.594216823577881, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 17.86834053588433, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.14959703385829926, "logits/rejected": -0.09499450027942657, "logps/chosen": -4.484490394592285, "logps/rejected": -5.76265811920166, "loss": 0.4261, "rewards/accuracies": 0.84375, "rewards/chosen": -4.484490394592285, "rewards/margins": 1.2781678438186646, "rewards/rejected": -5.76265811920166, "sft_loss": 4.595266342163086, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 22.682792372551617, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.2396484911441803, "logits/rejected": -0.10936751216650009, "logps/chosen": -4.647583961486816, "logps/rejected": -6.019547462463379, "loss": 0.4402, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.647583961486816, "rewards/margins": 1.3719635009765625, "rewards/rejected": -6.019547462463379, "sft_loss": 4.67317008972168, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 14.747593491123746, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.17778167128562927, "logits/rejected": -0.03496559336781502, "logps/chosen": -4.572454452514648, "logps/rejected": -5.8602800369262695, "loss": 0.4438, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.572454452514648, "rewards/margins": 1.287825107574463, "rewards/rejected": -5.8602800369262695, "sft_loss": 4.662432670593262, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 17.514615706709293, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.12904831767082214, "logits/rejected": -0.016960179433226585, "logps/chosen": -4.486534118652344, "logps/rejected": -5.739012241363525, "loss": 0.4474, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.486534118652344, "rewards/margins": 1.2524782419204712, "rewards/rejected": -5.739012241363525, "sft_loss": 4.5322442054748535, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 14.513457288617543, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.20153093338012695, "logits/rejected": -0.033189572393894196, "logps/chosen": -4.680139064788818, "logps/rejected": -6.050618648529053, "loss": 0.4093, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.680139064788818, "rewards/margins": 1.3704793453216553, "rewards/rejected": -6.050618648529053, "sft_loss": 4.678463935852051, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 18.202688604141244, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.16980481147766113, "logits/rejected": -0.024973779916763306, "logps/chosen": -4.6503071784973145, "logps/rejected": -5.726879119873047, "loss": 0.4963, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.6503071784973145, "rewards/margins": 1.0765719413757324, "rewards/rejected": -5.726879119873047, "sft_loss": 4.77303409576416, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 22.41727403559302, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.12804751098155975, "logits/rejected": 0.015957217663526535, "logps/chosen": -4.61462926864624, "logps/rejected": -6.049999237060547, "loss": 0.4642, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.61462926864624, "rewards/margins": 1.4353702068328857, "rewards/rejected": -6.049999237060547, "sft_loss": 4.627757549285889, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 23.925564315420097, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.1258188784122467, "logits/rejected": -0.03270752355456352, "logps/chosen": -4.608656883239746, "logps/rejected": -5.895892143249512, "loss": 0.456, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.608656883239746, "rewards/margins": 1.287234902381897, "rewards/rejected": -5.895892143249512, "sft_loss": 4.617652893066406, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 18.932471018151297, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.23533248901367188, "logits/rejected": -0.11765459924936295, "logps/chosen": -4.57756233215332, "logps/rejected": -5.859441757202148, "loss": 0.4244, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.57756233215332, "rewards/margins": 1.2818796634674072, "rewards/rejected": -5.859441757202148, "sft_loss": 4.577887058258057, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 21.18272309195914, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.2512609362602234, "logits/rejected": -0.16672050952911377, "logps/chosen": -4.517947673797607, "logps/rejected": -5.76108980178833, "loss": 0.463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.517947673797607, "rewards/margins": 1.2431416511535645, "rewards/rejected": -5.76108980178833, "sft_loss": 4.614432334899902, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 19.19306771230631, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.16765734553337097, "logits/rejected": -0.06805787980556488, "logps/chosen": -4.5621490478515625, "logps/rejected": -5.844240188598633, "loss": 0.4722, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.5621490478515625, "rewards/margins": 1.2820910215377808, "rewards/rejected": -5.844240188598633, "sft_loss": 4.612701416015625, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 20.707099640973883, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.14436683058738708, "logits/rejected": -0.009703554213047028, "logps/chosen": -4.375124931335449, "logps/rejected": -5.790663719177246, "loss": 0.408, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.375124931335449, "rewards/margins": 1.4155378341674805, "rewards/rejected": -5.790663719177246, "sft_loss": 4.3841071128845215, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 21.6609372072673, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.13693545758724213, "logits/rejected": -0.043102920055389404, "logps/chosen": -4.388991832733154, "logps/rejected": -5.805684566497803, "loss": 0.4339, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.388991832733154, "rewards/margins": 1.416691541671753, "rewards/rejected": -5.805684566497803, "sft_loss": 4.460447788238525, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 25.05993756394716, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.11307881772518158, "logits/rejected": -0.025426015257835388, "logps/chosen": -4.619956970214844, "logps/rejected": -5.880894184112549, "loss": 0.4679, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.619956970214844, "rewards/margins": 1.2609364986419678, "rewards/rejected": -5.880894184112549, "sft_loss": 4.613383769989014, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 15.459973618296203, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.19261129200458527, "logits/rejected": -0.010503212921321392, "logps/chosen": -4.513919353485107, "logps/rejected": -5.790491580963135, "loss": 0.4555, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.513919353485107, "rewards/margins": 1.27657151222229, "rewards/rejected": -5.790491580963135, "sft_loss": 4.4933366775512695, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 21.559862109780585, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.10432785749435425, "logits/rejected": -0.0616467110812664, "logps/chosen": -4.550601005554199, "logps/rejected": -5.83347225189209, "loss": 0.451, "rewards/accuracies": 0.8125, "rewards/chosen": -4.550601005554199, "rewards/margins": 1.282871127128601, "rewards/rejected": -5.83347225189209, "sft_loss": 4.568717002868652, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 16.97024088090951, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.11373122781515121, "logits/rejected": -0.02093740925192833, "logps/chosen": -4.710618495941162, "logps/rejected": -5.993484020233154, "loss": 0.4828, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.710618495941162, "rewards/margins": 1.2828651666641235, "rewards/rejected": -5.993484020233154, "sft_loss": 4.71906042098999, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 18.231620168854963, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.12412917613983154, "logits/rejected": -0.02643829584121704, "logps/chosen": -4.418890476226807, "logps/rejected": -5.6583147048950195, "loss": 0.4517, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.418890476226807, "rewards/margins": 1.2394243478775024, "rewards/rejected": -5.6583147048950195, "sft_loss": 4.488974571228027, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 22.63151693266803, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.1703699827194214, "logits/rejected": -0.04086681082844734, "logps/chosen": -4.718385219573975, "logps/rejected": -5.87748384475708, "loss": 0.4958, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.718385219573975, "rewards/margins": 1.1590980291366577, "rewards/rejected": -5.87748384475708, "sft_loss": 4.733608722686768, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 19.158970635544666, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.19361944496631622, "logits/rejected": -0.0514645092189312, "logps/chosen": -4.5244221687316895, "logps/rejected": -6.010369300842285, "loss": 0.4163, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.5244221687316895, "rewards/margins": 1.4859468936920166, "rewards/rejected": -6.010369300842285, "sft_loss": 4.477473258972168, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 14.404258310123362, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.13890644907951355, "logits/rejected": -0.06422574818134308, "logps/chosen": -4.478633880615234, "logps/rejected": -5.896307945251465, "loss": 0.4169, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.478633880615234, "rewards/margins": 1.4176738262176514, "rewards/rejected": -5.896307945251465, "sft_loss": 4.554999351501465, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 14.82955366511182, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.19433699548244476, "logits/rejected": -0.041183482855558395, "logps/chosen": -4.494211673736572, "logps/rejected": -5.957503795623779, "loss": 0.4309, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.494211673736572, "rewards/margins": 1.4632922410964966, "rewards/rejected": -5.957503795623779, "sft_loss": 4.525946617126465, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 19.36649542789264, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.1541624218225479, "logits/rejected": -0.05928944796323776, "logps/chosen": -4.555628776550293, "logps/rejected": -5.938043594360352, "loss": 0.4473, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.555628776550293, "rewards/margins": 1.38241446018219, "rewards/rejected": -5.938043594360352, "sft_loss": 4.585000991821289, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 22.316750205317803, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.16120049357414246, "logits/rejected": -0.09427209198474884, "logps/chosen": -4.504244804382324, "logps/rejected": -5.9012041091918945, "loss": 0.4642, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.504244804382324, "rewards/margins": 1.3969593048095703, "rewards/rejected": -5.9012041091918945, "sft_loss": 4.468905925750732, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 22.710270989091576, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.2358735054731369, "logits/rejected": -0.10350503772497177, "logps/chosen": -4.722151279449463, "logps/rejected": -5.764187812805176, "loss": 0.5286, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.722151279449463, "rewards/margins": 1.0420368909835815, "rewards/rejected": -5.764187812805176, "sft_loss": 4.828040599822998, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 31.008446115751816, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.127239391207695, "logits/rejected": 0.017133042216300964, "logps/chosen": -4.513893127441406, "logps/rejected": -5.921486854553223, "loss": 0.4369, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.513893127441406, "rewards/margins": 1.407593846321106, "rewards/rejected": -5.921486854553223, "sft_loss": 4.540274620056152, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 22.636623943914152, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.18237945437431335, "logits/rejected": -0.11283756792545319, "logps/chosen": -4.60231876373291, "logps/rejected": -6.050238609313965, "loss": 0.4553, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.60231876373291, "rewards/margins": 1.4479202032089233, "rewards/rejected": -6.050238609313965, "sft_loss": 4.625402927398682, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 12.845987005897252, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.13675661385059357, "logits/rejected": -0.032407719641923904, "logps/chosen": -4.471343040466309, "logps/rejected": -6.141912460327148, "loss": 0.3834, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.471343040466309, "rewards/margins": 1.670569658279419, "rewards/rejected": -6.141912460327148, "sft_loss": 4.563107967376709, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 22.196012568551822, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.15787668526172638, "logits/rejected": -0.027476048097014427, "logps/chosen": -4.659394264221191, "logps/rejected": -6.09100341796875, "loss": 0.4359, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.659394264221191, "rewards/margins": 1.43160879611969, "rewards/rejected": -6.09100341796875, "sft_loss": 4.784261226654053, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 16.456475986320505, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.1013205274939537, "logits/rejected": -0.004166866652667522, "logps/chosen": -4.576447010040283, "logps/rejected": -6.026632308959961, "loss": 0.4768, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.576447010040283, "rewards/margins": 1.4501855373382568, "rewards/rejected": -6.026632308959961, "sft_loss": 4.765728950500488, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 20.356120149703255, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.09850762784481049, "logits/rejected": 0.010118888691067696, "logps/chosen": -4.386921405792236, "logps/rejected": -5.722302436828613, "loss": 0.4143, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.386921405792236, "rewards/margins": 1.3353809118270874, "rewards/rejected": -5.722302436828613, "sft_loss": 4.426734447479248, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 16.710172332706804, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.1513948142528534, "logits/rejected": -0.08292113244533539, "logps/chosen": -4.454089164733887, "logps/rejected": -5.841625213623047, "loss": 0.4568, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.454089164733887, "rewards/margins": 1.3875354528427124, "rewards/rejected": -5.841625213623047, "sft_loss": 4.680628776550293, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 25.683784114058085, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.1821201890707016, "logits/rejected": -0.07850952446460724, "logps/chosen": -4.567347526550293, "logps/rejected": -5.961523056030273, "loss": 0.4302, "rewards/accuracies": 0.84375, "rewards/chosen": -4.567347526550293, "rewards/margins": 1.3941757678985596, "rewards/rejected": -5.961523056030273, "sft_loss": 4.69355583190918, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 17.18321446845876, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.21645669639110565, "logits/rejected": -0.10848214477300644, "logps/chosen": -4.355871677398682, "logps/rejected": -5.764237403869629, "loss": 0.409, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.355871677398682, "rewards/margins": 1.4083654880523682, "rewards/rejected": -5.764237403869629, "sft_loss": 4.4289021492004395, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 24.6685682814048, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.21960720419883728, "logits/rejected": -0.020451117306947708, "logps/chosen": -4.579361915588379, "logps/rejected": -5.956488132476807, "loss": 0.4663, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.579361915588379, "rewards/margins": 1.3771260976791382, "rewards/rejected": -5.956488132476807, "sft_loss": 4.609208106994629, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 24.724242452029113, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.16371068358421326, "logits/rejected": -0.09416019916534424, "logps/chosen": -4.51413631439209, "logps/rejected": -5.884792327880859, "loss": 0.4692, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.51413631439209, "rewards/margins": 1.3706560134887695, "rewards/rejected": -5.884792327880859, "sft_loss": 4.492763996124268, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.07501842081546783, "eval_logits/rejected": 0.15692532062530518, "eval_logps/chosen": -4.701979637145996, "eval_logps/rejected": -5.741509437561035, "eval_loss": 0.5829208493232727, "eval_rewards/accuracies": 0.7299703359603882, "eval_rewards/chosen": -4.701979637145996, "eval_rewards/margins": 1.0395296812057495, "eval_rewards/rejected": -5.741509437561035, "eval_runtime": 43.1772, "eval_samples_per_second": 31.151, "eval_sft_loss": 4.6998491287231445, "eval_steps_per_second": 7.805, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 18.413899749315984, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.11675839126110077, "logits/rejected": 0.04759988561272621, "logps/chosen": -4.67446231842041, "logps/rejected": -5.886545658111572, "loss": 0.4805, "rewards/accuracies": 0.78125, "rewards/chosen": -4.67446231842041, "rewards/margins": 1.212082862854004, "rewards/rejected": -5.886545658111572, "sft_loss": 4.659073352813721, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 22.822288857524793, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.17632398009300232, "logits/rejected": -0.07813229411840439, "logps/chosen": -4.495655536651611, "logps/rejected": -5.676532745361328, "loss": 0.4882, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.495655536651611, "rewards/margins": 1.1808770895004272, "rewards/rejected": -5.676532745361328, "sft_loss": 4.489293098449707, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 21.430619108993792, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.18073612451553345, "logits/rejected": 0.013807791285216808, "logps/chosen": -4.501663684844971, "logps/rejected": -5.728493690490723, "loss": 0.4803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.501663684844971, "rewards/margins": 1.2268303632736206, "rewards/rejected": -5.728493690490723, "sft_loss": 4.570557594299316, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 16.281583131218813, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.2418348342180252, "logits/rejected": -0.10622209310531616, "logps/chosen": -4.473881721496582, "logps/rejected": -5.790538311004639, "loss": 0.4472, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.473881721496582, "rewards/margins": 1.3166567087173462, "rewards/rejected": -5.790538311004639, "sft_loss": 4.559691429138184, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 25.919500533512633, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.12916305661201477, "logits/rejected": -0.04332312196493149, "logps/chosen": -4.4670305252075195, "logps/rejected": -5.607809543609619, "loss": 0.5011, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.4670305252075195, "rewards/margins": 1.1407783031463623, "rewards/rejected": -5.607809543609619, "sft_loss": 4.542896270751953, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 17.465509733604573, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.20880010724067688, "logits/rejected": -0.10321755707263947, "logps/chosen": -4.294320106506348, "logps/rejected": -5.752496719360352, "loss": 0.3927, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.294320106506348, "rewards/margins": 1.4581772089004517, "rewards/rejected": -5.752496719360352, "sft_loss": 4.305559158325195, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 15.467504253265371, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.1964549869298935, "logits/rejected": -0.17378666996955872, "logps/chosen": -4.268540859222412, "logps/rejected": -5.400685787200928, "loss": 0.4746, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.268540859222412, "rewards/margins": 1.132144808769226, "rewards/rejected": -5.400685787200928, "sft_loss": 4.281256675720215, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 19.621379459284213, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.17080268263816833, "logits/rejected": -0.16464689373970032, "logps/chosen": -4.361598491668701, "logps/rejected": -5.593031883239746, "loss": 0.4473, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.361598491668701, "rewards/margins": 1.231433391571045, "rewards/rejected": -5.593031883239746, "sft_loss": 4.3913750648498535, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 23.171660283421172, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.1492532193660736, "logits/rejected": -0.03700689598917961, "logps/chosen": -4.333517551422119, "logps/rejected": -5.488014221191406, "loss": 0.4813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.333517551422119, "rewards/margins": 1.1544969081878662, "rewards/rejected": -5.488014221191406, "sft_loss": 4.4080305099487305, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 27.373463948574507, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.20800991356372833, "logits/rejected": -0.10373286157846451, "logps/chosen": -4.645865440368652, "logps/rejected": -5.66098165512085, "loss": 0.5345, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.645865440368652, "rewards/margins": 1.0151169300079346, "rewards/rejected": -5.66098165512085, "sft_loss": 4.785841941833496, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 20.341752564006676, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.16706502437591553, "logits/rejected": -0.09402771294116974, "logps/chosen": -4.453884601593018, "logps/rejected": -5.4885149002075195, "loss": 0.49, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.453884601593018, "rewards/margins": 1.0346308946609497, "rewards/rejected": -5.4885149002075195, "sft_loss": 4.488232612609863, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 17.91385130623742, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.09191139042377472, "logits/rejected": 0.025123313069343567, "logps/chosen": -4.451213836669922, "logps/rejected": -5.648017883300781, "loss": 0.4838, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.451213836669922, "rewards/margins": 1.196804165840149, "rewards/rejected": -5.648017883300781, "sft_loss": 4.565667629241943, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 20.33735857354431, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.23012490570545197, "logits/rejected": -0.1606210172176361, "logps/chosen": -4.371445655822754, "logps/rejected": -5.787907123565674, "loss": 0.4296, "rewards/accuracies": 0.84375, "rewards/chosen": -4.371445655822754, "rewards/margins": 1.4164615869522095, "rewards/rejected": -5.787907123565674, "sft_loss": 4.472637176513672, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 22.32451002093342, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.14594785869121552, "logits/rejected": -0.002728702500462532, "logps/chosen": -4.419862747192383, "logps/rejected": -5.816904067993164, "loss": 0.4756, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.419862747192383, "rewards/margins": 1.3970410823822021, "rewards/rejected": -5.816904067993164, "sft_loss": 4.503393650054932, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 16.866151143936293, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.20224475860595703, "logits/rejected": -0.0694878101348877, "logps/chosen": -4.323914051055908, "logps/rejected": -5.589296340942383, "loss": 0.4463, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.323914051055908, "rewards/margins": 1.2653824090957642, "rewards/rejected": -5.589296340942383, "sft_loss": 4.306329727172852, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 17.495137641658133, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.20146453380584717, "logits/rejected": -0.09485888481140137, "logps/chosen": -4.480501174926758, "logps/rejected": -5.844509601593018, "loss": 0.4583, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.480501174926758, "rewards/margins": 1.3640084266662598, "rewards/rejected": -5.844509601593018, "sft_loss": 4.449831008911133, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 19.05490248938216, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.1656581461429596, "logits/rejected": -0.09022749960422516, "logps/chosen": -4.398980617523193, "logps/rejected": -5.686980247497559, "loss": 0.4286, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.398980617523193, "rewards/margins": 1.2879998683929443, "rewards/rejected": -5.686980247497559, "sft_loss": 4.383788585662842, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 17.55373016826154, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.2608991265296936, "logits/rejected": -0.11179272085428238, "logps/chosen": -4.350119590759277, "logps/rejected": -5.653628826141357, "loss": 0.4321, "rewards/accuracies": 0.84375, "rewards/chosen": -4.350119590759277, "rewards/margins": 1.303508996963501, "rewards/rejected": -5.653628826141357, "sft_loss": 4.3665666580200195, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 23.91389177081075, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.2029423713684082, "logits/rejected": -0.11459051072597504, "logps/chosen": -4.565677642822266, "logps/rejected": -5.735536575317383, "loss": 0.4509, "rewards/accuracies": 0.84375, "rewards/chosen": -4.565677642822266, "rewards/margins": 1.1698591709136963, "rewards/rejected": -5.735536575317383, "sft_loss": 4.558390140533447, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 22.697274610095754, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.14836429059505463, "logits/rejected": -0.023218411952257156, "logps/chosen": -4.4364728927612305, "logps/rejected": -5.839684009552002, "loss": 0.4072, "rewards/accuracies": 0.875, "rewards/chosen": -4.4364728927612305, "rewards/margins": 1.4032106399536133, "rewards/rejected": -5.839684009552002, "sft_loss": 4.519501686096191, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 22.896311624054537, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.1370099037885666, "logits/rejected": 0.014382824301719666, "logps/chosen": -4.753531455993652, "logps/rejected": -6.031103134155273, "loss": 0.4672, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.753531455993652, "rewards/margins": 1.2775717973709106, "rewards/rejected": -6.031103134155273, "sft_loss": 4.664409637451172, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 29.55597936412726, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.16040173172950745, "logits/rejected": 0.008886401541531086, "logps/chosen": -4.453537940979004, "logps/rejected": -5.899094104766846, "loss": 0.4303, "rewards/accuracies": 0.84375, "rewards/chosen": -4.453537940979004, "rewards/margins": 1.4455565214157104, "rewards/rejected": -5.899094104766846, "sft_loss": 4.534873962402344, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 20.09663557576658, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.12617313861846924, "logits/rejected": -0.03719883784651756, "logps/chosen": -4.5755133628845215, "logps/rejected": -5.795089244842529, "loss": 0.4644, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.5755133628845215, "rewards/margins": 1.219576120376587, "rewards/rejected": -5.795089244842529, "sft_loss": 4.645591735839844, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 22.72420330168612, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.14607496559619904, "logits/rejected": -0.07823510468006134, "logps/chosen": -4.544275283813477, "logps/rejected": -5.835035800933838, "loss": 0.4423, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.544275283813477, "rewards/margins": 1.2907607555389404, "rewards/rejected": -5.835035800933838, "sft_loss": 4.6323394775390625, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 17.82196448474604, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.19549353420734406, "logits/rejected": -0.05719345808029175, "logps/chosen": -4.4385786056518555, "logps/rejected": -5.997511863708496, "loss": 0.3972, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.4385786056518555, "rewards/margins": 1.5589336156845093, "rewards/rejected": -5.997511863708496, "sft_loss": 4.5320329666137695, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 16.45368871148943, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.12866944074630737, "logits/rejected": -0.03789632394909859, "logps/chosen": -4.623317241668701, "logps/rejected": -5.921300411224365, "loss": 0.4717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.623317241668701, "rewards/margins": 1.297982931137085, "rewards/rejected": -5.921300411224365, "sft_loss": 4.7191081047058105, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 17.071193680660553, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.09355298429727554, "logits/rejected": -0.07170907407999039, "logps/chosen": -4.602305889129639, "logps/rejected": -5.94113826751709, "loss": 0.487, "rewards/accuracies": 0.8125, "rewards/chosen": -4.602305889129639, "rewards/margins": 1.3388313055038452, "rewards/rejected": -5.94113826751709, "sft_loss": 4.62455940246582, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 20.772452374202366, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.2547750473022461, "logits/rejected": -0.19936183094978333, "logps/chosen": -4.52876615524292, "logps/rejected": -5.710700988769531, "loss": 0.4924, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.52876615524292, "rewards/margins": 1.1819345951080322, "rewards/rejected": -5.710700988769531, "sft_loss": 4.539527416229248, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 20.46616081123614, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.10002921521663666, "logits/rejected": 0.00514222402125597, "logps/chosen": -4.70742130279541, "logps/rejected": -5.912622928619385, "loss": 0.4682, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.70742130279541, "rewards/margins": 1.205202341079712, "rewards/rejected": -5.912622928619385, "sft_loss": 4.747741222381592, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 16.643090117581625, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.1663173884153366, "logits/rejected": -0.08629349619150162, "logps/chosen": -4.3680830001831055, "logps/rejected": -5.625277042388916, "loss": 0.4588, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3680830001831055, "rewards/margins": 1.2571938037872314, "rewards/rejected": -5.625277042388916, "sft_loss": 4.439862251281738, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 19.103701178808972, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.13490690290927887, "logits/rejected": -0.11030188947916031, "logps/chosen": -4.417179107666016, "logps/rejected": -5.694499969482422, "loss": 0.4652, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.417179107666016, "rewards/margins": 1.2773202657699585, "rewards/rejected": -5.694499969482422, "sft_loss": 4.395775318145752, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 18.744252464756403, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.1814108043909073, "logits/rejected": -0.06273964792490005, "logps/chosen": -4.500998497009277, "logps/rejected": -5.8500285148620605, "loss": 0.4362, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.500998497009277, "rewards/margins": 1.3490300178527832, "rewards/rejected": -5.8500285148620605, "sft_loss": 4.531396389007568, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 23.09120396450256, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.1780509650707245, "logits/rejected": -0.0687517300248146, "logps/chosen": -4.513307094573975, "logps/rejected": -5.991667747497559, "loss": 0.4279, "rewards/accuracies": 0.84375, "rewards/chosen": -4.513307094573975, "rewards/margins": 1.4783604145050049, "rewards/rejected": -5.991667747497559, "sft_loss": 4.592909336090088, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 29.263796253204035, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.04930751398205757, "logits/rejected": 0.015498518943786621, "logps/chosen": -4.494831562042236, "logps/rejected": -5.8351731300354, "loss": 0.4457, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.494831562042236, "rewards/margins": 1.340341567993164, "rewards/rejected": -5.8351731300354, "sft_loss": 4.675837516784668, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 26.915621545601567, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.15022674202919006, "logits/rejected": -0.0800619125366211, "logps/chosen": -4.476630210876465, "logps/rejected": -5.6053466796875, "loss": 0.4733, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.476630210876465, "rewards/margins": 1.1287164688110352, "rewards/rejected": -5.6053466796875, "sft_loss": 4.568302154541016, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 19.529674888613087, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.18381746113300323, "logits/rejected": -0.043142445385456085, "logps/chosen": -4.506960391998291, "logps/rejected": -5.8851518630981445, "loss": 0.428, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.506960391998291, "rewards/margins": 1.3781912326812744, "rewards/rejected": -5.8851518630981445, "sft_loss": 4.607504844665527, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 25.227913751490263, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.21912606060504913, "logits/rejected": -0.15963666141033173, "logps/chosen": -4.3517045974731445, "logps/rejected": -5.7596116065979, "loss": 0.4374, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.3517045974731445, "rewards/margins": 1.407906413078308, "rewards/rejected": -5.7596116065979, "sft_loss": 4.369559288024902, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 19.094078156725242, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.09747710078954697, "logits/rejected": -0.04323968663811684, "logps/chosen": -4.51051139831543, "logps/rejected": -5.821291446685791, "loss": 0.4778, "rewards/accuracies": 0.78125, "rewards/chosen": -4.51051139831543, "rewards/margins": 1.3107802867889404, "rewards/rejected": -5.821291446685791, "sft_loss": 4.546077251434326, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 17.462313735653243, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.2526007294654846, "logits/rejected": -0.14061518013477325, "logps/chosen": -4.462028980255127, "logps/rejected": -5.620845317840576, "loss": 0.4595, "rewards/accuracies": 0.84375, "rewards/chosen": -4.462028980255127, "rewards/margins": 1.158816933631897, "rewards/rejected": -5.620845317840576, "sft_loss": 4.531667232513428, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 21.330268716607183, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.09234540909528732, "logits/rejected": -0.11624608933925629, "logps/chosen": -4.66225528717041, "logps/rejected": -5.735785961151123, "loss": 0.5142, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.66225528717041, "rewards/margins": 1.0735304355621338, "rewards/rejected": -5.735785961151123, "sft_loss": 4.689001560211182, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 18.736173702460214, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.230657696723938, "logits/rejected": -0.13760152459144592, "logps/chosen": -4.450052261352539, "logps/rejected": -5.639276504516602, "loss": 0.4645, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.450052261352539, "rewards/margins": 1.1892242431640625, "rewards/rejected": -5.639276504516602, "sft_loss": 4.513615608215332, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 22.746472066636194, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.2065122127532959, "logits/rejected": -0.09461875259876251, "logps/chosen": -4.567070960998535, "logps/rejected": -6.106113433837891, "loss": 0.4233, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.567070960998535, "rewards/margins": 1.539042592048645, "rewards/rejected": -6.106113433837891, "sft_loss": 4.609532356262207, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 21.822458722543207, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.17818565666675568, "logits/rejected": -0.043703652918338776, "logps/chosen": -4.625606536865234, "logps/rejected": -5.955820560455322, "loss": 0.463, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.625606536865234, "rewards/margins": 1.3302134275436401, "rewards/rejected": -5.955820560455322, "sft_loss": 4.593493461608887, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 25.2782048393693, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.1916409730911255, "logits/rejected": -0.017262551933526993, "logps/chosen": -4.424893379211426, "logps/rejected": -5.842418670654297, "loss": 0.4256, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.424893379211426, "rewards/margins": 1.417525291442871, "rewards/rejected": -5.842418670654297, "sft_loss": 4.535680770874023, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 19.26626818673393, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.20236441493034363, "logits/rejected": -0.03545919433236122, "logps/chosen": -4.511693477630615, "logps/rejected": -5.835480213165283, "loss": 0.4593, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.511693477630615, "rewards/margins": 1.3237863779067993, "rewards/rejected": -5.835480213165283, "sft_loss": 4.541731834411621, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 23.875300074782405, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.1965745985507965, "logits/rejected": -0.142036572098732, "logps/chosen": -4.52227783203125, "logps/rejected": -5.851922035217285, "loss": 0.4728, "rewards/accuracies": 0.8125, "rewards/chosen": -4.52227783203125, "rewards/margins": 1.3296445608139038, "rewards/rejected": -5.851922035217285, "sft_loss": 4.446340560913086, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 16.56329987288688, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.1566283404827118, "logits/rejected": -0.08194781839847565, "logps/chosen": -4.629245758056641, "logps/rejected": -5.892148971557617, "loss": 0.4698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.629245758056641, "rewards/margins": 1.2629032135009766, "rewards/rejected": -5.892148971557617, "sft_loss": 4.6787190437316895, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 18.07301865977798, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.1772286742925644, "logits/rejected": -0.08047084510326385, "logps/chosen": -4.423053741455078, "logps/rejected": -5.777717590332031, "loss": 0.4354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.423053741455078, "rewards/margins": 1.3546632528305054, "rewards/rejected": -5.777717590332031, "sft_loss": 4.448493003845215, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 22.619828464116434, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.18942368030548096, "logits/rejected": -0.08072742819786072, "logps/chosen": -4.497479438781738, "logps/rejected": -5.813804626464844, "loss": 0.4447, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.497479438781738, "rewards/margins": 1.3163255453109741, "rewards/rejected": -5.813804626464844, "sft_loss": 4.514906406402588, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 16.656153461239374, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.20577137172222137, "logits/rejected": -0.05031289905309677, "logps/chosen": -4.567205905914307, "logps/rejected": -5.930844306945801, "loss": 0.414, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.567205905914307, "rewards/margins": 1.3636387586593628, "rewards/rejected": -5.930844306945801, "sft_loss": 4.56335973739624, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 22.169750281058562, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.18401619791984558, "logits/rejected": -0.0024409503675997257, "logps/chosen": -4.638046741485596, "logps/rejected": -6.121177673339844, "loss": 0.4219, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.638046741485596, "rewards/margins": 1.483130931854248, "rewards/rejected": -6.121177673339844, "sft_loss": 4.603224277496338, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 25.450642911331816, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.13436515629291534, "logits/rejected": -0.054591696709394455, "logps/chosen": -4.657012939453125, "logps/rejected": -5.924901485443115, "loss": 0.4568, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.657012939453125, "rewards/margins": 1.2678885459899902, "rewards/rejected": -5.924901485443115, "sft_loss": 4.595439434051514, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 15.873932160962498, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.20072786509990692, "logits/rejected": -0.07589218020439148, "logps/chosen": -4.668331623077393, "logps/rejected": -6.191826820373535, "loss": 0.4371, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.668331623077393, "rewards/margins": 1.5234956741333008, "rewards/rejected": -6.191826820373535, "sft_loss": 4.7383832931518555, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 19.993588406867985, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.12873494625091553, "logits/rejected": 0.03537944331765175, "logps/chosen": -4.6926655769348145, "logps/rejected": -6.041623115539551, "loss": 0.4639, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.6926655769348145, "rewards/margins": 1.3489577770233154, "rewards/rejected": -6.041623115539551, "sft_loss": 4.652834415435791, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 19.275439911848228, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.18791356682777405, "logits/rejected": -0.08327902108430862, "logps/chosen": -4.731060028076172, "logps/rejected": -6.04015588760376, "loss": 0.4714, "rewards/accuracies": 0.8125, "rewards/chosen": -4.731060028076172, "rewards/margins": 1.3090959787368774, "rewards/rejected": -6.04015588760376, "sft_loss": 4.791726112365723, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 21.308120586026025, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.1869659125804901, "logits/rejected": -0.11178477108478546, "logps/chosen": -4.531894683837891, "logps/rejected": -5.969760417938232, "loss": 0.4136, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.531894683837891, "rewards/margins": 1.437865972518921, "rewards/rejected": -5.969760417938232, "sft_loss": 4.582188606262207, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 21.74581959109812, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.1504012644290924, "logits/rejected": -0.1195465475320816, "logps/chosen": -4.655195236206055, "logps/rejected": -6.071804046630859, "loss": 0.418, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.655195236206055, "rewards/margins": 1.416608452796936, "rewards/rejected": -6.071804046630859, "sft_loss": 4.6643500328063965, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 32.26578170381865, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.220480278134346, "logits/rejected": -0.10281282663345337, "logps/chosen": -4.7849626541137695, "logps/rejected": -6.028841495513916, "loss": 0.4864, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.7849626541137695, "rewards/margins": 1.2438790798187256, "rewards/rejected": -6.028841495513916, "sft_loss": 4.771364212036133, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 19.27299726224707, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.19079093635082245, "logits/rejected": -0.11726488918066025, "logps/chosen": -4.7149457931518555, "logps/rejected": -5.869764804840088, "loss": 0.4676, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.7149457931518555, "rewards/margins": 1.1548184156417847, "rewards/rejected": -5.869764804840088, "sft_loss": 4.712212562561035, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 19.365753855559927, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.14444835484027863, "logits/rejected": -0.035151056945323944, "logps/chosen": -4.563656806945801, "logps/rejected": -5.817429065704346, "loss": 0.438, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.563656806945801, "rewards/margins": 1.2537721395492554, "rewards/rejected": -5.817429065704346, "sft_loss": 4.519850254058838, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 23.970306343855015, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.24623043835163116, "logits/rejected": -0.14050698280334473, "logps/chosen": -4.51409387588501, "logps/rejected": -5.959392070770264, "loss": 0.431, "rewards/accuracies": 0.84375, "rewards/chosen": -4.51409387588501, "rewards/margins": 1.4452987909317017, "rewards/rejected": -5.959392070770264, "sft_loss": 4.649843692779541, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 19.254904924716207, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.22397704422473907, "logits/rejected": -0.08937518298625946, "logps/chosen": -4.677164554595947, "logps/rejected": -6.227524757385254, "loss": 0.3844, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.677164554595947, "rewards/margins": 1.5503594875335693, "rewards/rejected": -6.227524757385254, "sft_loss": 4.709841251373291, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 24.604712868309882, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.19657690823078156, "logits/rejected": -0.07223434746265411, "logps/chosen": -4.633805274963379, "logps/rejected": -5.77435827255249, "loss": 0.5182, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.633805274963379, "rewards/margins": 1.1405527591705322, "rewards/rejected": -5.77435827255249, "sft_loss": 4.767401695251465, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 19.346911823012697, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.2063641995191574, "logits/rejected": -0.09147273004055023, "logps/chosen": -4.624153137207031, "logps/rejected": -5.970458030700684, "loss": 0.429, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.624153137207031, "rewards/margins": 1.3463048934936523, "rewards/rejected": -5.970458030700684, "sft_loss": 4.563480377197266, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 29.70356859619958, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.18906475603580475, "logits/rejected": -0.07552754133939743, "logps/chosen": -4.602474689483643, "logps/rejected": -5.924898624420166, "loss": 0.498, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.602474689483643, "rewards/margins": 1.3224231004714966, "rewards/rejected": -5.924898624420166, "sft_loss": 4.635772705078125, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 20.36618070960126, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.18279999494552612, "logits/rejected": -0.06919268518686295, "logps/chosen": -4.600367546081543, "logps/rejected": -5.84440803527832, "loss": 0.4585, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.600367546081543, "rewards/margins": 1.2440404891967773, "rewards/rejected": -5.84440803527832, "sft_loss": 4.5165205001831055, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 21.237739192090743, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.1806211918592453, "logits/rejected": -0.10852392017841339, "logps/chosen": -4.569619655609131, "logps/rejected": -5.892071723937988, "loss": 0.438, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.569619655609131, "rewards/margins": 1.3224519491195679, "rewards/rejected": -5.892071723937988, "sft_loss": 4.4747748374938965, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 21.58619347687164, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.21733203530311584, "logits/rejected": -0.08858387172222137, "logps/chosen": -4.599829196929932, "logps/rejected": -6.069632053375244, "loss": 0.4195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.599829196929932, "rewards/margins": 1.4698026180267334, "rewards/rejected": -6.069632053375244, "sft_loss": 4.639847278594971, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 40.36680316737448, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.20439603924751282, "logits/rejected": -0.11707202345132828, "logps/chosen": -4.695111274719238, "logps/rejected": -5.897182464599609, "loss": 0.5233, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.695111274719238, "rewards/margins": 1.2020713090896606, "rewards/rejected": -5.897182464599609, "sft_loss": 4.595641136169434, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 18.09753407104881, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.17222526669502258, "logits/rejected": -0.0842718854546547, "logps/chosen": -4.666041374206543, "logps/rejected": -6.027068614959717, "loss": 0.4485, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.666041374206543, "rewards/margins": 1.361027479171753, "rewards/rejected": -6.027068614959717, "sft_loss": 4.675548553466797, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 16.052846367611284, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.2250470370054245, "logits/rejected": -0.11605800688266754, "logps/chosen": -4.556109428405762, "logps/rejected": -5.891709327697754, "loss": 0.4295, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.556109428405762, "rewards/margins": 1.33560049533844, "rewards/rejected": -5.891709327697754, "sft_loss": 4.656196594238281, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 21.862687536384428, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.25336316227912903, "logits/rejected": -0.09333156794309616, "logps/chosen": -4.592108726501465, "logps/rejected": -5.923739433288574, "loss": 0.4437, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.592108726501465, "rewards/margins": 1.3316295146942139, "rewards/rejected": -5.923739433288574, "sft_loss": 4.607272148132324, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 19.63578607815804, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.13836193084716797, "logits/rejected": -0.10894973576068878, "logps/chosen": -4.5957818031311035, "logps/rejected": -5.848198890686035, "loss": 0.4655, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.5957818031311035, "rewards/margins": 1.2524168491363525, "rewards/rejected": -5.848198890686035, "sft_loss": 4.647026062011719, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 27.572479225734174, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.12179931253194809, "logits/rejected": -0.0321027971804142, "logps/chosen": -4.529701232910156, "logps/rejected": -6.020511627197266, "loss": 0.4097, "rewards/accuracies": 0.84375, "rewards/chosen": -4.529701232910156, "rewards/margins": 1.4908112287521362, "rewards/rejected": -6.020511627197266, "sft_loss": 4.580036163330078, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 24.36795285820144, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.26527541875839233, "logits/rejected": -0.08893623948097229, "logps/chosen": -4.502415657043457, "logps/rejected": -5.973431587219238, "loss": 0.4135, "rewards/accuracies": 0.84375, "rewards/chosen": -4.502415657043457, "rewards/margins": 1.4710155725479126, "rewards/rejected": -5.973431587219238, "sft_loss": 4.644287109375, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 20.54890891777198, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.20819087326526642, "logits/rejected": -0.03349591791629791, "logps/chosen": -4.512115478515625, "logps/rejected": -5.900969982147217, "loss": 0.453, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.512115478515625, "rewards/margins": 1.3888546228408813, "rewards/rejected": -5.900969982147217, "sft_loss": 4.571647644042969, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 23.865875017997862, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.24579854309558868, "logits/rejected": -0.1403156965970993, "logps/chosen": -4.625274658203125, "logps/rejected": -5.83488655090332, "loss": 0.4773, "rewards/accuracies": 0.8125, "rewards/chosen": -4.625274658203125, "rewards/margins": 1.2096123695373535, "rewards/rejected": -5.83488655090332, "sft_loss": 4.556929111480713, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 16.577622925228273, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.19563975930213928, "logits/rejected": -0.08206583559513092, "logps/chosen": -4.448563575744629, "logps/rejected": -5.7410783767700195, "loss": 0.44, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.448563575744629, "rewards/margins": 1.2925150394439697, "rewards/rejected": -5.7410783767700195, "sft_loss": 4.540297031402588, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 19.63967959827264, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.17274947464466095, "logits/rejected": -0.05672701448202133, "logps/chosen": -4.5420379638671875, "logps/rejected": -6.012210845947266, "loss": 0.4476, "rewards/accuracies": 0.8125, "rewards/chosen": -4.5420379638671875, "rewards/margins": 1.4701731204986572, "rewards/rejected": -6.012210845947266, "sft_loss": 4.543997764587402, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 18.225681233142744, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.19474905729293823, "logits/rejected": -0.0927257388830185, "logps/chosen": -4.614598274230957, "logps/rejected": -5.783566474914551, "loss": 0.4844, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.614598274230957, "rewards/margins": 1.1689684391021729, "rewards/rejected": -5.783566474914551, "sft_loss": 4.706032752990723, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.06405708938837051, "eval_logits/rejected": 0.1450890153646469, "eval_logps/chosen": -4.723513126373291, "eval_logps/rejected": -5.77617883682251, "eval_loss": 0.5826747417449951, "eval_rewards/accuracies": 0.7314540147781372, "eval_rewards/chosen": -4.723513126373291, "eval_rewards/margins": 1.0526658296585083, "eval_rewards/rejected": -5.77617883682251, "eval_runtime": 43.1706, "eval_samples_per_second": 31.155, "eval_sft_loss": 4.6692376136779785, "eval_steps_per_second": 7.806, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 21.315599162846883, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2303951233625412, "logits/rejected": -0.16923975944519043, "logps/chosen": -4.504847049713135, "logps/rejected": -5.79945182800293, "loss": 0.4302, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.504847049713135, "rewards/margins": 1.2946048974990845, "rewards/rejected": -5.79945182800293, "sft_loss": 4.5163960456848145, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 17.0429736757707, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.15470165014266968, "logits/rejected": -0.044807516038417816, "logps/chosen": -4.7473931312561035, "logps/rejected": -6.170556545257568, "loss": 0.4241, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.7473931312561035, "rewards/margins": 1.423163890838623, "rewards/rejected": -6.170556545257568, "sft_loss": 4.7713727951049805, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 19.303030539992708, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.1884719729423523, "logits/rejected": -0.06602375209331512, "logps/chosen": -4.579176425933838, "logps/rejected": -5.957730293273926, "loss": 0.4416, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.579176425933838, "rewards/margins": 1.3785539865493774, "rewards/rejected": -5.957730293273926, "sft_loss": 4.538491249084473, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 25.84773812381602, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.17406593263149261, "logits/rejected": -0.09588642418384552, "logps/chosen": -4.64821720123291, "logps/rejected": -5.7471160888671875, "loss": 0.4927, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.64821720123291, "rewards/margins": 1.0988987684249878, "rewards/rejected": -5.7471160888671875, "sft_loss": 4.654828071594238, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 15.604304686195245, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.1851363182067871, "logits/rejected": -0.08538545668125153, "logps/chosen": -4.357602596282959, "logps/rejected": -5.7189836502075195, "loss": 0.3819, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.357602596282959, "rewards/margins": 1.361380934715271, "rewards/rejected": -5.7189836502075195, "sft_loss": 4.434847354888916, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 22.95940029833988, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.21504803001880646, "logits/rejected": -0.09006574004888535, "logps/chosen": -4.535211563110352, "logps/rejected": -5.892853736877441, "loss": 0.4556, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.535211563110352, "rewards/margins": 1.3576418161392212, "rewards/rejected": -5.892853736877441, "sft_loss": 4.4517669677734375, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 16.778468087674916, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2044660598039627, "logits/rejected": -0.057089339941740036, "logps/chosen": -4.5894551277160645, "logps/rejected": -5.9368085861206055, "loss": 0.4431, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.5894551277160645, "rewards/margins": 1.3473538160324097, "rewards/rejected": -5.9368085861206055, "sft_loss": 4.624999046325684, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 18.588050354393058, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.2678273022174835, "logits/rejected": -0.06287500262260437, "logps/chosen": -4.446804523468018, "logps/rejected": -5.7586870193481445, "loss": 0.4534, "rewards/accuracies": 0.78125, "rewards/chosen": -4.446804523468018, "rewards/margins": 1.3118834495544434, "rewards/rejected": -5.7586870193481445, "sft_loss": 4.468583583831787, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 22.527000565403007, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.21276633441448212, "logits/rejected": -0.05496319383382797, "logps/chosen": -4.646910667419434, "logps/rejected": -5.7714104652404785, "loss": 0.5162, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.646910667419434, "rewards/margins": 1.124500036239624, "rewards/rejected": -5.7714104652404785, "sft_loss": 4.6487135887146, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 28.147353828645194, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.1935139149427414, "logits/rejected": -0.07589694857597351, "logps/chosen": -4.688645362854004, "logps/rejected": -6.109697341918945, "loss": 0.4635, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.688645362854004, "rewards/margins": 1.4210526943206787, "rewards/rejected": -6.109697341918945, "sft_loss": 4.691089630126953, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 20.696087353859202, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2511764168739319, "logits/rejected": -0.1509692221879959, "logps/chosen": -4.562219619750977, "logps/rejected": -5.868638038635254, "loss": 0.4661, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.562219619750977, "rewards/margins": 1.3064180612564087, "rewards/rejected": -5.868638038635254, "sft_loss": 4.586970329284668, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 15.486007590775456, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.23120412230491638, "logits/rejected": -0.0854119285941124, "logps/chosen": -4.435896396636963, "logps/rejected": -5.71600341796875, "loss": 0.4603, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.435896396636963, "rewards/margins": 1.2801072597503662, "rewards/rejected": -5.71600341796875, "sft_loss": 4.478748321533203, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 20.515836091470668, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.2171090543270111, "logits/rejected": -0.044152624905109406, "logps/chosen": -4.647055149078369, "logps/rejected": -5.933724880218506, "loss": 0.4803, "rewards/accuracies": 0.8125, "rewards/chosen": -4.647055149078369, "rewards/margins": 1.2866696119308472, "rewards/rejected": -5.933724880218506, "sft_loss": 4.59279727935791, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 20.383111324167274, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.15860562026500702, "logits/rejected": -0.07073559612035751, "logps/chosen": -4.495818138122559, "logps/rejected": -5.804797649383545, "loss": 0.4818, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.495818138122559, "rewards/margins": 1.3089797496795654, "rewards/rejected": -5.804797649383545, "sft_loss": 4.603878974914551, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 20.539838960506497, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.21058157086372375, "logits/rejected": -0.04740305244922638, "logps/chosen": -4.5614752769470215, "logps/rejected": -5.795866966247559, "loss": 0.4481, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.5614752769470215, "rewards/margins": 1.234391450881958, "rewards/rejected": -5.795866966247559, "sft_loss": 4.5146684646606445, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 24.234417451454213, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.21933972835540771, "logits/rejected": -0.10762651264667511, "logps/chosen": -4.50011682510376, "logps/rejected": -5.7032012939453125, "loss": 0.4548, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.50011682510376, "rewards/margins": 1.2030847072601318, "rewards/rejected": -5.7032012939453125, "sft_loss": 4.520796775817871, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 25.196603293763232, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.18634046614170074, "logits/rejected": -0.10818295180797577, "logps/chosen": -4.60623836517334, "logps/rejected": -5.995389938354492, "loss": 0.4354, "rewards/accuracies": 0.8125, "rewards/chosen": -4.60623836517334, "rewards/margins": 1.389150619506836, "rewards/rejected": -5.995389938354492, "sft_loss": 4.585078239440918, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 29.397343565816843, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.20830197632312775, "logits/rejected": -0.05398184061050415, "logps/chosen": -4.6167826652526855, "logps/rejected": -5.977439880371094, "loss": 0.4562, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.6167826652526855, "rewards/margins": 1.3606570959091187, "rewards/rejected": -5.977439880371094, "sft_loss": 4.660190582275391, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 20.32175487272762, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.29008620977401733, "logits/rejected": -0.134123295545578, "logps/chosen": -4.646373271942139, "logps/rejected": -5.834511756896973, "loss": 0.4876, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.646373271942139, "rewards/margins": 1.1881383657455444, "rewards/rejected": -5.834511756896973, "sft_loss": 4.637969017028809, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 20.418388075845467, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.15654954314231873, "logits/rejected": -0.11684336513280869, "logps/chosen": -4.565447807312012, "logps/rejected": -5.900856971740723, "loss": 0.4304, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.565447807312012, "rewards/margins": 1.3354084491729736, "rewards/rejected": -5.900856971740723, "sft_loss": 4.608338356018066, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 19.032283203730458, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.17595365643501282, "logits/rejected": -0.025563359260559082, "logps/chosen": -4.589142322540283, "logps/rejected": -5.8382744789123535, "loss": 0.4695, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.589142322540283, "rewards/margins": 1.2491323947906494, "rewards/rejected": -5.8382744789123535, "sft_loss": 4.566729545593262, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 19.68571244852352, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.207304909825325, "logits/rejected": -0.10162524878978729, "logps/chosen": -4.6496076583862305, "logps/rejected": -5.775534629821777, "loss": 0.5329, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.6496076583862305, "rewards/margins": 1.1259268522262573, "rewards/rejected": -5.775534629821777, "sft_loss": 4.658164024353027, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 19.34129821855337, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.21180382370948792, "logits/rejected": -0.15992474555969238, "logps/chosen": -4.42584228515625, "logps/rejected": -5.605154514312744, "loss": 0.4819, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.42584228515625, "rewards/margins": 1.1793123483657837, "rewards/rejected": -5.605154514312744, "sft_loss": 4.382297515869141, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 19.812438204851706, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.19067828357219696, "logits/rejected": -0.10383911430835724, "logps/chosen": -4.532150745391846, "logps/rejected": -5.899567127227783, "loss": 0.4297, "rewards/accuracies": 0.84375, "rewards/chosen": -4.532150745391846, "rewards/margins": 1.3674169778823853, "rewards/rejected": -5.899567127227783, "sft_loss": 4.567543029785156, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 29.735821894641667, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.2346051186323166, "logits/rejected": -0.08108378946781158, "logps/chosen": -4.545472145080566, "logps/rejected": -5.927277565002441, "loss": 0.4372, "rewards/accuracies": 0.84375, "rewards/chosen": -4.545472145080566, "rewards/margins": 1.3818058967590332, "rewards/rejected": -5.927277565002441, "sft_loss": 4.522289752960205, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 23.66699357162162, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.21056047081947327, "logits/rejected": -0.13503794372081757, "logps/chosen": -4.474379539489746, "logps/rejected": -5.554043769836426, "loss": 0.5149, "rewards/accuracies": 0.78125, "rewards/chosen": -4.474379539489746, "rewards/margins": 1.079664707183838, "rewards/rejected": -5.554043769836426, "sft_loss": 4.390741348266602, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 21.118495725284937, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.11235556751489639, "logits/rejected": -0.03817780688405037, "logps/chosen": -4.439657688140869, "logps/rejected": -5.766303062438965, "loss": 0.4661, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.439657688140869, "rewards/margins": 1.3266453742980957, "rewards/rejected": -5.766303062438965, "sft_loss": 4.571423053741455, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 19.0949745109115, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.1611403375864029, "logits/rejected": -0.05527140945196152, "logps/chosen": -4.475289821624756, "logps/rejected": -5.60997200012207, "loss": 0.5012, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.475289821624756, "rewards/margins": 1.1346817016601562, "rewards/rejected": -5.60997200012207, "sft_loss": 4.55513858795166, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 17.935024919965333, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.1411706805229187, "logits/rejected": -0.05848114565014839, "logps/chosen": -4.385329246520996, "logps/rejected": -5.652647972106934, "loss": 0.4731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.385329246520996, "rewards/margins": 1.2673189640045166, "rewards/rejected": -5.652647972106934, "sft_loss": 4.418089389801025, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 23.491598615059704, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.23309126496315002, "logits/rejected": -0.13317319750785828, "logps/chosen": -4.3950018882751465, "logps/rejected": -5.676304817199707, "loss": 0.4687, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.3950018882751465, "rewards/margins": 1.2813031673431396, "rewards/rejected": -5.676304817199707, "sft_loss": 4.40733528137207, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 19.125890362345572, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.1768358051776886, "logits/rejected": -0.07692781090736389, "logps/chosen": -4.521668434143066, "logps/rejected": -5.560128211975098, "loss": 0.5188, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.521668434143066, "rewards/margins": 1.0384600162506104, "rewards/rejected": -5.560128211975098, "sft_loss": 4.5766682624816895, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 26.133069340360283, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.1954447627067566, "logits/rejected": -0.040132030844688416, "logps/chosen": -4.481562614440918, "logps/rejected": -5.8571977615356445, "loss": 0.4486, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.481562614440918, "rewards/margins": 1.3756355047225952, "rewards/rejected": -5.8571977615356445, "sft_loss": 4.460504055023193, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 17.47989034183169, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.19489526748657227, "logits/rejected": -0.08836908638477325, "logps/chosen": -4.388577461242676, "logps/rejected": -5.794994354248047, "loss": 0.4129, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.388577461242676, "rewards/margins": 1.4064165353775024, "rewards/rejected": -5.794994354248047, "sft_loss": 4.320601463317871, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 20.420147657931476, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.14283771812915802, "logits/rejected": -0.031055014580488205, "logps/chosen": -4.619053363800049, "logps/rejected": -5.802786827087402, "loss": 0.506, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.619053363800049, "rewards/margins": 1.1837328672409058, "rewards/rejected": -5.802786827087402, "sft_loss": 4.631751537322998, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 16.9951576504403, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.1547360122203827, "logits/rejected": -0.07837474346160889, "logps/chosen": -4.421803951263428, "logps/rejected": -5.884800910949707, "loss": 0.4068, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.421803951263428, "rewards/margins": 1.4629971981048584, "rewards/rejected": -5.884800910949707, "sft_loss": 4.476590156555176, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 24.62143076337965, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.19433817267417908, "logits/rejected": -0.1463998556137085, "logps/chosen": -4.4533257484436035, "logps/rejected": -5.545527935028076, "loss": 0.509, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.4533257484436035, "rewards/margins": 1.0922021865844727, "rewards/rejected": -5.545527935028076, "sft_loss": 4.443113803863525, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 18.97664228771073, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.20991036295890808, "logits/rejected": -0.07485126703977585, "logps/chosen": -4.437937259674072, "logps/rejected": -5.744898796081543, "loss": 0.4425, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.437937259674072, "rewards/margins": 1.3069615364074707, "rewards/rejected": -5.744898796081543, "sft_loss": 4.504698276519775, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 15.47708517526208, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.21960651874542236, "logits/rejected": -0.11510632187128067, "logps/chosen": -4.449195861816406, "logps/rejected": -5.644477844238281, "loss": 0.4695, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.449195861816406, "rewards/margins": 1.1952823400497437, "rewards/rejected": -5.644477844238281, "sft_loss": 4.520184516906738, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 18.900248936085628, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.2260456085205078, "logits/rejected": -0.11913847923278809, "logps/chosen": -4.4166579246521, "logps/rejected": -5.84757137298584, "loss": 0.4537, "rewards/accuracies": 0.8125, "rewards/chosen": -4.4166579246521, "rewards/margins": 1.4309136867523193, "rewards/rejected": -5.84757137298584, "sft_loss": 4.432002544403076, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 14.872545795163669, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.19411954283714294, "logits/rejected": -0.03489295765757561, "logps/chosen": -4.461108684539795, "logps/rejected": -5.979926109313965, "loss": 0.3952, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.461108684539795, "rewards/margins": 1.5188171863555908, "rewards/rejected": -5.979926109313965, "sft_loss": 4.563147068023682, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 18.348094072972412, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.1995922327041626, "logits/rejected": -0.08010158687829971, "logps/chosen": -4.613968372344971, "logps/rejected": -5.970013618469238, "loss": 0.444, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.613968372344971, "rewards/margins": 1.356046199798584, "rewards/rejected": -5.970013618469238, "sft_loss": 4.647368907928467, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 21.471179179679307, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.20935878157615662, "logits/rejected": -0.1149827390909195, "logps/chosen": -4.587584495544434, "logps/rejected": -5.908937931060791, "loss": 0.472, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.587584495544434, "rewards/margins": 1.321352243423462, "rewards/rejected": -5.908937931060791, "sft_loss": 4.5859479904174805, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 21.406477229056268, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.14232993125915527, "logits/rejected": 0.005499015562236309, "logps/chosen": -4.5797929763793945, "logps/rejected": -5.923649787902832, "loss": 0.4475, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.5797929763793945, "rewards/margins": 1.3438562154769897, "rewards/rejected": -5.923649787902832, "sft_loss": 4.64937162399292, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 17.06758141464982, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.17623497545719147, "logits/rejected": -0.02270549163222313, "logps/chosen": -4.4073710441589355, "logps/rejected": -5.731764793395996, "loss": 0.4124, "rewards/accuracies": 0.875, "rewards/chosen": -4.4073710441589355, "rewards/margins": 1.3243931531906128, "rewards/rejected": -5.731764793395996, "sft_loss": 4.500627517700195, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 19.144035150054716, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.2248564213514328, "logits/rejected": -0.2040160447359085, "logps/chosen": -4.470706939697266, "logps/rejected": -5.8308563232421875, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.470706939697266, "rewards/margins": 1.3601499795913696, "rewards/rejected": -5.8308563232421875, "sft_loss": 4.481385231018066, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 22.95375049487042, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.1677057445049286, "logits/rejected": -0.08975866436958313, "logps/chosen": -4.464261531829834, "logps/rejected": -5.763180732727051, "loss": 0.4599, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.464261531829834, "rewards/margins": 1.2989187240600586, "rewards/rejected": -5.763180732727051, "sft_loss": 4.50925874710083, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 24.606368604482732, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.08617188036441803, "logits/rejected": 0.027888232842087746, "logps/chosen": -4.720548629760742, "logps/rejected": -6.075860023498535, "loss": 0.4594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.720548629760742, "rewards/margins": 1.3553111553192139, "rewards/rejected": -6.075860023498535, "sft_loss": 4.730906009674072, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 20.88976247783765, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.17314712703227997, "logits/rejected": -0.0717170462012291, "logps/chosen": -4.5507965087890625, "logps/rejected": -5.720621585845947, "loss": 0.4765, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.5507965087890625, "rewards/margins": 1.1698250770568848, "rewards/rejected": -5.720621585845947, "sft_loss": 4.556031703948975, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 21.419404863988852, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.1612890064716339, "logits/rejected": -0.03897259384393692, "logps/chosen": -4.47631311416626, "logps/rejected": -5.901510715484619, "loss": 0.4277, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.47631311416626, "rewards/margins": 1.4251978397369385, "rewards/rejected": -5.901510715484619, "sft_loss": 4.563240051269531, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 22.99349099347903, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.19505569338798523, "logits/rejected": -0.03052748367190361, "logps/chosen": -4.442262649536133, "logps/rejected": -5.754072189331055, "loss": 0.455, "rewards/accuracies": 0.78125, "rewards/chosen": -4.442262649536133, "rewards/margins": 1.311809778213501, "rewards/rejected": -5.754072189331055, "sft_loss": 4.400628089904785, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 24.39677745821148, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.21414044499397278, "logits/rejected": -0.08336290717124939, "logps/chosen": -4.606078624725342, "logps/rejected": -5.677779197692871, "loss": 0.5027, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.606078624725342, "rewards/margins": 1.0717008113861084, "rewards/rejected": -5.677779197692871, "sft_loss": 4.675948143005371, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 16.67601373235496, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.21918518841266632, "logits/rejected": -0.11786095798015594, "logps/chosen": -4.559045314788818, "logps/rejected": -5.861749172210693, "loss": 0.4316, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.559045314788818, "rewards/margins": 1.3027039766311646, "rewards/rejected": -5.861749172210693, "sft_loss": 4.51124906539917, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 24.482882384709082, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.17604468762874603, "logits/rejected": -0.08922283351421356, "logps/chosen": -4.497008800506592, "logps/rejected": -5.698709487915039, "loss": 0.4736, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.497008800506592, "rewards/margins": 1.2017009258270264, "rewards/rejected": -5.698709487915039, "sft_loss": 4.484135627746582, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 23.61614221979428, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.18093156814575195, "logits/rejected": -0.06446141004562378, "logps/chosen": -4.420372486114502, "logps/rejected": -5.614178657531738, "loss": 0.4644, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.420372486114502, "rewards/margins": 1.1938055753707886, "rewards/rejected": -5.614178657531738, "sft_loss": 4.375947952270508, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 20.222681675125408, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.14194490015506744, "logits/rejected": -0.02431192621588707, "logps/chosen": -4.505153179168701, "logps/rejected": -5.789482593536377, "loss": 0.4896, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.505153179168701, "rewards/margins": 1.284328579902649, "rewards/rejected": -5.789482593536377, "sft_loss": 4.513728141784668, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 26.123162472684733, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.13329777121543884, "logits/rejected": -0.041541434824466705, "logps/chosen": -4.424647331237793, "logps/rejected": -5.76423978805542, "loss": 0.4591, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.424647331237793, "rewards/margins": 1.339592695236206, "rewards/rejected": -5.76423978805542, "sft_loss": 4.433927536010742, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 18.289289506883286, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.19680531322956085, "logits/rejected": -0.023869935423135757, "logps/chosen": -4.395602226257324, "logps/rejected": -5.696691036224365, "loss": 0.4444, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.395602226257324, "rewards/margins": 1.3010880947113037, "rewards/rejected": -5.696691036224365, "sft_loss": 4.393080711364746, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 23.42798024718646, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.16270551085472107, "logits/rejected": -0.09330668300390244, "logps/chosen": -4.4305620193481445, "logps/rejected": -5.681832313537598, "loss": 0.4913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.4305620193481445, "rewards/margins": 1.2512702941894531, "rewards/rejected": -5.681832313537598, "sft_loss": 4.389323711395264, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 18.106279307173804, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.12708857655525208, "logits/rejected": 0.012785923667252064, "logps/chosen": -4.554836750030518, "logps/rejected": -5.922824859619141, "loss": 0.4373, "rewards/accuracies": 0.84375, "rewards/chosen": -4.554836750030518, "rewards/margins": 1.367988109588623, "rewards/rejected": -5.922824859619141, "sft_loss": 4.59060525894165, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 20.937481813773303, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.23682132363319397, "logits/rejected": -0.12528641521930695, "logps/chosen": -4.4873552322387695, "logps/rejected": -5.65595006942749, "loss": 0.4756, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.4873552322387695, "rewards/margins": 1.1685948371887207, "rewards/rejected": -5.65595006942749, "sft_loss": 4.530056953430176, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 21.90677504227491, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.16787834465503693, "logits/rejected": -0.11186468601226807, "logps/chosen": -4.4697723388671875, "logps/rejected": -5.794044494628906, "loss": 0.4555, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.4697723388671875, "rewards/margins": 1.3242720365524292, "rewards/rejected": -5.794044494628906, "sft_loss": 4.4074015617370605, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 30.185251138033113, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.20095805823802948, "logits/rejected": -0.054882220923900604, "logps/chosen": -4.525169372558594, "logps/rejected": -5.853812217712402, "loss": 0.4558, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.525169372558594, "rewards/margins": 1.3286423683166504, "rewards/rejected": -5.853812217712402, "sft_loss": 4.510501384735107, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 16.891355001288364, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.12636831402778625, "logits/rejected": -0.0890660285949707, "logps/chosen": -4.470862865447998, "logps/rejected": -5.971408843994141, "loss": 0.3888, "rewards/accuracies": 0.84375, "rewards/chosen": -4.470862865447998, "rewards/margins": 1.5005453824996948, "rewards/rejected": -5.971408843994141, "sft_loss": 4.454566955566406, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 22.97275851657326, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.2002909630537033, "logits/rejected": -0.07835756242275238, "logps/chosen": -4.467009544372559, "logps/rejected": -5.867058753967285, "loss": 0.4153, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.467009544372559, "rewards/margins": 1.4000494480133057, "rewards/rejected": -5.867058753967285, "sft_loss": 4.419620513916016, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 21.449232242890638, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.20349101722240448, "logits/rejected": -0.12118975818157196, "logps/chosen": -4.590170383453369, "logps/rejected": -5.913938999176025, "loss": 0.4874, "rewards/accuracies": 0.78125, "rewards/chosen": -4.590170383453369, "rewards/margins": 1.3237687349319458, "rewards/rejected": -5.913938999176025, "sft_loss": 4.63556432723999, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 23.121120970429534, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.1534847766160965, "logits/rejected": -0.010326864197850227, "logps/chosen": -4.503686428070068, "logps/rejected": -5.993782043457031, "loss": 0.3923, "rewards/accuracies": 0.875, "rewards/chosen": -4.503686428070068, "rewards/margins": 1.490094542503357, "rewards/rejected": -5.993782043457031, "sft_loss": 4.55232572555542, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 23.65333237348769, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.13787660002708435, "logits/rejected": -0.1245235949754715, "logps/chosen": -4.618861198425293, "logps/rejected": -5.837153434753418, "loss": 0.4411, "rewards/accuracies": 0.8125, "rewards/chosen": -4.618861198425293, "rewards/margins": 1.2182929515838623, "rewards/rejected": -5.837153434753418, "sft_loss": 4.580845832824707, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 15.40543868202436, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.16966880857944489, "logits/rejected": -0.0535128228366375, "logps/chosen": -4.637502670288086, "logps/rejected": -5.820466995239258, "loss": 0.4795, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.637502670288086, "rewards/margins": 1.1829637289047241, "rewards/rejected": -5.820466995239258, "sft_loss": 4.6907525062561035, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 21.93460157693937, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.16312888264656067, "logits/rejected": -0.10609817504882812, "logps/chosen": -4.652926445007324, "logps/rejected": -5.991219520568848, "loss": 0.4601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.652926445007324, "rewards/margins": 1.3382928371429443, "rewards/rejected": -5.991219520568848, "sft_loss": 4.78752326965332, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 17.835844674143832, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.13755422830581665, "logits/rejected": 0.015838632360100746, "logps/chosen": -4.4717793464660645, "logps/rejected": -6.0261688232421875, "loss": 0.3881, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.4717793464660645, "rewards/margins": 1.55439031124115, "rewards/rejected": -6.0261688232421875, "sft_loss": 4.499505996704102, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 19.09786893711236, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.21794326603412628, "logits/rejected": -0.10194908082485199, "logps/chosen": -4.589028358459473, "logps/rejected": -6.074400424957275, "loss": 0.4358, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.589028358459473, "rewards/margins": 1.485371708869934, "rewards/rejected": -6.074400424957275, "sft_loss": 4.711187362670898, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 18.429959869226373, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.19435083866119385, "logits/rejected": -0.1180337518453598, "logps/chosen": -4.5188517570495605, "logps/rejected": -6.058254718780518, "loss": 0.3959, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.5188517570495605, "rewards/margins": 1.5394032001495361, "rewards/rejected": -6.058254718780518, "sft_loss": 4.598109722137451, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 23.62838873223034, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.21478557586669922, "logits/rejected": -0.09002258628606796, "logps/chosen": -4.732692241668701, "logps/rejected": -5.903644561767578, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": -4.732692241668701, "rewards/margins": 1.1709522008895874, "rewards/rejected": -5.903644561767578, "sft_loss": 4.822795391082764, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 24.455111835636977, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.1903330385684967, "logits/rejected": -0.08132892847061157, "logps/chosen": -4.575113773345947, "logps/rejected": -6.173352241516113, "loss": 0.4173, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.575113773345947, "rewards/margins": 1.5982379913330078, "rewards/rejected": -6.173352241516113, "sft_loss": 4.533818244934082, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 25.654521999410346, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.08815367519855499, "logits/rejected": -0.01084714476019144, "logps/chosen": -4.67705774307251, "logps/rejected": -5.991430282592773, "loss": 0.5073, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.67705774307251, "rewards/margins": 1.3143724203109741, "rewards/rejected": -5.991430282592773, "sft_loss": 4.634039878845215, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 16.36428300774091, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.21527810394763947, "logits/rejected": -0.10454368591308594, "logps/chosen": -4.534337043762207, "logps/rejected": -5.820903778076172, "loss": 0.4397, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.534337043762207, "rewards/margins": 1.2865673303604126, "rewards/rejected": -5.820903778076172, "sft_loss": 4.576798439025879, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 24.91226273927461, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.16502061486244202, "logits/rejected": -0.11993427574634552, "logps/chosen": -4.4962334632873535, "logps/rejected": -5.591463088989258, "loss": 0.5122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.4962334632873535, "rewards/margins": 1.0952298641204834, "rewards/rejected": -5.591463088989258, "sft_loss": 4.489527702331543, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 20.207249255688556, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.22081604599952698, "logits/rejected": -0.12781080603599548, "logps/chosen": -4.531004905700684, "logps/rejected": -5.898108959197998, "loss": 0.4655, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.531004905700684, "rewards/margins": 1.3671048879623413, "rewards/rejected": -5.898108959197998, "sft_loss": 4.539250373840332, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 25.688124298764876, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.15706852078437805, "logits/rejected": -0.07518056035041809, "logps/chosen": -4.669719696044922, "logps/rejected": -5.945387363433838, "loss": 0.4959, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.669719696044922, "rewards/margins": 1.275667667388916, "rewards/rejected": -5.945387363433838, "sft_loss": 4.66204309463501, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 15.85847980111402, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.16903331875801086, "logits/rejected": -0.08962948620319366, "logps/chosen": -4.538792610168457, "logps/rejected": -5.683047771453857, "loss": 0.488, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.538792610168457, "rewards/margins": 1.1442553997039795, "rewards/rejected": -5.683047771453857, "sft_loss": 4.609192848205566, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.04863176867365837, "eval_logits/rejected": 0.12808875739574432, "eval_logps/chosen": -4.621333599090576, "eval_logps/rejected": -5.670347690582275, "eval_loss": 0.5791551470756531, "eval_rewards/accuracies": 0.7314540147781372, "eval_rewards/chosen": -4.621333599090576, "eval_rewards/margins": 1.0490138530731201, "eval_rewards/rejected": -5.670347690582275, "eval_runtime": 43.0522, "eval_samples_per_second": 31.241, "eval_sft_loss": 4.5805439949035645, "eval_steps_per_second": 7.828, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 28.40401193081405, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.22222542762756348, "logits/rejected": -0.12885430455207825, "logps/chosen": -4.472836494445801, "logps/rejected": -5.696991920471191, "loss": 0.5093, "rewards/accuracies": 0.75, "rewards/chosen": -4.472836494445801, "rewards/margins": 1.2241547107696533, "rewards/rejected": -5.696991920471191, "sft_loss": 4.47830867767334, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 19.21048232317064, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.1699303239583969, "logits/rejected": -0.09898178279399872, "logps/chosen": -4.3903422355651855, "logps/rejected": -5.721817970275879, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.3903422355651855, "rewards/margins": 1.3314763307571411, "rewards/rejected": -5.721817970275879, "sft_loss": 4.442036151885986, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 23.669309007915814, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.1649537980556488, "logits/rejected": -0.08180878311395645, "logps/chosen": -4.342033863067627, "logps/rejected": -5.731744766235352, "loss": 0.4991, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.342033863067627, "rewards/margins": 1.3897111415863037, "rewards/rejected": -5.731744766235352, "sft_loss": 4.478572845458984, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 23.418773392822963, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.15347550809383392, "logits/rejected": -0.06517787277698517, "logps/chosen": -4.519292831420898, "logps/rejected": -5.659243583679199, "loss": 0.4812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.519292831420898, "rewards/margins": 1.1399506330490112, "rewards/rejected": -5.659243583679199, "sft_loss": 4.461321830749512, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 21.89645608677314, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.1391296088695526, "logits/rejected": -0.016931544989347458, "logps/chosen": -4.406808376312256, "logps/rejected": -5.737373352050781, "loss": 0.444, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.406808376312256, "rewards/margins": 1.3305647373199463, "rewards/rejected": -5.737373352050781, "sft_loss": 4.336057662963867, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 26.078054368087265, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.1936092972755432, "logits/rejected": -0.07124066352844238, "logps/chosen": -4.561959266662598, "logps/rejected": -5.777171611785889, "loss": 0.4782, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.561959266662598, "rewards/margins": 1.2152132987976074, "rewards/rejected": -5.777171611785889, "sft_loss": 4.577217102050781, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 24.452744091437754, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.2836676239967346, "logits/rejected": -0.13801008462905884, "logps/chosen": -4.371440410614014, "logps/rejected": -5.8245134353637695, "loss": 0.3939, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.371440410614014, "rewards/margins": 1.4530731439590454, "rewards/rejected": -5.8245134353637695, "sft_loss": 4.396610736846924, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 26.04172258859895, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.22704604268074036, "logits/rejected": -0.06665907800197601, "logps/chosen": -4.336719989776611, "logps/rejected": -5.717637538909912, "loss": 0.4088, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.336719989776611, "rewards/margins": 1.3809177875518799, "rewards/rejected": -5.717637538909912, "sft_loss": 4.287871360778809, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 19.417064570155066, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.17872127890586853, "logits/rejected": -0.13257905840873718, "logps/chosen": -4.509469985961914, "logps/rejected": -5.748656272888184, "loss": 0.4598, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.509469985961914, "rewards/margins": 1.2391860485076904, "rewards/rejected": -5.748656272888184, "sft_loss": 4.612636566162109, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 22.393659481421555, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.2519914507865906, "logits/rejected": -0.1284635365009308, "logps/chosen": -4.571159839630127, "logps/rejected": -5.952051162719727, "loss": 0.4556, "rewards/accuracies": 0.8125, "rewards/chosen": -4.571159839630127, "rewards/margins": 1.3808910846710205, "rewards/rejected": -5.952051162719727, "sft_loss": 4.579138278961182, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 23.395650557521304, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.1402450054883957, "logits/rejected": -0.042079776525497437, "logps/chosen": -4.393143177032471, "logps/rejected": -5.735674858093262, "loss": 0.4525, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.393143177032471, "rewards/margins": 1.3425315618515015, "rewards/rejected": -5.735674858093262, "sft_loss": 4.468847751617432, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 21.597640854860202, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.23359227180480957, "logits/rejected": -0.16883578896522522, "logps/chosen": -4.371655464172363, "logps/rejected": -5.632245063781738, "loss": 0.4662, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.371655464172363, "rewards/margins": 1.2605891227722168, "rewards/rejected": -5.632245063781738, "sft_loss": 4.476510047912598, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 14.828011791421977, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.231710746884346, "logits/rejected": -0.15900401771068573, "logps/chosen": -4.545974254608154, "logps/rejected": -5.904170989990234, "loss": 0.4425, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.545974254608154, "rewards/margins": 1.3581969738006592, "rewards/rejected": -5.904170989990234, "sft_loss": 4.569455146789551, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 17.522975343074165, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.17051482200622559, "logits/rejected": -0.05159657076001167, "logps/chosen": -4.31076717376709, "logps/rejected": -5.912814140319824, "loss": 0.3718, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.31076717376709, "rewards/margins": 1.6020472049713135, "rewards/rejected": -5.912814140319824, "sft_loss": 4.290585517883301, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 23.416699694984377, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.15978005528450012, "logits/rejected": -0.07179448008537292, "logps/chosen": -4.384526252746582, "logps/rejected": -5.747738838195801, "loss": 0.4704, "rewards/accuracies": 0.8125, "rewards/chosen": -4.384526252746582, "rewards/margins": 1.3632128238677979, "rewards/rejected": -5.747738838195801, "sft_loss": 4.449900150299072, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 13.224366039507197, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.1055552214384079, "logits/rejected": -0.021091172471642494, "logps/chosen": -4.444762229919434, "logps/rejected": -5.985116958618164, "loss": 0.4185, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.444762229919434, "rewards/margins": 1.5403542518615723, "rewards/rejected": -5.985116958618164, "sft_loss": 4.531604290008545, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 18.263200480543908, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.14596322178840637, "logits/rejected": -0.07446818053722382, "logps/chosen": -4.471606254577637, "logps/rejected": -5.95240592956543, "loss": 0.4304, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.471606254577637, "rewards/margins": 1.4807993173599243, "rewards/rejected": -5.95240592956543, "sft_loss": 4.543080806732178, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 24.880001903936733, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.24391219019889832, "logits/rejected": -0.12067997455596924, "logps/chosen": -4.523651599884033, "logps/rejected": -5.8638715744018555, "loss": 0.4841, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.523651599884033, "rewards/margins": 1.3402198553085327, "rewards/rejected": -5.8638715744018555, "sft_loss": 4.605062007904053, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 16.600718727582695, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.17819949984550476, "logits/rejected": -0.09157485514879227, "logps/chosen": -4.5048604011535645, "logps/rejected": -5.635102272033691, "loss": 0.4862, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.5048604011535645, "rewards/margins": 1.130241870880127, "rewards/rejected": -5.635102272033691, "sft_loss": 4.5297441482543945, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 20.472988122442064, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.1733657419681549, "logits/rejected": -0.02662757597863674, "logps/chosen": -4.53520393371582, "logps/rejected": -5.874253273010254, "loss": 0.4225, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.53520393371582, "rewards/margins": 1.3390496969223022, "rewards/rejected": -5.874253273010254, "sft_loss": 4.551875591278076, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 18.341533084524283, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.19445575773715973, "logits/rejected": -0.06603357195854187, "logps/chosen": -4.4857587814331055, "logps/rejected": -5.981295108795166, "loss": 0.4073, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.4857587814331055, "rewards/margins": 1.4955353736877441, "rewards/rejected": -5.981295108795166, "sft_loss": 4.578952789306641, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 19.871427369730235, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.18480312824249268, "logits/rejected": -0.17410850524902344, "logps/chosen": -4.46319580078125, "logps/rejected": -5.887537956237793, "loss": 0.4518, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.46319580078125, "rewards/margins": 1.4243428707122803, "rewards/rejected": -5.887537956237793, "sft_loss": 4.509976387023926, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 24.37262155582398, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.1744360476732254, "logits/rejected": -0.06569372117519379, "logps/chosen": -4.4764838218688965, "logps/rejected": -5.778898239135742, "loss": 0.4683, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.4764838218688965, "rewards/margins": 1.3024141788482666, "rewards/rejected": -5.778898239135742, "sft_loss": 4.4765305519104, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 24.944943946905017, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.19176459312438965, "logits/rejected": -0.10058772563934326, "logps/chosen": -4.545384883880615, "logps/rejected": -5.858805179595947, "loss": 0.4509, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.545384883880615, "rewards/margins": 1.313420057296753, "rewards/rejected": -5.858805179595947, "sft_loss": 4.5715155601501465, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 21.66738204995071, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.29400959610939026, "logits/rejected": -0.11239423602819443, "logps/chosen": -4.527563571929932, "logps/rejected": -5.918554782867432, "loss": 0.4511, "rewards/accuracies": 0.8125, "rewards/chosen": -4.527563571929932, "rewards/margins": 1.3909912109375, "rewards/rejected": -5.918554782867432, "sft_loss": 4.582327365875244, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 18.703531975391247, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.17439576983451843, "logits/rejected": -0.0235174261033535, "logps/chosen": -4.596892356872559, "logps/rejected": -6.077221870422363, "loss": 0.4409, "rewards/accuracies": 0.8125, "rewards/chosen": -4.596892356872559, "rewards/margins": 1.4803297519683838, "rewards/rejected": -6.077221870422363, "sft_loss": 4.584397315979004, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 27.68319198176985, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.22327034175395966, "logits/rejected": -0.07386596500873566, "logps/chosen": -4.4573798179626465, "logps/rejected": -5.789246559143066, "loss": 0.4672, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.4573798179626465, "rewards/margins": 1.3318663835525513, "rewards/rejected": -5.789246559143066, "sft_loss": 4.468945026397705, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 17.45229984530719, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.206891268491745, "logits/rejected": -0.12008295953273773, "logps/chosen": -4.3516435623168945, "logps/rejected": -5.558261871337891, "loss": 0.4572, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.3516435623168945, "rewards/margins": 1.206618309020996, "rewards/rejected": -5.558261871337891, "sft_loss": 4.415630340576172, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 20.112252743949814, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.17998364567756653, "logits/rejected": -0.018372971564531326, "logps/chosen": -4.503326416015625, "logps/rejected": -5.923501968383789, "loss": 0.4281, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.503326416015625, "rewards/margins": 1.4201748371124268, "rewards/rejected": -5.923501968383789, "sft_loss": 4.460989952087402, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 20.59258516506547, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.13803830742835999, "logits/rejected": -0.009364024735987186, "logps/chosen": -4.485349178314209, "logps/rejected": -5.7924652099609375, "loss": 0.4846, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.485349178314209, "rewards/margins": 1.307115912437439, "rewards/rejected": -5.7924652099609375, "sft_loss": 4.460837364196777, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 20.11852869952089, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.19584974646568298, "logits/rejected": -0.13580578565597534, "logps/chosen": -4.634210586547852, "logps/rejected": -5.871407985687256, "loss": 0.4463, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.634210586547852, "rewards/margins": 1.2371981143951416, "rewards/rejected": -5.871407985687256, "sft_loss": 4.665798664093018, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 17.373477823994854, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.1474190056324005, "logits/rejected": -0.04052499681711197, "logps/chosen": -4.419074058532715, "logps/rejected": -5.859983444213867, "loss": 0.4256, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.419074058532715, "rewards/margins": 1.4409091472625732, "rewards/rejected": -5.859983444213867, "sft_loss": 4.4970293045043945, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 29.72961340708789, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.2138177454471588, "logits/rejected": -0.10813770443201065, "logps/chosen": -4.4357476234436035, "logps/rejected": -5.829280376434326, "loss": 0.4271, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.4357476234436035, "rewards/margins": 1.3935325145721436, "rewards/rejected": -5.829280376434326, "sft_loss": 4.5532660484313965, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 18.7127890117876, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.13064627349376678, "logits/rejected": -0.059014834463596344, "logps/chosen": -4.3094706535339355, "logps/rejected": -5.6937994956970215, "loss": 0.4111, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.3094706535339355, "rewards/margins": 1.3843294382095337, "rewards/rejected": -5.6937994956970215, "sft_loss": 4.256752014160156, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 22.453793570838283, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.16990868747234344, "logits/rejected": -0.12744008004665375, "logps/chosen": -4.531280040740967, "logps/rejected": -5.8299784660339355, "loss": 0.4399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.531280040740967, "rewards/margins": 1.2986981868743896, "rewards/rejected": -5.8299784660339355, "sft_loss": 4.526133060455322, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 20.880815590545996, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.08516103774309158, "logits/rejected": -0.051754094660282135, "logps/chosen": -4.615936756134033, "logps/rejected": -5.832221031188965, "loss": 0.488, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.615936756134033, "rewards/margins": 1.216284155845642, "rewards/rejected": -5.832221031188965, "sft_loss": 4.542150020599365, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 21.3886728116191, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.15201832354068756, "logits/rejected": -0.011524543166160583, "logps/chosen": -4.438300132751465, "logps/rejected": -5.841962814331055, "loss": 0.475, "rewards/accuracies": 0.78125, "rewards/chosen": -4.438300132751465, "rewards/margins": 1.4036626815795898, "rewards/rejected": -5.841962814331055, "sft_loss": 4.412807464599609, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 20.236923101464793, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.22584716975688934, "logits/rejected": -0.0964241698384285, "logps/chosen": -4.3834710121154785, "logps/rejected": -5.847751617431641, "loss": 0.3923, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.3834710121154785, "rewards/margins": 1.464280605316162, "rewards/rejected": -5.847751617431641, "sft_loss": 4.467441082000732, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 19.207220133232692, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.16925375163555145, "logits/rejected": -0.14062543213367462, "logps/chosen": -4.560122489929199, "logps/rejected": -5.656733512878418, "loss": 0.5064, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.560122489929199, "rewards/margins": 1.0966103076934814, "rewards/rejected": -5.656733512878418, "sft_loss": 4.595259189605713, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 20.535443637891856, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.18426719307899475, "logits/rejected": -0.13645689189434052, "logps/chosen": -4.579115867614746, "logps/rejected": -5.923556327819824, "loss": 0.4787, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.579115867614746, "rewards/margins": 1.34443998336792, "rewards/rejected": -5.923556327819824, "sft_loss": 4.646383762359619, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 28.422069720008892, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.13655778765678406, "logits/rejected": -0.09209270775318146, "logps/chosen": -4.541821479797363, "logps/rejected": -5.684293270111084, "loss": 0.4885, "rewards/accuracies": 0.78125, "rewards/chosen": -4.541821479797363, "rewards/margins": 1.142472505569458, "rewards/rejected": -5.684293270111084, "sft_loss": 4.564027786254883, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 22.58653703001671, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.1473383754491806, "logits/rejected": -0.06587468087673187, "logps/chosen": -4.53951358795166, "logps/rejected": -5.852242469787598, "loss": 0.4315, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.53951358795166, "rewards/margins": 1.3127288818359375, "rewards/rejected": -5.852242469787598, "sft_loss": 4.513645172119141, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 20.623675139503522, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.1609375923871994, "logits/rejected": -0.09451863914728165, "logps/chosen": -4.543221950531006, "logps/rejected": -5.874283313751221, "loss": 0.446, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.543221950531006, "rewards/margins": 1.3310611248016357, "rewards/rejected": -5.874283313751221, "sft_loss": 4.466743469238281, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 27.424395934018733, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.1978655755519867, "logits/rejected": -0.022298278287053108, "logps/chosen": -4.504530429840088, "logps/rejected": -5.7812910079956055, "loss": 0.4722, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.504530429840088, "rewards/margins": 1.2767612934112549, "rewards/rejected": -5.7812910079956055, "sft_loss": 4.603614330291748, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 18.544009839500788, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.21366408467292786, "logits/rejected": -0.09033291786909103, "logps/chosen": -4.505580902099609, "logps/rejected": -5.765819072723389, "loss": 0.4661, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.505580902099609, "rewards/margins": 1.2602382898330688, "rewards/rejected": -5.765819072723389, "sft_loss": 4.506991386413574, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 17.208970877155696, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.17819947004318237, "logits/rejected": -0.10755584388971329, "logps/chosen": -4.437621593475342, "logps/rejected": -5.8965911865234375, "loss": 0.42, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.437621593475342, "rewards/margins": 1.4589693546295166, "rewards/rejected": -5.8965911865234375, "sft_loss": 4.461600303649902, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 29.64941568431905, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.20304307341575623, "logits/rejected": -0.09568870812654495, "logps/chosen": -4.634688854217529, "logps/rejected": -5.94057559967041, "loss": 0.4707, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.634688854217529, "rewards/margins": 1.3058862686157227, "rewards/rejected": -5.94057559967041, "sft_loss": 4.67722225189209, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 21.137086003538187, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.04583621770143509, "logits/rejected": -0.04740852862596512, "logps/chosen": -4.654051780700684, "logps/rejected": -5.890873908996582, "loss": 0.4632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.654051780700684, "rewards/margins": 1.2368220090866089, "rewards/rejected": -5.890873908996582, "sft_loss": 4.616780757904053, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 23.607549620355137, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.12733376026153564, "logits/rejected": -0.1056627482175827, "logps/chosen": -4.537964820861816, "logps/rejected": -5.770925998687744, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.537964820861816, "rewards/margins": 1.232961654663086, "rewards/rejected": -5.770925998687744, "sft_loss": 4.539140224456787, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 20.197067487892724, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.19815947115421295, "logits/rejected": -0.1313100904226303, "logps/chosen": -4.479102611541748, "logps/rejected": -5.704444885253906, "loss": 0.4442, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.479102611541748, "rewards/margins": 1.225342035293579, "rewards/rejected": -5.704444885253906, "sft_loss": 4.477992534637451, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 20.31153163158568, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.2582859992980957, "logits/rejected": -0.16854605078697205, "logps/chosen": -4.308932304382324, "logps/rejected": -5.7119317054748535, "loss": 0.4183, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.308932304382324, "rewards/margins": 1.4029991626739502, "rewards/rejected": -5.7119317054748535, "sft_loss": 4.360212802886963, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 27.64314575330156, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.17026914656162262, "logits/rejected": -0.1082666888833046, "logps/chosen": -4.433135986328125, "logps/rejected": -5.6108222007751465, "loss": 0.508, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.433135986328125, "rewards/margins": 1.1776866912841797, "rewards/rejected": -5.6108222007751465, "sft_loss": 4.465333461761475, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 28.104949777755163, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.19289958477020264, "logits/rejected": -0.027002420276403427, "logps/chosen": -4.320894241333008, "logps/rejected": -5.665889263153076, "loss": 0.4547, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.320894241333008, "rewards/margins": 1.3449946641921997, "rewards/rejected": -5.665889263153076, "sft_loss": 4.365025520324707, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 24.584489114151147, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.11105088144540787, "logits/rejected": -0.0008586436742916703, "logps/chosen": -4.451926231384277, "logps/rejected": -5.640947341918945, "loss": 0.4576, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.451926231384277, "rewards/margins": 1.1890199184417725, "rewards/rejected": -5.640947341918945, "sft_loss": 4.59604549407959, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 24.071061668209328, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.14649274945259094, "logits/rejected": -0.08841396123170853, "logps/chosen": -4.365932941436768, "logps/rejected": -5.378201961517334, "loss": 0.5444, "rewards/accuracies": 0.75, "rewards/chosen": -4.365932941436768, "rewards/margins": 1.0122692584991455, "rewards/rejected": -5.378201961517334, "sft_loss": 4.456226348876953, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 20.194893373629437, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.23665407299995422, "logits/rejected": -0.11111712455749512, "logps/chosen": -4.488452434539795, "logps/rejected": -5.783753871917725, "loss": 0.48, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.488452434539795, "rewards/margins": 1.2953013181686401, "rewards/rejected": -5.783753871917725, "sft_loss": 4.546128273010254, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 29.905678180384932, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.20285113155841827, "logits/rejected": -0.11245715618133545, "logps/chosen": -4.482570171356201, "logps/rejected": -6.00087833404541, "loss": 0.3989, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.482570171356201, "rewards/margins": 1.5183079242706299, "rewards/rejected": -6.00087833404541, "sft_loss": 4.5170183181762695, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 25.34663503129227, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.23477861285209656, "logits/rejected": -0.09957059472799301, "logps/chosen": -4.484523296356201, "logps/rejected": -5.851978778839111, "loss": 0.4357, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.484523296356201, "rewards/margins": 1.3674547672271729, "rewards/rejected": -5.851978778839111, "sft_loss": 4.524469375610352, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 20.649410979346033, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.2946515679359436, "logits/rejected": -0.1434052437543869, "logps/chosen": -4.407382965087891, "logps/rejected": -5.828757286071777, "loss": 0.4491, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.407382965087891, "rewards/margins": 1.421373724937439, "rewards/rejected": -5.828757286071777, "sft_loss": 4.452073574066162, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 14.846615490916948, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.12604109942913055, "logits/rejected": -0.03208146244287491, "logps/chosen": -4.457130432128906, "logps/rejected": -5.801394462585449, "loss": 0.4663, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.457130432128906, "rewards/margins": 1.3442646265029907, "rewards/rejected": -5.801394462585449, "sft_loss": 4.512373447418213, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 23.775898636913556, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.17571856081485748, "logits/rejected": -0.03184592351317406, "logps/chosen": -4.659298419952393, "logps/rejected": -5.972817420959473, "loss": 0.4619, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.659298419952393, "rewards/margins": 1.3135192394256592, "rewards/rejected": -5.972817420959473, "sft_loss": 4.618160724639893, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 21.048811512128385, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.2019394189119339, "logits/rejected": -0.10773040354251862, "logps/chosen": -4.420794486999512, "logps/rejected": -5.719508171081543, "loss": 0.4866, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.420794486999512, "rewards/margins": 1.298714518547058, "rewards/rejected": -5.719508171081543, "sft_loss": 4.430346965789795, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 20.902772933512118, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.2315623015165329, "logits/rejected": -0.08020760864019394, "logps/chosen": -4.5212626457214355, "logps/rejected": -5.924075603485107, "loss": 0.4086, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.5212626457214355, "rewards/margins": 1.402813196182251, "rewards/rejected": -5.924075603485107, "sft_loss": 4.498833656311035, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 22.033682126642848, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.18592925369739532, "logits/rejected": -0.1098877415060997, "logps/chosen": -4.457711219787598, "logps/rejected": -5.814774513244629, "loss": 0.4181, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.457711219787598, "rewards/margins": 1.3570640087127686, "rewards/rejected": -5.814774513244629, "sft_loss": 4.481400966644287, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 26.65306589485683, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.2802940607070923, "logits/rejected": -0.09975512325763702, "logps/chosen": -4.371427536010742, "logps/rejected": -5.729374885559082, "loss": 0.4328, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.371427536010742, "rewards/margins": 1.3579468727111816, "rewards/rejected": -5.729374885559082, "sft_loss": 4.468634605407715, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 22.924473762441067, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.25745660066604614, "logits/rejected": -0.11118185520172119, "logps/chosen": -4.527166366577148, "logps/rejected": -5.842087268829346, "loss": 0.4577, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.527166366577148, "rewards/margins": 1.3149210214614868, "rewards/rejected": -5.842087268829346, "sft_loss": 4.613772392272949, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 28.080862924064185, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.10482903569936752, "logits/rejected": -0.03856141120195389, "logps/chosen": -4.515239715576172, "logps/rejected": -5.718465805053711, "loss": 0.4708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.515239715576172, "rewards/margins": 1.2032257318496704, "rewards/rejected": -5.718465805053711, "sft_loss": 4.546355247497559, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 20.91587610599803, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.13982251286506653, "logits/rejected": -0.006131963804364204, "logps/chosen": -4.486544609069824, "logps/rejected": -5.8271355628967285, "loss": 0.4315, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.486544609069824, "rewards/margins": 1.3405910730361938, "rewards/rejected": -5.8271355628967285, "sft_loss": 4.430808067321777, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 29.84654159466731, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.15242649614810944, "logits/rejected": -0.048605434596538544, "logps/chosen": -4.512834072113037, "logps/rejected": -5.716804504394531, "loss": 0.495, "rewards/accuracies": 0.78125, "rewards/chosen": -4.512834072113037, "rewards/margins": 1.2039706707000732, "rewards/rejected": -5.716804504394531, "sft_loss": 4.546947479248047, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 20.74155029479811, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.20727205276489258, "logits/rejected": -0.11031541973352432, "logps/chosen": -4.420102596282959, "logps/rejected": -5.79683256149292, "loss": 0.455, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.420102596282959, "rewards/margins": 1.37673020362854, "rewards/rejected": -5.79683256149292, "sft_loss": 4.442891597747803, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 17.627201565134865, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.23875252902507782, "logits/rejected": -0.16499489545822144, "logps/chosen": -4.42592191696167, "logps/rejected": -5.7121901512146, "loss": 0.4581, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.42592191696167, "rewards/margins": 1.2862684726715088, "rewards/rejected": -5.7121901512146, "sft_loss": 4.519508361816406, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 16.664853153492285, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.20979630947113037, "logits/rejected": -0.15912654995918274, "logps/chosen": -4.3233184814453125, "logps/rejected": -5.714047431945801, "loss": 0.4377, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.3233184814453125, "rewards/margins": 1.3907288312911987, "rewards/rejected": -5.714047431945801, "sft_loss": 4.3558807373046875, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 23.880807330166387, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.20513634383678436, "logits/rejected": -0.0711812898516655, "logps/chosen": -4.392642021179199, "logps/rejected": -5.718276500701904, "loss": 0.4452, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.392642021179199, "rewards/margins": 1.3256343603134155, "rewards/rejected": -5.718276500701904, "sft_loss": 4.4098711013793945, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 19.38565844548832, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.1626313477754593, "logits/rejected": -0.0420430526137352, "logps/chosen": -4.573996067047119, "logps/rejected": -5.955582141876221, "loss": 0.4305, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.573996067047119, "rewards/margins": 1.3815863132476807, "rewards/rejected": -5.955582141876221, "sft_loss": 4.616714000701904, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 19.94973916398742, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.19201573729515076, "logits/rejected": -0.09315890818834305, "logps/chosen": -4.613008975982666, "logps/rejected": -6.132317543029785, "loss": 0.3862, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.613008975982666, "rewards/margins": 1.5193077325820923, "rewards/rejected": -6.132317543029785, "sft_loss": 4.541656017303467, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 28.348181428348507, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.15497739613056183, "logits/rejected": -0.12709686160087585, "logps/chosen": -4.606642723083496, "logps/rejected": -5.7925705909729, "loss": 0.4855, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.606642723083496, "rewards/margins": 1.1859276294708252, "rewards/rejected": -5.7925705909729, "sft_loss": 4.743796348571777, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 19.21117198685213, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.16036248207092285, "logits/rejected": -0.06798889487981796, "logps/chosen": -4.460104942321777, "logps/rejected": -5.908174991607666, "loss": 0.4599, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.460104942321777, "rewards/margins": 1.4480699300765991, "rewards/rejected": -5.908174991607666, "sft_loss": 4.530155181884766, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 18.868779898400874, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.22829198837280273, "logits/rejected": -0.08963672816753387, "logps/chosen": -4.394246578216553, "logps/rejected": -5.759983062744141, "loss": 0.4229, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.394246578216553, "rewards/margins": 1.365736722946167, "rewards/rejected": -5.759983062744141, "sft_loss": 4.44966983795166, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 23.987850542329372, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.118210569024086, "logits/rejected": 0.015512818470597267, "logps/chosen": -4.505238056182861, "logps/rejected": -5.804290771484375, "loss": 0.4518, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.505238056182861, "rewards/margins": 1.299053430557251, "rewards/rejected": -5.804290771484375, "sft_loss": 4.5167741775512695, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 28.404369786995133, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.23681633174419403, "logits/rejected": -0.10690312087535858, "logps/chosen": -4.501443862915039, "logps/rejected": -5.874948978424072, "loss": 0.4404, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.501443862915039, "rewards/margins": 1.3735042810440063, "rewards/rejected": -5.874948978424072, "sft_loss": 4.492587089538574, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.004443091340363026, "eval_logits/rejected": 0.08074604719877243, "eval_logps/chosen": -4.662295341491699, "eval_logps/rejected": -5.713897705078125, "eval_loss": 0.5804030895233154, "eval_rewards/accuracies": 0.7299703359603882, "eval_rewards/chosen": -4.662295341491699, "eval_rewards/margins": 1.0516023635864258, "eval_rewards/rejected": -5.713897705078125, "eval_runtime": 43.3695, "eval_samples_per_second": 31.013, "eval_sft_loss": 4.627921104431152, "eval_steps_per_second": 7.77, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 14.254841170461553, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.1561775803565979, "logits/rejected": -0.09926787763834, "logps/chosen": -4.417360782623291, "logps/rejected": -5.744927406311035, "loss": 0.4445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.417360782623291, "rewards/margins": 1.327566385269165, "rewards/rejected": -5.744927406311035, "sft_loss": 4.419335842132568, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 23.786728971169076, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.16632170975208282, "logits/rejected": -0.0739097073674202, "logps/chosen": -4.5204010009765625, "logps/rejected": -5.843813896179199, "loss": 0.4654, "rewards/accuracies": 0.78125, "rewards/chosen": -4.5204010009765625, "rewards/margins": 1.3234120607376099, "rewards/rejected": -5.843813896179199, "sft_loss": 4.5244293212890625, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 19.868166471320215, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.15556737780570984, "logits/rejected": -0.08350914716720581, "logps/chosen": -4.369675636291504, "logps/rejected": -5.737880229949951, "loss": 0.4225, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.369675636291504, "rewards/margins": 1.3682044744491577, "rewards/rejected": -5.737880229949951, "sft_loss": 4.400622844696045, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 21.003889345878, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.22633075714111328, "logits/rejected": -0.13380743563175201, "logps/chosen": -4.498518943786621, "logps/rejected": -6.063634395599365, "loss": 0.3886, "rewards/accuracies": 0.84375, "rewards/chosen": -4.498518943786621, "rewards/margins": 1.5651153326034546, "rewards/rejected": -6.063634395599365, "sft_loss": 4.527626991271973, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 30.031111476221355, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.19734984636306763, "logits/rejected": -0.06000862643122673, "logps/chosen": -4.527247428894043, "logps/rejected": -5.8383283615112305, "loss": 0.4602, "rewards/accuracies": 0.8125, "rewards/chosen": -4.527247428894043, "rewards/margins": 1.3110812902450562, "rewards/rejected": -5.8383283615112305, "sft_loss": 4.496323108673096, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 13.79924264199127, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.25380197167396545, "logits/rejected": -0.14649678766727448, "logps/chosen": -4.464987277984619, "logps/rejected": -5.718521595001221, "loss": 0.4444, "rewards/accuracies": 0.8125, "rewards/chosen": -4.464987277984619, "rewards/margins": 1.2535343170166016, "rewards/rejected": -5.718521595001221, "sft_loss": 4.407475471496582, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 17.07340604138416, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.23314595222473145, "logits/rejected": -0.10958156734704971, "logps/chosen": -4.368558883666992, "logps/rejected": -5.746652603149414, "loss": 0.4283, "rewards/accuracies": 0.875, "rewards/chosen": -4.368558883666992, "rewards/margins": 1.3780934810638428, "rewards/rejected": -5.746652603149414, "sft_loss": 4.360150337219238, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 23.518858818671433, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.20950329303741455, "logits/rejected": -0.06466137617826462, "logps/chosen": -4.563027381896973, "logps/rejected": -5.900126934051514, "loss": 0.4538, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.563027381896973, "rewards/margins": 1.337099552154541, "rewards/rejected": -5.900126934051514, "sft_loss": 4.570873737335205, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 24.999816021866938, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.1656816601753235, "logits/rejected": -0.11390833556652069, "logps/chosen": -4.472662925720215, "logps/rejected": -5.7310919761657715, "loss": 0.4565, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.472662925720215, "rewards/margins": 1.2584285736083984, "rewards/rejected": -5.7310919761657715, "sft_loss": 4.561809062957764, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 29.623488406148326, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.24879387021064758, "logits/rejected": -0.10537517070770264, "logps/chosen": -4.528120994567871, "logps/rejected": -5.904018402099609, "loss": 0.4535, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.528120994567871, "rewards/margins": 1.3758974075317383, "rewards/rejected": -5.904018402099609, "sft_loss": 4.5284528732299805, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 24.91543026763308, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.20283707976341248, "logits/rejected": -0.04213991388678551, "logps/chosen": -4.693485736846924, "logps/rejected": -6.021272659301758, "loss": 0.437, "rewards/accuracies": 0.8125, "rewards/chosen": -4.693485736846924, "rewards/margins": 1.3277864456176758, "rewards/rejected": -6.021272659301758, "sft_loss": 4.669791221618652, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 27.77678531151836, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.19460207223892212, "logits/rejected": -0.14369544386863708, "logps/chosen": -4.511590957641602, "logps/rejected": -5.745457172393799, "loss": 0.5007, "rewards/accuracies": 0.8125, "rewards/chosen": -4.511590957641602, "rewards/margins": 1.233865737915039, "rewards/rejected": -5.745457172393799, "sft_loss": 4.625870704650879, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 16.198917525623145, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.23445066809654236, "logits/rejected": -0.07343915104866028, "logps/chosen": -4.5098185539245605, "logps/rejected": -5.809727668762207, "loss": 0.4351, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.5098185539245605, "rewards/margins": 1.2999083995819092, "rewards/rejected": -5.809727668762207, "sft_loss": 4.538376808166504, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 19.862797168255526, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.12250743806362152, "logits/rejected": -0.02016635611653328, "logps/chosen": -4.527732849121094, "logps/rejected": -5.898034572601318, "loss": 0.4157, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.527732849121094, "rewards/margins": 1.3703019618988037, "rewards/rejected": -5.898034572601318, "sft_loss": 4.592249870300293, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 17.504706416015686, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.18533574044704437, "logits/rejected": -0.06128733232617378, "logps/chosen": -4.526907920837402, "logps/rejected": -5.828028678894043, "loss": 0.4167, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.526907920837402, "rewards/margins": 1.3011205196380615, "rewards/rejected": -5.828028678894043, "sft_loss": 4.476199150085449, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 17.491483364419327, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.16981545090675354, "logits/rejected": -0.03595195338129997, "logps/chosen": -4.685810565948486, "logps/rejected": -6.149535179138184, "loss": 0.4297, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.685810565948486, "rewards/margins": 1.463724136352539, "rewards/rejected": -6.149535179138184, "sft_loss": 4.639585494995117, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 22.288334925361855, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.21739113330841064, "logits/rejected": -0.10404038429260254, "logps/chosen": -4.522845268249512, "logps/rejected": -5.9513468742370605, "loss": 0.4301, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.522845268249512, "rewards/margins": 1.4285022020339966, "rewards/rejected": -5.9513468742370605, "sft_loss": 4.524170875549316, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 22.12734147118357, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.2011188268661499, "logits/rejected": -0.08531059324741364, "logps/chosen": -4.478837013244629, "logps/rejected": -5.821646213531494, "loss": 0.437, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.478837013244629, "rewards/margins": 1.3428093194961548, "rewards/rejected": -5.821646213531494, "sft_loss": 4.5520429611206055, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 17.345638013409147, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.2327616959810257, "logits/rejected": -0.09521780163049698, "logps/chosen": -4.699143409729004, "logps/rejected": -6.117920398712158, "loss": 0.441, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.699143409729004, "rewards/margins": 1.4187771081924438, "rewards/rejected": -6.117920398712158, "sft_loss": 4.68923282623291, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 22.487036504680443, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.2045392543077469, "logits/rejected": -0.12336097657680511, "logps/chosen": -4.553119659423828, "logps/rejected": -5.9328413009643555, "loss": 0.4254, "rewards/accuracies": 0.84375, "rewards/chosen": -4.553119659423828, "rewards/margins": 1.3797214031219482, "rewards/rejected": -5.9328413009643555, "sft_loss": 4.5086469650268555, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 32.11819853883587, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.205168679356575, "logits/rejected": -0.1323503851890564, "logps/chosen": -4.535611152648926, "logps/rejected": -5.771416187286377, "loss": 0.4884, "rewards/accuracies": 0.78125, "rewards/chosen": -4.535611152648926, "rewards/margins": 1.235804796218872, "rewards/rejected": -5.771416187286377, "sft_loss": 4.574914455413818, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 16.610425842713934, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.27479854226112366, "logits/rejected": -0.12086167186498642, "logps/chosen": -4.571263313293457, "logps/rejected": -6.1058855056762695, "loss": 0.4631, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.571263313293457, "rewards/margins": 1.5346229076385498, "rewards/rejected": -6.1058855056762695, "sft_loss": 4.600201606750488, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 23.574681868463003, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.1529221087694168, "logits/rejected": -0.018142305314540863, "logps/chosen": -4.53646993637085, "logps/rejected": -5.736028671264648, "loss": 0.4657, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.53646993637085, "rewards/margins": 1.1995582580566406, "rewards/rejected": -5.736028671264648, "sft_loss": 4.598328113555908, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 20.776939138193498, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.14258281886577606, "logits/rejected": -0.08337229490280151, "logps/chosen": -4.521661281585693, "logps/rejected": -5.844458103179932, "loss": 0.4226, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.521661281585693, "rewards/margins": 1.3227964639663696, "rewards/rejected": -5.844458103179932, "sft_loss": 4.469352722167969, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 17.319274589751448, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.19008512794971466, "logits/rejected": -0.04209180548787117, "logps/chosen": -4.5797271728515625, "logps/rejected": -6.05983829498291, "loss": 0.4024, "rewards/accuracies": 0.84375, "rewards/chosen": -4.5797271728515625, "rewards/margins": 1.4801113605499268, "rewards/rejected": -6.05983829498291, "sft_loss": 4.614199638366699, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 24.1489639271008, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.14118270576000214, "logits/rejected": -0.0662260577082634, "logps/chosen": -4.535606384277344, "logps/rejected": -5.975742340087891, "loss": 0.4511, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.535606384277344, "rewards/margins": 1.4401354789733887, "rewards/rejected": -5.975742340087891, "sft_loss": 4.580137252807617, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 23.231064740110014, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.13686378300189972, "logits/rejected": -0.15220175683498383, "logps/chosen": -4.509222984313965, "logps/rejected": -5.577731132507324, "loss": 0.5055, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.509222984313965, "rewards/margins": 1.0685081481933594, "rewards/rejected": -5.577731132507324, "sft_loss": 4.560286521911621, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 19.238118519796025, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.1739339679479599, "logits/rejected": -0.10641157627105713, "logps/chosen": -4.436322212219238, "logps/rejected": -5.848782539367676, "loss": 0.4311, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.436322212219238, "rewards/margins": 1.4124599695205688, "rewards/rejected": -5.848782539367676, "sft_loss": 4.444838523864746, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 17.343251096745842, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.27960461378097534, "logits/rejected": -0.1422613561153412, "logps/chosen": -4.423908710479736, "logps/rejected": -5.740886688232422, "loss": 0.4248, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.423908710479736, "rewards/margins": 1.3169782161712646, "rewards/rejected": -5.740886688232422, "sft_loss": 4.494323253631592, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 20.911071402007103, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.1199701800942421, "logits/rejected": -0.10320685058832169, "logps/chosen": -4.56076717376709, "logps/rejected": -5.661184787750244, "loss": 0.5211, "rewards/accuracies": 0.8125, "rewards/chosen": -4.56076717376709, "rewards/margins": 1.100416660308838, "rewards/rejected": -5.661184787750244, "sft_loss": 4.549746513366699, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 17.127061328827036, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.1598750650882721, "logits/rejected": -0.08833174407482147, "logps/chosen": -4.526861667633057, "logps/rejected": -5.655618190765381, "loss": 0.4739, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.526861667633057, "rewards/margins": 1.1287572383880615, "rewards/rejected": -5.655618190765381, "sft_loss": 4.569698333740234, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 23.43981155553961, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.2364230901002884, "logits/rejected": -0.11034750938415527, "logps/chosen": -4.5471601486206055, "logps/rejected": -5.74447774887085, "loss": 0.4807, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.5471601486206055, "rewards/margins": 1.1973185539245605, "rewards/rejected": -5.74447774887085, "sft_loss": 4.5481438636779785, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 15.1758889330313, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.2452334612607956, "logits/rejected": -0.14978325366973877, "logps/chosen": -4.424814701080322, "logps/rejected": -5.862797737121582, "loss": 0.4287, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.424814701080322, "rewards/margins": 1.4379831552505493, "rewards/rejected": -5.862797737121582, "sft_loss": 4.417214393615723, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 30.66560852395439, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.10269834846258163, "logits/rejected": -0.06337957084178925, "logps/chosen": -4.571722984313965, "logps/rejected": -5.661180019378662, "loss": 0.5119, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.571722984313965, "rewards/margins": 1.0894571542739868, "rewards/rejected": -5.661180019378662, "sft_loss": 4.687636375427246, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 23.770599957503787, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.1984288990497589, "logits/rejected": -0.1162128821015358, "logps/chosen": -4.328442096710205, "logps/rejected": -6.042115688323975, "loss": 0.3941, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.328442096710205, "rewards/margins": 1.7136739492416382, "rewards/rejected": -6.042115688323975, "sft_loss": 4.449916362762451, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 22.056122132564443, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.22836804389953613, "logits/rejected": -0.05655550956726074, "logps/chosen": -4.483396053314209, "logps/rejected": -5.878445148468018, "loss": 0.4264, "rewards/accuracies": 0.8125, "rewards/chosen": -4.483396053314209, "rewards/margins": 1.3950486183166504, "rewards/rejected": -5.878445148468018, "sft_loss": 4.536786079406738, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 13.671372307202047, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.15236499905586243, "logits/rejected": -0.08293196558952332, "logps/chosen": -4.356810569763184, "logps/rejected": -5.7592902183532715, "loss": 0.4255, "rewards/accuracies": 0.84375, "rewards/chosen": -4.356810569763184, "rewards/margins": 1.4024803638458252, "rewards/rejected": -5.7592902183532715, "sft_loss": 4.476162433624268, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 21.38253964489955, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.20900790393352509, "logits/rejected": -0.08444569259881973, "logps/chosen": -4.661978244781494, "logps/rejected": -5.805869102478027, "loss": 0.4992, "rewards/accuracies": 0.78125, "rewards/chosen": -4.661978244781494, "rewards/margins": 1.1438905000686646, "rewards/rejected": -5.805869102478027, "sft_loss": 4.756087303161621, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 20.906677985013843, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.3279460668563843, "logits/rejected": -0.16175897419452667, "logps/chosen": -4.424436569213867, "logps/rejected": -5.78138542175293, "loss": 0.4469, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.424436569213867, "rewards/margins": 1.3569484949111938, "rewards/rejected": -5.78138542175293, "sft_loss": 4.508726119995117, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 21.798446467514992, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.21985670924186707, "logits/rejected": -0.03419061750173569, "logps/chosen": -4.344112396240234, "logps/rejected": -6.010276794433594, "loss": 0.3772, "rewards/accuracies": 0.84375, "rewards/chosen": -4.344112396240234, "rewards/margins": 1.6661643981933594, "rewards/rejected": -6.010276794433594, "sft_loss": 4.2952728271484375, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 16.7135439809327, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.15331804752349854, "logits/rejected": -0.09507913887500763, "logps/chosen": -4.596510410308838, "logps/rejected": -5.7885541915893555, "loss": 0.4997, "rewards/accuracies": 0.78125, "rewards/chosen": -4.596510410308838, "rewards/margins": 1.192043423652649, "rewards/rejected": -5.7885541915893555, "sft_loss": 4.558037281036377, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 14.363309540256903, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.19555512070655823, "logits/rejected": -0.10160569846630096, "logps/chosen": -4.516275405883789, "logps/rejected": -6.0450921058654785, "loss": 0.4521, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.516275405883789, "rewards/margins": 1.5288162231445312, "rewards/rejected": -6.0450921058654785, "sft_loss": 4.563652038574219, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 15.761856304056796, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.20529595017433167, "logits/rejected": -0.04511453956365585, "logps/chosen": -4.611117839813232, "logps/rejected": -5.723728179931641, "loss": 0.4991, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.611117839813232, "rewards/margins": 1.112610936164856, "rewards/rejected": -5.723728179931641, "sft_loss": 4.745556354522705, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 18.519294928115645, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.17930540442466736, "logits/rejected": -0.12331312894821167, "logps/chosen": -4.547412872314453, "logps/rejected": -5.6964521408081055, "loss": 0.5089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.547412872314453, "rewards/margins": 1.1490387916564941, "rewards/rejected": -5.6964521408081055, "sft_loss": 4.611541748046875, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 19.685910361653935, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.18745280802249908, "logits/rejected": -0.11353820562362671, "logps/chosen": -4.590461730957031, "logps/rejected": -6.08115291595459, "loss": 0.4268, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.590461730957031, "rewards/margins": 1.4906920194625854, "rewards/rejected": -6.08115291595459, "sft_loss": 4.633131504058838, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 18.955138875512645, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.2041642665863037, "logits/rejected": -0.051218412816524506, "logps/chosen": -4.56999397277832, "logps/rejected": -5.976227760314941, "loss": 0.4072, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.56999397277832, "rewards/margins": 1.406233310699463, "rewards/rejected": -5.976227760314941, "sft_loss": 4.611903667449951, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 20.24734901216716, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.2677224576473236, "logits/rejected": -0.19448812305927277, "logps/chosen": -4.410407066345215, "logps/rejected": -5.844793796539307, "loss": 0.4192, "rewards/accuracies": 0.84375, "rewards/chosen": -4.410407066345215, "rewards/margins": 1.434386968612671, "rewards/rejected": -5.844793796539307, "sft_loss": 4.4703145027160645, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 17.967450509751167, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.21685071289539337, "logits/rejected": -0.11500433832406998, "logps/chosen": -4.51969575881958, "logps/rejected": -5.686825275421143, "loss": 0.4601, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.51969575881958, "rewards/margins": 1.1671292781829834, "rewards/rejected": -5.686825275421143, "sft_loss": 4.533324718475342, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 21.95311126238352, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.06685139238834381, "logits/rejected": -0.013647640123963356, "logps/chosen": -4.410046100616455, "logps/rejected": -5.914144992828369, "loss": 0.4668, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.410046100616455, "rewards/margins": 1.504098892211914, "rewards/rejected": -5.914144992828369, "sft_loss": 4.446289539337158, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 27.19157248100409, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.08666609227657318, "logits/rejected": -0.014598068781197071, "logps/chosen": -4.55157995223999, "logps/rejected": -5.894360542297363, "loss": 0.4501, "rewards/accuracies": 0.84375, "rewards/chosen": -4.55157995223999, "rewards/margins": 1.3427811861038208, "rewards/rejected": -5.894360542297363, "sft_loss": 4.572675704956055, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 18.407918261075594, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.2196597307920456, "logits/rejected": -0.06329698115587234, "logps/chosen": -4.536431789398193, "logps/rejected": -5.69393253326416, "loss": 0.4873, "rewards/accuracies": 0.78125, "rewards/chosen": -4.536431789398193, "rewards/margins": 1.157500982284546, "rewards/rejected": -5.69393253326416, "sft_loss": 4.546270847320557, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 13.718386083706063, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.23244929313659668, "logits/rejected": -0.10611079633235931, "logps/chosen": -4.383098602294922, "logps/rejected": -5.782674312591553, "loss": 0.4394, "rewards/accuracies": 0.84375, "rewards/chosen": -4.383098602294922, "rewards/margins": 1.3995754718780518, "rewards/rejected": -5.782674312591553, "sft_loss": 4.387734889984131, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 13.546582964970408, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.23045757412910461, "logits/rejected": -0.06325678527355194, "logps/chosen": -4.604036808013916, "logps/rejected": -5.757791042327881, "loss": 0.4742, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.604036808013916, "rewards/margins": 1.1537543535232544, "rewards/rejected": -5.757791042327881, "sft_loss": 4.583401679992676, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 31.614601842690373, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.17662116885185242, "logits/rejected": -0.09832396358251572, "logps/chosen": -4.517539024353027, "logps/rejected": -5.673015117645264, "loss": 0.5014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.517539024353027, "rewards/margins": 1.1554756164550781, "rewards/rejected": -5.673015117645264, "sft_loss": 4.618490695953369, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 20.929350242079916, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.15189789235591888, "logits/rejected": -0.18337905406951904, "logps/chosen": -4.477793216705322, "logps/rejected": -5.649487495422363, "loss": 0.483, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.477793216705322, "rewards/margins": 1.1716941595077515, "rewards/rejected": -5.649487495422363, "sft_loss": 4.517508029937744, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 20.138531932920706, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.20178821682929993, "logits/rejected": -0.10650360584259033, "logps/chosen": -4.510470390319824, "logps/rejected": -5.833439826965332, "loss": 0.461, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.510470390319824, "rewards/margins": 1.3229695558547974, "rewards/rejected": -5.833439826965332, "sft_loss": 4.496905326843262, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 19.23378663117495, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.15263617038726807, "logits/rejected": -0.020247094333171844, "logps/chosen": -4.409241676330566, "logps/rejected": -5.532423496246338, "loss": 0.4561, "rewards/accuracies": 0.84375, "rewards/chosen": -4.409241676330566, "rewards/margins": 1.1231818199157715, "rewards/rejected": -5.532423496246338, "sft_loss": 4.408474445343018, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 23.308633524253832, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.22360029816627502, "logits/rejected": -0.17736531794071198, "logps/chosen": -4.466728210449219, "logps/rejected": -5.737008094787598, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.466728210449219, "rewards/margins": 1.2702802419662476, "rewards/rejected": -5.737008094787598, "sft_loss": 4.493401527404785, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 16.189471930127553, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.19785809516906738, "logits/rejected": -0.04793568700551987, "logps/chosen": -4.383427619934082, "logps/rejected": -5.96859884262085, "loss": 0.386, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.383427619934082, "rewards/margins": 1.5851705074310303, "rewards/rejected": -5.96859884262085, "sft_loss": 4.509185791015625, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 17.670331891374648, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.10283637046813965, "logits/rejected": -0.01070125587284565, "logps/chosen": -4.565319538116455, "logps/rejected": -5.777436256408691, "loss": 0.4657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.565319538116455, "rewards/margins": 1.2121164798736572, "rewards/rejected": -5.777436256408691, "sft_loss": 4.566296577453613, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 25.038050542939718, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.16167476773262024, "logits/rejected": -0.13245835900306702, "logps/chosen": -4.51481294631958, "logps/rejected": -5.77095365524292, "loss": 0.4608, "rewards/accuracies": 0.8125, "rewards/chosen": -4.51481294631958, "rewards/margins": 1.256141185760498, "rewards/rejected": -5.77095365524292, "sft_loss": 4.587597370147705, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 20.326158561359126, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.24194279313087463, "logits/rejected": -0.1200401782989502, "logps/chosen": -4.5456061363220215, "logps/rejected": -6.069350242614746, "loss": 0.4157, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.5456061363220215, "rewards/margins": 1.5237447023391724, "rewards/rejected": -6.069350242614746, "sft_loss": 4.527360439300537, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 32.48053677381899, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.2554609477519989, "logits/rejected": -0.14878250658512115, "logps/chosen": -4.514296531677246, "logps/rejected": -5.8708391189575195, "loss": 0.471, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.514296531677246, "rewards/margins": 1.356541395187378, "rewards/rejected": -5.8708391189575195, "sft_loss": 4.559177875518799, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 20.44644783764318, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.18892745673656464, "logits/rejected": -0.13082177937030792, "logps/chosen": -4.487765312194824, "logps/rejected": -5.767231464385986, "loss": 0.436, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.487765312194824, "rewards/margins": 1.2794665098190308, "rewards/rejected": -5.767231464385986, "sft_loss": 4.460915565490723, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 19.29951548663288, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.21081380546092987, "logits/rejected": -0.08385033160448074, "logps/chosen": -4.683680534362793, "logps/rejected": -6.0807600021362305, "loss": 0.4437, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.683680534362793, "rewards/margins": 1.3970798254013062, "rewards/rejected": -6.0807600021362305, "sft_loss": 4.776706695556641, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 17.770995311277893, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.2691715359687805, "logits/rejected": -0.11873998492956161, "logps/chosen": -4.626561641693115, "logps/rejected": -5.735688209533691, "loss": 0.4969, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.626561641693115, "rewards/margins": 1.109127402305603, "rewards/rejected": -5.735688209533691, "sft_loss": 4.608339786529541, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 23.104121952800956, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.2036346197128296, "logits/rejected": -0.13959690928459167, "logps/chosen": -4.6203203201293945, "logps/rejected": -5.855708122253418, "loss": 0.485, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.6203203201293945, "rewards/margins": 1.235388159751892, "rewards/rejected": -5.855708122253418, "sft_loss": 4.670767307281494, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 22.17772707759298, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.2483900487422943, "logits/rejected": -0.14307789504528046, "logps/chosen": -4.501260280609131, "logps/rejected": -5.735810279846191, "loss": 0.4689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.501260280609131, "rewards/margins": 1.23455011844635, "rewards/rejected": -5.735810279846191, "sft_loss": 4.489443302154541, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 20.034360313400903, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.15290217101573944, "logits/rejected": -0.06998590379953384, "logps/chosen": -4.484408378601074, "logps/rejected": -5.721133708953857, "loss": 0.4558, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.484408378601074, "rewards/margins": 1.2367255687713623, "rewards/rejected": -5.721133708953857, "sft_loss": 4.4762349128723145, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 18.00377929861334, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.12864898145198822, "logits/rejected": -0.06968877464532852, "logps/chosen": -4.501049041748047, "logps/rejected": -5.645022392272949, "loss": 0.4559, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.501049041748047, "rewards/margins": 1.1439731121063232, "rewards/rejected": -5.645022392272949, "sft_loss": 4.486703395843506, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 23.72032930756754, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.2211860716342926, "logits/rejected": -0.08631716668605804, "logps/chosen": -4.508865833282471, "logps/rejected": -5.988263130187988, "loss": 0.4365, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.508865833282471, "rewards/margins": 1.4793974161148071, "rewards/rejected": -5.988263130187988, "sft_loss": 4.437670707702637, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 20.951770949095856, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.18464908003807068, "logits/rejected": -0.1366146057844162, "logps/chosen": -4.533401966094971, "logps/rejected": -5.666837215423584, "loss": 0.4905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.533401966094971, "rewards/margins": 1.1334350109100342, "rewards/rejected": -5.666837215423584, "sft_loss": 4.637621879577637, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 22.224432062005373, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.21795520186424255, "logits/rejected": -0.12108222395181656, "logps/chosen": -4.572412014007568, "logps/rejected": -5.863051414489746, "loss": 0.4725, "rewards/accuracies": 0.8125, "rewards/chosen": -4.572412014007568, "rewards/margins": 1.29063880443573, "rewards/rejected": -5.863051414489746, "sft_loss": 4.595419883728027, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 18.77963189428479, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.19824708998203278, "logits/rejected": -0.09077189117670059, "logps/chosen": -4.486553192138672, "logps/rejected": -6.102303504943848, "loss": 0.4185, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.486553192138672, "rewards/margins": 1.6157506704330444, "rewards/rejected": -6.102303504943848, "sft_loss": 4.467534065246582, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 18.796983598911968, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.2092389166355133, "logits/rejected": -0.08719463646411896, "logps/chosen": -4.276993751525879, "logps/rejected": -5.6778717041015625, "loss": 0.4692, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.276993751525879, "rewards/margins": 1.4008785486221313, "rewards/rejected": -5.6778717041015625, "sft_loss": 4.343761920928955, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 24.112545714969144, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.25244978070259094, "logits/rejected": -0.06573184579610825, "logps/chosen": -4.505316257476807, "logps/rejected": -5.831053733825684, "loss": 0.4599, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.505316257476807, "rewards/margins": 1.3257373571395874, "rewards/rejected": -5.831053733825684, "sft_loss": 4.507748126983643, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 17.207660074491685, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.25891369581222534, "logits/rejected": -0.12769198417663574, "logps/chosen": -4.457161903381348, "logps/rejected": -5.8368024826049805, "loss": 0.436, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.457161903381348, "rewards/margins": 1.3796398639678955, "rewards/rejected": -5.8368024826049805, "sft_loss": 4.56962251663208, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 19.680737690718, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.18700532615184784, "logits/rejected": -0.10863462835550308, "logps/chosen": -4.646914005279541, "logps/rejected": -5.864001750946045, "loss": 0.4952, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.646914005279541, "rewards/margins": 1.2170881032943726, "rewards/rejected": -5.864001750946045, "sft_loss": 4.645427703857422, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 24.903963415968907, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.21200552582740784, "logits/rejected": -0.06936580687761307, "logps/chosen": -4.589808464050293, "logps/rejected": -5.938838005065918, "loss": 0.4508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.589808464050293, "rewards/margins": 1.349029302597046, "rewards/rejected": -5.938838005065918, "sft_loss": 4.675601959228516, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 21.750718778937895, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.20201346278190613, "logits/rejected": -0.10818527638912201, "logps/chosen": -4.553851127624512, "logps/rejected": -6.130950927734375, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.553851127624512, "rewards/margins": 1.5770998001098633, "rewards/rejected": -6.130950927734375, "sft_loss": 4.525333881378174, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.01564759947359562, "eval_logits/rejected": 0.09269643574953079, "eval_logps/chosen": -4.668255805969238, "eval_logps/rejected": -5.721538066864014, "eval_loss": 0.5805554389953613, "eval_rewards/accuracies": 0.7292284965515137, "eval_rewards/chosen": -4.668255805969238, "eval_rewards/margins": 1.0532816648483276, "eval_rewards/rejected": -5.721538066864014, "eval_runtime": 42.9693, "eval_samples_per_second": 31.301, "eval_sft_loss": 4.639232158660889, "eval_steps_per_second": 7.843, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.5697717436673725, "train_runtime": 31532.5259, "train_samples_per_second": 5.688, "train_steps_per_second": 0.178 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }